R/read.deeptools.R

#' Read deeptools computeMatrix files
#'
#' @param PATH An abosulte path to a computeMatrix file
#' @param FUNC Which summarising function? Default: mean
#' @param sortOnMiddle Sort the matrices on the centers?
#' @param quantiles A two-number vector. c(0.05,0.95) will dampen the top and bottem 5%. Handy for outliers.
#' @return A list:
#' \itemize{
#'  \item{matrixList: }{A named list of matrices: one per bigwig/bed combination. Names are bigwig<>bed.}
#'  \item{metaData: }{A named list of metadata from you computeMatrix-run.}
#'  \item{tidyDF: }{A dataframe for easy ggplotting.}
#'  \item{enrichmentDF: }{A dataframe with per-loop enrichment of average signal.}
#'  \item{BED: }{A dataframe of the BED-entries.}
#' }
#' @export
read.deeptools <- function(PATH, FUNC = mean , sortOnMiddle = T, quantiles = c(0,1)){
  require(reshape2)
  require(dplyr)
  json <- read.delim(PATH, header = F, nrows = 1, stringsAsFactors = F)
  json <- unlist(json[,1])
  json <- gsub(json, pattern = "^@", replacement = "")
  json <- strsplit(json, split = ",")[[1]]

  jsonLE <- list()
  jsonListEntries <- cbind(grep(json,pattern = "\\["),grep(json,pattern = "\\]"))
  rmEntriesIndexes <- c()
  for(i in 1:nrow(jsonListEntries)){
    row = json[jsonListEntries[i ,1]:jsonListEntries[i ,2] ]
    NAAM = strsplit(row[1], split = ":\\[")[[1]][1]
    row[1] = strsplit(row[1], split = ":\\[")[[1]][2]
    row[length(row)] <- gsub(row[length(row)], pattern = "\\]", replacement = "")
    if(any("null" %in% row)){
      row = NULL
    }
    if(!any(grepl(row, pattern = "[a-zA-Z]"))){
      row = as.numeric(row)
    }
    jsonLE[[gsub(NAAM, pattern = " ",replacement = "_")]] = row
    rmEntriesIndexes <- c(rmEntriesIndexes,  jsonListEntries[i ,1]:jsonListEntries[i ,2])
  }
  json = json[-rmEntriesIndexes]
  json = gsub(json , pattern = "[\\{\\}]", replacement = "")
  for(i in 1:length(json)){
    row = strsplit(json[i], split = ":")[[1]]
    NAAM = row[1]
    entry = row[2]
    if(entry == 'null'){
      entry = NULL
    } else if( entry %in% c('false', 'true')  ){
      entry = as.logical(toupper(entry))
    } else if( grepl(entry, pattern = "[a-zA-Z]") & !is.logical(entry) ){
      entry = as.character(entry)
    } else {
      entry = as.numeric(entry)
    }

    jsonLE[[gsub(NAAM, pattern = " ",replacement = "_")]] = entry
  }

  mat <- read.delim(PATH, skip = 1, h = F)

  meta <- mat[, 1:6]
  mat <- mat[, -c(1:6)]

  matList <- list()
  for(Si in 1:(length(jsonLE$sample_boundaries)-1)){
    SB = (jsonLE$sample_boundaries[Si]+1):jsonLE$sample_boundaries[Si +1]

    for(Gi in 1:(length(jsonLE$group_boundaries)-1)){

      GB = (jsonLE$group_boundaries[Gi]+1):jsonLE$group_boundaries[Gi +1]

      matChunk <- mat[GB,SB]

      for(C in 1:ncol(matChunk)){

        col = matChunk[, C]
        Q = quantile(col, probs = quantiles, na.rm = T)
        col[col > Q[2]] <- Q[2]
        col[col < Q[1]] <- Q[1]
        matChunk[, C] = col
      }

      if(sortOnMiddle){
        before = jsonLE$upstream[Si] / jsonLE$bin_size[Si]
        middle = jsonLE$body[Si] / jsonLE$bin_size[Si]
        after = jsonLE$downstream[Si] / jsonLE$bin_size[Si]

        leftIDX <- ncol(matChunk) - (before + middle) + 1
        rightIDX <- ncol(matChunk) - (before )

        O <- order(rowMeans(matChunk[, leftIDX:rightIDX]))

        matChunk <- matChunk[O, ]
      }



      matList[[paste0(c(jsonLE$sample_labels[Si], jsonLE$group_labels[Gi]), collapse = "<>")]] = matChunk
    }

  }



  before = jsonLE$upstream[Si] / jsonLE$bin_size[Si]
  middle = jsonLE$body[Si] / jsonLE$bin_size[Si]
  after = jsonLE$downstream[Si] / jsonLE$bin_size[Si]

  if(jsonLE$body[1] == 0){
    # refpoint
    newBinNames <- c( rev(1:before * jsonLE$bin_size[1])*-1 ,
                      1:after * jsonLE$bin_size[1])
  } else {
    newBinNames <- c( rev(1:before * jsonLE$bin_size[1])*-1 ,
                      seq(-jsonLE$bin_size[1],
                          jsonLE$bin_size[1],
                          length.out =  middle+2)[-c(1, middle+2)],
                      1:after * jsonLE$bin_size[1])
  }




  forGG <- data.frame()
  enrich <- data.frame()
  for(i in 1:length(matList)){

    NAAM = names(matList)[i]
    NAAM = strsplit(NAAM, split = "<>", fixed = T)[[1]]
    BW = NAAM[1]
    BED = NAAM[2]
    if(BED == "genes"){BED = ""}

    matChunk = matList[[i]]
    colnames(matChunk) <- newBinNames
    rownames(matChunk) <- 1:nrow(matChunk)
    melted = reshape2::melt(as.matrix(matChunk))

    melted$BW = BW
    melted$BED = BED
    colnames(melted)[2] = 'bin'
    melted$binNumber <- match(as.character(melted$bin), as.character(newBinNames))
    forGG  <- rbind(forGG, melted)

    midSize <- unique(jsonLE$body/jsonLE$bin_size)
    if(jsonLE$body[1] == 0){
      midSize = 1
    }
    midStart <- unique(jsonLE$upstream/jsonLE$bin_size)
    middleTidy <- melted[melted$binNumber >=  midStart + 1 &
                           melted$binNumber <= midStart + midSize,]

    surroundingTidy <- melted[melted$binNumber <=  midSize |
                                melted$binNumber > max(melted$binNumber) - midSize,]

    middleScore <- dplyr::summarise( dplyr::group_by(middleTidy, Var1, BW, BED),
                                     value = FUNC(value, na.rm = T))
    surroundingScore <- dplyr::summarise( dplyr::group_by(surroundingTidy, Var1, BW, BED),
                                          value = FUNC(value, na.rm = T))
    enrichRow <- merge(middleScore, surroundingScore, by = c(1,2,3))
    colnames(enrichRow) <- c('loopID', 'BW', 'BED','middle','surrounding')
    enrichRow$log2FC <- log2(enrichRow$middle/enrichRow$surrounding)
    enrich <- rbind(enrich, enrichRow)
  }
  forGG = dplyr::summarise(dplyr::group_by(forGG, bin, BED, BW, binNumber),
                           value = FUNC(value, na = T))

  return(list(matrixList = matList, metaData = jsonLE,
              tidyDF = forGG, enrichmentDF = enrich))
}
robinweide/RHWlib documentation built on May 7, 2019, 8:03 a.m.