R/p05_profile_matrix_import_functions.R

Defines functions import_profiles import_profile_from_file bigwig_profile_matrix na_impute

Documented in bigwig_profile_matrix import_profile_from_file import_profiles na_impute

##################################################################################
## function to impute NA values for the rows with NAs
#' Replace NA values by row mean
#'
#' @param index row number with NA values found
#' @param mat matrix
#'
#' @return corrected row in which NA values are replaced by mean of row
#' @export
#'
#' @examples NA
na_impute = function(index, mat){
  xNew = imputeTS::na.mean(mat[index, ])
  return(xNew)
}

##################################################################################


##################################################################################

#' Generate profile matrix using normalizeToMatrix()
#'
#' @param bwFile BigWig file for the signal
#' @param regions bed file for the regions/genes of interest or a GenomicRanges object
#' @param signalName Signal name
#' @param extend extend argument of normalizeToMatrix function. Default: c(2000, 1000)
#' @param w w argument of normalizeToMatrix function: Default: 10
#' @param targetName Name to be used for target. Eg: "gene", "TSS", "TES", "summit" etc. default: gene
#' @param storeLocal Logical. Whether to store the matrix locally as gzipped file.
#'  Default: FALSE
#' @param localPath File path pointing to locally stored matrix
#' @param ... other arguments passed to function \code{normalizeToMatrix}
#'  function
#'
#' @return profile matrix of class normalizedMatrix
#' @export
#'
#' @examples NULL
bigwig_profile_matrix <- function(bwFile, regions, signalName,
                                  extend = c(2000, 1000), w = 10,
                                  targetName = "gene",
                                  storeLocal = FALSE, localPath,
                                  ...){


  profileMat <- NULL

  cat("Generating normalized matrix from bigWig file\n")

  bwGr <- rtracklayer::import(con = bwFile, format = "BigWig")

  regionGr <- NULL
  if(class(regions) == "GRanges"){
    regionGr <- regions

  } else if(file.exists(regions)){
    regionGr <- rtracklayer::import(con = regions, format = "bed")

  }

  if(is.null(mcols(regionGr)$name)){
    warning("missing name column in regions")
  }

  names(regionGr) <- regionGr$name

  bedRegion <- NULL

  profileMat <- EnrichedHeatmap::normalizeToMatrix(signal = bwGr,
                                                   target = regionGr,
                                                   extend = extend,
                                                   w = w,
                                                   value_column = "score",
                                                   ...)


  attr(profileMat, "signal_name") = signalName
  attr(profileMat, "target_name") = toupper(targetName)


  ## save the matrix locally
  if(isTRUE(storeLocal)){

    tempFile <- paste(localPath, ".temp_mat.tab", sep = "")

    data.table::fwrite(x = as.data.frame(profileMat[1:nrow(profileMat), 1:ncol(profileMat)]),
                       file = tempFile, col.names = F, row.names = T, sep = "\t", quote = F, eol = "\n")

    ## store the matrix in gzipped format and delete the temp file
    system2(command = "gzip", args = c("-f", tempFile))

    tempGzFile <- paste(tempFile, ".gz", sep = "")

    if(file.rename(from = tempGzFile, to = localPath)){
      cat("Stored the matrix into file", localPath, "\n")
    }

  }

  return(profileMat)

}

##################################################################################



##################################################################################
## Read profile matrix from file
#' Read profile matrix from file
#'
#' This function reads the locally stored profile matrix and convert it
#' to \code{normalizedMatrix} class. This matrix can be used for generating profile plot
#' using EnrichedHeatmap package.
#'
#' @param file profile matrix for e.g. generated by deeptools' \code{computeMatrix scale-regions}
#'  command
#' @param source A character string for the source of the matrix being read. One of \code{"deeptools",
#'  "miao", "normalizedmatrix"}. Default: normalizedmatrix
#' @param signalName name of the signal track
#' @param selectGenes A vector of gene IDs which are to be plotted.. Only these genes' profile is extracted
#' from the profile matrix for plotting.
#' @param up number of bins in upstream region
#' @param target number of bins in gene body
#' @param down number of bins in downstream region
#' @param binSize bin size used while generating profile matrix
#' @param targetType One of "region", "point". If targetType is "region", target is used to decide
#' the number of bins over region. Otherwise, for "point", a signle point matrix is expected where
#' the region is extracted around single point. Default: region
#' @param targetName Name to be used for target. Eg: "gene", "TSS", "TES", "summit" etc. default: gene
#' @param keep Same as \code{EnrichedHeatmap::normalizeToMatrix}. First value is used as lower quantile
#' and any value in profile matrix less than lower quantile is set to lower quantile. Second value is
#' used as upper quantile and any value greater than upper quantile is set to upper quantile.
#' Default: \code{c(0, 1)}
#' @param returnDf If TRUE, returns the dataframe instead of profile matrix. Default: FALSE
#'
#' @return profile matrix of class \code{normalizedMatrix}
#' @export
#'
#' @examples NA
import_profile_from_file = function(file, source = "normalizedmatrix", signalName, selectGenes,
                                    up = 200, target = 200, down = 100, binSize = 10,
                                    targetType = "region", targetName = "gene",
                                    keep = c(0, 1),
                                    returnDf = FALSE){


  targetType <- match.arg(arg = tolower(targetType), choices = c("region", "point"))

  cat("Reading profile matrix generated by ", source, "for sample:", signalName, "\n")

  if(!is.vector(selectGenes)) stop("selectGenes should be a character vector of gene IDs")

  geneDf <- data.frame(geneId = selectGenes, stringsAsFactors = F)


  extraCols <- character()
  freadSkip <- 0

  if(tolower(source) == "deeptools"){
    extraCols <- c("chr", "start", "end", "geneId", "length", "strand")
    freadSkip <- 1

  } else if(tolower(source) == "miao"){
    extraCols <- c("chr", "start", "end", "strand", "geneId")

  } else if(all(grepl(pattern = "^normalizedmatrix", x = source, ignore.case = T, perl = T))){
    extraCols <- c("geneId")
  }

  ## profile matrix column names
  header <-  c(extraCols,
               paste(c("u"), 1:up, sep = ""),
               paste(c("g"), 1:target, sep = ""),
               paste(c("d"), 1:down, sep = "")
  )

  if(tolower(targetType) == "point"){
    target <- 1
    header <- c(extraCols,
                paste(c("u"), 1:up, sep = ""),
                paste(c("d"), 1:down, sep = "")
    )
  }


  z1 <- file

  ## read the profile matrix which is in .gz file
  ## for windows system: install IGV tools which has gzip binary. Add it to the PATH variable
  if(grepl(pattern = ".gz$", x = file, perl = T)){
    z1 <- paste("gzip -c -d ", file, sep = " ")
  }


  extraCols[extraCols != "geneId"]

  df <- data.table::fread(
    cmd = z1, sep = "\t", header = F, skip = freadSkip, na.strings = "nan",
    col.names = header, data.table = F) %>%
    {
      if(length(extraCols[extraCols != "geneId"]) == 0){
        .
      } else{
        dplyr::select(., -c(!!! extraCols[extraCols != "geneId"]))
      }
    }


  profileDf <- dplyr::left_join(x = geneDf, y = df, by = c("geneId" = "geneId")) %>%
    tibble::column_to_rownames(var = "geneId")


  profileMat <- data.matrix(profileDf)

  ## remove the rows with all NA values
  allNaRows <- which(apply(profileMat, 1, function(x) all(is.na(x))))

  if (length(allNaRows) > 0) {
    warning("Removing ", length(allNaRows), " genes (", rownames(profileMat)[allNaRows],
            ") with all NA values while reading profile matrix")
    profileMat <- profileMat[-allNaRows, ]
  }



  ## find rows with NA values and ipmute the NA values: replace NA with mean(row)
  naRows <- which(apply(profileMat, 1, function(x) any(is.na(x))))

  profileMat[naRows, ] <- do.call(rbind, lapply(naRows, na_impute, mat = profileMat))

  ## compare the imputed values with non-imputed values
  # rowN = 13
  # plotNA.imputations(as.numeric(profileDf[naRows[rowN], ]), as.vector(profileMat[naRows[rowN], ]))

  ## adjust the extreme values
  ## lower quantile
  if(keep[1] > 0){
    profileMat[which(profileMat < quantile(profileMat, keep[1]))] <- quantile(profileMat, keep[1])
  }

  ## upper quantile
  if(keep[2] < 1){
    profileMat[which(profileMat > quantile(profileMat, keep[2]))] <- quantile(profileMat, keep[2])
  }


  ## set attributes for the profileMat to make it of class "normalizedMatrix"
  attr(profileMat, "target_is_single_point") <- FALSE
  attr(profileMat, "upstream_index") <- 1:up
  attr(profileMat, "target_index") <- (up+1):(up+target)
  attr(profileMat, "downstream_index") <- (up+target+1):(up+target+down)
  attr(profileMat, "extend") <- c(up * binSize, down * binSize)

  ## if the matrix is reference-point
  if(tolower(targetType) == "point"){
    attr(profileMat, "target_is_single_point") <- TRUE
    attr(profileMat, "target_index") <- integer(0)
    attr(profileMat, "downstream_index") <- (up+1):(up+down)
  }

  attr(profileMat, "signal_name") <- signalName
  attr(profileMat, "target_name") <- toupper(targetName)
  attr(profileMat, "empty_value") <- "NA"

  class(profileMat) <- c("normalizedMatrix", "matrix")

  if(isTRUE(returnDf)){
    ## return the dataframe
    base::colnames(profileDf) <- paste(signalName, colnames(profileDf), sep = "_")
    profileDf <- tibble::rownames_to_column(profileDf, var = "geneId")
    return(profileDf)
  } else {
    # returns: profile matrix of class "normalizedMatrix"
    return(profileMat)
  }


}

##################################################################################



##################################################################################
#' Generate profile matrix list
#'
#' This function reads/generate the profile matrix of \code{normalizedMatrix} class for each
#' sample
#'
#' @param exptInfo experiment info as data frame with information like sampleID, type, path etc
#' @param geneList A vector of gene IDs for which the profile needs to be extracted
#' @param ... Other argument to \code{import_profile_from_file} function
#'
#' @return A list of normalized matrix where each matrix is of \code{normalizedMatrix} class
#' @export
#'
#' @examples NA
import_profiles <- function(exptInfo, geneList, ...){

  matList <- NULL

  for(i in 1:nrow(exptInfo)){
    sampleName <- exptInfo$sampleId[i]

    # cat("Extracting matrix for sample:", sampleName, "...\n")

    matList[[sampleName]] <- import_profile_from_file(file = exptInfo$matFile[i],
                                                      signalName = sampleName,
                                                      selectGenes = geneList,
                                                      ...)

  }

  return(matList)

}

##################################################################################
lakhanp1/chipmine documentation built on March 6, 2021, 9:06 a.m.