##################################################################################
## function to impute NA values for the rows with NAs
#' Replace NA values by row mean
#'
#' @param index row number with NA values found
#' @param mat matrix
#'
#' @return corrected row in which NA values are replaced by mean of row
#' @export
#'
#' @examples NA
na_impute = function(index, mat){
xNew = imputeTS::na.mean(mat[index, ])
return(xNew)
}
##################################################################################
##################################################################################
#' Generate profile matrix using normalizeToMatrix()
#'
#' @param bwFile BigWig file for the signal
#' @param regions bed file for the regions/genes of interest or a GenomicRanges object
#' @param signalName Signal name
#' @param extend extend argument of normalizeToMatrix function. Default: c(2000, 1000)
#' @param w w argument of normalizeToMatrix function: Default: 10
#' @param targetName Name to be used for target. Eg: "gene", "TSS", "TES", "summit" etc. default: gene
#' @param storeLocal Logical. Whether to store the matrix locally as gzipped file.
#' Default: FALSE
#' @param localPath File path pointing to locally stored matrix
#' @param ... other arguments passed to function \code{normalizeToMatrix}
#' function
#'
#' @return profile matrix of class normalizedMatrix
#' @export
#'
#' @examples NULL
bigwig_profile_matrix <- function(bwFile, regions, signalName,
extend = c(2000, 1000), w = 10,
targetName = "gene",
storeLocal = FALSE, localPath,
...){
profileMat <- NULL
cat("Generating normalized matrix from bigWig file\n")
bwGr <- rtracklayer::import(con = bwFile, format = "BigWig")
regionGr <- NULL
if(class(regions) == "GRanges"){
regionGr <- regions
} else if(file.exists(regions)){
regionGr <- rtracklayer::import(con = regions, format = "bed")
}
if(is.null(mcols(regionGr)$name)){
warning("missing name column in regions")
}
names(regionGr) <- regionGr$name
bedRegion <- NULL
profileMat <- EnrichedHeatmap::normalizeToMatrix(signal = bwGr,
target = regionGr,
extend = extend,
w = w,
value_column = "score",
...)
attr(profileMat, "signal_name") = signalName
attr(profileMat, "target_name") = toupper(targetName)
## save the matrix locally
if(isTRUE(storeLocal)){
tempFile <- paste(localPath, ".temp_mat.tab", sep = "")
data.table::fwrite(x = as.data.frame(profileMat[1:nrow(profileMat), 1:ncol(profileMat)]),
file = tempFile, col.names = F, row.names = T, sep = "\t", quote = F, eol = "\n")
## store the matrix in gzipped format and delete the temp file
system2(command = "gzip", args = c("-f", tempFile))
tempGzFile <- paste(tempFile, ".gz", sep = "")
if(file.rename(from = tempGzFile, to = localPath)){
cat("Stored the matrix into file", localPath, "\n")
}
}
return(profileMat)
}
##################################################################################
##################################################################################
## Read profile matrix from file
#' Read profile matrix from file
#'
#' This function reads the locally stored profile matrix and convert it
#' to \code{normalizedMatrix} class. This matrix can be used for generating profile plot
#' using EnrichedHeatmap package.
#'
#' @param file profile matrix for e.g. generated by deeptools' \code{computeMatrix scale-regions}
#' command
#' @param source A character string for the source of the matrix being read. One of \code{"deeptools",
#' "miao", "normalizedmatrix"}. Default: normalizedmatrix
#' @param signalName name of the signal track
#' @param selectGenes A vector of gene IDs which are to be plotted.. Only these genes' profile is extracted
#' from the profile matrix for plotting.
#' @param up number of bins in upstream region
#' @param target number of bins in gene body
#' @param down number of bins in downstream region
#' @param binSize bin size used while generating profile matrix
#' @param targetType One of "region", "point". If targetType is "region", target is used to decide
#' the number of bins over region. Otherwise, for "point", a signle point matrix is expected where
#' the region is extracted around single point. Default: region
#' @param targetName Name to be used for target. Eg: "gene", "TSS", "TES", "summit" etc. default: gene
#' @param keep Same as \code{EnrichedHeatmap::normalizeToMatrix}. First value is used as lower quantile
#' and any value in profile matrix less than lower quantile is set to lower quantile. Second value is
#' used as upper quantile and any value greater than upper quantile is set to upper quantile.
#' Default: \code{c(0, 1)}
#' @param returnDf If TRUE, returns the dataframe instead of profile matrix. Default: FALSE
#'
#' @return profile matrix of class \code{normalizedMatrix}
#' @export
#'
#' @examples NA
import_profile_from_file = function(file, source = "normalizedmatrix", signalName, selectGenes,
up = 200, target = 200, down = 100, binSize = 10,
targetType = "region", targetName = "gene",
keep = c(0, 1),
returnDf = FALSE){
targetType <- match.arg(arg = tolower(targetType), choices = c("region", "point"))
cat("Reading profile matrix generated by ", source, "for sample:", signalName, "\n")
if(!is.vector(selectGenes)) stop("selectGenes should be a character vector of gene IDs")
geneDf <- data.frame(geneId = selectGenes, stringsAsFactors = F)
extraCols <- character()
freadSkip <- 0
if(tolower(source) == "deeptools"){
extraCols <- c("chr", "start", "end", "geneId", "length", "strand")
freadSkip <- 1
} else if(tolower(source) == "miao"){
extraCols <- c("chr", "start", "end", "strand", "geneId")
} else if(all(grepl(pattern = "^normalizedmatrix", x = source, ignore.case = T, perl = T))){
extraCols <- c("geneId")
}
## profile matrix column names
header <- c(extraCols,
paste(c("u"), 1:up, sep = ""),
paste(c("g"), 1:target, sep = ""),
paste(c("d"), 1:down, sep = "")
)
if(tolower(targetType) == "point"){
target <- 1
header <- c(extraCols,
paste(c("u"), 1:up, sep = ""),
paste(c("d"), 1:down, sep = "")
)
}
z1 <- file
## read the profile matrix which is in .gz file
## for windows system: install IGV tools which has gzip binary. Add it to the PATH variable
if(grepl(pattern = ".gz$", x = file, perl = T)){
z1 <- paste("gzip -c -d ", file, sep = " ")
}
extraCols[extraCols != "geneId"]
df <- data.table::fread(
cmd = z1, sep = "\t", header = F, skip = freadSkip, na.strings = "nan",
col.names = header, data.table = F) %>%
{
if(length(extraCols[extraCols != "geneId"]) == 0){
.
} else{
dplyr::select(., -c(!!! extraCols[extraCols != "geneId"]))
}
}
profileDf <- dplyr::left_join(x = geneDf, y = df, by = c("geneId" = "geneId")) %>%
tibble::column_to_rownames(var = "geneId")
profileMat <- data.matrix(profileDf)
## remove the rows with all NA values
allNaRows <- which(apply(profileMat, 1, function(x) all(is.na(x))))
if (length(allNaRows) > 0) {
warning("Removing ", length(allNaRows), " genes (", rownames(profileMat)[allNaRows],
") with all NA values while reading profile matrix")
profileMat <- profileMat[-allNaRows, ]
}
## find rows with NA values and ipmute the NA values: replace NA with mean(row)
naRows <- which(apply(profileMat, 1, function(x) any(is.na(x))))
profileMat[naRows, ] <- do.call(rbind, lapply(naRows, na_impute, mat = profileMat))
## compare the imputed values with non-imputed values
# rowN = 13
# plotNA.imputations(as.numeric(profileDf[naRows[rowN], ]), as.vector(profileMat[naRows[rowN], ]))
## adjust the extreme values
## lower quantile
if(keep[1] > 0){
profileMat[which(profileMat < quantile(profileMat, keep[1]))] <- quantile(profileMat, keep[1])
}
## upper quantile
if(keep[2] < 1){
profileMat[which(profileMat > quantile(profileMat, keep[2]))] <- quantile(profileMat, keep[2])
}
## set attributes for the profileMat to make it of class "normalizedMatrix"
attr(profileMat, "target_is_single_point") <- FALSE
attr(profileMat, "upstream_index") <- 1:up
attr(profileMat, "target_index") <- (up+1):(up+target)
attr(profileMat, "downstream_index") <- (up+target+1):(up+target+down)
attr(profileMat, "extend") <- c(up * binSize, down * binSize)
## if the matrix is reference-point
if(tolower(targetType) == "point"){
attr(profileMat, "target_is_single_point") <- TRUE
attr(profileMat, "target_index") <- integer(0)
attr(profileMat, "downstream_index") <- (up+1):(up+down)
}
attr(profileMat, "signal_name") <- signalName
attr(profileMat, "target_name") <- toupper(targetName)
attr(profileMat, "empty_value") <- "NA"
class(profileMat) <- c("normalizedMatrix", "matrix")
if(isTRUE(returnDf)){
## return the dataframe
base::colnames(profileDf) <- paste(signalName, colnames(profileDf), sep = "_")
profileDf <- tibble::rownames_to_column(profileDf, var = "geneId")
return(profileDf)
} else {
# returns: profile matrix of class "normalizedMatrix"
return(profileMat)
}
}
##################################################################################
##################################################################################
#' Generate profile matrix list
#'
#' This function reads/generate the profile matrix of \code{normalizedMatrix} class for each
#' sample
#'
#' @param exptInfo experiment info as data frame with information like sampleID, type, path etc
#' @param geneList A vector of gene IDs for which the profile needs to be extracted
#' @param ... Other argument to \code{import_profile_from_file} function
#'
#' @return A list of normalized matrix where each matrix is of \code{normalizedMatrix} class
#' @export
#'
#' @examples NA
import_profiles <- function(exptInfo, geneList, ...){
matList <- NULL
for(i in 1:nrow(exptInfo)){
sampleName <- exptInfo$sampleId[i]
# cat("Extracting matrix for sample:", sampleName, "...\n")
matList[[sampleName]] <- import_profile_from_file(file = exptInfo$matFile[i],
signalName = sampleName,
selectGenes = geneList,
...)
}
return(matList)
}
##################################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.