R/p03_data_preprocess_functions.R

Defines functions preProcess_polII_expression

Documented in preProcess_polII_expression

##################################################################################
## function to process the polII expression matrix and flag the top expFraction% of genes as expressed
#' Pre-process polII expression data
#'
#' Reads a polII expression table, flags the genes with the highest expression
#' values as expressed and writes the annotated table to \code{polIIExpFile}.
#' The output has three tab-separated columns: \code{geneId}, the expression
#' column named after \code{sampleId}, and a logical column
#' \code{is_expressed.<sampleId>}.
#'
#' @param expMat Path to the polII expression file generated by Miao's script.
#'   It is read as a tab-separated file without a header. Columns 1, 2, 3, 5
#'   and 6 are dropped; of the two remaining columns the first (column 4) is
#'   used as the gene id and the second (column 7 in a seven-column file) as
#'   the expression value.
#' @param sampleId Sample id to be used. It becomes the name of the expression
#'   column and the suffix of the \code{is_expressed.<sampleId>} column.
#' @param expFraction Percentage (0-100, not a 0-1 fraction) of the genes to be
#'   considered as expressed (normally 10\% genes are under active
#'   transcription in the cell). The number of genes flagged is
#'   \code{round(nrow * expFraction / 100)}; genes tied with the last selected
#'   rank are not flagged beyond that rank, and rows with a missing expression
#'   value are never flagged.
#' @param polIIExpFile Complete path for processed output file
#'
#' @return complete output file path
#' @export
#'
#' @examples NA
preProcess_polII_expression <- function(expMat, sampleId, expFraction, polIIExpFile){

  stopifnot(
    is.character(sampleId), length(sampleId) == 1L,
    is.numeric(expFraction), length(expFraction) == 1L,
    !is.na(expFraction), expFraction >= 0
  )

  polIIDf <- data.table::fread(
    input = expMat, header = FALSE, drop = c(1, 2, 3, 5, 6),
    col.names = c("geneId", sampleId),
    stringsAsFactors = FALSE, sep = "\t", data.table = FALSE)

  ## expFraction is a percentage, hence the division by 100
  topFraction <- round(nrow(polIIDf) * expFraction / 100)

  ## rank 1 = highest expression; this is the same ranking rule that
  ## dplyr::top_n() applies for a positive n. NA values get an NA rank.
  expRank <- dplyr::min_rank(dplyr::desc(polIIDf[[sampleId]]))

  ## Flag each row directly instead of building a subset and joining it back
  ## on geneId: this works when zero genes are selected and cannot duplicate
  ## rows or mislabel them when a geneId occurs more than once.
  isExpCol <- paste0("is_expressed.", sampleId)
  polIIDf[[isExpCol]] <- !is.na(expRank) & expRank <= topFraction

  utils::write.table(
    x = polIIDf, file = polIIExpFile, sep = "\t",
    col.names = TRUE, quote = FALSE, row.names = FALSE)

  # cat("Processed polII expression for sample ", sampleId, "\n")

  return(polIIExpFile)
}

##################################################################################
lakhanp1/chipmine documentation built on March 6, 2021, 9:06 a.m.