R/create-coverage-matrix.R

Defines functions createCoverageMatrix

Documented in createCoverageMatrix

#' Create normalized data frame from  input coverage bed files 
#' The function will read in bed files, coverage column will be specified by user
#' and collate them to make a data matrix and then normalize by the library size. 
#' If the library sizes are not provided by then use sum of reads across loci as proxy. 
#' Though this is not recommended.
#' 
#' @param inputPath Path to coverage or junction bed files. 
#' Assumptions for coverage bed file - 1st 3 columns are the loci of interest (chr start stop) and the last column is the coverage readcount
#' 
#' @param filePattern character string or regular expression to search inputPath to get all coverage files
#' 
#' @param junctionReads logical flag specifying whether junction reads are to be calculated OR not
#' 
#' @param junctionType character type of the junction bed file. Either 'tophat' or 'star'
#' 
#' @param librarySizeFile full path file name with library sizes. Should have two columns
#' 1st should be sample name - same as file name in @param inputPath and the 2nd as the library size. No of rows should be equal to
#' no of coverage files read from @param inputpath 
#' 
#' @param promoterFile Full path file name of promoter loci - needs to be a bed file if junctionReads flag is equal to TRUE
#' @param normalize logical flag specifying whether normalization by library size should be done or not
#' 
#' @return A data.frame object with normalized RNAseq reads in each sample (rows) 
#' @author Aditi Qamra
#' @export
#'
#' @examples
#' \dontrun{
#' norm_matrix <- createCoverageMatrix(inputPath="readcount/",
#'                                    filePattern="coverage_",
#'                                    junctionReads=FALSE,
#'                                    junctionType='',
#'                                    promoterFile='',
#'                                    normalize=TRUE,
#'                                    librarySizeFile="libsizes.txt")
#' 
#'norm_matrix <- createCoverageMatrix(inputPath="test/",
#'                                    filePattern=".junction.bed",
#'                                    junctionReads=TRUE,
#'                                    junctionType='tophat',
#'                                    promoterFile='test/aploci.bed',
#'                                    normalize=TRUE,
#'                                    librarySizeFile="libsizes.txt")
#' 
#' 
#' 
#' }
#'
#'


createCoverageMatrix=function(inputPath,
                                  filePattern,
                                  junctionReads,
                                  junctionType, 
                                  promoterFile,
                                  gencodefile,
                                  normalize,
                                  librarySizeFile,... ){
  
  if (missing(inputPath)) {
      stop(paste0('Error: inputPath of coverage or junction files is missing' ))
  }
  
  
  if (missing(filePattern)) {
    stop(paste0('Error: string to identify coverage or junction files in inputPath is missing' ))
  }
  
  
  if (missing(junctionReads)) {
    stop(paste0('Error: logical flag specifying whether junction reads are being read in or not is missing. ' ))
  }
  
  if (junctionReads==TRUE &  missing(junctionType)) {
    stop(paste0('Error: Invalid junction type: ', junctionType, '! Possible values: "tophat" or "star"'))
  }  
  
  
  if (junctionReads==TRUE & missing(promoterFile)) {
    stop(paste0('Error: Input promoter file against which junction reads need to be calculated is missing'))
  }  

  if (missing(normalize)) {
    stop(paste0('Error: logical flag to normalize count matrix is missing' ))
  }  
  
  if (missing(librarySizeFile)) {
    paste0('no library size file was provided. 
           sum of read counts across loci will be used as proxy' )
  }
  
  
  if (junctionReads!=TRUE){
    
    # read in files
    
    files <- dir(inputPath, pattern=filePattern, all.files=T, full.names=T)
    
    # combine
    
    promoterReadCounts <- data.table::fread(files[1],sep="\t", stringsAsFactors = F, header=F, data.table=F)
    promoterReadCounts <- promoterReadCounts[,c(1,2,3,(ncol(promoterReadCounts)-3) )]
    colnames(promoterReadCounts)[ncol(promoterReadCounts)] <- gsub(filePattern, "",basename(files[1]))
    promoterReadCounts$name <- paste(promoterReadCounts[,1], promoterReadCounts[,2], promoterReadCounts[,3], sep="_")
    promoterReadCounts <- promoterReadCounts[,c("name",  gsub(filePattern, "",basename(files[1])))]
    
    for (f in 2:length(files)){
      dat <- fread(files[f],sep="\t", stringsAsFactors = F, header=F, data.table=F)
      dat <- dat[,c(1,2,3,(ncol(dat)-3) )]
      promoterReadCounts <- cbind(promoterReadCounts, dat[,ncol(dat)])
      colnames(promoterReadCounts)[ncol(promoterReadCounts)] <- gsub(filePattern, "", basename(files[f]))
    }
    
    promoterReadCounts <- promoterReadCounts[!duplicated(promoterReadCounts$name), ]
    rownames(promoterReadCounts) <- promoterReadCounts$name
    promoterReadCounts$name <- NULL
    
  } else {
    
    files <- dir(inputPath, pattern=filePattern, all.files=T, full.names=T)
    
    promoterReadCounts <- as.data.frame(lapply(files,calculateJunctionReadCounts,
                                  junctionType=junctionType,
                                 promoterFile=promoterFile, 
                                 gencodeFile=gencode.v19.intron.first.dedup))
    
    colnames(promoterReadCounts) <- gsub(filePattern,"", colnames(promoterReadCounts))
  }
  

  
  if (normalize!=TRUE){
    
    return(as.data.frame(promoterReadCounts))
    
  } else {
    
    # Normalize by library size and region size
    
    dist_vec <- as.numeric(str_split_fixed(rownames(promoterReadCounts), "_",3)[,3])-as.numeric(str_split_fixed(rownames(promoterReadCounts), "_",3)[,2])
    
    # region size
    promoterReadCounts_norm <- apply(promoterReadCounts,2, function(e) e/dist_vec)
    
    # libsize
    
    if (!missing(librarySizeFile)) {
      libsize <- read.table(librarySizeFile,sep="\t", stringsAsFactors = F, header=F)
      rownames(libsize) <- gsub(filePattern,"", libsize[,1] )
      libsize <- libsize[colnames(promoterReadCounts_norm),2]
      promoterReadCounts_norm_libsize <- as.data.frame(t(apply(promoterReadCounts_norm,1, function(e) e*1000000/libsize)))
    } else {
      
      libsize <- apply(promoterReadCounts,2, sum)
      promoterReadCounts_norm_libsize <- as.data.frame(t(apply(promoterReadCounts_norm,1, function(e) e*1000000/libsize)))
    }
    
    # return data frame
    return(as.data.frame(promoterReadCounts_norm_libsize))
    
  }
 
} 
aditiqamra/APcalc documentation built on Jan. 23, 2020, 4:39 p.m.