R/GetCoverageMatrix.R

#' Get coverage matrix for genes which are highly expressed in every file listed in CoverageDFList.
#' Only rows with gene data from all files included in the output (i.e. intersection of gene 
#' list from each file). Only the genes that have counts in every file will be "merged".
#'
#' @param CoverageDFList List of coverage counts from bedgraph coverage file (*.bedgraph.gz).
#' @keywords GetCoverageMatrix
#' @author 
#' Nathaniel J. Madrid, Jason Byars
#' @return An coverage matrix for genes which are highly expressed in every file listed in CoverageDFList.
#' @examples
#' refseq <- import.gff("RefSeq_hg19_exons_nodups_021114g1k.gff")
#' refseq$gene <- sub("_exon_[0-9]+", "", refseq$gene_id)
#' refseq$width <- width(refseq)
#' txlength <- tapply(refseq$width, refseq$gene, sum)
#' 
#' # Get file paths of aligned bedgraph files
#' BedGraphFiles <- dir(path="/home/ubuntu/Projects/RNAseqPackage/bedgraphs", full.names=T)
#' 
#' # coverage calculated w/ bedtools genomecov -bga -split
#' BedgraphList <- lapply(BedGraphFiles[1:3], function(bg) import.bedGraph(bg))
#' BedGraphRefSeqOverlapsList <- lapply(BedgraphList, function(bg) GetBedGraphRefSeqOverlaps(bg, refseq))
#' BedGraphRefSeqOverlapsDFList <- lapply(BedGraphRefSeqOverlapsList, function(bg) as.data.frame(bg@elementMetadata@listData))
#' for (i in 1:length(BedGraphRefSeqOverlapsDFList)) { BedGraphRefSeqOverlapsDFList[[i]]$width <- BedGraphRefSeqOverlapsList[[i]]@ranges@width }
#' CoverageDFList <- lapply(BedGraphRefSeqOverlapsDFList, function(bg) GetTranscriptomeCoverage(bg, txlength))
#' avgReadDepthPerGene <- unlist(lapply(CoverageDFList, function(df) mean(df$score)))
#' mmdOverTranscriptome <- unlist(lapply(CoverageDFList, function(df) sum(df$TotalCoverage) / sum(df$TranscriptLengths)))
#' 
#' # Only rows with gene data from all files are included in the output (i.e. intersection of gene 
#' # list from each file). Only the genes that have counts in every file will be "merged".
#' CoverageMatrix <- GetCoverageMatrix(CoverageDFList)
#' @export
GetCoverageMatrix <- function(CoverageDFList) {
  
  if (length(CoverageDFList) < 2) { 
    return (CoverageDFList[[1]]) 
  }
  
  CoverageDFCombined <- merge(CoverageDFList[[1]], CoverageDFList[[2]], by="gene")
  
  for (i in 3:length(CoverageDFList)) { 
    CoverageDFCombined <- merge(CoverageDFCombined, CoverageDFList[[i]], by="gene") 
  }
  
  CoverageMatrix <- CoverageDFCombined[ , seq(from=4, to=ncol(CoverageDFCombined), by=3)];
  colnames(CoverageMatrix) <- paste("score", 1:ncol(CoverageMatrix), sep="");
  rownames(CoverageMatrix) <- CoverageDFCombined$gene;
  
  return(as.matrix(CoverageMatrix));
}
njmadrid/RNAseqQuality documentation built on May 20, 2019, 3:32 p.m.