R/scoreMatrix.R

Defines functions binner getColors file.ext target.type constrainRanges checkClass galpTo2Ranges readBam readBigWig

Documented in binner checkClass constrainRanges file.ext galpTo2Ranges getColors readBam readBigWig target.type

#######################################
# S3 functions
#######################################

#' binner function
#'
#'given a vector and length smooths the vector to a given size
# the function is not safe - check for the window length before
#'
#' @param start start position
#' @param end end position
#' @param nbins number of bins
#' @keywords internal
binner=function(start,end,nbins){
  
  if(! is.numeric(start))
    stop('start needs to be class numeric')
  if(! is.numeric(end))
    stop('end needs to be class numeric')
  if(! is.numeric(nbins))
    stop('nbins needs to be class numeric')
  
  x = unique(seq(from = start, to = end,length.out=nbins + 1 ) )
  my.start = ceiling(x)[-length(x)]
  my.end = floor(x)[-1]
  
  return( t(cbind(my.start, my.end) )  )
}

# ---------------------------------------------------------------------------- #
#' getColors function
#'
#'gets colors for a factor variable
#'
#' @param n n
#' @keywords internal
getColors = function(n) {
  
  black = "#000000"
  c(black,hcl(h=seq(0,(n-2)/(n-1),length=n-1)*360,c=100,l=65,fixup=TRUE))
}

#' file.ext function
#'
#' Extract file extension from file path
#'
#' @param x file 
#' @keywords internal
file.ext = function(x) {
  pos <- regexpr("\\.([[:alnum:]]+)$", x)
  ifelse(pos > -1L, substring(x, pos + 1L), "")
}

#' target.type function
#'
#'Check if a target file is in bam and bigWig formats
#'by looking at the file extension
#'
#' @param target target file
#' @param type type name
#' @keywords internal
target.type = function(target, type=""){
  
  if(length(target)!=1){
    stop("target has to be a single character")
  }
  bw.exts = c("bw","bigWig","bigwig","BigWig", "BIGWIG", "BW")
  bam.exts = c("bam", "BAM", "Bam")
  if(type=="" | length(type)!=1 | nchar(type)==0 | is.null(type))
      stop(paste0('set argument type to "auto", "bam" or "bigWig"\n'))
  
  if(type=="auto"){
    # automatically recognize type of target files
    # by looking at the file extensions
    exts = file.ext(target)  
    if( any(!(exts %in% c(bam.exts, bw.exts))) )
      stop(paste0('currently supported formats are bam and bigWig\n'))
    if( exts %in% bw.exts)  return("bigWig") 
    if( exts %in% bam.exts) return("bam")
    
  }else if(type %in% c(bam.exts, bw.exts)){
    # if type is BAM or bigWig
    bam.grepl = paste0((paste0(bam.exts, collapse="$|")), "$")
    bw.grepl = paste0((paste0(bw.exts, collapse="$|")), "$")
    if(type %in% bam.exts){
      if(!grepl(bam.grepl, target)){
        warning(paste0('you have set type="',type,
                       '", but the designated file does not have ',
                       'an extension of BAM file (.bam)'))}
      return("bam")
    }
    if(type %in% bw.exts){
      if(!grepl(bw.grepl, target)){
        warning(paste0('you have set type="',type,
                       '", but the designated file does not have',
                       ' an extension of biWig file (.bw or .bigWig)')) }
      return("bigWig")
    }
  }
}

# ---------------------------------------------------------------------------- #

#' constrainRanges function
#' 
#' removes ranges that fell of the rle object
#' does not check for the correspondence of the chromosome names - always 
#' check before using this function
#'
#' @param target target file
#' @param windows windows
#' @keywords internal
constrainRanges = function(target, windows){
  
  checkClass(target, c('SimpleRleList','RleList','CompressedRleList'))
  checkClass(windows, 'GRanges')
  
  mcols(windows)$X_rank = 1:length(windows)
  r.chr.len = elementNROWS(target)
  constraint = GRanges(seqnames=names(r.chr.len),
                       IRanges(start=rep(1,length(r.chr.len)),
                               end=as.numeric(r.chr.len)))
  # suppressWarnings is done becuause GenomicRanges function give warnings 
  #if you don't have the same seqnames in both objects
  win.list.chr = suppressWarnings(subsetByOverlaps(windows, 
                                                   constraint,
                                                   type = "within",
                                                   ignore.strand = TRUE))
  
  if(length(win.list.chr) == 0)
    stop('All windows fell have coordinates outside windows boundaries')
  return(win.list.chr)
}


# ---------------------------------------------------------------------------- #

#' checkClass function
#' 
#' check whether the x object corresponds to the given class
#'
#' @param x object
#' @param class.name class name
#' @param var.name uses x object
#' @keywords internal
checkClass = function(x, class.name, var.name = deparse(substitute(x))){
  
  fun.name = match.call(call=sys.call(sys.parent(n=1)))[[1]]
  if(!class(x) %in% class.name)
    stop(paste(fun.name,': ', 
               var.name, 
               ' is not of class: ', 
               paste(class.name, collapse=' '), 
               '\n', sep=''))
}

#' galpTo2Ranges function
#'
#' @param x object
#' @keywords internal
galpTo2Ranges <- function(x)
{
  gr1 <- granges(first(x))
  gr2 <- granges(invertStrand(last(x)))
  ans <- split(c(gr1, gr2), rep.int(seq_along(x), 2L))
  names(ans) <- names(x)
  ans
}

# ---------------------------------------------------------------------------- #
#' readBam function
#' 
#' given a big bam path reads the big wig file into a RleList
#' to be used by ScoreMatrix:char,GRanges
#'
#' @param target target object
#' @param windows windows
#' @param rpm logical
#' @param unique logical
#' @param extend numeric
#' @param param ScanBamParam object 
#' @param paired.end llogical
#' @param library.size numeric
#' @param ... additional parameters
#' @keywords internal

readBam = function(target, windows, rpm=FALSE,
                   unique=FALSE, extend=0, param=NULL, 
                   paired.end=FALSE, library.size=NULL, ...){

  # check the ScanBamParam object
  if(!is.null(param) & class(param)!='ScanBamParam')
    stop('param needs to be an object of class ScanBamParam')
  
  if(paired.end){
    
    flag <- scanBamFlag(isPaired=TRUE, isProperPair=TRUE, 
                        isUnmappedQuery=FALSE, hasUnmappedMate=FALSE)
    # reads that map into window which stretches from start of the first window
    # to end of the last window
    if(is.null(param)){
      param <- ScanBamParam(which=reduce(windows, ignore.strand=TRUE), flag=flag)
    }else{
      bamWhich(param) <- reduce(windows, ignore.strand=TRUE)
    }
    alnp <- readGAlignmentPairs(target, param=param, use.names=TRUE)
    
    # remove rows that are duplicated when mates of pair map into two different windows
    alnp = alnp[!duplicated(names(alnp))]
    alns <- granges(alnp)
    
  }else{
    
    if(is.null(param)){
      param <- ScanBamParam(which=reduce(windows, ignore.strand=TRUE))
    }else{
      bamWhich(param) <- reduce(windows, ignore.strand=TRUE)
    }
    alns <- granges(readGAlignments(target, param=param, use.names=TRUE))
  }
  
  if(unique)
    alns = unique(alns)
  
  if(extend > 0)
    alns = resize(alns, width=extend, fix='start')
  if(extend < 0)
    stop('extend needs to be a positive integer')
  
  # get the coverage vector for given locations
  covs=coverage(alns)
  
  if(rpm){
    message('Normalizing to rpm ...')
    if(is.null(library.size)){
      total = 1e6/sum(idxstatsBam(normalizePath(target))[3])
    }else{
      total = 1e6/library.size
    }
    covs = covs * total
  }
  
  return(covs)
  
}

# ---------------------------------------------------------------------------- #
#' readBigWig function
#' 
#' given a big wig path reads the big wig file into a RleList
#' to be used by ScoreMatrix:char,GRanges
#'
#' @param target target object
#' @param windows windows
#' @param ... additional parameters
#' @keywords internal
readBigWig = function(target, windows=NULL, ...){
  
  
  if(class(windows) != 'GRanges')
    stop('windows argument needs to be a GRanges object')
  
  
  if(is.null(windows)){
    bw = import(target)
  }else{
    bw = import(target, which=windows)
  }
  if(length(bw) == 0)
    stop('There are no ranges selected')
  
  covs = coverage(bw, weight=bw$score)
  return(covs)
}



#######################################
# S4 functions
#######################################


# ---------------------------------------------------------------------------- #
#' Get base-pair score for bases in each window
#'
#' The funcion produces a base-pair resolution matrix of scores for given equal
#' width windows of interest. The returned matrix  can be used to 
#' draw meta profiles or heatmap of read coverage or wig track-like data.
#' The \code{windows} argument can be a predefined region around transcription start sites 
#' or other regions of interest that have equal lengths
#' The function removes all window that fall off the Rle object - 
#' have the start coordinate < 1 or end coordinate > length(Rle)
#' The function takes the intersection of names in the Rle and GRanges objects.
#' On Windows OS the function will give an error if the target is a file in .bigWig format.
#'
#' @param target \code{RleList} , \code{GRanges}, a BAM file or a BigWig
#'  to be overlapped with ranges in \code{windows}
#' @param windows \code{GRanges} object that contains the windows of interest. 
#'                It could be promoters, CpG islands, exons, introns. 
#'                However the sizes of windows have to be equal.
#' @param strand.aware If TRUE (default: FALSE), the strands of the
#'                   windows will be taken into account in the resulting
#'                    \code{ScoreMatrix}.
#'                     If the strand of a window is -, the values of the bins 
#'                     for that window will be reversed
#' @param weight.col if the object is \code{GRanges} object a numeric column
#'                 in meta data part can be used as weights. This is particularly
#'                useful when genomic regions have scores other than their
#'                coverage values, such as percent methylation, conservation
#'                scores, GC content, etc. 
#' @param is.noCovNA (Default:FALSE)
#'                  if TRUE,and if 'target' is a GRanges object with 'weight.col'
#'                   provided, the bases that are uncovered will be preserved as
#'                   NA in the returned object. This useful for situations where
#'                   you can not have coverage all over the genome, such as CpG 
#'                   methylation values.
#' @param type  (Default:"auto")
#'              if target is a character vector of file paths, then type designates
#'              the type of the corresponding files (bam or bigWig).
#' @param rpm boolean telling whether to normalize the coverage to per milion 
#'                    reads. FALSE by default. See \code{library.size}.
#' @param unique boolean which tells the function to remove duplicated reads 
#'              based on chr, start, end and strand
#' @param extend numeric which tells the function to extend the reads to width=extend
#' @param param ScanBamParam object 
#' @param bam.paired.end boolean indicating whether given BAM file contains 
#'                       paired-end reads (default:FALSE).
#'                       Paired-reads will be treated as fragments.
#' @param library.size numeric indicating total number of mapped reads in a BAM file
#'                            (\code{rpm} has to be set to TRUE).
#'                            If is not given (default: NULL) then library size 
#'                            is calculated using the Rsamtools idxstatsBam function:
#'                            sum(idxstatsBam(target)$mapped).
#' 
#' @note
#' We assume that a paired-end BAM file contains reads with unique ids and we remove 
#' both mates of reads if they are repeated. Due to the fact that \code{ScoreMatrix} 
#' uses the GenomicAlignments:readGAlignmentPairs function to read paired-end BAM files
#' a duplication of reads occurs when mates of one pair map into two different windows.
#' 
#' Strands of reads in a paired-end BAM are inferred depending on strand of 
#' first alignment from the pair. This is a default setting in the 
#' GenomicAlignments:readGAlignmentPairs function (see a strandMode argument). 
#' This mode should be used when the paired-end data was generated using 
#' one of the following stranded protocols: 
#' Directional Illumina (Ligation), Standard SOLiD.
#' 
#' @return returns a \code{ScoreMatrix} object
#' @seealso \code{\link{ScoreMatrixBin}}
#' 
#' @examples
#
#' # When target is GRanges
#' data(cage)
#' data(promoters)
#' scores1=ScoreMatrix(target=cage,windows=promoters,strand.aware=TRUE,
#'                                 weight.col="tpm")                                
#'                                  
#' 
#' # When target is RleList
#' library(GenomicRanges)
#' covs = coverage(cage)
#' scores2 = ScoreMatrix(target=covs,windows=promoters,strand.aware=TRUE)   
#' scores2 
#'
#' # When target is a bam file
#' bam.file = system.file('unitTests/test.bam', package='genomation')
#' windows = GRanges(rep(c(1,2),each=2), IRanges(rep(c(1,2), times=2), width=5))
#' scores3 = ScoreMatrix(target=bam.file,windows=windows, type='bam') 
#' scores3
#' 
#' @useDynLib genomation
#' @importFrom Rcpp sourceCpp
#' @docType methods
#' @rdname ScoreMatrix-methods           
#' @export
setGeneric("ScoreMatrix",
           function(target,windows,
                    strand.aware=FALSE,
                    weight.col=NULL,
                    is.noCovNA=FALSE,
                    type="auto",
                    rpm=FALSE, 
                    unique=FALSE, 
                    extend=0,
                    param=NULL,
                    bam.paired.end=FALSE,
                    library.size=NULL) 
             standardGeneric("ScoreMatrix") )


#' @aliases ScoreMatrix,RleList,GRanges-method
#' @rdname ScoreMatrix-methods
#' @usage  \\S4method{ScoreMatrix}{RleList,GRanges}(target,windows,strand.aware)
setMethod("ScoreMatrix",signature("RleList","GRanges"),
          function(target,windows,strand.aware){
            
            #check if all windows are equal length
            if( length(unique(width(windows))) >1 ){
              stop("width of 'windows' are not equal, provide 'windows' with equal widths")
            }  
            # check if windows have width > 1
            if( any(width(windows)==1) ){
              stop("provide 'windows' with widths greater than 1")
            } 
            
            # set a uniq id for the GRanges
            windows.len=length(windows)
            windows = constrainRanges(target, windows)
            if(length(windows)!=windows.len){
              warning(paste0(windows.len-length(windows),
                             " windows fall off the target"))
            }
            
            # fetches the windows and the scores
            chrs = sort(intersect(names(target), as.character(unique(seqnames(windows)))))
            myViews=Views(target[chrs],as(windows,"IntegerRangesList")[chrs]) # get subsets of RleList
            
            #  get a list of matrices from Views object
            #  operation below lists a matrix for each chromosome
            mat = lapply(myViews,function(x) t(viewApply(x,as.vector)) )
            
            # combine the matrices from chromosomes 
            mat = do.call("rbind",mat)   
            
            # get the ranks of windows, when things are reorganized by as(...,"IntegerRangesList")
            r.list=split(mcols(windows)[,"X_rank"], as.vector(seqnames(windows))  )
            r.list=r.list[order(names(r.list))]
            ranks=do.call("c",r.list)
            
            # put original window order as rownames
            # this is very important to keep
            # if we have multiple RleLists for the same set of windows
            # we need to know which windows are removed from which set
            # so when we are doing something comparative (clustering windows
            # based on different histone marks) we only work with windows
            # that are covered by all histone marks
            rownames(mat) = ranks  
            
            # if strand aware is TRUE, we need to flip the windows on the minus strand
            if(strand.aware == TRUE){
              orig.rows=windows[strand(windows) == '-',]$X_rank
              mat[rownames(mat) %in% orig.rows,] = mat[rownames(mat) %in% 
                                                         orig.rows, ncol(mat):1]
            }
            
            # reorder matrix
            mat = mat[order(ranks),] 
            
            return(new("ScoreMatrix",mat))
          })



# ---------------------------------------------------------------------------- #
#' @aliases ScoreMatrix,GRanges,GRanges-method
#' @rdname ScoreMatrix-methods
#' @usage \\S4method{ScoreMatrix}{GRanges,GRanges}(target, windows, strand.aware,
#'                                                 weight.col, is.noCovNA)
setMethod("ScoreMatrix",signature("GRanges","GRanges"),
          function(target, windows, strand.aware, weight.col, is.noCovNA){
            
            #make coverage vector  from target
            if(is.null(weight.col)){
              target.rle=coverage(target)
            }else{
              if(! weight.col %in% names(mcols(target)) ){
                stop("provided column 'weight.col' does not exist in target\n")
              }
              if(is.noCovNA)
              { # adding 1 to figure out NA columns later
                target.rle=coverage(target,weight=(mcols(target)[weight.col][,1]+1) )
                mat=ScoreMatrix(target.rle,windows,strand.aware)
                mat=mat-1 # substract 1
                mat[mat<0]=NA # everything that are <0 are NA
                return(mat)
              }
              target.rle=coverage(target,weight= weight.col ) 
              
            }
            
            # call ScoreMatrix function
            ScoreMatrix(target.rle,windows,strand.aware)
          })

# ---------------------------------------------------------------------------- #
#' @aliases ScoreMatrix,character,GRanges-method
#' @rdname ScoreMatrix-methods
#' @usage \\S4method{ScoreMatrix}{character,GRanges}(target, windows, strand.aware,
#'                                                   weight.col=NULL,is.noCovNA=FALSE, 
#'                                                   type='auto', rpm=FALSE,
#'                                                   unique=FALSE, extend=0, param=NULL, 
#'                                                   bam.paired.end=FALSE,
#'                                                   library.size=NULL)
setMethod("ScoreMatrix",signature("character","GRanges"),
          function(target,windows, strand.aware,
                   weight.col=NULL,is.noCovNA=FALSE,
                   type='auto',rpm=FALSE, 
                   unique=FALSE, extend=0,param=NULL,
                   bam.paired.end=FALSE,
                   library.size=NULL){
            
            if(!file.exists(target)){
              stop("Indicated 'target' file does not exist\n")
            }
            
            type = target.type(target, type)
            
            if( type=="bigWig" & rpm==TRUE)
              warning("rpm=TRUE is not supported for type='bigWig'")

            if(type == 'bam'){
              
              covs = readBam(target, windows, rpm=rpm, unique=unique, 
                             extend=extend, param=param, 
                             paired.end=bam.paired.end, library.size)
              #get coverage vectors
              return(ScoreMatrix(covs,windows,strand.aware))
            }
              
            if(type == 'bigWig'){
              if(is.noCovNA==FALSE){
                
                covs = readBigWig(target=target, windows=windows)
                ScoreMatrix(covs,windows,strand.aware)
                
              }else{
                
                if(is.null(windows)){
                  bw = import(target)
                }else{
                  bw = import(target, which=windows)
                }
                if(length(bw) == 0)
                  stop('There are no ranges selected')
                
                return(ScoreMatrix(bw, windows, strand.aware, 
                                   weight.col=weight.col, 
                                   is.noCovNA=is.noCovNA))
              }
            }           
          })



# ---------------------------------------------------------------------------- #
#' Bins the columns of a matrix using a user provided function 
#'
#' @param x \code{ScoreMatrix} or a \code{ScoreMatrixList} object
#' @param bin.num \code{integer} number of bins in the final matrix
#' @param fun \code{character} vector or an anonymous function that will be used for binning
#'
#' @return \code{ScoreMatrix} or \code{ScoreMatrixList} object
#'
#' @examples
#' 
#' # binning the columns in a ScoreMatrix object
#' library(GenomicRanges)
#' target = GRanges(rep(c(1,2),each=7), IRanges(rep(c(1,1,2,3,7,8,9), times=2), width=5), 
#' weight = rep(c(1,2),each=7), 
#' strand=c('-', '-', '-', '-', '+', '-', '+', '-', '-', '-', '-', '-', '-', '+'))
#' windows = GRanges(rep(c(1,2),each=2), IRanges(rep(c(1,2), times=2), width=5), 
#' strand=c('-','+','-','+'))
#' sm = ScoreMatrix(target, windows)
#' bin = binMatrix(sm, bin.num=2)
#'
#' @docType methods
#' @rdname binMatrix-methods
#' @export
setGeneric("binMatrix", 
           function(x, bin.num=NULL, fun='mean')
             standardGeneric("binMatrix") )

#' @aliases binMatrix,ScoreMatrix-method
#' @rdname binMatrix-methods
setMethod("binMatrix", signature("ScoreMatrix"),
          function(x, bin.num=NULL, fun='mean'){
            
            if(is.null(bin.num))
              return(x)
            
            if(bin.num > ncol(x))
              stop("number of given bins is bigger 
                   than the number of matrix columns")
            
            if(is.character(fun))
              fun = match.fun(fun)
            
            coord = binner(1, ncol(x), bin.num)
            bmat = mapply(function(a,b)apply(x[,a:b],1,fun), coord[1,], coord[2,])
            
            return(new("ScoreMatrix", bmat))
          }
)

# ---------------------------------------------------------------------------- #
# show Methods
#' @rdname show-methods
#' @return Shows the dimension of the ScoreMatrix
setMethod("show", "ScoreMatrix",
          function(object){
            dims = dim(object)
            
            s=sprintf("  %s %d %d", "scoreMatrix with dims:", dims[1], dims[2])
            message(s)  
          })

# ---------------------------------------------------------------------------- #
#' Scales the values in the matrix by rows and/or columns
#'
#' @param mat \code{ScoreMatrix} object
#' @param columns \code{columns} whether to scale the matrix by columns. Set by default to FALSE.
#' @param rows  \code{rows} Whether to scale the matrix by rows. Set by default to TRUE
#' @param scalefun function object that takes as input a matrix and returns a matrix. 
#'         By default  the argument is set to (x - mean(x))/(max(x)-min(x)+1)
#' 
#' @return \code{ScoreMatrix} object
#'
#' @examples
#' 
#' # scale the rows of a scoreMatrix object
#' library(GenomicRanges)
#' target = GRanges(rep(c(1,2),each=7), IRanges(rep(c(1,1,2,3,7,8,9), times=2), width=5), 
#'          weight = rep(c(1,2),each=7), 
#'          strand=c('-', '-', '-', '-', '+', '-', '+', '-', '-', '-', '-', '-', '-', '+'))
#' windows = GRanges(rep(c(1,2),each=2), IRanges(rep(c(1,2), times=2), width=5), 
#'            strand=c('-','+','-','+'))
#' sm = ScoreMatrix(target, windows)
#' ssm = scaleScoreMatrix(sm, rows=TRUE)
#'
#' @docType methods
#' @rdname scaleScoreMatrix-methods
#' @export
setGeneric("scaleScoreMatrix", 
           function(mat, 
                    columns=FALSE, rows=TRUE, 
                    scalefun=NULL) 
             standardGeneric("scaleScoreMatrix") )

#' @aliases scaleScoreMatrix,ScoreMatrix-method
#' @rdname scaleScoreMatrix-methods
setMethod("scaleScoreMatrix", signature("ScoreMatrix"),
          function(mat, columns, rows, scalefun){
            
            if(is.null(scalefun))
              scalefun = function(x)(x-mean(x))/(max(x)-min(x)+1)
            
            if(columns)
              mat = apply(mat, 2, scalefun)
            
            if(rows)
              mat = t(apply(mat,1,scalefun))
            
            return(new("ScoreMatrix", mat))
          }
)
BIMSBbioinfo/genomation documentation built on March 13, 2020, 5:28 a.m.