R/normalization.R

Defines functions .applyMatricesScaling renormalizeSignalMatrices renormalizeBorders .getRefSampleFromPeaks .getCovVals .randomTiles .S3norm .topNorm .MAnorm bwNormFactors getNormFactors

Documented in bwNormFactors getNormFactors renormalizeBorders renormalizeSignalMatrices

#' getNormFactors : estimate normalization factors from genomic signal files
#' 
#' Estimates normalization factors for a set of samples (i.e. bam/bigwig files).
#'
#' @param x A vector of paths to bigwig files, to bam files, or alternatively a 
#'   list of coverages in RleList format. (Mixed formats are not supported)
#' @param method Normalization method (see details below).
#' @param wsize The size of the random windows. If any of the bigwig files 
#'  records point events (e.g. insertions) at high resolution (e.g. nucleotide),
#'  use a lot (e.g. >10k) of small windows (e.g. `wsize=10`), as per default
#'  settings. Otherwise the process can be lightened by using fewer bigger 
#'  windows.
#' @param nwind The number of random windows
#' @param peaks A list of peaks (GRanges) for each experiment in `x`, or a
#'   vector of paths to such files, or a single GRanges of unified peaks to use 
#'   (e.g. for top/MAnorm).
#' @param trim Amount of trimming when calculating means.
#' 
#' @return A vector of normalization factors, or for the 'S3norm' and '2cLinear'
#'   methods, a numeric matrix with a number of rows equal to the length 
#'   of `x`, and two columns indicating the alpha and beta terms.
#' 
#' @details 
#' The function computes per-sample (bigwig or bam file) normalization factors
#' using one of the following methods:
#' * The 'background' or 'SES' normalization method (they are synonyms here)
#' (Diaz et al., Stat Appl Gen et Mol Biol, 2012) assumes that the background
#' noise should on average be the same across experiments, an assumption that 
#' works well in practice when there are not very large differences in 
#' signal-to-noise ratio. The implementation uses the trimmed mean number of 
#' reads in random windows with non-zero counts.
#' * The 'MAnorm' approach (Shao et al., Genome Biology 2012) assumes that 
#' regions that are commonly enriched (i.e. common peaks) in two experiments 
#' should on average have the same signal in the two experiments. If the data is
#' strictly positive (e.g. counts or CPMs), TMM normalization (Robinson et al.,
#' Genome Biology 2010) is then employed on the common peaks. When done on more 
#' than two samples, the sample with the highest number of overlaps with other
#' samples is used as a reference.
#' * The 'enriched' approach assumes that enriched regions are on average 
#' similarly enriched across samples. Contrarily to 'MAnorm', these regions do 
#' not need to be in common across samples/experiments. Note that trimming via
#' the `trim` parameter is performed before calculating means.
#' * The 'top' approach assumes that the maximum enrichment (after trimming 
#' according to the `trim` parameter) in peaks is the same across 
#' samples/experiments.
#' * The 'S3norm' (Xiang et al., NAR 2020) and '2cLinear' methods try to 
#' normalize both enrichment and background simultaneously. S3norm does this in 
#' a log-linear fashion (as in the publication), while '2cLinear' does it on 
#' the original scale.
#' 
#' Several of the methods rely on random regions to estimate background levels.
#' This means that they will not be entirely deterministic, although normally 
#' quite reproducible with experiments that are not very sparse. For fully 
#' reproducible results, be sure to set the random seed.
#' 
#' When the data is very sparse (e.g. low sequencing depth, or compiling single
#' nucleotide hits rather than fragment coverage), it might be necessary to 
#' increase the `wsize` and `nwind` to get more robust estimates.
#'
#' @return A vector of normalization factors or, for methods 'S3norm' and 
#'   '2cLinear', a matrix of per-sample normalization parameters.
#' @export
#' @examples
#' # we get an example bigwig file, and use it twice:
#' bw <- system.file("extdata/example_atac.bw", package="epiwraps")
#' bw <- c(sample1=bw, sample2=bw)
#' # we indicate to use only chr1, because it's the only one in the example file
#' getNormFactors(bw, useSeqLevels="1")
getNormFactors <- function(x, method=c("background","SES","enriched","top",
                                       "MAnorm","S3norm","2cLinear"),
                           wsize=10L, nwind=20000L, peaks=NULL, trim=0.02,
                           useSeqLevels=NULL, paired=NULL, ..., verbose=TRUE){
  method <- match.arg(method)
  if(is(x, "GRanges") || length(x)==1)
    stop("Can't normalize a single sample on its own!")
  
  if(all(vapply(x, FUN=is.character, FUN.VALUE=logical(1)))){
    # input are file paths
    inType <- .parseFiletypeFromName(x, requireUnique=TRUE)
    stopifnot(inType %in% c("bam","bw"))
    if(inType=="bam"){
      if(is.null(paired))
        message("`paired` unspecified, assuming the data to be single-end...")
      paired <- FALSE
      chrsizes <- lapply(x, FUN=function(x) seqlengths(BamFile(x)))
    }else{
      chrsizes <- lapply(x, FUN=function(x) seqlengths(BigWigFile(x)))
    }
  }else if(all(vapply(x, class2="GRanges", FUN=is, FUN.VALUE=logical(1)))){
    # input are granges
    stop("Not currently implemented with this type of input.")
  }else if(all(vapply(x, class2="RleList", FUN=is, FUN.VALUE=logical(1)))){
    # input are RleList
    chrsizes <- lapply(x, lengths)
  }else{
    stop("`x` is of unknown format or contains multiple formats.")
  }
  
  # check chr matching
  tt <- table(unlist(lapply(chrsizes, names)))
  tt <- names(tt)[tt==length(x)]
  if(length(tt)==0) stop("The files appear to have no chromosome in common.")
  chrsizes <- do.call(cbind, lapply(chrsizes, FUN=function(x) x[tt]))
  chrsizes <- matrixStats::rowMins(chrsizes)
  names(chrsizes) <- tt
  if(!is.null(useSeqLevels)){
    if(!all(useSeqLevels %in% names(chrsizes)))
      stop("Some of the requested seqlevels are not found in the data!")
    chrsizes <- chrsizes[useSeqLevels]
  }
  stopifnot(length(chrsizes)>0)
  names(seqlvls) <- seqlvls <- names(chrsizes)
  
  
  if(!(method %in% c("background","SES")) && is.null(peaks))
    stop("The selected normalization method requires peaks.")
  if(is(peaks,"GRanges")) peaks <- list(peaks)
  peaks <- lapply(peaks, FUN=function(x){
    if(is.character(x)) x <- rtracklayer::import(x)
    if(!is.null(useSeqLevels)) x <- x[seqnames(x) %in% useSeqLevels]
    x
  })
  
  if(method %in% c("MAnorm","top","enriched")){
    if(verbose) message("Comparing coverage in peaks...")
    return(switch(method,
      "MAnorm"=.MAnorm(x, peaks, trim=trim, paired=paired, ...),
      "top"=.topNorm(x, peaks, trim=trim, paired=paired, ...),
      "enriched"=.topNorm(x, peaks, trim=trim, paired=paired, useMean=TRUE, ...)
    ))
  }
  
  if(verbose) message("Comparing coverage in random regions...")
  windows <- .randomTiles(chrsizes, nwind, wsize)
  wc <- do.call(cbind, lapply(x, windows=windows, ..., FUN=.getCovVals))
  w <- which(rowSums(is.na(wc))==0 & rowSums(wc)>0)
  windows <- windows[w]
  wc <- wc[w,]
  if(any(colSums(wc)<50))
    warning("Some samples have less than 50 non-zero windows. Consider ",
            "increasing the window size or (better) the number of windows.")
  
  if(method %in% c("background","SES")){
    nf <- apply(wc, 2, FUN=function(x){
      if(sum(x!=0)<10)
        warning("Too few non-zero windows, please increase `nwind`.")
      y <- mean(x, trim=trim, na.rm=TRUE)
      if(y==0) y <- mean(x, na.rm=TRUE)
      y
    })
    return(setNames(median(nf, na.rm=TRUE)/nf, names(x)))
  }
  if(verbose) message("Comparing coverage in peaks...")
  wc <- wc[!overlapsAny(windows, 
                        reduce(unlist(GRangesList(peaks), use.names=FALSE))),]
  .S3norm(x, peaks, bgWindows=wc, chrsizes, trim=trim, method=method, 
          wsize=wsize, ...)
}

#' @describeIn getNormFactors deprecated in favor of getNormFactors
#' @export
bwNormFactors <- function(x, ...){
  .Deprecated(old="bwNormFactors", new="getNormFactors",
              msg=paste("bwNormFactors is deprecated, please gradually switch",
                        "to `getNormFactors`."))
  getNormFactors(x, ...)
}

.MAnorm <- function(x, peaks, trim=0.02, ...){
  if(length(peaks)==1) peaks <- peaks[[1]]
  stopifnot(is(peaks, "GRanges") || length(peaks)==length(x))
  if(is(peaks, "GRanges")){
    refP <- sort(peaks)
    peaks <- lapply(seq_len(length(x)), FUN=function(x) peaks)
    ref <- 1
  }else{
    ref <- .getRefSampleFromPeaks(peaks)
    refP <- sort(peaks[[ref]])
  }
  refP$ID <- seq_along(refP)
  peaks <- lapply(peaks, FUN=function(x){
    if(identical(refP,x)) return(NULL)
    refP[overlapsAny(refP, x)]$ID
  })
  refC <- .getCovVals(x[[ref]], refP, ...)
  nf <- mapply(p=peaks, x=x, FUN=function(p,x){
    if(is.null(p)) return(1)
    co <- .getCovVals(x, refP[p], ...)
    if(any(refC<0))
      return(mean(refC[p],trim=trim)/mean(co, trim=trim))
    nf <- edgeR::calcNormFactors(cbind(refC[p], co))
    nf[1]/nf[2]
  })
  setNames(nf, names(x))
}

.topNorm <- function(x, peaks, trim=0.02, useMean=FALSE, ...){
  stopifnot(is.list(peaks) && is(peaks[[1]], "GRanges"))
  stopifnot(length(peaks)==1 || length(peaks)==length(x))
  sf <- mapply(x=x, p=peaks, FUN=function(x,p){
    cv <- .getCovVals(x, p, ...)
    if(useMean) return(mean(cv, na.rm=TRUE, trim=trim))
    as.numeric(quantile(cv, 1-trim, na.rm=TRUE))
  })
  median(sf)/sf
}

.S3norm <- function(x, peaks, bgWindows, chrsizes, trim, wsize=10L, 
                    method="S3norm", ...){
  if(any(colSums(bgWindows)<50))
    warning("Some samples have less than 50 non-zero windows. Consider ",
            "increasing the window size or (better) the number of windows.")
  
  cost <- function(p, x1, x2, bg1, bg2){
    a <- p["a"]
    b <- p["b"]
    if(method=="S3norm"){
      x1 <- a*(x1)^b
      bg1 <- a*(bg1[bg1>0])^b
    }else{
      x1 <- a+b*x1
      bg1 <- a+(bg1[bg1>0])*b
    }
    abs(mean(x1, trim=trim)-mean(x2, trim=trim)) +
      abs(mean(bg1, trim=trim) - mean(bg2[bg2>0L], trim=trim))
  }
  
  common <- NULL
  if(length(peaks)==1){
    common <- peaks[[1]]
    ref <- 1L
  }else{
    ref <- .getRefSampleFromPeaks(peaks)
  }
  nf <- lapply(seq_along(x), FUN=function(i){
    if(is.null(common)){
      common <- reduce(resize(intersect(peaks[[ref]],peaks[[i]]), width=wsize))
      common <- common[seqnames(common) %in% names(chrsizes)]
      common <- keepSeqlevels(common, seqlevelsInUse(common), pruning.mode="coarse")
      seqlengths(common) <- chrsizes[seqlevels(common)]
    }
    x1 <- .getCovVals(x[[ref]], common, ...)
    x2 <- .getCovVals(x[[i]], common, ...)
    bg1 <- bgWindows[,ref]
    bg2 <- bgWindows[,i]
    optim(c(a=ifelse(method=="S3norm",1,0), b=1), 
          x1=x1, x2=x2, bg1=bg1, bg2=bg2, fn=cost)$par
  })
  
  nf <- do.call(rbind, nf)
  row.names(nf) <- names(x)
  colnames(nf) <- letters[1:2]
  attributes(nf)$applyLinearly <- method=="2cLinear"
  nf
}


#' @importFrom IRanges IRangesList
.randomTiles <- function(chrsizes, nwind, wsize){
  winPerChr <- round(nwind*(chrsizes/sum(chrsizes)))
  winPerChr <- winPerChr[winPerChr>=1]
  windows <- as(IRangesList(lapply(setNames(names(winPerChr),names(winPerChr)),
       FUN=function(x){
         possibleWindows <- floor(chrsizes[x]/wsize)
         IRanges(sort(1L+wsize*(sample.int(possibleWindows, winPerChr[x])-1L)), 
                 width=wsize)
       })), "GRanges")
  seqlengths(windows) <- chrsizes[seqlevels(windows)]
  windows
}

.getCovVals <- function(x, windows, paired=FALSE, ...){
  if(is.character(x)){
    if(.parseFiletypeFromName(x)=="bam"){
      if(paired){
        x <- readGAlignmentPairs(BamFile(x),
                                 param=ScanBamParam(which=windows, ...))
      }else{
        x <- readGAlignments(BamFile(x), param=ScanBamParam(which=windows, ...))
      }
      x <- coverage(x)
    }else{
      x <- rtracklayer::import(x, format="BigWig", as="RleList",
                               selection=BigWigSelection(windows))
    }
  }
  windows <- sort(windows)
  y <- rep(NA_integer_, length(windows))
  w <- which(as.factor(seqnames(windows)) %in% names(x))
  if(length(w)==0) stop("No window found in coverage! Wrong seqlevel style?")
  windows <- windows[w]
  windows <- keepSeqlevels(windows, seqlevelsInUse(windows), pruning.mode="coarse")
  windows <- split(ranges(windows),seqnames(windows),drop=TRUE)
  y[w] <- unlist(viewMaxs(Views(x[names(windows)], windows)),
                 use.names=FALSE)
  y
}

.getRefSampleFromPeaks <- function(peaks){
  if(is.list(peaks) && length(peaks)==1) return(1L)
  po <- sapply(seq_along(peaks), FUN=function(i){
    sapply(seq_along(peaks), FUN=function(j){
      if(i==j) return(NA_integer_)
      sum(overlapsAny(peaks[[i]], peaks[[j]]))
    })
  })
  ref <- which.max(matrixStats::rowMaxs(po,na.rm=TRUE))
  if(min(po[ref,],na.rm=TRUE)<100){
    if(min(po[ref,],na.rm=TRUE)<1){
      stop("Some of the experiments have no peak in common!")
    }
    warning("Some of the experiments have few (<100) peaks in common.",
            "The calculated factors might be inaccurate.")
  }
  ref
}

#' @export
#' @describeIn renormalizeSignalMatrices deprecated > renormalizeSignalMatrices
renormalizeBorders <- function(ml, trim=NULL, assay="input", nWindows=NULL){
  .Deprecated(
    old="renormalizeBorders", new="renormalizeSignalMatrices",
    msg=paste('renormalizeBorders is deprecated, please gradually',
              'switch to `renormalizeSignalMatrices(..., method="border"`.'))
  renormalizeSignalMatrices(ml, trim=trim, assay=assay, nWindows=nWindows,
                            method="border")
}


#' renormalizeSignalMatrices
#'
#' Renormalizes a list of signal matrices or an EnrichmentSE object.
#'
#' @param ml A named matrix list or EnrichmentSE object as produced by 
#'   \code{\link{signal2Matrix}}.
#' @param method Either "border" or "top" (see details below).
#' @param trim Quantiles trimmed at each extreme before calculating 
#'   normalization factors.
#' @param fromAssay Assay to use (ignored unless `ml` is an EnrichmentSE 
#'   object), defaults to the first assay.
#' @param toAssay Assay in which to store the normalized data (ignored unless 
#'   `ml` is an EnrichmentSE object). By default an assay name will be set based
#'   on the normalization method used.
#' @param scaleFactors A numeric vector of same length as `ml`, 
#'   indicating the scaling factors by which to multiply each matrix.
#'   Alternatively, a numeric matrix with a number of rows equal to the length 
#'   of `ml`, and two columns indicating the alpha and beta arguments of a 
#'   s3norm normalization. Ignored unless `method="manual"`.
#'
#' @details
#' * `method="border"` works on the assumption that the left/right borders of the 
#' matrices represent background signal which should be equal across samples. As
#' a result, it will work only if 1) the left/right borders of the matrices are 
#' sufficiently far from the signal (e.g. peaks) to be chiefly noise, and 
#' 2) the signal-to-noise ratio is comparable across tracks/samples.
#' * `method="top"` instead works on the assumption that the highest signal should
#' be the same across tracks/samples.
#' By default, extreme values are trimmed before establishing either kind of 
#' normalization factor. The proportion trimmed can be set using the `trim` 
#' argument, and is by default 10% of the non-zero values.
#' * `method="manual"` enables the use of independently computed normalization
#' factors, for instance obtained through \code{\link{getNormFactors}}.
#'
#' @return Either a renormalized list of signal matrices or, if `ml` was an 
#'   `EnrichmentSE` object, the same object with an additional normalized
#'   assay automatically put at the front.
#' @export
#' @examples
#' # we first get an EnrichmentSE object:
#' data(exampleESE)
#' # we normalize them
#' m <- renormalizeSignalMatrices(m)
#' # see the `vignette("multiRegionPlot")` for more info on normalization.
renormalizeSignalMatrices <- function(ml, method=c("border","top","manual"), 
                                      trim=NULL, fromAssay="input", toAssay=NULL,
                                      nWindows=NULL, scaleFactors=NULL, ...){
  if(missing(method) && !is.null(scaleFactors)) method <- "manual"
  method <- match.arg(method)
  if(is(ml, "EnrichmentSE")){
    ml2 <- renormalizeSignalMatrices(.ese2ml(ml,assay=fromAssay), method=method,
                                     trim=trim, nWindows=nWindows,
                                     toAssay=toAssay, scaleFactors=scaleFactors)
    if(is.null(toAssay))
      toAssay <- switch(method,
                         "border"="borderNormalized",
                         "top"="topNormalized",
                         "normalized")
    return(addAssayToESE(ml, a=ml2, name=toAssay, replace=TRUE))
  }
  if(method=="manual"){
    stopifnot(!is.null(scaleFactors) && length(scaleFactors)==length(ml))
  }else{
    if(!is.null(scaleFactors)) 
      message('`scaleFactors` is ignored when method=!"manual"')
  }
  if(is.null(nWindows)) nWindows <- max(floor(ncol(ml[[1]])/10),1)
  ml <- .comparableMatrices(ml, checkAttributes=TRUE)
  b <- do.call(cbind, lapply(ml, FUN=function(x){
    x <- unclass(x)
    as.numeric(cbind(x[,seq_len(nWindows)],
                     x[,seq(from=ncol(x)-nWindows+1L,to=ncol(x))]))
  }))
  if(method=="border"){
    if(is.null(trim)) trim <- (sum(b!=0)/length(b))/10
    nf <- apply(b, 2, FUN=function(x){
      y <- mean(x, trim=trim)
      if(y==0) y <- mean(x)
      y
    })
    nf <- nf/median(nf)
  }else if(method=="top"){
    if(is.null(trim)) trim <- 0.005
    nf <- apply(b, 2, FUN=function(x){
      y <- quantile(x, 1-trim)
      if(y==0) y <- max(x)
      y
    })
    nf <- nf/median(nf)
  }else if(method=="manual"){
    nf <- 1/scaleFactors
  }
  ml <- .applyMatricesScaling(ml, 1/nf)
  ml
}

# @param ml A named matrix list as produced by \code{\link{signal2Matrix}}.
# @param scaleFactors A numeric vector of same length as `ml`, 
#   indicating the scaling factors by which to multiply each matrix.
#   Alternatively, a numeric matrix with a number of rows equal to the length 
#   of `ml`, and two columns indicating the alpha and beta arguments of a 
#   s3norm normalization.
# @return A renormalized list of signal matrices.
.applyMatricesScaling <- function(ml, scaleFactors, applyLinearly=NULL){
  ml <- .comparableMatrices(ml, checkAttributes=TRUE)
  if(is.matrix(scaleFactors)){
    stopifnot(ncol(scaleFactors)==2 && nrow(scaleFactors)==length(ml))
    if(is.null(applyLinearly)){
      if(is.null(applyLinearly <- attributes(scaleFactors)[["applyLinearly"]]))
        stop("Could not determine whether the scaling factors should be",
             "applied linearly or on the log scale. Please use the ",
             "`applyLinearly` argument.")
    }
    for(i in seq_along(ml)){
      if(applyLinearly){
        ml[[i]] <- scaleFactors[i,1]+ml[[i]]*scaleFactors[i,2]
      }else{
        ml[[i]] <- scaleFactors[i,1]*ml[[i]]^scaleFactors[i,2]
      }
    }
  }else{
    stopifnot(is.numeric(scaleFactors) && length(scaleFactors)==length(ml))
    for(i in seq_along(scaleFactors)) ml[[i]] <- ml[[i]]*scaleFactors[i]
  }
  ml
}
ETHZ-INS/epiwraps documentation built on July 14, 2024, 6:50 p.m.