R/processMultipleSamples.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Documented in processMultipleSamples

#' Multi sample normalization and segmentation
#' 
#' This function performs normalization and segmentation when multiple 
#' for the same patient are available. 
#' 
#' 
#' @param tumor.coverage.files Coverage data for tumor samples.
#' @param sampleids Sample ids, used in output files.
#' @param normalDB Database of normal samples, created with
#' \code{\link{createNormalDatabase}}.
#' @param num.eigen Number of eigen vectors used.
#' @param genome Genome version, for example hg19. Needed to get centromere
#' positions.
#' @param plot.cnv Segmentation plots.
#' @param min.interval.weight Can be used to ignore intervals with low weights.
#' @param w Weight of samples. Can be used to downweight poor quality samples.
#' If \code{NULL}, sets to inverse of median on-target duplication rate if
#' available, otherwise does not do any weighting.
#' @param max.segments If not \code{NULL}, try a higher \code{undo.SD}
#' parameter if number of segments exceeds the threshold.
#' @param chr.hash Mapping of non-numerical chromsome names to numerical names
#' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are
#' properly ordered.
#' @param centromeres A \code{GRanges} object with centromere positions.
#' @param ... Arguments passed to the segmentation function.
#' @return \code{data.frame} containing the segmentation.
#' @author Markus Riester
#' @references Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O.,
#' Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 
#' Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and
#' multi-track copy number segmentation." BMC Genomics, 13(1), 591.
#'
#' @seealso \code{\link{runAbsoluteCN}}
#' @examples
#' 
#' normal1.coverage.file <- system.file("extdata", "example_normal.txt", 
#'     package="PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt", 
#'     package="PureCN")
#' tumor1.coverage.file <- system.file("extdata", "example_tumor.txt",
#'     package="PureCN")
#' tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt",
#'     package="PureCN")
#'
#' normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file)
#' tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file)
#'
#' normalDB <- createNormalDatabase(normal.coverage.files)
#'
#' seg <- processMultipleSamples(tumor.coverage.files, 
#'          sampleids = c("Sample1", "Sample2"),
#'          normalDB = normalDB,
#'          genome = "hg19")
#'
#' @export processMultipleSamples
processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, 
    num.eigen = 20, genome, plot.cnv = TRUE, w = NULL,
    min.interval.weight = 1/3,
    max.segments = NULL, chr.hash = NULL, centromeres = NULL, ...) {

    if (!requireNamespace("copynumber", quietly = TRUE)) {
        .stopUserError("processMultipleSamples requires the copynumber package.")
    }
    tumors <- lapply(.readNormals(tumor.coverage.files), 
        calculateTangentNormal, normalDB, num.eigen = num.eigen)

    interval.weights <- NULL
    intervalsUsed <- rep(TRUE, length(tumors[[1]]))
    if (!is.null(normalDB$sd$weights) && !is.null(min.interval.weight)) {
       interval.weights <- normalDB$sd$weights$weights
       if (length(interval.weights) != length(tumors[[1]])) {
            # should not happen because it is checked upstream
            .stopRuntimeError("interval.weights and tumors does not align.")
       }    
       flog.info("Interval weights found, but currently not supported by copynumber. %s",
            "Will simply exclude intervals with low weight.")
       lowWeightIntervals <- interval.weights < min.interval.weight
       intervalsUsed[which(lowWeightIntervals)] <- FALSE
    }
    #MR: fix for missing chrX/Y 
    intervalsUsed[is.na(intervalsUsed)] <- FALSE
    if (is.null(chr.hash)) chr.hash <- .getChrHash(seqlevels(tumors[[1]]))
    intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumors[[1]], chr.hash)
    centromeres <- .getCentromerePositions(centromeres, genome, 
            if (is.null(tumors[[1]])) NULL else seqlevelsStyle(tumors[[1]]))
    if (is.null(centromeres)) {
        .stopUserError("Cannot find centromeres for ", genome,
            ". Provide them manually or select a supported genome.")
    }    
    armLocations <- .getArmLocations(tumors[[1]], chr.hash, centromeres)
    armLocationsGr <- GRanges(armLocations)
    arms <- armLocationsGr$arm[findOverlaps(tumors[[1]], armLocationsGr, select="first")]
    lrs <- data.frame(do.call(cbind, lapply(tumors, function(x) unlist(x$log.ratio))))
    colnames(lrs) <- sampleids
    lrs <- data.frame(chrom = .strip.chr.name(seqnames(tumors[[1]]), chr.hash), 
                      pos = start(tumors[[1]]), lrs)
    # ignore arms with only 2 or fewer probes
    carms <- paste0(lrs[,1], arms)
    intervalsUsed <- as.logical(intervalsUsed & complete.cases(lrs) 
                                & !is.na(arms)) & table(carms)[carms] > 2

    lrs <- lrs[intervalsUsed,]
    arms <- arms[intervalsUsed]
    lrsw <- copynumber::winsorize(lrs, arms = arms, verbose = FALSE)
    if (is.null(w)) {
        w <- 1
        dupr <- vapply(tumors, function(x) 
                       median(x[x$on.target]$duplication.rate, na.rm = TRUE),
                       double(1))
        if (!sum(is.na(dupr)) && min(dupr, na.rm = TRUE) > 0) { 
            w <- (1 / dupr)
            w <- w / max(w)

            flog.info("Setting weights by duplication rate. Lowest weight for %s (%.2f), heighest for %s.",
                sampleids[which.min(w)], min(w), sampleids[which.max(w)])
        }
    }
    lrsm <- copynumber::multipcf(lrsw, arms = arms, w = w, ...)
    if (plot.cnv) {
        copynumber::plotHeatmap(segments = lrsm, upper.lim = 1)
        copynumber::plotGenome(lrsw, segments = lrsm, onefile = TRUE)
    }
    idx.enough.markers <- lrsm$n.probes > 1
    rownames(lrsm) <- NULL
    lrsm[idx.enough.markers,]
    #transform to DNAcopy format
    m <- data.table::melt(data.table(lrsm), id.vars=1:5)
    m <- m[, c(6,1,3,4,5,7)]
    colnames(m) <- c("ID", "chrom", "loc.start", "loc.end", "num.mark", "seg.mean")
    data.frame(m)
}

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/processMultipleSamples.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions processMultipleSamples

Documented in processMultipleSamples

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN Copy number calling and SNV classification using targeted short read sequencing

R/processMultipleSamples.R In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions processMultipleSamples

Documented in processMultipleSamples

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/processMultipleSamples.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing