R/filterIntervals.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Documented in filterIntervals

#' Remove low quality intervals
#' 
#' This function determines which intervals in the coverage files should be
#' included or excluded in the segmentation. It is called via the
#' \code{fun.filterIntervals} argument of \code{\link{runAbsoluteCN}}. The
#' arguments are passed via \code{args.filterIntervals}.
#' 
#' 
#' @param normal Coverage data for normal sample.
#' @param tumor Coverage data for tumor sample.
#' @param log.ratio Copy number log-ratios, one for each interval in the
#' coverage file.
#' @param seg.file If not \code{NULL}, then do not filter intervals, because data
#' is already segmented via the provided segmentation file.
#' @param filter.lowhigh.gc Quantile q (defines lower q and upper 1-q) for
#' removing intervals with outlier GC profile. Assuming that GC correction might
#' not have been worked on those. Requires \code{interval.file}.
#' @param min.coverage Minimum coverage in both normal and tumor. Intervals with
#' lower coverage are ignored. If a \code{normalDB} is provided, then this
#' database already provides information about low quality intervals and the
#' \code{min.coverage} is set to \code{min.coverage/10000}.
#' @param min.targeted.base Exclude intervals with targeted base (size in bp)
#' smaller than this cutoff. This is useful when the same interval file was
#' used to calculate GC content. For such small targets, the GC content is
#' likely very different from the true GC content of the probes.
#' @param normalDB Normal database, created with
#' \code{\link{createNormalDatabase}}.
#' @return \code{logical(length(log.ratio))} specifying which intervals should be
#' used in segmentation.
#' @author Markus Riester
#' @examples
#' 
#' normal.coverage.file <- system.file("extdata", "example_normal.txt", 
#'     package="PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt", 
#'     package="PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#' 
#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt", 
#'     package="PureCN")
#' vcf.file <- system.file("extdata", "example.vcf.gz", 
#'     package="PureCN")
#' interval.file <- system.file("extdata", "example_intervals.txt", 
#'     package="PureCN")
#' 
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example.  This is not a good idea for real
#' # samples.
#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file,
#'     tumor.coverage.file=tumor.coverage.file, genome="hg19", vcf.file=vcf.file,
#'     sampleid="Sample1", interval.file=interval.file, normalDB=normalDB,
#'     args.filterIntervals=list(min.targeted.base=10), max.ploidy=4, 
#'     test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#' 
#' @export filterIntervals
filterIntervals <- function(normal, tumor, log.ratio, seg.file, 
    filter.lowhigh.gc = 0.001, min.coverage = 15, min.targeted.base = 5, 
    normalDB = NULL) {
    intervalsUsed <- .filterIntervalsNotNA(log.ratio)
    # With segmentation file, ignore all filters
    if (!is.null(seg.file)) return(intervalsUsed)

    if (!is.null(tumor$gc_bias)) {
        .checkFraction(filter.lowhigh.gc, "filter.lowhigh.gc")
        intervalsUsed <- .filterIntervalsLowHighGC(intervalsUsed, tumor,
            filter.lowhigh.gc)
    }
    intervalsUsed <- .filterIntervalsNormalDB(intervalsUsed, tumor, normalDB)

    intervalsUsed <- .filterIntervalsTargetedBase(intervalsUsed, tumor,
        min.targeted.base)
    intervalsUsed <- .filterIntervalsTotalNormalCoverage(intervalsUsed, normal,
        min.targeted.base, min.coverage)

    if (!is.null(normalDB)) {
        min.coverage <- min.coverage/10000
        flog.info("normalDB provided. Setting minimum coverage for segmentation to %.4fX.", min.coverage)
    } else {
        flog.warn("No normalDB provided. Provide one for better results.")
    }    
        
    intervalsUsed <- .filterIntervalsCoverage(intervalsUsed, normal, tumor, 
        min.coverage)

    return(intervalsUsed)
}

.checkNormalDB <- function(tumor, normalDB) {
    if (!is(normalDB, "list")) {
        .stopUserError("normalDB not a valid normalDB object. ",
            "Use createNormalDatabase to create one.")
    }    
    if (is.null(normalDB$version) || normalDB$version < 4) {
        .stopUserError("normalDB incompatible with this PureCN version. ",
                       "Please re-run NormalDB.R.")
    }
    if (is.null(normalDB$normal.coverage.files) ||
        !length(normalDB$normal.coverage.files)) {
        .stopUserError("normalDB appears to be empty.")
    }    
    intervals <- normalDB[["intervals"]]
    # TODO remove in PureCN 1.14 
    if (is.null(intervals)) {
        intervals <- as.character(readCoverageFile(normalDB$normal.coverage.files[1]))
    }
    return(identical(intervals, as.character(tumor)))
}

.filterIntervalsNotNA <- function(log.ratio) {
    nBefore <- length(log.ratio) 
    # NA's in log.ratio confuse the CBS function
    intervalsUsed <- !is.na(log.ratio) & !is.infinite(log.ratio) 
    nAfter <- sum(intervalsUsed)

    if (nAfter < nBefore) {
        flog.info("Removing %i intervals with missing log.ratio.", 
            nBefore-nAfter)
    }

    intervalsUsed
}
        
.filterIntervalsNormalDB <- function(intervalsUsed, tumor, normalDB,
normalDB.min.coverage, normalDB.max.missing) {
    if (is.null(normalDB)) return(intervalsUsed)
    # make sure that normalDB matches tumor
    if (!.checkNormalDB(tumor, normalDB)) {
        flog.warn("normalDB does not align with coverage. Ignoring normalDB.")
        return(intervalsUsed)
    }    

    nBefore <- sum(intervalsUsed) 
    intervalsUsed <- intervalsUsed & normalDB$intervals.used
    nAfter <- sum(intervalsUsed)

    if (nAfter < nBefore) {
        flog.info("Removing %i intervals excluded in normalDB.", 
            nBefore-nAfter)
    }
    intervalsUsed
}

.filterIntervalsChrHash <- function(intervalsUsed, tumor, chr.hash) {
    if (is.null(chr.hash)) return(intervalsUsed)
    nBefore <- sum(intervalsUsed)
    intervalsUsed <- intervalsUsed & seqnames(tumor) %in% chr.hash$chr
    nAfter <- sum(intervalsUsed)

    if (nAfter < nBefore) {
        flog.info("Removing %i intervals on chromosomes outside chr.hash.", 
            nBefore-nAfter)
    }
    intervalsUsed
}

.filterIntervalsTargetedBase <- function(intervalsUsed, tumor, min.targeted.base) {
    if (is.null(min.targeted.base)) return(intervalsUsed)
    nBefore <- sum(intervalsUsed)
    intervalsUsed <- intervalsUsed & width(tumor) >= min.targeted.base
    nAfter <- sum(intervalsUsed)
    if (nAfter < nBefore) {
        flog.info("Removing %i small (< %ibp) intervals.", nBefore-nAfter, 
            min.targeted.base)
    }    
    intervalsUsed
}

.filterIntervalsTotalNormalCoverage <- function(intervalsUsed, normal,
    min.targeted.base, min.coverage) {
    if (is.null(min.targeted.base) || is.null(min.coverage)) {
        return(intervalsUsed)
    }    
    nBefore <- sum(intervalsUsed)
    cutoff <- min.targeted.base * min.coverage * 2
    intervalsUsed <- intervalsUsed & normal$coverage >= cutoff
    nAfter <- sum(intervalsUsed)
    if (nAfter < nBefore) {
        flog.info("Removing %i intervals with low total coverage in normal (< %.2f reads).", 
            nBefore-nAfter, cutoff)
    }    
    intervalsUsed
}

.filterIntervalsCoverage <- function(intervalsUsed, normal, tumor, min.coverage) {
    #MR: we try to not remove homozygous deletions in very pure samples.
    #  to distinguish low quality from low copy number, we keep if normal
    # has good coverage. If normal coverage is very high, we adjust for that.  
    total.cov.normal <- sum(as.numeric(normal$coverage), na.rm = TRUE)
    total.cov.tumor <- sum(as.numeric(tumor$coverage), na.rm = TRUE)

    f <- max(total.cov.normal / total.cov.tumor, 1)

    well.covered.interval.idx <- (normal$average.coverage >= min.coverage &
        tumor$average.coverage >= min.coverage) | 
        (normal$average.coverage >= 1.5 * f * min.coverage &  
        tumor$average.coverage >= 0.5 * min.coverage)
    #MR: fix for missing chrX/Y
    well.covered.interval.idx[is.na(well.covered.interval.idx)] <- FALSE

    nBefore <- sum(intervalsUsed)
    intervalsUsed <- intervalsUsed & well.covered.interval.idx
    nAfter <- sum(intervalsUsed)
    if (nAfter < nBefore) {
        flog.info("Removing %i low coverage (< %.4fX) intervals.", nBefore-nAfter, 
            min.coverage)
    }    
    intervalsUsed
}

.filterIntervalsLowHighGC <- function(intervalsUsed, tumor, filter.lowhigh.gc) {
    qq <- quantile(tumor$gc_bias, p=c(filter.lowhigh.gc, 
        1-filter.lowhigh.gc), na.rm=TRUE)

    nBefore <- sum(intervalsUsed)
    intervalsUsed <- intervalsUsed & !is.na(tumor$gc_bias) & 
        !(tumor$gc_bias < qq[1] | tumor$gc_bias > qq[2])
    nAfter <- sum(intervalsUsed)

    if (nAfter < nBefore) {
        flog.info("Removing %i low/high GC targets.", nBefore-nAfter)
    }
    intervalsUsed
}

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/filterIntervals.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions .filterIntervalsLowHighGC .filterIntervalsCoverage .filterIntervalsTotalNormalCoverage .filterIntervalsTargetedBase .filterIntervalsChrHash .filterIntervalsNormalDB .filterIntervalsNotNA .checkNormalDB filterIntervals

Documented in filterIntervals

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN Copy number calling and SNV classification using targeted short read sequencing

R/filterIntervals.R In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions .filterIntervalsLowHighGC .filterIntervalsCoverage .filterIntervalsTotalNormalCoverage .filterIntervalsTargetedBase .filterIntervalsChrHash .filterIntervalsNormalDB .filterIntervalsNotNA .checkNormalDB filterIntervals

Documented in filterIntervals

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/filterIntervals.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing