R/readSegmentationFile.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Documented in readSegmentationFile

#' Read file containing segmentations
#' 
#' Read segmentation files produced by DNAcopy, CNVkit or GATK4.
#' 
#' @param seg.file File with segmentation
#' @param sampleid Sampleid, for segmentation files containing multiple samples
#' @param model.homozygous Unless \code{TRUE}, checks for very small log2-ratios
#' that cannot happen in samples with normal contamination
#' @param format File format. If missing, derived from the file 
#' extension. Currently DNAcopy, and GATK4 
#' (ModelSegments) format supported. CNVkit uses DNAcopy format.
#' @param zero Start position is 0-based. Default is \code{FALSE}.
#' @param verbose Verbose output.
#' @return A \code{data.frame}.
#' @author Markus Riester
#' @examples
#' 
#' seg.file <- system.file("extdata", "example_seg.txt",
#'     package = "PureCN")
#' seg <- readSegmentationFile(seg.file, "Sample1")
#' 
#' @export readSegmentationFile
readSegmentationFile <- function(seg.file, sampleid, model.homozygous = FALSE,
    format, zero = FALSE, verbose = TRUE) {
    if (is.null(seg.file)) return(NULL)
    seg <- read.delim(seg.file, comment.char = "@")
    if (missing(format)) format <- .getSegFormat(seg)
    if (format == "GATK4") seg <- .convertSegGATK4(seg, sampleid)
    if (verbose) flog.info("Loaded provided segmentation file %s (format %s).", 
        basename(seg.file), format)
    .checkSeg(seg, sampleid, model.homozygous, verbose)
}
.getSegFormat <- function(seg) {
    if (colnames(seg)[1] == "CONTIG") return("GATK4")
    return("DNAcopy")
}
.convertSegGATK4 <- function(seg, sampleid) {
    data.frame(
        ID = sampleid,
        chrom = seg$CONTIG,
        loc.start = seg$START,
        loc.end = seg$END,
        num.mark = seg$NUM_POINTS_COPY_RATIO,
        seg.mean = seg$LOG2_COPY_RATIO_POSTERIOR_50
    )
}    
.checkSeg <- function(seg, sampleid, model.homozygous, verbose=TRUE) {
    
    required.colnames <- c("ID", "chrom", "loc.start", "loc.end", "num.mark", 
        "seg.mean")
    required.colnames2 <- c("ID", "chromosome", "start", "end", "num_probes", 
        "mean")
    if (ncol(seg) > length(required.colnames)) {
        seg <- seg[,seq_along(required.colnames)]
    }    
    if (identical(colnames(seg), required.colnames2)) {
        colnames(seg) <- required.colnames
    }    

    if (!identical(as.character(as.numeric(seg$chrom)), as.character(seg$chrom))) {
        flog.warn("Expecting numeric chromosome names in seg.file, assuming file is properly sorted.")
        seg$chrom <- .strip.chr.name(seg$chrom, .getChrHash(seg$chrom))
    }    
    
    if (any(is.na(seg$chrom) | is.na(seg$loc.start) | is.na(seg$loc.end))) {
        flog.warn("Coordinates in seg.file contain missing values.")
    }
    # The smallest possible log-ratio is about 8
    # for 0.99 purity and high ploidy.
    # remove artifacts with lower log-ratio
    if (!model.homozygous && min(seg$seg.mean, na.rm=TRUE) < -8) {
        nBefore <- nrow(seg)
        seg <- seg[which(seg$seg.mean >= -8 | seg$num.mark >= 4),]
        if (verbose) flog.warn("Removing %i short segments with log-ratio < -8.", 
            nBefore-nrow(seg))
    }    

    if (!identical(colnames(seg), required.colnames)) {
        .stopUserError(paste("Segmentation file expected with colnames", 
                paste(required.colnames, collapse = ", ")))
    }
    segs <- split(seg, seg$ID)
    matchedSeg <- match(make.names(sampleid), make.names(names(segs)))

    if (length(segs)==1) {
        if (!is.null(sampleid) && is.na(matchedSeg)) {
            flog.warn("Provided sampleid (%s) does not match %s found in %s",
                      sampleid, names(segs)[1], "segmentation.")
        }   
        matchedSeg <- 1 
    } else if (is.null(sampleid)) {
        .stopUserError("seg.file contains multiple samples and sampleid missing.")
    } else if (is.na(matchedSeg)) {
        .stopUserError("seg.file contains multiple samples and sampleid does not match any.")
    } else {
        seg <- segs[[matchedSeg]]
    }
    offset <- weighted.mean(seg$seg.mean, seg$num.mark, na.rm = TRUE)
    if (abs(offset) > 0.001 && verbose) {
        flog.info("Re-centering provided segment means (offset %.4f).", offset)
    }
    seg$seg.mean <- seg$seg.mean - offset
    seg
}

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/readSegmentationFile.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions .checkSeg .convertSegGATK4 .getSegFormat readSegmentationFile

Documented in readSegmentationFile

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN Copy number calling and SNV classification using targeted short read sequencing

R/readSegmentationFile.R In PureCN: Copy number calling and SNV classification using targeted short read sequencing

Defines functions .checkSeg .convertSegGATK4 .getSegFormat readSegmentationFile

Documented in readSegmentationFile

Try the PureCN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/readSegmentationFile.R
In PureCN: Copy number calling and SNV classification using targeted short read sequencing