R/facets-wrapper.R

Defines functions logRlogORspider plotSample procSample preProcSample readSnpMatrix

Documented in logRlogORspider plotSample preProcSample procSample readSnpMatrix

readSnpMatrix <- function(filename, skip=0L, err.thresh=Inf, del.thresh=Inf,
                          perl.pileup=FALSE, MandUnormal=FALSE, spanT=0.2,
                          spanA=0.2, spanX=0.2, gbuild="hg19",
                          ReferencePileupFile=NULL, ReferenceLoessFile=NULL,
                          MinOverlap=0.90, useMatchedX=FALSE, refX=FALSE, unmatched=FALSE, donorCounts=FALSE) {
  #' Read in the snp-pileup generated SNP read count matrix file 
  #' @importFrom utils read.csv
  #' @param filename counts file from snp-pileup
  #' @param skip (character) Skip n number of lines in the input file.
  #' @param err.thresh (numeric) Error threshold to be used to filter snp-pileup data frame.
  #' @param del.thresh (numeric) Deletion threshold to be used to filter snp-pileup data frame.
  #' @param perl.pileup (logical) Is the pileup data generated using perl pileup tool?
  #' @param MandUnormal (logical) Is CNLR analysis to be peformed using unmatched reference normals?
  #' @param spanT (numeric) Default span value to be used for loess normalization in tumor sample.
  #' @param unmatched (logical) is the tumor being analyzed unmatched
  #' @param spanA (numeric) Default span value to be used for loess normalization across autosomal chromosomes in the normal sample.
  #' @param spanX (numeric) Default span value to be used for loess normalization in Chr X in the normal sample.
  #' @param gbuild (character) Genome build (Default: hg19).
  #' @param ReferencePileupFile (character) Filepath to an optional snp-pileup generated pileup data of one or more reference normals.
  #' @param ReferenceLoessFile (character) Filepath to an optional loess data, generated using the facets2n package, of one or more reference normals. The number of normals in this data should match that in the ReferencePileupFile, and should be in the same order.
  #' @param MinOverlap (numeric) Mininum overlap fraction of loci between a tumor pileup and reference pileup data.
  #' @param useMatchedX (logical) Is the matched normal to be used for ChrX normalization?
  #' @param refX (logical) Use matched or reference normal for chrX normalization. excludes unmatched normals, such as pooled references, present in tumor counts matrix. 
  #' @param donorCounts (logical) is the counts matrix baseline donor sample(s)
  #' @return A dataframe of pileup depth values for Tumor and Matched Normal if MandUnormal is FALSE. Else, a list of data frame with pileup depth values of Tumor, matched Normal, and a best unmatched normal, and the associated span values.
  #' @export

  # could have been generated by original VES snp-pileup.pl code (perl)
  if (perl.pileup) {
    rcmat <- scan(pileupfilename, what=list(Chromosome="", Position=0,
                                      NOR.DP=0, NOR.RD=0, TUM.DP=0,
                                      TUM.RD=0), skip=skip)
    if (rcmat$Chromosome[1] == "chr1") {
      rcmat$Chromosome <- gsub("chr","",rcmat$Chromosome)
    }
    rcmat <- as.data.frame(rcmat, stringsAsFactors=FALSE)
  }
  else if (donorCounts) {
    tumor.pileup <- PreProcSnpPileup(filename, err.thresh,
                                     del.thresh, is.Reference=T, gbuild)
    colnames(tumor.pileup) = gsub("File", "Donor", colnames(tumor.pileup))
    return(tumor.pileup)
  }else {
    # read the read count matrix generated by snp-pileup.cpp code
    tumor.pileup <- PreProcSnpPileup(filename, err.thresh,
                      del.thresh, is.Reference=F, gbuild)
    if (MandUnormal) {
      tumor.loess <- MakeLoessObject(tumor.pileup, is.Reference=F)
      tumor.loess.key <- tumor.loess[,1]
      reference.loess <- NULL
      reference.pileup <- NULL
      rcmat=NULL
      if (!is.null(ReferencePileupFile) & !is.null(ReferenceLoessFile)) {
        reference.pileup <- PreProcSnpPileup(ReferencePileupFile, err.thresh,
                               del.thresh, is.Reference = T)
        reference.loess <- as.matrix(read.csv(ReferenceLoessFile, sep="\t",
                               stringsAsFactors=FALSE, header=T))
        rcmat <- FindBestNormalParameters(
          tumor.loess,
          tumor.pileup,
          reference.loess,
          reference.pileup,
          MinOverlap,
          useMatchedX, 
          refX, 
          unmatched
        )
      }
      else{
        tumor.pileup = tumor.pileup[which(tumor.pileup$key %in% tumor.loess.key),]
        rcmat <- FindBestNormalParameters(
        tumor.loess,
        tumor.pileup,
        reference.loess,
        reference.pileup,
        MinOverlap,
        useMatchedX, 
        refX, 
        unmatched
        )
      }
      return(rcmat)
    }
    else {
      rcmat <- subset(tumor.pileup, select=c(Chromosome, Position, Ref, Alt, File1DP, File1R, File2DP, File2R))
      colnames(rcmat) <- c("Chromosome", "Position","Ref","Alt", "NOR.DP", "NOR.RD", "TUM.DP", "TUM.RD")
      return(rcmat)
    }
  }
}


preProcSample <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, cval=25, deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10"), hetscale=TRUE, unmatched=FALSE, MandUnormal=FALSE, ndepthmax=5000, spanT=0.2, spanA=0.2, spanX=0.2,donorCounts=NULL) {
#' Pre-process a sample
#' @description Processes a snp read count matrix and generates a segmentation tree
#' @param rcmat data frame with 6 required columns: Chrom, Pos, NOR.DP, NOR.RD, TUM.DP and TUM.RD. Additional variables are ignored. Ref and Alt columns required for transplant cases with option donorCounts.
#' @param ndepth minimum normal sample depth to keep
#' @param het.thresh vaf threshold to call a SNP heterozygous
#' @param snp.nbhd window size
#' @param cval critical value for segmentation
#' @param deltaCN minimum detectable difference in CN from diploid state
#' @param gbuild genome build used for the alignment of the genome. Default value is human genome build hg19. Other possibilities are hg38 & hg18 for human and mm9 & mm10 for mouse. Chromosomes used for analysis are 1-22, X for humans and 1-19 for mouse. Option udef can be used to analyze other genomes.
#' @param hetscale (logical) variable to indicate if logOR should get more weight in the test statistics for segmentation and clustering. Usually only 10 % of snps are hets and hetscale gives the logOR contribution to T-square as 0.25/proportion of hets.
#' @param unmatched indicator of whether the normal sample is unmatched. When this is TRUE hets are called using tumor reads only and logOR calculations are different. Use het.thresh = 0.1 or lower when TRUE.
#' @param ndepthmax loci for which normal coverage exceeds this number (default is 1000) will be discarded as PCR duplicates. Fof high coverage sample increase this and ndepth commensurately.
#' @param MandUnormal analyzing both matched and unmatched normal for log ratio normalization
#' @param spanT span value tumor
#' @param spanA span value autosomes
#' @param spanX span value X
#' @param donorCounts snp read count matrix for donor sample(s). Required columns: Chromosome Position Ref Alt and for each donor sample,i: RefDonoriR RefDonoriA RefDonoriE RefDonoriD RefDonoriDP
#' @details  The SNPs in a genome are not evenly spaced. Some regions have multiple SNPs in a small neighborhood. 
#' Thus using all loci will induce serial correlation in the data. To avoid it we sample loci such that only 
#' a single locus is used in an interval of length snp.nbhd. So in order to get 
#' reproducible results use set.seed to fix the random number generator seed.
#' @return \item{pmat}{Read counts and other elements of all the loci}
#' \item{seg.tree}{a list of matrices one for each chromosome. the matrix gives the tree structure of the splits. each row corresponds to a segment with the parent row as the first element the start-1 and end index of each segment and the maximal T^2 statistic. the first row is the whole chromosome and its parent row is by definition 0.}
#' \item{jointseg}{The data that were segmented. Only the loci that were sampled within a snp.nbhd are present. segment results given.}
#' \item{hscl}{scaling factor for logOR data.}
#' @export

  gbuild <- match.arg(gbuild)
  # integer value for chromosome X depends on the genome
  if (gbuild %in% c("hg19", "hg38", "hg18")) nX <- 23
  if (gbuild %in% c("mm9", "mm10")) nX <- 20
  pmat <- procSnps(rcmat, ndepth, het.thresh, snp.nbhd, gbuild, unmatched, ndepthmax, donorCounts)
  dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched, MandUnormal, 0.2, spanT,spanA, spanX)
  tmp <- segsnps(dmat, cval, hetscale, deltaCN)
  out <- list(pmat=pmat, gbuild=gbuild, nX=nX)
  c(out, tmp)
}

procSample <- function(x, cval=150, min.nhet=15, dipLogR=NULL) {
  #' Process a sample
  #' @description  Processes the output from preProcSample for given cval and min.nhet
  #' @param x the output from preProcSample; should contain seg.tree and jointseg
  #' @param cval critical value for segmentation
  #' @param min.nhet minimum number of heterozygote snps in a segment used for bivariate t-statistic during clustering of segments
  #' @param dipLogR diploid level obtained from a fit, typically using a higher cval, can be used with lower cval to recover focal changes
  #' @details The minor copy number lcn may not be estimated with confidence when a segment has fewer than min.nhet heterozygous SNPs and hence will return NA. If there are too few heterozygous SNPs in a segment then mafR and mafR.clust can be NA.
  #' @return  \item{jointseg}{The data that were segmented. Only the loci that were sampled within a snp.nbhd are present}
  #' \item{out}{data frame of segment summaries pre and post clustering of segments. The columns are: chrom the chromosome to which the segment belongs; seg the segment number; num.mark the number of SNPs in the segment; nhet the number of SNPs that are deemed heterozygous; cnlr.median the median log-ratio of the segment; mafR the log-odds-ratio summary for the segment; segclust the segment cluster to which segment belongs; cnlr.median.clust the median log-ratio of the segment cluster; mafR.clust the log-odds-ratio summary for the segment cluster; cf the cellular fraction of the segment; tcn the total copy number of the segment; lcn the minor copy number of the segment.}
  #' \item{dipLogR}{specified or obtained from data}
  #' \item{...}{other output when findDiploidLogR is used}
  #' @export

  # ensure availability of seg.tree
  if (is.null(x$seg.tree)) stop("seg.tree is not available")
  # get the numeric value of chromosome X
  nX <- x$nX
  # make sure that original cval is smaller than current one
  cval.fit <- attr(x$seg.tree, "cval")
  if (cval.fit > cval) stop("original fit used cval = ", cval.fit)
  # jointseg etc
  jseg <- x$jointseg
  jseg <- jseg[is.finite(jseg$cnlr),]
  # chromosomes with data and their counts
  chrs <- x$chromlevels
  nchr <- length(chrs)
  # get chromlevels from chrs
  if (x$gbuild %in% c("hg19", "hg38", "hg18")) chromlevels <- c(1:22,"X")[chrs]
  if (x$gbuild %in% c("mm9", "mm10")) chromlevels <- c(1:19,"X")[chrs]
  # get the segment summary for the fit in seg.tree
  nsegs <- 0
  # jointseg already has a seg variable numbered 1 thru number of segments for each chromosome
  for (i in 1:nchr) {
    jseg$seg[jseg$chrom==chrs[i]] <- nsegs + jseg$seg[jseg$chrom==chrs[i]]
    nsegs <- max(jseg$seg[jseg$chrom==chrs[i]])
  }
  focalout <- jointsegsummary(jseg)
  # cnlr.median to the left and right
  cnlr.med.l <- c(0, focalout$cnlr.median[-nsegs])
  cnlr.med.r <- c(focalout$cnlr.median[-1], 0)
  # mad of cnlr noise
  cnlr.mad <- mad(jseg$cnlr - rep(focalout$cnlr.median, focalout$num.mark))
  # segments that show focal changes have big jump in cnlr.median
  focalout$focal <- 1*(focalout$cnlr.median > pmax(cnlr.med.l, cnlr.med.r)+3*cnlr.mad) + 1*(focalout$cnlr.median < pmin(cnlr.med.l, cnlr.med.r)-3*cnlr.mad)
  # get the segments for the specified cval
  nsegs <- 0
  for (i in 1:nchr) {
    seg.widths <- diff(prune.cpt.tree(x$seg.tree[[i]], cval))
    jseg$seg[jseg$chrom==chrs[i]] <- nsegs + rep(1:length(seg.widths), seg.widths)
    nsegs <- nsegs + length(seg.widths)
  }
  # adding the focal change segments - need a jump at the beginning and end
  jseg$seg0 <- jseg$seg # detected segments
  # jump at the beginning (twice the height)
  jseg$seg <- jseg$seg + rep(cumsum(2*focalout$focal), focalout$num.mark)
  # drop back for the focal segment to get the steps right
  jseg$seg <- jseg$seg - rep(focalout$focal, focalout$num.mark)
  # focal segment could already be in; so change seg indicator
  jseg$seg <- cumsum(c(1, 1*(diff(jseg$seg) > 0)))
  # segment summaries
  out <- jointsegsummary(jseg)
  # cluster the segments
  out <- clustersegs(out, jseg, min.nhet)
  # put in the clustered values for snps
  jseg$segclust[is.finite(jseg$cnlr)] <- rep(out$segclust, out$num.mark)
  # find dipLogR and fit cncf
  if (is.null(dipLogR)) {
    oo <- findDiploidLogR(out, jseg$cnlr)
  } else {
    oo <- list()
    oo$out0 <- "empty"
    oo$dipLogR <- dipLogR
  }
  out <- fitcncf(out, oo$dipLogR, nX)
  c(list(jointseg=jseg, out=out, nX=nX, chromlevels=chromlevels), oo[-1])
}

plotSample <- function(x, emfit=NULL, clustered=FALSE, plot.type=c("em","naive","both","none"), sname=NULL) {
  #' Plot the data and results for a single sample
  #' @description Plots copy number log-ratio, variant allele log-odds ratio as well as the copy number and cellular fraction fits.
  #' @importFrom grDevices colorRampPalette
  #' @param x (character) output from procSample
  #' @param emfit (character) output of emcncf
  #' @param clustered (logical) indicator of whether segment or cluster summary plotted
  #' @param plot.type (character) the type of plot. The default is em in which the logR and logOR data as well as the copy number and cellular fraction fits from EM are graphed. For naive the naive copy number and cellular fraction fits are used instead of EM. For none only the data are shown and for both both fits are shown
  #' @param sname (character) sample name give as a character string
  #' @export
  def.par <- par(no.readonly = TRUE) # save default, for resetting...
  # plot.type
  plot.type <- match.arg(plot.type)
  # layout of multi panel figure
  if (plot.type=="none") layout(matrix(1:2, ncol=1))
  if (plot.type=="em") layout(matrix(rep(1:4, c(9,9,6,1)), ncol=1))
  if (plot.type=="naive") layout(matrix(rep(1:4, c(9,9,6,1)), ncol=1))
  if (plot.type=="both") layout(matrix(rep(1:6, c(9,9,6,1,6,1)), ncol=1))
  par(mar=c(0.25,3,0.25,1), mgp=c(1.75, 0.6, 0), oma=c(3,0,1.25,0))
  # raw data used for joint segmentation
  jseg <- x$jointseg
  # chromosome boundaries
  chrbdry <- which(diff(jseg$chrom) != 0)
  out=NULL
  if (missing(emfit)) {
    out <- x$out
    out$lcn[out$tcn == 1] = 0 #fix bad NAs
    if (plot.type=="em" | plot.type=="both") {
      warning("emfit is missing; plot.type set to naive")
      plot.type <- "naive"
    }
  } else {
    out <- emfit$cncf
    # add the naive tcn, lcn and cf to out
    out$tcn <- x$out$tcn 
    out$lcn <- x$out$lcn
    out$lcn[out$tcn == 1] = 0 #fix bad NAs
    out$lcn.em[out$tcn.em == 1] = 0
    out$cf <- x$out$cf
  }
  # determine which of the cnlr.median & mafR to show
  if (clustered) {
    cnlr.median <- out$cnlr.median.clust
    mafR <- out$mafR.clust
    mafR[is.na(mafR)] <- out$mafR[is.na(mafR)]
  } else {
    cnlr.median <- out$cnlr.median
    mafR <- out$mafR
  }
  mafR <- abs(mafR)
  # chromosome colors
  chrcol <- 1+rep(out$chr-2*floor(out$chr/2), out$num.mark)
  nn <- cumsum(table(jseg$chrom[is.finite(jseg$cnlr)]))
  segbdry <- cumsum(c(0,out$num.mark))
  segstart <- segbdry[-length(segbdry)]
  segend <- segbdry[-1]
  # plot the logR data and segment medians
  ymin = floor(min(out$cnlr.median, na.rm = T))
  ymax = ceiling(max(out$cnlr.median, na.rm = T))
  if (ymin > -2) ymin = -2
  if (ymax < 2) ymax = 2
  
  plot(jseg$cnlr[is.finite(jseg$cnlr)], pch=1, cex=.5, col = c("grey","lightblue","azure4","slateblue")[chrcol], ylab="log-ratio", xaxt="n", ylim=c(ymin,ymax))
  abline(v=chrbdry, lwd=0.25)
  abline(h=median(jseg$cnlr, na.rm=TRUE), col="green2")
  abline(h = x$dipLogR, col = "magenta4")
  segments(segstart, cnlr.median, segend, cnlr.median, lwd=1.75, col='red')
  # plot the logOR data and mafR
  plot(jseg$valor[is.finite(jseg$cnlr)], pch=1, cex=.5, col = c("grey","lightblue","azure4","slateblue")[chrcol], ylab="log-odds-ratio", ylim=c(-4,4), xaxt="n")
  abline(v=chrbdry, lwd=0.25)
  segments(segstart, sqrt(mafR), segend, sqrt(mafR), lwd=1.75, col='red')
  segments(segstart, -sqrt(mafR), segend, -sqrt(mafR), lwd=1.75, col='red')

  cfpalette <- c(colorRampPalette(c("white", "steelblue"))(10),"bisque2")
  # EM copy number and cellular faction pieces
  if (plot.type=="em" | plot.type=="both") {
    # plot the estimated copy numbers and cf
    out$tcn.em[out$tcn.em > 10] <- 9 + log10(out$tcn.em[out$tcn.em > 10])
    ii <- which(out$lcn.em > 5)
    if (length(ii)>0) out$lcn.em[ii] <- 5 + log10(out$lcn.em[ii])
    plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn.em)), type="n", ylab="copy number (em)", xaxt="n")
    abline(v=chrbdry, lwd=0.25)
    segments(segstart, out$tcn.em, segend, out$tcn.em, lwd=2, col='black')
    segments(segstart, out$lcn.em, segend, out$lcn.em, lwd=1.5, col='red')
    # add the cf
    plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
    mtext("cf-em", side=2, at=0.5, line=0.2, las=2, cex=0.75)
    cfcol <- cfpalette[round(10*out$cf.em+0.501)]
    rect(segstart, 0, segend, 1, col=cfcol, border=NA)
  }
  # naive copy number and cellular faction pieces
  if (plot.type=="naive" | plot.type=="both") {
    # plot the estimated copy numbers and cf
    out$tcn[out$tcn > 10] <- 9 + log10(out$tcn[out$tcn > 10])
    ii <- which(out$lcn > 5)
    if (length(ii)>0) out$lcn[ii] <- 5 + log10(out$lcn[ii])
    plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn)), type="n", ylab="copy number (nv)", xaxt="n")
    abline(v=chrbdry, lwd=0.25)
    segments(segstart, out$tcn, segend, out$tcn, lwd=2, col='black')
    segments(segstart, out$lcn, segend, out$lcn, lwd=1.5, col='red')
    # add the cf
    plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
    mtext("cf-nv", side=2, at=0.5, line=0.3, las=2, cex=0.75)
    cfcol <- cfpalette[round(10*out$cf+0.501)]
    rect(segstart, 0, segend, 1, col=cfcol, border=NA)
  }

  # now add the chromosome ticks on x-axis
  chromlevels <- x$chromlevels
  # just make sure chromlevels actually exists
  if (is.null(chromlevels)) chromlevels <- 1:length(nn)
  axis(labels=chromlevels, side=1, at=(nn+c(0,nn[-length(nn)]))/2, cex=0.65)
  mtext(side=1, line=1.75, "Chromosome", cex=0.8)
  if (!missing(sname)) mtext(sname, side=3, line=0, outer=TRUE, cex=0.75)
  par(def.par)  #- reset to default
}

logRlogORspider <- function(cncf, dipLogR=0, nfrac=0.005) {
  #' logRlogRsplide plot generation from cncf input
  #' @description Plots copy number log-ratio, variant allele log-odds ratio as well as the copy number and cellular fraction fits.
  #' @param cncf Copy number and cellular fraction data frame either the naive one (out) from procSample or the EM fit (cncf) from emcncf.
  #' @param dipLogR the log-ratio value corresponding to the diploid state.
  #' @param nfrac a segment is shown if the proportion of loci and het SNPs (num.mark and nhet) nfrac. Default is 0.01.
  #' @details This is a diagnostic plot to check how well the copy number fits work. The estimated segment summaries are plotted as circles where the size of the circle increases with the number of loci in the segment. The expected value for various integer copy number states are drawn as curves for purity ranging from 0 to 0.95. For a good fit, the segment summaries should be close to one of the lines.
  #' @export
  rho <- seq(0, 0.95, by=0.01)
  nrho <- length(rho)
  logACR <- logCNR <- matrix(0, nrho, 19)
  # initialize index
  l <- 1
  # one copy loss
  logCNR[,l] <- log2(2*(1-rho) + 1*rho) -1
  logACR[,l] <- log(1/(1-rho))
  # integer copy numbers (clonal)
  for(i in 2:7) {
    for(j in 0:floor(i/2)) {
      l <- l+1
      logCNR[,l] <- log2(2*(1-rho) + i*rho) -1 # base-2
      logACR[,l] <- log(1-rho+(i-j)*rho) - log(1-rho+j*rho)
    }
  }

  plot(c(-0.95, 1.8), c(0, 5), type="n", xlab="Expected(logR - dipLogR)", ylab=" Expected(|logOR|)")
  l <- 1; i <-1; j <-0
  linecols <- c("black","cyan3","green3","blue")
  lines(logCNR[,l], logACR[,l], lty=1, col=j+1, lwd=1.25)
  text(logCNR[nrho,l]+0.03, logACR[nrho,l], paste(i,j,sep="-"), cex=0.65)
  for(i in 2:7) {
    for(j in 0:floor(i/2)) {
      l <- l+1
      lines(logCNR[,l], logACR[,l], lty=i-1, col=linecols[j+1], lwd=1.25)
      text(logCNR[nrho,l]+0.03, logACR[nrho,l], paste(i,j,sep="-"), cex=0.65)
    }
  }

  nsnps <- sum(cncf$num.mark)
  nhets <- sum(cncf$nhet)
  ii <- cncf$num.mark > nfrac*nsnps & cncf$nhet > nfrac*nhets
  cex <- 0.3 + 2.7*(cncf$num.mark[ii]/sum(0.1*cncf$num.mark[ii]))
  points(cncf$cnlr.median[ii] - dipLogR, sqrt(abs(cncf$mafR[ii])), cex=cex, col="magenta4", lwd=1.5)
}
rptashkin/facets2n documentation built on May 11, 2022, 1:34 p.m.