readSnpMatrix <- function(filename, skip=0L, err.thresh=Inf, del.thresh=Inf, perl.pileup=FALSE) {
# could have been generated by original VES snp-pileup.pl code (perl)
if (perl.pileup) {
rcmat <- scan(filename, what=list(Chromosome="", Position=0, NOR.DP=0, NOR.RD=0, TUM.DP=0, TUM.RD=0), skip=skip)
if (grepl("chr",rcmat$Chromosome[1])) rcmat$Chromosome <- gsub("chr","",rcmat$Chromosome)
rcmat <- as.data.frame(rcmat, stringsAsFactors=FALSE)
} else {
# read the read count matrix generated by snp-pileup.cpp code
pileup <- read.csv(filename, stringsAsFactors=FALSE, colClasses=rep(c("character", "numeric","character", "numeric"), c(1,1,2,8)))
# remove chr if present in Chrom
if (grepl("chr",pileup$Chromosome[1])) {
pileup$Chromosome <- gsub("chr", "", pileup$Chromosome)
}
# remove loci where errors and deletions exceeded thresholds
ii <- which(pileup$File1E <= err.thresh & pileup$File1D <= del.thresh & pileup$File2E <= err.thresh & pileup$File2D <= del.thresh)
rcmat <- pileup[ii, 1:2]
rcmat$NOR.DP <- pileup$File1R[ii] + pileup$File1A[ii]
rcmat$NOR.RD <- pileup$File1R[ii]
rcmat$TUM.DP <- pileup$File2R[ii] + pileup$File2A[ii]
rcmat$TUM.RD <- pileup$File2R[ii]
}
rcmat
}
preProcSample <- function(rcmat, ndepth=35, het.thresh=0.25, snp.nbhd=250, cval=25, deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10", "udef"), ugcpct=NULL, hetscale=TRUE, unmatched=FALSE, ndepthmax=1000) {
gbuild <- match.arg(gbuild)
# integer value for chromosome X depends on the genome
if (gbuild %in% c("hg19", "hg38", "hg18")) nX <- 23
if (gbuild %in% c("mm9", "mm10")) nX <- 20
if (gbuild == "udef") {
if (missing(ugcpct)) {
stop("GC percent data should be supplied if udef option is used")
} else {
nX <- length(ugcpct)
}
}
pmat <- procSnps(rcmat, ndepth, het.thresh, snp.nbhd, nX, unmatched, ndepthmax)
if (gbuild == "udef") {
dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched, ugcpct)
} else {
dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched)
}
tmp <- segsnps(dmat, cval, hetscale, deltaCN)
out <- list(pmat=pmat, gbuild=gbuild, nX=nX)
c(out, tmp)
}
procSample <- function(x, cval=150, min.nhet=15, dipLogR=NULL) {
# ensure availability of seg.tree
if (is.null(x$seg.tree)) stop("seg.tree is not available")
# get the numeric value of chromosome X
nX <- x$nX
# make sure that original cval is smaller than current one
cval.fit <- attr(x$seg.tree, "cval")
if (cval.fit > cval) stop("original fit used cval = ", cval.fit)
# jointseg etc
jseg <- x$jointseg
jseg <- jseg[is.finite(jseg$cnlr),]
# chromosomes with data and their counts
chrs <- x$chromlevels
nchr <- length(chrs)
# get chromlevels from chrs
chromlevels <- c(1:(nX-1), "X")[chrs]
# get the segment summary for the fit in seg.tree
nsegs <- 0
# jointseg already has a seg variable numbered 1 thru number of segments for each chromosome
for (i in 1:nchr) {
jseg$seg[jseg$chrom==chrs[i]] <- nsegs + jseg$seg[jseg$chrom==chrs[i]]
nsegs <- max(jseg$seg[jseg$chrom==chrs[i]])
}
focalout <- jointsegsummary(jseg)
# cnlr.median to the left and right
cnlr.med.l <- c(0, focalout$cnlr.median[-nsegs])
cnlr.med.r <- c(focalout$cnlr.median[-1], 0)
# mad of cnlr noise
cnlr.mad <- mad(jseg$cnlr - rep(focalout$cnlr.median, focalout$num.mark))
# segments that show focal changes have big jump in cnlr.median
focalout$focal <- 1*(focalout$cnlr.median > pmax(cnlr.med.l, cnlr.med.r)+3*cnlr.mad) + 1*(focalout$cnlr.median < pmin(cnlr.med.l, cnlr.med.r)-3*cnlr.mad)
# get the segments for the specified cval
nsegs <- 0
for (i in 1:nchr) {
seg.widths <- diff(prune.cpt.tree(x$seg.tree[[i]], cval))
jseg$seg[jseg$chrom==chrs[i]] <- nsegs + rep(1:length(seg.widths), seg.widths)
nsegs <- nsegs + length(seg.widths)
}
# adding the focal change segments - need a jump at the beginning and end
jseg$seg0 <- jseg$seg # detected segments
# jump at the beginning (twice the height)
jseg$seg <- jseg$seg + rep(cumsum(2*focalout$focal), focalout$num.mark)
# drop back for the focal segment to get the steps right
jseg$seg <- jseg$seg - rep(focalout$focal, focalout$num.mark)
# focal segment could already be in; so change seg indicator
jseg$seg <- cumsum(c(1, 1*(diff(jseg$seg) > 0)))
# segment summaries
out <- jointsegsummary(jseg)
# cluster the segments
out <- clustersegs(out, jseg, min.nhet)
# put in the clustered values for snps
jseg$segclust[is.finite(jseg$cnlr)] <- rep(out$segclust, out$num.mark)
# find dipLogR and fit cncf
if (is.null(dipLogR)) {
oo <- findDiploidLogR(out, jseg$cnlr)
} else {
oo <- list()
oo$out0 <- "empty"
oo$dipLogR <- dipLogR
}
out <- fitcncf(out, oo$dipLogR, nX)
c(list(jointseg=jseg, out=out, nX=nX, chromlevels=chromlevels), oo[-1])
}
plotSample <- function(x, emfit=NULL, clustered=FALSE, plot.type=c("em","naive","both","none"), sname=NULL) {
def.par <- par(no.readonly = TRUE) # save default, for resetting...
# plot.type
plot.type <- match.arg(plot.type)
# layout of multi panel figure
if (plot.type=="none") layout(matrix(1:2, ncol=1))
if (plot.type=="em") layout(matrix(rep(1:4, c(9,9,6,1)), ncol=1))
if (plot.type=="naive") layout(matrix(rep(1:4, c(9,9,6,1)), ncol=1))
if (plot.type=="both") layout(matrix(rep(1:6, c(9,9,6,1,6,1)), ncol=1))
par(mar=c(0.25,3,0.25,1), mgp=c(1.75, 0.6, 0), oma=c(3,0,1.25,0))
# raw data used for joint segmentation
jseg <- x$jointseg
# chromosome boundaries
chrbdry <- which(diff(jseg$chrom) != 0)
if (missing(emfit)) {
out <- x$out
if (plot.type=="em" | plot.type=="both") {
warning("emfit is missing; plot.type set to naive")
plot.type <- "naive"
}
} else {
out <- emfit$cncf
# add the naive tcn, lcn and cf to out
out$tcn <- x$out$tcn
out$lcn <- x$out$lcn
out$cf <- x$out$cf
}
# determine which of the cnlr.median & mafR to show
if (clustered) {
cnlr.median <- out$cnlr.median.clust
mafR <- out$mafR.clust
mafR[is.na(mafR)] <- out$mafR[is.na(mafR)]
} else {
cnlr.median <- out$cnlr.median
mafR <- out$mafR
}
mafR <- abs(mafR)
# chromosome colors
chrcol <- 1+rep(out$chrom-2*floor(out$chrom/2), out$num.mark)
nn <- cumsum(table(jseg$chrom[is.finite(jseg$cnlr)]))
segbdry <- cumsum(c(0,out$num.mark))
segstart <- segbdry[-length(segbdry)]
segend <- segbdry[-1]
# plot the logR data and segment medians
plot(jseg$cnlr[is.finite(jseg$cnlr)], pch=".", cex=2, col = c("grey","lightblue","azure4","slateblue")[chrcol], ylab="log-ratio", xaxt="n")
abline(v=chrbdry, lwd=0.25)
abline(h=median(jseg$cnlr, na.rm=TRUE), col="green2")
abline(h = x$dipLogR, col = "magenta4")
segments(segstart, cnlr.median, segend, cnlr.median, lwd=1.75, col=2)
# plot the logOR data and mafR
plot(jseg$valor[is.finite(jseg$cnlr)], pch=".", cex=2.5, col = c("grey","lightblue","azure4","slateblue")[chrcol], ylab="log-odds-ratio", ylim=c(-4,4), xaxt="n")
abline(v=chrbdry, lwd=0.25)
segments(segstart, sqrt(mafR), segend, sqrt(mafR), lwd=1.75, col=2)
segments(segstart, -sqrt(mafR), segend, -sqrt(mafR), lwd=1.75, col=2)
# naive copy number and cellular faction pieces
cfpalette <- c(colorRampPalette(c("white", "steelblue"))(10),"bisque2")
if (plot.type=="naive" | plot.type=="both") {
# plot the estimated copy numbers and cf
out$tcn[out$tcn > 10] <- 9 + log10(out$tcn[out$tcn > 10])
ii <- which(out$lcn > 5)
if (length(ii)>0) out$lcn[ii] <- 5 + log10(out$lcn[ii])
plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn)), type="n", ylab="copy number (nv)", xaxt="n")
abline(v=chrbdry, lwd=0.25)
segments(segstart, out$lcn, segend, out$lcn, lwd=1.75, col=2)
segments(segstart, out$tcn, segend, out$tcn, lwd=1.75, col=1)
# add the cf
plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
mtext("cf-nv", side=2, at=0.5, line=0.3, las=2, cex=0.75)
cfcol <- cfpalette[round(10*out$cf+0.501)]
rect(segstart, 0, segend, 1, col=cfcol, border=NA)
}
# EM copy number and cellular faction pieces
if (plot.type=="em" | plot.type=="both") {
# plot the estimated copy numbers and cf
out$tcn.em[out$tcn.em > 10] <- 9 + log10(out$tcn.em[out$tcn.em > 10])
ii <- which(out$lcn.em > 5)
if (length(ii)>0) out$lcn.em[ii] <- 5 + log10(out$lcn.em[ii])
plot(c(0,length(jseg$cnlr)), c(0,max(out$tcn.em)), type="n", ylab="copy number (em)", xaxt="n")
abline(v=chrbdry, lwd=0.25)
segments(segstart, out$lcn.em, segend, out$lcn.em, lwd=1.75, col=2)
segments(segstart, out$tcn.em, segend, out$tcn.em, lwd=1.75, col=1)
# add the cf
plot(c(0,length(jseg$cnlr)), 0:1, type="n", ylab="", xaxt="n", yaxt="n")
mtext("cf-em", side=2, at=0.5, line=0.2, las=2, cex=0.75)
cfcol <- cfpalette[round(10*out$cf.em+0.501)]
rect(segstart, 0, segend, 1, col=cfcol, border=NA)
}
# now add the chromosome ticks on x-axis
chromlevels <- x$chromlevels
# just make sure chromlevels actually exists
if (is.null(chromlevels)) chromlevels <- 1:length(nn)
axis(labels=chromlevels, side=1, at=(nn+c(0,nn[-length(nn)]))/2, cex=0.65)
mtext(side=1, line=1.75, "Chromosome", cex=0.8)
if (!missing(sname)) mtext(sname, side=3, line=0, outer=TRUE, cex=0.8)
par(def.par) #- reset to default
}
logRlogORspider <- function(cncf, dipLogR=0, nfrac=0.005) {
rho <- seq(0, 0.95, by=0.01)
nrho <- length(rho)
logACR <- logCNR <- matrix(0, nrho, 19)
# initialize index
l <- 1
# one copy loss
logCNR[,l] <- log2(2*(1-rho) + 1*rho) -1
logACR[,l] <- log(1/(1-rho))
# integer copy numbers (clonal)
for(i in 2:7) {
for(j in 0:floor(i/2)) {
l <- l+1
logCNR[,l] <- log2(2*(1-rho) + i*rho) -1 # base-2
logACR[,l] <- log(1-rho+(i-j)*rho) - log(1-rho+j*rho)
}
}
plot(c(-0.95, 1.8), c(0, 5.2), type="n", xlab="Expected(logR - dipLogR)", ylab=" Expected(|logOR|)")
l <- 1; i <-1; j <-0
linecols <- c("black","cyan3","green3","blue")
lines(logCNR[,l], logACR[,l], lty=1, col=j+1, lwd=1.25)
text(logCNR[nrho,l]+0.03, logACR[nrho,l], paste(i,j,sep="-"), cex=0.65)
for(i in 2:7) {
for(j in 0:floor(i/2)) {
l <- l+1
lines(logCNR[,l], logACR[,l], lty=i-1, col=linecols[j+1], lwd=1.25)
text(logCNR[nrho,l]+0.03, logACR[nrho,l], paste(i,j,sep="-"), cex=0.65)
}
}
nsnps <- sum(cncf$num.mark)
nhets <- sum(cncf$nhet)
ii <- cncf$num.mark > nfrac*nsnps & cncf$nhet > nfrac*nhets
cex <- 0.3 + 2.7*(cncf$num.mark[ii]/sum(0.1*cncf$num.mark[ii]))
chrcol <- rainbow(24)
points(cncf$cnlr.median[ii] - dipLogR, sqrt(abs(cncf$mafR[ii])), cex=cex, pch=10, col=chrcol[cncf$chrom[ii]], lwd=1.5)
legend(-1, 5.25, paste("chr", c(1:22, "X"), sep=""), ncol=4, pch=10, col=chrcol[1:23], cex=0.65)
}
# this function can be used to rerun facets on output from procSample
# usage is xx <- rePreProcSample(out$jointseg[,1:8])
# oo <- procSample(xx, ...) etc.
rePreProcSample<- function(jseg, cval=25, deltaCN=0, gbuild=c("hg19", "hg38", "hg18", "mm9", "mm10"), hetscale=TRUE, unmatched=FALSE) {
pmat <- jseg[, c("chrom", "maploc", "rCountT", "rCountN", "vafT", "vafN", "het", "keep")]
gbuild <- match.arg(gbuild)
# integer value for chromosome X depends on the genome
if (gbuild %in% c("hg19", "hg38", "hg18")) nX <- 23
if (gbuild %in% c("mm9", "mm10")) nX <- 20
dmat <- counts2logROR(pmat[pmat$rCountT>0,], gbuild, unmatched)
tmp <- segsnps(dmat, cval, hetscale, deltaCN)
out <- list(pmat=pmat, gbuild=gbuild, nX=nX)
c(out, tmp)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.