# function to find logR level corresponding to the diploid state
# uses the clusters of segments of (logR, logOR) data with high cval
# we then look for clusters with allelic balance (as in normal diploid)
# allelic balance is also in k+k state (k copies of both parental chroms)
# we believe 0+0 can only happen for small focal regions and >2+2 rare
#
findDiploidLogR <- function(out, cnlr) {
flags <- NULL
# summarize the segment specific info into clusters
num.mark <- tapply(out$num.mark, out$segclust, sum)
segclust <- as.numeric(names(num.mark))
nhet <- tapply(out$nhet, out$segclust, sum)
cnlr.median <- tapply(out$cnlr.median.clust, out$segclust, median)
mafR <- tapply(out$mafR.clust, out$segclust, median)
out0 <- data.frame(segclust, num.mark, nhet, cnlr.median, mafR)
# further clustering to get ocn states (e.g. 1+1 and 2+0 at same level)
# set up the cnlr data and segid (segclust level)
cnlr <- cnlr[is.finite(cnlr)]
segid <- rep(out$segclust, out$num.mark)
# re-order data by cnlr.medians
segid <- rank(out0$cnlr.median, ties.method="random")[segid]
cnlr <- cnlr[order(segid)]
segid <- sort(segid)
# re-order the clusters by cnlr.median
out0 <- out0[order(out0$cnlr.median),]
# the components need to be re-ordered too
mafR <- out0$mafR
cnlr.median <- out0$cnlr.median
num.mark <- out0$num.mark
nsnps <- sum(num.mark)
segclust <- out0$segclust
nhet <- out0$nhet
#
ocnlevels <- unique(out0$cnlr.median)
# revised ocn
#
# mafR < 0.025 allows for single copy change at no more than 17.13%
# check if there are balanced clusters
mafR.thresh <- 0.025
bsegs <- which(out0$mafR < mafR.thresh)
# if none exists of if balanced segs span less than 10% of genome
# use clusters with smallest mafR
if (sum(num.mark[bsegs])/nsnps < 0.1) {
# set flag
flags <- "mafR not sufficiently small"
# get bsegs with mafR < 0.05; single copy change at 25%
mafR.thresh <- 0.05
bsegs <- which(out0$mafR < mafR.thresh)
# if still not 10% use mafR.thresh of 0.09 (single copy gain at 35%)
if (sum(num.mark[bsegs])/nsnps < 0.1) {
mafR.thresh <- 0.09
bsegs <- which(out0$mafR < mafR.thresh)
if (sum(num.mark[bsegs])/nsnps < 0.1) {
flags <- c(flags, "mafR<0.09 in less than 10% genome")
}
}
}
#
# do the cnrl.median of balanced segs suggest two possible locations?
if (length(bsegs) > 1) {
# find the low and high values by clustering them
cm2cls <- km2class(cnlr.median[bsegs], num.mark[bsegs])
# if the centers are separated by < 0.1375 only one solution
# 0.2016 = log2(2.3) - log2(2); 2.3 implies CN of 4 at 15% cf
if (diff(cm2cls$centers) < 0.2) {
dipLogR <- median(rep(cnlr.median[bsegs], num.mark[bsegs]))
nbal <- sum(cm2cls$nbal)
} else {
# check if the lower value represent 1+1 or 2+2?
dipLogR <- cm2cls$centers
nbal <- cm2cls$nbal
# if the lower value represents small fraction of data can be 0+0
if (nbal[1]/nsnps < 0.01) {
dipLogR <- cm2cls$centers[2]
nbal <- cm2cls$nbal[2]
}
}
} else {
# make sure bsegs is not empty
if (length(bsegs) == 0) {
# if no balanced segs set dipLogR at the median of cnlr
dipLogR <- median(cnlr)
nbal <- 0
} else {
dipLogR <- cnlr.median[bsegs]
nbal <- num.mark[bsegs]
}
}
names(dipLogR) <- NULL
names(nbal) <- NULL
# flag if dipLogR has two values separated by < log2(1.25) (CN=4 at 25%)
if (length(dipLogR) == 2)
if (diff(dipLogR) < log2(1.25))
flags <- c(flags, "possibly subclonal 2+2 states present")
# cat("dipLogR =", dipLogR, "\n")
# print(out0)
# first remove the balanced segs
out1 <- out0[-bsegs,]
# is dipLogR[1] the 1+1 level; if so all lower cnlr.med values are losses
# since 2+0 will have same true cnlr as 1+1 check only clear losses
# clear losses defined as ocn=1.85 (loss at 15% cf); log2(1.85/2)=-0.1125
lsegs <- which(out1$cnlr.median <= dipLogR[1] - 0.1125 & is.finite(out1$mafR))
not1plus1 <- FALSE # indicator whether dipLogR[1] is not 1+1 state
wgd.likely <- NULL # placeholder variable for ambiguos wgd call
# identifiability issues when lsegs empty; call dipLogR[1] as 1+1
if (length(lsegs) > 0) {
# check if dipLogR[1] is 1+1 or 2+2
out1 <- out1[lsegs,]
# assume dipLogR[1] is 1+1. compute cf using logR and logOR data
cflr <- pmin(2 - 2^(1 + out1$cnlr.median - dipLogR[1]), 1)
cflor <- 1 - exp(-sqrt(pmax(0,out1$mafR)))
# if segments where cflor > cflr + 0.1 has >5% of snps
# cflor > cflr+0.1 because mafR can be low with 1+0 & 2+0 mixture
if (sum(out1$num.mark[cflor > cflr+0.1])/nsnps > 0.05) {
not1plus1 <- TRUE
flags <- c(flags, paste("mafR larger than expected if", dipLogR[1], "is diploid level"))
}
# assume dipLogR[1] is clonal 2+2 get acn consistent with cnlr, mafR
# search acn from 3+0, 2+0 & 1+0; as 2+1 looks like 2+0 with lower cf
# 0+0 has balanced alleles and so won't be in lsegs
out2 <- t(sapply(1:nrow(out1), function(i, dlr, out1) {
acnsplit(dlr, out1$cnlr.median[i], out1$mafR[i])
}, dipLogR[1], out1))
# print(out2)
out1$acn <- apply(out2[,2*(1:3), drop=FALSE], 1, which.min)
# proportion of genome that fits 1+0, 2+0 & 3+0
acn1prop <- sum(out1$num.mark[out1$acn == 1])/nsnps
acn2prop <- sum(out1$num.mark[out1$acn == 2])/nsnps
acn3prop <- sum(out1$num.mark[out1$acn == 3])/nsnps
# mix of 2+0 and 1+0 from 1+1 (on same segs) can mimic 3+0 from 2+2
# that is, it looks like a loss but has a large mafR
# so allow small % if 1+0 and a slightly bigger % of 3+0
if (!not1plus1 & acn1prop < 0.005 & acn3prop < 0.05) {
# if acn3prop > 0 flag it
if (acn3prop > 0)
flags <- c(flags, paste("likely mixture of 1+0 & 2+0 in segclust:", paste(out1$segclust[out1$acn==3], collapse=", ")))
# most of the losses are either 2+0 or 2+1
# 2+2 --> {2+1, 2+0} indistinguishable from 1+1 --> 1+0
# check that it's not 2+2 with 2+1 and 2+0 states only
# ***** can only be done if more than one cluster exists *****
# cf2 is cellular fraction of 2+0 segment (delta-logR from 2+2)
# cf3 is cellular fraction of 2+1 calculated as if its is 2+0
# then cf3 = cf2/(2+cf2)
# need 2 or more clusters with 2+0 call; otherwise it is 1+1
if (sum(out1$acn==2) > 1) {
# get optimal cellular fraction
rho <- seq(0.01, 0.99, by=0.01)
# deviance
cf2 <- out2[out1$acn==2,3]
nm <- out1$num.mark[out1$acn==2]
dev <- sapply(rho, function(x, cf2, nm) {
sum(nm*pmin((cf2-x)^2, (cf2-x/(2+x))^2))
}, cf2, nm)
cf0 <- rho[which.min(dev)]
# are both cf0 and cf0/(2+cf0) represented among segments
acn <- sapply(cf2, function(x, cf0) {
which.min((c(x,x)-c(cf0, cf0/(2+cf0)))^2)
}, cf0)
#cat("acn =", acn, "\n")
#cat("cf2 =", cf2, "\n")
# are the two levels represented in sufficient proportions
nm2 <- sum(nm[acn==1]) # this is the 2+0 fit
nm3 <- sum(nm[acn==2]) # this is the 2+1 fit
if (min(nm3, nm2)/(nm3+nm2) > 0.25) {
# if the discrepancy from cf large call it polyclonal
if (max(abs(cf2-c(cf0, cf0/(2+cf0))[acn])) > 0.05) {
flags <- c(flags, "polyclonal 1 copy loss fits better")
} else {
# polyclonal 1 copy loss not obvious & wgd ambiguity
# so calculate alternate dipLogR and present both
not1plus1 <- TRUE
flags <- c(flags, "alternate dipLogR is possible")
# acn2prop >1/3 and 2+1 as likely as 2+0 call it WGD
if (acn2prop > 1/3 & nm3 > nm2) {
wgd.likely <- TRUE
} else {
wgd.likely <- FALSE
}
}
}
}
} else {
not1plus1 <- TRUE
}
if (not1plus1 & acn1prop+acn3prop > 0) {
flags <- c(flags, paste("not consistent for 1 copy loss from diploid in segclust:", paste(out1$segclust[out1$acn!=2], collapse=", ")))
}
}
# if dipLogR[1] is possibly not 1+1 estimate CN=2 level
if (not1plus1) {
# find deviance for each ocnlevel
out1 <- out0[out0$cnlr.median <= max(dipLogR) & is.finite(out0$mafR),]
# ocn levels cannot be any lower than lr4-1
ocnlevels0 <- ocnlevels[ocnlevels > dipLogR[1]-1 & ocnlevels < dipLogR[1]]
dev1 <- sapply(ocnlevels0, dlrdev, dipLogR[1], out1)
#colr <- rep(1, length(dev1))
if (length(dipLogR) == 2) {
# if dipLogR[1] has >2 (1+1 & 2+0) segclust at >3% don't do this
if (sum(out0$num.mark[out0$cnlr.median==dipLogR[1]]/nsnps > 0.03) > 2) {
flags <- c(flags, "multiple mafR levels at bal allele level 1")
} else {
ocnlevels1 <- ocnlevels[ocnlevels > dipLogR[2]-1 & ocnlevels < dipLogR[2]]
dev2 <- sapply(ocnlevels1, dlrdev, dipLogR[2], out1)
ocnlevels0 <- c(ocnlevels0, ocnlevels1)
dev1 <- c(dev1, dev2)
#colr <- c(colr, rep(2, length(dev2)))
}
}
#plot(ocnlevels0, dev1, pch=16, col=colr)
#print(dev1)
cn2logR <- ocnlevels0[which.min(dev1)]
# if wgd.likely is non-null add dipLogR[1]
if (!is.null(wgd.likely)) {
# if wgd.likely set dipLogR[1] as altDipLogR
altDipLogR <- dipLogR[1]
# o/w change altDipLogR to cn2logR to dipLogR[1] &
if (!wgd.likely) {
altDipLogR <- cn2logR
cn2logR <- dipLogR[1]
}
names(altDipLogR) <- NULL # remove names it may have acquired
}
} else {
cn2logR <- dipLogR[1]
}
names(cn2logR) <- NULL # remove names it may have acquired
# 2+2 in more than half the genome seems a stretch
if (any(nbal/nsnps > 0.5)) {
i <- which(nbal/nsnps > 0.5)
# if chosen dipLogR is below that level
if (cn2logR < dipLogR[i] - 0.05) {
flags <- c(flags, ">1/2 genome balanced at chosen dipLogR")
if (exists("altDipLogR")) {
altDipLogR <- c(altDipLogR, cn2logR)
} else {
altDipLogR <- cn2logR
flags <- c(flags, "alternate dipLogR is possible")
}
cn2logR <- dipLogR[i]
}
}
# alternate check if "mafR not sufficiently small"
if ("mafR not sufficiently small" %in% flags) {
pbal <- sapply(dipLogR, function(x, y, n) {
sum(abs(y-x) < 0.1375)/n
}, rep(cnlr.median, num.mark), nsnps)
i <- which(pbal > 2/3)
if (length(i) > 0) {
if (cn2logR < dipLogR[i] - 0.05) {
flags <- c(flags, ">2/3 genome balanced at chosen dipLogR")
if (exists("altDipLogR")) {
altDipLogR <- c(altDipLogR, cn2logR)
} else {
altDipLogR <- cn2logR
flags <- c(flags, "alternate dipLogR is possible")
}
}
cn2logR <- dipLogR[i]
}
}
out0 <- out0[order(out0$segclust),] # reorder by segclust
if (is.null(wgd.likely)) {
list(out0=out0, dipLogR=cn2logR, alBalLogR=cbind(dipLogR,nbal/nsnps), mafR.thresh=mafR.thresh, flags=flags)
} else {
list(out0=out0, dipLogR=cn2logR, altDipLogR=altDipLogR, alBalLogR=cbind(dipLogR,nbal/nsnps), mafR.thresh=mafR.thresh, flags=flags)
}
}
# split the segclust medians into two groups high and low
km2class <- function(cnlr.med, num.mark) {
n <- length(cnlr.med)
ii <- order(cnlr.med)
csx <- rep(cnlr.med[ii], num.mark[ii])
iseg <- rep(1:n, num.mark[ii])
gg <- rep(1:2, c(1,n-1))[iseg]
l1norm <- sapply(1:(n-1), function(i, n, csx, iseg) {
gg <- rep(1:2, c(i,n-i))[iseg]
med2 <- tapply(csx, gg, median)
sum(abs(csx - med2[gg]))
}, n, csx, iseg)
j0 <- which.min(l1norm)
med2 <- tapply(csx, rep(1:2, c(j0,n-j0))[iseg], median)
out <- list()
out$centers <- med2
out$cluster <- rep(2,n)
out$cluster[ii][1:j0] <- 1
out$nbal <- tapply(num.mark, out$cluster, sum)
out
}
# redo cf using log-logOR
acnsplit <- function(lr0, lr1, lorsq) {
deltalr <- (lr0-lr1)
llor <- log(sqrt(lorsq))
rho <- seq(0.01, 0.99, by=0.01)
# log-logOR has a much wider range so weigh logR contribution more
util3.0 <- 5*(deltalr-(log2(2+2*rho) - log2(2+rho)))^2 + (llor-log(log((1+2*rho)/(1-rho))))^2
util2.0 <- 5*(deltalr-(log2(2+2*rho) - log2(2)))^2 + (llor-log(log((1+rho)/(1-rho))))^2
util1.0 <- 5*(deltalr-(log2(2+2*rho) - log2(2-rho)))^2 + (llor-log(log(1/(1-rho))))^2
optrho <- matrix(0,2,3)
i <- which.min(util1.0); optrho[,1] <- c(rho[i], util1.0[i])
i <- which.min(util2.0); optrho[,2] <- c(rho[i], util2.0[i])
i <- which.min(util3.0); optrho[,3] <- c(rho[i], util3.0[i])
# if optrho is low weigh the utilities in favor of 2
if (optrho[1,1] < 0.25) optrho[2,] <- optrho[2,]*(1+c(50,0,50)*(0.25-optrho[1,1]))
rownames(optrho) <- c("cf","util")
colnames(optrho) <- c("acn1.0","acn2.0","acn3.0")
optrho
}
# deviance metric for candidate dipLogR
# lr2 is candidate logR value at CN=2
# lr4 is the balanced allele logR value (CN=4)
# (lr, lor) are the logR logOR for the clusters
dlrdev <- function(lr2, lr4, out) {
# use cnlr.median, mafR etc from out
k <- nrow(out)
lr <- out$cnlr.median
# change mafR to log-odds ratio
lor <- sqrt(pmax(0,out$mafR))
# cellular fraction
rho <- 2^(lr4-lr2) - 1
# 1 and 3 copy logR levels
lr1 <- lr2 - log2(2/(2-rho))
lr3 <- lr2 + log2((2+rho)/2)
dev <- sapply(1:k, function(i) {
# closest copy number state
cn <- which.min(abs(c(lr1,lr2,lr3,lr4) - lr[i]))
# log-ratio deviance
dev <- (lr[i] - c(lr1,lr2,lr3,lr4)[cn])^2
# allele ratio
if (cn == 1) dev <- dev + (lor[i] - log(1/(1-rho)))^2
if (cn == 2) dev <- dev + min(lor[i]^2, (lor[i] - log((1+rho)/(1-rho)))^2)
if (cn == 3) dev <- dev + min((lor[i]-log(1+rho))^2, (lor[i] - log((1+rho)/(1-rho)))^2)
if (cn == 4) dev <- dev + min(lor[i]^2, (lor[i]-log(1+2*rho))^2, (lor[i] - log((1+3*rho)/(1-rho)))^2)
dev
})
sum(dev*out$num.mark)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.