readVCF <- function (prefix, chr)
{
addup <- function(v) {c(v[1] + v[2], v[3] + v[4])}
chr = paste("chr", chr, sep = "");
aFile=paste(prefix, chr, "vcf", sep=".")
xx = read.table(aFile, comment.char = "#", stringsAsFactors=FALSE)
# remove INDELS and records without DP4 field
sel = grep("INDEL", xx[[8]], invert=T)
sel = intersect(grep("DP4", xx[[8]]), sel)
xx = xx[sel,]
# extract DP4 field from column 8
m=regexec("DP4=[[:digit:]]+,[[:digit:]]+,[[:digit:]]+,[[:digit:]]+;", xx[[8]])
tmp=regmatches( xx[[8]], m)
#remove pos with no DP4 field
digits=lapply(tmp, strsplit, split="[,=;]")
nrow = dim(xx)[1]
res = (matrix(nrow=nrow, ncol=5)) # pos ref alt freq1 freq2
for (i in 1:nrow)
{
res[i,4:5] = addup(as.integer(digits[[i]][[1]][2:(2+3)]))
}
res = as.data.frame(res, stringsAsFactors = F)
res[,1] = ( xx[[2]])
res[,2] = ( xx[[4]])
res[,3] = ( xx[[5]])
rownames(res) = unlist( xx[[2]])
colnames(res) = c("pos","ref", "alt", "NRef", "NAlt")
res
}
# --------------------------------------------------------------------------------------------------
# Read in the frequency data includes the A,B allele frequences in tumor and control.
# Parameters :
# merge is to specify how to merge the frequences at SNPi into regions. Don't merge by default.
# rm.zero is for remove zero frequency records. Remove zero by default.
#
loadFreq <- function(chrID, merge, doflip=T, rm.zero = T, ana = "p.tumor", nPrefix, tPrefix, forceRead = F, allelicMapability = F)
{
# how to merge regions
#preprocess = merge
countDataF = paste(ana, paste(chrID, "dat", sep="."), sep=".")
print(countDataF)
if ( !forceRead & file.exists(countDataF))
{
print(sprintf("loading chr%s from object", chrID) )
load(countDataF)
} else
{
print(sprintf("loading chr%s from file", chrID) )
tAFreq = readVCF(tPrefix, chrID)
nAFreq = readVCF(nPrefix, chrID)
aFile=paste(nPrefix, paste("chr", chrID, sep=""), "haps", sep=".")
rules <- read.table(aFile, stringsAsFactors = F)
rules=rules[((rules[[6]] + rules[[7]]) ==1),]
rules=(rules[, c(3, 6)])
colnames(rules) = c("pos", "flip1")
save(tAFreq, nAFreq, rules, file=countDataF)
}
# align the rows
rules [with(rules, order(pos)), ]
tAFreq[with(tAFreq, order(pos)), ]
nAFreq[with(nAFreq, order(pos)), ]
# align ref haps with normal snps
rules = rules [rules$pos %in% nAFreq$pos, ]
nAFreq = nAFreq[nAFreq$pos %in% rules$pos, ]
if(nrow(nAFreq) == 0)
{
print("[error] Could not phase SNPs. Check if *.haps file is empty.")
}
# align tumor snps. This gives
#tSnps = data.frame( NA, dim(nAFreq), rownames=rownames(nAFreq), colnames(nAFreq))
tSnps <- nAFreq
tSnps$NRef = NA
tSnps$NAlt = NA
tSnps$pos = nAFreq$pos
tSnps[tSnps$pos %in% tAFreq$pos, ] = tAFreq[tAFreq$pos %in% tSnps$pos, ]
tAFreq = tSnps
missingIdx = is.na(tSnps$NAlt)
naInTumor = c("loci" = length(tAFreq$NAlt),
"missings" = length(tAFreq$NAlt[is.na(tSnps$NAlt)]))
tAFreq$NAlt[missingIdx] = 0.5
tAFreq$NRef[missingIdx] = 0.5
# 1. correct the different number of read mapped to ref than alt.
if(allelicMapability == T)
{
allelicMapability = 0.953
nAFreq$NAlt = (nAFreq$NAlt / allelicMapability)
tAFreq$NAlt = (tAFreq$NAlt / allelicMapability)
}
# 2. generated phased allelic depth
if(doflip == T)
{
flipSites = rules$flip1 == 1
nalt = nAFreq$NAlt [ flipSites]
nref = nAFreq$NRef [ flipSites]
nAFreq$NAlt [flipSites ] = nref
nAFreq$NRef [flipSites ] = nalt
nalt = tAFreq$NAlt [flipSites]
nref = tAFreq$NRef [flipSites]
tAFreq$NAlt [flipSites] = nref
tAFreq$NRef [flipSites] = nalt
}
# 3. heterogenous alts (normal.alt != tumor.alt)
# 1. alt_normal alt_tumor
# T A
# This is inconsist
#
# 2. alt_normal alt_tumor
# T G
# Consist, but the num of alt in tumor has to be zero
heteroAlt=c( "loci" = length(tAFreq$NAlt),
"heteroAlts" = 0)
#inconsis = ((tAFreq$alt != nAFreq$alt) & (tAFreq$NAlt > 4))
inconsis = ((tAFreq$alt != nAFreq$alt) & (tAFreq$alt != "."))
if(length(which(inconsis, T)))
{
numOfInconsis = length(which(inconsis, T))
# tAFreq = tAFreq[!inconsis,]
# nAFreq = nAFreq[!inconsis,]
heteroAlt = c("loci" = length(tAFreq$NAlt),
"heteroAlts" = numOfInconsis,
"inconsis" = inconsis)
}
if ( nrow(tAFreq) != nrow(nAFreq) ) stop(sprintf("Frequnencies from two samples are not well aligned : %d!=%d", nrow(tAFreq), nrow(nAFreq)) )
# if (preprocess == "mergeSite")
# {
# nAFreq <- merge(nAFreq[,c("NRef","NAlt")], len = 20)
# tAFreq <- merge(tAFreq[,c("NRef","NAlt")], len = 20)
# } else if (preprocess == "mergeDist")
# {
# nAFreq <- mergeByDist(nAFreq[,c("NRef","NAlt")], dist=10000)
# tAFreq <- mergeByDist(tAFreq[,c("NRef","NAlt")], dist=10000)
# }
nB <- nAFreq$NRef
nSum <- nAFreq$NRef + nAFreq$NAlt
tB <- tAFreq$NRef
tSum <- tAFreq$NRef + tAFreq$NAlt
names(tSum) = rownames(tAFreq)
names(nSum) = rownames(nAFreq)
names(tB) = rownames(tAFreq)
names(nB) = rownames(nAFreq)
minCount = nSum > 5
nB = nB [minCount]
nSum = nSum [minCount]
tB = tB [minCount]
tSum = tSum [minCount]
# 4. mix into 0.1% tumor into the sample, this way all tB > 0.
mixit=F
if (mixit == T)
{
tB = tB + 0.001 * nB
tSum = tSum + 0.001 * nSum
}
#
list(tSum=tSum, nSum=nSum, tB=tB, nB=nB, naInTumor = naInTumor, heteroAlt = heteroAlt)
}
maxll <- function(tab,genotypes, tc)
{
mll = 0
state = 0
res <- .C( "maxll_improv",
as.integer(as.vector(t(tab))),
as.integer(dim(tab)[1]),
as.integer(genotypes),
as.integer(length(genotypes)/2),
as.double(tc),
state = as.integer(state),
mll = as.numeric(mll),
PACKAGE = "sCNAphase"
)
res$mll
}
mergeByDistML<-function(tab,genotypes, tc=1, dist=10000)
{
fluctuation = c()
fullLabel = c()
pos <- as.numeric(rownames(tab))
sep = c(0)
length(sep) <- dim(tab)[1]
idx = 1
sep[idx] = 1
idx = idx +1
for(k in seq(2,length(pos),1))
{
if (pos[k] - pos[sep[idx -1]] > dist)
{
sep[idx] = k
idx = idx +1
}
}
if(idx < length(pos))
{
sep[idx] = length(pos)
idx = idx +1
}
depths = matrix(0, nrow = idx -2, ncol = dim(tab)[2])
rownames(depths) <- rep("",idx -2)
numDist = c()
inc=0
for(k in 2:(idx-1)){
start = sep[k -1]
end = sep[k]
if(dim(tab[start:end,,drop=F])[1] > 3)
{
inc = inc +1
res1 = apply(tab[start:end,,drop=F],2,sum)
fluctuation[inc] <- maxll(tab[start:end,,drop=F], tc=tc, genotypes) # lexically scoped
depths[inc,] = res1
rownames(depths)[inc] = toString(pos[start])
fullLabel[(inc *2 -1):(inc * 2)] <- rownames(tab) [c(start,end)]
}
}
names(fluctuation) <- rownames(depths[1:inc,])
colnames(depths) = colnames(tab)
list("fluctuation" = fluctuation,
"fullLabel" = fullLabel,
"depths" = data.frame(depths[1:inc,]))
}
# calculating the Maximum log-likelihood for different genotypes. Then use ML to determine if there is a state switching.
mergeUsingML <- function(tab, genotypes, tc=0.99, len=20, maxSeg=1000000)
{
sumUp <- function(seg) {
#print(tab[seg[1]:seg[2],,drop=F])
res1 = apply(tab[seg[1]:seg[2],,drop=F],2,sum)
as.integer(res1)
}
consistency <- function(seg, doCheck=T)
{
#print(str(tab[seg[1]:seg[2],,drop=F]))
#print(str(transform(tab[seg[1]:seg[2],,drop=F], nB=as.integer(nB), nSum=as.integer(nSum),
#tB=as.integer(tB), tSum=as.integer(tSum)) ))
#print(dim(as.integer()))
if(doCheck)
{
res1 = maxll(transform(tab[seg[1]:seg[2],,drop=F], nB=as.integer(nB),
nSum=as.integer(nSum),
tB=as.integer(tB),
tSum=as.integer(tSum)),
genotypes,tc)
}else
{
res1 = 0
}
#res1 = maxll(as.integer(tab[seg[1]:seg[2],,drop=F]), genotypes, tc)
res1
}
rep = ceiling(dim(tab)[1]/len)
res = matrix(0,nrow = rep,ncol = dim(tab)[2])
startAt = (c(1:rep)-1)*len +1
endAt = startAt +len
endAt[length(endAt)] = dim(tab)[1]
# filter sparse snp region
sel = as.numeric(rownames(tab)[endAt]) - as.numeric(rownames(tab)[startAt]) < maxSeg
#tab = tab[sel,,drop=F]
startAt = startAt[sel]
endAt = endAt[sel]
res=apply(cbind(startAt,endAt),1, sumUp)
res=t(res)
fluctuation=apply(cbind(startAt,endAt),1, consistency, doCheck = T)
fullLabel = cbind(rownames(tab)[startAt], rownames(tab)[endAt])
colnames(fullLabel) = c("start", "end")
colnames(res) = colnames(tab)
rownames(res) = rownames(tab)[startAt]
names(fluctuation) <- rownames(tab)[startAt]
depths = data.frame(res)
list("fluctuation" = fluctuation,
"fullLabel" = fullLabel,
"depths" = depths)
}
filtering <- function (data, thresh)
{
fluctuation = data[["fluctuation"]]
fullLabel = data[["fullLabel"]]
depths = data[["depths"]]
indexing <- function(idx) {as.numeric(names(idx))}
selection = fluctuation > thresh
#selection = fluctuation > thresh & indexing(fluctuation) > 88000000 & indexing(fluctuation) < 89000000
#dim(fullLabel) = c(2,length(fullLabel)/2)
#fullLabel = t(data.frame((fullLabel),row.names=c("start", "end")))
baf=depths$tB/depths$tSum
hlRegion = baf < 0.6 & baf > 0.4
col = rep(1, length(fluctuation))
col[hlRegion] = 2
plot(indexing(fluctuation), fluctuation,pch=4,col=col, main = "Log likelihood at regions",
xlab="position (bp)", ylim=c(-300,0))
#dev.new()
plot(indexing(fluctuation), depths$tB/depths$tSum, pch=4, col=col, ylim=c(0,1), main="BAF before filtering")
print("before filtering")
col = rep(2, length(fluctuation))
col[selection] = 1
depths = depths [selection, ]
fullLabel = fullLabel [selection, ]
fluctuation = fluctuation[selection ]
print(length(fluctuation))
plot(indexing(fluctuation), depths$tB/depths$tSum, pch=4, col=col[selection], ylim=c(0,1),
main="BAF after filtering", xlab="postion (bp)")
plot(indexing(fluctuation), depths$nB/depths$nSum, pch=4, col=col[selection], main = "normal BAF", xlab="postion (bp)", ylim=c(0,1))
#plot(indexing(fluctuation), depths$tSum, pch=4, col=col[selection], main = "normal RD", xlab="postion (bp)")
#plot(indexing(fluctuation), depths$nSum, pch=4, col=col[selection], main = "tumor RD", xlab="postion (bp)")
list("fluctuation" = fluctuation,
"fullLabel" = fullLabel,
"depths" = depths )
}
inferStates <- function (anaName, preprocess, chrID, runHMM = T, method = "Baum_Welch",
nB,nSum, tB, tSum,
genotypes, maxiter=5, fullLabel = fullLabel, DOA_range = c(1.0, 1.4),
underate = 1, tpmType = 1, reRun = T, mlen = 40)
{
dataFile = paste("res", anaName, preprocess,"chr", chrID,"dat", sep=".")
if ( !runHMM & file.exists(dataFile))
{
load(dataFile)
} else
{
allRes = list()
for(start in c(1:(length(DOA_range)-1)))
{
print(c(DOA_range[start], DOA_range[start +1]))
if(!exists("maxRes"))
{
maxRes = segHMM2(rbind(nB,nSum,tB,tSum),
length(genotypes)/2,
genotypes,
maxiter = maxiter,
DOA_range = c(DOA_range[start], DOA_range[start +1]),
method = method, underate=underate, tpmType = tpmType)
allRes[[start]] = maxRes
} else{
res= segHMM2(rbind(nB,nSum,tB,tSum),
length(genotypes)/2,
genotypes,
maxiter = maxiter,
DOA_range = c(DOA_range[start], DOA_range[start +1]),
method = method, underate=underate, tpmType = tpmType)
allRes[[start]] = res
# Do the inference for different DOA_ranges.
# If the change is bigger than 1000, accept the better range.
if(res$log.lik - maxRes$log.lik > 1) maxRes = res
}
}
}
allRes
}
runForPloidy <- function (anaName, preprocess, chrID, runHMM = T, method = "Baum_Welch",
nB,nSum, tB, tSum, maxRes,
genotypes, maxiter=5, fullLabel = fullLabel, DOA_range = c(1.0, 1.4),
underate = 1, tpmType = 1, reRun = T, mlen = 40, unmergedDepth)
{
mtB = tB
mnB = nB
mtSum = tSum
mnSum = nSum
#frequencyFile = paste(anaName, chrID, ".gw.dat", sep='.')
#load(frequencyFile)
startingPos = which(names(unmergedDepth$tB) %in% names(mtB))
#startingPos = startingPos[1:(length(startingPos)-1)]
genotypes = maxRes$genotypes
merge <- function(aa, mlen=2){max(tapply(aa, as.integer(((1:length(aa)) +1) / mlen), sum))}
print(genotypes)
maxCN = max(merge(genotypes, mlen=2))
CN = genotypes[maxRes$hidden.states*2-1] + genotypes[maxRes$hidden.states*2]
tab = data.frame(cbind(unmergedDepth$nB,
unmergedDepth$nSum,
unmergedDepth$tB,
unmergedDepth$tSum))
pos = as.numeric(names(mtB))
ml = sapply(startingPos,
function(starting, tab, genotypes, tc, mlen) {
ending = min(starting+mlen, nrow(tab))
maxll(tab[starting:ending, , drop=F],
genotypes = genotypes, tc = tc) },
tab = tab, genotypes=genotypes, tc = maxRes$tc, mlen = mlen)
#ml = c(ml,0)
clean = 0.10
# if the maxCN =12, the loci with cn > 6 will be defined as significantly AMP and filtered.
cnRange = c(1, maxCN/2)
selIncluded = rep(T, length(ml))
for(eachGeno in unique(maxRes$hidden.states))
{
aGenotype = maxRes$hidden.states == eachGeno
selIncluded[aGenotype] = ml[aGenotype] > quantile(ml[aGenotype], prob=clean, type=1)
}
depthClean = 0.01
outlayerBound = quantile(mtSum/mnSum, prob=1 - depthClean, type =1)
bafClean = 0.01
bafThresh = quantile(abs(mnB/mnSum - 0.5), prob = 1 - bafClean, type = 1)
print(sprintf("length of ml %d and mtB %d", length(ml), length(mtB)))
selIncluded = selIncluded & CN <= cnRange[2] &
CN >= cnRange[1] & (mtSum / mnSum) < outlayerBound &
abs(mnB/mnSum - 0.5) < bafThresh
print(sprintf("Downsampled %f from %d",
sum(selIncluded)/length(selIncluded),
length(selIncluded)))
nBMerged = c()
tBMerged = c()
nSumMerged = c()
tSumMerged = c()
mergeDegree = 6
sameState <- function(states) { sum (states != states[1]) }
for(idx in c(1:(sum(selIncluded)/mergeDegree)))
{
range = seq( (idx-1) * mergeDegree + 1, idx * mergeDegree )
if(sameState( maxRes$hidden.states[selIncluded][range]) == 0)
#if(maxRes$hidden.states[selIncluded][idx*2-1] == maxRes$hidden.states[selIncluded][idx*2])
{
nBMerged = c(nBMerged, sum(mnB [selIncluded][range]))
tBMerged = c(tBMerged, sum(mtB [selIncluded][range]))
nSumMerged = c(nSumMerged, sum(mnSum[selIncluded][range]))
tSumMerged = c(tSumMerged, sum(mtSum[selIncluded][range]))
}
}
likelihoods = c()
for(start in c(1:(length(DOA_range)-1)))
{
print(c(DOA_range[start], DOA_range[start +1]))
res= segHMM2(rbind(nBMerged, nSumMerged, tBMerged, tSumMerged),
length(genotypes)/2,
genotypes,
maxiter = maxiter,
DOA_range = c(DOA_range[start], DOA_range[start +1]),
method = method, underate=underate, tpmType = tpmType)
likelihoods = c(likelihoods, res$log.lik)
}
list(likelihoods=likelihoods, ml=ml, selIncluded = selIncluded)
}
loadBamRegions <- function (bamF, chrID, start,len, prefix, winSize = 10000, qc = 2)
{
forceRead = F
datFile = sprintf("%s.chr%s.readDepth.win.%d.dat", prefix, chrID, winSize)
if ( !forceRead & file.exists(datFile))
{
print(sprintf("loading chr%s from object", chrID) )
load(datFile)
} else
{
rDepths = rep(0, length(start))
rFailure = F
all <- .C("countReads",
as.character(bamF),
as.character(chrID),
as.integer(start),
as.integer(len),
as.integer(length(start)),
as.integer(qc),
rDepths = as.integer(rDepths),
rFailure = as.logical(rFailure))
if (all$rFailure) return(NA)
names(all$rDepths) = start
save(all, file=datFile)
}
all$rDepths
}
#============================================================================
# --------- FUNCTION plotMAF -------------------
# This is for plot of Major allele frequence with or without the LD profile.
# LDdata is a list of two vectors for coords and r-square.
#
plotMAF <- function(pos, baf, prange=1, LDdata="", interval=1, title="test", ceiling=1, ttp = 2)
{
beginAt = head(pos,1)
endAt = tail(pos,1)
if (length(interval) ==2 )
{
beginAt = interval[1]
endAt = interval[2]
}
range=beginAt < pos & pos < endAt
rbind(pos[range], baf[range])
if (ttp == 1)
{
plot(pos[range],baf[range],pch=20,col=3, xaxs ="i", ylim=c(0,1), xaxp=c(beginAt,endAt,20), las =2, ylab="MAF", xlab="")
#plot(pos[range],baf[range],pch=20,col=3, xaxs ="i", ylim=c(0,1))
if( is.list(LDdata) )
{
rspos=LDdata[[1]]
r=LDdata[[2]]
range=beginAt < rspos & rspos < endAt
plotBars(r[range], rspos[range], level=0, ct=2, ceiling=ceiling)
}
}
if (ttp == 2)
{
colScale = 250
col=colorRampPalette(brewer.pal(9,"YlOrRd"))(colScale)
if( is.list(LDdata) )
{
rspos = LDdata[[1]]
r = LDdata[[2]]
rspos = rspos[rspos%in%pos]
r = r[rspos%in%pos]
pos = pos[pos%in%rspos]
baf = baf[pos%in%rspos]
range2=beginAt < rspos & rspos < endAt
if(is.vector(prange))
{
print(prange)
plot(pos[range], baf[range], pch=20, col=col[as.integer(r[range] * colScale)],xlim=prange, ylim=c(0,1), xaxp=c(beginAt,endAt,20), las =2, ylab="MAF", xlab="", main=title)
}else
{
plot(pos[range], baf[range], pch=20, col=col[as.integer(r[range] * colScale)], ylim=c(0,1), xaxp=c(beginAt,endAt,20), las =2, ylab="MAF", xlab="", main=title)
}
} else {
print (length(pos[range]))
print (length(baf[range]))
print (length(r[range]))
plot(pos[range], baf[range], pch=20, ylim=c(0,1), xaxp=c(beginAt,endAt,20), las =2, ylab="MAF", xlab="", main=title)
}
}
}
#
# CN(A) CN(B) RCN Genotype
# 1 3 3 ABBB
# 1 2 2 ABB
# 1 1 1 AB
# 2 1 1/2 AAB
# 3 1 1/3 AAAB
#
# in which RCN is defined as
# CN(B)
# RCN = ──────────────
# CN(A)
#
#
# The posterier probabbility of RCN = x is give as :
#
#
# P(D | RCN = x) * P(RCN = x)
# P(RCN = x | D) = ───────────────────────────────
# sum [ P(D | RCN) * P(RCN) ]
#
#
#
# P(D | RCN = x) = P(Nfb, Nfa, Tfb, Tfa | RCN)
# => Nfb ~ Bino(Tfb + Tfa, P = f(c, RCN) )
# in which c is the cellular of tumor, aka the percentage of tumor in the mix.
# Nfb Nfb * RCN
# P = f(c, RCN) = (1 - c) ─────────── + c ────────────────────
# Nfa + Nfb Nfa + Nfb * RCN
# The prior distribution of RCN is treated as a uniform distribution.
# In this case, P(RCN = x) = 1/5
# {M} {MMMP} {MMP} {MP} {MPP} {MPPP} {P}
# res <- segHMM(baf, length(RCNs$RCN), c(0.999, 3/4, 2/3, 1/2, 1/3, 1/4, 0.001))
# Scaling is to bring down the emission probility.
# This cause problems when it is being too big,
# because the transition probility needs to be summed on the probility (not logProb).
# The sum will be 0, and an error occurs when divide 0.
# Fix_tc determine if tc need to be estimated. If T, tc will be used as an initial guess.
segHMM2 <- function(allelicDepth, k, genotypes, tc = 0.5, fix_tc = F, diag.prob = 0.9, maxiter = 10, eps = 0.0000001, DOA = 1.5, combineRdAd = FALSE, method="Baum_Welch", DOA_range=c(1, 1.5), underate=1, tpmType =1)
{
# Start the clock!
ptm <- proc.time()
effectSize = 5
k = length(genotypes)/2
diag.prob = 0.5
gamma <- matrix((1 - diag.prob) / (k - 1), k, k)
diag(gamma) <- diag.prob
CN = genotypes[c(1:k)*2 - 1] + genotypes[c(1:k)*2 ]
if(tpmType == 2)
{
print("Using copy number transition TPM")
diag.prob = 0.5
off.prob = (1 - diag.prob)/ ( length(unique(CN)) -1)
gamma <- matrix(diag.prob, k, k)
for (i in c(1:k))
{
for(j in 1:k)
{
cni = genotypes[2*i -1] + genotypes[2*i ]
cnj = genotypes[2*j -1] + genotypes[2*j ]
if(cni != cnj)
gamma[i,j] <- off.prob
}
}
}
if(tpmType == 3)
{
print("Using symetric states TPM")
# redundency defined as the states reverse complement. e.g. 1,0 - 0,1
numOfNonRedundent = 0
for (aCN in unique(CN))
{
num = sum(CN == aCN)
if(num %% 2 !=0) numOfNonRedundent = (num + 1) / 2 + numOfNonRedundent
else numOfNonRedundent = (num) / 2 + numOfNonRedundent
}
t = 30 # 12 times
diag.prob = t / (numOfNonRedundent -1 +t)
off.prob = (1 - diag.prob)/ ( numOfNonRedundent -1)
aSymPenalty = 0.1
for (i in c(1:k))
{
for(j in 1:k)
{
cni = genotypes[2*i -1] + genotypes[2*i ]
cnj = genotypes[2*j -1] + genotypes[2*j ]
if(cni > 3 && cni == cnj)
{
highCNPenalty = 0.90
}
else
{
highCNPenalty = 1
}
if(genotypes[2*i -1] == genotypes[2*j ] & genotypes[2*j -1] == genotypes[2*i ])
{
# higher quatile, the closer to off.prob
#gamma[i,j] <- (diag.prob - off.prob) / quantile + off.prob
#gamma[i,j] <- gamma[i,j] * highCNPenalty
#gamma[i,j] <- diag.prob * highCNPenalty * aSymPenalty
gamma[i,j] <- off.prob * 2
}
else
{
gamma[i,j] <- off.prob
}
if(i == j)
{
gamma[i,j] <- diag.prob * highCNPenalty
}
}
}
}
initP = as.double(rep(-log(k), k))
numobs = dim(allelicDepth)[2]
ifSigAmp = rep(0, numobs)
prob = rep(0.0, numobs * k)
res <-
.C(method,
as.integer(numobs),
as.integer(length(genotypes)/2),
depths = as.integer(as.vector(allelicDepth)),
genotypes = as.integer(genotypes),
tc = as.double(tc),
gamma = as.double(gamma),
pi = as.double(rep(-log(k), k)),
num.iter = as.integer(maxiter),
as.double(eps),
log.lik = double(1),
filtered.cond.probs = double(k * numobs),
hidden.states = integer(numobs),
prob = as.double(prob),
as.logical(fix_tc),
DOA=as.double(DOA),
DOA_range=as.double(DOA_range),
ifSigAmp = as.integer(ifSigAmp),
as.double(underate),
as.logical(combineRdAd),
#as.double(g_rcov),
PACKAGE = "sCNAphase"
)
res$hidden.states <- res$hidden.states + 1
res$gamma <- matrix(res$gamma, nr = k)
# Stop the clock
print(proc.time() - ptm)
res$depths = c()
res$filtered.cond.probs = c()
res
}
calcGC <- function(chrID, seqFile, winSize, pos)
{
print(length(pos))
range = cbind(pos - winSize, pos + winSize)
fa <- open(FaFile(seqFile))
selRange = GRanges(chrID, IRanges(range[,1], range[,2]) )
dna <- scanFa(fa, param=selRange)
gcContect = sapply(as.character(dna), function(seq){ (1-nchar(gsub("[GCgc]", "",seq)) / nchar(seq)) * 100})
gcContect
}
inferCNA <- function(anaName, nPrefix, tPrefix, chroms, doPhase = T, forceRead=F, maxCopyNum=11, mlen = 30, maxiter = 5, doFilter = F, ploidy = c(1.0, 1.4), fakeNormal = F, underate = 1, tpmType=1, runHMM = T, allelicMapability = F, reRun = T, method = "Baum_Welch", generateLog = T, runit = T)
{
preprocess="phased"
g_rcov = 0.5
chrID = "W"
#************************************************************************
# Possible genotypes
#************************************************************************
startPos = 0
gap = 100000 # in bps
label = c()
breakPoints = startPos
tB = c()
nB = c()
tSum = c()
nSum = c()
gcCombine = c()
depthCombine = c()
forceLoad = T
if(!forceLoad)
{
load(paste(anaName, chrID, ".gw.dat", sep='.'))
}else
{
for (AChrID in chroms)
{
res = loadFreq( ana = anaName,
doflip = doPhase,
AChrID,
merge="perSite",
nPrefix = nPrefix,
tPrefix = tPrefix,
forceRead = F,
allelicMapability = allelicMapability
)
tB = c(tB , res$tB )
nB = c(nB , res$nB )
tSum = c(tSum , res$tSum)
nSum = c(nSum , res$nSum)
notNA=!is.na(tB)
tB=tB[notNA]
tSum=tSum[notNA]
nB=nB[notNA]
nSum=nSum[notNA]
label = c(label, as.numeric(names(res$tB)) + startPos + gap)
startPos = max(label)
breakPoints = c(breakPoints, startPos)
if(fakeNormal)
{
baseDir = "/panfs/home/wenhanchen/work/DATA/humanAssembly/HG19/"
print(sprintf("%s/chr%d.fa",baseDir, AChrID))
xx = calcGC (AChrID, sprintf("%s/chr%d.fa",baseDir, AChrID), winSize = 500, pos = as.numeric(names(res$tB)) )
yy = res$tSum
gcCombine = c(gcCombine, xx)
depthCombine = c(depthCombine, yy)
}
}
names(tB) = (label)
names(nB) = (label)
names(tSum) = (label)
names(nSum) = (label)
if(fakeNormal)
{
fit2 <- lm(depthCombine~poly(gcCombine,7))
expectedDepth <- predict(fit2)
save(file="gc.dat", tB,tSum,nB,nSum, fit2, gcCombine, depthCombine)
pdf(file=sprintf("%s.gcEffect.pdf", anaName))
plot(gcCombine, predict(fit2), type="p", col="red", lwd=1)
dev.off()
print("Generating faked read depth that reflects the GC bias")
nB = nB/nSum * expectedDepth +1
nB = 0.5 * expectedDepth +1
nSum = expectedDepth +1
sel = nB <1 | nSum <1
nB[sel] = 0.5
nSum[sel]= 1
sel = gcCombine > 40 & gcCombine < 60
tB = tB [sel]
nB = nB [sel]
tSum = tSum [sel]
nSum = nSum [sel]
label = label[sel]
print("depth generated")
}
save(tB,nB,tSum,nSum, breakPoints, file=paste(anaName, chrID, ".gw.dat", sep='.'))
ADepth <- function(tB, tSum, nB, nSum, breakPoints)
{
me <- list(
tB= tB,
tSum = tSum,
nB = nB,
nSum = nSum,
breakPoints = breakPoints
)
## Set the name for the class
class(me) <- append(class(me),"ADepth")
return(me)
}
unmergedDepth = ADepth(tB, tSum, nB, nSum, breakPoints)
}
#************************************************************************
# Possible genotypes
#************************************************************************
genotypes = genGenotypes(maxCopyNum) # all the possible genotypes with copy number < maxCopyNum
genotypes = c(0, 0, genotypes) # adding doulbe deletion
#************************************************************************
# Merging (state shifting detection)
#************************************************************************
par(cex = 1.2)
par(cex.axis = 1)
par(cex.lab = 1)
outfile=paste(doPhase, anaName, chrID, preprocess, ".pdf", sep=".")
pdf(file=outfile, width=24, height=6, title=outfile)
#************************************************************************
# Merging (state shifting detection)
#************************************************************************
indexing<-function(idx) {as.numeric(names(idx))}
mergedData = mergeUsingML(data.frame(cbind(nB,nSum, tB, tSum)),
genotypes,
tc=0.99,
len = mlen) # Merge the allelic depth and mind the allelic depth consistency.
fluctuation = mergedData[["fluctuation"]]
fullLabel = mergedData[["fullLabel"]]
mergedDepth = mergedData[["depths"]]
tB = ceiling(mergedDepth$tB)
nB = ceiling(mergedDepth$nB)
tSum = ceiling(mergedDepth$tSum)
nSum = ceiling(mergedDepth$nSum)
names(tB) = names(fluctuation)
#************************************************************************
# Remove germ-line CNVs
#************************************************************************
removeCNVs = T
if(removeCNVs)
{
cleanBAF = 0.01
bafThresh = quantile(abs(nB/nSum - 0.5), prob = 1 - cleanBAF, type = 1)
selNonCNVs = abs(nB/nSum - 0.5) < bafThresh
tB = tB [selNonCNVs]
nB = nB [selNonCNVs]
tSum = tSum [selNonCNVs]
nSum = nSum [selNonCNVs]
fluctuation = fluctuation [selNonCNVs]
fullLabel = fullLabel[selNonCNVs,]
label = label[selNonCNVs]
}
#************************************************************************
# CN state prediction
#************************************************************************
if(runit == T)
{
allRes = inferStates (anaName, preprocess, chrID, method = method,
nB,nSum, tB, tSum,
genotypes, maxiter=maxiter, fullLabel = fullLabel, DOA_range = ploidy,
tpmType = tpmType,
underate = underate,
runHMM = runHMM,
mlen = mlen)
extract <- function(aList, field){ aList[field] }
logLiks = unlist( sapply(allRes, extract, field = "log.lik"))
maxRes = allRes [[ which(max(logLiks) == logLiks)[1] ]]
dataFile = paste("res", anaName, preprocess,"chr", chrID,"dat", sep=".")
label = as.numeric(names(tB))
} else {
dataFile = paste("res", anaName, preprocess,"chr", chrID,"dat", sep=".")
load(dataFile)
}
if(reRun)
{
likelihoodEst = runForPloidy (anaName, preprocess, chrID, method = method,
nB,nSum, tB, tSum,
maxRes = maxRes,
genotypes, maxiter=maxiter, fullLabel = fullLabel, DOA_range = ploidy,
tpmType = tpmType,
underate = underate,
runHMM = runHMM,
mlen = mlen, unmergedDepth = unmergedDepth)
likelihoods = likelihoodEst$likelihoods
ml = likelihoodEst$ml
selIncluded = likelihoodEst$selIncluded
# find the max
maxIdx = which(likelihoods == max(likelihoods))[1]
theDOA = allRes[[maxIdx]]$DOA
genotypes = maxRes$genotypes
res = segHMM2(rbind(nB[selIncluded], nSum[selIncluded], tB[selIncluded], tSum[selIncluded]),
length(genotypes)/2,
genotypes,
maxiter = 1,
DOA_range = c(theDOA, theDOA + 0.05),
method = method, underate=underate, tpmType = tpmType)
similiarity <- function(aRes, tStates ) {sum(aRes$hidden.states[selIncluded] == tStates)}
trueMaxIdx = which.max(unlist(lapply(allRes, similiarity, tStates = res$hidden.states)))
maxRes = allRes[[trueMaxIdx]]
save(unmergedDepth, chroms, nB,nSum, tB, tSum, label, fullLabel, allRes, mlen, maxRes,
gap, nPrefix, tPrefix, likelihoods, ml, selIncluded, file=dataFile)
} else
{
save(unmergedDepth, chroms, nB,nSum, tB, tSum, label, fullLabel, allRes,
mlen, maxRes, gap, nPrefix, tPrefix, file=dataFile)
}
cat(paste ("Degree of Amplification : ", maxRes$DOA, sep=":"))
cat(paste ("Predict tumor cellularity : ", maxRes$tc, maxRes$log.lik, sep=":"))
if(generateLog)
{
plotPred (fluctuation, nB, nSum, tB, tSum,
fullLabel, genotypes, maxRes, len=mlen, chrID=chrID)
depths = rbind(nB,nSum,tB,tSum)
T = dim(depths)[2]
prob_RD=rep(0.1, T)
prob_AD=rep(0.1, T)
geno = maxRes$genotypes
cnStates = maxRes$hidden.states
cns = geno[maxRes$hidden.states*2-1] + geno[maxRes$hidden.states*2]
tc = maxRes$tc
DOA = maxRes$DOA
res <- .C("emissionDist_Debug",
depths = as.integer(as.vector(depths)),
T = as.integer(T),
geno = as.integer(geno),
k = as.integer( length(geno)/2),
cnStates = as.integer(cnStates),
tc = as.double(tc),
DOA = as.double(DOA),
prob_RD = as.double(prob_RD),
prob_AD = as.double(prob_AD),
underate = underate,
PACKAGE="sCNAphase"
)
col = rep(1, length(cns))
col [cns>8] = 2
plot(res$prob_AD, pch = 4, col = col)
plot(res$prob_RD, pch = 4, col = col)
save(file="prob.dat", res)
}
dev.off()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.