#This function takes the cancer and normal variants, and the coverage analysis as input
#It tries to find the germline het variants for each sample.
#It then clusters regions of the genome that have similar het frequencies and coverage.
#It then calls the CN of each clustered region, as well as clonality, uncertainty estimate and p-value
#for normal CN.
#returns a list of data frames with each region of the genome on a row.
callCNVs = function(variants, fitS, SNPs, samples, individuals, normals, Rdirectory, plotDirectory, genome='hg19', cpus=1, forceRedoCNV=F, correctReferenceBias=T, mode='exome', ploidyPriors=NULL) {
clustersSaveFile = paste0(Rdirectory, '/clusters.Rdata')
if ( file.exists(clustersSaveFile) & !forceRedoCNV ) {
catLog('Loading saved CNV results.\n')
load(file=clustersSaveFile)
return(clusters)
}
#some capture regions can name regions very far apart the same name, such as '-', which will be treated as a
#single gene, messing up resolution. So remove any gene-regions larger than 5Mbp.
#If this causes problems, rename the capture regions to avoid equal names at long distance.
fitS = subsetFit(fitS, rows = abs(fitS$x2 - fitS$x1) < 5e6)
#identify cancer-normal pairs, check which cancers have normals from the same individual
correspondingNormal = findCorrespondingNormal(samples, individuals, normals)
#run cnv on the samples, using cancer-normal where available
clusters = lapply(samples, function(sample) {
catLog('\nCalling CNVs for ', sample, '.\n', sep='')
ploidyPrior = NULL
if ( inherits(ploidyPriors, 'numeric') & sample %in% names(ploidyPriors) ) {
ploidyPrior = ploidyPriors[sample]
catLog('Using prior bias towards ploidy of ', ploidyPrior, '.\n', sep='')
}
if ( !is.na(correspondingNormal[sample]) ) {
catLog('Using', correspondingNormal[sample], 'as matched normal.\n')
return(callCancerNormalCNVs(cancerVariants=variants$variants[[sample]],
normalVariants=variants$variants[[correspondingNormal[sample]]],
fit = subsetFit(fitS, cols=paste0(sample, '-normal')),
plotDirectory, sample, individuals, SNPs,
genome=genome, cpus=cpus,
correctReferenceBias=correctReferenceBias,
mode=mode, ploidyPrior=ploidyPrior))
}
else
return(callCancerNormalCNVs(cancerVariants=variants$variants[[sample]],
normalVariants=FALSE,
fit = subsetFit(fitS, cols=paste0(sample, '-normal')),
plotDirectory, sample, individuals, SNPs,
genome=genome, cpus=cpus,
correctReferenceBias=correctReferenceBias,
mode=mode, ploidyPrior=ploidyPrior))
})
names(clusters) = samples
save(clusters, file=clustersSaveFile)
return(clusters)
}
#the high level function that controls the steps of the CNV calling for given sample and normal variant objects.
callCancerNormalCNVs = function(cancerVariants, normalVariants, fit, plotDirectory, name, individuals, SNPs, genome='hg19', cpus=1, correctReferenceBias=T, mode='exome', ploidyPrior=NULL) {
#select good germline het variants from normals:
if ( inherits(normalVariants, 'logical') )
use = selectGermlineHetsFromCancer(cancerVariants, fit$sex, SNPs, genome, cpus=cpus)
else
use = selectGermlineHets(normalVariants, fit$sex, SNPs, genome, cpus=cpus)
is = rownames(cancerVariants) %in% use
cancerVariants = cancerVariants[is,]
#summarise by capture region
catLog('Summarising capture regions..')
mC = getMaxCov()
catLog('using effective coverage capped at ', mC, '..', sep='')
d = cancerVariants$cov
effectiveCov = round(d*(1 + d/mC)/(1 + d/mC + d^2/mC^2))
effectiveVar = round(cancerVariants$var/cancerVariants$cov*effectiveCov)
effectiveFreqs = data.frame(var=mirrorDown(effectiveVar, cov=effectiveCov),
cov=effectiveCov, x=cancerVariants$x)
effectiveFreqs = effectiveFreqs[effectiveFreqs$cov > 0,]
cancerCR = unifyCaptureRegions(effectiveFreqs, fit, cpus=cpus)
catLog('done!\n')
#correct width if variance is underestimated
diagnosticPlotsDirectory = paste0(plotDirectory, '/diagnostics')
if ( !file.exists(diagnosticPlotsDirectory) ) dir.create(diagnosticPlotsDirectory)
boostDirectory = paste0(diagnosticPlotsDirectory, '/varianceBoost')
if ( !file.exists(boostDirectory) ) dir.create(boostDirectory)
boostFile = paste0(boostDirectory, '/', name, '.pdf')
catLog('Plotting boost to ', boostFile, '.\n')
pdf(boostFile, width=14, height=7)
if ( mode == 'RNA' ) cancerCR = getCorrectedCR(cancerCR, fit, plot=T)
else cancerCR = boostCRwidth(cancerCR, plot=T)
dev.off()
#run clustering algorithm
cancerCluster = mergeChromosomes(cancerCR, effectiveFreqs, genome=genome, cpus=cpus)
#run post processing, such as correcting normalisation from AB regions, calling CNVs and clonalities.
catLog('Postprocessing...')
diagnosticPlotsDirectory = paste0(plotDirectory, '/diagnostics')
ploidyDirectory = paste0(diagnosticPlotsDirectory, '/ploidy')
if ( !file.exists(ploidyDirectory) ) dir.create(ploidyDirectory)
plotFile = paste0(ploidyDirectory, '/', name, '.png')
catLog('Plotting to ', plotFile, '.\n')
png(plotFile, width=10, height=7, res=150, units='in')
post = superFreq:::postProcess(cancerCluster, cancerCR, effectiveFreqs, plotDirectory, name, genome, cpus=cpus, ploidyPrior=ploidyPrior)
dev.off()
cancerCluster = post$clusters
catLog('found total of', sum(cancerCluster$call != 'AB' & !grepl('\\?', cancerCluster$call)), 'CNVs..')
cancerCR$M = cancerCR$M + post$shift
#plot diagnostics for the calls
diagnosticPlotsDirectory = paste0(plotDirectory, '/diagnostics')
CNVcallDirectory = paste0(diagnosticPlotsDirectory, '/CNVcall')
if ( !file.exists(CNVcallDirectory) ) dir.create(CNVcallDirectory)
plotFile = paste0(CNVcallDirectory, '/', name, '.pdf')
pdf(plotFile, width=20, height=10)
plotMAFLFC(cancerCluster)
dev.off()
#return clustered regions with calls, as well as the raw capture region data.
catLog('done!\n')
effectiveFreqs = data.frame(var=effectiveVar, cov=effectiveCov, x=cancerVariants$x)
return(list(clusters=cancerCluster, CR=cancerCR,
eFreqs=effectiveFreqs))
}
#helper function that selects germline het SNPs in the presence of a normal sample from the same individual.
selectGermlineHets = function(normalVariants, sex, SNPs, genome, minCoverage = 10, cpus=1) {
#only bother with variants that have enough coverage so that we can actually see a change in frequency
catLog('Taking variants with minimum coverage of', minCoverage, '...')
decentCoverage = normalVariants$cov >= minCoverage
use = rownames(normalVariants)[decentCoverage]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(decentCoverage), 'variants.\n')
if ( length(use) == 0 ) return(use)
#only use dbSNPs
catLog('Taking variants in valideted dbSNP or unfiltered exac positions...')
isDB = normalVariants$db & normalVariants$dbValidated
isExac = rep(FALSE, length(isDB))
if ( 'exac' %in% names(normalVariants) )
isExac = normalVariants$exac & normalVariants$exacFilter == 'PASS'
use = use[isDB | isExac]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(isDB), 'variants.\n')
if ( length(use) == 0 ) return(use)
#dont use insertions or deletions, too error-prone.
catLog('Removing indels..')
indel = grepl('[+-]', normalVariants$variant)
use = use[!indel]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(!indel), 'variants.\n')
#pick het in matching normal sample
catLog('Taking variants that are het in the normal sample..')
normalF = normalVariants$var/normalVariants$cov
normalHet = pBinom(normalVariants$cov, normalVariants$var, refBias(0.5)) > 0.1 & abs(normalF-0.5) < 0.15 & normalVariants$flag == ''
use = use[normalHet]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(normalHet), 'variants.\n')
if ( length(use) == 0 ) return(use)
#again to filter out noisy variants, filter on the RIB statistic in the cancer sample
catLog('Taking variants that have low expected rate of incorrect basecalls in the normal sample..')
highNormalRIB = normalVariants$RIB > 0.1
use = use[!highNormalRIB]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(!highNormalRIB), 'variants.\n')
#be extra picky with the mapping and base quality
catLog('Taking variants that have good mapping and base quality in the normal sample..')
highQ = normalVariants$pbq > 0.001 & normalVariants$pmq > 0.001
use = use[highQ]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(highQ), 'variants.\n')
#Restrict to validated dbSNPs with population frequency > 1%
catLog('Restrict to SNPs with population frequency > 1%...')
isFrequentDb = normalVariants$db & normalVariants$dbValidated & !is.na(normalVariants$dbValidated) &
normalVariants$dbMAF > 0.01 & !is.na(normalVariants$dbMAF)
isFrequentExac = rep(FALSE, length(isFrequentDb))
if ( 'exac' %in% names(normalVariants) )
isFrequentExac = normalVariants$exac & normalVariants$exacFilter == 'PASS' & !is.na(normalVariants$exacFilter) &
normalVariants$exacAF > 0.01 & !is.na(normalVariants$exacAF)
use = use[isFrequentDb | isFrequentExac]
normalVariants = normalVariants[use,]
catLog('done! Got', sum(isFrequentDb | isFrequentExac), 'variants.\n')
#remove variants from male X and Y, and female Y-chromsomes
if ( sex == 'male' )
catLog('Removing variants in male X and Y...')
else
catLog('Removing variants in female Y...')
notHet = xToChr(normalVariants$x, genome) %in% ifelse(sex=='male', c('X','Y'), 'Y')
use = rownames(normalVariants)[!notHet]
normalVariants = normalVariants[use,]
extrapolatedFalse = round(sum(chrLengths(genome))/sum(chrLengths(genome)[ifelse(sex=='male', c('X','Y'), 'Y')])*sum(notHet))
catLog('done! Discarded', sum(notHet), 'variants. This naively extrapolates to', extrapolatedFalse, 'false SNPs genomewide.\n')
if ( length(use) == 0 ) return(use)
return(use)
}
#helper function that selects germline het SNPs in the absence of a normal sample from the same individual.
selectGermlineHetsFromCancer = function(cancerVariants, sex, SNPs, genome, minCoverage = 10, cpus=1) {
#only bother with variants that have enough coverage so that we can actually see a change in frequency
catLog('Taking variants with minimum coverage of', minCoverage, '...')
decentCoverage = cancerVariants$cov >= minCoverage
use = rownames(cancerVariants)[decentCoverage]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(decentCoverage), 'variants.\n')
#only use dbSNPs
catLog('Taking variants in valideted dbSNP or unfiltered exac positions...')
isDB = cancerVariants$db & cancerVariants$dbValidated
isExac = rep(FALSE, length(isDB))
if ( 'exac' %in% names(cancerVariants) )
isExac = cancerVariants$exac & cancerVariants$exacFilter == 'PASS'
use = use[isDB | isExac]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(isDB), 'variants.\n')
#dont use insertions or deletions, too error-prone.
catLog('Removing indels..')
indel = grepl('[+-]', cancerVariants$variant)
use = use[!indel]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(!indel), 'variants.\n')
#again to filter out noisy variants, filter on the RIB statistic in the cancer sample
catLog('Taking variants that have low expected rate of incorrect basecalls in the sample..')
highCancerRIB = cancerVariants$RIB > 0.01
use = use[!highCancerRIB]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(!highCancerRIB), 'variants.\n')
#be extra picky with the mapping and base quality
catLog('Taking variants that have good mapping and base quality..')
highQ = cancerVariants$pbq > 0.001 & cancerVariants$pmq > 0.001
use = use[highQ]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(highQ), 'variants.\n')
#Restrict to validated dbSNPs with population frequency > 1% in dbSNP and ExAC
catLog('Restrict to SNPs with population frequency > 1%...')
isFrequentDb = cancerVariants$db & cancerVariants$dbValidated & !is.na(cancerVariants$dbValidated) &
cancerVariants$dbMAF > 0.01 & !is.na(cancerVariants$dbMAF)
isFrequentExac = rep(FALSE, length(isFrequentDb))
if ( 'exac' %in% names(cancerVariants) )
isFrequentExac = cancerVariants$exac & cancerVariants$exacFilter == 'PASS' & !is.na(cancerVariants$exacFilter) &
cancerVariants$exacAF > 0.01 & !is.na(cancerVariants$exacAF)
use = use[isFrequentDb | isFrequentExac]
cancerVariants = cancerVariants[use,]
catLog('done! Got', sum(isFrequentDb | isFrequentExac), 'variants.\n')
#before filtering homozygous, look for stretches of no het, which indicates clonal LOH.
catLog('Checking for regions with depletion of heterozygous SNPs...')
LOHhets = checkForMissedLOH(cancerVariants, genome)
catLog('done: Protecting ', length(LOHhets), ' variants above 95% VAF as germline heterozygous.\n', sep='')
#require support from at least 2 reads for both alleles
catLog('Taking cancer variants that have support from at least 2 reads on both alleles..')
cancerF = cancerVariants$var/cancerVariants$cov
supported = cancerVariants$var > 1 & cancerVariants$var < cancerVariants$cov-1
use = use[supported | rownames(cancerVariants) %in% LOHhets]
cancerVariants = cancerVariants[use,]
catLog('done! Retaining', sum(supported), 'variants.\n')
#take variants between 5% and 95%
catLog('Taking unflagged cancer variants that have frequency above 5% and below 95%..')
cancerF = cancerVariants$var/cancerVariants$cov
normalHet = abs(cancerF-0.5) < 0.45 & cancerVariants$flag == ''
use = use[normalHet | rownames(cancerVariants) %in% LOHhets]
cancerVariants = cancerVariants[use,]
catLog('done! Retaining', sum(normalHet), 'variants.\n')
#remove variants from male X and Y, and female Y-chromsomes
if ( sex == 'male' )
catLog('Removing variants in male X and Y...')
else
catLog('Removing variants in female Y...')
notHet = xToChr(cancerVariants$x, genome) %in% ifelse(sex=='male', c('X','Y'), 'Y')
use = rownames(cancerVariants)[!notHet]
cancerVariants = cancerVariants[use,]
extrapolatedFalse = round(sum(chrLengths(genome))/sum(chrLengths(genome)[ifelse(sex=='male', c('X','Y'), 'Y')])*sum(notHet))
catLog('done! Discarded', sum(notHet), 'variants. This naively extrapolates to', extrapolatedFalse, 'false SNPs genomewide.\n')
if ( length(use) == 0 ) return(use)
return(use)
}
unifyCaptureRegions = function(eFreqs, fit, cpus=1) {
#group SNPs by capture region
x = eFreqs$x
snps = lapply(1:nrow(fit), function(row) which(x < fit$x2[row] & x > fit$x1[row]))
uniFreq = do.call(rbind, lapply(snps, function(is) c(sum(eFreqs$var[is]), sum(eFreqs$cov[is]))))
colnames(uniFreq) = c('var', 'cov')
pHet = rep(0.5, length(snps))
pHet[uniFreq[,'cov'] > 0] = unlist(mclapply(snps[uniFreq[,'cov'] > 0], function(is) fisherTest(pBinom(eFreqs$cov[is], eFreqs$var[is], refBias(0.5)))[2], mc.cores=cpus))
pAlt = rep(0.5, length(snps))
pAlt[uniFreq[,'cov'] > 0] = unlist(lapply(snps[uniFreq[,'cov'] > 0], function(is)
fisherTest(pTwoBinom(eFreqs$cov[is], eFreqs$var[is], sum(eFreqs$var[is])/sum(eFreqs$cov[is])))[2]))
odsHet = rep(0.5, length(snps))
odsHet[uniFreq[,'cov'] > 0] = unlist(lapply(snps[uniFreq[,'cov'] > 0], function(is) {
fAlt = refBias(alternativeFrequency(eFreqs[is,])['f'])
dpHet = dbinom(eFreqs$var[is], eFreqs$cov[is], refBias(0.5))
dpAlt = twoBinom(eFreqs$var[is], eFreqs$cov[is], fAlt, refBiasMirror(fAlt))
odsHet = dpHet/(dpHet+dpAlt)
return(fisherTest(odsHet)[2])
}))
Nsnps = sapply(snps, length)
#relWidth = sqrt(fit$df.total[1]/(fit$df.total[1]-2))
width = fit$coefficients[,1]/fit$t[,1]
#handle division by zero by giving it a very large width
#this will give a very low weight downstream and will effectively remove it from analysis
width[fit$t == 0] = 10
cR = data.frame(x1=fit$x1, x2=fit$x2, M=fit$coefficients[,1], width=width, df=fit$df.total,
uniFreq[,1:2], Nsnps=Nsnps, pHet=pHet, pAlt=pAlt, odsHet=odsHet)
return(cR)
}
#this function calculates the alternative frequency by maximising the likelihood.
#efs should be a data.frame with two entries "cov" and "var" giving the coverage
#and variance of the SNP. The SNPs should be mirrored down to below 50% frequency.
alternativeFrequency = function(efs, plot=F) {
f = (0:200)/200
rbf = refBias(f)
fps = sapply(1:nrow(efs), function(row) {
p=pBinom(rep(efs$cov[row], 201), rep(efs$var[row], 201), rbf)
return(p/sum(p))
})
MAFchi2 = apply(fps + fps[201:1,], 1, fisherTest)[1,]
minChi2 = which(MAFchi2 == min(MAFchi2))[1]
within1pc = which(MAFchi2 < min(MAFchi2)*1.01 & f <= 0.5)
error = max(1/200, max(abs(f[within1pc]-f[minChi2])), 1/sqrt(sum(efs$cov)))
if ( plot ) plot(f[1:101], MAFchi2[1:101], ylim=c(min(MAFchi2), min(MAFchi2)*2), pch=16, col=ifelse(1:101 %in% within1pc, 'red', 'black'))
return(c('f'=f[minChi2], 'ferr'=error))
}
#merges regions in each chromosome.
mergeChromosomes = function(cR, eFreqs, genome='hg19', cpus=1, ...) {
chrs = xToChr(cR$x1, genome=genome)
breakpoints = pmax(nrow(cR) - length(unique(chrs)), 1)
MHTcut = 1/breakpoints
catLog('Merging capture regions with same coverage and MAF: ')
clusters = mclapply(unique(chrs), function(chr) {
catLog(chr, '..', sep='')
ret = mergeRegions(cR[chrs == chr,], minScore=MHTcut, ...)
singleSNP = calledFromSingleSNP(ret, eFreqs)
if ( any(singleSNP) ) ret = mergeRegions(ret, force=singleSNP, minScore=MHTcut, ...)
ret$altStatErr = ret$nullStatErr = ret$altStat = ret$nullStat = ret$stat = ret$f = rep(NA, nrow(ret))
ret$postHet = rep(1, nrow(ret))
if ( any(ret$cov > 0) ) {
ret$f[ret$cov > 0] = refUnbias(ret$var[ret$cov>0]/ret$cov[ret$cov>0])
ret[ret$cov>0,] = redoHetCalculations(ret[ret$cov>0,], eFreqs, cpus=1)
ret = mergeRegions(ret, ...)
}
return(ret)
}, mc.cores=cpus, mc.preschedule=F)
#catch one of the more common places for out-of-memory spots.
if ( !all(sapply(clusters, class) == 'data.frame') ) {
catLog('\nERROR: Seems like some of the chromosome forks failed segmentation. This may (or may not) be caused by out-of-memory. Out-of-memory can be mitigated by decreasing cpus and rerunning.\n\n')
stop('Seems like some of the chromosome forks failed segmentation. This may (or may not) be caused by out-of-memory. Out-of-memory can be mitigated by decreasing cpus and rerunning.')
}
clusters = do.call(rbind, clusters)
catLog('done!\n')
return(clusters)
}
mergeRegions = function(cR, minScore = 0.05, plot=F, debug=F, force=NA) {
merged = c()
scores = c()
if ( debug ) Nloop = 0
cR = cR[order(cR$x1+cR$x2),]
pairScore = sameCNV(cR)
if ( !is.na(force[1]) & length(force) == length(pairScore) ) pairScore[force] = 1
while(dim(cR)[1] > 1) {
#find the pair of regions with the largest pairing probability
best = which(pairScore == max(pairScore))[1]
if ( debug && Nloop < 1 ) {
print (cR)
catLog('DEBUG: Merging pair at ', cR$x2[best], ' with score ', pairScore[best], ', ', nrow(cR), ' regions.\n', sep='')
plotCR(cR)
points(cR$x1[-1], pairScore, pch=4)
segments((cR$x1[best+1]+cR$x2[best])/2, 0, (cR$x1[best+1]+cR$x2[best])/2, 1)
a = scan()
if ( length(a) > 0 ) Nloop = a
else Nloop = 1
catLog('DEBUG: looping another ', Nloop, ' times.\n', sep='')
}
if ( debug ) Nloop = Nloop - 1
if ( plot & length(scores) == 0 ) {
plotCR(cR)
}
#break loop if no clusters that are sufficiently similar
if ( pairScore[best] < minScore ) break
#otherwise save the merged regions
merged = c(merged, best)
scores = c(scores, pairScore[best])
#merge the regions, using the provided weight
cR$x1[best] = min(cR$x1[best],cR$x1[best+1])
cR$x2[best] = max(cR$x2[best],cR$x2[best+1])
cR$var[best] = cR$var[best] + cR$var[best+1]
cR$cov[best] = cR$cov[best] + cR$cov[best+1]
cR$Nsnps[best] = cR$Nsnps[best] + cR$Nsnps[best+1]
cR$M[best] = (cR$M[best]/cR$width[best]^2 + cR$M[best+1]/cR$width[best+1]^2)/(1/cR$width[best]^2+1/cR$width[best+1]^2)
cR$width[best] = 1/sqrt(1/cR$width[best]^2 + 1/cR$width[best+1]^2)
cR$pHet[best] = if ( cR$cov[best] > 0 ) stoufferTest(c(cR$pHet[best], cR$pHet[best+1]), c(cR$cov[best], cR$cov[best+1]))[2] else 0.5
cR = cR[-(best+1),]
pairScore = pairScore[-best]
if ( best > 1 ) pairScore[best-1] = sameCNV(cR[(best-1):best,])
if ( best <= length(pairScore) ) pairScore[best] = sameCNV(cR[best:(best+1),])
}
if ( plot ) {
plotCR(cR)
layout(1)
}
return(cR)
}
calledFromSingleSNP = function(cR, eFreqs) {
first = 1:(nrow(cR)-1)
second = 2:nrow(cR)
x = eFreqs$x
snps = lapply(first, function(row) which(x < cR$x2[row] & x > cR$x1[row]))
singleSNP = sapply(snps, length) == 1 | sapply(snps, function(is)
max(c(-Inf, eFreqs$x[is])) - min(c(Inf, eFreqs$x[is])) < fragmentLength())
width = cR$width + 0*getSystematicVariance()
meanM = (cR$M[first]/width[first]^2 + cR$M[second]/width[second]^2)/(1/width[first]^2 + 1/width[second]^2)
MP1 = pt(-abs(cR$M[first] - meanM)/width[first], df = cR$df[first])
MP2 = pt(-abs(cR$M[second] - meanM)/width[second], df = cR$df[second])
pM = sapply(first, function(i) min(c(Inf, p.adjust(c(MP1[i], MP2[i]), method='fdr'))))
return(pM > 0.05 & singleSNP)
}
fragmentLength = function() {return(300)}
#takes clustered regions and post processes them by calling copy numbers and clonality.
postProcess = function(clusters, cRs, eFreqs, plotDirectory, name, genome='hg19', cpus=1, ploidyPrior=NULL, hasMatchedNormal=T) {
catLog('corrected frequencies..')
cf = correctedFrequency(clusters, eFreqs, cpus=cpus)
clusters$f = cf[,'f']
clusters$ferr = cf[,'ferr']
catLog('frequency probability..')
diagnosticPlotsDirectory = paste0(plotDirectory, '/diagnostics')
if ( !file.exists(diagnosticPlotsDirectory) ) dir.create(diagnosticPlotsDirectory)
diagnosticMAFDirectory = paste0(diagnosticPlotsDirectory, '/MAFstat/')
if ( !file.exists(diagnosticMAFDirectory) ) dir.create(diagnosticMAFDirectory)
plotFile = paste0(diagnosticMAFDirectory, name, '.pdf')
pdf(plotFile, width=15, height=7)
clusters = redoHetCalculations(clusters, eFreqs, cpus=cpus, plot=T)
dev.off()
catLog('renormalising..')
if ( exists('.doSuperFreqPloidyManually', envir= .GlobalEnv) && get('.doSuperFreqPloidyManually', envir= .GlobalEnv) )
renorm = findShiftManually(clusters, cpus=cpus, plot=F, plotPloidy=T)
else
renorm = findShift(clusters, cpus=cpus, plot=F, plotPloidy=T, ploidyPrior=ploidyPrior)
shift = renorm$M[1] - clusters$M[1]
clusters = renorm
catLog('call CNVs..')
clusters = addCall(clusters, eFreqs)
clusters = fixFalseSNPcall(clusters, eFreqs)
sCAN = sameCallAsNeighbour(clusters, genome)
if ( any(sCAN) ) {
catLog('found', sum(sCAN),'neighbouring regions with same call: merge and redo postprocessing!\n')
clusters = forceMerge(clusters, sCAN, genome)
ret = postProcess(clusters, cRs, eFreqs, plotDirectory, name, genome=genome, cpus=cpus, ploidyPrior=ploidyPrior)
ret$shift = ret$shift + shift
return(ret)
}
catLog('find subclones..')
clusters = findSubclones(clusters)
rownames(clusters) = make.names(paste0('chr', xToChr(clusters$x1, genome=genome)), unique=T)
clusters = clusters[order(clusters$x1),]
return(list(clusters=clusters, shift=shift))
}
fixFalseSNPcall = function(clusters, eFreqs) {
#isAA = gsub('\\?', '', clusters$call) == 'AA'
lfcDist = abs(clusters$M[2:nrow(clusters)] - clusters$M[1:(nrow(clusters)-1)])/
sqrt(clusters$width[2:nrow(clusters)]^2 + clusters$width[1:(nrow(clusters)-1)]^2)
basedOnSNPs = c(F, lfcDist < 1) | c(lfcDist < 1, F)
if ( nrow(clusters) == 1 ) basedOnSNPs = F
smallRegion = clusters$x2 - clusters$x1 < 1e7
x = eFreqs$x
snps = lapply(1:nrow(clusters), function(row) which(x < clusters$x2[row] & x > clusters$x1[row]))
basedOnFewSNPs = sapply(snps, length) < 5
basedOnSmallRegion = sapply(snps, function(is) length(is) > 0 && max(eFreqs$x[is]) - min(eFreqs$x[is]) < 1e5)
inconsistentEvidence = clusters$sigma > 2
falseCall = basedOnSNPs & smallRegion & (basedOnFewSNPs | basedOnSmallRegion | inconsistentEvidence)
if ( any(falseCall) ) {
catLog('found', sum(falseCall), 'AA calls from false SNPs..')
isab = sapply(which(falseCall), function(i) unlist(isAB(clusters[i,], eFreqs[snps[[i]],])))
clusters[falseCall,]$sigma = isab[2,]
clusters[falseCall,]$pCall = clusters[falseCall,]$pHet
clusters[falseCall,]$clonality = 1
clusters[falseCall,]$clonalityError = 0
if ( 'subclonality' %in% names(clusters) ) clusters[falseCall,]$subclonality = 1
if ( 'subclonalityError' %in% names(clusters) ) clusters[falseCall,]$subclonalityError = 0
clusters[falseCall,]$call = ifelse(isab[2,] > 10, 'AB??', ifelse(isab[2,] > 5, 'AB?', 'AB'))
}
return(clusters)
}
sameCallAsNeighbour = function(clusters, genome) {
if ( nrow(clusters) == 1 ) return(F)
chr = xToChr(clusters$x1, genome)
first = 1:(nrow(clusters)-1)
second = 2:nrow(clusters)
sameCall = chr[first] == chr[second] &
gsub('\\?','',clusters$call[first]) == gsub('\\?','',clusters$call[second]) &
abs(clusters$clonality[first] - clusters$clonality[second]) <=
sqrt(clusters$clonalityError[first]^2 + clusters$clonalityError[second]^2)*2
return(sameCall)
}
forceMerge = function(clusters, toMerge, genome) {
chrs = xToChr(clusters$x1, genome)
redoChrs = unique(chrs[which(toMerge)])
newClusters = list()
for ( chr in redoChrs ) {
is = which(chrs == chr)
newClusters[[chr]] = mergeRegions(clusters[is,], force=toMerge[is][-length(is)])
}
clusters = clusters[!(chrs %in% redoChrs),]
for ( chr in redoChrs ) {
clusters = rbind(clusters, newClusters[[chr]])
}
clusters = clusters[order(clusters$x1),]
return(clusters)
}
correctedFrequency = function(clusters, eFreqs, maxCov = getMaxCov(), cpus=1) {
ret = do.call(rbind, mclapply(1:nrow(clusters), function(row) {
efs = eFreqs[eFreqs$x > clusters$x1[row] & eFreqs$x < clusters$x2[row],]
if ( nrow(efs) == 0 ) return(c('f'=NA, 'ferr'=NA))
return(alternativeFrequency(efs))
}, mc.cores=cpus))
return(ret)
}
getMaxCov = function() {
if ( !exists('.maxCov', envir = .GlobalEnv) ) {
assign('.maxCov', value=0.02, envir = .GlobalEnv)
warnings('setting maxCov to default 150')
}
return(max(1, get('.maxCov', envir = .GlobalEnv)))
}
#helper function that calculates the probability that a region has 50% frequency.
redoHetCalculations = function(clusters, eFreqs, plot=F, cpus=1) {
x = eFreqs$x
snps = lapply(1:nrow(clusters), function(row) which(x < clusters$x2[row] & x > clusters$x1[row]))
Z = matrix(rep(0, length(snps)*5), nrow=5)
Zfreq = mclapply(snps[clusters$cov > 0], function(is) {
efs = eFreqs[is,]
fAlt = max(0.001, refBias(alternativeFrequency(efs)['f']))
#z score for each SNP
z = sapply(1:length(efs$cov), function(i)
whichDistribution(y1 = twoBinom(0:floor(efs$cov[i]*refBias(0.5)), efs$cov[i], refBias(0.5), refBias(0.5)),
y2 = twoBinom(0:floor(efs$cov[i]*refBias(0.5)), efs$cov[i], fAlt, refBiasMirror(fAlt)),
x = min(floor(efs$cov[i]*refBias(0.5)), efs$var[i])+1))
ret = sum(z[2,])/sum(z[1,])
meanRetNull = sum(z[3,])/sum(z[5,])
errorRetNull = sqrt(sum(z[7,]-z[3,]^2))/sum(z[5,])
meanRetAlt = sum(z[4,])/sum(z[6,])
errorRetAlt = sqrt(sum(z[8,]) - sum(z[4,]^2))/sum(z[6,])
if ( sum(z[1,]) == 0 ) return(rep(0,5))
return(c(ret, meanRetNull, meanRetAlt, errorRetNull, errorRetAlt))
}, mc.cores=cpus)
Zfreq = do.call(cbind, Zfreq)
Z[,clusters$cov > 0] = Zfreq
if ( plot ) {
plot(Z[1,], ylim=c(-0.3,0.3), type='n', xlab='region', ylab='mean log likelihood ratio')
segments(0,0,1000,0)
x = 1:ncol(Z)
points(x-0.05, Z[2,], pch=16, col='blue')
segments(x-0.05, Z[2,]-Z[4,], x-0.05, Z[2,]+Z[4,], pch=16, col='blue')
points(x+0.05, Z[3,], pch=16, col='red')
segments(x+0.05, Z[3,]-Z[5,], x+0.05, Z[3,]+Z[5,], pch=16, col='red')
points(Z[1,], pch=4, cex=1.5, lwd=2)
legend('topright', c('expected from null', 'expected from CNA', 'measured'), col=c('blue', 'red', 'black'),
lwd = c(1,1,0.001), pt.lwd=c(1,1,2), pt.cex=c(1,1,1.5), pch=c(16,16,4), bg='white')
plot(Z[1,], ylim=c(min(-0.3, Z[1:3,], na.rm=T), max(0.3, Z[1:3,], na.rm=T)), type='n', xlab='region', ylab='mean log likelihood ratio')
segments(0,0,1000,0)
x = 1:ncol(Z)
points(x-0.05, Z[2,], pch=16, col='blue')
segments(x-0.05, Z[2,]-Z[4,], x-0.05, Z[2,]+Z[4,], pch=16, col='blue')
points(x+0.05, Z[3,], pch=16, col='red')
segments(x+0.05, Z[3,]-Z[5,], x+0.05, Z[3,]+Z[5,], pch=16, col='red')
points(Z[1,], pch=4, cex=1.5, lwd=2)
legend('topright', c('expected from null', 'expected from CNA', 'measured'), col=c('blue', 'red', 'black'),
lwd = c(1,1,0.001), pt.lwd=c(1,1,2), pt.cex=c(1,1,1.5), pch=c(16,16,4), bg='white')
}
pAlt = pnorm(-Z[1,], -Z[3,], Z[5,])
pAlt[is.nan(pAlt)] = 0.5
pHet = pnorm(Z[1,], Z[2,], Z[4,])
pHet = pmax(pHet, 10^(pmin(0,Z[1,])*10)) #if the stat signal is small, het is always likely
pHet[is.nan(pHet)] = 0.5
pHet[pAlt == 0 & pHet == 0] = 1e-10
clusters$pHet = pHet
clusters$pAlt = pAlt
clusters$postHet = 0.5*pHet/(0.5*pHet+(1-0.5)*pAlt) #prior belief that 50% of the regions are normal
clusters$stat = Z[1,]
clusters$nullStat = Z[2,]
clusters$altStat = Z[3,]
clusters$nullStatErr = Z[4,]
clusters$altStatErr = Z[5,]
return(clusters)
}
#between two input function y1 and y2 on some common set (y1 and y2 must be same length)
#if you pick x, what is the likleyhood ratio that it came from y1, not y2? Also returns weight, indicating how
#different the functions are, ie how much power x has to differentiate the two functions.
whichDistribution = function(y1, y2, x) {
y1 = y1/sum(y1)
y2 = y2/sum(y2)
ord1 = order(y1)
y1 = y1[ord1]
y2 = y2[ord1]
x = which(ord1==x)
p1 = cumsum(y1)
ord2 = order(y2)
y1 = y1[ord2]
y2 = y2[ord2]
p1 = p1[ord2]
x = which(ord2==x)
p2 = cumsum(y2)
lods = log(y1/y2)
weight = ifelse(pmax(p1, p2) > 0.05, 1, 0)
retStat = weight*lods
meanStatNull = sum(y1*retStat)
meanWeightNull = sum(y1*weight)
meanStatAlt = sum(y2*retStat)
meanWeightAlt = sum(y2*weight)
meanStat2Null = sum(y1*retStat^2)
meanStat2Alt = sum(y2*retStat^2)
return(c('weight'=weight[x], 'stat'=retStat[x], 'meanStatNull'=meanStatNull, 'meanStatAlt'=meanStatAlt,
'meanWeightNull'=meanWeightNull, 'meanWeightAlt'=meanWeightAlt,
'meanStat2Null'=meanStat2Null, 'meanStat2Alt'=meanStat2Alt))
}
#this functions estimates the variance of total statistic due to the addition of this observation.
#to be combined into a single variance later on.
meanDistribution = function(y1, y2, Wnull, Walt, totNull, totAlt) {
y1 = y1/sum(y1)
y2 = y2/sum(y2)
ord1 = order(y1)
y1 = y1[ord1]
y2 = y2[ord1]
p1 = cumsum(y1)
ord2 = order(y2)
y1 = y1[ord2]
y2 = y2[ord2]
p1 = p1[ord2]
p2 = cumsum(y2)
weight = pmax(p1, p2)
stat = log(y1/y2)
meanStatNull = sum(y1*weight*log(y1/y2))
meanStatAlt = sum(y2*weight*log(y1/y2))
totStatNull = (y1*weight*log(y1/y2) + tot)/(y1*weight + W)
varTotNull = mean(totStatNull^2) - mean(totStatNull)^2
mean(y1^2*weight*log(y1/y2)^2) - mean(y1*weight*log(y1/y2))^2
varRetAlt = mean(y2^2*weight*log(y1/y2)^2) - mean(y2*weight*log(y1/y2))^2
return(c('weight'=weight, 'stat'=retStat, 'meanStatNull'=meanRetNull, 'meanStatAlt'=meanRetAlt,
'varStatNull'=varRetNull, 'varStatAlt'=varRetAlt))
}
#remormalise the coverage so that the AB calls are average of 0.
normaliseCoverageToHets = function(clusters) {
is = which(clusters$call == 'AB' & abs(clusters$M) < 0.3)
if ( length(is) > 0 )
meanM = sum((clusters$M/clusters$width^2)[is])/sum(1/clusters$width[is]^2)
else meanM = 0
catLog('Shifting overall normalisation to correct for a mean LFC of', meanM, 'in AB calls.\n')
clusters$M = clusters$M - meanM
return(list(clusters=clusters, meanM=meanM))
}
#calls the allelic copy number and clonality of the region.
addCall = function(clusters, eFreqs) {
catLog('Calling CNVs in clustered regions..')
for ( row in 1:nrow(clusters) ) {
efs = eFreqs[eFreqs$x > clusters$x1[row] & eFreqs$x < clusters$x2[row],]
isab = isAB(clusters[row,], efs, sigmaCut=2)
clusters$call[row] = 'AB'
clusters$clonality[row] = 1
clusters$clonalityError[row] = 0
clusters$sigma[row] = isab$sigma
clusters$pCall[row] = clusters$postHet[row]
if ( !(isab$call) ) {
for ( tryCall in allCalls() ) {
iscnv = superFreq:::isCNV(clusters[row,], efs, superFreq:::callTofM(tryCall)['M'], superFreq:::callTofM(tryCall)['f'],
superFreq:::callPrior(tryCall), sigmaCut=max(2, clusters$sigma[row]))
if ( (iscnv$call & clusters$sigma[row] > 2) |
(iscnv$call & (iscnv$clonality > clusters$clonality[row] | (iscnv$clonality == clusters$clonality[row] & iscnv$sigma < clusters$sigma[row]))) ) {
clusters$clonality[row] = iscnv$clonality
clusters$clonalityError[row] = iscnv$clonalityError
clusters$sigma[row] = iscnv$sigma
clusters$call[row] = tryCall
clusters$pCall[row] = iscnv$pCall
}
}
}
}
aBitWeird = clusters$sigma > 4 | clusters$clonalityError > clusters$clonality/2 | clusters$pCall < 1e-4
veryWeird = clusters$sigma > 8 | clusters$clonalityError > clusters$clonality | clusters$pCall < 1e-8
clusters$call[aBitWeird] = paste0(clusters$call[aBitWeird], '?')
clusters$call[veryWeird] = paste0(clusters$call[veryWeird], '?')
catLog('done!\n')
return(clusters)
}
#probability that a region is AB.
isAB = function(cluster, efs, sigmaCut=3) {
if ( sum(efs$cov) == 0 ) pF = 0.5
else if ( cluster$stat >= 0 ) pF = 1
else {
pF = cluster$postHet
}
pM = 2*pt(-abs(cluster$M)/(cluster$width+getSystematicVariance()), df=cluster$df) #allow systematic effects
pBoth = sapply(1:length(pF), function(row) fisherTest(c(pF[row], pM[row]))[2])
sigma = abs(qnorm(pBoth/2, 0, 1))
return(list(call=sigma < sigmaCut, sigma = sigma, clonalityError=0))
}
#the considered calls in the algorithm. (CL is complete loss, ie loss of both alleles.)
allCalls = function() {
return(c('AB', 'A', 'AA', 'AAA', 'AAAA', 'AAAAA', 'AAB', 'AAAB', 'AAAAB', 'AAAAAB', 'AAAAAAB',
paste0(c(rep('A',9),'B'),collapse=''),
paste0(c(rep('A',13),'B'),collapse=''),
paste0(c(rep('A',19),'B'),collapse=''),
paste0(c(rep('A',27),'B'),collapse=''),
paste0(c(rep('A',39),'B'),collapse=''),
paste0(c(rep('A',56),'B'),collapse=''),
paste0(c(rep('A',79),'B'),collapse=''),
'AABB', 'AAABB', 'AAAABB', 'AAAAABB',
paste0(c(rep('A',8),'BB'),collapse=''),
paste0(c(rep('A',12),'BB'),collapse=''),
paste0(c(rep('A',18),'BB'),collapse=''),
paste0(c(rep('A',25),'BBB'),collapse=''),
paste0(c(rep('A',36),'BBBB'),collapse=''),
paste0(c(rep('A',51),'BBBBBB'),collapse=''),
paste0(c(rep('A',72),'BBBBBBBB'),collapse=''),
'CL'))
}
#returns the prior of a call.
callPrior = function(call) {
priors = c('AB'=10, 'A'=1, 'AA'=1/2, 'AAA'=1/3, 'AAAA'=1/4, 'AAAAA'=1/5, 'AAB'=1, 'AAAB'=1/2, 'AAAAB'=1/3, 'AAAAAB'=1/4, 'AAAAAAB'=1/5,
'a9b'=1/10,
'a13b'=1/12,
'a19b'=1/14,
'a27b'=1/16,
'a39b'=1/18,
'a56b'=1/20,
'a79b'=1/22,
'AABB'=1/2, 'AAABB'=1/4, 'AAAABB'=1/5, 'AAAAABB'=1/6,
'a8bb'=1/11,
'a12bb'=1/13,
'a18bb'=1/16,
'a25bbb'=1/19,
'a38bbbb'=1/22,
'a51bbbbbb'=1/25,
'a78bbbbbbbb'=1/28,
'CL'=1/2)
names(priors) = allCalls()
priors = priors/max(priors)
if ( call %in% names(priors) ) return(priors[call])
else return(0.1)
}
#helper function that returns the expected coverage and frequency of a CNV call.
callTofM = function(call) {
nA = nchar(call) - nchar(gsub('A', '', call))
nB = nchar(call) - nchar(gsub('B', '', call))
f = min(nA, nB)/(nA+nB)
if ( call %in% c('CL', '') ) f = 0.5 #if complete loss, assume not completely clonal and a normal AB background.
return(c(f=f, M=log2((nA+nB)/2)))
}
#estimates how likely a certain call is for a given region.
isCNV = function(cluster, efs, M, f, prior, sigmaCut=3) {
if ( cluster$cov > 0 & nrow(efs) == 0 ) {
catLog("WARNING, non 0 coverage, but no SNPs in cluster.\n")
catLog(unlist(cluster), '\n')
warning("non 0 coverage, but no SNPs in cluster.")
}
#set an estimate of the error on the measured MAF in the cluster
ferr = if ( cluster$cov == 0 ) Inf else cluster$ferr
#add the systematic variance to the setting, to not overestimate confidence in coverage
mWidth = sqrt(cluster$width^2 + getSystematicVariance()^2)
#The MAF of the cluster
if ( cluster$cov == 0 ) cf = 0 else cf = pmax(0.001, cluster$f)
#If no basis to determine clonality, the no point in continuing
if ( (f == 0.5 | cluster$cov == 0) & M == 0 )
return(list(call=F, clonality=0, sigma = Inf, clonalityError=Inf, pCall=1))
#estimate clonality from frequency and propagate uncertainty
freqClonality = (0.5-cf)/(0.5-cf+2^M*(cf-f))
clonalityErrorF = ferr*(freqClonality*(2^M-1)+1)/(0.5-cf+2^M*(cf-f))
#clonality as function of BAF can be non-linear at the scale of BAF error.
#do empirical error check by going ferr in either direction and take max.
cfp = cf+ferr
cfm = cf-ferr
empClonalityErrorF =
max(abs((0.5-cfp)/(0.5-cfp+2^M*(cfp-f))-freqClonality),
abs((0.5-cfm)/(0.5-cfm+2^M*(cfm-f))-freqClonality))
if ( !is.na(empClonalityErrorF) )
clonalityErrorF = max(clonalityErrorF, empClonalityErrorF)
else
clonalityErrorF = Inf
if ( f == 0.5 | cluster$cov == 0 ) {
freqClonality = 1
clonalityErrorF = Inf
}
#set the weight associated with the MAF
if ( f == 0.5 | cluster$cov == 0 ) fweight = 0
else fweight = 1/clonalityErrorF^2
#estimate clonality from coverage and propagate uncertainty
covClonality = (2^cluster$M - 1)/(2^M - 1)
clonalityErrorM = 2^cluster$M*log(2)*mWidth/abs(2^M-1)
if ( M == 0 ) {
covClonality = 1
clonalityErrorM = Inf
}
if ( !is.na(clonalityErrorM) ) Mweight = 1/clonalityErrorM^2
else Mweight = 0
#common estimate of cloanlity based on MAF and coverage, propagate uncertainty
clonality = (freqClonality*fweight + covClonality*Mweight)/(fweight+Mweight)
clonality = max(0, min(1, clonality))
clonalityError = 1/sqrt(1/clonalityErrorF^2 + 1/clonalityErrorM^2)
#if the MAF and coverage estimates are not within errors, increase the uncertainty
#to cover both estimates.
clonalityDifference =
if ( fweight > 0 & Mweight > 0 ) noneg(abs(freqClonality-covClonality) - sqrt(clonalityErrorF^2 + clonalityErrorM^2))
else 0
if ( fweight == 0 ) { #if information from only one source, add some extra uncertainty to the clonality
clonalityErrorF = 0.3
fweight = Mweight/10
}
if ( Mweight == 0 ) {
clonalityErrorM = 0.3
Mweight = fweight/10
}
clonalityError = sqrt(clonalityError^2 + clonalityDifference^2)
#the expected MAF and LFC for the estimated clonality
fClone = (clonality*(2*f*2^M-1) + 1)/(clonality*(2*2^M - 2) + 2)
MClone = log2(1 + (2^M-1)*clonality)
#calculate likelihood for these f and M
if ( cluster$cov == 0 ) pCall = exp(-1)
else pCall = min(p.adjust(superFreq:::pBinom(efs$cov, efs$var, superFreq:::refBias(fClone)), method='fdr'))
pM = 2*pt(-noneg(abs(cluster$M - MClone))/(cluster$width+superFreq:::getSystematicVariance()), df=cluster$df) #allow systematic error
#likelihood for both MAF and LFC fitting the called clonality
pBoth = superFreq:::fisherTest(c(pCall + 1e-10, pM))[2]
if ( cluster$cov == 0 ) pBoth = pM
#add prior for this call to get posterior
pBoth = prior*pBoth/(prior*pBoth + (1-prior)*(1-pBoth))
#convert posterior to a sigma statistic
sigma = abs(qnorm(pBoth/2, 0, 1))
#decide whether the data fit well enough to make a call
call = sigma < sigmaCut & clonality > 3*clonalityError
if ( is.na(call) ) call = F
if ( is.na(sigma) ) warning("NA sigma from cluster: ", cluster, ' tested for call with M=', M, ", f=", f)
return(list(call=call, clonality=clonality, sigma = sigma, clonalityError=clonalityError, pCall=pCall))
}
#groups up CNV regions with similar clonalities to estimate the subclonal structure of the sample
findSubclones = function(cR) {
catLog('Finding subclones..')
clones = order(-cR$clonality)
clones = clones[cR$clonality[clones] < 1]
regions = as.list(clones)
clonality = cR$clonality[clones]
error = cR$clonalityError[clones]
while( length(clonality) > 1 ) {
pairScore = sameClone(clonality, error)
best = which(pairScore == max(pairScore))[1]
if ( pairScore[best] < 0.05 ) break
clonality[best] = sum((clonality/error^2)[best:(best+1)])/sum(1/error[best:(best+1)]^2)
error[best] = 1/sqrt(sum(1/error[best:(best+1)]^2))
regions[[best]] = unlist(c(regions[best], regions[best+1]))
regions = regions[-(best+1)]
clonality = clonality[-(best+1)]
error = error[-(best+1)]
}
subclonality = rep(1, nrow(cR))
subclonalityError = rep(0, nrow(cR))
if ( length(regions) > 0 ) {
for ( i in 1:length(regions) ) {
subclonality[regions[[i]]] = clonality[i]
subclonalityError[regions[[i]]] = error[i]
}
}
cR = cbind(cR, subclonality, subclonalityError)
catLog('done!\n')
return(cR)
}
#helper function that subsets fit objects properly, with all the extra columns.
subsetFit = function(fit, rows=NA, cols=NA) {
if ( is.na(cols)[1] ) cols = 1:ncol(fit)
if ( is.na(rows)[1] ) rows = 1:nrow(fit)
fit = fit[rows,cols]
if ('best.guess' %in% names(fit) ) fit$best.guess = fit$best.guess[rows,cols, drop=F]
if ('posterior' %in% names(fit) ) fit$posterior = lapply(fit$posterior[cols], function(post) post[rows,,drop=F])
if ('prior' %in% names(fit) ) fit$prior[cols]
if ('XRank' %in% names(fit) ) fit$XRank = fit$XRank[rows, cols, drop=F]
if ('postWidth' %in% names(fit) ) fit$postWidth = fit$postWidth[rows, cols, drop=F]
if ('x' %in% names(fit) ) fit$x = fit$x[rows]
if ('x1' %in% names(fit) ) fit$x1 = fit$x1[rows]
if ('x2' %in% names(fit) ) fit$x2 = fit$x2[rows]
if ('chr' %in% names(fit) ) fit$chr = fit$chr[rows]
if ('longNames' %in% names(fit) ) fit$longNames = fit$longNames[rows]
if ('sex' %in% names(fit) ) fit$sex = fit$sex[cols]
return(fit)
}
getSystematicVariance = function() {
if ( !exists('.systematicVariance') ) {
assign('.systematicVariance', 0, envir = .GlobalEnv)
warning('.systematicVariance not previouly defined. Setting to 0.\n')
}
return(get('.systematicVariance', envir = .GlobalEnv))
}
#the posterior probability that two capture regions belong the same CNVinterval
sameCNV = function(cR) {
if ( nrow(cR)[1] < 2 ) return(1)
#find the prior from the gap lengths
first = 1:(nrow(cR)-1)
second = 2:nrow(cR)
#this is prior from distance between segments. Overly conservative for large segments.
#dx = noneg(cR$x1[second] - cR$x2[first]) + 10000
#this is distance between center of segments. Might be too liberal for small-to-large segments
#also wont have as much power to favour large gaps in genes like centromeres.
#dx = noneg((cR$x1[second]+cR$x2[second])/2 - (cR$x1[first]+cR$x2[first])/2) + 10000
#this is a measure of the resolution of the breakpoint.
#smallest distance of the edge of one segment to the center of the other.
#should give a low prior to a small segment on either side, large prior to two large segments
#and should (somewhat) favour centromeres as breakpoints
dx = pmin(cR$x1[second] - (cR$x1[first]+cR$x2[first])/2,
(cR$x1[second]+cR$x2[second])/2 - cR$x2[first]) + 10000
prior = exp(-dx*CNVregionsPerBP())
#find the probabilities of getting measure values if the SNP frequencies are equal
#a fisher test would be more appropriate, but can be computationally slow. These two binomals give very similar results.
f = (cR$var[first] + cR$var[second])/(cR$cov[first] + cR$cov[second])
p1 = pBinom(cR$cov[first], cR$var[first], f)
p2 = pBinom(cR$cov[second], cR$var[second], f)
simesFP = sapply(first, function(i) min(p.adjust(c(p1[i], p2[i]), method='fdr')))
#get posterior using the dx-based prior above. alternative hypothesis is a flat distribution on the frequency.
#include the possibility of both being 50% hets, in which case a different variance (coverage) between the regions
#can cause different mean MAF. We catch that case by grouping regions that are consistent with 50%, independently of MAF.
if ( 'postHet' %in% names(cR) ) pBoth50 = sapply(first, function(i) min(p.adjust(c(cR$postHet[i], cR$postHet[i+1]), method='fdr')))
else pBoth50 = sapply(first, function(i) min(p.adjust(c(cR$pHet[i], cR$pHet[i+1]), method='fdr')))
Nsnps = pmin(cR$Nsnps[first], cR$Nsnps[second])
pF = pmax(simesFP, pBoth50, 0.01^Nsnps)
#find the probability densities (P*dM) of getting measure values if the regions have the same fold change
width = cR$width
#meanM = (cR$M[first]/width[first]^2 + cR$M[second]/width[second]^2)/(1/width[first]^2 + 1/width[second]^2)
#MP1 = pt(-abs(cR$M[first] - meanM)/width[first], df = cR$df[first])
#MP2 = pt(-abs(cR$M[second] - meanM)/width[second], df = cR$df[second])
#pM = sapply(first, function(i) min(p.adjust(c(MP1[i], MP2[i]), method='fdr')))
pM = pt(-abs(cR$M[first] - cR$M[second])/sqrt(width[first]^2 + width[second]^2), df = cR$df[first])
#pMandF = sapply(first, function(i) min(p.adjust(c(pF[i], pM[i]), method='fdr')))
pMandF = sapply(first, function(i) fisherTest(c(pF[i], pM[i]))['pVal'])
#get posteriors, and take average
post = prior*pMandF/(prior*pMandF + (1-prior))
names(post) = cR$x1[first]
return(post)
}
#prior of density of CNV region breakpoints. This corresponds to around 30 breakpoints in a sample.
CNVregionsPerBP = function() {return(1/1e8)}
#helper function doing the stouffer Test
stoufferTest = function(p, w) {
p = pmin(0.99999, pmax(1e-20, p))
if (missing(w)) {
w = rep(1, length(p))/length(p)
} else {
if (length(w) != length(p))
stop("Length of p and w must equal!")
}
Zi = ifelse(p > 0.5, qnorm(1-p), -qnorm(p))
Z = sum(w*Zi)/sqrt(sum(w^2))
p.val = 1-pnorm(Z)
return(c(Z = Z, p.value = p.val))
}
#helper function calculating the posterior of clonalities being the same
sameClone = function(clonality, error, prior = 0.99) {
first = 1:(length(clonality)-1)
second = 2:length(clonality)
totalError = sqrt(error[first]^2 + error[second]^2)
sigma = abs(clonality[first]-clonality[second])/totalError
pval = 2*pnorm(-sigma, mean=0, sd=1)
#alternative hypothesis is a flat distribution on difference in clonality from -0.5 to 0.5.
post = pval*prior/(pval*prior + totalError*(1-prior))
return(post)
}
#helper function that use neighbours to study the variance estimate
#will add a constant to width to correct for underrestimated variance
boostCRwidth = function(CR, plot=F) {
CR = CR[order(CR$x1),]
x = (CR$x1+CR$x2)/2
x = (-1000:1000)/100
y1 = rt(100000, CR$df[1])
y2 = rt(100000, CR$df[1])
the = median(abs(y1-y2))
last = nrow(CR)
changeBoost = function(boost) median(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+boost)^2+(CR$width[-last]+boost)^2)))
testRange = (0:1000)/1000
changes = sapply(testRange, changeBoost)
widthBoost = testRange[max(c(1, which(changes > the)))]
if ( widthBoost > 0 ) catLog('Boosted biological variance of LFC by ', widthBoost, '.\n', sep='')
else catLog('No boost needed for biological variance.\n')
if ( plot ) {
maxDev = max(abs(y1-y2), abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)),
abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)))
breaks = (0:500)/500*maxDev
xmax = quantile(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), probs=0.99)
h0 = hist(abs(y1-y2), plot=T, breaks=breaks, col='grey', xlab='difference/sqrt(width1^2 + width2^2)',
ylab='#regions', main='LFC difference of neighbouring regions', freq=F, xlim=c(0, xmax))
h1 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), plot=F, breaks=breaks)
h2 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)), plot=F, breaks=breaks)
lines(h1$mids, h1$density, col=mcri('blue'), lwd=5)
lines(h2$mids, h2$density, col=mcri('red'), lwd=3)
legend('topright', c('expected', 'no boost', 'boosted variance'), lwd=c(10, 5, 3),
col=mcri(c('grey', 'blue', 'red')))
ymax = max(changes, the)
plot(testRange, changes, type='l', lwd=3, col=mcri('blue'), xlab='width boost', ylab='median difference',
main='boost optimisation curve', ylim=c(0, ymax), xlim=c(0, max(min(1, 1.5*widthBoost), 0.2)))
segments(widthBoost, 0, widthBoost, 2*xmax, lwd=3, col=mcri('red'))
segments(-1, the, 1, the, lwd=3, col=mcri('grey'))
legend('topright', c('median difference', 'expected median', 'selected boost'), lwd=c(3,3,3),
col=mcri(c('blue', 'grey', 'red')), bg='white')
}
CR$width = CR$width + widthBoost
return(CR)
}
#helper function that corrects variance in RNA-Seq mode.
#identifies housekeeping genes with high (but not too high) expression
#and nto too high variance in reference normals. Increases variance of other genes.
#then runs the same variance boost alrogithm based on neighbours, but using 90% quantile instead of median to tune.
getCorrectedCR = function(CR, fit, plot=F) {
houseExpression = quantile(fit$Amean, probs=c(0.65, 0.98)) #housekeeping = 65%-98% expression quantile
maxPenalty = 1
distance = pmax(noneg(fit$Amean-houseExpression[2]), noneg(houseExpression[1]-fit$Amean))
expressionPenalty = pmin(maxPenalty,2*distance/(houseExpression[2]-houseExpression[1]))
correctedWidth = CR$width + expressionPenalty
houseVariance = quantile(correctedWidth[expressionPenalty==0], probs=c(0.5, 0.95)) #penalty for top 50% most variable housekeeping genes.
maxPenalty = 1
distance = noneg((correctedWidth-houseVariance[1]))
variancePenalty = pmin(maxPenalty,maxPenalty*distance/(houseVariance[2]-houseVariance[1]))
correctedWidth = correctedWidth + variancePenalty
housekeeping = expressionPenalty + variancePenalty == 0
boost = boostCRwidthRNA(CR[housekeeping,], plot=plot, quant=0.9)
ret = CR
ret$width = correctedWidth + boost
return(ret)
}
boostCRwidthRNA = function(CR, plot=F, quant=0.5) {
CR = CR[order(CR$x1),]
x = (CR$x1+CR$x2)/2
x = (-1000:1000)/100
y1 = rt(100000, CR$df[1])
y2 = rt(100000, CR$df[1])
the = quantile(abs(y1-y2), probs=quant)
last = nrow(CR)
changeBoost = function(boost) quantile(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+boost)^2+(CR$width[-last]+boost)^2)), probs=quant)
testRange = (0:1000)/1000
changes = sapply(testRange, changeBoost)
widthBoost = testRange[max(c(1, which(changes > the)))]
if ( widthBoost > 0 ) catLog('Boosted biological variance of LFC by ', widthBoost, '.\n', sep='')
else catLog('No boost needed for biological variance.\n')
if ( plot ) {
maxDev = max(abs(y1-y2), abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)),
abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)))
breaks = (0:500)/500*maxDev
xmax = quantile(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), probs=0.99)
h0 = hist(abs(y1-y2), plot=T, breaks=breaks, col='grey', xlab='difference/sqrt(width1^2 + width2^2)',
ylab='#regions', main='LFC difference of neighbouring regions', freq=F, xlim=c(0, xmax))
h1 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1])^2+(CR$width[-last])^2)), plot=F, breaks=breaks)
h2 = hist(abs((CR$M[-1]-CR$M[-last])/sqrt((CR$width[-1]+widthBoost)^2+(CR$width[-last]+widthBoost)^2)), plot=F, breaks=breaks)
lines(h1$mids, h1$density, col=mcri('blue'), lwd=5)
lines(h2$mids, h2$density, col=mcri('red'), lwd=3)
legend('topright', c('expected', 'no boost', 'boosted variance'), lwd=c(10, 5, 3),
col=mcri(c('grey', 'blue', 'red')))
ymax = max(changes, the)
plot(testRange, changes, type='l', lwd=3, col=mcri('blue'), xlab='width boost', ylab='median difference',
main='boost optimisation curve', ylim=c(0, ymax), xlim=c(0, max(min(1, 1.5*widthBoost), 0.2)))
segments(widthBoost, 0, widthBoost, 2*xmax, lwd=3, col=mcri('red'))
segments(-1, the, 1, the, lwd=3, col=mcri('grey'))
legend('topright', c('median difference', 'expected median', 'selected boost'), lwd=c(3,3,3),
col=mcri(c('blue', 'grey', 'red')), bg='white')
}
#CR$width = CR$width + widthBoost
return(widthBoost)
}
#plotting function for diagnostic plot of calls. Good for spotting failed average CNV, or dubious calls in general.
plotMAFLFC = function(clusters, xlim=c(-1.2, 1.2)) {
plot(0, type='n', xlim=xlim, ylim=c(0, 0.5), xlab='LFC', ylab='MAF')
clonalities = (0:500)/500
markClonalities = c(0.25, 0.5, 0.75)
for ( call in c('AB', 'CL', 'A', 'AA', 'AAA', 'AAAA', 'AAAAA', 'AAB', 'AAAB', 'AAAAB', 'AABB', 'AAABB') ) {
fMs = callCloneTofM(call, clonalities)
points(fMs[,2], fMs[,1], pch=16, cex=2*clonalities, col=callsToColMaypole(call))
markFMs = callCloneTofM(call, markClonalities)
if ( call != 'AB' ) points(markFMs[,2], markFMs[,1], pch=4, cex=4*markClonalities,, lwd=2, col=callsToColMaypole(call))
text(fMs[round(length(clonalities)*0.6),2]+0.05, fMs[round(length(clonalities)*0.6),1]+0.01, call, col=callsToColMaypole(call))
}
w = sqrt(clusters$width^2 + getSystematicVariance()^2)
f = clusters$f
ferr = ifelse(clusters$cov == 0, 0.25, clusters$ferr)
f = ifelse(clusters$cov == 0, 0.25, f)
if ( !('call' %in% names(clusters)) ) clusters$call = rep('', nrow(clusters))
points(clusters$M, f, pch=16, cex = pmin(1.5,pmax(0.2, sqrt((0.1/(w/2))^2 + (0.1/(ferr/0.5))^2))), col=callsToColMaypole(clusters$call))
segments(clusters$M+w, f, clusters$M-w, f,
lwd = pmin(1.5,pmax(0.2, 0.1/(w/2))), col=callsToColMaypole(clusters$call))
segments(clusters$M, f+ferr, clusters$M, f-ferr,
lwd = pmin(1.5,pmax(0.2, 0.1/(ferr/0.5))), col=callsToColMaypole(clusters$call))
legend('topright', paste0('clonality=',markClonalities), pch=4, pt.cex=4*markClonalities, pt.lwd=2)
}
#helper function
callCloneTofM = function(call, clonality) {
fM = callTofM(call)
M = log2(2^fM['M']*clonality + 1 - clonality)
f = (fM['f']*2^fM['M']*clonality + (1-clonality)/2)/(2^fM['M']*clonality + 1 - clonality)
return(cbind(f, M))
}
#helper function
callsToColMaypole = function(calls) {
return(sapply(calls, callToColMaypole))
}
callToColMaypole = function(call) {
alpha = if ( grepl('\\?', call) ) 0.3 else 1
call = gsub('\\?', '', call)
if ( call == 'AB' ) return(mcri('black', alpha))
if ( call == 'A' ) return(mcri('red', alpha))
if ( call == 'AAB' ) return(mcri('cyan', alpha))
if ( call == 'AAAB' ) return(mcri('orange', alpha))
if ( call == 'AAAAB' ) return(mcri('green', alpha))
if ( call == 'AAAAAB' ) return(mcri('darkred', alpha))
if ( call == 'AAAAAAB' ) return(mcri('darkred', alpha))
if ( call == 'AA' ) return(mcri('green', alpha))
if ( call == 'CL' ) return(mcri('orange', alpha))
if ( call == 'AABB' ) return(mcri('purple', alpha))
if ( call == 'AAA' ) return(mcri('blue', alpha))
if ( call == 'AAAA' ) return(mcri('grey', alpha))
if ( call == 'AAAAA' ) return(mcri('darkred', alpha))
if ( call == 'AAABB' ) return(mcri('darkred', alpha))
if ( nchar(call) > 5 ) return(mcri('darkred', alpha))
return(mcri('black', alpha))
}
shortenCalls = function(calls) {
calls = gsub(' AAAAB', ' 4AB', calls)
calls = gsub(' AAAAAB', ' 5AB', calls)
calls = gsub(' AAAAAAB', ' 6AB', calls)
calls = gsub(' AAAAAAAAAB', ' 9AB', calls)
calls = gsub(' AAAAAAAAAAAAAB', ' 13AB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAB', ' 19AB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAB', ' 27AB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB', ' 39AB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB', ' 56AB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB', ' 79AB', calls)
calls = gsub(' AAAAAAAABB', ' 8ABB', calls)
calls = gsub(' AAAAAAAAAAAABB', ' 12ABB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAABB', ' 18ABB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAABBB', ' 25ABBB', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBB', ' 36A4B', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBB', ' 51A6B', calls)
calls = gsub(' AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABBBBBBBB', ' 72A8B', calls)
return(calls)
}
#function that checks for large regions with fewer than expected het SNPs
#This can be a sign of clonal LOH that is hard to find without matched normals
checkForMissedLOH = function(cancerVariants, genome, binsize=3e6, cpus=1) {
#count number of het and hom variants, binned across the genome
hr = getHetRatio(cancerVariants, genome=genome, binsize=binsize, plot=F)
hr = identifyTooLowHetRatio(hr, genome=genome, binsize=binsize)
protectedHets = selectedProtectedHets(cancerVariants, hr, genome)
#check global rate of het vs hom SNPs, if not enough, don't even bother trying
#cluster by chromosome
#check for large enough regions, larger than 10MBs seems reasonable
#also require enough covered area in the region (sum x2-x1)
#test if the fraction of het/hom SNPs is way below expected
#also want the total number of hets to be below expected
#for those regions, switch some of the hom variants to het status to closer match expected fraction.
return(protectedHets)
}
#this function take a fit object across genes, and corrects fit$coefficients for regional GC content.
#correction is done with a loess, and the function returns the fit object with corrected coefficients.
regionalGCcorrect = function(fit, captureRegions, genome, plotDirectory, regionRange=1e6, plot=T) {
catLog('Calculating regional binding strength...')
chrs = as.character(seqnames(captureRegions))
mids = (start(captureRegions)+end(captureRegions))/2
ws = end(captureRegions)-start(captureRegions)
dns = captureRegions$dn
#-1 dns are regions with lots on N bases. set weight to essentially zero and token dn.
ws[dns < 0] = 1
dns[dns < 0] = mean(dns)
regionalGC = lapply(unique(fit$chr), function(chr) {
capturePos = mids[chrs==chr]
captureWs = ws[chrs==chr]
captureDns = dns[chrs==chr]
ret = sapply(xToPos(fit$x[fit$chr==chr], genome=genome), function(mid) {
hits = abs(capturePos-mid) < regionRange
return(sum(captureDns[hits]*captureWs[hits])/sum(captureWs[hits]))
})
return(ret)
})
regionalGC = unlist(regionalGC)
fit$regionalGC = regionalGC
#plot regional GC
regionalPlotDir = paste0(plotDirectory, '/diagnostics/regionalBS')
superFreq:::ensureDirectoryExists(regionalPlotDir)
plotFile = paste0(regionalPlotDir, '/regionalBS.pdf')
pdf(plotFile, width=20, height=10)
ylim = c(min(fit$regionalGC),max(fit$regionalGC))
plotColourScatter(fit$x, fit$regionalGC, ylim=ylim,
xlab='genomic position', ylab='regional binding strength', main='genomewide')
superFreq:::addChromosomeLines(ylim=ylim, col=mcri('green'), genome=genome)
for ( chr in names(chrLengths(genome)) ) {
use = fit$chr == chr
if ( !any(use) ) next
plotColourScatter(xToPos(fit$x[use], genome), fit$regionalGC[use], ylim=ylim, cex=1.5,
xlab='genomic position', ylab='regional binding strength', main=paste0('chr', chr))
}
dev.off()
catLog('done.\nNow correct LFC based on regional binding strength by sample: ', sep='')
plotFile = paste0(regionalPlotDir, '/regionalBScorrection.pdf')
pdf(plotFile, width=20, height=10)
for ( sampleName in colnames(fit$coefficients) ) {
w = fit$coefficients[,sampleName]/fit$t[,sampleName]
plotColourScatter(fit$regionalGC, fit$coefficients[,sampleName], cex=pmin(2,0.3/w), ylim=c(-5,5), xlab='regional binding strength', ylab='LFC', main=sampleName)
lo = loess(cov~dn, data=data.frame(cov=fit$coefficients[,sampleName], dn=fit$regionalGC),weights=1/w^2, span=0.3, degrees=1, trace.hat='approximate', family='symmetric')
lines((500:1500)/100, predict(lo, (500:1500)/100), lwd=5, col=mcri('orange'))
corrections = predict(lo, fit$regionalGC)
fit$coefficients[,sampleName] = fit$coefficients[,sampleName] - corrections
fit$t[,sampleName] = fit$coefficients[,sampleName]/w
catLog(sampleName, ' (max correction ', signif(max(abs(corrections-mean(corrections))), 2), '), ', sep='')
}
dev.off()
catLog('done.\n')
return(fit)
}
selectedProtectedHets = function(q, hr, genome, binsize=3e6) {
hr$addHets = round(hr$addHets)
if ( sum(hr$addHets) == 0 ) return(c())
qBAF = extractqBAF(q)
isHom = qBAF$var > 0.95*qBAF$cov
breaks = unique(sort(c(0:ceiling(sum(chrLengths(genome))/binsize), cumsum(chrLengths(genome))/binsize)))
changeToHetList = lapply(which(hr$addHets > 0), function(bin) {
xmin = breaks[bin]*binsize
xmax = breaks[bin+1]*binsize
homs = qBAF$x > xmin & qBAF$x < xmax & isHom
qHom = qBAF[homs,]
#from the available homozygous variants, pick lowest VAF, and close to median coverage
changeToHet = rownames(qHom)[order(qHom$var/qHom$cov,
abs(qHom$cov-median(qBAF$cov)))][1:hr$addHets[bin]]
return(changeToHet)
})
protectedHets = unlist(changeToHetList)
return(protectedHets)
}
getHetRatio = function(q, genome, binsize=3e6, plot=F) {
qBAF = extractqBAF(q)
isHom = qBAF$var > 0.95*qBAF$cov
breaks = unique(sort(c(0:ceiling(sum(chrLengths(genome))/binsize), cumsum(chrLengths(genome))/binsize)))
homs = hist(qBAF$x[isHom]/binsize, breaks=breaks,plot=F)
hets = hist(qBAF$x[!isHom]/binsize, breaks=breaks,plot=F)
ret = data.frame('hom'=homs$counts, 'het'=hets$counts, 'mid'=homs$mids*binsize)
if ( plot ) plotHR(ret, genome=genome)
return(ret)
}
extractqBAF = function(q) {
return(q[q$flag %in% c('', 'Svr') & q$exac & q$exacFilter == 'PASS' & q$exacAF > 0.01 & q$cov > 20 & q$var > 0.05*q$cov,])
}
plotHR = function(hr, genome, trend=F, add=F, col='black', drawMean=T, binBlur=5) {
meanHR = sum(hr$het)/sum(hr$hom+hr$het)
xmax = sum(chrLengths(genome))
if ( !add ) {
plot(0, type='n', ylim=c(0,1), xlim=c(0, xmax), xlab='genomic position', ylab='fraction SNPs het')
superFreq:::addChromosomeLines(genome=genome, col=mcri('green'))
}
if ( drawMean ) segments(0, meanHR, xmax, meanHR, lwd=3, col='grey')
chrL = chrLengths(genome)
if ( trend ) {
for ( chr in names(chrL) ) {
use = hr$mid < cumsum(chrL)[chr] & hr$mid > cumsum(chrL)[chr] - chrL[chr]
if ( sum(use) < 5 ) next
hom = superFreq:::multiBlur(hr$hom[use], 1, binBlur)
het = superFreq:::multiBlur(hr$het[use], 1, binBlur)
lines(hr$mid[use], het/(het+hom), lwd=1, col=mcri('orange'))
}
}
points(hr$mid, hr$het/(hr$het+hr$hom), pch=16, cex=sqrt(hr$hom/10), col=col)
}
multiBlur = function(x, range, repeats) {
if ( repeats == 0 ) return(x)
for ( i in 1:repeats )
x = blurN(x, range)
return(x)
}
blurN = function(vec, M) {
N = length(vec)
vec = c(rev(vec), vec, rev(vec))
ret = 1:N
for(i in 1:N) {
ret[i] = sum(vec[N + (i-M):(i+M)])/(2*M+1)
}
return(ret)
}
identifyTooLowHetRatio = function(hr, genome, binsize=3e6, binBlur=5) {
#genomwide rates
autosomal = !(xToChr(hr$mid, genome) %in% c('X', 'Y', 'M'))
meanHetsPerBin = binsize*sum(hr$het[autosomal])/max(hr$mid[autosomal])
meanHetFrac = sum(hr$het[autosomal])/sum((hr$hom+hr$het)[autosomal])
#if het rates to low genomewide, then we dont have power to detect regions
#where it's too low. Then just return without adjustments.
hr$addHets = 0
if ( meanHetsPerBin < 1 ) {
return(hr)
}
#find bins that are suspiciously low on hets, both in absolute rate and wrt homs.
#also needs enough homs to provide power
chrL = chrLengths(genome)
chrL = chrL[!(names(chrL) %in% c('X', 'Y', 'M'))]
for ( chr in names(chrL) ) {
use = hr$mid < cumsum(chrL)[chr] & hr$mid > cumsum(chrL)[chr] - chrL[chr]
if ( sum(use) < 5 ) next
hom = multiBlur(hr$hom[use], 1, binBlur)
het = multiBlur(hr$het[use], 1, binBlur)
correctRatioP = pbinom(round(het), round(het+hom), meanHetFrac)
correctHetCountP = ppois(het, meanHetsPerBin)
#significantly few hets
tooFewHets = correctRatioP < 0.01 & correctHetCountP < 0.01 & het < 0.3*meanHetFrac*(het+hom)
#few hets, but maybe not significantly
fewHets = het < 0.5*meanHetFrac*(het+hom) & het < meanHetsPerBin*0.5
#bins with fewHets with neighbouring bins with tooFewhets are also tooFew. Recur.
N = length(hom)
while( TRUE ) {
spread = fewHets & !tooFewHets & (tooFewHets[c(2:N,N)] | tooFewHets[c(1,1:(N-1))])
if ( !any(spread) ) break
tooFewHets[spread] = T
}
#if too few hets, request relabelling some of the homs into hets.
#relabel enough to bring it to half the global mean rate.
#should be enough power to call LOH downstream, but will limit damage of FPs.
hr$addHets[use] = tooFewHets*((hr$het+hr$hom)[use]*meanHetFrac/2 - hr$het[use])
}
return(hr)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.