# Functions for dealing with the CIGAR I/D and H/S filters.
#
# In general, these filters are designed to test whether the single cell
# has an excessive number of indel (I/D) or clipping (H/S) CIGAR operations
# compared to the matched bulk.
#
# N.B. mismatches, as would be caused by a somatic SNV, generate an M CIGAR
# op, so the bulk and single cell should not have different CIGAR profiles.
# However, a somatic indel *does* generate an I or D OP that would not be
# seen in the bulk. So this filter is not well-designed for indels; yet it
# is still applied.
#
# Currently, several CIGAR scoring operations cannot be applied to chunked
# objects, so we are stuck using relatively slow calculations and small
# training sets.
cigar.emp.score <- function (training, test, which = c("id", "hs"), legacy=FALSE, quiet=FALSE) {
xt <- training[[paste0(which, ".score.x")]]
yt <- training[[paste0(which, ".score.y")]]
x <- test[[paste0(which, ".score.x")]]
y <- test[[paste0(which, ".score.y")]]
dp <- test$dp.cigars
bulkdp <- test$dp.cigars.bulk
progressr::with_progress({
if (!quiet) p <- progressr::progressor(along=1:(length(x)/100))
# in legacy mode, NA and NaN values in x or y weren't caught
if (legacy) {
ret <- future.apply::future_mapply(function(dp, bulkdp, x, y, i) {
if (!quiet & i %% 100 == 1) p()
mean(xt >= x & yt >= y, na.rm = T)
}, test$dp.cigars, test$dp.cigars.bulk, x, y, 1:length(x))
} else {
ret <- future.apply::future_mapply(function(dp, bulkdp, x, y, i) {
if (!quiet & i %% 100 == 1) p()
ifelse(dp == 0 | bulkdp == 0, 0,
mean(xt >= x & yt >= y, na.rm = T))
}, test$dp.cigars, test$dp.cigars.bulk, x, y, 1:length(x))
}
}, enable=!quiet)
# future_mapply returns list() when x is length 0. make this a 0-length
# numeric so quantile() doesn't fail on it later.
if (length(ret) == 0)
return(numeric(0))
else
return(ret)
}
compute.cigar.scores <- function(cigar.data) {
cigar.data[, c('id.score.y', 'id.score.x', 'hs.score.y', 'hs.score.x') :=
list(ID.cigars / dp.cigars,
ID.cigars.bulk / dp.cigars.bulk,
HS.cigars / dp.cigars,
HS.cigars.bulk / dp.cigars.bulk)]
}
read.cigar.data <- function(path, region, quiet=FALSE) {
if (!quiet) cat('Importing CIGAR stats from', path, '\n')
col.classes <- c('character', rep('integer', 6))
read.tabix.data(path=path, region=region, quiet=quiet, colClasses=col.classes)
}
setGeneric("add.cigar.data", function(object, sc.cigars.path, bulk.cigars.path, quiet=FALSE)
standardGeneric("add.cigar.data"))
setMethod("add.cigar.data", "SCAN2", function(object, sc.cigars.path, bulk.cigars.path, quiet=FALSE) {
check.slots(object, 'gatk')
sc <- read.cigar.data(sc.cigars.path, region=object@region, quiet=quiet)
bulk <- read.cigar.data(bulk.cigars.path, region=object@region, quiet=quiet)
if (!quiet) cat('joining CIGAR data..\n')
object@gatk[sc, on=c('chr', 'pos'),
c('M.cigars', 'ID.cigars', 'HS.cigars', 'other.cigars', 'dp.cigars') :=
list(i.M.cigars, i.ID.cigars, i.HS.cigars, i.other.cigars, i.dp.cigars)]
object@gatk[bulk, on=c('chr', 'pos'),
c('M.cigars.bulk', 'ID.cigars.bulk', 'HS.cigars.bulk', 'other.cigars.bulk', 'dp.cigars.bulk') :=
list(i.M.cigars, i.ID.cigars, i.HS.cigars, i.other.cigars, i.dp.cigars)]
if (!quiet) cat('computing CIGAR op rates..\n')
compute.cigar.scores(object@gatk) # modifies by reference
object@cigar.data <- data.frame(sc.sites=nrow(sc), bulk.sites=nrow(bulk),
sc.path=sc.cigars.path, bulk.path=bulk.cigars.path)
object
})
# It would be nice to use all training sites for this; however, computing excess
# cigar scores is O(n^2), n=number of null sites.
cigar.get.null.sites <- function(object, path=NULL, legacy=TRUE, quiet=FALSE) {
if (is.null(path)) {
check.slots(object, c('gatk', 'cigar.data'))
if (legacy) {
null.sites <- object@gatk[resampled.training.site==TRUE]
} else {
null.sites <- object@gatk[training.site==TRUE]
}
compute.cigar.scores(null.sites)
} else {
# When read from disk, scores were already computed
null.sites <- data.table::fread(path)
}
null.sites
}
setGeneric("compute.excess.cigar.scores", function(object, null.path=NULL, precomputed.path=NULL, legacy=TRUE, quiet=FALSE)
standardGeneric("compute.excess.cigar.scores"))
setMethod("compute.excess.cigar.scores", "SCAN2", function(object, null.path=NULL, precomputed.path=NULL, legacy=TRUE, quiet=FALSE) {
check.slots(object, c('gatk', 'static.filter.params', 'cigar.data'))
null.sites <- cigar.get.null.sites(object, null.path, legacy, quiet)
if (!quiet) {
if (legacy) {
cat(sprintf('LEGACY: computing CIGAR op rates only at resampled training sites (n=%d)..\n',
nrow(null.sites)))
} else {
cat(sprintf('WARNING: using the full set of training het germline sites (n=%d) for CIGAR op calculations may be prohibitively slow\n',
nrow(null.sites)))
}
}
muttypes <- c('snv', 'indel')
object@excess.cigar.scores <- setNames(lapply(muttypes, function(mt) {
null.sites.mt <- null.sites[muttype == mt]
test <- object@gatk[muttype == mt]
if (is.null(precomputed.path)) {
pc <- perfcheck("excess CIGAR ops (ID)",
id.scores <- cigar.emp.score(training=null.sites.mt, test=test,
which='id', quiet=quiet, legacy=legacy),
report.mem=FALSE)
pc <- perfcheck("excess CIGAR ops (HS)",
hs.scores <- cigar.emp.score(training=null.sites.mt, test=test,
which='hs', quiet=quiet, legacy=legacy),
report.mem=FALSE)
if (!quiet) cat(pc, '\n')
} else {
tab <- read.tabix.data(path=precomputed.path, region=object@region, quiet=quiet)
tab <- tab[muttype == mt]
id.scores <- tab$id.score
hs.scores <- tab$hs.score
# Extra sanity check to ensure read-in data is parallel to the gatk table
if (nrow(tab) != nrow(test) || any(tab$chr != test$chr))
stop('excess cigar score file', path, 'does not exactly match internal table chromosomes')
if (nrow(tab) != nrow(test) || any(tab$pos != test$pos))
stop('excess cigar score file', path, 'does not exactly match internal table positions')
if (nrow(tab) != nrow(test) || any(tab$refnt != test$refnt))
stop('excess cigar score file', path, 'does not exactly match internal table refnt')
if (nrow(tab) != nrow(test) || any(tab$altnt != test$altnt))
stop('excess cigar score file', path, 'does not exactly match internal table altnt')
}
object@gatk[muttype == mt, c('id.score', 'hs.score') := list(id.scores, hs.scores)]
sfp <- object@static.filter.params[[mt]]
data.frame(sites=nrow(object@gatk[muttype==mt]),
null.sites=nrow(null.sites.mt),
# These quantile calculations are identical for each chunk (because the set of
# null sites is always the set over the whole genome). Recalculating doesn't
# waste that much time.
id.score.q=quantile(null.sites.mt$id.score, prob=sfp$cg.id.q, na.rm=TRUE),
hs.score.q=quantile(null.sites.mt$hs.score, prob=sfp$cg.hs.q, na.rm=TRUE),
legacy=legacy)
}), muttypes)
object
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.