# Wrappers and expected formats for read.table.1sample
gatk.meta.cols <- c(
chr='character',
pos='integer',
dbsnp='character',
refnt='character',
altnt='character'
)
read.gatk.1sample <- function(path, sample.id, region=NULL, quiet=FALSE) {
read.table.1sample(path=path, sample.id=sample.id, region=region,
meta.cols=gatk.meta.cols, quiet=quiet)
}
integrated.table.meta.cols <- c(
chr='character',
pos='integer',
dbsnp='character',
refnt='character',
altnt='character',
bulk.gt='character',
bref='integer',
balt='integer',
bulk.dp='integer',
bulk.af='numeric',
bulk.binom.prob='numeric',
tref='integer',
talt='integer',
tcells='integer',
muttype='character',
mutsig='character',
balt.lowmq='integer',
phased.gt='character',
nalleles='integer',
unique.donors='integer',
unique.cells='integer',
unique.bulks='integer',
max.out='integer',
sum.out='integer',
sum.bulk='integer',
somatic.candidate='logical',
training.site='logical',
resampled.training.site='logical'
)
read.integrated.table.1sample <- function(path, sample.id, region=NULL, quiet=FALSE) {
read.table.1sample(path=path, sample.id=sample.id, region=region,
meta.cols=integrated.table.meta.cols, quiet=quiet)
}
# Some extra work to make sure we only read in the part of the
# table relevant to one sample. Otherwise, memory can become
# an issue for projects with 10s-100s of cells.
#
# meta.cols - columns that should be read in addition to columns
# specific to `sample.id`.
#
# region can be a GRanges object with a single interval to read only
# a subset of the GATK table. The table is tabix indexed, so this can
# be done quickly.
read.table.1sample <- function(path, sample.id, meta.cols, region=NULL, quiet=FALSE) {
if (!quiet) cat("Importing GATK table..\n")
n.meta.cols <- length(meta.cols)
# Step 1: just get the header and detect the columns corresponding to sample.id
tf <- Rsamtools::TabixFile(path)
open(tf)
header <- read.tabix.header(tf)
close(tf)
col.strings <- strsplit(header, '\t')[[1]]
tot.cols <- length(col.strings)
sample.idx <- which(col.strings == sample.id)
if (!quiet) {
cat("Selecting columns:\n")
for (i in 1:length(col.strings)) {
if (i <= n.meta.cols) {
cat(sprintf(" (%d)", i), col.strings[i], '\n')
} else if (any(i == sample.idx + 0:2)) {
cat(sprintf(" (%d)", i), col.strings[i], '[single cell]\n')
}
}
}
# Step 2: really read the tables in, but only the relevant columns
cols.to.read <- rep("NULL", tot.cols)
cols.to.read[1:n.meta.cols] <- meta.cols
# Read 3 columns for the single cell, 3 columns for bulk
cols.to.read[sample.idx + 0:2] <- c('character', 'integer', 'integer')
gatk <- read.tabix.data(path=path, region=region, header=header, quiet=quiet, colClasses=cols.to.read)
new.sample.idx <- which(colnames(gatk) == sample.id)
colnames(gatk)[new.sample.idx+1:2] <- c('scref', 'scalt')
# Rearrange columns so that the single cell triplet is first, then bulk triplet
cols.to.keep <- col.strings[1:n.meta.cols]
cols.to.keep <- c(cols.to.keep, sample.id, c('scref', 'scalt'))
gatk <- gatk[,..cols.to.keep]
gatk
}
# annotate `gatk.meta` with 'bulk.gt', 'bulk.dp', 'bulk.af', 'bref' and 'balt' taken from `gatk`,
# which are the bulk genotype string assigned by HaplotypeCaller, bulk depth, bulk VAF, the number
# of ref supporting bulk reads and mutation supporting bulk reads.
#
# also add the total sum of alt/ref reads across all single cells in the batch.
#
# sc.samples - list of all sample IDs supplied to SCAN2 as --sc-bam arguments
#
# legacy - SCANSNV's legacy behavior was to sum alt-supporting reads from ALL BAMs
# except the named bulk sample to determine somatic candidate sites.
# This can cause problems if the user supplies additional single cells
# not for primary analysis (like large sets of low-depth cells) or
# additional bulks that are not the primary bulk comparison (i.e., bulks
# from other tissues).
#
# only `gatk.meta` is modified (by reference).
annotate.gatk.counts <- function(gatk.meta, gatk, bulk.sample, sc.samples, legacy=FALSE, quiet=FALSE) {
col.strings <- colnames(gatk)
tot.cols <- length(col.strings)
bulk.idx <- Inf
if (!missing(bulk.sample))
bulk.idx <- which(col.strings == bulk.sample)
if (!quiet) {
cat("Selecting columns:\n")
for (i in 1:length(col.strings)) {
if (any(i == bulk.idx + 0:2)) {
cat(sprintf(" (%d)", i), col.strings[i], '[bulk]\n')
}
}
}
gatk.meta[, c('bulk.gt', 'bref', 'balt') :=
list(gatk[[bulk.idx]], gatk[[bulk.idx+1]], gatk[[bulk.idx+2]])]
gatk.meta[, c('bulk.dp', 'bulk.af') := list(bref+balt, balt/(bref+balt))]
# One-sided binomial test that the site has 50% VAF in bulk. Alt hypothesis
# is that VAF < 50% (i.e., a somatic mosaic).
gatk.meta[, 'bulk.binom.prob' := pbinom(q=balt, size=bulk.dp, prob=1/2)]
refs <- c()
alts <- c()
if (legacy) {
# legacy SCANSNV behavior used every alt read count
alts <- which(colnames(gatk) == 'alt')
refs <- which(colnames(gatk) == 'ref')
} else {
#sample.col.idxs <- seq(8, ncol(gatk), 3) # all sample column IDs
# adding the bulk sample so that the `- bref` and `- balt` calculations below can
# be applied whether legacy mode is chosen or not.
sample.col.idxs <- which(colnames(gatk) %in% c(bulk.sample, sc.samples))
refs <- sample.col.idxs + 1
alts <- sample.col.idxs + 2
}
# data.table is very finicky about accessing variables in the calling scope.
# especially bad when data.tables are nested in data.table formulae.
tref.var <- rowSums(as.matrix(gatk[, ..refs])) - gatk.meta$bref
talt.var <- rowSums(as.matrix(gatk[, ..alts])) - gatk.meta$balt
just.cell.alts <- which(colnames(gatk) %in% sc.samples) + 2
# how many cells have any supporting reads for this site
tcells.var <- rowSums(as.matrix(gatk[, ..just.cell.alts]) > 0)
# Annotate the total number of single cell alt and ref reads across all SC samples.
gatk.meta[, c('tref', 'talt', 'tcells') := list(tref.var, talt.var, tcells.var)]
gatk.meta
}
# `gatk` is a data.table, so all of these updates happen by reference.
# no need to return the result.
#
# This can be slow with add.mutsig, particularly for indels. Best used
# on chunked data.
annotate.gatk <- function(gatk, genome.string, add.mutsig=TRUE) {
data.table::setkey(gatk, chr, pos, refnt, altnt)
# Determine SNV/indel status and then annotate mutation signature channels
gatk[, muttype := ifelse(nchar(refnt) == 1 & nchar(altnt) == 1, 'snv', 'indel')]
# allow for fast selection of SNVs or indels
setindex(gatk, muttype)
if (add.mutsig) {
# This is the only place in scan2 where BSgenome is necessary.
# This is important because BSgenome has several dependencies (in
# particular, rtracklayer) that uses ~350 MB of RAM over the other
# scan2 dependencies. This mem usage is multiplied when using
# library(future), plan(multicore) for parallelization so is quite
# significant.
#
# N.B. using the integrated table workflow, we should only ever
# have to annotate these values once.
require(BSgenome)
gatk[muttype == 'snv',
mutsig := get.3mer(chr=chr, pos=pos, refnt=refnt, altnt=altnt,
genome=genome.string.to.bsgenome.object(genome.string))]
chs <- classify.indels(gatk[muttype == 'indel'], genome.string=genome.string)
gatk[muttype == 'indel', mutsig := chs]
}
}
# 'gatk' is a data.table to be modified by reference
annotate.gatk.lowmq <- function(gatk, path, bulk, region, quiet=FALSE) {
lowmq <- read.gatk.1sample(path, sample.id=bulk, region=region, quiet=quiet)
data.table::setkey(lowmq, chr, pos, refnt, altnt)
gatk[lowmq, on=.(chr,pos,refnt,altnt), balt.lowmq := i.scalt]
}
# 'gatk' is a data.table to be modified by reference
#
# 'phasing.path' points to a bgzipped, tabix indexed 5 column table with format:
# chr, pos, refnt, altnt, phased GT string
# N.B., the first column name must be "chr"
#
# to allow the very simple table join to work well, we assume phasing was
# restricted to biallelic sites for this individual. while not ideal, the
# performance of such phasing has been good enough.
#
# To annotate phasing status (phased.hap1, phased.hap2) the estimated
# phase needs to be joined to a single cell table with alt and ref read
# counts. Based on phase, (alt,ref) maps to either (hap1,hap2) or (hap2,hap1).
annotate.gatk.phasing <- function(gatk, phasing.path, region, quiet=FALSE) {
phase.data <- read.tabix.data(path=phasing.path, region=region, quiet=quiet,
colClasses=list(character='chr'))
if (ncol(phase.data) != 5)
stop('expected 5 column table')
colnames(phase.data) <- c('chr', 'pos', 'refnt', 'altnt', 'phasedgt')
unrecognized <- setdiff(unique(phase.data$phasedgt), c('1|1', '1|0', '0|1', './.'))
if (length(unrecognized) > 0)
stop(paste('phasing genotypes expected to be either 1|1, 0|1, 1|0 or ./., but found', unique(unrecognized), collapse='\n'))
gatk[phase.data, on=.(chr,pos,refnt,altnt), phased.gt := i.phasedgt]
}
# if 'panel.path' is NULL, then columns with dummy counts of 0 (so as to
# not throw off filters) will be joined. this is easily discernable from
# real data since unique.donors can never be <1 for any non-ref site in
# this sample (the donor would include this one).
annotate.gatk.panel <- function(gatk, panel.path, region=NULL, quiet=FALSE) {
if (!is.null(panel.path)) {
panel <- read.tabix.data(path=panel.path, region=region, quiet=quiet,
colClasses=list(character='chr')) # force chromosome to be interpreted as a string
# update by reference
# N.B. only join by chromosome and position here. Because the panel often
# involves 50+ cells and 10+ individuals, it is common for sites that
# would be biallelic in this sample to become multiallelic when considering
# all samples.
# At multiallelic sites, the panel counts actually do not tell us which of
# the alleles is supported in the other cells/subjects, only that there are
# non-reference alleles supported in those other cells/subjects. One day
# we will hopefully avoid this by decomposing alleles via vcfallelicprimitives
# or some similar tool.
gatk[panel, on=.(chr,pos),
c('nalleles', 'unique.donors', 'unique.cells', 'unique.bulks', 'max.out', 'sum.out', 'sum.bulk') :=
list(i.nalleles, i.unique.donors, i.unique.cells, i.unique.bulks, i.max.out, i.sum.out, i.sum.bulk)]
} else {
gatk[,
c('nalleles', 'unique.donors', 'unique.cells', 'unique.bulks', 'max.out', 'sum.out', 'sum.bulk') :=
list(0, 0, 0, 0, 0, 0, 0)]
}
}
# Adds a columns `somatic.candidate` to `gatk` by reference. Loci with
# somatic.candidate=TRUE pass some of the basic requirements to be considered
# a mutation locus, meaning the site may be a candidate in at least one single
# cell.
#
# if max.bulk.af or max.bulk.binom.prob are 1, then the filter would not remove
# any sites (except perhaps NA/NaNs). When that is the case, do not change
# behavior w.r.t. requiring bulk.gt=0/0.
#
# Any filter not dependent on specific single cell info (e.g., min.bulk.dp)
# should be applied here.
annotate.gatk.candidate.loci <- function(gatk, snv.min.bulk.dp, snv.max.bulk.alt, snv.max.bulk.af, snv.max.bulk.binom.prob, indel.min.bulk.dp, indel.max.bulk.alt, indel.max.bulk.af, indel.max.bulk.binom.prob, mode=c('new', 'legacy'))
{
mode <- match.arg(mode)
# In legacy mode, bulk depth and bulk alt reads in the low MMQ GATK table
# were checked later in the pipeline. However, since these do not change
# from single cell to single cell, we prefer to handle them here.
# bulk.af was never checked because max bulk alt was always 0 in legacy.
if (mode == 'new') {
snv.allow.bulk.gt <- snv.max.bulk.alt > 0 | (snv.max.bulk.af < 1 & snv.max.bulk.af > 0) | (snv.max.bulk.binom.prob < 1 & snv.max.bulk.binom.prob > 0)
indel.allow.bulk.gt <- indel.max.bulk.alt > 0 | (indel.max.bulk.af < 1 & indel.max.bulk.af > 0) | (indel.max.bulk.binom.prob < 1 & indel.max.bulk.binom.prob > 0)
gatk[, somatic.candidate :=
((muttype == 'snv' & balt <= snv.max.bulk.alt &
bulk.dp >= snv.min.bulk.dp &
!is.na(bulk.af) & bulk.af <= snv.max.bulk.af &
!is.na(bulk.binom.prob) & bulk.binom.prob <= snv.max.bulk.binom.prob &
(is.na(balt.lowmq) | balt.lowmq <= snv.max.bulk.alt) &
# to continue old SCAN2 (intended for non-clonal calling) behavior, require
# bulk.gt==0/0 when the user doesn't allow any bulk read support.
(snv.allow.bulk.gt | bulk.gt == '0/0')
) |
(muttype == 'indel' & balt <= indel.max.bulk.alt &
bulk.dp >= indel.min.bulk.dp &
!is.na(bulk.af) & bulk.af <= indel.max.bulk.af &
!is.na(bulk.binom.prob) & bulk.binom.prob <= indel.max.bulk.binom.prob &
(is.na(balt.lowmq) | balt.lowmq <= indel.max.bulk.alt) &
(indel.allow.bulk.gt | bulk.gt == '0/0')
)) &
# N.B. talt >= 2 becomes a less useful cutoff as #cells increases..
dbsnp == '.' & talt >= 2]
} else if (mode == 'legacy') {
gatk[, somatic.candidate :=
((muttype == 'snv' & balt <= snv.max.bulk.alt) |
(muttype == 'indel' & balt <= indel.max.bulk.alt)) &
bulk.gt == '0/0' & dbsnp == '.' & talt >= 2]
} else
stop('unrecognized mode')
gatk
}
# Again, modifying 'gatk' by reference.
# Returns list of resampling auxiliary data with one entry for SNVs and one for indels
# and modifies `gatk` by reference.
# FIXME: germline indels, not germline SNVs, were used by legacy SCAN2 calling when
# downsampling. This is right and wrong in various scenarios:
# - For sensitivity estimation, this is WRONG because only hSNPs are used to
# in AB model inference. Thus, to best reflect sensitivity due to proximity
# to AB model training sites (somatic sites closer to training sites have more
# accurate AB estimates and therefore probably higher sensitivity), only hSNPs
# should be used.
# - For CIGAR op filtering, this is CORRECT because somatic indel CIGAR profiles
# should be compared to germline indel CIGAR profiles. Of course, applying certain
# CIGAR op filters (indel ops) is actually incorrect either way.
# In any case, this likely has a relatively insignificant effect so we are leaving it as
# it was in legacy calling for now.
gatk.select.and.resample.training.sites <- function(gatk, haploid.chroms, M=20, seed=0) {
ret <- list()
# First set the training sites. This does not exactly match legacy SCAN2, which
# excluded training sites based on one single cell characteristic (having phased GT!=./.)
gatk[, training.site := !is.na(phased.gt) &
# diploid chroms: 0|1 and 1|0 hets are informative, hom 1|1 are not
((!(chr %in% haploid.chroms) & (!is.na(phased.gt) & phased.gt == '0|1' | phased.gt == '1|0')) |
# haploid chroms: 0|1 and 1|0 hets are artifacts and hom 1|1 is informative
(chr %in% haploid.chroms & !is.na(phased.gt) & phased.gt == '1|1')) & bulk.gt != './.']
for (mt in c('snv', 'indel')) {
aux.data <- resample.germline(
sites=gatk[somatic.candidate == TRUE & muttype == mt],
hsnps=gatk[training.site == TRUE & muttype == mt],
M=M, seed=seed)
# aux.data$selection is aligned to the 'hsnps' table above in resample.germline()
gatk[training.site == TRUE & muttype == mt,
resampled.training.site := aux.data$selection$keep]
ret[[mt]] <- aux.data
}
gatk[is.na(resampled.training.site), resampled.training.site := FALSE]
# reorder columns so that training.site and resampled.training.site, applicable to all single
# cells in the integrated table, are in the left block of columns. the right block of
# columns is intended to only contain per-sample read count data.
#
# must use column numbers because column names are not unique: each sample has a 'ref'
# and 'alt' column.
n.meta.cols <- length(integrated.table.meta.cols)
data.table::setcolorder(gatk, neworder=c(1:(n.meta.cols-2), ncol(gatk)-1:0, (n.meta.cols-1):(ncol(gatk)-2)))
return(ret)
}
# Write, bgzip and tabix index the integrated table
# 'out.tab' is TEMPORARY. The final file is out.tab.gz.
write.integrated.table <- function(inttab, out.tab,
out.tab.gz=paste0('out.tab', '.gz'), overwrite=FALSE)
{
if (file.exists(out.tab) & !overwrite)
stop(paste('output file', out.tab, 'already exists, please delete it first'))
if (file.exists(out.tab.gz) & !overwrite)
stop(paste('output file', out.tab.gz, 'already exists, please delete it first'))
colnames(inttab)[1] <- paste0('#', colnames(inttab)[1]) # hack to comment out the header
data.table::fwrite(inttab, file=out.tab, sep='\t', na='NA', quote=FALSE)
Rsamtools::bgzip(out.tab, out.tab.gz)
# if we get here, overwrite is either TRUE or the file didn't exist,
# so there's no danger of deleting a file without warning.
#unlink(out.tab)
Rsamtools::indexTabix(file=out.tab.gz, format='vcf', comment='#')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.