# use sigprofilermatrixgenerator to classify mutations
# note that sigprofilermatrixgenerator always generates the most granular classification
# (e.g., SBS6144 for SNVs and ID415 for indels)
# mostly useful for indel classification, but also good for doing TSB on SNVs
#
# `df` must be a data.table
classify.muts <- function(df, genome.string, spectype='SNV',
sample.name='dummy', save.plot=F, auto.delete=T, verbose=FALSE)
{
if (!data.table::is.data.table(df))
stop('argument `df` must be a data.table')
if (nrow(df) == 0)
return(df)
recognized.spectypes <- c('SNV', 'ID')
if (!(spectype %in% recognized.spectypes))
stop(sprintf("unrecognized spectype '%s', currently only supporting %s",
spectype, paste('"', recognized.spectypes, '"', collapse=', ')))
require(SigProfilerMatrixGeneratorR)
spmgd <- tempfile() # For multithreaded workflows, it is CRITICAL that library(future)
# supply different random seeds to child processes.
if (file.exists(spmgd))
stop(paste('temporary directory', spmgd, 'already exists'))
dir.create(spmgd, recursive=TRUE)
# Write out the VCF
out.file <- paste0(spmgd, '/', sample.name, '.vcf')
f <- file(out.file, "w")
vcf.header <- c("##fileformat=VCFv4.0", "##source=SCAN2",
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
paste(c("#CHROM", "POS", "ID", "REF", "ALT",
"QUAL", "FILTER", "INFO", "FORMAT", sample.name), collapse = "\t"))
writeLines(vcf.header, con = f)
# SigProfilerMatrixGenerator doesn't classify duplicated mutations
# from the same sample, it throws errors instead. It also will not
# detect duplicates if they are not adjacent in the file. If the
# duplicate is not detected in this way, it causes the bug where
# the final newdf dataframe does not match the input df.
# To circumvent all these headaches: just remove duplicates up front.
s <- df
mutid <- paste(s$chr, s$pos, s$refnt, s$altnt)
dupmut <- duplicated(mutid)
if (verbose)
cat("Removing", sum(dupmut), "/", nrow(s), "duplicated mutations before annotating\n")
s <- s[!dupmut,]
# will write, eg., position=7000000 as 7e6, which will
# confuse sigprofilermatrixgenerator
old.opt <- options('scipen')$scipen
options(scipen=10000)
writeLines(paste(s$chr, s$pos, '.', s$refnt, s$altnt,
".", "PASS", ".", "GT", "0/1", sep = "\t"), con = f)
close(f)
options(scipen=old.opt)
if (verbose) {
mat <- SigProfilerMatrixGeneratorR::SigProfilerMatrixGeneratorR(sample.name, genome.string, spmgd, seqInfo=TRUE, plot=save.plot)
} else {
# Prevent sigprofilermatrixgenerator's output from being printed
# XXX: should probably do some error handling here
reticulate::py_capture_output(
mat <- SigProfilerMatrixGeneratorR::SigProfilerMatrixGeneratorR(sample.name, genome.string, spmgd, seqInfo=TRUE, plot=save.plot))
}
# Read in the types
# SigProfilerMatrixGenerator output files have chromosomes named 1..22 without
# the 'chr' prefix, even if the prefix was present in the input vcf.
chrs.to.read <- sub('chr', '', s$chr[!duplicated(s$chr)])
annot.files <- paste0(spmgd, '/output/vcf_files/', spectype, '/', chrs.to.read, "_seqinfo.txt")
if (spectype == 'ID') {
colclasses <- c(V2='character', V5='character', V6='character')
} else if (spectype == 'SNV') {
colclasses <- c(V2='character')
}
annots <- data.table::rbindlist(lapply(annot.files, function(f) {
tryCatch(x <- fread(f, header=F, stringsAsFactors=FALSE,
colClasses=colclasses),
error=function(e) NULL)
}))
# SigProfilerMatrixGenerator always strips leading 'chr' prefixes. Add
# them back if the input had them. Column 2 is chromosome (1 is sample name).
if (substr(s$chr[1], 1, 3) == 'chr')
annots[[2]] <- paste0('chr', annots[[2]])
if (spectype == 'ID') {
colnames(annots) <- c('sample', 'chr', 'pos', 'iclass', 'refnt', 'altnt', 'unknown')
newdf <- annots[df,,on=.(chr, pos, refnt, altnt)]
} else if (spectype == 'SNV') {
colnames(annots) <- c('sample', 'chr', 'pos', 'iclass', 'unknown')
newdf <- annots[df,,on=.(chr, pos)]
}
if (save.plot) {
plotfiles <- list.files(paste0(spmgd, '/output/plots/'), full.names=T)
file.copy(plotfiles, '.')
}
if (!all(df$chr == newdf$chr))
stop('df and newdf do not perfectly correspond: df$chr != newdf$chr')
if (!all(df$pos == newdf$pos))
stop('df and newdf do not perfectly correspond: df$pos != newdf$pos')
if (auto.delete)
unlink(spmgd, recursive=TRUE)
newdf$iclass
}
genome.to.spmgr.format <- c(
hs37d5='GRCh37',
hg38='GRCh38',
mm10='mm10')
# new version: just returns the vector of indel classes
# FORCES ID83 FOR NOW
# N.B.: don't provide a default genome.string; it's dangerous.
#
# `df` must be a data.table
classify.indels <- function(df, genome.string, sample.name='dummy', save.plot=F, auto.delete=T, verbose=FALSE) {
# SigProfilerMatrixGenerator returns ID415 by default, which is ID83 plus one of
# 5 transcription strand states: B, N, Q, T, U. The format of the string is, e.g.,
# U:1:Del:T:1
# Removing the first two characters "U:" leaves an ID83 type.
id415 <- classify.muts(df=df, genome.string=genome.to.spmgr.format[genome.string],
spectype='ID', sample.name=sample.name, save.plot=save.plot,
auto.delete=auto.delete, verbose=verbose)
# id83() converts strings into a factor
id83(substr(id415, 3, nchar(id415)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.