knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
MutSim package performs substitution simulation within the exons of the genes of interest following a given mutational process and fixed numbers of mutations per sample. Let us check if the mutations in nucleotide excision repair genes in TCGA melanoma dataset (SKCM-US projects, available from ICGC Data Portal).
First we will need the data and reference objects:
library(MutSim) library(BSgenome.Hsapiens.UCSC.hg19) library(TxDb.Hsapiens.UCSC.hg19.knownGene) targets <- import('~/Downloads/gencode.v19.basic.exome.bed') seqlevels(targets) <- paste0('chr',seqlevels(targets)) genome <- BSgenome.Hsapiens.UCSC.hg19 txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene mutations_uv <- read.delim('~/TCGA/SKCM_cavemanpindel.txt', skip = 66) uv <- read.table('~/TCGA/SKCM_mutation_matrix.dat', sep = '\t', header = T) head(mutations_uv[,c('Sample','Chrom','Pos','Ref','Alt','Filter','Type','Effect')])
Now let's define the mutational process. As we are looking and melanoma samples of sun-exposed skin, we will take the UV light associated COSMIC signature 7. We will select only samples with visible UV exposure:
upload_signatures() colnames(uv) <- types.full samples <- unique(as.character(mutations_uv$Sample)) samples <- samples[sapply(samples, function(x) cosine(cancer_signatures[,7], as.numeric(uv[x,1:96])))>0.8] samples <- samples[!is.na(samples)] signature <- cancer_signatures[,7] signature
Let's check the mutations in NER core genes (as per Knijnenburg et al. 2018). We will take all the mutations with VAF>0.05 and annotate them again.
NER.core <- c('XPC','XPA','CUL5','ERCC1','ERCC2','ERCC4','ERCC5','ERCC6') MMR.core <- c('MLH1','MLH3','MSH2','MSH3','MSH6','PMS1','PMS2','EXO1') NER.mutations <- mutations_uv[mutations_uv$PM.Tum > 0.05 & mutations_uv$Type == 'Sub' & mutations_uv$Sample %in% samples & mutations_uv$Gene %in% NER.core, c('Sample','Chrom','Pos','Ref','Alt','Gene','Effect')] MMR.mutations <- mutations_uv[mutations_uv$PM.Tum > 0.05 & mutations_uv$Type == 'Sub' & mutations_uv$Sample %in% samples & mutations_uv$Gene %in% MMR.core, c('Sample','Chrom','Pos','Ref','Alt','Gene','Effect')] NER.mutations.annot <- annotate_simulated_variants(variants = NER.mutations[,-c(7:8)], txdb = txdb, genome = genome) MMR.mutations.annot <- annotate_simulated_variants(variants = MMR.mutations[,-c(7:8)], txdb = txdb, genome = genome) table(NER.mutations.annot[,'Effect']) table(as.character(NER.mutations[,'Effect'])) table(MMR.mutations.annot[,'Effect']) table(as.character(MMR.mutations[,'Effect']))
In order to simulate the correct number of mutations, the algorithm requires also total numbers of silent and non-silent mutations per sample as well as the total number of silent mutations per gene. (This step can take a couple of minutes if you set of mutations is large...)
# Count the silent mutations all_silent <- nrow(mutations_uv[mutations_uv$Sample %in% samples & mutations_uv$PM.Tum > 0.05 & mutations_uv$Effect == 'silent',]) # Count non-silent mutations total_per_sample <- sapply(samples, function(x) nrow(mutations_uv[mutations_uv$Sample == x & mutations_uv$PM.Tum > 0.05 & mutations_uv$Effect %in% c('silent','nonsense','missense'),])) names(total_per_sample) <- samples # Count silent mutations per gene NER_silent_mutations_per_gene <- sapply(NER.core, function(x) nrow(mutations_uv[mutations_uv$Gene == x & mutations_uv$Sample %in% samples & mutations_uv$PM.Tum > 0.05 & mutations_uv$Effect == 'silent',])) names(NER_silent_mutations_per_gene) <- NER.core MMR_silent_mutations_per_gene <- sapply(MMR.core, function(x) nrow(mutations_uv[mutations_uv$Gene == x & mutations_uv$Sample %in% samples & mutations_uv$PM.Tum > 0.05 & mutations_uv$Effect == 'silent',])) names(MMR_silent_mutations_per_gene) <- MMR.core
Mutations per gene will be simulated as X ~ Poisson(\lambda), where \eq{\lambda = silent_{i} \frac{\sum_{k in G} silent_{G}}{\sum_{j in S} silent_{j}} \left( \frac{silent_{i} + nonsilent_{i}}{silent_{i}} \right)}}. Here \eq{silent_{i}} denotes the number of silent mutations in sample i, \eq{silent_{G}} - number of silent mutations per gene G in the whole dataset, \eq{non_silent_{i}} - number of non-silent mutations in sample i. This number of mutations then is sampled from the gene sequence, based on adjusted mutation probability distribution \eq{P_{G} = signature / contexts_{exome} * contexts_{G}}.
Now we can run the simulation and check, if the number of mutations in these genes observed in the dataset agrees with mutational burden of the samples:
resNER <- generate_mutations(geneset = NER.core, N = 100, samples = samples, all_silent = all_silent, silent_mutations_per_gene = NER_silent_mutations_per_gene, total_mutations = total_per_sample, genome = genome, transcripts = txdb, signature = signature, targets = targets)
resMMR <- generate_mutations(geneset = MMR.core, N = 100, samples = samples, all_silent = all_silent, silent_mutations_per_gene = MMR_silent_mutations_per_gene, total_mutations = total_per_sample, genome = genome, transcripts = txdb, signature = signature, targets = targets)
Observed values fall into the dirstibution, so it looks like all those non-silent mutations were generated by UV exposure simply by chance:
NER
library(gridExtra) library(grid) library(ggplot2) library(lattice) df <- data.frame(silent = resNER$silent, nonsilent = resNER$nonsilent) g1 <- ggplot(data = df, aes(silent)) + geom_histogram(binwidth = 1, fill = 'darkgreen', col = 'black', size = 1) + geom_vline(xintercept = sum(NER.mutations.annot[,'Effect'] == 'synonymous'), linetype = 'dashed', color = 'red', size = 1) + ggtitle(label = 'Silent mutations in NER genes') + theme_bw() + theme(panel.border = element_blank(), panel.grid = element_blank()) g2 <- ggplot(data = df, aes(nonsilent)) + geom_histogram(binwidth = 1, fill = 'darkblue', col = 'black', size = 1) + geom_vline(xintercept = sum(NER.mutations.annot[,'Effect'] %in% c('nonsynonymous','nonsense')), linetype = 'dashed', color = 'red', size = 1) + ggtitle(label = 'Non-silent mutations in NER genes') + theme_bw() + theme(panel.border = element_blank(), panel.grid = element_blank()) q <- grid.arrange(g1, g2, ncol = 2) ggsave(filename = '~/generated_vs_real_mutations_in_NER_genes_UV.pdf', plot = q, device = 'pdf',width = 10, height = 6)
MMR
df <- data.frame(silent = resMMR$silent, nonsilent = resMMR$nonsilent) g1 <- ggplot(data = df, aes(silent)) + geom_histogram(binwidth = 1, fill = 'darkgreen', col = 'black', size = 1) + geom_vline(xintercept = sum(MMR.mutations.annot[,'Effect']=='synonymous'), linetype = 'dashed', color = 'red', size = 2) + ggtitle(label = 'Silent mutations in MMR genes') + theme_bw() + theme(panel.border = element_blank(), panel.grid = element_blank()) g2 <- ggplot(data = df, aes(nonsilent)) + geom_histogram(binwidth = 1, fill = 'darkblue', col = 'black', size = 1) + geom_vline(xintercept = sum(MMR.mutations.annot[,'Effect'] %in% c('nonsynonymous','nonsense')), linetype = 'dashed', color = 'red', size = 2) + ggtitle(label = 'Non-silent mutations in MMR genes') + theme_bw() + theme(panel.border = element_blank(), panel.grid = element_blank()) q <- grid.arrange(g1, g2, ncol = 2) ggsave(filename = '~/generated_vs_real_mutations_in_MMR_genes_UV.pdf', plot = q, device = 'pdf',width = 10, height = 6)
save(resNER, resMMR, file = '~/UV_simulation_NER_MMR.RData')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.