#' Annotating base substitutions generated by generate_mutations()
#'
#' @description This function generates mutations in a set of genes given the total number of mutations
#' in samples, numbers of silent mutations per gene and total numbers of silent mutations
#' per sample. Mutations are annotated, and numbers of silent and non-silent mutations
#' calculated per iteration.
#' @param variants Matrix of type character; needs to contain columns
#' \code{Chrom} for chromosomes, \code{Pos} for substitution position,
#' \code{Ref} for reference allele, \code{Mut} for alternative allele, and
#' \code{Sample} for sample names.
#' @param genome BSgenome object with relevant genome built.
#' @param txdb A [TxDb](http://127.0.0.1:11491/help/library/GenomicFeatures/html/TxDb.html) object which serves as the annotation for
#' distinguishing synonymous and non-synonymous variants.
#' @param current_source \code{seqSource} argument for [predictCoding](http://127.0.0.1:11491/help/library/VariantAnnotation/help/predictCoding)
#' annotation function. Default is "Hsapiens".
#' @details Modifies the matrix with variants and makes it suitable for annotation using [predictCoding](http://127.0.0.1:11491/help/library/VariantAnnotation/help/predictCoding)
#' function. Returns a matrix with additional column - \code{Effect} - which contains
#' the prediction (unknown, synonymous, non-synonymous, or nonsense).
#' @examples
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' genome <- BSgenome.Hsapiens.UCSC.hg19
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' variants <- as.matrix(data.frame(Sample = c('T1','T2'),
#' Chrom = c('chr2','chr2'),
#' Pos = c(1764958, 19845736),
#' Ref = c('A', 'G'),
#' Mut = c('T', 'A')))
#' annotate_simulated_variants(variants, txdb, genome)
#'
#'
#' @export
annotate_simulated_variants <- function(variants, txdb, genome, current_source = Hsapiens) {
variants <- as.matrix(variants)
rownames(variants) <- NULL
variants[,'Pos'] <- as.character(as.numeric(variants[,'Pos'])) # as.matrix does terrible things to numbers
if ('Alt' %in% colnames(variants)) {
colnames(variants)[colnames(variants)=='Alt'] <- 'Mut'
}
if (nrow(variants) > 0) {
if (!grepl('chr',variants[1,'Chrom'])) {
variants[, "Chrom"] <- paste0('chr',variants[, "Chrom"])
}
gr <- GRanges(seqnames = variants[, "Chrom"],
ranges = IRanges(start = as.numeric(variants[, "Pos"]),
end = as.numeric(variants[, "Pos"])),
seqinfo = seqinfo(genome))
if ("Strand" %in% colnames(variants)) {
strand(gr) <- variants[, "Strand"]
}
a <- VariantAnnotation::predictCoding(query = gr, subject = txdb,
seqSource = current_source,
varAllele = DNAStringSet(x = variants[, "Mut"]))
aaa <- sapply(split(as.character(a$CONSEQUENCE), start(a)), function(x) x[1])
if (!('Effect' %in% colnames(variants))) {
Effect <- rep('unknown', nrow(variants))
variants <- cbind(variants, Effect)
}
if (length(aaa) > 0)
variants[match(names(aaa), as.character(variants[, 'Pos'])), "Effect"] <- aaa
} else print("No variants to annotate!")
return(variants)
}
#' Generating base substitutions based on silent mutation rate
#'
#' @description This function generates mutations in a set of genes given the mutation probabilities
#' for a particular process, total number of mutations in samples, numbers of silent
#' mutations per gene and total numbers of silent mutations per sample. Mutations are
#' annotated, and numbers of silent and non-silent mutations calculated per iteration.
#' @param geneset A vector of characters containing a set of
#' gene names in HUGO Symbols (http://www.genenames.org/)
#' @param mutations Matrix with mutations; may be any format, but needs to contain columns
#' 'Chrom' for chromosomes, 'Pos' for substitution position,
#' 'Ref' for reference allele, 'Mut' for alternative allele, and
#' 'Sample' for sample names. If it contains 'Effect' column, the effects
#' will be taken from there, with silent mutations looking for 'silent' or
#' 'synonymous' entry, otherwise an initial annotation will be generated.
#' @param signature Numeric vector of 96 probabilities for mutations of 6 types in 16 contexts.
#' @param N Integer, number of iterations for simulations. Default is 100.
#' @param genome BSgenome object with relevant genome built.
#' @param transcripts A [TxDb](http://127.0.0.1:11491/help/library/GenomicFeatures/html/TxDb.html) object which serves as the annotation for
#' distinguishing synonymous and non-synonymous variants.
#' @param samples Names of samples to generate mutations for.
#' @param all_silent Number of silent mutations in the dataset.
#' @param silent_mutations_per_gene Single numeric value, or a named vector of numbers reflecting expected number of silent mutations per gene.
#' Same length as \code{geneset}.
#' @param total_mutations Single numeric value, or a named vector of numbers reflecting expected total number of mutations per
#' sample. Same length as \code{samples}.
#' @param targets Genomic ranges reflecting the sequencing regions.
#' @details Mutations are generated per gene according to a Poisson process with the following
#' expected value: \eqn{$\lambda = silent_{i} \frac{\sum_{k in G} silent_{G}}{\sum_{j in S} silent_{j}} \left( \frac{silent_{i} + nonsilent_{i}}{silent_{i}} \right)}.
#' The mutational process is adjusted to trinucleotide frequencies in the exons of the genes
#' under consideration.
#' @return variants A list of N matrices with variants generated for the geneset per simulation.
#' @return silent Number of silent mutations generated in the geneset per simulation.
#' @return nonsilent Number of non-silent mutations generated in the geneset per simulation.
#' @examples library(BSgenome.Hsapiens.UCSC.hg19)
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' genome <- BSgenome.Hsapiens.UCSC.hg19
#' data(sample_cancer_mutations)
#' res <- generate_mutations(geneset = c('TP53','BRAF'), mutations = mutations, signature = rep(1/96,96), N = 100, type='exome',genome=genome,trascripts=txdb)
#'
#'
#' generate_mutations()
#'
#' @export
generate_mutations <- function(geneset = NULL,
N = 100, samples = NULL,
all_silent = NULL,
silent_mutations_per_gene = NULL,
total_mutations = NULL,
signature = NULL, genome = NULL, transcripts = NULL,
targets = NULL) {
if (is.null(geneset)) {
message('Please specify gene names.')
return(NULL)
}
if (is.null(samples)) {
message('Please specify sample names.')
return(NULL)
}
if (is.null(genome)) {
message('Please specify the genome object.')
return(NULL)
}
if (is.null(transcripts)) {
message('Please specify the TxDb object.')
return(NULL)
}
if (is.null(signature)) {
message('Signature is not specified, so the overall mutation rate is considered uniform.')
}
if (is.null(silent_mutations_per_gene)) {
message('Please specify the total number of silent mutations per gene.')
return(NA)
}
if (length(silent_mutations_per_gene)==1) {
silent_mutations_per_gene <- rep(silent_mutations_per_gene, length(samples))
names(silent_mutations_per_gene) <- geneset
}
if (is.null(all_silent)) {
message('Please specify the total number of silent mutations.')
return(NA)
}
if (is.null(total_mutations)) {
message('Please specify the total number of mutations per sample.')
}
if (length(total_mutations)==1) {
total_mutations <- rep(total_mutations, length(samples))
names(total_mutations) <- samples
}
if (is.null(signature)) {
message('Signature in not specified, so the overall mutation rate is considered uniform.')
}
upload_signatures()
gene_features <- get_all_features(geneset, transcripts, genome, targets)
new_probs_long <- get_all_probabilities(geneset,
gene_features,
signature)
variants <- list()
silent <- NULL
nonsilent <- NULL
print('Starting simulation:')
for (j in 1:N) {
variants[[j]] <- matrix(0, ncol = 7, dimnames = list(NULL, c("Sample", "Chrom", "Pos", "Ref", "Mut", "Gene", "Effect")))
for (pat in samples) {
for (y in geneset) {
new_probs <- c(signature[types.full],
signature[types.full])/
tri.counts.exome[c(types, types), 1] *
tri.counts(gene_features[[y]])[new.types]
new_probs <- new_probs/sum(new_probs)
names(new_probs) <- new.types.full
local_total <- rpois(1, lambda = silent_mutations_per_gene[y] / all_silent * total_mutations[pat])
if (local_total > 0) {
nn <- sample(1:length(positions(gene_features[[y]])), local_total, prob = new_probs_long[[y]])
for (ind in nn) {
mut_to_prob <- new_probs[new.types == trinucleotides(gene_features[[y]])[ind]]
mut_to_base <- sample(substr(names(mut_to_prob), 5, 5), size = 1, prob = mut_to_prob)
variants[[j]] <- rbind(variants[[j]],
c(pat,
gen_chrom(gene_features[[y]]),
positions(gene_features[[y]])[ind],
substr(trinucleotides(gene_features[[y]])[ind], 2, 2),
mut_to_base,
y, "unknown"))
}
}
}
}
variants[[j]] <- annotate_simulated_variants(variants[[j]][-1, ],
txdb = transcripts,
genome = genome)
silent <- c(silent, sum(variants[[j]][,'Effect'] == "synonymous"))
nonsilent <- c(nonsilent, sum(!(variants[[j]][,'Effect'] %in% c("synonymous", "unknown"))))
if (j %% 10 == 0) {
print(paste0('\nProgress: ', j/N * 100, '%'))
}
}
return(list(silent = silent, nonsilent = nonsilent, variants = variants))
}
#' Upload cancer signatures and mutation types from COSMIC
#'
#' @description Uploades COSMIC mutation signature set from [COSMIC website]("http://cancer.sanger.ac.uk/cancergenome/assets/signatures_probabilities.txt"),
#' defines context and mutation types: \code{types}, \code{types.full}. Also
#' defines exome trinucleotide counts.
#'
#' upload_signatures()
#'
#' @export
upload_signatures <- function() {
# upload cancer signatures
sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
"signatures_probabilities.txt", sep = "")
cancer_signatures <<- read.table(sp_url, sep = "\t", header = TRUE)
cancer_signatures <<- cancer_signatures[order(cancer_signatures[,1]),]
types <<- as.character(cancer_signatures$Trinucleotide) # trinucleotide classes
types.full <<- as.character(cancer_signatures$Somatic.Mutation.Type) # substitution types
row.names(cancer_signatures) <<- types.full
cancer_signatures <<- as.matrix(cancer_signatures[,4:33])
new.types <<- c(types, sapply(types, RevCom))
new.types.full <<- c(types.full, sapply(types.full, RevComMutType))
tri.counts.exome <<- deconstructSigs::tri.counts.exome
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.