R/generateMutations.R

#'  Annotating base substitutions generated by generate_mutations()
#'
#'  @description This function generates mutations in a set of genes given the total number of mutations
#'               in samples, numbers of silent mutations per gene and total numbers of silent mutations
#'               per sample. Mutations are annotated, and numbers of silent and non-silent mutations
#'               calculated per iteration.
#'  @param variants Matrix of type character; needs to contain columns
#'                 \code{Chrom} for chromosomes, \code{Pos} for substitution position,
#'                  \code{Ref} for reference allele, \code{Mut} for alternative allele, and
#'                  \code{Sample} for sample names.
#'  @param genome BSgenome object with relevant genome built.
#'  @param txdb A [TxDb](http://127.0.0.1:11491/help/library/GenomicFeatures/html/TxDb.html) object which serves as the annotation for
#'              distinguishing synonymous and non-synonymous variants.
#'  @param current_source \code{seqSource} argument for [predictCoding](http://127.0.0.1:11491/help/library/VariantAnnotation/help/predictCoding)
#'                        annotation function. Default is "Hsapiens".
#'  @details Modifies the matrix with variants and makes it suitable for annotation using [predictCoding](http://127.0.0.1:11491/help/library/VariantAnnotation/help/predictCoding)
#'           function. Returns a matrix with additional column - \code{Effect} - which contains
#'           the prediction (unknown, synonymous, non-synonymous, or nonsense).
#'  @examples
#'    library(BSgenome.Hsapiens.UCSC.hg19)
#'    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'    genome <- BSgenome.Hsapiens.UCSC.hg19
#'    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#'    variants <- as.matrix(data.frame(Sample = c('T1','T2'),
#'      Chrom = c('chr2','chr2'),
#'      Pos = c(1764958, 19845736),
#'      Ref = c('A', 'G'),
#'      Mut = c('T', 'A')))
#'    annotate_simulated_variants(variants, txdb, genome)
#'
#'
#' @export
annotate_simulated_variants <- function(variants, txdb, genome, current_source = Hsapiens) {

  variants <- as.matrix(variants)
  rownames(variants) <- NULL

  variants[,'Pos'] <- as.character(as.numeric(variants[,'Pos'])) # as.matrix does terrible things to numbers

    if ('Alt' %in% colnames(variants)) {

      colnames(variants)[colnames(variants)=='Alt'] <- 'Mut'

    }

    if (nrow(variants) > 0) {

        if (!grepl('chr',variants[1,'Chrom'])) {

          variants[, "Chrom"] <- paste0('chr',variants[, "Chrom"])

        }

        gr <- GRanges(seqnames = variants[, "Chrom"],
                      ranges = IRanges(start = as.numeric(variants[, "Pos"]),
                                       end = as.numeric(variants[, "Pos"])),
                      seqinfo = seqinfo(genome))

        if ("Strand" %in% colnames(variants)) {

          strand(gr) <- variants[, "Strand"]

        }

        a <- VariantAnnotation::predictCoding(query = gr, subject = txdb,
                           seqSource = current_source,
                           varAllele = DNAStringSet(x = variants[, "Mut"]))

        aaa <- sapply(split(as.character(a$CONSEQUENCE), start(a)), function(x) x[1])

        if (!('Effect' %in% colnames(variants))) {

          Effect <- rep('unknown', nrow(variants))
          variants <- cbind(variants, Effect)

        }

        if (length(aaa) > 0)
            variants[match(names(aaa), as.character(variants[, 'Pos'])), "Effect"] <- aaa

    } else print("No variants to annotate!")

    return(variants)
}


#'  Generating base substitutions based on silent mutation rate
#'
#' @description This function generates mutations in a set of genes given the mutation probabilities
#'              for a particular process, total number of mutations in samples, numbers of silent
#'              mutations per gene and total numbers of silent mutations per sample. Mutations are
#'              annotated, and numbers of silent and non-silent mutations calculated per iteration.
#'  @param geneset A vector of characters containing a set of
#'                 gene names in HUGO Symbols (http://www.genenames.org/)
#'  @param mutations Matrix with mutations; may be any format, but needs to contain columns
#'                   'Chrom' for chromosomes, 'Pos' for substitution position,
#'                   'Ref' for reference allele, 'Mut' for alternative allele, and
#'                   'Sample' for sample names. If it contains 'Effect' column, the effects
#'                   will be taken from there, with silent mutations looking for 'silent' or
#'                   'synonymous' entry, otherwise an initial annotation will be generated.
#'  @param signature Numeric vector of 96 probabilities for mutations of 6 types in 16 contexts.
#'  @param N Integer, number of iterations for simulations. Default is 100.
#'  @param genome BSgenome object with relevant genome built.
#'  @param transcripts A [TxDb](http://127.0.0.1:11491/help/library/GenomicFeatures/html/TxDb.html) object which serves as the annotation for
#'                     distinguishing synonymous and non-synonymous variants.
#'  @param samples Names of samples to generate mutations for.
#'  @param all_silent Number of silent mutations in the dataset.
#'  @param silent_mutations_per_gene Single numeric value, or a named vector of numbers reflecting expected number of silent mutations per gene.
#'                                   Same length as \code{geneset}.
#'  @param total_mutations Single numeric value, or a named vector of numbers reflecting expected total number of mutations per
#'                         sample. Same length as \code{samples}.
#'  @param targets Genomic ranges reflecting the sequencing regions.
#'  @details Mutations are generated per gene according to a Poisson process with the following
#'           expected value: \eqn{$\lambda = silent_{i} \frac{\sum_{k in G} silent_{G}}{\sum_{j in S} silent_{j}} \left( \frac{silent_{i} + nonsilent_{i}}{silent_{i}} \right)}.
#'           The mutational process is adjusted to trinucleotide frequencies in the exons of the genes
#'           under consideration.
#'  @return variants A list of N matrices with variants generated for the geneset per simulation.
#'  @return silent Number of silent mutations generated in the geneset per simulation.
#'  @return nonsilent Number of non-silent mutations generated in the geneset per simulation.
#'  @examples library(BSgenome.Hsapiens.UCSC.hg19)
#'            library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'            txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#'            genome <- BSgenome.Hsapiens.UCSC.hg19
#'            data(sample_cancer_mutations)
#'            res <- generate_mutations(geneset = c('TP53','BRAF'), mutations = mutations, signature = rep(1/96,96), N = 100, type='exome',genome=genome,trascripts=txdb)
#'
#'
#'  generate_mutations()
#'
#' @export
generate_mutations <- function(geneset = NULL,
                               N = 100, samples = NULL,
                               all_silent = NULL,
                               silent_mutations_per_gene = NULL,
                               total_mutations = NULL,
                               signature = NULL, genome = NULL, transcripts = NULL,
                               targets = NULL) {

  if (is.null(geneset)) {

    message('Please specify gene names.')

    return(NULL)

  }

  if (is.null(samples)) {

    message('Please specify sample names.')

    return(NULL)

  }

  if (is.null(genome)) {

    message('Please specify the genome object.')

    return(NULL)

  }

  if (is.null(transcripts)) {

    message('Please specify the TxDb object.')

    return(NULL)

  }

  if (is.null(signature)) {

    message('Signature is not specified, so the overall mutation rate is considered uniform.')

  }


  if (is.null(silent_mutations_per_gene)) {

    message('Please specify the total number of silent mutations per gene.')

    return(NA)
  }

  if (length(silent_mutations_per_gene)==1) {

    silent_mutations_per_gene <- rep(silent_mutations_per_gene, length(samples))

    names(silent_mutations_per_gene) <- geneset

  }

  if (is.null(all_silent)) {

    message('Please specify the total number of silent mutations.')

    return(NA)
  }


  if (is.null(total_mutations)) {

    message('Please specify the total number of mutations per sample.')

  }

  if (length(total_mutations)==1) {

    total_mutations <- rep(total_mutations, length(samples))

    names(total_mutations) <- samples

  }


  if (is.null(signature)) {

    message('Signature in not specified, so the overall mutation rate is considered uniform.')

  }

  upload_signatures()

  gene_features <- get_all_features(geneset, transcripts, genome, targets)

  new_probs_long <- get_all_probabilities(geneset,
                                          gene_features,
                                          signature)

  variants <- list()

  silent <- NULL

  nonsilent <- NULL

  print('Starting simulation:')

  for (j in 1:N) {

      variants[[j]] <- matrix(0, ncol = 7, dimnames = list(NULL, c("Sample", "Chrom", "Pos", "Ref", "Mut", "Gene", "Effect")))

      for (pat in samples) {

          for (y in geneset) {

              new_probs <- c(signature[types.full],
                             signature[types.full])/
                               tri.counts.exome[c(types, types), 1] *
                               tri.counts(gene_features[[y]])[new.types]

              new_probs <- new_probs/sum(new_probs)

              names(new_probs) <- new.types.full

              local_total <- rpois(1, lambda = silent_mutations_per_gene[y] / all_silent * total_mutations[pat])

              if (local_total > 0) {

                nn <- sample(1:length(positions(gene_features[[y]])), local_total, prob = new_probs_long[[y]])

                for (ind in nn) {

                  mut_to_prob <- new_probs[new.types == trinucleotides(gene_features[[y]])[ind]]

                  mut_to_base <- sample(substr(names(mut_to_prob), 5, 5), size = 1, prob = mut_to_prob)

                  variants[[j]] <- rbind(variants[[j]],
                                         c(pat,
                                           gen_chrom(gene_features[[y]]),
                                           positions(gene_features[[y]])[ind],
                                           substr(trinucleotides(gene_features[[y]])[ind], 2, 2),
                                           mut_to_base,
                                           y, "unknown"))

                }

              }

          }


        }


        variants[[j]] <- annotate_simulated_variants(variants[[j]][-1, ],
                                                     txdb = transcripts,
                                                     genome = genome)

        silent <- c(silent, sum(variants[[j]][,'Effect'] == "synonymous"))

        nonsilent <- c(nonsilent, sum(!(variants[[j]][,'Effect'] %in% c("synonymous", "unknown"))))

        if (j %% 10 == 0) {

          print(paste0('\nProgress: ', j/N * 100, '%'))

        }

    }

    return(list(silent = silent, nonsilent = nonsilent, variants = variants))

}



#' Upload cancer signatures and mutation types from COSMIC
#'
#' @description Uploades COSMIC mutation signature set from [COSMIC website]("http://cancer.sanger.ac.uk/cancergenome/assets/signatures_probabilities.txt"),
#'              defines context and mutation types: \code{types}, \code{types.full}. Also
#'              defines exome trinucleotide counts.
#'
#' upload_signatures()
#'
#' @export
upload_signatures <- function() {

  # upload cancer signatures

  sp_url <- paste("http://cancer.sanger.ac.uk/cancergenome/assets/",
                  "signatures_probabilities.txt", sep = "")

  cancer_signatures <<- read.table(sp_url, sep = "\t", header = TRUE)

  cancer_signatures <<- cancer_signatures[order(cancer_signatures[,1]),]

  types <<- as.character(cancer_signatures$Trinucleotide) # trinucleotide classes

  types.full <<- as.character(cancer_signatures$Somatic.Mutation.Type) # substitution types

  row.names(cancer_signatures) <<- types.full

  cancer_signatures <<- as.matrix(cancer_signatures[,4:33])

  new.types <<- c(types, sapply(types, RevCom))

  new.types.full <<- c(types.full, sapply(types.full, RevComMutType))

  tri.counts.exome <<- deconstructSigs::tri.counts.exome

}
nvolkova/MutSim documentation built on May 15, 2019, 4:48 p.m.