R/get_mutation_types.R

Defines functions all_snv_mutation_types get_trinucleotide base_complement collapse_mutation_types stitch_mutation_types get_snv_mutation_type mutations_to_catalog

Documented in all_snv_mutation_types base_complement collapse_mutation_types get_snv_mutation_type get_trinucleotide mutations_to_catalog stitch_mutation_types

#' Gets a list of the standard 96 mutation types used in SNV mutation signature analysis.
#'
#' @return A 96-element character vector containing SNV mutation types
#'
#' @import dplyr
#' @import tidyr
#' @export

all_snv_mutation_types <- function() {
    bases = c('A', 'C', 'G', 'T')
    changes = c('C>A', 'C>G', 'C>T', 'T>A', 'T>C', 'T>G')
    all_mutation_types <- crossing(
        five_prime = bases,
        substitution = changes,
        three_prime = bases
    ) %>% 
    mutate(
        mutation_type = paste0(five_prime, '[', substitution, ']', three_prime)
    ) %>% 
    .$mutation_type
}

#' Gets the trinucleotide context for a set of genomic coordinates.
#'
#' @param chr       Character vector of chromosome names. Must match provided genome.
#' @param pos       Integer vector of genomic positions.
#' @param genome    BSgenome object - default is BSgenome.Hsapiens.UCSC.hg19
#'
#' @return          Character vector of trinucleotide contexts
#'
#' @importFrom BSgenome getBSgenome getSeq
#' @import dplyr

get_trinucleotide <- function(chr, pos, ref, genome = NULL) {
    if (is.null(genome)) {
        genome = getBSgenome('BSgenome.Hsapiens.UCSC.hg19')
    }
    chr_levels <- chr %>% unique
    if (all(chr_levels %in% seqlevels(genome))) {
        message('Chromosome levels match. Proceeding...')
        chromosome = chr
    } else if (all(paste0('chr', chr_levels) %in% seqlevels(genome))) {
        message('Chromosome levels match after prepending "chr". Proceeding...')
        warning('"chr" was automatically prepended to all chromosome names to match genome seqlevels')
        chromosome = paste0('chr', chr)
    } else if (all(gsub('chr', '', chr_levels) %in% seqlevels(genome))) {
        message('Chromosome levels match after removing "chr". Proceeding...')
        warning('"chr" was automatically removed from all chromosome names to match genome seqlevels')
        chromosome = gsub('chr', '', chr)
    } else {
        stop('Some chromosome names do not match genome seqlevels. Cannot proceed.')
    }

    trinucleotide = getSeq(genome, chromosome, pos - 1, pos + 1) %>% as.character
    genome_ref = substr(trinucleotide, 2, 2)
    
    if (any(genome_ref != ref)) {
        if (all(genome_ref[genome_ref != ref] == 'N')) {
            warning(sprintf('%s positions matched N in genome ref.', sum(genome_ref != ref)))
        } else {
            stop('Some positions did not match reference genome. Stopping.')
        }
    }

    return(trinucleotide)
}

#' Returns the complementary sequence for a set of strings
#'
#' @param base_strings  Character vector of sequences to return the complement of.
#'                      Any characters aside from A, C, G, T will be returned as is.
#'
#' @param reverse       If TRUE, then the sequence is reversed before it is returned.
#'
#' @return Character vector of complement or reverse complement sequences
#'
#' @import dplyr

base_complement <- function(base_strings, reverse = FALSE) {
    conversion = c(A = 'T', T = 'A', C = 'G', G = 'C')
    strsplit(base_strings, '') %>%
        sapply(function(bases) {
            base_vec = if_else(
                bases %in% names(conversion),
                conversion[bases],
                bases
            )
            if (reverse) {
                base_vec <- rev(base_vec)
            }
            base_vec %>% paste(collapse='') 
        })
}

#' Collapses mutation types of G>N and A>N mutations into C>N and T>N mutations
#' so that there are 6 possible base changes.
#'
#' @param mutation_types Character vector of mutation types in format C[T>A]G
#'                       as an example of a T>A mutation in CTG trinucleotide context.
#' 
#' @return New mutation types with any G>N/A>N mutations replaced and their
#'         trinucleotide contexts reverse complemented
#'

collapse_mutation_types <- function(mutation_types) {
    trinucleotide = gsub('(.)\\[(.)>(.)\\](.)', '\\1\\2\\4', mutation_types)
    substitution = gsub('(.)\\[(.)>(.)\\](.)', '\\2>\\3', mutation_types)
    ref_base = gsub('(.)\\[(.)>(.)\\](.)', '\\2', mutation_types)

    to_reverse = ref_base %in% c('G', 'A')
    trinucleotide[to_reverse] = base_complement(trinucleotide[to_reverse], reverse = TRUE)
    substitution[to_reverse] = base_complement(substitution[to_reverse], reverse = FALSE)

    stitch_mutation_types(substitution, trinucleotide)
}

#' Given a set of substitutions and trinucleotides, merge into unique mutation
#' types. For example, T>A mutation in CTG context will become C[T>A]G.
#'
#' @param substitution      Character vector of substitutions in the form T>A
#' @param trinucleotide     Character vector of trinucleotide contexts in the form CTG
#' @return Character vector of mutation types in the form C[T>A]G

stitch_mutation_types <- function(substitution, trinucleotide) {
    sprintf(
        '%s[%s]%s',
        substr(trinucleotide, 1, 1),
        substitution,
        substr(trinucleotide, 3, 3)
    )
}

#' Given chromosome, position, reference, and altered allele values, return a vector of
#' SNV mutation types conforming to the standard 96-element mutation types.
#'
#' @param chr       Character vector of chromosomes (must match genome seqlevels)
#' @param pos       Integer vector of mutation positions
#' @param ref       Character vector of reference alleles (A, C, G, or T)
#' @param alt       Character vector of mutated alleles (A, C, G, or T)
#' @param genome    BSgenome object from which to derive trinucleotide context sequences
#' @return          Character vector of mutation types
#'
#' @import tibble
#' @import dplyr
#' @export

get_snv_mutation_type <- function(chr, pos, ref, alt, genome = NULL) {
    if (is.null(genome)) {
        message('No BSgenome object provided. Using default: BSgenome.Hsapiens.UCSC.hg19')
        genome = getBSgenome('BSgenome.Hsapiens.UCSC.hg19')
    }
    tibble(chr, pos, ref, alt) %>%
        mutate(
            trinucleotide_ = get_trinucleotide(chr, pos, ref, genome = genome),
            substitution_ = paste(ref, alt, sep='>'),
            mutation_type_ = stitch_mutation_types(substitution_, trinucleotide_)
        ) %>%
        mutate(mutation_type = collapse_mutation_types(mutation_type_)) %>%
        .$mutation_type
}


#' Given chromosome, position, reference, and altered allele values, returns a mutation catalog.
#' Assumes SNV mutation types conforming to the standard 96-element mutation types.
#'
#' @param chr       Character vector of chromosomes (must match genome seqlevels)
#' @param pos       Integer vector of mutation positions
#' @param ref       Character vector of reference alleles (A, C, G, or T)
#' @param alt       Character vector of mutated alleles (A, C, G, or T)
#' @param genome    BSgenome object from which to derive trinucleotide context sequences
#' @return          Data frame with columns mutation_type and count
#'
#' @import tibble
#' @import dplyr
#' @export

mutations_to_catalog <- function(chr, pos, ref, alt, genome=NULL) {
    tibble(chr, pos, ref, alt) %>%
        mutate(
            mutation_type = get_snv_mutation_type(chr, pos, ref, alt, genome)
        ) %>%
        group_by(mutation_type) %>%
        summarise(count = n()) %>%
        ungroup() %>%
        right_join(tibble(mutation_type = all_snv_mutation_types()), by = 'mutation_type') %>%
        replace_na(list(count = 0))
}
eyzhao/SignIT documentation built on Dec. 6, 2019, 11:45 a.m.