# R/doMutationAnalysis.R

#' doMutationAnalysis
#'
#' For each requested gene, counts the patients carrying at least one loss of
#' function mutation (\code{Variant_Classification} matching
#' \code{Nonsense}, \code{Frame_Shift} or \code{Splice}) and the patients
#' carrying at least one protein coding change (the same classes plus
#' \code{Missense}), and expresses both counts as a percentage of all distinct
#' patients present in the \code{tcga_mutation_data} table.
#'
#' As a side effect two tables are written to \code{con}, overwriting any
#' existing tables of the same name: \code{mutation_analysis_results} (the
#' returned per-gene summary) and \code{mutation_analysis_data} (the
#' per-mutation rows for the requested genes with the \code{is_lof} and
#' \code{is_protein_coding} flags added).
#'
#' @param con A \code{SQLiteConnection} object. The database must contain a
#'   table \code{tcga_mutation_data} with at least the columns \code{gene_id},
#'   \code{patient_id}, \code{Variant_Classification} and \code{Protein_Change}.
#' @param genes A vector of human ENSEMBL gene ids
#' @return a data frame with one row per requested gene and the columns
#'   \code{gene_id}, \code{N_lof}, \code{pct_lof}, \code{N_protein_coding},
#'   \code{pct_protein_coding}, \code{patient_ids} and \code{mutations}.
#'   \code{patient_ids} and \code{mutations} are \code{;}-separated strings
#'   with one entry per mutation row; genes without any mutation rows get
#'   zero counts and empty strings.
#' @export
doMutationAnalysis <- function(con, genes) {

    # Query through the supplied connection instead of reopening the database
    # by file name: this avoids creating extra connections that are never
    # closed and makes sure the caller's database is the one being read.
    mutation_tbl <- dplyr::tbl(con, 'tcga_mutation_data')

    #get the data
    mutdata <- mutation_tbl %>%
        dplyr::filter(gene_id %in% genes) %>%
        dplyr::collect()

    #process the data
    #grepl() returns FALSE for NA, so unclassified rows are flagged FALSE
    mutdata_processed <- mutdata %>%
        dplyr::mutate(is_lof = grepl('Nonsense|Frame_Shift|Splice', Variant_Classification), #only LOF mutations
                      is_protein_coding = grepl('Nonsense|Frame_Shift|Splice|Missense', Variant_Classification)) %>% #any protein coding mutation
        as.data.frame()

    #how many patients in total (denominator covers all genes in the table,
    #not just the requested ones)
    Npatients <- mutation_tbl %>%
        dplyr::select(patient_id) %>%
        dplyr::distinct() %>%
        dplyr::collect() %>%
        nrow()

    #count up number of mutated samples per gene
    #n_distinct() is used so that a patient with several qualifying mutations
    #in the same gene is counted once; summing the flags would count mutation
    #rows and could push the percentage above 100
    output_df <- mutdata_processed %>%
        dplyr::group_by(gene_id) %>%
        dplyr::summarise(N_lof = dplyr::n_distinct(patient_id[is_lof]),
                         pct_lof = round(N_lof*100/Npatients, 2),
                         N_protein_coding = dplyr::n_distinct(patient_id[is_protein_coding]),
                         pct_protein_coding = round(N_protein_coding*100/Npatients, 2),
                         patient_ids = paste(patient_id, collapse=';'),
                         mutations = paste(Protein_Change, collapse=';')) %>%
        dplyr::ungroup()

    #add in genes with no mutations
    #every column is sized explicitly: data.frame() does not recycle length-1
    #values against a zero-length column, so the scalar form fails when all
    #requested genes have at least one mutation
    missing_genes <- setdiff(genes, output_df$gene_id)
    n_missing <- length(missing_genes)
    no_mut_genes <- data.frame(gene_id = missing_genes,
                               N_lof = rep(0L, n_missing),
                               pct_lof = rep(0, n_missing),
                               N_protein_coding = rep(0L, n_missing),
                               pct_protein_coding = rep(0, n_missing),
                               patient_ids = rep('', n_missing),
                               mutations = rep('', n_missing),
                               stringsAsFactors = FALSE)

    #combine output
    output_df <- dplyr::bind_rows(output_df, no_mut_genes) %>% as.data.frame()

    #put data into sqlite database
    DBI::dbWriteTable(con, "mutation_analysis_results", output_df, overwrite=TRUE)
    DBI::dbWriteTable(con, 'mutation_analysis_data', mutdata_processed, overwrite=TRUE)
    message('Finished')

    output_df

}
# chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.