R/getTCGAMutationData.R

#' Get TCGA mutation data
#'
#' Downloads, filters and populates the SQLite database with TCGA mutation data using functionality from the \code{\link[RTCGA]{RTCGA}} package
#'
#' @param con A \code{SQLiteConnection} object
#' @param releaseDate GDAC release date.  See \code{\link[RTCGA]{downloadTCGA}} documentation
#' @param sampletag Refers to the sample part of the barcode.  Default is 01A.  See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @param sampleids A vector of sample ids to include.  Default is NULL - ie no filtering.  See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @return a data frame
#' @export


getTCGAMutationData <- function(con, cancerTypes='LUAD', releaseDate='2015-11-01', sampletag='01A', sampleids=NULL) {

    #set up temporary dir
    dld <- paste(tempdir(), format(Sys.time(), "%Y%m%d%H%M%OS6"), sep='/')
    dir.create(dld)
    RTCGA::downloadTCGA( cancerTypes = cancerTypes, destDir = dld, date = releaseDate,
                         dataSet = "Mutation_Packager_Calls.Level" )

    # reading data
    datadir <- list.files( dld) %>% file.path( dld, .)
    #maf_files <- list.files(datadir) %>%
    #    grep(x = ., pattern = "TCGA", value = TRUE)
    maf_data_import <- RTCGA::readTCGA(datadir, 'mutations')
    maf_data <- maf_data_import %>%
        dplyr::select(Hugo_Symbol, Entrez_Gene_Id, Center, NCBI_Build, Chromosome,
                                                  Start_position, End_position, Strand, Variant_Classification,
                                                  Variant_Type, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
                                                  dbSNP_RS, dbSNP_Val_Status, Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode,
                                                  Match_Norm_Seq_Allele1, Match_Norm_Seq_Allele2, Tumor_Validation_Allele1,
                                                  Tumor_Validation_Allele2, Match_Norm_Validation_Allele1,
                                                  Match_Norm_Validation_Allele2, Verification_Status, Validation_Status,
                                                  Mutation_Status, Sequencing_Phase, Sequence_Source, Validation_Method,
                                                  Score, BAM_file, Sequencer, Tumor_Sample_UUID, Matched_Norm_Sample_UUID,
                                                  Genome_Change, Annotation_Transcript, Transcript_Strand,
                                                  Transcript_Exon, Transcript_Position, cDNA_Change, Codon_Change,
                                                  Protein_Change) %>%
        dplyr::mutate_each(dplyr::funs(as.character)) %>% #convert to character
        dplyr::filter(Hugo_Symbol != 'Hugo_Symbol')  #get rid of headings

    # make ensembl to entrez conversion table
    conv_table <- dplyr::src_sqlite(con@dbname) %>%
        dplyr::tbl('human_genes') %>%
        dplyr::select(gene_id, entrezid) %>%
        dplyr::collect() %>%
        dplyr::group_by(entrezid) %>%
        dplyr::summarise(N=n(), gene_id=first(gene_id, order_by=NULL)) %>%
        dplyr::ungroup() %>%
        dplyr::arrange(desc(N))

    #combine ensembl and entrezid tables
    combo_table <- maf_data %>%
        dplyr::inner_join(conv_table, by=c('Entrez_Gene_Id'='entrezid'))

    #add PatientID and SampleID columns
    combo_table <- combo_table %>%
        dplyr::mutate(sample_id = substr(Tumor_Sample_Barcode, 1, 16),
                      patient_id = substr(sample_id, 1, 12),
                      sample_tag = substr(sample_id,14,16))

    #if patient ids to use are specified then filter
    if(!is.null(sampleids)) {
        combo_table <- combo_table %>% dplyr::filter (sample_id %in% sampleids)
    }

    #if sample tags to use are specified then filter
    if(!is.null(sampletag)) {
        combo_table <- combo_table %>% dplyr::filter(sample_tag %in% sampletag) #just the 01A sample by default, ie first primary tumor sample for the patient

    }

    ###  MAKE SURE ONLY ONE SAMPLE PER PATIENT!!
    dupcheck_df <- combo_table %>%
        dplyr::select(patient_id, sample_id) %>%
        dplyr::distinct() %>%
        dplyr::group_by(patient_id) %>%
        dplyr::summarise(N=n(), sample_id=first(sample_id, order_by=NULL)) %>%
        dplyr::ungroup()

    combo_table <- combo_table %>% dplyr::filter(sample_id %in% dupcheck_df$sample_id)
    numdups <- dupcheck_df %>% dplyr::filter(N>1) %>% nrow()
    if(numdups > 0) {
        warning(sprintf('%s patients had more than one sample, taking highest ordered sample ie 01A before 06A etc', numdups))
    }

    ###  WRITE TO DATABASE
    DBI::dbWriteTable(con, 'tcga_mutation_data', as.data.frame(combo_table), overwrite=TRUE)

    ###  INDEX DATABASE
    #print ( 'now indexing tcga_mutation_data')
    #res <- DBI::dbGetQuery ( con , ' CREATE INDEX `tcga_rnaseq_data_gene_id` ON `tcga_rnaseq_data` (`gene_id` ASC); '   )
    #print ( 'finished indexing' )

    return(combo_table)

}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.