#' Get TCGA mutation data
#'
#' Downloads, filters and populates the SQLite database with TCGA mutation data using functionality from the \code{\link[RTCGA]{RTCGA}} package
#'
#' @param con A \code{SQLiteConnection} object
#' @param releaseDate GDAC release date. See \code{\link[RTCGA]{downloadTCGA}} documentation
#' @param sampletag Refers to the sample part of the barcode. Default is 01A. See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @param sampleids A vector of sample ids to include. Default is NULL - ie no filtering. See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @return a data frame
#' @export
getTCGAMutationData <- function(con, cancerTypes='LUAD', releaseDate='2015-11-01', sampletag='01A', sampleids=NULL) {
#set up temporary dir
dld <- paste(tempdir(), format(Sys.time(), "%Y%m%d%H%M%OS6"), sep='/')
dir.create(dld)
RTCGA::downloadTCGA( cancerTypes = cancerTypes, destDir = dld, date = releaseDate,
dataSet = "Mutation_Packager_Calls.Level" )
# reading data
datadir <- list.files( dld) %>% file.path( dld, .)
#maf_files <- list.files(datadir) %>%
# grep(x = ., pattern = "TCGA", value = TRUE)
maf_data_import <- RTCGA::readTCGA(datadir, 'mutations')
maf_data <- maf_data_import %>%
dplyr::select(Hugo_Symbol, Entrez_Gene_Id, Center, NCBI_Build, Chromosome,
Start_position, End_position, Strand, Variant_Classification,
Variant_Type, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
dbSNP_RS, dbSNP_Val_Status, Tumor_Sample_Barcode, Matched_Norm_Sample_Barcode,
Match_Norm_Seq_Allele1, Match_Norm_Seq_Allele2, Tumor_Validation_Allele1,
Tumor_Validation_Allele2, Match_Norm_Validation_Allele1,
Match_Norm_Validation_Allele2, Verification_Status, Validation_Status,
Mutation_Status, Sequencing_Phase, Sequence_Source, Validation_Method,
Score, BAM_file, Sequencer, Tumor_Sample_UUID, Matched_Norm_Sample_UUID,
Genome_Change, Annotation_Transcript, Transcript_Strand,
Transcript_Exon, Transcript_Position, cDNA_Change, Codon_Change,
Protein_Change) %>%
dplyr::mutate_each(dplyr::funs(as.character)) %>% #convert to character
dplyr::filter(Hugo_Symbol != 'Hugo_Symbol') #get rid of headings
# make ensembl to entrez conversion table
conv_table <- dplyr::src_sqlite(con@dbname) %>%
dplyr::tbl('human_genes') %>%
dplyr::select(gene_id, entrezid) %>%
dplyr::collect() %>%
dplyr::group_by(entrezid) %>%
dplyr::summarise(N=n(), gene_id=first(gene_id, order_by=NULL)) %>%
dplyr::ungroup() %>%
dplyr::arrange(desc(N))
#combine ensembl and entrezid tables
combo_table <- maf_data %>%
dplyr::inner_join(conv_table, by=c('Entrez_Gene_Id'='entrezid'))
#add PatientID and SampleID columns
combo_table <- combo_table %>%
dplyr::mutate(sample_id = substr(Tumor_Sample_Barcode, 1, 16),
patient_id = substr(sample_id, 1, 12),
sample_tag = substr(sample_id,14,16))
#if patient ids to use are specified then filter
if(!is.null(sampleids)) {
combo_table <- combo_table %>% dplyr::filter (sample_id %in% sampleids)
}
#if sample tags to use are specified then filter
if(!is.null(sampletag)) {
combo_table <- combo_table %>% dplyr::filter(sample_tag %in% sampletag) #just the 01A sample by default, ie first primary tumor sample for the patient
}
### MAKE SURE ONLY ONE SAMPLE PER PATIENT!!
dupcheck_df <- combo_table %>%
dplyr::select(patient_id, sample_id) %>%
dplyr::distinct() %>%
dplyr::group_by(patient_id) %>%
dplyr::summarise(N=n(), sample_id=first(sample_id, order_by=NULL)) %>%
dplyr::ungroup()
combo_table <- combo_table %>% dplyr::filter(sample_id %in% dupcheck_df$sample_id)
numdups <- dupcheck_df %>% dplyr::filter(N>1) %>% nrow()
if(numdups > 0) {
warning(sprintf('%s patients had more than one sample, taking highest ordered sample ie 01A before 06A etc', numdups))
}
### WRITE TO DATABASE
DBI::dbWriteTable(con, 'tcga_mutation_data', as.data.frame(combo_table), overwrite=TRUE)
### INDEX DATABASE
#print ( 'now indexing tcga_mutation_data')
#res <- DBI::dbGetQuery ( con , ' CREATE INDEX `tcga_rnaseq_data_gene_id` ON `tcga_rnaseq_data` (`gene_id` ASC); ' )
#print ( 'finished indexing' )
return(combo_table)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.