#' Get TCGA RNAseq data
#'
#' Downloads, filters and populates the SQLite database with TCGA RNAseq data using functionality from the \code{\link[RTCGA]{RTCGA}} package
#'
#' @param con A \code{SQLiteConnection} object
#' @param releaseDate GDAC release date. See \code{\link[RTCGA]{downloadTCGA}} documentation
#' @param sampletag Refers to the sample part of the barcode. Default is 01A. See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @param sampleids A vector of sample ids to include. Default is NULL - ie no filtering. See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @return a data frame
#' @export
getTCGARNAseqData <- function(con, cancerTypes='LUAD', releaseDate='2015-11-01', sampletag='01A', sampleids=NULL) {
#set up temporary dir
dld <- paste(tempdir(), format(Sys.time(), "%Y%m%d%H%M%OS6"), sep='/')
dir.create(dld)
RTCGA::downloadTCGA( cancerTypes = cancerTypes, destDir = dld, date = releaseDate,
dataSet = "rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level" )
# reading data
datadir <- list.files( dld) %>% file.path( dld, .)
RNAseqfile <- datadir %>% list.files %>% grep( pattern = 'illuminahiseq', x = ., value = TRUE)
rnaseq_data <- RTCGA::readTCGA( path = file.path( datadir, RNAseqfile), dataType = 'rnaseq' )
# convert colnames to ensembl ids
colname_table <- data.frame(colnames=colnames(rnaseq_data)[-1],
entrezid = gsub('^(.*)\\|', '', colnames(rnaseq_data)[-1]),
stringsAsFactors = FALSE) %>%
dplyr::mutate(idx=1+row_number())
# make ensembl to entrez conversion table
conv_table <- dplyr::src_sqlite(con@dbname) %>%
dplyr::tbl('human_genes') %>%
dplyr::select(gene_id, entrezid) %>%
dplyr::collect() %>%
dplyr::group_by(entrezid) %>%
dplyr::summarise(N=n(), first_ensid=first(gene_id, order_by=NULL)) %>%
dplyr::ungroup() %>%
dplyr::arrange(desc(N))
#combine ensembl and entrezid tables
combo_table <- colname_table %>%
dplyr::inner_join(conv_table, by='entrezid') %>%
dplyr::group_by(first_ensid) %>%
dplyr::mutate(dup_ensid = n()>1) %>%
dplyr::ungroup()
#filter rnaseq columns
rnaseq_data_filtered <- rnaseq_data [ , c(1,combo_table$idx) ]
colnames(rnaseq_data_filtered)[-1] <- combo_table$first_ensid
#make tall and skinny
rnaseq_data_tall <- rnaseq_data_filtered %>%
tidyr::gather(key = 'gene_id', value = 'value', -bcr_patient_barcode)
#add PatientID and SampleID columns
rnaseq_data_tall <- rnaseq_data_tall %>%
dplyr::mutate(sample_id = substr(bcr_patient_barcode, 1, 16),
patient_id = substr(sample_id, 1, 12),
sample_tag = substr(sample_id,14,16))
#if patient ids to use are specified then filter
if(!is.null(sampleids)) {
rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter (sample_id %in% sampleids)
}
#if sample tags to use are specified then filter
if(!is.null(sampletag)) {
rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter(sample_tag %in% sampletag) #just the 01A sample by default, ie first primary tumor sample for the patient
}
### MAKE SURE ONLY ONE SAMPLE PER PATIENT!!
dupcheck_df <- rnaseq_data_tall %>%
dplyr::select(patient_id, sample_id) %>%
dplyr::distinct() %>%
dplyr::group_by(patient_id) %>%
dplyr::summarise(N=n(), sample_id=first(sample_id, order_by=NULL)) %>%
dplyr::ungroup()
rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter(sample_id %in% dupcheck_df$sample_id)
numdups <- dupcheck_df %>% dplyr::filter(N>1) %>% nrow()
if(numdups > 0) {
warning(sprintf('%s patients had more than one sample, taking highest ordered sample ie 01A before 06A etc', numdups))
}
### WRITE TO DATABASE
DBI::dbWriteTable(con, 'tcga_rnaseq_data', rnaseq_data_tall, overwrite=TRUE)
### INDEX DATABASE
print ( 'now indexing tcga_rnaseq_data')
res <- DBI::dbGetQuery ( con , ' CREATE INDEX `tcga_rnaseq_data_gene_id` ON `tcga_rnaseq_data` (`gene_id` ASC); ' )
print ( 'finished indexing' )
return(rnaseq_data_tall)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.