R/getTCGARNAseqData.R

#' Get TCGA RNAseq data
#'
#' Downloads, filters and populates the SQLite database with TCGA RNAseq data using functionality from the \code{\link[RTCGA]{RTCGA}} package
#'
#' @param con A \code{SQLiteConnection} object
#' @param releaseDate GDAC release date.  See \code{\link[RTCGA]{downloadTCGA}} documentation
#' @param sampletag Refers to the sample part of the barcode.  Default is 01A.  See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @param sampleids A vector of sample ids to include.  Default is NULL - ie no filtering.  See \link[=https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode]{TCGA wiki}
#' @return a data frame
#' @export


getTCGARNAseqData <- function(con, cancerTypes='LUAD', releaseDate='2015-11-01', sampletag='01A', sampleids=NULL) {

    #set up temporary dir
    dld <- paste(tempdir(), format(Sys.time(), "%Y%m%d%H%M%OS6"), sep='/')
    dir.create(dld)
    RTCGA::downloadTCGA( cancerTypes = cancerTypes, destDir = dld, date = releaseDate,
                  dataSet = "rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level" )

    # reading data
    datadir <- list.files( dld) %>% file.path( dld, .)
    RNAseqfile <- datadir %>% list.files %>% grep( pattern = 'illuminahiseq', x = ., value = TRUE)
    rnaseq_data <- RTCGA::readTCGA( path = file.path( datadir, RNAseqfile), dataType = 'rnaseq' )

    # convert colnames to ensembl ids
    colname_table <- data.frame(colnames=colnames(rnaseq_data)[-1],
                                entrezid = gsub('^(.*)\\|', '', colnames(rnaseq_data)[-1]),
                                stringsAsFactors = FALSE) %>%
        dplyr::mutate(idx=1+row_number())

    # make ensembl to entrez conversion table
    conv_table <- dplyr::src_sqlite(con@dbname) %>%
        dplyr::tbl('human_genes') %>%
        dplyr::select(gene_id, entrezid) %>%
        dplyr::collect() %>%
        dplyr::group_by(entrezid) %>%
        dplyr::summarise(N=n(), first_ensid=first(gene_id, order_by=NULL)) %>%
        dplyr::ungroup() %>%
        dplyr::arrange(desc(N))

    #combine ensembl and entrezid tables
    combo_table <- colname_table %>%
        dplyr::inner_join(conv_table, by='entrezid') %>%
        dplyr::group_by(first_ensid) %>%
        dplyr::mutate(dup_ensid = n()>1) %>%
        dplyr::ungroup()

    #filter rnaseq columns
    rnaseq_data_filtered <- rnaseq_data [ , c(1,combo_table$idx) ]
    colnames(rnaseq_data_filtered)[-1] <- combo_table$first_ensid

    #make tall and skinny
    rnaseq_data_tall <- rnaseq_data_filtered %>%
        tidyr::gather(key = 'gene_id', value = 'value', -bcr_patient_barcode)

    #add PatientID and SampleID columns
    rnaseq_data_tall <- rnaseq_data_tall %>%
        dplyr::mutate(sample_id = substr(bcr_patient_barcode, 1, 16),
               patient_id = substr(sample_id, 1, 12),
               sample_tag = substr(sample_id,14,16))

    #if patient ids to use are specified then filter
    if(!is.null(sampleids)) {
        rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter (sample_id %in% sampleids)
    }

    #if sample tags to use are specified then filter
    if(!is.null(sampletag)) {
        rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter(sample_tag %in% sampletag) #just the 01A sample by default, ie first primary tumor sample for the patient

    }

    ###  MAKE SURE ONLY ONE SAMPLE PER PATIENT!!
    dupcheck_df <- rnaseq_data_tall %>%
        dplyr::select(patient_id, sample_id) %>%
        dplyr::distinct() %>%
        dplyr::group_by(patient_id) %>%
        dplyr::summarise(N=n(), sample_id=first(sample_id, order_by=NULL)) %>%
        dplyr::ungroup()

    rnaseq_data_tall <- rnaseq_data_tall %>% dplyr::filter(sample_id %in% dupcheck_df$sample_id)
    numdups <- dupcheck_df %>% dplyr::filter(N>1) %>% nrow()
    if(numdups > 0) {
        warning(sprintf('%s patients had more than one sample, taking highest ordered sample ie 01A before 06A etc', numdups))
    }

    ###  WRITE TO DATABASE
    DBI::dbWriteTable(con, 'tcga_rnaseq_data', rnaseq_data_tall, overwrite=TRUE)

    ###  INDEX DATABASE
    print ( 'now indexing tcga_rnaseq_data')
    res <- DBI::dbGetQuery ( con , ' CREATE INDEX `tcga_rnaseq_data_gene_id` ON `tcga_rnaseq_data` (`gene_id` ASC); '   )
    print ( 'finished indexing' )

    return(rnaseq_data_tall)

}
chapmandu2/CollateralVulnerability2016 documentation built on May 13, 2019, 3:27 p.m.