readTCGA: Read TCGA data to the tidy format
In RTCGA: The Cancer Genome Atlas Data Integration

Description Usage Arguments Details Value Issues Author(s) See Also Examples

The readTCGA function allows reading unzipped files:

clinical data - Merge_Clinical.Level_1
rnaseq data (genes' expressions) - rnaseqv2__illuminahiseq_rnaseqv2
genes' mutations data - Mutation_Packager_Calls.Level
Reverse phase protein array data (RPPA) - protein_normalization__data.Level_3
Merge transcriptome agilent data (mRNA) - Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3
miRNASeq data - Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3 or Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3
methylation data - Merge_methylation__humanmethylation27
isoforms data - Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3

from the TCGA project. Those files can be easily downloaded with the downloadTCGA function. See examples.

1	readTCGA(path, dataType, ...)

`path`	See details and examples.
`dataType`	One of `'clinical', 'rnaseq', 'mutations', 'RPPA', 'mRNA', 'miRNASeq', 'methylation', 'isoforms'` depending on which type of data user is trying to read in the tidy format.
`...`	Further arguments passed to the as.data.frame.

All cohort names can be checked using: sub( x = names( infoTCGA() ), '-counts', '').

Parameter path specification:

If dataType = 'clinical' a path to a cancerType.clin.merged.txt file.
If dataType = 'mutations' a path to the unzipped folder Mutation_Packager_Calls.Level containing .maf files.
If dataType = 'rnaseq' a path to the unzipped file rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level.
If dataType = 'RPPA' a path to the unzipped file in folder protein_normalization__data.Level_3.
If dataType = 'mRNA' a path to the unzipped file cancerType.transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.data.txt.
If dataType = 'miRNASeq' a path to unzipped files cancerType.mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt or cancerType.mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.data.txt
If dataType = 'methylation' a path to unzipped files cancerType.methylation__humanmethylation27__jhu_usc_edu__Level_3__within_bioassay_data_set_function__data.data.txt.
If dataType = 'isoforms' a path to unzipped files cancerType.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.data.txt.

An output:

If dataType = 'clinical' a data.frame with clinical data.
If dataType = 'rnaseq' a data.frame with rnaseq data.
If dataType = 'mutations' a data.frame with mutations data.
If dataType = 'RPPA' a data.frame with RPPA data.
If dataType = 'mRNA' a data.frame with mRNA data.
If dataType = 'miRNASeq' a data.frame with miRNASeq data.
If dataType = 'methylation' a data.frame with methylation data.
If dataType = 'isoforms' a data.frame with isoforms data.

If you have any problems, issues or think that something is missing or is not clear please post an issue on https://github.com/RTCGA/RTCGA/issues.

Marcin Kosinski, m.p.kosinski@gmail.com

Witold Chodor, witoldchodor@gmail.com

RTCGA website http://rtcga.github.io/RTCGA/Download.html.

Other RTCGA: RTCGA-package, boxplotTCGA, checkTCGA, convertTCGA, datasetsTCGA, downloadTCGA, expressionsTCGA, heatmapTCGA, infoTCGA, installTCGA, kmTCGA, mutationsTCGA, pcaTCGA, survivalTCGA, theme_RTCGA

## Not run:  

##############
##### clinical
##############

# Directory in which the downloaded clinical data will be stored
dir.create('data')

# downloading clinical data
# dataset = "clinical" is default parameter so we may omit it
downloadTCGA( cancerTypes = c('BRCA', 'OV'),
              destDir = 'data' )


# reading datasets
# For every cohort: locate its Clinical folder inside 'data', read the
# <cohort>.clin.merged.txt file and store the result in the global
# environment under the name <cohort>.clin.data.
# A for loop is used (rather than sapply) because the iteration is performed
# only for its side effect; sapply would additionally return -- and at top
# level print -- the data frames that were read.
for (element in c('BRCA', 'OV')) {
    # matches e.g. '..._BRCA.<...>Clinical...' or '..._BRCA-FFPE<...>Clinical...'
    folder <- grep( paste0( '(_', element, '\\.', '|', '_', element, '-FFPE)', '.*Clinical'),
                    list.files('data/'), value = TRUE)
    path <- file.path( 'data', folder, paste0( element, '.clin.merged.txt'))
    assign( x = paste0(element, '.clin.data'),
            value = readTCGA( path, 'clinical' ),
            envir = .GlobalEnv)
}
     
############
##### rnaseq
############

# Directory that will hold the downloaded rnaseq data
dir.create('data2')

# Fetch the normalized gene-level RNASeq v2 data set for the BRCA cohort
downloadTCGA( cancerTypes = 'BRCA', 
dataSet = 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level',
              destDir = 'data2' )

# Shorten every entry of data2: keep only the first 50 characters
# of its path (the 'data2/' prefix counts towards the 50)
local({
    downloaded <- file.path( 'data2', list.files( 'data2/'))
    file.rename( downloaded, to = substr( downloaded, start = 1, stop = 50))
})

# Entries of data2 after renaming
folder <- file.path( 'data2', list.files( 'data2/'))

# Among the files stored in those entries keep the ones whose path
# contains 'illuminahiseq', then read them in the tidy format
pathRNA <- grep( pattern = 'illuminahiseq',
                 x = file.path( folder, list.files( folder)),
                 value = TRUE)
my_data <- readTCGA( path = pathRNA, dataType = 'rnaseq' )


###############
##### mutations
###############

# Directory that will receive the untarred mutations data
dir.create('data3')


# Fetch the mutation calls for the OV cohort
downloadTCGA( cancerTypes = 'OV', 
              dataSet = 'Mutation_Packager_Calls.Level',
              destDir = 'data3' )

# Build the path(s) of whatever was placed in data3 and read the
# mutations data from there
folder <- file.path( 'data3', list.files( 'data3/'))

mut_file <- readTCGA( folder, 'mutations')

#################
##### methylation
#################

# Example directory in which untarred data will be stored
dir.create('data4')

# Download KIRP methylation data and store it in data4 folder
cancerType <- "KIRP"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_methylation__humanmethylation27",
             destDir = "data4")

# Shorten path of subdirectory with KIRP methylation data
list.files(path = "data4", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data4", paste0(cancerType, ".methylation")))

# Remove MANIFEST.txt file
# (fixed = TRUE matches the file name literally instead of treating
# the "." as a regular-expression wildcard)
list.files(path = "data4", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., value = TRUE, fixed = TRUE) %>%
    file.remove()

# Read KIRP methylation data: path points at the file(s) left in the
# renamed subdirectory once the manifest is gone
path <- list.files(path = "data4", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

KIRP.methylation <- readTCGA(path, dataType = "methylation")


##########
##### RPPA
##########

# Directory in which untarred data will be stored
dir.create('data5')

# Download BRCA RPPA data and store it in data5 folder
cancerType <- "BRCA"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "protein_normalization__data.Level_3",
             destDir = "data5")

# Shorten path of subdirectory with BRCA RPPA data
list.files(path = "data5", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data5", paste0(cancerType, ".RPPA")))

# Remove MANIFEST.txt file
# (fixed = TRUE matches the file name literally instead of treating
# the "." as a regular-expression wildcard)
list.files(path = "data5", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., value = TRUE, fixed = TRUE) %>%
    file.remove()

# Read BRCA RPPA data: path points at the file(s) left in the
# renamed subdirectory once the manifest is gone
path <- list.files(path = "data5", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

BRCA.RPPA <- readTCGA(path, dataType = "RPPA")


##########
##### mRNA
##########

# Directory in which untarred data will be stored
dir.create('data6')

# Download UCEC mRNA data and store it in data6 folder
cancerType <- "UCEC"
downloadTCGA(cancerTypes = cancerType,
dataSet = "Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3",
             destDir = "data6")

# Shorten path of subdirectory with UCEC mRNA data
list.files(path = "data6", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data6", paste0(cancerType, ".mRNA")))

# Remove MANIFEST.txt file
# (fixed = TRUE matches the file name literally instead of treating
# the "." as a regular-expression wildcard)
list.files(path = "data6", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., value = TRUE, fixed = TRUE) %>%
    file.remove()

# Read UCEC mRNA data: path points at the file(s) left in the
# renamed subdirectory once the manifest is gone
path <- list.files(path = "data6", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

UCEC.mRNA <- readTCGA(path, dataType = "mRNA")

##############
##### miRNASeq
##############

# Directory in which untarred data will be stored
dir.create('data7')

# Download BRCA miRNASeq data and store it in data7 folder
# Remember that miRNASeq data are produced by two machines:
# Illumina Genome Analyzer and Illumina HiSeq 2000 machines
cancerType <- "BRCA"
downloadTCGA(cancerTypes = cancerType,
dataSet = "Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
             destDir = "data7")

downloadTCGA(cancerTypes = cancerType,
dataSet = "Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
             destDir = "data7")

# Shorten path of subdirectory with BRCA miRNASeq data
# Each entry of data7 is renamed according to the machine named in its path.
# A for loop is used because the renaming is a pure side effect; entries
# that mention neither machine are left untouched.
for (path in list.files(path = "data7", full.names = TRUE)) {
    if (grepl("illuminaga", path, fixed = TRUE)) {
        file.rename(from = path,
                    to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminaga")))
    } else if (grepl("illuminahiseq", path, fixed = TRUE)) {
        file.rename(from = path,
                    to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminahiseq")))
    }
}

# Remove MANIFEST.txt file
# (fixed = TRUE matches the file name literally instead of treating
# the "." as a regular-expression wildcard)
list.files(path = "data7", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., value = TRUE, fixed = TRUE) %>%
    file.remove()

# Read BRCA miRNASeq data
# Split the remaining file paths by the machine named in them
path <- list.files(path = "data7", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)
path_illuminaga <- grep("illuminaga", path, fixed = TRUE, value = TRUE)
path_illuminahiseq <- grep("illuminahiseq", path, fixed = TRUE, value = TRUE)

BRCA.miRNASeq.illuminaga <- readTCGA(path_illuminaga, dataType = "miRNASeq")
BRCA.miRNASeq.illuminahiseq <- readTCGA(path_illuminahiseq, dataType = "miRNASeq")

# Tag every row with the machine it came from, then stack both data sets
BRCA.miRNASeq.illuminaga <- cbind(machine = "Illumina Genome Analyzer", BRCA.miRNASeq.illuminaga)
BRCA.miRNASeq.illuminahiseq <- cbind(machine = "Illumina HiSeq 2000", BRCA.miRNASeq.illuminahiseq)

BRCA.miRNASeq <- rbind(BRCA.miRNASeq.illuminaga, BRCA.miRNASeq.illuminahiseq)

##############
##### isoforms
##############

# Directory in which untarred data will be stored
dir.create('data8')

# Download ACC isoforms data and store it in data8 folder
cancerType <- "ACC"
downloadTCGA(cancerTypes = cancerType,
dataSet = "Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3",
             destDir = "data8")

# Shorten path of subdirectory with ACC isoforms data
list.files(path = "data8", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data8", paste0(cancerType, ".isoforms")))

# Remove MANIFEST.txt file
# (fixed = TRUE matches the file name literally instead of treating
# the "." as a regular-expression wildcard)
list.files(path = "data8", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., value = TRUE, fixed = TRUE) %>%
    file.remove()

# Read ACC isoforms data: path points at the file(s) left in the
# renamed subdirectory once the manifest is gone
path <- list.files(path = "data8", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

ACC.isoforms <- readTCGA(path, dataType = "isoforms")


## End(Not run)