readTCGA: Read TCGA data to the tidy format

Description Usage Arguments Details Value Issues Author(s) See Also Examples

View source: R/readTCGA.R

Description

The readTCGA function reads unzipped files from the TCGA project. Those files can be easily downloaded with the downloadTCGA function. See examples.

Usage

1
readTCGA(path, dataType, ...)

Arguments

path

See details and examples.

dataType

One of 'clinical', 'rnaseq', 'mutations', 'RPPA', 'mRNA', 'miRNASeq', 'methylation', 'isoforms', depending on which type of data the user wants to read into the tidy format.

...

Further arguments passed to as.data.frame.

Details

All cohort names can be checked using: sub( x = names( infoTCGA() ), '-counts', '').

Parameter path specification:

Value

An output:

Issues

If you have any problems or issues, or think that something is missing or unclear, please post an issue on https://github.com/RTCGA/RTCGA/issues.

Author(s)

Marcin Kosinski, m.p.kosinski@gmail.com

Witold Chodor, witoldchodor@gmail.com

See Also

RTCGA website http://rtcga.github.io/RTCGA/Download.html.

Other RTCGA: RTCGA-package, boxplotTCGA, checkTCGA, convertTCGA, datasetsTCGA, downloadTCGA, expressionsTCGA, heatmapTCGA, infoTCGA, installTCGA, kmTCGA, mutationsTCGA, pcaTCGA, survivalTCGA, theme_RTCGA

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
## Not run:  

##############
##### clinical
##############

# Directory that receives the downloaded clinical data (see destDir below)
dir.create('data')

# downloading clinical data
# dataset = "clinical" is default parameter so we may omit it
downloadTCGA( cancerTypes = c('BRCA', 'OV'),
              destDir = 'data' )


# reading datasets
# For every cohort: find its Clinical folder among the entries of 'data/',
# build the path to the <cohort>.clin.merged.txt file inside it, read that
# file with readTCGA() and store the result in the global environment under
# the name <cohort>.clin.data.
# A plain for loop is used because the work is done purely for its side
# effect (assign); sapply() would additionally collect and return the data
# that was just read, which then gets auto-printed at top level.
for( cohort in c('BRCA', 'OV') ){
    # matches folder names containing '_<cohort>.' or '_<cohort>-FFPE',
    # followed somewhere later by 'Clinical'
    clin_folder <- grep( paste0( '(_',cohort,'\\.', '|','_',cohort,'-FFPE)', '.*Clinical'),
                         list.files('data/'),value = TRUE)
    clin_path <- paste0( 'data/', clin_folder, '/', cohort, '.clin.merged.txt')
    assign( x = paste0(cohort, '.clin.data'),
            value = readTCGA( clin_path, 'clinical' ),
            envir = .GlobalEnv)
}
     
############
##### rnaseq
############

# Directory that receives the downloaded rnaseq data (see destDir below)
dir.create('data2')

# downloading rnaseq data
downloadTCGA( cancerTypes = 'BRCA',
              dataSet = 'rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level',
              destDir = 'data2' )

# shortening paths and directories:
# every entry of 'data2/' is renamed to the first 50 characters of its path
long_paths <- file.path( 'data2', list.files( 'data2/') )
file.rename( long_paths, to = substr(long_paths, start = 1, stop = 50) )

# reading data
# entries of 'data2/' after the renaming above
folder <- file.path( 'data2', list.files( 'data2/') )

# full paths of the files inside those entries; only the ones whose path
# contains 'illuminahiseq' are passed to readTCGA()
folder_content <- file.path( folder, list.files( folder ) )
pathRNA <- grep( pattern = 'illuminahiseq', x = folder_content, value = TRUE )
my_data <- readTCGA( path = pathRNA, dataType = 'rnaseq' )


###############
##### mutations
###############

# Example directory in which untarred data will be stored
dir.create('data3')

# downloading mutations data for the OV cohort
downloadTCGA( cancerTypes = 'OV',
              dataSet = 'Mutation_Packager_Calls.Level',
              destDir = 'data3' )

# reading data
# for dataType 'mutations' the entries of 'data3/' themselves are passed
# to readTCGA()
folder <- file.path( 'data3', list.files( 'data3/') )

mut_file <- readTCGA(folder, 'mutations')

#################
##### methylation
#################

# Example directory in which untarred data will be stored
dir.create('data4')

# Download KIRP methylation data and store it in data4 folder
cancerType <- "KIRP"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_methylation__humanmethylation27",
             destDir = "data4")

# Shorten path of subdirectory with KIRP methylation data
# (the entry found in data4 is renamed to data4/KIRP.methylation)
list.files(path = "data4", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data4", paste0(cancerType, ".methylation")))

# Remove MANIFEST.txt file
# fixed = TRUE matches the file name literally, so the "." is not treated
# as a regular-expression wildcard
list.files(path = "data4", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., fixed = TRUE, value = TRUE) %>%
    file.remove()

# Read KIRP methylation data
# path holds the full paths of the files left inside data4/KIRP.methylation
path <- list.files(path = "data4", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

KIRP.methylation <- readTCGA(path, dataType = "methylation")


##########
##### RPPA
##########

# Directory in which untarred data will be stored
dir.create('data5')

# Download BRCA RPPA data and store it in data5 folder
cancerType <- "BRCA"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "protein_normalization__data.Level_3",
             destDir = "data5")

# Shorten path of subdirectory with BRCA RPPA data
# (the entry found in data5 is renamed to data5/BRCA.RPPA)
list.files(path = "data5", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data5", paste0(cancerType, ".RPPA")))

# Remove MANIFEST.txt file
# fixed = TRUE matches the file name literally, so the "." is not treated
# as a regular-expression wildcard
list.files(path = "data5", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., fixed = TRUE, value = TRUE) %>%
    file.remove()

# Read BRCA RPPA data
# path holds the full paths of the files left inside data5/BRCA.RPPA
path <- list.files(path = "data5", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

BRCA.RPPA <- readTCGA(path, dataType = "RPPA")


##########
##### mRNA
##########

# Directory in which untarred data will be stored
dir.create('data6')

# Download UCEC mRNA data and store it in data6 folder
cancerType <- "UCEC"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_transcriptome__agilentg4502a_07_3__unc_edu__Level_3__unc_lowess_normalization_gene_level__data.Level_3",
             destDir = "data6")

# Shorten path of subdirectory with UCEC mRNA data
# (the entry found in data6 is renamed to data6/UCEC.mRNA)
list.files(path = "data6", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data6", paste0(cancerType, ".mRNA")))

# Remove MANIFEST.txt file
# fixed = TRUE matches the file name literally, so the "." is not treated
# as a regular-expression wildcard
list.files(path = "data6", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., fixed = TRUE, value = TRUE) %>%
    file.remove()

# Read UCEC mRNA data
# path holds the full paths of the files left inside data6/UCEC.mRNA
path <- list.files(path = "data6", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

UCEC.mRNA <- readTCGA(path, dataType = "mRNA")

##############
##### miRNASeq
##############

# Directory in which untarred data will be stored
dir.create('data7')

# Download BRCA miRNASeq data and store it in data7 folder
# Remember that miRNASeq data are produced by two machines:
# Illumina Genome Analyzer and Illumina HiSeq 2000 machines
cancerType <- "BRCA"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_mirnaseq__illuminaga_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
             destDir = "data7")

downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_mirnaseq__illuminahiseq_mirnaseq__bcgsc_ca__Level_3__miR_gene_expression__data.Level_3",
             destDir = "data7")

# Shorten path of subdirectory with BRCA miRNASeq data
# Each entry of data7 is renamed according to the machine named in its path;
# entries that mention neither machine are left untouched.
# A for loop is used because the renaming is a pure side effect; sapply()
# would collect the file.rename() results and auto-print them at top level.
for (downloaded_dir in list.files(path = "data7", full.names = TRUE)) {
    if (grepl("illuminaga", downloaded_dir, fixed = TRUE)) {
        file.rename(from = downloaded_dir,
                    to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminaga")))
    } else if (grepl("illuminahiseq", downloaded_dir, fixed = TRUE)) {
        file.rename(from = downloaded_dir,
                    to = file.path("data7", paste0(cancerType, ".miRNASeq.illuminahiseq")))
    }
}

# Remove MANIFEST.txt file
# fixed = TRUE matches the file name literally, so the "." is not treated
# as a regular-expression wildcard
list.files(path = "data7", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., fixed = TRUE, value = TRUE) %>%
    file.remove()

# Read BRCA miRNASeq data
# path holds the full paths of the files left inside both renamed
# subdirectories; they are split by machine before reading
path <- list.files(path = "data7", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)
path_illuminaga <- grep("illuminaga", path, fixed = TRUE, value = TRUE)
path_illuminahiseq <- grep("illuminahiseq", path, fixed = TRUE, value = TRUE)

BRCA.miRNASeq.illuminaga <- readTCGA(path_illuminaga, dataType = "miRNASeq")
BRCA.miRNASeq.illuminahiseq <- readTCGA(path_illuminahiseq, dataType = "miRNASeq")

# Prepend a 'machine' column so the source of each row stays identifiable
# after the two data sets are stacked
BRCA.miRNASeq.illuminaga <- cbind(machine = "Illumina Genome Analyzer", BRCA.miRNASeq.illuminaga)
BRCA.miRNASeq.illuminahiseq <- cbind(machine = "Illumina HiSeq 2000", BRCA.miRNASeq.illuminahiseq)

BRCA.miRNASeq <- rbind(BRCA.miRNASeq.illuminaga, BRCA.miRNASeq.illuminahiseq)

##############
##### isoforms
##############

# Directory in which untarred data will be stored
dir.create('data8')

# Download ACC isoforms data and store it in data8 folder
cancerType <- "ACC"
downloadTCGA(cancerTypes = cancerType,
             dataSet = "Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_isoforms_normalized__data.Level_3",
             destDir = "data8")

# Shorten path of subdirectory with ACC isoforms data
# (the entry found in data8 is renamed to data8/ACC.isoforms)
list.files(path = "data8", full.names = TRUE) %>%
    file.rename(from = ., to = file.path("data8", paste0(cancerType, ".isoforms")))

# Remove MANIFEST.txt file
# fixed = TRUE matches the file name literally, so the "." is not treated
# as a regular-expression wildcard
list.files(path = "data8", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE) %>%
    grep("MANIFEST.txt", x = ., fixed = TRUE, value = TRUE) %>%
    file.remove()

# Read ACC isoforms data
# path holds the full paths of the files left inside data8/ACC.isoforms
path <- list.files(path = "data8", full.names = TRUE) %>%
    list.files(path = ., full.names = TRUE)

ACC.isoforms <- readTCGA(path, dataType = "isoforms")


## End(Not run)

RTCGA documentation built on Nov. 8, 2020, 5:11 p.m.