kidney: Kidney Renal Clear Cell Carcinoma [KIRC] RNA-Seq data

Description Usage Format Source Examples

Description

The KIRC RNA-seq dataset from The Cancer Genome Atlas containing 20531 genes and 72 paired columns of data with rows corresponding to genes and columns corresponding to replicates; replic vector specifies replicates and treatment vector specifies non-tumor and tumor group samples respectively within replicate.

The maximum possible sample size that can be simulated with this dataset is 36 replicates in each of two treatment groups. However, it is recommended to use a sample size of 20 or lower for simulation studies to ensure each simulated dataset is sufficiently different from the other simulated datasets.

Disclaimer: The version of the KIRC dataset provided is: unc.edu_KIRC.IlluminaHiSeq_RNASeqV2.Level_3.1.5.0.

The Cancer Genome Atlas updates its datasets periodically. The latest version of the KIRC dataset can always be downloaded from: https://tcga-data.nci.nih.gov/tcga/.

Please appropriately reference the source below when using this dataset. The source code used to assemble this dataset is provided below.

Usage

1

Format

List containing:

Source

https://tcga-data.nci.nih.gov/tcga/

The Cancer Genome Atlas Research Network (2013). Comprehensive molecular characterization of clear cell renal cell carcinoma. Nature, 499(7456), 43-49.

Examples

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
data(kidney)

## Not run: 
### Source code used to assemble KIRC dataset

### load in SimSeq package for sorting counts matrix
library(SimSeq)

### htmlToText function used to scrape barcode data from uuid
htmlToText <- function(input, ...) {
  ###---PACKAGES ---###
  library(RCurl)
  library(XML) 
  
  ###--- LOCAL FUNCTIONS ---###
  # Determine how to grab html for a single input element
  evaluate_input <- function(input) {    
    # if input is a .html file
    if(file.exists(input)) {
      char.vec <- readLines(input, warn = FALSE)
      return(paste(char.vec, collapse = ""))
    }
    
    # if input is html text
    if(grepl("</html>", input, fixed = TRUE)) return(input)
    
    # if input is a URL, probably should use a regex here instead?
    if(!grepl(" ", input)) {
      # downolad SSL certificate in case of https problem
      if(!file.exists("cacert.perm")) {
        download.file(url = "http://curl.haxx.se/ca/cacert.pem", destfile = "cacert.perm")
      }
      return(getURL(input, followlocation = TRUE, cainfo = "cacert.perm"))
    }
    
    # return NULL if none of the conditions above apply
    return(NULL)
  }
  
  # convert HTML to plain text
  convert_html_to_text <- function(html) {
    doc <- htmlParse(html, asText = TRUE)
    text <- xpathSApply(doc, paste0("//text()",
                   "[not(ancestor::script)][not(ancestor::style)]",
                   "[not(ancestor::noscript)][not(ancestor::form)]"), xmlValue)
    return(text)
  }
  
  # format text vector into one character string
  collapse_text <- function(txt) {
    return(paste(txt, collapse = " "))
  }
  
  ###--- MAIN ---###
  # STEP 1: Evaluate input
  html.list <- lapply(input, evaluate_input)
  
  # STEP 2: Extract text from HTML
  text.list <- lapply(html.list, convert_html_to_text)
  
  # STEP 3: Return text
  text.vector <- sapply(text.list, collapse_text)
  return(text.vector)
}

### Specify path name for folder containing raw counts for each sample
mainDir <- getwd()
folder.path <- "unc.edu_KIRC.IlluminaHiSeq_RNASeqV2.Level_3.1.5.0"

### Determine list of files containing summarized raw counts
file.list <- dir(file.path(mainDir, folder.path))
keep <- grepl("genes.results", file.list)
file.list <- file.list[keep]

### Create summarized count matrix.
### Get n.row and n.col for summarized count matrix number of genes in first
### sample and number of total samples from file.list

file.temp <- file.path(mainDir, folder.path, file.list[1])
n.row <- nrow(read.table(file = file.temp, header = TRUE))
n.col <- length(file.list)

### initialize counts matrix
counts <- matrix(NA, nrow = n.row, ncol = n.col)

### get gene id's
gene.id <- read.table(file.temp, header = TRUE, stringsAsFactors = FALSE)$gene_id

### read in raw read counts from file.list
for(i in 1:n.col){
  file.temp <- file.path(mainDir, folder.path, file.list[i])
  counts[, i] <- read.table(file.temp, header = TRUE)$raw_count
}

### Data was summarized using RSEM software which produces non_integer 
### counts for ambiguous reads. Counts are rounded as a preprocessing
### step.
counts <- round(counts)

### Cast counts matrix as integer type
counts <- matrix(as.integer(counts), nrow = nrow(counts), ncol = ncol(counts))

### Get uuid's for each sample
uuid <- substr(file.list, start = 9, stop = 44)

### Create urls from uuid list
urls <- paste(rep("https://tcga-data.nci.nih.gov/uuid/uuidws/mapping/xml/uuid/",
                  length(uuid)), uuid, sep = "")

### Scrape barcodes from urls
l <- length(urls)
barcodes <- vector("character", l)
for(i in 1:l){
  barcodes[i] <- htmlToText(urls[i])
}


barcodes <- substr(barcodes, start = 1, stop = 28)

### Get metadata on which samples were taken from each individual, 
### tumor type of sample, etc. from barcodes for each sample
metadata <- data.frame(barcodes, stringsAsFactors = FALSE)

### Study Participant
metadata$participant <- substr(barcodes, start = 9, stop = 12)

### Sample type code. See:
### https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm?codeTable=Sample%20type
### for full list of codes and details on TCGA barcodes.
### 01: Primary Solid Tumor
### 02: Recurrent Solid Tumor
### 05: Additional New Primary
### 06: Metastatic Tumor
### 11: Solid Tissue Normal
metadata$type <- substr(barcodes, start = 14, stop = 15)

### Only keep Primary Solid Tumor and Solid Tissue Normal
keep.metadata <- metadata$type == "01" | metadata$type == "11"
metadata <- metadata[keep.metadata, ]
counts <- counts[, keep.metadata]

### Code from 01 to Tumor and 11 to Non-Tumor for easy identifiability
metadata$tumor <- "Non-Tumor"
metadata$tumor[metadata$type == "01"] <- "Tumor"

### tag participant, type, and tumor as factors
metadata$participant <- as.factor(metadata$participant)
metadata$type <-as.factor(metadata$type)
metadata$tumor <- as.factor(metadata$tumor)

### Sort and subset down to paired data
sorting <- 
  SortData(counts, treatment = metadata$tumor, 
           replic = metadata$participant, sort.method = "paired")$sorting

counts <- counts[, sorting]
metadata <- metadata[sorting, ]
metadata$participant <- factor(metadata$participant)

### Add in attributes of counts matrix
dimnames(counts) <- list(gene.id, metadata$barcodes)
attr(counts, "uuid") <- uuid

kidney <- vector("list", 3)
kidney[[1]] <- counts
kidney[[2]] <- metadata$participant
kidney[[3]] <- metadata$tumor
names(kidney) <- c("counts", "replic", "treatment")

###Save file
save(kidney, file = "kidney.rda")

## End(Not run)

SimSeq documentation built on May 2, 2019, 5:11 a.m.