## Dataset download and preparation
get_tcga_data <- function(tumor_types) {
futile.logger::flog.info("Starting download for tumor types: %s", paste(tumor_types, collapse = ", "))
firehose_datasets <- RTCGAToolbox::getFirehoseDatasets()
# firehose_dates= getFirehoseRunningDates()
FIREHOSE_DATE <- "20150402" # we fix it so data is consistent during development
FIREHOSE_DIR <<- get_package_folder("inst/firehose")
datasets <- list() # whole datasets
matrices <- list()
X <- NULL
Y <- NULL
Z <- NULL
# tumor_types = c('BRCA', 'OV')
for (tumor_type in tumor_types) {
futile.logger::flog.info("Download data for %s", tumor_type)
# downloads the data
datasets[[tumor_type]] <- RTCGAToolbox::getFirehoseData(dataset = tumor_type, runDate = FIREHOSE_DATE, RNAseq2_Gene_Norm = TRUE, RPPA = TRUE, destdir = FIREHOSE_DIR)
futile.logger::flog.info("Loading data for %s", tumor_type)
# gets data matrices
matrices[[tumor_type]]$X <- datasets[[tumor_type]]@RNASeq2GeneNorm
matrices[[tumor_type]]$Y <- datasets[[tumor_type]]@RPPAArray[[1]]@DataMatrix
matrices[[tumor_type]]$Z <- as.data.frame(datasets[[tumor_type]]@Clinical)
if (grepl("-.-(V|C|NA)$", rownames(matrices[[tumor_type]]$Y))) {
rownames(matrices[[tumor_type]]$Y) = gsub("-.-(V|C|NA)$", "", rownames(matrices[[tumor_type]]$Y)) # antobodies names include a suffix for OV dataset...
}
# Normalize sample ids
matrices[[tumor_type]] <- normalize_sample_identifiers(matrices[[tumor_type]]$X, matrices[[tumor_type]]$Y, matrices[[tumor_type]]$Z)
futile.logger::flog.info("No. of %s cancer samples: %s", tumor_type, dim(matrices[[tumor_type]]$X)[2])
futile.logger::flog.info("No. of %s cancer features for RNAseq: %s", tumor_type, dim(matrices[[tumor_type]]$X)[1])
futile.logger::flog.info("No. of %s cancer features for RPPA: %s", tumor_type, dim(matrices[[tumor_type]]$Y)[1])
futile.logger::flog.info("No. of %s cancer features for clinical data: %s", tumor_type, dim(matrices[[tumor_type]]$Z)[2])
matrices[[tumor_type]] <- get_RPPA_annotations(tumor_type, matrices[[tumor_type]])
# adds the tumor type in the clinical data matrix
matrices[[tumor_type]]$Z <- cbind(matrices[[tumor_type]]$Z, Tumor_type = c(tumor_type))
# unlists matrices of each data type for later processing
if (is.null(X)) {
X <- matrices[[tumor_type]]$X
} else {
shared_features <- intersect(rownames(X), rownames(matrices[[tumor_type]]$X))
X <- cbind2(X[rownames(X) %in% shared_features, ], matrices[[tumor_type]]$X[rownames(matrices[[tumor_type]]$X) %in% shared_features, ])
}
if (is.null(Y)) {
Y <- matrices[[tumor_type]]$Y
} else {
shared_features = intersect(rownames(Y), rownames(matrices[[tumor_type]]$Y))
Y <- cbind2(Y[rownames(Y) %in% shared_features, ], matrices[[tumor_type]]$Y[rownames(matrices[[tumor_type]]$Y) %in% shared_features, ])
}
if (is.null(Z)) {
Z <- matrices[[tumor_type]]$Z
} else {
Z <- subset(Z, select = intersect(colnames(Z), colnames(matrices[[tumor_type]]$Z)))
matrices[[tumor_type]]$Z <- subset(matrices[[tumor_type]]$Z, select = intersect(colnames(Z), colnames(matrices[[tumor_type]]$Z)))
Z <- rbind2(Z, matrices[[tumor_type]]$Z)
}
}
# 18147 gene expression variables + 133 protein expression variables for 407 samples
futile.logger::flog.info("Total number of features for RNAseq: %s", dim(X)[1])
futile.logger::flog.info("Total number of features for RPPA: %s", dim(Y)[1])
futile.logger::flog.info("Total number of features for clinical data: %s", dim(Z)[2])
futile.logger::flog.info("Total number of samples: %s", dim(X)[2])
# Transposes
X <- t(X)
Y <- t(Y)
# Sorts matrices by sample
X <- X[order(rownames(X)), ]
Y <- Y[order(rownames(Y)), ]
Z <- Z[order(rownames(Z)), ]
# Writes input matrices
write.table(X, file = paste(RESULTS_FOLDER, "X_rnaseqnorm.txt", sep = "/"), quote = F, sep = "\t", na = "")
write.table(Y, file = paste(RESULTS_FOLDER, "Y_rppa.txt", sep = "/"), quote = F, sep = "\t", na = "")
write.table(Z, file = paste(RESULTS_FOLDER, "Z_clinical.txt", sep = "/"), quote = F, sep = "\t", na = "")
futile.logger::flog.info("Finished downloading TCGA data.")
# returns matrices
list(X = X, Y = Y, Z = Z)
}
# removed as it has no RNAseq 'ESCA' =
# 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/ESCA/20150821/gdac.broadinstitute.org_ESCA.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz', removed as there is no RPPA data
# 'KICH' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/KICH/20150821/gdac.broadinstitute.org_KICH.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz', remove due to incoherent
# annotations and impl 'LGG' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/LGG/20150821/gdac.broadinstitute.org_LGG.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz', removed
# as there is no RPPA data 'LIHC' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/LIHC/20150821/gdac.broadinstitute.org_LIHC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz',
# removed as there is no RPPA data 'MESO' =
# 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/MESO/20150821/gdac.broadinstitute.org_MESO.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz', removed as there is no RPPA data
# 'PRAD' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/PRAD/20150821/gdac.broadinstitute.org_PRAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz', removed for strange
# error'SARC' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/SARC/20150821/gdac.broadinstitute.org_SARC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz', remove due to
# incoherent annotations and impl'STAD' =
# 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/STAD/20150821/gdac.broadinstitute.org_STAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz', remove due to incoherent
# annotations and impl'THCA' = 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/THCA/20150821/gdac.broadinstitute.org_THCA.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz',
RPPA_ANNOTATIONS <- list(ACC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/ACC/20150821/gdac.broadinstitute.org_ACC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz", BLCA = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/BLCA/20150821/gdac.broadinstitute.org_BLCA.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
BRCA = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/BRCA/20150821/gdac.broadinstitute.org_BRCA.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz", CESC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/CESC/20150821/gdac.broadinstitute.org_CESC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz",
COAD = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/COAD/20150821/gdac.broadinstitute.org_COAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", COADREAD = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/COADREAD/20150821/gdac.broadinstitute.org_COADREAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
DLBC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/DLBC/20150821/gdac.broadinstitute.org_DLBC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz", GBM = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/GBM/20150821/gdac.broadinstitute.org_GBM.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
HNSC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/HNSC/20150821/gdac.broadinstitute.org_HNSC.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz", KIRC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/KIRC/20150821/gdac.broadinstitute.org_KIRC.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
KIRP = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/KIRP/20150821/gdac.broadinstitute.org_KIRP.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", LUAD = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/LUAD/20150821/gdac.broadinstitute.org_LUAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
LUSC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/LUSC/20150821/gdac.broadinstitute.org_LUSC.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", OV = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/OV/20150821/gdac.broadinstitute.org_OV.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz",
PAAD = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/PAAD/20150821/gdac.broadinstitute.org_PAAD.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", PCPG = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/PCPG/20150821/gdac.broadinstitute.org_PCPG.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz",
READ = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/READ/20150821/gdac.broadinstitute.org_READ.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", SKCM = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/SKCM/20150821/gdac.broadinstitute.org_SKCM.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz",
TGCT = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/TGCT/20150821/gdac.broadinstitute.org_TGCT.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz", THYM = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/THYM/20150821/gdac.broadinstitute.org_THYM.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz",
UCEC = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/UCEC/20150821/gdac.broadinstitute.org_UCEC.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz", UCS = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/UCS/20150821/gdac.broadinstitute.org_UCS.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz",
UVM = "http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/UVM/20150821/gdac.broadinstitute.org_UVM.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz")
# Downloads annotations of RPPA antibodies for a given tumor type
get_RPPA_annotations <- function(tumor_type, matrices) {
futile.logger::flog.info("Loading RPPA annotations for %s", tumor_type)
# RPPA.annotations.URL.template =
# 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/%TUMOR_TYPE%/20150821/gdac.broadinstitute.org_%TUMOR_TYPE%.RPPA_AnnotateWithGene.Level_3.2015082100.1.0.tar.gz'
# RPPA.annotations.URL.template_2 =
# 'http://gdac.broadinstitute.org/runs/stddata__2015_08_21/data/%TUMOR_TYPE%/20150821/gdac.broadinstitute.org_%TUMOR_TYPE%.RPPA_AnnotateWithGene.Level_3.2015082100.0.0.tar.gz'
RPPA_annotations_file_template <- paste(FIREHOSE_DIR, "%TUMOR_FOLDER%/%TUMOR_TYPE%.antibody_annotation.txt", sep = "/")
# Downloads the annotations if necessary
RPPA_annotations_URL <- RPPA_ANNOTATIONS[[tumor_type]]
RPPA_annotations_file <- paste(FIREHOSE_DIR, basename(RPPA_annotations_URL), sep = "/")
if (!file.exists(RPPA_annotations_file)) {
download.file(RPPA_annotations_URL, RPPA_annotations_file)
}
# Uncompressing annotations
if (file.exists(RPPA_annotations_file)) {
untar(RPPA_annotations_file, exdir = FIREHOSE_DIR)
output_matrices <- normalize_variable_names(matrices$X, matrices$Y, matrices$Z, gsub("%TUMOR_FOLDER%", gsub(".tar.gz", "", basename(RPPA_annotations_file)), gsub("%TUMOR_TYPE%", tumor_type,
RPPA_annotations_file_template)))
} else {
# Annotations not found where expected
futile.logger::flog.error("Cannot find RPPA annotations for tumor type %s.", tumor_type)
}
futile.logger::flog.info("Finished loading RPPA annotations.")
output_matrices
}
# Normalize sample identifiers
normalize_sample_identifiers <- function(X, Y, Z) {
futile.logger::flog.info("Normalizing sample identifiers...")
# gets samples
gene_expression_samples <- colnames(X)
protein_expression_samples <- colnames(Y)
# Normalizes sample identifiers for X and Y See https://wiki.nci.nih.gov/display/TCGA/TCGA+Barcode for details on sample identifiers Check
# https://tcga-data.nci.nih.gov/datareports/codeTablesReport.htm?codeTable=Sample%20type to identify tumor and normal samples removes the part in the sample id related with the data type
protein_expression_samples_processed <- unlist(lapply(protein_expression_samples, substr, start = 1, stop = 16))
colnames(Y) <- protein_expression_samples_processed
gene_expression_samples_processed <- unlist(lapply(gene_expression_samples, substr, start = 1, stop = 16))
colnames(X) <- gene_expression_samples_processed
# Normalizes sample identifiers for clinical data (identifiers are of the type 'TCGA-MS-A51U', they don't contain the part related with the tissue)
row.names(Z) <- gsub("\\.", "-", toupper(row.names(Z)))
# Gets common samples (407 samples having gene expression and protein expression)
common_samples <- intersect(protein_expression_samples_processed, gene_expression_samples_processed)
# Get samples subset based on histology type
# Removes samples not being common from both data types
X <- subset(X, select = common_samples)
Y <- subset(Y, select = common_samples)
# TODO: Remove repetitions for the same individual (there are none in this dataset as we have no controls for protein expression) common_samples = uniq(substr(common_samples, 1, 12))
# Normalizes sample identifiers with clinical data Gets only the part of the sample identifier corresponding to individual so we can match to clinical data ('TCGA-MS-A51U')
colnames(X) <- substr(colnames(X), 1, 12)
colnames(Y) <- substr(colnames(Y), 1, 12)
Z_t <- subset(t(Z), select = intersect(colnames(Y), rownames(Z)))
Z <- as.data.frame(t(Z_t))
rownames(Z) <- substr(rownames(Z), 1, 12)
# return
list(X = X, Y = Y, Z = Z)
}
# Normalizes variables into HGNC symbols
normalize_variable_names <- function(X, Y, Z, antibody_annotation_file) {
futile.logger::flog.info("Normalizing feature names...")
# Normalizes gene identifiers using Ensembl BioMart for Homo sapiens Removes genes from X not having a correct HUGO identifier
hgnc_symbols <- query_biomart(attributes = c("hgnc_symbol"), filters = c("hgnc_symbol"), values = rownames(X))[, 1]
X <- X[rownames(X) %in% hgnc_symbols, ]
# Normalizes RPPA antibody identifiers with Firehose annotation to gene (we use BRCA annotations as they are subset of OV) Some antibodies map to several genes -> we will keep only the first in
# the list Some genes are mapped by several antibodies -> we will keep only the one with the highest average expression
antibody_annotation <- read.csv(antibody_annotation_file, sep = "\t")
# remove more than one gene to the same antibody
antibody_annotation$Gene.Name.processed <- gsub(" .*", "", antibody_annotation$Gene.Name)
antibodies <- intersect(rownames(Y), antibody_annotation$Composite.Element.REF)
Y <- Y[rownames(Y) %in% antibodies, ]
# remove more than one antibody to the same gene, we keep that one with the greatest average expression
antibody_annotation <- antibody_annotation[antibody_annotation$Composite.Element.REF %in% antibodies, ]
antibody_annotation$max.avg.exp <- apply(Y[rownames(Y) %in% antibodies, ], MARGIN = 1, FUN = function(x) {
abs(mean(x))
})
antibody_annotation <- antibody_annotation[order(antibody_annotation$Gene.Name.processed, antibody_annotation$max.avg.exp, decreasing = TRUE), ]
antibody_annotation <- antibody_annotation[!duplicated(antibody_annotation$Gene.Name.processed), ]
antibodies <- intersect(rownames(Y), antibody_annotation$Composite.Element.REF)
Y <- Y[rownames(Y) %in% antibodies, ]
# Replace antibody identifiers with gene symbols
antibody_annotation <- antibody_annotation[order(antibody_annotation$Composite.Element.REF), ]
Y <- Y[order(rownames(Y)), ]
rownames(Y) <- antibody_annotation$Gene.Name.processed
# return
list(X = X, Y = Y, Z = Z)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.