# Show chunk source code in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)

# Attach the tidyverse packages used throughout this document.
library(tidyverse)
All the different diseases from TCGA2STAT are listed here.
# Disease codes passed one at a time to TCGA2STAT::getTCGA() below.
diseases <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "COADREAD", "DLBC", "ESCA",
  "GBM", "GBMLGG", "HNSC", "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG",
  "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD", "PCPG", "PRAD", "READ",
  "SARC", "SKCM", "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)
They don't all share the exact same RNA sequences or clinical data, so I need to create a function that can detect which columns are available, and mutate accordingly. I can either process them ahead of time, or download all the data first, and then find the set of common columns.
# Download RNASeq2 expression data together with clinical annotations for each
# disease and join the two into one tibble per disease, stored in `cancer`
# under the disease code. A disease whose download or processing fails does
# not stop the loop: the failure is reported and the code is appended to
# `bad_ids`.
bad_ids <- NULL
cancer <- list()
for (disease in diseases) {
  joined <- tryCatch(
    {
      dat <- TCGA2STAT::getTCGA(
        disease = disease,
        data.type = "RNASeq2",
        clinical = TRUE
      )
      # Transpose the expression matrix so its former column names become the
      # `patient` column, then strip the trailing barcode segments
      # (presumably so the ids match the row names of the clinical table).
      rna <- dat$dat %>%
        t() %>%
        as_tibble(rownames = "patient") %>%
        mutate(patient = str_remove(patient, "-\\w{3}-\\w{3}-\\w{4}-\\w{2}$"))
      clin <- dat$clinical %>%
        as_tibble(rownames = "patient")
      # Keep only patients present in both tables and tag rows with the
      # disease code right after the `patient` column.
      inner_join(clin, rna, by = "patient") %>%
        add_column(disease = disease, .after = 1)
    },
    error = function(e) {
      # Report why this disease was skipped instead of failing silently.
      message("Skipping ", disease, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(joined)) {
    bad_ids <- c(bad_ids, disease)
  } else {
    cancer[[disease]] <- joined
  }
}
str(cancer, max.level = 1)
# Every column name that appears in at least one disease table.
all_colnames <- cancer %>%
  map(colnames) %>%
  reduce(union)

# Column names that appear in every disease table.
shared_colnames <- cancer %>%
  map(colnames) %>%
  reduce(intersect)

# Columns that are absent from at least one disease table.
missing <- setdiff(all_colnames, shared_colnames)
print(missing)
# Stack the per-disease tables into one tibble; columns that a disease lacks
# are filled with NA by bind_rows().
complete_cancer <- bind_rows(cancer)
str(complete_cancer)

# Cache the combined table on disk, then read it back from the cache.
saveRDS(
  object = complete_cancer,
  file = "../data/complete_tcga2stat_RNASeq2_with_clinical.rds"
)
complete_cancer <- readRDS(
  file = "../data/complete_tcga2stat_RNASeq2_with_clinical.rds"
)
# Split the column names into three positional groups. The index ranges are
# hard-coded and rely on the column order of `complete_cancer`.
cn <- colnames(complete_cancer)
cn1_20 <- cn[seq_len(20)]               # first few clinical data columns
cn21_20521 <- cn[seq(21, 20521)]        # RNA sequence columns
cn20522_20572 <- cn[seq(20522, 20572)]  # last few clinical data columns

# All clinical (non-gene) column names.
non_genes <- c(cn1_20, cn20522_20572)

# Reordered column names: clinical columns first, then RNA columns.
cn <- c(cn1_20, cn20522_20572, cn21_20521)

# Data in order of clinical then RNA.
dat <- complete_cancer[, cn]
The selection below is determined by going through the first 71 columns one by one and verifying which data are numeric or factors. This process is HIGHLY DEPENDENT ON THE ASSUMPTION THAT THE ORDER OF DATA DOES NOT CHANGE.
# Positions refer to the reordered `cn` vector.

# Columns to remove
rem <- cn[c(3, 58:60)]

# Columns to treat as numeric
num <- cn[c(
  4:7, 14, 18, 21:24, 27:30, 32:37, 43:44, 46, 49, 52, 54, 57, 61:64, 67, 71
)]

# Columns to treat as factors
fct <- cn[c(
  1, 2, 8:13, 15:17, 19:20, 25:26, 31, 38:42, 45, 47:48, 50:51, 53, 55:56,
  65:66, 68:70
)]
Now that I have the columns selected, I can remove the dependency on column order by printing the column names, copying the output, and reassigning the selection by name rather than by number.
# Print each selection as a ready-to-paste `c("...", "...")` literal, in the
# order rem, num, fct.
walk(
  list(rem, num, fct),
  function(cols) {
    cat("c(", paste0("\"", cols, "\"", collapse = ", "), ")", sep = "")
  }
)
# Columns to drop: their data is all the same, or all NA values.
rem <- c(
  "Composite Element REF", "tumorgrade", "daystotumorrecurrence",
  "chemotherapy"
)

# Clinical columns converted with as.numeric().
num <- c(
  "yearstobirth", "vitalstatus", "daystodeath", "daystolastfollowup",
  "dateofinitialpathologicdiagnosis", "numberoflymphnodes",
  "daystolastknownalive", "karnofskyperformancescore",
  "numberpackyearssmoked", "weightkgatdiagnosis",
  "tobaccosmokingyearstopped", "tobaccosmokingpackyearssmoked",
  "tobaccosmokinghistory", "agebegansmokinginyears",
  "pregnanciescounttotal", "pregnanciescountstillbirth",
  "pregnancyspontaneousabortioncount", "pregnanciescountlivebirth",
  "pregnancytherapeuticabortioncount", "pregnanciescountectopic",
  "lymphnodesexaminedhecount", "lymphnodesexamined",
  "initialpathologicdxyear", "heightcmatdiagnosis", "cervixsuvresults",
  "ageatdiagnosis", "yearoftobaccosmokingonset", "gleasonscore", "psavalue",
  "daystopsa", "daystosubmittedspecimendx", "Breslowthickness", "tumorsize"
)

# Clinical columns converted with factor().
fct <- c(
  "patient", "disease", "tumortissuesite", "pathologicstage",
  "pathologyTstage", "pathologyNstage", "pathologyMstage", "gender",
  "radiationtherapy", "histologicaltype", "residualtumor", "race",
  "ethnicity", "tumorstatus", "neoplasmhistologicgrade",
  "radiationtherapystatus", "cervicalcarcinomapelvicextensiontext",
  "lymphnodelocation", "poslymphnodelocation", "menopausestatus",
  "lymphovascularinvolvement", "keratinizationsquamouscell",
  "hysterectomytype", "historyhormonalcontraceptivesuse",
  "corpusinvolvement", "chemoconcurrenttype", "causeofdeath",
  "clinicalstage", "tumorstage", "melanomaulceration",
  "melanomaprimaryknown", "radiationexposure", "extrathyroidalextension",
  "multifocality"
)

# The RNA gene names can be defined by the remaining column names.
rna <- setdiff(cn, c(rem, num, fct))

# Coerce each group to its target type, then drop the uninformative columns.
# across() replaces the superseded mutate_at(vars(...)) form; the three groups
# are disjoint, so the conversions do not interact.
dat2 <- dat %>%
  mutate(
    across(all_of(num), as.numeric),
    across(all_of(fct), factor),
    across(all_of(rna), as.numeric)
  ) %>%
  select(-all_of(rem))
# Persist the type-converted table for later use.
saveRDS(
  object = dat2,
  file = "../data/complete_processed_tcga2stat_RNASeq2_with_clinical.rds"
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.