library(tidyverse)
library(TCGA2STAT)
# TCGA cohort codes; the loop further down passes each one to getTCGA() as
# the `disease` argument.
diseases <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD",
  "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC",
  "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG",
  "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD",
  "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
  "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)
# tmp <- getTCGA(disease = diseases[2],
# data.type = "RNASeq2",
# clinical = TRUE)
#
# tmp_p <- TumorNormalMatch(tmp$dat)
#
# tmp_n <- tmp_p$normal %>% t() %>%
# as_tibble(rownames = "patient") %>%
# add_column(sampletype = "normal", .after = 1)
#
# tmp_t <- tmp_p$primary.tumor %>% t() %>%
# as_tibble(rownames = "patient") %>%
# add_column(sampletype = "primary tumor", .after = 1)
#
# tmp_c <- tmp$clinical %>%
# as_tibble(rownames = "patient") %>%
# filter(patient %in% colnames(tmp_p$normal))
# Build one table per disease in `cancer`, keyed by disease code. Each table
# holds the clinical columns joined to the expression rows of the samples
# returned by TumorNormalMatch(), with one "normal" and one "primary tumor"
# row per patient that has both.
cancer <- list()

# Diseases whose download or processing failed. This must exist before the
# loop: the error handler appends to it with `<<-`, and referencing an
# undefined `bad_ids` there would itself raise an error and abort the run.
bad_ids <- character()

for (disease in diseases) {
  tryCatch(
    {
      tmp <- getTCGA(
        disease = disease,
        data.type = "RNASeq2",
        clinical = TRUE
      )
      tmp_p <- TumorNormalMatch(tmp$dat)

      # Transpose so each sample becomes a row; the former column names are
      # kept in a `patient` column, followed by a `sampletype` label.
      tmp_n <- tmp_p$normal %>%
        t() %>%
        as_tibble(rownames = "patient") %>%
        add_column(sampletype = "normal", .after = 1)
      tmp_t <- tmp_p$primary.tumor %>%
        t() %>%
        as_tibble(rownames = "patient") %>%
        add_column(sampletype = "primary tumor", .after = 1)

      patients <- tmp_n$patient
      dat <- bind_rows(tmp_n, tmp_t) %>%
        arrange(patient, sampletype)

      # Keep clinical rows only for patients present in the normal set.
      tmp_c <- tmp$clinical %>%
        as_tibble(rownames = "patient") %>%
        filter(patient %in% patients)

      cancer[[disease]] <- inner_join(tmp_c, dat, by = "patient") %>%
        add_column(disease = disease, .after = 1)
    },
    error = function(e) {
      # Best effort: report the failure, record the disease, and move on.
      message("Skipping ", disease, ": ", conditionMessage(e))
      bad_ids <<- c(bad_ids, disease)
    }
  )
}
# Print the top-level structure of the per-disease list.
str(cancer, max.level = 1)

# Column names found in at least one disease table, and those found in all.
all_colnames <- reduce(map(cancer, colnames), union)
shared_colnames <- reduce(map(cancer, colnames), intersect)

# Columns that are absent from at least one disease table.
missing <- setdiff(all_colnames, shared_colnames)

# Stack all per-disease tables into a single table and record its columns.
complete_cancer <- bind_rows(cancer)
cn <- colnames(complete_cancer)
# Column classification used by the clean-up step below. The order within
# each vector matters: it determines the column order of the final table.

# Columns to drop: their data is all the same, or all NA values.
rem <- c(
  "Composite Element REF",
  "tumorgrade",
  "daystotumorrecurrence",
  "chemotherapy"
)

# Clinical columns to be coerced with as.numeric().
num <- c(
  "yearstobirth",
  "vitalstatus",
  "daystodeath",
  "daystolastfollowup",
  "dateofinitialpathologicdiagnosis",
  "numberoflymphnodes",
  "daystolastknownalive",
  "karnofskyperformancescore",
  "numberpackyearssmoked",
  "weightkgatdiagnosis",
  "tobaccosmokingyearstopped",
  "tobaccosmokingpackyearssmoked",
  "tobaccosmokinghistory",
  "agebegansmokinginyears",
  "pregnanciescounttotal",
  "pregnanciescountstillbirth",
  "pregnancyspontaneousabortioncount",
  "pregnanciescountlivebirth",
  "pregnancytherapeuticabortioncount",
  "pregnanciescountectopic",
  "lymphnodesexaminedhecount",
  "lymphnodesexamined",
  "initialpathologicdxyear",
  "heightcmatdiagnosis",
  "cervixsuvresults",
  "ageatdiagnosis",
  "yearoftobaccosmokingonset",
  "gleasonscore",
  "psavalue",
  "daystopsa",
  "daystosubmittedspecimendx",
  "Breslowthickness",
  "tumorsize"
)

# Identifier and clinical columns to be converted with factor().
fct <- c(
  "patient",
  "disease",
  "sampletype",
  "tumortissuesite",
  "pathologicstage",
  "pathologyTstage",
  "pathologyNstage",
  "pathologyMstage",
  "gender",
  "radiationtherapy",
  "histologicaltype",
  "residualtumor",
  "race",
  "ethnicity",
  "tumorstatus",
  "neoplasmhistologicgrade",
  "radiationtherapystatus",
  "cervicalcarcinomapelvicextensiontext",
  "lymphnodelocation",
  "poslymphnodelocation",
  "menopausestatus",
  "lymphovascularinvolvement",
  "keratinizationsquamouscell",
  "hysterectomytype",
  "historyhormonalcontraceptivesuse",
  "corpusinvolvement",
  "chemoconcurrenttype",
  "causeofdeath",
  "clinicalstage",
  "tumorstage",
  "melanomaulceration",
  "melanomaprimaryknown",
  "radiationexposure",
  "extrathyroidalextension",
  "multifocality"
)
# The RNA gene names can be defined by the remaining column names: every
# column of the combined table that is not listed in `rem`, `num` or `fct`.
rna <- setdiff(cn, c(rem, num, fct))

# Reorder columns as factors, numerics, genes, then the to-be-removed ones.
# any_of() tolerates names that are absent from the combined table.
dat <- complete_cancer %>%
  select(any_of(c(fct, num, rna, rem)))

# TRUE when every element of `x` is NA.
all_na <- function(x) all(is.na(x))

# Coerce each column group to its target type, then drop the `rem` columns.
# The across() calls are applied in order, like the separate mutate_at()
# steps they replace.
dat2 <- dat %>%
  mutate(
    across(any_of(num), as.numeric),
    across(any_of(fct), factor),
    across(any_of(rna), as.numeric)
  ) %>%
  select(-any_of(rem))

# Drop columns that contain nothing but NA.
dat3 <- dat2 %>%
  select(where(function(col) !all_na(col)))

# Make sure the output directory exists so the save cannot fail on a missing
# folder after the long download step.
out_path <- "data/paired_processed_tcga2stat_RNASeq2_with_clinical.rds"
dir.create(dirname(out_path), recursive = TRUE, showWarnings = FALSE)
saveRDS(dat3, out_path)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.