In adknudson/cp3brca: Accelerated Failure Time model of the BRCA data from TCGA2STAT package

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

All the different diseases from TCGA2STAT are listed here.

# Disease identifiers that are passed, one at a time, to TCGA2STAT::getTCGA().
diseases <- c(
  "ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD",
  "COADREAD", "DLBC", "ESCA", "GBM", "GBMLGG", "HNSC",
  "KICH", "KIPAN", "KIRC", "KIRP", "LAML", "LGG",
  "LIHC", "LUAD", "LUSC", "MESO", "OV", "PAAD",
  "PCPG", "PRAD", "READ", "SARC", "SKCM", "STAD",
  "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM"
)

They don't all share the exact same RNA sequences or clinical data, so I need to create a function that can detect which columns are available, and mutate accordingly. I can either process them ahead of time, or download all the data first, and then find the set of common columns.

# Download RNASeq2 + clinical data for every entry of `diseases` and join the
# two tables per patient.
#
# Results:
#   cancer  - named list (names are disease ids) holding one joined tibble per
#             disease that was processed without error.
#   bad_ids - character vector of the disease ids that failed (NULL when none
#             failed).
bad_ids <- NULL
cancer <- list()
for (disease in diseases) {
  # tryCatch() returns either the joined tibble or NULL, so the bookkeeping
  # below happens in the loop body instead of via `<<-` inside the handler.
  joined <- tryCatch({
    dat <- TCGA2STAT::getTCGA(disease = disease,
                              data.type = "RNASeq2",
                              clinical = TRUE)

    # Transpose so that the former column names of `dat$dat` become rows,
    # stored in a `patient` column. The regex strips a trailing
    # "-xxx-xxx-xxxx-xx" suffix from those ids.
    # NOTE(review): presumably this trims the ids down to the form used by the
    # row names of `dat$clinical` so the join below can match -- confirm.
    rna <- dat$dat %>%
      t() %>%
      as_tibble(rownames = "patient") %>%
      mutate(patient = str_remove(patient, "-\\w{3}-\\w{3}-\\w{4}-\\w{2}$"))

    clin <- dat$clinical %>%
      as_tibble(rownames = "patient")

    # Keep only patients present in both tables and tag every row with the
    # disease id as the second column.
    inner_join(clin, rna, by = "patient") %>%
      add_column(disease = disease, .after = 1)
  },
  error = function(e) {
    # Report why this disease was skipped rather than dropping it silently.
    message("Skipping ", disease, ": ", conditionMessage(e))
    NULL
  })

  if (is.null(joined)) {
    bad_ids <- c(bad_ids, disease)
  } else {
    cancer[[disease]] <- joined
  }
}

# Top-level overview of what was downloaded for each disease.
str(cancer, max.level = 1)

# Column names of every per-disease table, computed once and reused below.
colnames_by_disease <- map(cancer, colnames)

# Columns seen in at least one table vs. columns present in every table.
all_colnames <- reduce(colnames_by_disease, union)
shared_colnames <- reduce(colnames_by_disease, intersect)

# Columns that at least one disease table lacks.
missing <- setdiff(all_colnames, shared_colnames)
missing

# Stack all per-disease tables into a single table.
complete_cancer <- bind_rows(cancer)
str(complete_cancer)

# Write the combined table to disk, then load it back from the same file.
combined_rds <- "../data/complete_tcga2stat_RNASeq2_with_clinical.rds"
saveRDS(complete_cancer, file = combined_rds)
complete_cancer <- readRDS(combined_rds)

# Column names in the order they are stored in `complete_cancer`.
cn <- colnames(complete_cancer)

# Split the names into three hard-coded positional ranges.
cn1_20 <- cn[seq(1, 20)]                # first few clinical data columns
cn21_20521 <- cn[seq(21, 20521)]        # RNA sequence columns
cn20522_20572 <- cn[seq(20522, 20572)]  # last few clinical data columns

# Every column that is not an RNA sequence column.
non_genes <- c(cn1_20, cn20522_20572)

# New column order: clinical columns first, RNA columns after them.
cn <- c(non_genes, cn21_20521)

# Same data with the columns arranged in the new order.
dat <- complete_cancer[, cn]

The selection below is determined by going through the first 71 columns one by one and verifying which data are numeric or factors. This process is HIGHLY DEPENDENT ON THE ASSUMPTION THAT THE ORDER OF DATA DOES NOT CHANGE.

# Positions refer to the reordered name vector `cn`.

# Columns to remove
rem_idx <- c(3, 58:60)
rem <- cn[rem_idx]

# Columns to treat as numeric
num_idx <- c(
  4:7, 14, 18, 21:24, 27:30, 32:37, 43:44, 46, 49, 52, 54, 57, 61:64, 67, 71
)
num <- cn[num_idx]

# Columns to treat as factors
fct_idx <- c(
  1, 2, 8:13, 15:17, 19:20, 25:26, 31, 38:42, 45, 47:48, 50:51, 53, 55:56,
  65:66, 68:70
)
fct <- cn[fct_idx]

Now that I have the columns selected, I can remove the dependency on column order by printing the column names, copying the output, and reassigning the selection by name rather than by number.

# Print each selection as a ready-to-paste `c("name", ...)` expression. The
# three outputs are written back to back, in the order rem, num, fct.
for (selected in list(rem, num, fct)) {
  quoted <- paste0("\"", selected, "\"", collapse = ", ")
  cat("c(", quoted, ")", sep = "")
}

# Name-based column selections. Unlike positional indices, these do not depend
# on the order of the columns.

# Columns to drop: their data is all the same value, or all NA.
rem <- c("Composite Element REF", "tumorgrade", "daystotumorrecurrence",
         "chemotherapy")

# Columns to convert with as.numeric().
num <- c("yearstobirth", "vitalstatus", "daystodeath", "daystolastfollowup",
         "dateofinitialpathologicdiagnosis", "numberoflymphnodes",
         "daystolastknownalive", "karnofskyperformancescore",
         "numberpackyearssmoked", "weightkgatdiagnosis", 
         "tobaccosmokingyearstopped", "tobaccosmokingpackyearssmoked",
         "tobaccosmokinghistory", "agebegansmokinginyears",
         "pregnanciescounttotal", "pregnanciescountstillbirth",
         "pregnancyspontaneousabortioncount", "pregnanciescountlivebirth",
         "pregnancytherapeuticabortioncount", "pregnanciescountectopic",
         "lymphnodesexaminedhecount", "lymphnodesexamined",
         "initialpathologicdxyear", "heightcmatdiagnosis", "cervixsuvresults",
         "ageatdiagnosis", "yearoftobaccosmokingonset", "gleasonscore", 
         "psavalue", "daystopsa", "daystosubmittedspecimendx", 
         "Breslowthickness", "tumorsize")

# Columns to convert with factor(). Includes the `patient` and `disease`
# identifier columns.
fct <- c("patient", "disease", "tumortissuesite", "pathologicstage",
         "pathologyTstage", "pathologyNstage", "pathologyMstage", "gender",
         "radiationtherapy", "histologicaltype", "residualtumor", "race",
         "ethnicity", "tumorstatus", "neoplasmhistologicgrade",
         "radiationtherapystatus", "cervicalcarcinomapelvicextensiontext",
         "lymphnodelocation", "poslymphnodelocation", "menopausestatus",
         "lymphovascularinvolvement", "keratinizationsquamouscell",
         "hysterectomytype", "historyhormonalcontraceptivesuse", 
         "corpusinvolvement", "chemoconcurrenttype", "causeofdeath", 
         "clinicalstage", "tumorstage", "melanomaulceration", 
         "melanomaprimaryknown", "radiationexposure",
         "extrathyroidalextension", "multifocality")

# The RNA gene names are whatever column names remain once the removed,
# numeric and factor columns have been set aside.
rna <- setdiff(cn, c(rem, num, fct))

# Coerce every kept column to its intended type, then drop the `rem` columns.
# across() replaces the superseded mutate_at(vars(...)) form and yields the
# same columns in the same positions (requires dplyr >= 1.0.0).
dat2 <- dat %>%
  mutate(
    across(all_of(num), as.numeric),
    across(all_of(fct), factor),
    across(all_of(rna), as.numeric)
  ) %>%
  select(-all_of(rem))

# Persist the processed table for later use.
saveRDS(dat2,
        file = "../data/complete_processed_tcga2stat_RNASeq2_with_clinical.rds")

adknudson/cp3brca documentation built on June 9, 2020, 11:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

adknudson/cp3brca
Accelerated Failure Time model of the BRCA data from TCGA2STAT package

In adknudson/cp3brca: Accelerated Failure Time model of the BRCA data from TCGA2STAT package

R Package Documentation

Browse R Packages

We want your feedback!

adknudson/cp3brca Accelerated Failure Time model of the BRCA data from TCGA2STAT package

In adknudson/cp3brca: Accelerated Failure Time model of the BRCA data from TCGA2STAT package

R Package Documentation

Browse R Packages

We want your feedback!

adknudson/cp3brca
Accelerated Failure Time model of the BRCA data from TCGA2STAT package