# data-raw/all_paired_data.R

library(tidyverse)
library(TCGA2STAT)

# Disease codes to download, processed one at a time in the order listed.
diseases <- c(
  "ACC",      "BLCA", "BRCA", "CESC", "CHOL",   "COAD",
  "COADREAD", "DLBC", "ESCA", "GBM",  "GBMLGG", "HNSC",
  "KICH",     "KIPAN", "KIRC", "KIRP", "LAML",  "LGG",
  "LIHC",     "LUAD", "LUSC", "MESO", "OV",     "PAAD",
  "PCPG",     "PRAD", "READ", "SARC", "SKCM",   "STAD",
  "TGCT",     "THCA", "THYM", "UCEC", "UCS",    "UVM"
)


# tmp <- getTCGA(disease = diseases[2],
#                data.type = "RNASeq2",
#                clinical = TRUE)
#
# tmp_p <- TumorNormalMatch(tmp$dat)
#
# tmp_n <- tmp_p$normal %>% t() %>%
#   as_tibble(rownames = "patient") %>%
#   add_column(sampletype = "normal", .after = 1)
#
# tmp_t <- tmp_p$primary.tumor %>% t() %>%
#   as_tibble(rownames = "patient") %>%
#   add_column(sampletype = "primary tumor", .after = 1)
#
# tmp_c <- tmp$clinical %>%
#   as_tibble(rownames = "patient") %>%
#   filter(patient %in% colnames(tmp_p$normal))


# Download RNASeq2 expression data together with clinical data for each
# disease code and build one joined tibble per disease.
#   cancer  - named list (names are disease codes) of the joined tibbles
#   bad_ids - disease codes whose download/processing raised an error
cancer <- list()
# `bad_ids` has to exist before the first failure is recorded. Previously it
# was never initialised, so the first failing disease raised
# "object 'bad_ids' not found" from inside the error handler and aborted the
# whole script instead of skipping that disease.
bad_ids <- character()
for (disease in diseases) {
  paired <- tryCatch({
    tmp <- getTCGA(disease = disease,
                   data.type = "RNASeq2",
                   clinical = TRUE)

    # The result is used for its `normal` and `primary.tumor` elements
    tmp_p <- TumorNormalMatch(tmp$dat)

    # Transpose, move the row names into a `patient` column and tag every
    # row with the kind of sample it came from
    tmp_n <- tmp_p$normal %>% t() %>%
      as_tibble(rownames = "patient") %>%
      add_column(sampletype = "normal", .after = 1)

    tmp_t <- tmp_p$primary.tumor %>% t() %>%
      as_tibble(rownames = "patient") %>%
      add_column(sampletype = "primary tumor", .after = 1)

    patients <- tmp_n$patient

    # Stack both sample types; sorting puts each patient's rows together
    dat <- bind_rows(tmp_n, tmp_t) %>%
      arrange(patient, sampletype)

    # Keep clinical rows only for patients present in the normal samples
    tmp_c <- tmp$clinical %>%
      as_tibble(rownames = "patient") %>%
      filter(patient %in% patients)

    inner_join(tmp_c, dat, by = "patient") %>%
      add_column(disease = disease, .after = 1)
  },
  error = function(e) {
    # Report why the disease is skipped rather than failing silently
    message("Skipping ", disease, ": ", conditionMessage(e))
    NULL
  })

  # NULL marks a failed disease; assigning NULL into the list would add
  # nothing, so record the failure explicitly instead
  if (is.null(paired)) {
    bad_ids <- c(bad_ids, disease)
  } else {
    cancer[[disease]] <- paired
  }
}

# Quick look at which diseases produced a table
str(cancer, max.level = 1)


# Column names found in at least one table, in every table, and the
# difference between the two (columns absent from some table)
all_colnames <- reduce(map(cancer, colnames), union)
shared_colnames <- reduce(map(cancer, colnames), intersect)
missing <- setdiff(all_colnames, shared_colnames)

# Stack all per-disease tables into a single tibble
complete_cancer <- bind_rows(cancer)

# Columns to discard: their data is all the same, or all NA values
rem <- c(
  "Composite Element REF",
  "tumorgrade",
  "daystotumorrecurrence",
  "chemotherapy"
)

# Columns handled as numeric
num <- c(
  "yearstobirth",
  "vitalstatus",
  "daystodeath",
  "daystolastfollowup",
  "dateofinitialpathologicdiagnosis",
  "numberoflymphnodes",
  "daystolastknownalive",
  "karnofskyperformancescore",
  "numberpackyearssmoked",
  "weightkgatdiagnosis",
  "tobaccosmokingyearstopped",
  "tobaccosmokingpackyearssmoked",
  "tobaccosmokinghistory",
  "agebegansmokinginyears",
  "pregnanciescounttotal",
  "pregnanciescountstillbirth",
  "pregnancyspontaneousabortioncount",
  "pregnanciescountlivebirth",
  "pregnancytherapeuticabortioncount",
  "pregnanciescountectopic",
  "lymphnodesexaminedhecount",
  "lymphnodesexamined",
  "initialpathologicdxyear",
  "heightcmatdiagnosis",
  "cervixsuvresults",
  "ageatdiagnosis",
  "yearoftobaccosmokingonset",
  "gleasonscore",
  "psavalue",
  "daystopsa",
  "daystosubmittedspecimendx",
  "Breslowthickness",
  "tumorsize"
)

# Columns handled as factors
fct <- c(
  "patient",
  "disease",
  "sampletype",
  "tumortissuesite",
  "pathologicstage",
  "pathologyTstage",
  "pathologyNstage",
  "pathologyMstage",
  "gender",
  "radiationtherapy",
  "histologicaltype",
  "residualtumor",
  "race",
  "ethnicity",
  "tumorstatus",
  "neoplasmhistologicgrade",
  "radiationtherapystatus",
  "cervicalcarcinomapelvicextensiontext",
  "lymphnodelocation",
  "poslymphnodelocation",
  "menopausestatus",
  "lymphovascularinvolvement",
  "keratinizationsquamouscell",
  "hysterectomytype",
  "historyhormonalcontraceptivesuse",
  "corpusinvolvement",
  "chemoconcurrenttype",
  "causeofdeath",
  "clinicalstage",
  "tumorstage",
  "melanomaulceration",
  "melanomaprimaryknown",
  "radiationexposure",
  "extrathyroidalextension",
  "multifocality"
)

# Every column not listed above is taken to be an RNA gene column
cn <- colnames(complete_cancer)
rna <- setdiff(cn, c(rem, num, fct))

# Reorder the columns: factors, numerics, RNA columns, then the columns that
# are about to be dropped
dat <- select(complete_cancer, any_of(c(fct, num, rna, rem)))

# TRUE when every element of `x` is NA
all_na <- function(x) {
  all(is.na(x))
}

# Coerce each column group to its target type, then drop the `rem` columns
dat2 <- mutate_at(dat, vars(any_of(num)), as.numeric)
dat2 <- mutate_at(dat2, vars(any_of(fct)), factor)
dat2 <- mutate_at(dat2, vars(any_of(rna)), as.numeric)
dat2 <- select(dat2, -any_of(rem))

# Discard columns that contain nothing but NA
dat3 <- select_if(dat2, Negate(all_na))

saveRDS(dat3, "data/paired_processed_tcga2stat_RNASeq2_with_clinical.rds")
# adknudson/cp3brca documentation built on June 9, 2020, 11:46 p.m.