# data-raw/BRCA.R

library(dplyr)

# Download the BRCA RNASeq2 data set together with its clinical table.
brca <- TCGA2STAT::getTCGA(
  disease = "BRCA",
  data.type = "RNASeq2",
  clinical = TRUE
)

# Keep the tumor/normal matched samples, then pull the clinical rows whose
# row names appear among the column names of the matched normal samples.
paired_data <- TCGA2STAT::TumorNormalMatch(brca$dat)
patients <- colnames(paired_data$normal)
clin_dat <- brca$clinical[rownames(brca$clinical) %in% patients, , drop = FALSE]

# Tidy a clinical table whose row names identify the patients.
#
# The paired-sample subset and the full cohort get exactly the same cleaning,
# so it lives in one helper instead of two copies of the same pipeline:
# 1. Row names become a `patient` column.
# 2. Unnecessary vars are dropped: Composite Element REF, daystolastknownalive.
# 3. Numeric vars are converted with as.numeric(): yearstobirth, daystodeath,
#    daystolastfollowup, numberoflymphnodes.
# 4. vitalstatus becomes a factor labelled "Alive" / "Dead"; every other
#    clinical variable becomes a plain factor.
#
# `clinical` is anything as_tibble(rownames = ) accepts; the result is a tibble
# with the original column order preserved.
tidy_clinical <- function(clinical) {
  numeric_vars <- c(
    "yearstobirth",
    "daystodeath",
    "daystolastfollowup",
    "numberoflymphnodes"
  )
  factor_vars <- c(
    "tumortissuesite",
    "pathologicstage",
    "pathologyTstage",
    "pathologyNstage",
    "pathologyMstage",
    "gender",
    "radiationtherapy",
    "histologicaltype",
    "race",
    "ethnicity",
    "dateofinitialpathologicdiagnosis"
  )

  clinical %>%
    as_tibble(rownames = "patient") %>%
    select(-c(`Composite Element REF`, daystolastknownalive)) %>%
    mutate(
      across(all_of(numeric_vars), as.numeric),
      # NOTE(review): without explicit `levels`, factor() attaches the labels
      # to the sorted unique values, so this needs exactly two distinct
      # vitalstatus values with the "alive" code sorting first -- confirm.
      vitalstatus = factor(vitalstatus, labels = c("Alive", "Dead")),
      across(all_of(factor_vars), factor)
    )
}

# Clinical data restricted to the patients kept in `clin_dat`.
clinical_data <- tidy_clinical(clin_dat)

# Make the merged data sets ---------------------------------------------------

# drop_na() runs while `status` and `OS` are still present, so rows with a
# missing value in either of them are removed as well as rows with a missing
# value in any other column.
dat <- brca$merged.dat %>%
  tidyr::drop_na() %>%
  select(-status, -OS) %>%
  rename(patient = bcr)

# Clinical data for every row of the clinical table.
all_clinical_data <- tidy_clinical(brca$clinical)

# One row per patient present in both the clinical table and the merged data.
brca_full <- inner_join(x = all_clinical_data, y = dat, by = "patient")

# Columns from A1BG through tAKR (selected by position in `brca_full`).
gene_cols <- brca_full %>%
  select(`A1BG`:`tAKR`)

# Per-column medians, largest first. vapply() walks the columns directly
# instead of coercing the whole tibble to a matrix the way apply() does, and
# checks that each median is a single number.
gene_medians <- vapply(gene_cols, median, numeric(1))
gene_median_sort <- sort(gene_medians, decreasing = TRUE)
top_genes <- names(gene_median_sort)

# Clinical columns (patient through ethnicity) plus the 10 / 100 genes with
# the highest median. all_of() marks the names as an external character
# vector, which select() otherwise treats as an ambiguous selection.
brca10 <- brca_full %>%
  select(patient:ethnicity, all_of(head(top_genes, 10)))

brca100 <- brca_full %>%
  select(patient:ethnicity, all_of(head(top_genes, 100)))

# Store the prepared objects via usethis::use_data(), overwriting any
# previously saved versions.
usethis::use_data(
  brca_full,
  brca10,
  brca100,
  paired_data,
  clinical_data,
  overwrite = TRUE
)
# adknudson/cp3brca documentation built on June 9, 2020, 11:46 p.m.