# data-raw/get_tcga2018_data.R

library(cBioPortalData)
library(dplyr)

# Open a cBioPortal API handle and fetch the table of studies it offers.
cbio <- cBioPortal()
studies <- getStudies(cbio)

# Get studyIds: keep only the studies whose id contains "tcga_pan_can_atlas"
# and pair each id with its display name, with the
# " (TCGA, PanCancer Atlas)" suffix removed from the name.
study_names <- with(
  studies[grepl("tcga_pan_can_atlas", studies$studyId), ],
  data.frame(
    Cancer = gsub(" \\(TCGA, PanCancer Atlas\\)", "", name),
    studyId = studyId
  )
)

# Loop through the study ids to build all_tcga2018_data: one row per
# (study, gene) holding the fraction of that gene's rows at each
# copy-number value, plus a sample-size label and the study id.
#
# Per-study summaries are stored in a preallocated list and bound together
# once after the loop; growing a data frame with rbind() inside the loop
# re-copies the accumulated result on every iteration.
study_ids <- as.character(study_names$studyId)
study_summaries <- vector("list", length(study_ids))
for (i in seq_along(study_ids)) {
  study <- study_ids[[i]]

  # The molecular profile and sample list ids are derived from the study id.
  cbio_table <- getDataByGenePanel(
    cbio, study,
    genePanelId = "IMPACT468",
    molecularProfileId = paste0(study, "_gistic"),
    sampleListId = paste0(study, "_cna")
  )

  # Only the first element of the returned list is used.
  cbio_dat <- data.frame(cbio_table[[1]], stringsAsFactors = FALSE)

  # value coding used below: 1 = gain, 2 = amplification,
  # -1 = shallow deletion, -2 = deep deletion. Each fraction is taken over
  # all rows of the gene (n()), so an NA in value yields NA for that gene.
  cbio_summ <- cbio_dat %>%
    group_by(hugoGeneSymbol) %>%
    summarise(
      Gain = sum(value == 1) / n(),
      Amplification = sum(value == 2) / n(),
      ShallowDeletion = sum(value == -1) / n(),
      DeepDeletion = sum(value == -2) / n()
    )

  # Label of the form " (N = <count>)", where the count is the number of
  # distinct uniquePatientKey values in this study's data; later pasted onto
  # the cancer name.
  cbio_summ$sample_size <- paste0(
    " (N = ", length(unique(cbio_dat$uniquePatientKey)), ")"
  )
  cbio_summ$studyId <- study

  study_summaries[[i]] <- cbio_summ
}
all_tcga2018_data <- bind_rows(study_summaries)

# Add sample sizes to get cbio_studies: reduce all_tcga2018_data to one row
# per (studyId, sample_size) pair, join that onto the study names, and append
# the sample-size label to each cancer name.
ss <- summarise(group_by(all_tcga2018_data, studyId, sample_size))
cbio_studies <- study_names %>%
  inner_join(ss, by = "studyId") %>%
  mutate(Cancer = paste0(Cancer, sample_size)) %>%
  dplyr::select(Cancer, studyId)
# rebeccagreenblatt/CNViz documentation built on Feb. 3, 2022, 1:50 a.m.