# inst/doc/clinical.R

## ----setup, include=FALSE-----------------------------------------------------
# Echo the source of every chunk in the rendered output by default.
knitr::opts_chunk$set(echo = TRUE)
# Turn off knitr's progress reporting while the document is knit.
knitr::opts_knit$set(progress = FALSE)

## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
# Query the BCR Biotab clinical supplement files of TCGA-ACC, download
# them, and parse them into `clinical.BCRtab.all`.
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR Biotab"
)
GDCdownload(query)
clinical.BCRtab.all <- GDCprepare(query)
# List the names of the elements of the prepared object.
names(clinical.BCRtab.all)

# Same query, narrowed with file.type = "radiation".
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR Biotab",
  file.type = "radiation"
)
GDCdownload(query)
clinical.BCRtab.radiation <- GDCprepare(query)


## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Render the first rows of the ACC drug table as an interactive table with
# horizontal scrolling and keyboard navigation enabled.
DT::datatable(
  head(clinical.BCRtab.all$clinical_drug_acc),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ---- results = "hide",cache=TRUE, message=FALSE------------------------------
library(TCGAbiolinks)
# Query, download and parse the BCR Biotab clinical supplement files of
# TCGA-BRCA. This overwrites the `clinical.BCRtab.all` object built earlier.
query <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  data.format = "BCR Biotab"
)
GDCdownload(query)
clinical.BCRtab.all <- GDCprepare(query)

## -----------------------------------------------------------------------------
# Names of every table in the prepared object
names(clinical.BCRtab.all)

# Sorted column names of clinical_patient_brca, printed as a tibble.
# The call is kept in this exact form because the tibble's column name is
# derived from the expression text.
tibble::tibble(sort(colnames(clinical.BCRtab.all$clinical_patient_brca)))

# Frequency of each ER status value
plyr::count(clinical.BCRtab.all$clinical_patient_brca$er_status_by_ihc)

# Columns whose name starts with "er", shown next to column 2
# (presumably the patient barcode column -- TODO confirm).
er.cols <- grep(
  "^er",
  colnames(clinical.BCRtab.all$clinical_patient_brca)
)
DT::datatable(
  clinical.BCRtab.all$clinical_patient_brca[, c(2, er.cols)],
  options = list(scrollX = TRUE)
)

# First rows of the complete patient table, all columns
DT::datatable(
  head(clinical.BCRtab.all$clinical_patient_brca),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ---- results = "hide",cache=TRUE, message=FALSE,warning=FALSE----------------
# Query, download and parse the BCR Biotab biospecimen supplement files of
# TCGA-BRCA into `biospecimen.BCRtab.all`.
query.biospecimen <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Biospecimen",
  data.type = "Biospecimen Supplement",
  data.format = "BCR Biotab"
)
GDCdownload(query.biospecimen)
biospecimen.BCRtab.all <- GDCprepare(query.biospecimen)

## -----------------------------------------------------------------------------
# All available tables
names(biospecimen.BCRtab.all)

# `biospecimen.BCRtab.all` was prepared from a TCGA-BRCA query, and the other
# tables in this script carry their project suffix (clinical_drug_acc,
# clinical_patient_brca). The table to show is therefore the "_brca" one;
# an "_ov" element would be NULL here and make DT::datatable() fail.
biospecimen.BCRtab.all$ssf_normal_controls_brca %>%
  head %>%
  DT::datatable(options = list(scrollX = TRUE, keys = TRUE))

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
# Fetch the clinical data of TCGA-LUAD through GDCquery_clinic().
clinical <- GDCquery_clinic(project = "TCGA-LUAD", type = "clinical")

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Show the first rows with per-column filters on top, five rows per page,
# and no row names.
DT::datatable(
  head(clinical),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
# Fetch the clinical data of BEATAML1.0-COHORT through GDCquery_clinic().
clinical <- GDCquery_clinic(
  project = "BEATAML1.0-COHORT",
  type = "clinical"
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Show the first rows with per-column filters on top, five rows per page,
# and no row names.
DT::datatable(
  head(clinical),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
# Fetch the clinical data of CPTAC-2 through GDCquery_clinic().
clinical <- GDCquery_clinic(project = "CPTAC-2", type = "clinical")

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Show the first rows with per-column filters on top, five rows per page,
# and no row names.
DT::datatable(
  head(clinical),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results='hide', echo=TRUE, message=FALSE, warning=FALSE------------------
# Fetch the clinical data of GENIE-MSK through GDCquery_clinic().
clinical <- GDCquery_clinic(project = "GENIE-MSK", type = "clinical")

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Show the first rows with per-column filters on top, five rows per page,
# and no row names.
DT::datatable(
  head(clinical),
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results = 'hide',echo=TRUE, message=FALSE, warning=FALSE-----------------
# Query and download the clinical XML files of two TCGA-COAD barcodes, then
# parse the "patient" section into `clinical`.
query <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Clinical",
  file.type = "xml",
  barcode = c("TCGA-RU-A8FL", "TCGA-AA-3972")
)
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
# Interactive view of the parsed patient table: filters on top, five rows
# per page, no row names.
DT::datatable(
  clinical,
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Parse the "drug" section from the files selected by `query`.
clinical.drug <- GDCprepare_clinic(query, clinical.info = "drug")

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
DT::datatable(
  clinical.drug,
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Parse the "radiation" section from the same files.
clinical.radiation <- GDCprepare_clinic(query, clinical.info = "radiation")

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
DT::datatable(
  clinical.radiation,
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Parse the "admin" section from the same files.
clinical.admin <- GDCprepare_clinic(query, clinical.info = "admin")

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
DT::datatable(
  clinical.admin,
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE,eval = F-------
#  query <- GDCquery(project = "TCGA-COAD",
#                    data.category = "Other",
#                    legacy = TRUE,
#                    access = "open",
#                    data.type = "Auxiliary test",
#                    barcode = c("TCGA-AD-A5EJ","TCGA-DM-A0X9"))
#  GDCdownload(query)
#  msi_results <- GDCprepare_clinic(query, "msi")

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# NOTE(review): nothing executed in this script assigns `msi_results` (the
# chunk above that would create it is commented out); presumably it is
# supplied by a loaded package -- confirm.
DT::datatable(msi_results, options = list(scrollX = TRUE, keys = TRUE))

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Legacy database: tissue slide image files for two TCGA-COAD barcodes.
query.legacy <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Clinical",
  data.type = "Tissue slide image",
  legacy = TRUE,
  barcode = c("TCGA-RU-A8FL", "TCGA-AA-3972")
)

# Harmonized database: slide image files of TCGA-OV (no barcode filter).
query.harmonized <- GDCquery(
  project = "TCGA-OV",
  data.category = "Biospecimen",
  data.type = "Slide Image"
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Complete result table of the legacy query.
DT::datatable(
  getResults(query.legacy),
  options = list(scrollX = TRUE, keys = TRUE)
)

# First rows only of the harmonized query results.
DT::datatable(
  head(getResults(query.harmonized)),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Diagnostic slide images (data.type "Slide Image", experimental.strategy
# "Diagnostic Slide") from the harmonized database for two TCGA-COAD barcodes.
query.harmonized <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Biospecimen",
  data.type = "Slide Image",
  experimental.strategy = "Diagnostic Slide",
  barcode = c("TCGA-RU-A8FL", "TCGA-AA-3972")
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# First rows of the query results.
DT::datatable(
  head(getResults(query.harmonized)),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Pathology reports from the legacy database for two TCGA-COAD barcodes.
query.legacy <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Clinical",
  data.type = "Pathology report",
  legacy = TRUE,
  barcode = c("TCGA-RU-A8FL", "TCGA-AA-3972")
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Complete result table of the query.
DT::datatable(
  getResults(query.legacy),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE----
#  # Tissue slide image
#  query <- GDCquery(project = "TCGA-COAD",
#                    data.category = "Clinical",
#                    data.type = "Tissue slide image",
#                    legacy = TRUE,
#                    barcode = c("TCGA-RU-A8FL","TCGA-AA-3972"))

## ----echo = TRUE, message = FALSE, warning = FALSE----------------------------
# NOTE(review): the chunk directly above is commented out (eval=FALSE), so
# `query` here is whatever executed code assigned most recently, not the
# tissue slide image query -- confirm this is the table meant to be shown.
DT::datatable(
  getResults(query),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ----results = 'hide', echo = TRUE, message = FALSE, warning = FALSE----------
# Clinical supplement files from the legacy database for two TCGA-COAD
# barcodes.
query <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Clinical",
  data.type = "Clinical Supplement",
  legacy = TRUE,
  barcode = c("TCGA-RU-A8FL", "TCGA-AA-3972")
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Complete result table of the query.
DT::datatable(
  getResults(query),
  options = list(scrollX = TRUE, keys = TRUE)
)

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE----------------
# Clinical data files of type "txt" from the legacy database for TCGA-COAD.
query <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Clinical",
  data.type = "Clinical data",
  legacy = TRUE,
  file.type = "txt"
)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# Drop every result column whose name matches "cases" before displaying.
query %>%
  getResults() %>%
  select(-matches("cases")) %>%
  DT::datatable(options = list(scrollX = TRUE, keys = TRUE))

## ----results = 'hide', echo=TRUE, message=FALSE, warning=FALSE, eval = FALSE----
#  GDCdownload(query)
#  clinical.biotab <- GDCprepare(query)

## ----echo=TRUE, message=FALSE, warning=FALSE----------------------------------
# NOTE(review): nothing executed in this script assigns `clinical.biotab`
# (the chunk above that would create it is commented out); presumably it is
# supplied by a loaded package -- confirm.
names(clinical.biotab)
DT::datatable(
  clinical.biotab$clinical_radiation_coad,
  options = list(scrollX = TRUE, keys = TRUE)
)

## ---- eval = TRUE-------------------------------------------------------------
# Example barcodes used to demonstrate selection by sample type.
bar <- c(
  "TCGA-G9-6378-02A-11R-1789-07", "TCGA-CH-5767-04A-11R-1789-07",
  "TCGA-G9-6332-60A-11R-1789-07", "TCGA-G9-6336-01A-11R-1789-07",
  "TCGA-G9-6336-11A-11R-1789-07", "TCGA-G9-7336-11A-11R-1789-07",
  "TCGA-G9-7336-04A-11R-1789-07", "TCGA-G9-7336-14A-11R-1789-07",
  "TCGA-G9-7036-04A-11R-1789-07", "TCGA-G9-7036-02A-11R-1789-07",
  "TCGA-G9-7036-11A-11R-1789-07", "TCGA-G9-7036-03A-11R-1789-07",
  "TCGA-G9-7036-10A-11R-1789-07", "TCGA-BH-A1ES-10A-11R-1789-07",
  "TCGA-BH-A1F0-10A-11R-1789-07", "TCGA-BH-A0BZ-02A-11R-1789-07",
  "TCGA-B6-A0WY-04A-11R-1789-07", "TCGA-BH-A1FG-04A-11R-1789-08",
  "TCGA-D8-A1JS-04A-11R-2089-08", "TCGA-AN-A0FN-11A-11R-8789-08",
  "TCGA-AR-A2LQ-12A-11R-8799-08", "TCGA-AR-A2LH-03A-11R-1789-07",
  "TCGA-BH-A1F8-04A-11R-5789-07", "TCGA-AR-A24T-04A-55R-1789-07",
  "TCGA-AO-A0J5-05A-11R-1789-07", "TCGA-BH-A0B4-11A-12R-1789-07",
  "TCGA-B6-A1KN-60A-13R-1789-07", "TCGA-AO-A0J5-01A-11R-1789-07",
  "TCGA-AO-A0J5-01A-11R-1789-07", "TCGA-G9-6336-11A-11R-1789-07",
  "TCGA-G9-6380-11A-11R-1789-07", "TCGA-G9-6380-01A-11R-1789-07",
  "TCGA-G9-6340-01A-11R-1789-07", "TCGA-G9-6340-11A-11R-1789-07"
)

# Select by a single sample-type code ("TP", then "NB").
S <- TCGAquery_SampleTypes(bar, "TP")
S2 <- TCGAquery_SampleTypes(bar, "NB")

# Select several sample types at once, without requiring that they come
# from the same patients.
SS <- TCGAquery_SampleTypes(bar, c("TP", "NB"))

# Select several sample types, restricted to samples from the same patients.
SSS <- TCGAquery_MatchedCoupledSampleTypes(bar, c("NT", "TP"))

## ---- eval = FALSE------------------------------------------------------------
#  # This code will get all clinical indexed data from TCGA
#  library(data.table)
#  library(dplyr)
#  library(regexPipes)
#  clinical <- TCGAbiolinks:::getGDCprojects()$project_id %>%
#    regexPipes::grep("TCGA",value=T) %>%
#    sort %>%
#    plyr::alply(1,GDCquery_clinic, .progress = "text") %>%
#    rbindlist
#  readr::write_csv(clinical,path = paste0("all_clin_indexed.csv"))
#  
#  # This code will get all clinical XML data from TCGA
#  getclinical <- function(proj){
#    message(proj)
#    while(1){
#      result = tryCatch({
#        query <- GDCquery(project = proj, data.category = "Clinical",file.type = "xml")
#        GDCdownload(query)
#        clinical <- GDCprepare_clinic(query, clinical.info = "patient")
#        for(i in c("admin","radiation","follow_up","drug","new_tumor_event")){
#          message(i)
#          aux <- GDCprepare_clinic(query, clinical.info = i)
#          if(is.null(aux) || nrow(aux) == 0) next
#          # add suffix manually if it already exists
#          replicated <- which(grep("bcr_patient_barcode",colnames(aux), value = T,invert = T) %in% colnames(clinical))
#          colnames(aux)[replicated] <- paste0(colnames(aux)[replicated],".",i)
#          if(!is.null(aux)) clinical <- merge(clinical,aux,by = "bcr_patient_barcode", all = TRUE)
#        }
#        readr::write_csv(clinical,path = paste0(proj,"_clinical_from_XML.csv")) # Save the clinical data into a csv file
#        return(clinical)
#      }, error = function(e) {
#        message(paste0("Error clinical: ", proj))
#      })
#    }
#  }
#  clinical <- TCGAbiolinks:::getGDCprojects()$project_id %>%
#    regexPipes::grep("TCGA",value=T) %>% sort %>%
#    plyr::alply(1,getclinical, .progress = "text") %>%
#    rbindlist(fill = TRUE) %>% setDF %>% subset(!duplicated(clinical))
#  
#  readr::write_csv(clinical,path = "all_clin_XML.csv")
#  # result: https://drive.google.com/open?id=0B0-8N2fjttG-WWxSVE5MSGpva1U
#  # Obs: this table has multiple lines for each patient, as the patient might have several followups, drug treatments,
#  # new tumor events etc...

# Try the TCGAbiolinks package in your browser

# Any scripts or data that you put into this service are public.

# TCGAbiolinks documentation built on Nov. 8, 2020, 5:37 p.m.