inst/doc/extension.R

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(dpi = 300)
knitr::opts_chunk$set(cache=FALSE)
knitr::opts_chunk$set(echo = TRUE)

## ----message=FALSE, warning=FALSE, include=FALSE------------------------------
library(TCGAbiolinks)
library(SummarizedExperiment)
#library(dplyr)
#library(DT)

## ---- echo=TRUE, warning=FALSE, eval=TRUE-------------------------------------
projects <- getGDCprojects()$project_id
as.data.frame(grep("TARGET", projects,value = TRUE))

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  #Downloading and prepare TARGET CASE
#  query_Target <- GDCquery(project = "TARGET-AML",
#                           data.category = "Transcriptome Profiling",
#                           data.type = "Gene Expression Quantification",
#                           workflow.type = "HTSeq - Counts")
#  
#  samplesDown_Target <- getResults(query_Target, cols = c("cases"))
#  
#  dataSmTB_Target <- TCGAquery_SampleTypes(barcode = samplesDown_Target,
#                                           typesample = "TB")
#  
#  dataSmNB_Target <- TCGAquery_SampleTypes(barcode = samplesDown_Target,
#                                           typesample = "TBM")
#  dataSmTB_short_Target <- dataSmTB_Target[1:10]
#  dataSmNB_short_Target <- dataSmNB_Target[1:10]
#  
#  queryDown_Target <- GDCquery(project = "TARGET-AML",
#                               data.category = "Transcriptome Profiling",
#                               data.type = "Gene Expression Quantification",
#                               workflow.type = "HTSeq - Counts",
#                               barcode = c(dataSmTB_short_Target, dataSmNB_short_Target))
#  
#  GDCdownload(queryDown_Target)
#  
#  ### SummarizedExperiment = TRUE
#  data <- GDCprepare(queryDown_Target)
#  
#  dataPrep_Target <- TCGAanalyze_Preprocessing(object = data,
#                                               cor.cut = 0.6,
#                                               datatype = "HTSeq - Counts")
#  
#  dataNorm_Target <- TCGAanalyze_Normalization(tabDF = dataPrep_Target,
#                                               geneInfo = geneInfoHT,
#                                               method = "gcContent")
#  
#  boxplot(dataPrep_Target, outline = FALSE)
#  
#  boxplot(dataNorm_Target, outline = FALSE)
#  
#  dataFilt_Target <- TCGAanalyze_Filtering(tabDF = dataNorm_Target,
#                                           method = "quantile",
#                                           qnt.cut =  0.25)

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  CancerProject <- "TCGA-BRCA"
#  DataDirectory <- paste0("../GDC/",gsub("-","_",CancerProject))
#  FileNameData <- paste0(DataDirectory, "_","HTSeq_Counts",".rda")
#  
#  query <- GDCquery(project = CancerProject,
#                    data.category = "Transcriptome Profiling",
#                    data.type = "Gene Expression Quantification",
#                    workflow.type = "HTSeq - Counts")
#  
#  samplesDown <- getResults(query,cols = c("cases"))
#  
#  dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
#                                    typesample = "TP")
#  
#  dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
#                                    typesample = "NT")
#  dataSmTP_short <- dataSmTP[1:10]
#  dataSmNT_short <- dataSmNT[1:10]
#  
#  queryDown <- GDCquery(project = CancerProject,
#                        data.category = "Transcriptome Profiling",
#                        data.type = "Gene Expression Quantification",
#                        workflow.type = "HTSeq - Counts",
#                        barcode = c(dataSmTP_short, dataSmNT_short))
#  
#  GDCdownload(query = queryDown)
#  
#  dataPrep1 <- GDCprepare(query = queryDown,
#                          save = TRUE,
#                          save.filename = "TCGA_BRCA_HTSeq_Countds.rda")
#  
#  dataPrep <- TCGAanalyze_Preprocessing(object = dataPrep1,
#                                        cor.cut = 0.6,
#                                        datatype = "HTSeq - Counts")
#  
#  
#  dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
#                                        geneInfo = geneInfoHT,
#                                        method = "gcContent")
#  dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
#                                        geneInfo = geneInfoHT,
#                                        method = "geneLength")
#  
#  dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
#                                    method = "quantile",
#                                    qnt.cut =  0.25)

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  ### dataframe will have genes from dataFilt but raw counts from dataPrep
#  dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  ##use previously fetched BRCA data:
#  Pam50.subtypes<-TCGA_MolecularSubtype(colnames(dataFilt))

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,dataSmTP_short],
#                              mat2 = dataFilt[,dataSmNT_short],
#                              pipeline="limma",
#                              Cond1type = "Normal",
#                              Cond2type = "Tumor",
#                              fdr.cut = 0.01 ,
#                              logFC.cut = 1,
#                              method = "glmLRT", ClinicalDF = data.frame())

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  ### ##download and prepare lung data through GDC### ##
#  query.lung <- GDCquery(project = "TCGA-LUSC",
#                         data.category = "Transcriptome Profiling",
#                         data.type = "Gene Expression Quantification",
#                         workflow.type = "HTSeq - Counts")
#  
#  samplesDown.lusc <- getResults(query.lung,cols=c("cases"))
#  
#  dataSmTP.lusc <- TCGAquery_SampleTypes(barcode = samplesDown.lusc,
#                                         typesample = "TP")
#  
#  dataSmNT.lusc <- TCGAquery_SampleTypes(barcode = samplesDown.lusc,
#                                         typesample = "NT")
#  dataSmTP_short.lusc <- dataSmTP.lusc[1:10]
#  dataSmNT_short.lusc <- dataSmNT.lusc[1:10]
#  length(dataSmTP.lusc)
#  
#  
#  queryDown.lung <- GDCquery(project = "TCGA-LUSC",
#                             data.category = "Transcriptome Profiling",
#                             data.type = "Gene Expression Quantification",
#                             workflow.type = "HTSeq - Counts",
#                             barcode = c(dataSmTP.lusc, dataSmNT.lusc))
#  
#  GDCdownload(query = queryDown.lung)
#  
#  dataPrep1.tcga <- GDCprepare(query = queryDown.lung,
#                               save = TRUE,
#                               save.filename = "TCGA_LUSC_HTSeq_Countds.rda")
#  dataPrep.tcga <- TCGAanalyze_Preprocessing(object = dataPrep1.tcga,
#                                             cor.cut = 0.6,
#                                             datatype = "HTSeq - Counts")
#  
#  dataNorm.tcga <- TCGAanalyze_Normalization(tabDF = dataPrep.tcga,
#                                             geneInfo = geneInfoHT,
#                                             method = "gcContent")
#  
#  dataNorm.tcga <- TCGAanalyze_Normalization(tabDF = dataNorm.tcga,
#                                             geneInfo = geneInfoHT,
#                                             method = "geneLength")
#  dataFilt.tcga <- TCGAanalyze_Filtering(tabDF = dataNorm.tcga,
#                                         method = "quantile",
#                                         qnt.cut =  0.25)
#  
#  ### #Filtering data so all samples have a pam50 subtype for LUSC
#  
#  diff<-setdiff(colnames(dataFilt.tcga), TCGA_MolecularSubtype(colnames(dataFilt.tcga))$filtered)
#  TCGA_MolecularSubtype(diff)$subtypes$barcodes
#  
#  dataFilt.tcga.lusc<-dataFilt.tcga[,diff]
#  LUSC.subtypes<-TCGA_MolecularSubtype(colnames(dataFilt.tcga.lusc))$subtypes$subtype
#  
#  ### Differential expression analysis after correcting for "Plate" factor.
#  DEG.lusc <- TCGAanalyze_DEA(MAT=dataFilt.tcga.lusc,
#                              pipeline="limma",
#                              batch.factors = c("Plate"),
#                              Cond1type = "Normal",
#                              Cond2type = "Tumor",
#                              fdr.cut = 0.01 ,
#                              logFC.cut = 1,
#                              method = "glmLRT", ClinicalDF = data.frame(),
#                              Condtypes = LUSC.subtypes,
#                              contrast.formula = "LUSC.basalvsLUSC.classical=LUSC.basal - LUSC.classical")

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  
#  batch.corrected <- TCGAbatch_Correction(tabDF = dataFilt.tcga, batch.factor = "Plate")
#  
#  batch.corrected.adjusted <- TCGAbatch_Correction(tabDF = dataFilt.tcga,
#                                                   batch.factor = "Plate",
#                                                   adjustment=c("TSS"))

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  
#  #dataFilt is a gene expression matrix from pancancer33 RNASeq data that can be generated following our previous described workflow
#  
#  sampleTP <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = "TP")
#  sampleNT <- TCGAquery_SampleTypes(barcode = colnames(dataFilt), typesample = "NT")
#  dataFiltTP <- dataFilt[,sampleTP]
#  dataFiltNT <- dataFilt[,sampleNT]
#  
#  dataSubt_LUAD <-TCGAquery_subtype(tumor = "LUAD")
#  sampleStageI_LUAD <- dataSubt_LUAD[dataSubt_LUAD$Tumor.stage %in% c("Stage I","Stage IA","Stage IB"),]
#  dataFilt.LUAD.stageI.TP <- dataFiltTP[,substr(colnames(dataFiltTP),1,12) %in% sampleStageI_LUAD$patient]
#  dataFilt.LUAD.stageI.NT <- dataFiltNT[,substr(colnames(dataFiltNT),1,12) %in% sampleStageI_LUAD$patient]
#  
#  # here for example the end-user can replace LUSC with unpublished data in a dataframe containing counts.
#  # and adding the information for LUSC.stageI.TP or LUSC.stageI.NT with unpublished data
#  
#  dataSubt_LUSC <-TCGAquery_subtype(tumor = "LUSC")
#  sampleStageI_LUSC <- dataSubt_LUSC[dataSubt_LUSC$T.stage %in% c("T1","T1a","T1b"),]
#  dataFilt.LUSC.stageI.TP <- dataFiltTP[,substr(colnames(dataFiltTP),1,12) %in% sampleStageI_LUSC$patient]
#  dataFilt.LUSC.stageI.NT <- dataFiltNT[,substr(colnames(dataFiltNT),1,12) %in% sampleStageI_LUSC$patient]
#  
#  countsTable <-cbind(dataFilt.LUAD.stageI.TP,dataFilt.LUAD.stageI.NT,
#                      dataFilt.LUSC.stageI.TP, dataFilt.LUSC.stageI.NT)
#  
#  AnnotationCounts <- matrix(0,ncol(countsTable),2)
#  colnames(AnnotationCounts) <- c("Samples","Batch")
#  rownames(AnnotationCounts) <- colnames(countsTable)
#  
#  AnnotationCounts <- as.data.frame(AnnotationCounts)
#  AnnotationCounts$Samples <- colnames(countsTable)
#  
#  AnnotationCounts[colnames(dataFilt.LUAD.stageI.TP),"Batch"] <- "LUAD.stageI.TP"
#  AnnotationCounts[colnames(dataFilt.LUAD.stageI.NT),"Batch"] <- "LUAD.stageI.NT"
#  AnnotationCounts[colnames(dataFilt.LUSC.stageI.TP),"Batch"] <- "LUSC.stageI.TP"
#  AnnotationCounts[colnames(dataFilt.LUSC.stageI.NT),"Batch"] <- "LUSC.stageI.NT"
#  
#  countsCorrected <- TCGAbatch_Correction(tabDF =countsTable,
#                                          UnpublishedData = TRUE,
#                                          AnnotationDF = AnnotationCounts)

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  Purity.BRCA <- TCGAtumor_purity(colnames(dataPrep.tcga), 0, 0, 0, 0, 0.7)

## ---- echo=TRUE, results='hide', message=FALSE, warning=FALSE, eval=FALSE-----
#  ##Brain data through Recount2
#  
#  brain.rec <- TCGAquery_recount2(project = "gtex", tissue = "brain")

Try the TCGAbiolinks package in your browser

Any scripts or data that you put into this service are public.

TCGAbiolinks documentation built on Nov. 8, 2020, 5:37 p.m.