knitr::opts_chunk$set(echo = TRUE, warning=FALSE)
my.project <- gsub('TCGA-', '', params$project)
Install r my.project
.data by using devtools
package.
Load the library
Load the required datasets (one or more of the following)
clinical
fpkm.per.tissue
fpkm.per.tissue.barcode
mutation
gdc
# load library or use directly if insta install.packages('devtools') # The library can also be loaded and use the function install_git without 'devtools::' prefix devtools::install_url('http://sels.tecnico.ulisboa.pt/gitlab/SOUND/brca.data/repository/archive.zip') # # Load the brca.data package library(brca.data) # start using the data, for example the tissue data data(fpkm.per.tissue) # tissue is now in the enviromnet and will be loaded on the first # time it is used. For example: names(fpkm.per.tissue)
The Cancer Genome Atlas r params$project
data collection is part of a larger effort to build a research community focused on connecting cancer phenotypes to genotypes by providing clinical images matched to subjects from The Cancer Genome Atlas (TCGA).
The data is publicly available (https://gdc-portal.nci.nih.gov/) and is decomposed into two data sets:
the gene expression data;
the clinical data.
The following links explain (1) the individuals' barcode and (2) the sample type code:
Explanation of TCGA barcode using example TCGA-02-0001-01C-01D-0182-01
| Label | Identifier for | Value | Value Description | Possible Values | |-------------|--------------------|-------|-------------------|-----------------| | Project | Project name | TCGA | TCGA project | TCGA | | TSS | Tissue source site | 02 | GBM (brain tumor) sample from MD Anderson | See Code Tables Report | | Participant | Study participant | 0001 | The first participant from MD Anderson for GBM study | Any alpha-numeric value | | Sample | Sample type | 01 | A solid tumor | Tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29. See Code Tables Report for a complete list of sample codes | | Vial | Order of sample in a sequence of samples | C | The third vial | A to Z | | Portion | Order of portion in a sequence of 100 - 120 mg sample portions | 01 | The first portion of the sample | 01-99 | | Analyte | Molecular type of analyte for analysis | D | The analyte is a DNA sample | See Code Tables Report | | Plate | Order of plate in a sequence of 96-well plates | 0182 | The 182nd plate | 4-digit alphanumeric value | | Center | Sequencing or characterization center that will receive the aliquot for analysis | 01 | The Broad Institute GCC | See Code Tables Report |
# # # # # This now documents the generation of the dataset. # # No longer a generic documentation of the package! # # #
The data set can also be extracted through the TCGAbiolinks R bioconductor package
source("https://bioconductor.org/biocLite.R") biocLite("TCGAbiolinks")
r params$project
)Change project
variable to get other data projects in TCGA.
library(TCGAbiolinks) project <- params$project cached.file <- file.path('..', 'tcga-biolinks-cache.RData') if (file.exists(cached.file)) { load(cached.file) } else { query <- list() # Query the GDC to obtain data query$clinical <- GDCquery(project = project, data.category = "Clinical") # Download query data download.out <- GDCdownload(query$clinical, method = 'api') # Prepare R object with data gdc <- list() gdc$clinical <- GDCprepare_clinic(query$clinical, 'patient') gdc$drug <- GDCprepare_clinic(query$clinical, 'drug') gdc$radiation <- GDCprepare_clinic(query$clinical, 'radiation') gdc$admin <- GDCprepare_clinic(query$clinical, 'admin') gdc$follow.up <- GDCprepare_clinic(query$clinical, 'follow_up') gdc$stage_event <- GDCprepare_clinic(query$clinical, 'stage_event') gdc$new_tumor_event <- GDCprepare_clinic(query$clinical, 'new_tumor_event') # query$rnaseq <- GDCquery(project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - FPKM") # GDCdownload(query$rnaseq, method = 'api', chunks.per.download = 6) gdc$rnaseq <- GDCprepare(query$rnaseq) # query$mutation <- GDCquery(project = project, data.category = "Simple Nucleotide Variation", data.type = "Masked Somatic Mutation", workflow.type = "MuSE Variant Aggregation and Masking") GDCdownload(query$mutation, method = 'api', chunks.per.download = 6) gdc$mutation <- GDCprepare(query$mutation) # clean up table by removing all rows that are NA gdc$mutation <- gdc$mutation[,colSums(is.na(gdc$mutation)) != nrow(gdc$mutation)] # save(gdc, query, file = cached.file) }
# Library to print some information library(futile.logger) library(ggplot2) library(tibble) library(dplyr) library(Matrix) # # this is distributed as part of the package getParticipant <- function(fullBarcode) { source('../R/functions.R') getParticipantCode(fullBarcode) } getSample <- function(fullBarcode) { source('../R/functions.R') getSampleTypeCode(fullBarcode) }
Pattern to retrieve TCGA patient and sample type from extended barcode.
# $1 gets the TCGA participant portion # $2 gets the sample type tcga.barcode.pattern <- '(TCGA-[A-Z0-9a-z]{2}-[a-zA-Z0-9]{4})-([0-9]{2}).+' # # patient barcode for each column in tcga.data$dat fpkm.per.tissue.barcode <- list() fpkm.per.tissue.barcode$all <- getParticipant(gdc$rnaseq@colData$barcode) # # getting all tisues fpkm.per.tissue <- list() fpkm.per.tissue$all <- as.matrix(gdc$rnaseq@assays$data[['HTSeq - FPKM']]) colnames(fpkm.per.tissue$all) <- gdc$rnaseq@colData$barcode # rownames(fpkm.per.tissue$all) <- gdc$rnaseq@rowRanges$ensembl_gene_id # gene.ranges <- gdc$rnaseq@rowRanges # # clinical data rownames(gdc$clinical) <- gdc$clinical$bcr_patient_barcode gdc$clinical <- gdc$clinical clinical <- list() clinical$all <- gdc$clinical
See Code Tables Report for a complete list of sample codes.
# sample type per sample tissue.type <- as.numeric(getSample(colnames(fpkm.per.tissue$all))) # # mapping for tissue type tissue.mapping <- list( 'primary.solid.tumor' = 01, 'recurrent.solid.tumor' = 02, 'primary.blood.derived.cancer.peripheral.blood' = 03, 'recurrent.blood.derived.cancer.bone.marrow' = 04, 'additional.new.primary' = 05, 'metastatic' = 06, 'additional.metastatic' = 07, 'human.tumor.original.cells' = 08, 'primary.blood.derived.cancer.bone.marrow' = 09, 'blood.derived.normal' = 10, 'solid.tissue.normal' = 11, 'buccal.cell.normal' = 12, 'ebv.immortalized.normal' = 13, 'bone.marrow.normal' = 14, 'sample.type.15' = 15, 'sample.type.16' = 16, 'control.analyte' = 20, 'recurrent.blood.derived.cancer.peripheral.blood' = 40, 'cell.lines' = 50, 'primary.xenograft.tissue' = 60, 'cell.line.derived.xenograft.tissue' = 61, 'sample.type.99' = 99 ) # # Types of tissues tissue.ix <- list() for (el in names(tissue.mapping)) { tissue.ix[[el]] <- (tissue.type == tissue.mapping[[el]]) if (any(tissue.ix[[el]])) { fpkm.per.tissue[[el]] <- as.matrix(fpkm.per.tissue$all[,tissue.ix[[el]]]) colnames(fpkm.per.tissue[[el]]) <- colnames(fpkm.per.tissue$all)[tissue.ix[[el]]] rownames(fpkm.per.tissue[[el]]) <- rownames(fpkm.per.tissue$all) } }
sample.size <- c() for (el in names(fpkm.per.tissue)) { sample.size <- rbind(sample.size, c(ncol(fpkm.per.tissue[[el]]), nrow(fpkm.per.tissue[[el]]))) } rownames(sample.size) <- names(fpkm.per.tissue) colnames(sample.size) <- c('# of Samples', '# of Genes') futile.logger::flog.info('Tissue information per tissue type:', sample.size, capture = TRUE)
tissue
in tissue.barcode
for (el in names(fpkm.per.tissue)) { fpkm.per.tissue.barcode[[el]] <- getParticipant(colnames(fpkm.per.tissue[[el]])) }
# # Add survival event time column survival.event.tmp <- clinical$all[, c('days_to_death', "days_to_last_followup", "vital_status")] # vital.status.ix <- !is.na(clinical$all$days_to_death) # clinical$all$surv_event_time[ vital.status.ix] <- clinical$all[ vital.status.ix,'days_to_death'] clinical$all$surv_event_time[!vital.status.ix] <- clinical$all[!vital.status.ix,'days_to_last_followup'] # # for (el in names(fpkm.per.tissue)[names(fpkm.per.tissue) != 'all']) { clinical[[el]] <- clinical$all[unique(fpkm.per.tissue.barcode[[el]]),] } # sample.size <- c() for (el in names(clinical)) { sample.size <- rbind(sample.size, c(nrow(clinical[[el]]), ncol(clinical[[el]]))) } rownames(sample.size) <- names(fpkm.per.tissue) colnames(sample.size) <- c('# of Samples', '# of Features') futile.logger::flog.info('Clinical information per tissue type:', sample.size, capture = TRUE)
# # Somatic mutations data mutations <- matrix(0, ncol = nrow(gdc$clinical), nrow = length(gdc$mutation$Gene), dimnames = list(gdc$mutation$Gene, gdc$clinical$bcr_patient_barcode)) # get a temporary index list for fast access ix.list <- cbind(gdc$mutation$Gene, getParticipant(gdc$mutation$Tumor_Sample_Barcode)) ix.list <- ix.list[!is.na(gdc$mutation$Gene),] # this is a long run, should be as simple as possible for(ix in seq(nrow(ix.list))) { mutations[ix.list[ix,1], ix.list[ix,2]] <- mutations[ix.list[ix,1], ix.list[ix,2]] + 1 }
# analyse raw mutation results mutations.by.case <- data.frame(ix = seq(ncol(mutations)), case = colSums(mutations), unique = colSums(mutations != 0)) mutations.by.case$rel.unique.case <- mutations.by.case$case / mutations.by.case$unique # flog.info('Summary of count of mutations (with repeated)', summary(mutations.by.case$case), capture = TRUE) flog.info('Summary of count of mutations (unique only)', summary(mutations.by.case$unique), capture = TRUE) # ggplot(data = mutations.by.case) + theme_minimal() + geom_point(data = base::subset(mutations.by.case, rel.unique.case <= 1.2), aes(x = ix, y = unique, colour = rel.unique.case)) + geom_point(data = base::subset(mutations.by.case, rel.unique.case > 1.2), aes(x = ix, y = unique, colour = rel.unique.case)) + geom_point(data = base::subset(mutations.by.case, rel.unique.case > 1.4), aes(x = ix, y = unique, colour = rel.unique.case)) + geom_point(data = base::subset(mutations.by.case, rel.unique.case > 1.6), aes(x = ix, y = unique, colour = rel.unique.case)) + geom_point(data = base::subset(mutations.by.case, rel.unique.case > 1.8), aes(x = ix, y = unique, colour = rel.unique.case)) + scale_colour_gradient('Sum/Unique (per case)', high = 'red', low = 'green') + labs(title="Histogram for Mutations by Case") + labs(x="Age", y="Unique count of mutations")
Other than duplicates identified from different samples, all other mutations seem to be on different positions of the same gene.
Therefore, they will be counted!
# Filter out cases that have unique mutations my.dup <- mutations.by.case[mutations.by.case$case != mutations.by.case$unique,] # Sort by descending order of cases that have biggest difference between mutations and unique count my.dup <- my.dup[sort(my.dup$case / my.dup$unique, index.return = TRUE, decreasing = TRUE)$ix,] # Look at top case! test.case <- rownames(my.dup) # Get the GDC rows that map to the top case test.mat <- gdc$mutation[getParticipantCode(gdc$mutation$Tumor_Sample_Barcode) %in% test.case,] # Choose only genes that are duplicated uniq.id <- paste0(test.mat$Hugo_Symbol, getParticipantCode(test.mat$Tumor_Sample_Barcode)) test.mat <- test.mat[duplicated(uniq.id) | duplicated(uniq.id, fromLast = TRUE),] # Sort by Hugo_Symbol to have the same gene in consecutive lines test.mat <- arrange(test.mat, Hugo_Symbol, barcode = getParticipantCode(test.mat$Tumor_Sample_Barcode)) # Chosen randomly in brca # A2ML1 <- same type of mutation on the same gene # AAK1 <- two types of variant on different positions on the same gene # AC097711.1 and CEP128 <- Two different types of samples (tumour / metastases) #gene.test.list <- c('AAK1', 'A2ML1', 'AC097711.1') gene.test.list <- sample(unique(test.mat$Hugo_Symbol), 5) diff.columns <- array(FALSE, ncol(test.mat)) # add some important ones diff.columns[c(1,5,6,7,8,9)] <- TRUE for (gene.test in gene.test.list) { my.case <- test.mat[test.mat$Hugo_Symbol == gene.test,] for (ix in 2:nrow(my.case)) { diff.columns = diff.columns | as.vector(my.case[ix - 1,] != my.case[ix,]) } diff.columns[is.na(diff.columns)] <- FALSE } my.analysis <- test.mat[test.mat$Hugo_Symbol %in% gene.test.list,diff.columns] # Only a subset of columns are displayed my.analysis
Cases to ignore:
A2ML1
we found multiple mutations on different postition on the same geneAAK1
two types of variant on different positions on the same geneCases to remove duplicates:
CEP128
and AC097711.1
we found that there are duplicates from different samples per individual (Tumour and Metastases), which we will discard one.mutation <- list() # remove duplicate mutations on same position mutation$data <- distinct(gdc$mutation, Hugo_Symbol, Chromosome, Start_Position, End_Position, Strand, .keep_all = T) dups.ix <- paste0(gdc$mutation$Hugo_Symbol, gdc$mutation$Chromosome, gdc$mutation$Start_Position, gdc$mutation$End_Position, gdc$mutation$Strand) # # check if the number of duplicated are the same # flog.info('Is the number of discarded row from distict the same as duplicated? R: %s', nrow(gdc$mutation) - nrow(mutation) == sum(duplicated(dups.ix))) # # mutation$dups <- gdc$mutation[duplicated(dups.ix) | duplicated(dups.ix, fromLast = TRUE),] mutation$dups <- dplyr::arrange(mutation$dups, Hugo_Symbol, Tumor_Sample_Barcode)
mutation$count <- matrix(integer(1), ncol = nrow(gdc$clinical), nrow = length(mutation$data$Gene), dimnames = list(mutation$data$Gene, gdc$clinical$bcr_patient_barcode)) # get a temporary index list for fast access ix.list <- cbind(mutation$data$Gene, getParticipant(mutation$data$Tumor_Sample_Barcode)) ix.list <- ix.list[!is.na(mutation$data$Gene),] # this is a long run, should be as simple as possible for(ix in seq(nrow(ix.list))) { mutation$count[ix.list[ix,1], ix.list[ix,2]] <- mutation$count[ix.list[ix,1], ix.list[ix,2]] + 1 } mutation$count <- Matrix(mutation$count, sparse = TRUE)
fpkm.per.tissue
: gene expression data for all types of tissues, see names(tissue)
;fpkm.per.tissue.barcode
: Patient's participation data code (TCGA-XX-XXXX) per type of tissue;clinical
: Clinical data per tissue type. Has the same structure as tissue.mutation
: Mutation data from GDC (filtered) and a matrix of counts.# Extract all expression data to a different variable to remove redundancy in `tissue` fpkm.per.tissue$all <- NULL gdc$rnaseq <- 'Use function loadGDCRnaSeq to get the original assay' # # # Uncomment to update data variables # devtools::use_data(gdc, overwrite = TRUE) devtools::use_data(fpkm.per.tissue, overwrite = TRUE) devtools::use_data(fpkm.per.tissue.barcode, overwrite = TRUE) devtools::use_data(clinical, overwrite = TRUE) devtools::use_data(gene.ranges, overwrite = TRUE) devtools::use_data(mutation, overwrite = TRUE)
rmarkdown::render('build_data.Rmd', out.dir = '.')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.