knitr::opts_chunk$set(echo = TRUE)
When selecting the data, be sure to choose only the samples that end in 01 (TCGA-..-....-01) for solid tissue tumor types and samples that end in 11 (TCGA-..-....-11) for solid tissue normal types. For each cancer type, once the correct settings are selected, click the "Build Archive" button and then download the data. The settings used for the data matrix page and the specific TCGA samples chosen are shown in the following screenshots:
BRCA normal - Filter Settings
BRCA normal - Samples
BRCA tumor - Samples
COAD normal - Filter Settings
COAD normal - Samples
LUSC normal - Filter Settings
LUSC normal - Samples
After unzipping the files that are sent to the email of your choice, save each cancer type (BRCA, COAD, LUSC) as different folders under a new directory, like "datadir" shown below. You will need to update the path to the files.
Load breast tumor and breast normal data:
library(minfi) datadir <- "/Users/Morgan/Documents/methylation_files/breast" clinicalDir <- file.path(datadir,"Clinical/Biotab") sample_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_sample_brca.txt"),sep="\t",stringsAsFactors=FALSE) keep <- sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal") sa mple_tab <- sample_tab[keep,] patient_id <- unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-"))) tumor_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"] normal_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Solid Tissue Normal"] # read tumor data tumor_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_tumor_sample_brca.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(sample_tab, tumor_tab, by="bcr_sample_uuid", suffixes=c(".sample",".tumor"),all.x=TRUE) # read normal data normal_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_normal_control_brca.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(tab, normal_tab, by="bcr_sample_uuid", suffixes=c(".tumor",".normal"),all.x=TRUE) tab$bcr_patient_barcode <- tab$bcr_patient_barcode.tumor ii <- is.na(tab$bcr_patient_barcode) tab$bcr_patient_barcode[ii] <- tab$bcr_patient_barcode.normal[ii] # read patient data patient_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_clinical_patient_brca.txt"),sep="\t",stringsAsFactors=FALSE) names(patient_tab) <- paste("patient",names(patient_tab),sep=".") tmp <- merge(tab,patient_tab,by.x="bcr_patient_barcode",by.y= "patient.bcr_patient_barcode",all.x=TRUE,suffixes=c(".sample",".patient")) tab <- tmp # read meth metadata methMetaDir <- file.path(datadir,"METADATA/JHU_USC__HumanMethylation450") methMeta_tab <- read.delim(file.path(methMetaDir, "jhu-usc.edu_BRCA.HumanMethylation450.1.9.0.sdrf.txt"),sep="\t",stringsAsFactors=FALSE) sample_barcode <- sapply(strsplit(methMeta_tab$Comment..TCGA.Barcode.,split="-"), function(x) paste(x[1:4],collapse="-")) m <- match(tab$bcr_sample_barcode,sample_barcode) tab$Basename <- gsub("_Grn\\.idat","",methMeta_tab$Array.Data.File[m]) tab <- tab[!is.na(tab$Basename),] basedir <- file.path(datadir,"DNA_Methylation/JHU_USC__HumanMethylation450/Level_1") tab$Basename <- file.path(basedir,tab$Basename) keep <- file.exists(paste(tab$Basename,"_Grn.idat",sep="")) breast_targets <- tab objs <- grep("tab",ls(),value=TRUE) rm(list=objs) objs <- grep("dir",ls(),value=TRUE,ignore=TRUE) rm(list=objs) nms <- names(breast_targets) targets.breast <- breast_targets[nms] targets.breast$Status <- factor(ifelse(targets.breast$sample_type== "Primary Tumor","cancer","normal"),levels=c("normal","cancer")) targets.breast$Tissue <- tolower(targets.breast$patient.tumor_tissue_site) targets.breast$Sex <- targets.breast$patient.gender
Load colon normal data:
datadir <- "/Users/Morgan/Documents/methylation_files/colon" clinicalDir <- file.path(datadir,"Clinical/Biotab") sample_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_sample_coad.txt"),sep="\t",stringsAsFactors=FALSE) keep <- sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal") sample_tab <- sample_tab[keep,] patient_id <- unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-"))) tumor_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"] normal_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type== "Solid Tissue Normal"] # read tumor data tumor_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_tumor_sample_coad.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(sample_tab, tumor_tab, by="bcr_sample_uuid", suffixes=c(".sample",".tumor"),all.x=TRUE) # read normal data normal_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_normal_control_coad.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(tab, normal_tab, by="bcr_sample_uuid", suffixes=c(".tumor",".normal"),all.x=TRUE) tab$bcr_patient_barcode <- tab$bcr_patient_barcode.tumor ii <- is.na(tab$bcr_patient_barcode) tab$bcr_patient_barcode[ii] <- tab$bcr_patient_barcode.normal[ii] # read patient data patient_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_clinical_patient_coad.txt"),sep="\t",stringsAsFactors=FALSE) names(patient_tab) <- paste("patient",names(patient_tab),sep=".") tmp <- merge(tab,patient_tab,by.x="bcr_patient_barcode",by.y= "patient.bcr_patient_barcode",all.x=TRUE,suffixes=c(".sample",".patient")) tab <- tmp # read meth metadata methMetaDir <- file.path(datadir,"METADATA/JHU_USC__HumanMethylation450") methMeta_tab <- read.delim(file.path(methMetaDir, "jhu-usc.edu_COAD.HumanMethylation450.1.9.0.sdrf.txt"),sep="\t",stringsAsFactors=FALSE) sample_barcode <- sapply(strsplit(methMeta_tab$Comment..TCGA.Barcode.,split="-"), function(x) paste(x[1:4],collapse="-")) m <- match(tab$bcr_sample_barcode,sample_barcode) tab$Basename <- gsub("_Grn\\.idat","",methMeta_tab$Array.Data.File[m]) tab <- tab[!is.na(tab$Basename),] basedir <- file.path(datadir,"DNA_Methylation/JHU_USC__HumanMethylation450/Level_1") tab$Basename <- file.path(basedir,tab$Basename) keep <- file.exists(paste(tab$Basename,"_Grn.idat",sep="")) colon_targets <- tab objs <- grep("tab",ls(),value=TRUE) rm(list=objs) objs <- grep("dir",ls(),value=TRUE,ignore=TRUE) rm(list=objs) nms <- names(colon_targets) targets.colon <- colon_targets[nms] targets.colon$Status <- factor(ifelse(targets.colon$sample_type== "Primary Tumor","cancer","normal"),levels=c("normal","cancer")) targets.colon$Tissue <- tolower(targets.colon$patient.tumor_tissue_site) targets.colon$Sex <- targets.colon$patient.gender
Load lung normal data:
datadir <- "/Users/Morgan/Documents/methylation_files/lung" clinicalDir <- file.path(datadir,"Clinical/Biotab") sample_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_sample_lusc.txt"),sep="\t",stringsAsFactors=FALSE) keep <- sample_tab$sample_type %in% c("Primary Tumor", "Solid Tissue Normal") sample_tab <- sample_tab[keep,] patient_id <- unique(sapply(strsplit(sample_tab$bcr_sample_barcode,split="-"), function(x) paste(x[1:3],collapse="-"))) tumor_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type=="Primary Tumor"] normal_sample_id <- sample_tab$bcr_sample_uuid[sample_tab$sample_type== "Solid Tissue Normal"] # read tumor data tumor_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_tumor_sample_lusc.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(sample_tab, tumor_tab, by="bcr_sample_uuid", suffixes=c(".sample",".tumor"),all.x=TRUE) # read normal data normal_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_biospecimen_normal_control_lusc.txt"),sep="\t", stringsAsFactors=FALSE) tab <- merge(tab, normal_tab, by="bcr_sample_uuid", suffixes=c(".tumor",".normal"),all.x=TRUE) tab$bcr_patient_barcode <- tab$bcr_patient_barcode.tumor ii <- is.na(tab$bcr_patient_barcode) tab$bcr_patient_barcode[ii] <- tab$bcr_patient_barcode.normal[ii] # read patient data patient_tab <- read.delim(file.path(clinicalDir, "nationwidechildrens.org_clinical_patient_lusc.txt"),sep="\t",stringsAsFactors=FALSE) names(patient_tab) <- paste("patient",names(patient_tab),sep=".") tmp <- merge(tab,patient_tab,by.x="bcr_patient_barcode",by.y= "patient.bcr_patient_barcode",all.x=TRUE,suffixes=c(".sample",".patient")) tab <- tmp # read meth metadata methMetaDir <- file.path(datadir,"METADATA/JHU_USC__HumanMethylation450") methMeta_tab <- read.delim(file.path(methMetaDir, "jhu-usc.edu_LUSC.HumanMethylation450.1.7.0.sdrf.txt"),sep="\t",stringsAsFactors=FALSE) sample_barcode <- sapply(strsplit(methMeta_tab$Comment..TCGA.Barcode.,split="-"), function(x) paste(x[1:4],collapse="-")) m <- match(tab$bcr_sample_barcode,sample_barcode) tab$Basename <- gsub("_Grn\\.idat","",methMeta_tab$Array.Data.File[m]) tab <- tab[!is.na(tab$Basename),] basedir <- file.path(datadir,"DNA_Methylation/JHU_USC__HumanMethylation450/Level_1") tab$Basename <- file.path(basedir,tab$Basename) keep <- file.exists(paste(tab$Basename,"_Grn.idat",sep="")) lung_targets <- tab objs <- grep("tab",ls(),value=TRUE) rm(list=objs) objs <- grep("dir",ls(),value=TRUE,ignore=TRUE) rm(list=objs) nms <- names(lung_targets) targets.lung <- lung_targets[nms] targets.lung$Status <- factor(ifelse(targets.lung$sample_type== "Primary Tumor","cancer","normal"),levels=c("normal","cancer")) targets.lung$Tissue <- tolower(targets.lung$patient.tumor_tissue_site) targets.lung$Sex <- targets.lung$patient.gender rm(list=ls()[!(ls() %in% c('targets.breast','targets.colon','targets.lung'))])
Merge and read methylation data.
merge <- merge(targets.breast,targets.colon,all=TRUE) targets <- merge(merge,targets.lung,all=TRUE) targets <- targets[which(file.exists(paste0(targets$Basename,"_Grn.idat"))),] memory.limit(size=10000) rg_set <- read.metharray(targets$Basename,verbose=TRUE) pData(rg.set) <- targets table(targets$Tissue,targets$Status)
We now have an RGChannelSet that was created from reading the downloaded IDAT files. Next, the data needs to be processed into usable objects for various functions in the minfi package to work properly. The object classes to be used are the RGChannelSet, MethylSet, GenomicMethylSet, RatioSet, or GenomicRatioSet, which all represent different forms of information from the 450k experiment. Most of the analysis will be done on the GenomicRatioSet datatype. To preprocess the data minfi's variety of preprocess...() functions are used to convert the data sets into usable R objects.
Data Input | Processing Function | Output | Analytic Use
---------------------- | ---------------------- | ---------------- | --------------------------
Raw data (IDAT files) | read.metharray() | RGChannelSet | output of reading data
RGChannelSet | preprocessIllumina() | MethylSet | dmpFinder method
MethylSet | mapToGenome() | GenomicMethylSet | blockFinder method
GenomicMethylSet | ratioConvert() | GenomicRatioSet | bumphunter method
These steps make a GenomicRatioSet out of an RGChannelSet.
memory.limit(size=10000) methset <- preprocessIllumina(rg_set) gen_methset <- mapToGenome(methset) gen_ratset <- ratioConvert(gen_methset,type="Illumina")
Finally, we subset the last two objects to chromosomes 10, 11 and 20 for the purposes of this workshop.
gr <- granges(gen_methset) keep <- seqnames(gr) %in% c("chr10", "chr11", "chr20") gen_methset_small <- gen_methset[keep,] gr <- granges(gen_ratset) keep <- seqnames(gr) %in% c("chr10", "chr11", "chr20") gen_ratset_small <- get_ratset[keep,]
These are the objects available in this workshop pacakge.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.