knitr::opts_chunk$set(echo = TRUE)
This is a CN analysis workflow which shows different possible ways of normalizing / anlyzing Illumina 450k methylation data.
The current version of the package can be installed using devtools
#Install devtools if not already available if (!is.element("devtools", installed.packages()[,1])) { install.packages("devtools") } devtools::install_github("mknoll/cnAnalysis450k")
Idat files can be loaded with read.methyarray() of minfi. This gives an RGChannelSet.
## Use minfiData example dataset dataset <- minfiData::RGsetEx
All normalization methods which yield an CN array, can be used in the described workflow. E.g. the BMIQ normalization, which uses and provides only beta values, cannot be used to normalize copy numbers and therefore is not applicable for the provided workflow.
The resulting RGChannelSet can then be normalized, e.g. using methods from the minfi package:
Different normalization methods return different datasets:
MethylSet preprocessRaw() preprocessIllumina() * preprocessSWAN()
GenomicRatioSet preprocessFunnorm() preprocessQuantile()
Both data structures contain CN assays, which can be retrieved with the getCN() method from the minfi package.
With the conumee workflow (C), only MethylSets can be processed.
For more complex normalization strategies, e.g. dasen, RGMethylSets are normalized with preprocessRaw() prior to further calculations (wateRmelon, y_minfi.R). Alternatively, any normalized MethylSet can be further processed by the dasen method.
An additional z-transformation is advisable for two reasons:
a) Removal of baseline-differences between samples (especially in funnorm / noob / raw / swan and partly in illumina normalized data)
b) Direct comparison of different normalization methods using the same absolute cutoffs for gains / losses.
## use z-transformation workflow <- "B" #B, A, C ## normalize samples and controls ## together normData <- minfi::getCN(minfi::preprocessIllumina( dataset)) switch(workflow, A = { ctrlAll <- normData[,1:3] ctrl <- apply(ctrlAll, 1, "median") }, B = { #with z-Transformation, illumina ctrlAll <- normData[,1:3] ctrlAll[is.infinite(ctrlAll)] <- NA ctrlAll <- scale(ctrlAll) ctrl <- apply(ctrlAll, 1, "median") }, C = { #conumee-path, Illumina ctrl <- normData[,1:3] }) ## samples switch(workflow, A = { #without z-transformation samples <- normData[,4:6] }, B = { #with z-transformation samples <- normData[,4:6] samples[is.infinite(samples)] <- NA samples <- scale(samples) }, C = { #conumee path, Illumina samples <- normData[,4:6] }, { stop("Invalid Workflow! Please set workflow to either 'A', 'B' or 'C'.") })
When using the conumee path, one can calculate segments / bins and additionally transcript / gene specific CN alterations.
#What information is of interest? candidates <- "segments" #segments, bins, transcripts, genes candidatesDATA <- NULL if (candidates == "segments") { if (workflow == "C") { #calculate segments with conumee candidatesDATA <- cnAnalysis450k::runConumee(samples, ctrl) } else { candidatesDATA <- cnAnalysis450k::findSegments(samples[, , drop = FALSE], ctrl, ctrlAll) } } else if (candidates == "bins") { if (workflow == "C") { #calculate bins with conumee candidatesDATA <- cnAnalysis450k::runConumee(samples, ctrl, what = "bins") } else { #calculate bins, binsize=50000 ### CHANGE BINSIZE HERE candidatesDATA <- cnAnalysis450k::createBinsFast(samples[, 1:3, drop = FALSE], ctrl, ctrlAll, binsize = 500000) } } else if (candidates == "transcripts" || candidates == "genes") { ## genenames genes <- c( "EGFR", "NF1", "PIK3CA", "PTEN", "ARID1B", "ATRX", "CIC", "SETD2", "TSC2","KMT2D", "NOTCH1", "NOTCH2", "VHL", "TP53", "BRCA1", "BRCA2","ATM", "APC", "TERT", "PTCH1","SMO", "ALK", "MPL", "MDM2", "MDM4", "MYC", "MYCN", "ID2", "PDGFRA","MET", "CDK4", "CDK6","CCND2", "CDKN2A","PTEN","RB1", "SOX2" ) egid <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db, genes, c("ENTREZID"), "SYMBOL") tx <- AnnotationDbi::select( TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene, egid$ENTREZID, columns = "TXNAME", keytype = "GENEID" )$TXNAME ## or alternatively, give a vector of transcript names #tx <- c("uc003tqh.3", "uc022ado.1", "uc003qqo.3") ## or test all transcripts (can take some time) tx <- GenomicFeatures::transcripts( TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene)$tx_name if (workflow == "C") { candidatesDATA <- cnAnalysis450k::runConumee(samples, ctrl, what = "transcripts", tx) } else { candidatesDATA <- cnAnalysis450k::getTxValues(samples, ctrl, ctrlAll, tx, output = "diff") ##alteratively #candidatesDATA <- cnAnalysis450k::getTxValuesFast(samples, ctrl, ctrlAll, tx) } } else { stop("Invalid candidate selection! Set to 'segments', 'genes', 'transcripts', 'bins'.") }
The next step is to create a data matrix out of the aquired CN data. This allows for clustering of samples and therefore identification of similar samples with regards to copy numbers. Here, one can select whether all rows should be considered (set p.val <= 1 / p.selected = 1), or if only differentially regulated candidates will be further processed (e.g. mann-whitney-u test / t-test < 0.01).
candidatesMATRIX <- NULL if (workflow == "C") { candidatesMATRIX <- cnAnalysis450k::createConumeeMatrix(candidatesDATA) } else { if (candidates == "segments") { candidatesMATRIX <- cnAnalysis450k::createSegmentMatrix(candidatesDATA, p.select = 0.05) } else if (candidates == "bins") { candidatesMATRIX <- cnAnalysis450k::createBinMatrix(candidatesDATA, pval=1) } else { candidatesMATRIX <- candidatesDATA$data[which(candidatesDATA$p.val <= 0.05), ] } } pheatmap::pheatmap(candidatesMATRIX)
Cutoffs for definition of gain / loss might be chosen manually (control not only the p.val but also the effect size) or automatically (especially if one is interested in - even small - contrasts). The effectsize can be given via the effectsize(LOSS, GAIN) parameter; standard is a minimal effectsize of 0.
These cutoffs are then applied to the previously aquired matrix.
This step might be skipped by setting defineCutoffs to 'skip' if one is interested in the continuous data.
For calculation of cutoffs, previous / following rows may be considered (e.g. when screening for segment alterations): This can be achieved by setting the proximity parameter: c(2,1) will use rows [(i-2):(i+1)] for the determination of cutoffs. Chromosomal borders are respected.
defineCutoffs <- "auto" # man, auto, skip LOSSEFFECT <- 0.15 GAINEFFECT <- 0.1 PROXIMITY <- c(2, 1) candidatesCUT <- NULL candidatesFINAL <- NULL switch(defineCutoffs, man = { ##manual thresholds #set the thresholds as visually appropriate ### CHANGE HERE candidatesFINAL <- cnAnalysis450k::segmentDataAbs( candidatesMATRIX, upper = 0.78, lower = 1.21, ylim = c(0, 2) ) }, auto = { ##auto thresholds if (workflow == "C") { #conumee if (candidates == "segments") { candidatesCUT <- cnAnalysis450k::findCutoffs( candidatesMATRIX[complete.cases(candidatesMATRIX * 0), , drop =FALSE], proximity = PROXIMITY) } else { candidatesCUT <- cnAnalysis450k::findCutoffs( candidatesMATRIX[complete.cases(candidatesMATRIX * 0), , drop = FALSE]) } candidatesFINAL <- cnAnalysis450k::segmentData( candidatesMATRIX[complete.cases(candidatesMATRIX * 0), , drop = FALSE], candidatesCUT, effectsize = c(LOSSEFFECT, GAINEFFECT)) } else { if (candidates == "segments") { candidatesCUT <- cnAnalysis450k::findCutoffs(candidatesMATRIX, proximity = PROXIMITY) } else { candidatesCUT <- cnAnalysis450k::findCutoffs(candidatesMATRIX) } candidatesFINAL <- cnAnalysis450k::segmentData(candidatesMATRIX, candidatesCUT, effectsize = c(LOSSEFFECT, GAINEFFECT)) } }, skip = { ##no thresholds candidatesFINAL <- candidatesMATRIX }, { stop("Invalid cutoff strategy! Please set defineCutoffs to 'man', 'auto' or 'skip'.") })
Calculate Fisher's exact test for a given group assigment (group vector has to be in the same order as the columns in candidatesFINAL). If binF is set to false, categories for p values will be created (<0.001, <0.01, <0.05, <0.1, >0.1).
## should Fishers' p-values be calculated? calc <- TRUE #TRUE, FALSE ##if so, should they be aggregated? binF <- FALSE #TRUE, FALSE ################### fisherVal <- NULL group <- NULL #change below if (calc) { if (defineCutoffs == "skip") { calc <- FALSE stop("Please define & apply cutoffs to data!") } ##give group assignment: CHANGE HERE! group <- c(rep(1, length(candidatesFINAL[1, ]) - 2), rep(2, 2)) ##CHECK ORDER !!! ## calculate fisher values fisherVal <- cnAnalysis450k::calcFisher(candidatesFINAL, group, bin = binF) } ## Colors of legends anno_color <- NULL if (binF && calc) { # set colors for fisher p-val categories anno_color <- list( Fisher = c( "<0.001" = "black", "<0.01" = "green", "<0.05" = "yellow", "<0.1" = "lightgray", ">0.1" = "white" ) ) } ## Row annotation anno_row <- NULL if (candidates == "bin" || candidates == "segments") { anno_row <- data.frame(id = rownames(candidatesFINAL), Chromosome = do.call(rbind, strsplit(rownames( candidatesFINAL ), ":"))[, 1]) anno_row$Chromosome <- factor(anno_row$Chromosome, levels = paste("chr", 1:22, sep = "")) rownames(anno_row) <- anno_row[, 1] } else if (candidates == "transcripts" || candidates == "genes") { require(Homo.sapiens) gen_dat <- AnnotationDbi::select(Homo.sapiens, rownames(candidatesFINAL), c("SYMBOL", "TXCHROM"), "TXNAME") anno_row <- data.frame( id = rownames(candidatesFINAL), Chromosome = gen_dat$TXCHROM, Symbol = gen_dat$SYMBOL ) rownames(anno_row) <- anno_row[, 1] } if (calc && !is.null(anno_row)) { anno_row$Fisher <- factor(fisherVal) } ## Col annotation anno_col <- NULL if (!is.null(group)) { anno_col <- data.frame(group) rownames(anno_col) <- colnames(candidatesFINAL) } ##output pheatmap::pheatmap( candidatesFINAL, annotation_row = anno_row[, -1, drop = FALSE], annotation_col = anno_col[], annotation_colors = anno_color, cluster_rows = FALSE, cluster_cols = TRUE, show_rownames = FALSE )
Print identified candidates
## get significant canididates list if (calc) { anno_row[which(fisherVal < 0.1), ] }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.