knitr::opts_chunk$set(echo = TRUE)

Overview

This is a CN analysis workflow which shows different possible ways of normalizing / anlyzing Illumina 450k methylation data.

Installation

The current version of the package can be installed using devtools

#Install devtools if not already available
if (!is.element("devtools", installed.packages()[,1])) {
    install.packages("devtools")
}
devtools::install_github("mknoll/cnAnalysis450k")

Data loading

Idat files can be loaded with read.methyarray() of minfi. This gives an RGChannelSet.

## Use minfiData example dataset
dataset <- minfiData::RGsetEx

Normalization & Transformation

All normalization methods which yield an CN array, can be used in the described workflow. E.g. the BMIQ normalization, which uses and provides only beta values, cannot be used to normalize copy numbers and therefore is not applicable for the provided workflow.

Minfi normalization methods

The resulting RGChannelSet can then be normalized, e.g. using methods from the minfi package:

Different normalization methods return different datasets:

MethylSet preprocessRaw() preprocessIllumina() * preprocessSWAN()

GenomicRatioSet preprocessFunnorm() preprocessQuantile()

Both data structures contain CN assays, which can be retrieved with the getCN() method from the minfi package.

With the conumee workflow (C), only MethylSets can be processed.

Dasen normalization

For more complex normalization strategies, e.g. dasen, RGMethylSets are normalized with preprocessRaw() prior to further calculations (wateRmelon, y_minfi.R). Alternatively, any normalized MethylSet can be further processed by the dasen method.

Z-transformation

An additional z-transformation is advisable for two reasons:

a) Removal of baseline-differences between samples (especially in funnorm / noob / raw / swan and partly in illumina normalized data)

b) Direct comparison of different normalization methods using the same absolute cutoffs for gains / losses.

## use z-transformation
workflow <- "B" #B, A, C

## normalize samples and controls 
## together
normData <- 
    minfi::getCN(minfi::preprocessIllumina(
        dataset))

switch(workflow,
       A = {
           ctrlAll <- normData[,1:3]
           ctrl <- apply(ctrlAll, 1, "median")
           },
       B = {
           #with z-Transformation, illumina
           ctrlAll <- normData[,1:3]
           ctrlAll[is.infinite(ctrlAll)] <- NA
           ctrlAll <- scale(ctrlAll)
           ctrl <- apply(ctrlAll, 1, "median")
           },
       C = {
           #conumee-path, Illumina
           ctrl <- normData[,1:3]
           })

## samples
switch(workflow,
       A = {
           #without z-transformation
           samples <- normData[,4:6]
           },
       B = {
           #with z-transformation
           samples <- normData[,4:6]
           samples[is.infinite(samples)] <- NA
           samples <- scale(samples)
           },
       C = {
           #conumee path, Illumina
           samples <- normData[,4:6]
           },
       {
           stop("Invalid Workflow! Please set workflow to either 'A', 'B' or 'C'.")
           })

Finding segments/bins/transcripts/genes in CN data

When using the conumee path, one can calculate segments / bins and additionally transcript / gene specific CN alterations.

#What information is of interest?
candidates <- "segments" #segments, bins, transcripts, genes

candidatesDATA <- NULL
if (candidates == "segments") {
    if (workflow == "C") {
        #calculate segments with conumee
        candidatesDATA <- cnAnalysis450k::runConumee(samples, ctrl)
        } else {
            candidatesDATA <-
                cnAnalysis450k::findSegments(samples[, , drop = FALSE], ctrl, ctrlAll)
            }
    } else if (candidates == "bins") {
        if (workflow == "C") {
            #calculate bins with conumee
            candidatesDATA <-
                cnAnalysis450k::runConumee(samples, ctrl, what = "bins")
            } else {
                #calculate bins, binsize=50000
                ### CHANGE BINSIZE HERE
                candidatesDATA <-
                    cnAnalysis450k::createBinsFast(samples[, 1:3, drop = FALSE], ctrl,
                                               ctrlAll, binsize = 500000)
                }
        } else if (candidates == "transcripts" || candidates == "genes") {
            ## genenames
            genes <-
                c(
                    "EGFR", "NF1", "PIK3CA", "PTEN", "ARID1B", "ATRX",
                    "CIC", "SETD2", "TSC2","KMT2D", "NOTCH1", "NOTCH2",
                    "VHL", "TP53", "BRCA1", "BRCA2","ATM", "APC", "TERT",
                    "PTCH1","SMO",  "ALK", "MPL", "MDM2", "MDM4", "MYC",
                    "MYCN", "ID2", "PDGFRA","MET", "CDK4", "CDK6","CCND2",                
                    "CDKN2A","PTEN","RB1", "SOX2"
                    )
            egid <-
                AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                      genes,
                                      c("ENTREZID"),
                                      "SYMBOL")
            tx <-
                AnnotationDbi::select(
                    TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene,
                    egid$ENTREZID,
                    columns = "TXNAME",
                    keytype = "GENEID"
                    )$TXNAME
            ## or alternatively, give a vector of transcript names
            #tx <- c("uc003tqh.3", "uc022ado.1", "uc003qqo.3")
            ## or test all transcripts (can take some time)
            tx <-
                GenomicFeatures::transcripts(
                    TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene)$tx_name

            if (workflow == "C") {
                candidatesDATA <-
                    cnAnalysis450k::runConumee(samples, ctrl, what = "transcripts", tx)
                } else {
                    candidatesDATA <-
                        cnAnalysis450k::getTxValues(samples, ctrl, ctrlAll, tx, output = "diff")
                    ##alteratively
                    #candidatesDATA <- cnAnalysis450k::getTxValuesFast(samples, ctrl, ctrlAll, tx)
                    }
            } else {
                stop("Invalid candidate selection! Set to 'segments', 'genes', 
                     'transcripts', 'bins'.")
                }

Create multi-sample matrix (& select differing candidates)

The next step is to create a data matrix out of the aquired CN data. This allows for clustering of samples and therefore identification of similar samples with regards to copy numbers. Here, one can select whether all rows should be considered (set p.val <= 1 / p.selected = 1), or if only differentially regulated candidates will be further processed (e.g. mann-whitney-u test / t-test < 0.01).

candidatesMATRIX <- NULL
if (workflow == "C") {
    candidatesMATRIX <-
        cnAnalysis450k::createConumeeMatrix(candidatesDATA)
    } else {
        if (candidates == "segments") {
            candidatesMATRIX <-
                cnAnalysis450k::createSegmentMatrix(candidatesDATA, p.select = 0.05)
        } else if (candidates == "bins") {
            candidatesMATRIX <- 
                cnAnalysis450k::createBinMatrix(candidatesDATA, pval=1)
        } else  {
            candidatesMATRIX <-
                candidatesDATA$data[which(candidatesDATA$p.val <= 0.05), ]
        }
    }

pheatmap::pheatmap(candidatesMATRIX)

Define cutoffs for gain/loss

Cutoffs for definition of gain / loss might be chosen manually (control not only the p.val but also the effect size) or automatically (especially if one is interested in - even small - contrasts). The effectsize can be given via the effectsize(LOSS, GAIN) parameter; standard is a minimal effectsize of 0.

These cutoffs are then applied to the previously aquired matrix.

This step might be skipped by setting defineCutoffs to 'skip' if one is interested in the continuous data.

For calculation of cutoffs, previous / following rows may be considered (e.g. when screening for segment alterations): This can be achieved by setting the proximity parameter: c(2,1) will use rows [(i-2):(i+1)] for the determination of cutoffs. Chromosomal borders are respected.

defineCutoffs <- "auto" # man, auto, skip
LOSSEFFECT <- 0.15
GAINEFFECT <- 0.1
PROXIMITY <- c(2, 1)

candidatesCUT <- NULL
candidatesFINAL <- NULL
switch(defineCutoffs,
       man = {
           ##manual thresholds
           #set the thresholds as visually appropriate
           ### CHANGE HERE
           candidatesFINAL <-
               cnAnalysis450k::segmentDataAbs(
                   candidatesMATRIX,
                   upper = 0.78,
                   lower = 1.21,
                   ylim = c(0, 2)
                   )
           },
       auto = {
           ##auto thresholds
           if (workflow == "C") {
               #conumee
               if (candidates == "segments") {
                   candidatesCUT <-
                       cnAnalysis450k::findCutoffs(
                           candidatesMATRIX[complete.cases(candidatesMATRIX * 0), , 
                                            drop =FALSE], proximity = PROXIMITY)
               } else {
                       candidatesCUT <-
                           cnAnalysis450k::findCutoffs(
                               candidatesMATRIX[complete.cases(candidatesMATRIX * 0), ,
                                                drop = FALSE])
               }
               candidatesFINAL <-
                   cnAnalysis450k::segmentData(
                       candidatesMATRIX[complete.cases(candidatesMATRIX * 0), ,
                                        drop = FALSE],
                       candidatesCUT,
                       effectsize = c(LOSSEFFECT, GAINEFFECT))
               } else {
                   if (candidates == "segments") {
                       candidatesCUT <-
                           cnAnalysis450k::findCutoffs(candidatesMATRIX, proximity = PROXIMITY)
                       } else {
                           candidatesCUT <- cnAnalysis450k::findCutoffs(candidatesMATRIX)
                           }
                   candidatesFINAL <-
                       cnAnalysis450k::segmentData(candidatesMATRIX,
                                                   candidatesCUT,
                                                   effectsize = c(LOSSEFFECT, GAINEFFECT))
                   }
           },
       skip = {
           ##no thresholds
           candidatesFINAL <- candidatesMATRIX
           },
       {
           stop("Invalid cutoff strategy! Please set defineCutoffs to
                'man', 'auto' or 'skip'.")
           })

Analyze data for enrichment of loss / gain for a given group

Calculate Fisher's exact test for a given group assigment (group vector has to be in the same order as the columns in candidatesFINAL). If binF is set to false, categories for p values will be created (<0.001, <0.01, <0.05, <0.1, >0.1).

## should Fishers' p-values be calculated?
calc <- TRUE #TRUE, FALSE
##if so, should they be aggregated?
binF <- FALSE #TRUE, FALSE


###################
fisherVal <- NULL
group <- NULL #change below
if (calc) {
    if (defineCutoffs == "skip") {
        calc <- FALSE
        stop("Please define & apply cutoffs to data!")
        }

    ##give group assignment: CHANGE HERE!
    group <- c(rep(1, length(candidatesFINAL[1, ]) - 2), rep(2, 2))
    ##CHECK ORDER !!!

    ## calculate fisher values
    fisherVal <-
        cnAnalysis450k::calcFisher(candidatesFINAL, group, bin = binF)
    }

## Colors of legends
anno_color <- NULL
if (binF && calc) {
    # set colors for fisher p-val categories
    anno_color <-
        list(
            Fisher = c(
                "<0.001" = "black",
                "<0.01" = "green",
                "<0.05" = "yellow",
                "<0.1" = "lightgray",
                ">0.1" = "white"
                )
            )
    }

## Row annotation
anno_row <- NULL
if (candidates == "bin" || candidates == "segments") {
    anno_row <-
        data.frame(id = rownames(candidatesFINAL),
                   Chromosome = do.call(rbind, strsplit(rownames(
                       candidatesFINAL
                       ), ":"))[, 1])
    anno_row$Chromosome <-
        factor(anno_row$Chromosome, levels = paste("chr", 1:22, sep = ""))
    rownames(anno_row) <- anno_row[, 1]
    } else if (candidates == "transcripts" || candidates == "genes") {
        require(Homo.sapiens)
        gen_dat <-
            AnnotationDbi::select(Homo.sapiens,
                                  rownames(candidatesFINAL),
                                  c("SYMBOL", "TXCHROM"),
                                  "TXNAME")
        anno_row <-
            data.frame(
                id = rownames(candidatesFINAL),
                Chromosome = gen_dat$TXCHROM,
                Symbol = gen_dat$SYMBOL
                )
        rownames(anno_row) <- anno_row[, 1]
        }
if (calc && !is.null(anno_row)) {
    anno_row$Fisher <- factor(fisherVal)
    }

## Col annotation
anno_col <- NULL
if (!is.null(group)) {
    anno_col <- data.frame(group)
    rownames(anno_col) <- colnames(candidatesFINAL)
    }

##output
pheatmap::pheatmap(
    candidatesFINAL,
    annotation_row = anno_row[, -1, drop = FALSE],
    annotation_col = anno_col[],
    annotation_colors = anno_color,
    cluster_rows = FALSE,
    cluster_cols = TRUE,
    show_rownames = FALSE
    )

Get candidates

Print identified candidates

## get significant canididates list
if (calc) {
    anno_row[which(fisherVal < 0.1), ]
    }


mknoll/cnAnalysis450k documentation built on May 23, 2019, 2:01 a.m.