In leroybondhus/dmrscaler: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

knitr::opts_chunk$set(
                      warning=FALSE)

Summary

DMRscaler is a method designed to identify features of differential methylation between case and control conditions across a wide range of genomic scale. In the first part of the vignette we step through downloading a publicly available dataset, running the statistical test for difference at individual CG site and a permutation test for estimating enrichment of differentially methylated CGs within a window. After this we run DMRscaler to call DMRs at each layer of window sizes generating a list of results. We then look at the data to visualize how DMRscaler has called and represents these DMR features.

install DMRscaler

Download the most recent version of DMRscaler from github and place in working directory, if DMRscaler is downloaded to another directory, change the "path_to_DMRscaler" variable below to the path to the DMRscaler directory.

path_to_DMRscaler <- "../DMRscaler"

if(!require("devtools", quietly = TRUE )){
  install.packages("devtools")
  library("devtools")
}
if(!require("DMRscaler")){
  devtools::install(path_to_DMRscaler)
  library("DMRscaler")
}

Example Dataset Setup

We will use data from GSE149960 from (Köhler F. et al, 2020) with DNA methylation from fibroblasts from progeria patients and controls measured on the Illumina methylation EPIC array.

if(!require("BiocManager", quietly = TRUE )){
  install.packages("BiocManager")
  library("BiocManager")
}
if(!require("GEOquery")){
  BiocManager::install("GEOquery")
  library("GEOquery")
}

## get sample phenotype data table 
gse <- getGEO("GSE149960", GSEMatrix = TRUE)
phen <- gse$GSE149960_series_matrix.txt.gz@phenoData@data
rm(gse)

## get methylation data as idat files (NOTE: this saves files locally in working directory,
## unpacked size is 411 Mb)
if(!dir.exists("GSE149960/idat")){
  ## note: some people have issues using GEOquery to download
  ##       if this is the case, manually downloading into the data
  ##       into the working directory may be necessary
  getGEOSuppFiles("GSE149960")
  untar("GSE149960/GSE149960_RAW.tar", exdir = "GSE149960/idat")
  file.remove("GSE149960/GSE149960_RAW.tar")
}
idat_files <- list.files("GSE149960/idat", pattern = "idat.gz$", full = TRUE)
sapply(idat_files, gunzip, overwrite = TRUE); rm(idat_files)

Preprocessing

Preprocessing of idat files is done with minfi here.

if(!require("minfi")){
  BiocManager::install("minfi")
  library("minfi")
}
if(!require("IlluminaHumanMethylationEPICmanifest")){
  BiocManager::install("IlluminaHumanMethylationEPICmanifest")
  library("IlluminaHumanMethylationEPICmanifest")
}

###  Reading of idat files done with minfi library ###
idats_dir<-"GSE149960/idat"
RGSet <- read.metharray.exp(base = idats_dir)
GRset.funnorm <- preprocessFunnorm(RGSet);rm(RGSet)
snps <- getSnpInfo(object = GRset.funnorm)
GRset.funnorm <- dropLociWithSnps(GRset.funnorm, snps=c("SBE", "CpG"), maf=0);rm(snps)
rm(idats_dir)

Set up for DMRscaler

if(!require("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")){
  BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
  library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")
}

controls <- grep("control",phen$title)
cases <- grep("hgps", phen$title)
locs <- getLocations(GRset.funnorm)
locs <- data.frame("names"=locs@ranges@NAMES, "pos"=locs@ranges@start,
                   "chr" = rep(locs@seqnames@values, locs@seqnames@lengths))
B <- getBeta(GRset.funnorm)

Run statistical tests and permutations to feed into DMRscaler method

We use the non-parametric wilcox test for individual CG significance calculations and permutation of these significance values for estimating window level significance.

if(!require("doParallel", quietly = TRUE )){
  install.packages("doParallel")
  library("doParallel")
}
if(!require("rlang", quietly = TRUE )){
  install.packages("rlang")
  library("rlang")
}
if(!require("dplyr", quietly = TRUE )){
  install.packages("dplyr")
  library("dplyr")
}
if(!require("foreach", quietly = TRUE )){
  install.packages("foreach")
  library("foreach")
}

registerDoParallel(detectCores())

mwr <- DMRscaler::run_MWW(control_indices = controls ,
                          case_indices = cases ,
                          Beta = B)

locs$chr <- as.factor(locs$chr)
locs$scoring_values <- mwr$p_val
locs$pval <- mwr$p_val

## get_loc_fdr_pval permutations can take a while... consider using more liberal fdr
## threshold for estimation of pval_cutoff or manually setting pval_cutoff. 
fdr <- 0.2
pcut_table <- get_loc_fdr_pval(B, cases,controls, wilcox.test, fdr=fdr, return_table = T)
pvals_below_fdr_cut <- pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)]
if(length(pvals_below_fdr_cut)==0 ){
  warning("desired cpg level fdr not achieved at any pvalue threshold, consider using more liberal fdr cutoff")
}
pval_cutoff <- 10^max(pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)])
print(paste("p-value cutoff set to ", signif(pval_cutoff,2), sep=""))

Run DMRscaler to call DMRs

Uses significance values associated with each CG location and the values from the statistical test and permutations along with windows of adjacent measured CGs defined by the layer_sizes ToDo: This can take

dmrscaler_result <- DMRscaler::dmrscaler(locs=locs,
                                         locs_pval_cutoff = pval_cutoff,
                                         region_signif_method = "ben",
                                         region_signif_cutoff = 0.05,
                                         window_type = "k_nearest",
                                         window_sizes = c(2,4,8,16,32,64,128),
                                         output_type = "complete")
head(dmrscaler_result[[length(dmrscaler_result)]])

Choose an example region to look at

example_region <- data.frame(chr="chr7",start=155616381, stop=159033499,
                             stringsAsFactors = FALSE)

if(!require("circlize", quietly = TRUE )){
  install.packages("circlize")
  library("circlize")
}
if(!require("HilbertCurve", quietly = TRUE )){
  BiocManager::install("HilbertCurve")
  library("HilbertCurve")
}

col_fun = colorRamp2(c(0,-log10(0.05),-log10(0.01),max(-log10(locs$pval))), c("grey30", "grey60", "red", "red"))
level = 6
## 4^level - 1 = # segments

hc_points <- locs[which(locs$chr== example_region$chr ),]
hc_max <- nrow(hc_points)
hc_col <- col_fun(-log10(hc_points$pval))
hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval))

#  hc_size <- hc_points$scoring_values / fdrscaler
hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*2, "mm"),
          gp = gpar(col = hc_col, fill = hc_col))
hc_polygon(hc, x1 = min(which(hc_points$pos >= example_region$start)),
           x2 = max(which(hc_points$pos <= example_region$stop)) )

### now looking only at the specified region
level = 4
hc_points <- locs[which(locs$chr== example_region$chr & 
                          locs$pos >= example_region$start & 
                          locs$pos <= example_region$stop ),]
hc_max <- nrow(hc_points)
hc_col <- col_fun(-log10(hc_points$pval))
hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval))

hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr)
hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*4, "mm"),
          gp = gpar(col = hc_col, fill = hc_col))

Explore some features of the data

DMRscaler defines differential methylation features iteratively, expanding the size of the window for aggregating on at each step this procedure allows a hierarchical structure to describe a region enriched in differntially methylated CpGs. We look at our example region here

if(!require("networkD3", quietly = TRUE )){
  install.packages("networkD3")
  library("networkD3")
}

dmr_tree <- example_generate_dmr_tree(dmrscaler_result = dmrscaler_result,
                                      layer=length(dmrscaler_result),
                                      chr=example_region$chr,
                                      start = example_region$start,
                                      stop = example_region$stop)
diagonalNetwork(List = dmr_tree, fontSize = 12, fontFamily = "bold",
                nodeStroke = "black", linkColour = "black", opacity = 1 )

Next we can look at the region using the Gviz package

if(!require("Gviz")){
  BiocManager::install("Gviz")
  library("Gviz")
}
if(!require("biomaRt")){
  BiocManager::install("biomaRt")
  library("biomaRt")
}

## organize data used in tracks
group <- character(length=ncol(B)) 
group[cases] <- "case"
group[controls] <- "control"
delta_mean_B <- rowMeans(B[,cases]) - rowMeans(B[,controls]) 
which <- which(locs$chr==example_region$chr & locs$pos >= example_region$start & locs$pos <= example_region$stop)
Bsub <- B[which, ]
gr <- GRanges(seqnames = example_region$chr, ranges = IRanges(start = locs[which,"pos"], width = 1), mcols = Bsub )

## set up ideogram track
itrack<-IdeogramTrack(genome="hg19", chromosome = example_region$chr)
## set up genome axis track
gtrack<-GenomeAxisTrack()
## set up significance track
sig_track <- DataTrack(range=gr, data = -log10(locs$pval[which]), type=c("p"))
## set up beta value track
beta_track <- DataTrack(range = gr, data = t(Bsub), groups=group,name="Beta", type=c("a","g","confint"))
## set up diff beta track
diff_beta_track <- DataTrack(range = gr, data = delta_mean_B[which], type=c("a","g"))

## set up gene model track ### 
genesub <- example_gene_anno_df(chr=example_region$chr, start=example_region$start, stop=example_region$stop)
if(!all(is.na(genesub))){
  grtrack <- GeneRegionTrack(genesub, chromosome = example_region$chr, start = example_region$start, stop=example_region$stop, transcriptAnnotation="symbol" ,collapseTranscripts = "meta")
}else{grtrack<-GeneRegionTrack()}

## set up DMRscaler results layer tracks
layer_track <- list()
for(j in 1:length(dmrscaler_result)){
  if(nrow(dmrscaler_result[[j]])==0){
    layer_track[[j]] <- AnnotationTrack()
  } else {
    layer_track[[j]] <- AnnotationTrack(start = dmrscaler_result[[j]]$start, 
                                        end =  dmrscaler_result[[j]]$stop,
                                        chromosome = dmrscaler_result[[j]]$chr,
                                        strand = "*", genome = "hg19", name=paste("layer", j, sep = "_"))
    displayPars(layer_track[[j]]) <- list(cex.legend=0.7, cex.axis=0.7, cex.title=0.7, rotation.title=0, stackHeight = 1, shape="box")
  }
}


plotTracks(c(list(itrack, gtrack, sig_track, beta_track, diff_beta_track, grtrack), layer_track), from = example_region$start-1, to=example_region$stop+1 )