knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
knitr::opts_chunk$set( warning=FALSE)
DMRscaler is a method designed to identify features of differential methylation between case and control conditions across a wide range of genomic scale. In the first part of the vignette we step through downloading a publicly available dataset, running the statistical test for difference at individual CG site and a permutation test for estimating enrichment of differentially methylated CGs within a window. After this we run DMRscaler to call DMRs at each layer of window sizes generating a list of results. We then look at the data to visualize how DMRscaler has called and represents these DMR features.
Download the most recent version of DMRscaler from github and place in working directory, if DMRscaler is downloaded to another directory, change the "path_to_DMRscaler" variable below to the path to the DMRscaler directory.
path_to_DMRscaler <- "../DMRscaler" if(!require("devtools", quietly = TRUE )){ install.packages("devtools") library("devtools") } if(!require("DMRscaler")){ devtools::install(path_to_DMRscaler) library("DMRscaler") }
We will use data from GSE149960 from (Köhler F. et al, 2020) with DNA methylation from fibroblasts from progeria patients and controls measured on the Illumina methylation EPIC array.
if(!require("BiocManager", quietly = TRUE )){ install.packages("BiocManager") library("BiocManager") } if(!require("GEOquery")){ BiocManager::install("GEOquery") library("GEOquery") }
## get sample phenotype data table gse <- getGEO("GSE149960", GSEMatrix = TRUE) phen <- gse$GSE149960_series_matrix.txt.gz@phenoData@data rm(gse) ## get methylation data as idat files (NOTE: this saves files locally in working directory, ## unpacked size is 411 Mb) if(!dir.exists("GSE149960/idat")){ ## note: some people have issues using GEOquery to download ## if this is the case, manually downloading into the data ## into the working directory may be necessary getGEOSuppFiles("GSE149960") untar("GSE149960/GSE149960_RAW.tar", exdir = "GSE149960/idat") file.remove("GSE149960/GSE149960_RAW.tar") } idat_files <- list.files("GSE149960/idat", pattern = "idat.gz$", full = TRUE) sapply(idat_files, gunzip, overwrite = TRUE); rm(idat_files)
Preprocessing of idat files is done with minfi here.
if(!require("minfi")){ BiocManager::install("minfi") library("minfi") } if(!require("IlluminaHumanMethylationEPICmanifest")){ BiocManager::install("IlluminaHumanMethylationEPICmanifest") library("IlluminaHumanMethylationEPICmanifest") }
### Reading of idat files done with minfi library ### idats_dir<-"GSE149960/idat" RGSet <- read.metharray.exp(base = idats_dir) GRset.funnorm <- preprocessFunnorm(RGSet);rm(RGSet) snps <- getSnpInfo(object = GRset.funnorm) GRset.funnorm <- dropLociWithSnps(GRset.funnorm, snps=c("SBE", "CpG"), maf=0);rm(snps) rm(idats_dir)
if(!require("IlluminaHumanMethylationEPICanno.ilm10b4.hg19")){ BiocManager::install("IlluminaHumanMethylationEPICanno.ilm10b4.hg19") library("IlluminaHumanMethylationEPICanno.ilm10b4.hg19") }
controls <- grep("control",phen$title) cases <- grep("hgps", phen$title) locs <- getLocations(GRset.funnorm) locs <- data.frame("names"=locs@ranges@NAMES, "pos"=locs@ranges@start, "chr" = rep(locs@seqnames@values, locs@seqnames@lengths)) B <- getBeta(GRset.funnorm)
We use the non-parametric wilcox test for individual CG significance calculations and permutation of these significance values for estimating window level significance.
if(!require("doParallel", quietly = TRUE )){ install.packages("doParallel") library("doParallel") } if(!require("rlang", quietly = TRUE )){ install.packages("rlang") library("rlang") } if(!require("dplyr", quietly = TRUE )){ install.packages("dplyr") library("dplyr") } if(!require("foreach", quietly = TRUE )){ install.packages("foreach") library("foreach") }
registerDoParallel(detectCores()) mwr <- DMRscaler::run_MWW(control_indices = controls , case_indices = cases , Beta = B) locs$chr <- as.factor(locs$chr) locs$scoring_values <- mwr$p_val locs$pval <- mwr$p_val ## get_loc_fdr_pval permutations can take a while... consider using more liberal fdr ## threshold for estimation of pval_cutoff or manually setting pval_cutoff. fdr <- 0.2 pcut_table <- get_loc_fdr_pval(B, cases,controls, wilcox.test, fdr=fdr, return_table = T) pvals_below_fdr_cut <- pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)] if(length(pvals_below_fdr_cut)==0 ){ warning("desired cpg level fdr not achieved at any pvalue threshold, consider using more liberal fdr cutoff") } pval_cutoff <- 10^max(pcut_table$log10pval_cutoff[which(pcut_table$fdr <= fdr)]) print(paste("p-value cutoff set to ", signif(pval_cutoff,2), sep=""))
Uses significance values associated with each CG location and the values from the statistical test and permutations along with windows of adjacent measured CGs defined by the layer_sizes ToDo: This can take
dmrscaler_result <- DMRscaler::dmrscaler(locs=locs, locs_pval_cutoff = pval_cutoff, region_signif_method = "ben", region_signif_cutoff = 0.05, window_type = "k_nearest", window_sizes = c(2,4,8,16,32,64,128), output_type = "complete") head(dmrscaler_result[[length(dmrscaler_result)]])
example_region <- data.frame(chr="chr7",start=155616381, stop=159033499, stringsAsFactors = FALSE)
if(!require("circlize", quietly = TRUE )){ install.packages("circlize") library("circlize") } if(!require("HilbertCurve", quietly = TRUE )){ BiocManager::install("HilbertCurve") library("HilbertCurve") }
col_fun = colorRamp2(c(0,-log10(0.05),-log10(0.01),max(-log10(locs$pval))), c("grey30", "grey60", "red", "red")) level = 6 ## 4^level - 1 = # segments hc_points <- locs[which(locs$chr== example_region$chr ),] hc_max <- nrow(hc_points) hc_col <- col_fun(-log10(hc_points$pval)) hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval)) # hc_size <- hc_points$scoring_values / fdrscaler hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr) hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*2, "mm"), gp = gpar(col = hc_col, fill = hc_col)) hc_polygon(hc, x1 = min(which(hc_points$pos >= example_region$start)), x2 = max(which(hc_points$pos <= example_region$stop)) ) ### now looking only at the specified region level = 4 hc_points <- locs[which(locs$chr== example_region$chr & locs$pos >= example_region$start & locs$pos <= example_region$stop ),] hc_max <- nrow(hc_points) hc_col <- col_fun(-log10(hc_points$pval)) hc_size <- -log10(hc_points$pval) / max(-log10(hc_points$pval)) hc <- HilbertCurve(s=1,e=hc_max, level = level, reference = F, title = example_region$chr) hc_points(hc, x1 = 1:nrow(hc_points), np = NULL, pch=15, size = unit(hc_size*4, "mm"), gp = gpar(col = hc_col, fill = hc_col))
DMRscaler defines differential methylation features iteratively, expanding the size of the window for aggregating on at each step this procedure allows a hierarchical structure to describe a region enriched in differntially methylated CpGs. We look at our example region here
if(!require("networkD3", quietly = TRUE )){ install.packages("networkD3") library("networkD3") }
dmr_tree <- example_generate_dmr_tree(dmrscaler_result = dmrscaler_result, layer=length(dmrscaler_result), chr=example_region$chr, start = example_region$start, stop = example_region$stop) diagonalNetwork(List = dmr_tree, fontSize = 12, fontFamily = "bold", nodeStroke = "black", linkColour = "black", opacity = 1 )
Next we can look at the region using the Gviz package
if(!require("Gviz")){ BiocManager::install("Gviz") library("Gviz") } if(!require("biomaRt")){ BiocManager::install("biomaRt") library("biomaRt") }
## organize data used in tracks group <- character(length=ncol(B)) group[cases] <- "case" group[controls] <- "control" delta_mean_B <- rowMeans(B[,cases]) - rowMeans(B[,controls]) which <- which(locs$chr==example_region$chr & locs$pos >= example_region$start & locs$pos <= example_region$stop) Bsub <- B[which, ] gr <- GRanges(seqnames = example_region$chr, ranges = IRanges(start = locs[which,"pos"], width = 1), mcols = Bsub ) ## set up ideogram track itrack<-IdeogramTrack(genome="hg19", chromosome = example_region$chr) ## set up genome axis track gtrack<-GenomeAxisTrack() ## set up significance track sig_track <- DataTrack(range=gr, data = -log10(locs$pval[which]), type=c("p")) ## set up beta value track beta_track <- DataTrack(range = gr, data = t(Bsub), groups=group,name="Beta", type=c("a","g","confint")) ## set up diff beta track diff_beta_track <- DataTrack(range = gr, data = delta_mean_B[which], type=c("a","g")) ## set up gene model track ### genesub <- example_gene_anno_df(chr=example_region$chr, start=example_region$start, stop=example_region$stop) if(!all(is.na(genesub))){ grtrack <- GeneRegionTrack(genesub, chromosome = example_region$chr, start = example_region$start, stop=example_region$stop, transcriptAnnotation="symbol" ,collapseTranscripts = "meta") }else{grtrack<-GeneRegionTrack()} ## set up DMRscaler results layer tracks layer_track <- list() for(j in 1:length(dmrscaler_result)){ if(nrow(dmrscaler_result[[j]])==0){ layer_track[[j]] <- AnnotationTrack() } else { layer_track[[j]] <- AnnotationTrack(start = dmrscaler_result[[j]]$start, end = dmrscaler_result[[j]]$stop, chromosome = dmrscaler_result[[j]]$chr, strand = "*", genome = "hg19", name=paste("layer", j, sep = "_")) displayPars(layer_track[[j]]) <- list(cex.legend=0.7, cex.axis=0.7, cex.title=0.7, rotation.title=0, stackHeight = 1, shape="box") } } plotTracks(c(list(itrack, gtrack, sig_track, beta_track, diff_beta_track, grtrack), layer_track), from = example_region$start-1, to=example_region$stop+1 )
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.