knitr::opts_chunk$set(echo = TRUE) set.seed(66)
Loading the data and recoding the phenotype from factor to numeric.
library(WaveletScreening) load("simulated_data.Rdata") colon.state[1:4] pheno <- as.numeric(colon.state)-1 pheno[1:4]
The data are a compound of two elements: 1) The base pair position of each CpG, 2) The individual CpG level (M value)
temp.pos[1:10]#the base pair position of each CpG length(temp.pos)#total number of CpGs dim(sim_methy_all)#lines= CpG level, column individual sim_methy_all[1:10,]
Here, we divide the data into regions that have at least 17 CpGs, separated by a maximum distance of 500 bp. This results in a missingness rate of 1.89%.
thresh <- 500 tl <- split(temp.pos , cumsum(c(1, diff(temp.pos) > thresh) ) ) #Remove cases that are too far apart #Level of analysis 4 tl <- tl[-which(lengths(tl)< 17)]#List of the positions of CpGs that we will analyze later sum(lengths(tl))/length( temp.pos)#percent missingness hist( lengths(tl),nclass = 100, main="Histogram of the number of CpGs per region")
In this section, we run Wavelet Screening on each region separately. This takes about 3 to 4 minutes on a single CPU.
library(WaveletScreening) tt <- proc.time() res <- list() for ( i in 1 :length(tl)) { reg_mat <- sim_methy_all[which( temp.pos %in% tl[[i]]),] #Select CpGs in region i bp <- temp.pos[which( temp.pos %in% tl[[i]])]#Correspond to the base position of the CpG in region i res[[i]] <- Wavelet_screening( Y=pheno, loci=reg_mat, lev_res=4, coeftype = "c", bp = bp, base_shrink = 1/26, sigma_b =200, verbose = FALSE ) } proc.time() -tt #total run time
We concatenate the results of Wavelet Screening into one data frame.
res <- do.call( rbind, res) res[1:10,]
To assess the significance of each region, we simulate the null distribution of the test statistic. Below we display how to simulate this distribution.
set.seed(1) #Simulation function using the same parameter as used in the Wavelet_screening function tt <- proc.time() Sim <- Simu_null_emp(res=res, coeftype="c", lev_res = 4, base_shrink = 1/26, size=100000#Number of simulations required ) proc.time() -tt #total running time plot( Sim[,1], Sim[,2])
As explained in the article, the distribution of L~h~ might not be Gaussian when analyzing data using low depth. Below, we display how to perform a Box-Cox transform on the simulations of the test statistic under the H~0.
Here, we have an additionnal issue: the observed test statistics contains some positive and some negative values. However the Box-Cox transform requires only positive values. We therefore shift the distribution and then apply the Box-Cox transform.
library(EnvStats) shifting <- max( 0,1.0001*max( Sim[,1],res[,1])) bc_lambda <- boxcox(-Sim[,1] + shifting, optimize = TRUE) Sim[,1] <- - ( ( (-Sim[,1] + shifting)^bc_lambda$lambda) -1 )/bc_lambda$lambda res[,1] <- - ( ( (-res[,1] + shifting)^bc_lambda$lambda) -1 )/bc_lambda$lambda
When the Box-Cox transform is done, we can then compute the p-value of each region as follows.
lambda <- Search_lambda(Sim,plot=TRUE)#optimizing the lambda* value #lambda <- 15 Th <- Sim[,c("L_h")]+lambda*Sim[,c("min_ph_pv")] #Computing the null distribution of the test statistic muv <- median(Th,na.rm = TRUE) #estimating the parameter of the null distribution sdv <- mad(Th,na.rm = TRUE) #################################### ##Test value of the loci to be tested #################################### th <- res[,1]+lambda*res[,2]#Computing the test statistic ########################## ##Plot of two distributions ########################## dat <- data.frame(dens = c(c(th),c(Th[1:2000]))#To have the same size on the plot , lines = c(rep("obs", length(c(th))), rep( "sim", length(c(Th[1:2000]))) )) ggplot(dat, aes(x = dens, fill = lines)) + geom_density(alpha = 0.5)+ xlim(c(-3,3))+#Some of the associated regions have test statistics very far from the null; this results in poor visualization if they are not excluded. ggtitle("Test stat density")
Here, we see the null distribution of the test statistic, and the associated regions on the right side of the plot.
#Computing the p-value pv <- 1-pnorm(th,mean=muv,sd=sdv) hist(1-pnorm(th,mean=muv,sd=sdv),nclass=100, main="Histogram of the pvalues") hist(log10(1-pnorm(th,mean=muv,sd=sdv)),nclass=100,main ="Histogram of the Log10 pvalues")
The data can be found at https://www.nature.com/articles/ng.298 as supplementary material.
library(readxl) tt <- read_excel("41588_2009_BFng298_MOESM18_ESM.xls") sub <- tt[which(tt[,2]=="chr3"),]#Only selecting Chromosome 3 sub <- as.data.frame(sub) head(sub) #data frame with the start and end positions of each DMR we are trying to detect. lstemp <- list() # for (i in 1: dim(sub)[1]) { lstemp[[i]] <- temp.pos[ which( (sub$start[i]-1) < temp.pos & temp.pos < (sub$end[i] +1 ))] } sum( lengths(lstemp)) # number of CpGs to detect pos_to_detect <- do.call( c,lstemp)#List of the CpGs to detect
Ensure the same slicing as before.
thresh <- 500 tl <- split(temp.pos , cumsum(c(1, diff(temp.pos) > thresh) ) ) #Remove cases that are too far apart tl <- tl[-which(lengths(tl) <17)]#Keep only long-enough sequences of CPGs pos_to_detect <- pos_to_detect [which(pos_to_detect %in% do.call( c , tl))] #filtering out the CpGs when slicing
If a region contains more than one CpG that is differentially methylated, then it has to be detected.
#Number of CpGs per regions n_CpG_per_region <- list() for ( i in 1:length(tl)) { n_CpG_per_region[[i]] <- pos_to_detect[which(pos_to_detect %in% tl[[i]])] } nCpG_reg <- lengths(n_CpG_per_region) indx_to_detect <- which( nCpG_reg >0)#List of the regions to detect indx_to_detect
Regions detected
type <- c("pv", "pv", "fdr", "fdr") threshold <- c("e-5", "e-6", "5%", "1%") detected_reg <- c() tru_reg <- c() #Regions detected #When using p-values 10^-5 indx_WS <- which(pv < 10^(-5)) detected_reg <- c( detected_reg ,length(indx_WS)) tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) ) #When using p-values 10^-6 indx_WS <- which(pv < 10^(-6)) detected_reg <- c( detected_reg ,length(indx_WS)) tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) ) fdr_regions <- p.adjust(pv, method ="BH") #When using FDR at 5% indx_WS <- which(fdr_regions <0.05) detected_reg <- c( detected_reg ,length(indx_WS)) tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) ) #When using FDR at 1% indx_WS <- which(fdr_regions <0.01) detected_reg <- c( detected_reg ,length(indx_WS)) tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) ) #Summary df_out <- data.frame(type=type, threshold = threshold, nb_detected_region= detected_reg, nb_true_regions= tru_reg )
df_out
df_out summarizes the number of regions detected using different criteria and the number of truly associated regions.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.