knitr::opts_chunk$set(echo = TRUE)
set.seed(66)

Preparing the data

Loading the data and recoding the phenotype from factor to numeric.

library(WaveletScreening)
load("simulated_data.Rdata")
colon.state[1:4]
pheno <- as.numeric(colon.state)-1
pheno[1:4]

The data are a compound of two elements: 1) The base pair position of each CpG, 2) The individual CpG level (M value)

temp.pos[1:10]#the base pair position of each CpG
length(temp.pos)#total number of CpGs
dim(sim_methy_all)#lines= CpG level, column individual
sim_methy_all[1:10,]

Defining slice

Here, we divide the data into regions that have at least 17 CpGs, separated by a maximum distance of 500 bp. This results in a missingness rate of 1.89%.

thresh <- 500
tl <- split(temp.pos , cumsum(c(1, diff(temp.pos) > thresh)  ) ) #Remove cases that are too far apart

#Level of analysis 4
tl <-  tl[-which(lengths(tl)< 17)]#List of the positions of CpGs that we will analyze later
sum(lengths(tl))/length( temp.pos)#percent missingness
hist( lengths(tl),nclass = 100, main="Histogram of the number of CpGs per region")

Running Wavelet Screening region by region

In this section, we run Wavelet Screening on each region separately. This takes about 3 to 4 minutes on a single CPU.

library(WaveletScreening)
tt <- proc.time()
res <- list()
for ( i in 1 :length(tl))
{
  reg_mat <- sim_methy_all[which( temp.pos  %in% tl[[i]]),] #Select CpGs in region i
  bp <- temp.pos[which( temp.pos  %in% tl[[i]])]#Correspond to the base position of the CpG in region i
  res[[i]] <- Wavelet_screening( Y=pheno,
                                 loci=reg_mat,
                                 lev_res=4,
                                 coeftype = "c",
                                 bp = bp,
                                 base_shrink = 1/26,
                                 sigma_b =200,
                                 verbose = FALSE
  )


}


proc.time()  -tt #total run time

We concatenate the results of Wavelet Screening into one data frame.

res <- do.call( rbind, res)
res[1:10,]

Simulation of the null distribution

To assess the significance of each region, we simulate the null distribution of the test statistic. Below we display how to simulate this distribution.

set.seed(1)
#Simulation function using the same parameter as used in the Wavelet_screening function
tt <- proc.time()
Sim <- Simu_null_emp(res=res,
                     coeftype="c",
                     lev_res = 4,
                     base_shrink = 1/26,
                     size=100000#Number of simulations required
)



proc.time()  -tt #total running time
plot( Sim[,1], Sim[,2])

Performing the Box-Cox transform

As explained in the article, the distribution of L~h~ might not be Gaussian when analyzing data using low depth. Below, we display how to perform a Box-Cox transform on the simulations of the test statistic under the H~0.

Here, we have an additionnal issue: the observed test statistics contains some positive and some negative values. However the Box-Cox transform requires only positive values. We therefore shift the distribution and then apply the Box-Cox transform.

library(EnvStats)
shifting  <-  max( 0,1.0001*max( Sim[,1],res[,1]))
bc_lambda <-  boxcox(-Sim[,1] + shifting, optimize = TRUE)
Sim[,1]   <-  - ( ( (-Sim[,1] + shifting)^bc_lambda$lambda) -1  )/bc_lambda$lambda
res[,1]   <-  - ( ( (-res[,1] + shifting)^bc_lambda$lambda) -1  )/bc_lambda$lambda

Computing the p-values

When the Box-Cox transform is done, we can then compute the p-value of each region as follows.

lambda <- Search_lambda(Sim,plot=TRUE)#optimizing the lambda* value
#lambda <- 15
Th <- Sim[,c("L_h")]+lambda*Sim[,c("min_ph_pv")] #Computing the null distribution of the test statistic 
muv <- median(Th,na.rm = TRUE) #estimating the parameter of the null distribution
sdv <- mad(Th,na.rm = TRUE)
####################################
##Test value of the loci to be tested
####################################
th <-  res[,1]+lambda*res[,2]#Computing the test statistic

##########################
##Plot of two distributions
##########################
dat <- data.frame(dens = c(c(th),c(Th[1:2000]))#To have the same size on the plot
                  , lines = c(rep("obs", length(c(th))), rep( "sim", length(c(Th[1:2000]))) ))

ggplot(dat, aes(x = dens, fill = lines)) +
  geom_density(alpha = 0.5)+
  xlim(c(-3,3))+#Some of the associated regions have test statistics very far from the null; this results in poor visualization if they are not excluded.
  ggtitle("Test stat density")

Here, we see the null distribution of the test statistic, and the associated regions on the right side of the plot.

#Computing the p-value
pv <- 1-pnorm(th,mean=muv,sd=sdv)
hist(1-pnorm(th,mean=muv,sd=sdv),nclass=100, main="Histogram of the pvalues")
hist(log10(1-pnorm(th,mean=muv,sd=sdv)),nclass=100,main ="Histogram of the Log10 pvalues")

Loading data from Irizzary et al. 2009

The data can be found at https://www.nature.com/articles/ng.298 as supplementary material.

library(readxl)
tt <- read_excel("41588_2009_BFng298_MOESM18_ESM.xls")
sub <- tt[which(tt[,2]=="chr3"),]#Only selecting Chromosome 3
sub <- as.data.frame(sub)
head(sub) #data frame with the start and end positions of each DMR we are trying to detect.

lstemp <- list() #

for (i in 1: dim(sub)[1])
{

  lstemp[[i]] <- temp.pos[ which( (sub$start[i]-1)  < temp.pos & temp.pos < (sub$end[i] +1 ))]

}

sum( lengths(lstemp)) # number of CpGs to detect
pos_to_detect <-  do.call( c,lstemp)#List of the CpGs to detect

Performing a slicing

Ensure the same slicing as before.

thresh <- 500

tl <- split(temp.pos , cumsum(c(1, diff(temp.pos) > thresh)  ) ) #Remove cases that are too far apart
tl <-  tl[-which(lengths(tl) <17)]#Keep only long-enough sequences of CPGs

pos_to_detect <- pos_to_detect [which(pos_to_detect %in% do.call( c , tl))] #filtering out the CpGs when slicing

Power

If a region contains more than one CpG that is differentially methylated, then it has to be detected.

#Number of CpGs per regions
n_CpG_per_region <- list()

for ( i in 1:length(tl))
{
  n_CpG_per_region[[i]] <- pos_to_detect[which(pos_to_detect %in% tl[[i]])]

}


nCpG_reg <- lengths(n_CpG_per_region)
indx_to_detect <- which( nCpG_reg >0)#List of the regions to detect
indx_to_detect

Regions detected

type <- c("pv", "pv", "fdr", "fdr")
threshold <- c("e-5", "e-6", "5%", "1%")
detected_reg <- c()
tru_reg <- c()
#Regions detected

#When using p-values 10^-5
indx_WS <-  which(pv < 10^(-5))
detected_reg <- c( detected_reg ,length(indx_WS))
tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) )

#When using p-values 10^-6
indx_WS <-  which(pv < 10^(-6))
detected_reg <- c( detected_reg ,length(indx_WS))
tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) )
fdr_regions <- p.adjust(pv, method ="BH")

#When using FDR at 5%
indx_WS <-  which(fdr_regions  <0.05)
detected_reg <- c( detected_reg ,length(indx_WS))
tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) )

#When using FDR at 1%
indx_WS <-  which(fdr_regions  <0.01)
detected_reg <- c( detected_reg ,length(indx_WS))
tru_reg <- c( tru_reg , length(which(indx_WS %in% indx_to_detect) ) )

#Summary 
df_out <- data.frame(type=type, threshold = threshold, nb_detected_region= detected_reg, nb_true_regions= tru_reg )
df_out

df_out summarizes the number of regions detected using different criteria and the number of truly associated regions.



william-denault/WaveletScreening documentation built on Jan. 20, 2021, 11:37 p.m.