In perishky/meffil: Efficient algorithms for DNA methylation

library(knitr)
library(Cairo)
opts_chunk$set(warning=FALSE, fig.width=6, fig.height=6, dev="CairoPNG", stop=TRUE)

library(penalized)
library(GEOquery)

EWAS in meffil

Download example data set

path <- download.450k.lead.dataset()

Normalize dataset

Create samplesheet

library(meffil)
options(mc.cores=10)
samplesheet <- meffil.create.samplesheet(path)

samples <- read.csv(file.path(path, "samples.csv"), check.names=F, row.names=1)
samplesheet <- data.frame(samplesheet,
                          samples[match(samplesheet$Sample_Name, rownames(samples)),],
                          stringsAsFactors=F, check.names=F)

samplesheet <- samplesheet[which(samplesheet[["sample type"]] == "HM450K"),]

Parameters.

qc.file <- "ewas/qc-report.html"
author <- "Sen, et al."
study <- "Cord blood DNA methylation and lead exposure (GSE69633)"
norm.file <- "ewas/normalization-report.html"
cell.type.reference <- "gervin and lyle cord blood"

Generate QC objects for each sample and QC report.

qc.objects <- meffil.qc(samplesheet, cell.type.reference=cell.type.reference, verbose=T)

qc.summary <- meffil.qc.summary(qc.objects, verbose=T)
meffil.qc.report(qc.summary,
                 output.file=qc.file,
                 author=author,
                 study=study)

Remove any low quality samples.

if (nrow(qc.summary$bad.samples) > 0)
    qc.objects <- meffil.remove.samples(qc.objects, qc.summary$bad.samples$sample.name)

samplesheet <- samplesheet[match(names(qc.objects), rownames(samplesheet)),]

Check how many principal components to include.

print(meffil.plot.pc.fit(qc.objects, n.cross=3)$plot)

Ten seems about right.

number.pcs <- 10

Normalize dataset and generate normalization report.

norm.objects <- meffil.normalize.quantiles(qc.objects, number.pcs=number.pcs, verbose=T)
norm.dataset <- meffil.normalize.samples(norm.objects,
                                         just.beta=F, 
                                         cpglist.remove=qc.summary$bad.cpgs$name,
                                         verbose=T)

beta <- meffil.get.beta(norm.dataset$M, norm.dataset$U)
pcs <- meffil.methylation.pcs(beta, sites=meffil.get.autosomal.sites("450k"), verbose=T)

parameters <- meffil.normalization.parameters(norm.objects)
norm.summary <- meffil.normalization.summary(norm.objects=norm.objects,
                                             pcs=pcs,
                                             parameters=parameters, verbose=T)
meffil.normalization.report(norm.summary,
                            output.file=norm.file,
                            author=author,
                            study=study)

Test associations with gestational age

Parameters.

ewas.variable.name <- "gestational age"
ewas.covariate.names <- c("socioeconomic score","gender","smoke ever","birth weight")
ewas.output.file <- "ewas/ewas-gestational-age-report.html"
ewas.author <- "Me"
ewas.study <- "Gestational age in cord blood DNA methylation (GEO:GSE69633)"
ewas.cpg.sites <- c("cg08943494", "cg11932158","cg16725984", "cg20334115", "cg18623216") 
## PMID: 25869828 (top 5)

Perform EWAS.

variable <- samplesheet[[ewas.variable.name]]
covariates <- data.frame(t(meffil.cell.count.estimates(qc.objects)),
                         samplesheet[,ewas.covariate.names],
                         stringsAsFactors=F)

ewas.ret <- meffil.ewas(beta,
                        variable=variable,
                        covariates=covariates,
                        isva=F, sva=F, smartsva=T,
                        most.variable=50000,
                        verbose=T)

Generate EWAS report.

ewas.parameters <- meffil.ewas.parameters(sig.threshold=1e-5,max.plots=10)
ewas.summary <- meffil.ewas.summary(ewas.ret,
                                    beta,
                                    selected.cpg.sites=ewas.cpg.sites,
                                    parameters=ewas.parameters, verbose=T)

meffil.ewas.report(ewas.summary,
                   output.file=ewas.output.file,
                   author=ewas.author,
                   study=ewas.study)

idx <- sample(order(ewas.ret$analyses$smartsva$table$p.value)[1:1000],100)
sub.sites <- rownames(beta)[idx]
sub.covariates <- cbind(covariates, with(ewas.ret$analyses$smartsva, design[,grep("^X[0-9]+$", colnames(design))]))

ewas.sub.ret <- meffil.ewas(beta,
                            variable=variable,
                            sites=sub.sites,
                            covariates=sub.covariates,
                            sva=F, isva=F, smartsva=F,
                            verbose=T)

ewas.parameters <- meffil.ewas.parameters(sig.threshold=0.05/length(sub.sites),max.plots=10)
ewas.summary <- meffil.ewas.summary(ewas.sub.ret,
                                    beta,
                                    selected.cpg.sites=sub.sites,
                                    parameters=ewas.parameters, verbose=T)

meffil.ewas.report(ewas.summary,
                   output.file="ewas/ewas-gestational-age-report-100.html",
                   author=ewas.author,
                   study=ewas.study)

How well does SVA capture batch effects that are not explicitly included in the model?

slide <- model.matrix(~ 0 + Slide, samplesheet)
row <- model.matrix(~ 0 + sentrix_row, samplesheet)

sv <- ewas.ret$analyses$smartsva$design
sv <- sv[,grep("X", colnames(sv))]

library(penalized)

prediction.accuracy <- function(v, p) {
    fit <- optL1(response=v, penalized=p, fold=10, standardize=T, maxiter=1000, model="logistic")
    sum((fit$predictions > 0.5) == (v == 1))/length(v)
}

apply(row, 2, prediction.accuracy, sv)

apply(slide, 2, prediction.accuracy, sv)

How much do the different EWAS models agree?

sapply(ewas.ret$analyses, function(a) 
       sapply(ewas.ret$analyses, function(b) 
              length(intersect(rownames(a$table)[which(a$table$p.value < 0.0001)],
                               rownames(b$table)[which(b$table$p.value < 0.0001)]))))

sapply(ewas.ret$analyses, function(a) 
       sapply(ewas.ret$analyses, function(b) {
           a <- rownames(a$table)[which(a$table$p.value < 0.0001)]
           b <- rownames(b$table)[which(b$table$p.value < 0.0001)]
           length(intersect(a,b))/length(union(a,b))
       }))

sapply(ewas.ret$analyses, function(a) 
       sapply(ewas.ret$analyses, function(b) {
           idx <- which(a$table$p.value < 0.01)
           cor(a$table$coefficient[idx], b$table$coefficient[idx])
       }))

Compare meffil EWAS to EWAS 'done by hand'.

## EWAS by meffil
test.ret <- meffil.ewas(beta,
                        variable=variable,
                        covariates=covariates,
                        isva=F,sva=T,smartsva=F,
                        most.variable=50000,
                        random.seed=20161123,
                        verbose=T)
test.ret$sv <- test.ret$analyses$sva$design
test.ret$sv <- test.ret$sv[,grep("X", colnames(test.ret$sv))]

## EWAS by hand
beta.win <- meffil:::winsorize(beta)
beta.sva <- beta.win
beta.sva <- beta.sva[which(rownames(beta.sva) %in% meffil.get.autosomal.sites()),]
beta.sva <- beta.sva[order(rowVars(beta.sva), decreasing=T)[1:50000],]
mod0 <- model.matrix(~ ., covariates)
mod <- cbind(mod0, variable)
set.seed(20161123)
sva.ret <- sva(dat=beta.sva, mod=mod, mod0=mod0)

apply(abs(cor(test.ret$sv, sva.ret$sv)), 2, max)

mod <- cbind(mod, sva.ret$sv)
fit <- lm.fit(mod, t(beta.win))

cor(fit$coefficients["variable",], test.ret$analyses$sva$table$coefficient)

View EWAS effect estimates next to genes and other genomic landmarks by generating genome browser tracks and uploading them to https://genome.ucsc.edu/cgi-bin/hgCustom.

meffil.ewas.bedgraph(ewas.ret, "ewas/sva.bed", "smartsva", name="ga", description=ewas.study)
header <- paste0("track type=bedGraph name='ga-custom' description='",
                 ewas.study,
                 "' visibility=full color=200,100,0 altColor=0,100,200 priority=20")
meffil.ewas.bedgraph(ewas.ret, "ewas/sva-custom.bed", "smartsva", header=header)
meffil.ewas.bedgraph(ewas.ret, "ewas/sva-minimal.bed", "smartsva")