tutorial/tutorial.R

########################################## 1. Installation ##########################################

# To install R-packages from github, you need devtools:
install.packages("devtools")
library(devtools)

# Dependencies from Bioconductor
if (!requireNamespace("BiocManager", quietly=TRUE)) install.packages("BiocManager")
BiocManager::install('minfi')
BiocManager::install('Sushi')

# Dependencies from CRAN
install.packages("scales")
install.packages("EMCluster")
install.packages("dbscan")
install.packages("RColorBrewer")
# Dependencies from Github
install_github("BenjaminPlanterose/UMtools")

# Needed to complete this tutorial
BiocManager::install('GEOquery')
BiocManager::install('IlluminaHumanMethylation450kanno.ilmn12.hg19')
BiocManager::install('IlluminaHumanMethylation450kmanifest')



########################################## 2. Downloading example ##########################################

# Run in bash
# wget ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE104nnn/GSE104812/suppl/GSE104812_RAW.tar # download files
# tar -xvf GSE104812_RAW.tar # decompress files
# find . -type f ! -name '*.idat.gz' -delete # eliminate non-IDAT files
# gunzip *.gz # uncompress IDATs


########################################## 3. Peaking into an IDAT ##########################################

# Illuminaio is a dependency of minfi
library(illuminaio)
setwd("~/foo/") # Change this route to fit your system
example = illuminaio::readIDAT("GSM2808239_sample1_Grn.idat")
names(example)
# [1] "fileSize"  "versionNumber" "nFields"   "fields"  "nSNPsRead" "Quants" "MidBlock"
# [8] "RedGreen"  "Barcode"       "ChipType"  "RunInfo" "Unknowns"

head(example$RunInfo, 3)
#             RunTime          BlockType
# [1,] "1/12/2016 2:37:05 AM" "Decoding"
# [2,] "4/8/2016 6:03:57 PM"  "Scan"
# [3,] "4/8/2016 6:03:57 PM"  "Register"

head(example$Quants, 3)
#           Mean  SD  NBeads
# 10600313  284  137     13
# 10600322 9405 1363     14
# 10600328 3538  439     11

########################################## 4. Extracting fluorescence intensity matrices ##########################################

library(UMtools)
setwd("~/foo/") # Change this route to fit your system
rgSet = read.metharray.exp(getwd(), extended = TRUE)
TypeI.Red <- getProbeInfo(rgSet, type = "I-Red")
TypeI.Green <- getProbeInfo(rgSet, type = "I-Green")
TypeII <- getProbeInfo(rgSet, type = "II")
ctrls <- getProbeInfo(rgSet, type = "Control")
SnpI <- getProbeInfo(rgSet, type = "SnpI")
SnpII <- getProbeInfo(rgSet, type = "SnpII")
known_probes = c(SnpI$AddressA, SnpI$AddressB, SnpII$AddressA, ctrls$Address, TypeI.Red$AddressA,
                 TypeI.Red$AddressB, TypeI.Green$AddressA, TypeI.Green$AddressB, TypeII$AddressA)
length(known_probes) # 621926
all = rownames(rgSet); length(known_probes) # 622399
orphan = all[!(all %in% known_probes)]; length(orphan) # 473

head(TypeI.Green[, 1:3], n = 2)
#          Name    AddressA    AddressB
#   <character> <character> <character>
# 1  cg02004872    25785404    58629399
# 2  cg02050847    43656343    73683470

head(TypeII[, 1:2], n = 2)
#          Name    AddressA
#   <character> <character>
# 1  cg00035864    31729416
# 2  cg00061679    28780415

# Getting the probe annotation
library(IlluminaHumanMethylation450kanno.ilmn12.hg19) # 450K
annotation <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
# library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19) # 850K
# annotation <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)

colnames(annotation)
# [1]  "chr"                      "pos"                      "strand"                   "Name"
# [5]  "AddressA"                 "AddressB"                 "ProbeSeqA"                "ProbeSeqB"
# [9]  "Type"                     "NextBase"                 "Color"                    "Probe_rs"
# [13] "Probe_maf"                "CpG_rs"                   "CpG_maf"                  "SBE_rs"
# [17] "SBE_maf"                  "Islands_Name"             "Relation_to_Island"       "Forward_Sequence"
# [21] "SourceSeq"                "Random_Loci"              "Methyl27_Loci"            "UCSC_RefGene_Name"
# [25] "UCSC_RefGene_Accession"   "UCSC_RefGene_Group"       "Phantom"                  "DMR"
# [29] "Enhancer"                 "HMM_Island"               "Regulatory_Feature_Name"  "Regulatory_Feature_Group"
# [33] "DHS"

Grn = assay(rgSet, "Green")       # Green mean across beads
Red = assay(rgSet, "Red")         # Red mean across beads
GrnSD = assay(rgSet, "GreenSD")   # Green SD across beads
RedSD = assay(rgSet, "RedSD")     # Red SD across beads
nBeads = assay(rgSet, "NBeads")   # Number of Beads across probes

M_U = GR_to_UM(Red = Red, Grn = Grn, rgSet = rgSet, what = "Mean")
M_U_sd = GR_to_UM(Red = RedSD, Grn = GrnSD, rgSet = rgSet, what = "SD")
nBeads_cg = GR_to_UM(nBeads = nBeads, rgSet = rgSet, what = "NBeads")

# PEAK
M_U$M[1:3, 1:3]; M_U$U[1:3, 1:3]
M_U_sd$M[1:3, 1:3]; M_U_sd$U[1:3, 1:3]
nBeads_cg[1:3, 1:3]


########################################## 5. Compute beta- and M-values ##########################################

offset = 100 # For numerical stability at low fluorescence intensities
beta_value = M_U$M/(M_U$M + M_U$U + offset)

offset = 1 # For numerical stability at low fluorescence intensities
M_value = log2((M_U$M + offset)/(M_U$U + offset))

rm(Grn, Red, GrnSD, RedSD, nBeads, nBeads_cg, M_value); gc()

########################################## 6. Quickly importing/exporting large matrices with data.table ##########################################

setwd("~/foo/") # Change this route to fit your system
export_bigmat(M_U$M, "M.txt", nThread = 4)
list.files()
M = import_bigmat("2021-01-28_M.txt", nThread = 4)

########################################## 7. Quickly importing GEO phenotypes with GEOquery ##########################################

library(GEOquery)
setwd('~/foo/')
pheno_object <- getGEO(GEO = 'GSE104812', destdir=".", getGPL = FALSE)
pheno <- pheno_object[[1]]
pheno <- phenoData(pheno)
pheno <- pData(pheno)
pheno = data.frame(GEO_ID = as.character(rownames(pheno)),
                   sex = as.factor(pheno$`gender:ch1`),
                   age = as.numeric(pheno$`age (y):ch1`))
IDAT_IDs = sapply(strsplit(colnames(rgSet), split = "_"),function(x) x[1])
pheno <- pheno[match(pheno$GEO_ID, IDAT_IDs),] # Make sure samples in pheno are in the same order as in IDATs

########################################## 8. UMtools in action ##########################################

# 450K annotation
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
annotation <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)

# Y-chromosome targeting probe
density_jitter_plot(beta_value, "cg00050873", pheno$sex)
annotation["cg00050873", c("chr", "pos")] # chrY   9363356
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg00050873", sex = pheno$sex)

# X-inactivation
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg00026186", sex = pheno$sex)
annotation["cg00026186", c("chr", "pos")] # chrX  48367230

# X-inactivation escape
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg04927982", sex = pheno$sex)
annotation["cg04927982", c("chr", "pos")] # chrX  53254653

# X-hypermethylation
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg02973417", sex = pheno$sex)
annotation["cg02973417", c("chr", "pos")] # chrX  153220895


### Bivariate Gaussian Mixture Models (bGMMs)
set.seed(2); res = bGMM(M_U$M, M_U$U, "cg03398919", K = 2)
set.seed(3); res = bGMM(M_U$M, M_U$U, "cg00814218", K = 3)
set.seed(2); res = bGMM(M_U$M, M_U$U, "cg27024127", K = 4)
set.seed(4); res = bGMM(M_U$M, M_U$U, "cg23186955", K = 5)


### Quantifying epigenome-wide ambivalency in probe failure
CV = compute_CV(M_SD = M_U_sd$M, U_SD = M_U_sd$U, M = M_U$M, U = M_U_sd$U, alpha = 100)
BC_CV = compute_BC_CV(CV = CV)
density_jitter_plot(CV, "cg00050873", pheno$sex)
BC_CV["cg00050873"]
# cg00050873
#   1.128555
annotation["cg00050873", c("chr", "pos")] # chrY   9363356


### K-calling
Kcall_CpG("cg15771735", M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG("cg03398919", M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG("cg00814218", M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG("cg27024127", M_U$M, M_U$U, minPts = 5, eps = 0.1)

chrY = rownames(annotation)[annotation$chr == "chrY"]
K_vec = par_EW_Kcalling(M_U$M[chrY,], M_U$U[chrY,], minPts = 5, eps = 0.1, nThread = 10)
table(K_vec)
# K_vec
#  1   2
# 35 381

# K = 1 are cross-reactive
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg02494853", sex = pheno$sex)
annotation["cg02494853", c("chr", "pos")] # chrY   4868397

# If aiming to employ epigenome-wide, it is very important to adjust {minPts, eps} to the
# sample size of the data employed.
data("training_set")
# Training annotated with dataset of 426 EUR MZ twin pairs. Training set may not be correctly annotated
# for other ancestries and other sample sizes.
Kcall_CpG(sample(training_set$k_1, 1), M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG(sample(training_set$k_2, 1), M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG(sample(training_set$k_3, 1), M_U$M, M_U$U, minPts = 5, eps = 0.1)
Kcall_CpG(sample(training_set$k_4, 1), M_U$M, M_U$U, minPts = 5, eps = 0.1)

# Lower maf variants are wrongly annotated in this dataset. Sample size not big enough.
train_k_caller(M_U$M, M_U$U, training_set, 3, 0.07, nThread = 10)

### Comethylation plots

# meQTL
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg14911689", sex = NULL)
annotation["cg14911689", c("chr", "pos", "UCSC_RefGene_Name")] # chr12    739980    NINJ2
annotation <- annotation[order(annotation$chr, annotation$pos),]
pos <- which(rownames(annotation) == "cg14911689")
UM_plot(M = M_U$M, U = M_U$U, CpG = rownames(annotation)[pos-1], sex = NULL)
UM_plot(M = M_U$M, U = M_U$U, CpG = rownames(annotation)[pos+1], sex = NULL)

res = Visualize_cometh(annotation = annotation, CpG = 'cg14911689', distance = 1000,
                       L_bound = 3, R_bound = 2, beta_mat = beta_value, max_y = 5)


# A genetic artefact
UM_plot(M = M_U$M, U = M_U$U, CpG = "cg11495604", sex = NULL)
res = Visualize_cometh(annotation = annotation, CpG = 'cg11495604', distance = 1000,
                       L_bound = 0, R_bound = 2, beta_mat = beta_value, max_y = 5)
annotation["cg11495604", c("chr", "pos", "UCSC_RefGene_Name")] # chr20  62053198 KCNQ2;KCNQ2;KCNQ2;KCNQ2


# HLA locus
res = Visualize_cometh(annotation = annotation, CpG = 'cg00211215', distance = 200,
                       L_bound = 3, R_bound = 0, beta_mat = beta_value, max_y = 5)
annotation["cg00211215", c("chr", "pos", "UCSC_RefGene_Name")] # chr6  32552246   HLA-DRB1
BenjaminPlanterose/UMtools documentation built on Aug. 19, 2024, 4:54 a.m.