# R/em_mendelian.R In heritEWAS: Identify Heritable Methylation Marks

#### Defines functions em_mendelian

```#' @importFrom stats dnorm sd
em_mendelian <- function(typed.genos, M, ncores = 1) {

indices.typed.genos <- find_geno_index(typed.genos, M)

update.theta <- function(theta, mval) {
# Update of the parameter vector theta=c(mu0,sd0,mu1,sd1) with the EM
# algorithm
# Here our likelihood is a Gaussian mixture, with different Gaussian
# densities for carriers (params mu0 and sd0) and non-carriers (params mu1
# and sd1)
# of a rare mutation which affects methylation values at a probe with
# M-values mval.
# This is the standard algorithm but modified to take non-independence of y
# into account, where:
#   * the data is x = (x1, ..., xn) is the vector of M-values mval
#   * the hidden data y = (y1, ..., yn) gives the true genotypes of each
#     person

# Decode the parameter vector theta
mu0 <- theta[1]
sd0 <- theta[2]
mu1 <- theta[3]
sd1 <- theta[4]

# Calculate q where
# q[i] = q_{i1}^t in the notation of my notes
#      = probability that person i (matching mval) is a carrier based on
#        the M-values and previous values of the parameters
y <- c(dnorm(mval, mu0, sd0), dnorm(mval, mu1, sd1))
p01 <- matrix(y, length(mval), 2)
f <- function(j) {
# j is the family number
datg <- typed.genos[[j]]
p01.fam <- p01[indices.typed.genos[[j]], ]
if (ncol(datg) == 2) p01.fam <- matrix(p01.fam, 1, )
p <- datg\$p
for (k in 2:ncol(datg)) {
indices <- datg[, k] + 1
p <- p * p01.fam[k - 1, indices]
}
p <- p / sum(p)
g <- function(k) {
carriers <- (datg[, k] == 1)
sum(p[carriers])
}
q.famj <- sapply(2:ncol(datg), g)
q.famj
}
q.fams <- lapply(1:length(typed.genos), f)
q <- numeric(length(mval))
for (j in 1:length(typed.genos)) {
q[indices.typed.genos[[j]]] <- q.fams[[j]]
}

# Calculate the updated estimates
w0 <- (1 - q) / sum(1 - q)
w1 <- q / sum(q)
mu0 <- sum(w0 * mval)
sd0 <- sqrt(sum(w0 * (mval - mu0)^2))
mu1 <- sum(w1 * mval)
sd1 <- sqrt(sum(w1 * (mval - mu1)^2))

# Output
theta <- c(mu0, sd0, mu1, sd1)
theta
}

calc.ll.em <- function(theta, mval) {
# Calculate the likelihood whcih is maximized by the EM algorithm
# for given parameters theta=c(mu0,sd0,mu1,sd1) and M-values at a
# particular probe mval

# Decode the parameter vector theta
mu0 <- theta[1]
sd0 <- theta[2]
mu1 <- theta[3]
sd1 <- theta[4]

# Calculate P(x | theta)
y <- c(dnorm(mval, mu0, sd0), dnorm(mval, mu1, sd1))
p01 <- matrix(y, length(mval), 2)
f <- function(j) {
# j is the family number, this calculates the log-likelihood for family j
datg <- typed.genos[[j]]
p01.fam <- p01[indices.typed.genos[[j]], ]
if (ncol(datg) == 2) p01.fam <- matrix(p01.fam, 1, )
p <- datg\$p
for (k in 2:ncol(datg)) {
indices <- datg[, k] + 1
p <- p * p01.fam[k - 1, indices]
}
ll.famj <- log(sum(p))
ll.famj
}
p.fams <- sapply(1:length(typed.genos), f)

# Output
ll <- sum(p.fams)
ll
}

em.maxlik <- function(i, tol = 1e-3, max.count = 200, theta.start = NULL) {
# Use the EM algorithm to produce ML estimates for probe i

# Initialize
mval <- as.numeric(M[i, ])
n <- length(mval)
mu0 <- mean(mval)
sd0 <- sd(mval) * sqrt((n - 1) / n)
mu1 <- mu0
sd1 <- sd0
theta.null <- c(mu0, sd0, mu1, sd1)
if (is.null(theta.start)) {
theta.start <- theta.null
}
theta <- theta.start

# Iterate
continue <- TRUE
count <- 0
while (continue) {
count <- count + 1
theta.new <- update.theta(theta, mval)
continue <- all(!is.na(theta.new)) & any(abs(theta.new - theta) > tol) &
(count < max.count)
theta <- theta.new
}

# Calculate likelihoods
# This should be called ll.null, but this name kept for compatibility with
# other scripts
ll.start <- calc.ll.em(theta.null, mval)
ll.end <- calc.ll.em(theta, mval)

# Output
c(ll.start, ll.end, theta.null[1:2], theta, count)
}

# Choose three stating values and choose the best EM output
em.maxlik.best <- function(i) {

# Three sets of starting values for EM
theta_start_values <- list(
NULL, c(-2, 1, 2, 1), c(2, 1, -2, 1)
)

out <- vector("list", length = 3)
for (j in seq_along(out)) {
out[[j]] <- em.maxlik(i, theta.start = theta_start_values[[j]])
}
out <- matrix(unlist(out, use.names = FALSE), nrow = 3, byrow = TRUE)

#max_out <- out[which.max(out[, 2]), ]

out_noinf <- rbind(out[out[, 2] != Inf, ])
max_out <- out_noinf[which.max(out_noinf[, 2]), ]

#if (length(max_out) == 0 | max_out[2] == Inf) {
if (length(max_out) == 0) {
max_out <- out[1, ]
max_out[-c(1, 3, 4)] <- NA
}
max_out
}

nprobe <- nrow(M)
if (ncores == 1) {
out <- sapply(1:nprobe, em.maxlik.best)
} else {
cl <- parallel::makeCluster(ncores)
on.exit(parallel::stopCluster(cl), add = TRUE)
out <- parallel::parSapply(cl, 1:nprobe, em.maxlik.best)
}
out <- data.frame(probe = rownames(M)[1:nprobe], as.data.frame(t(out)))
names(out) <- c("probe", "ll.null", "ll.mendel", "mu.null", "sd.null",
"mu0", "sd0", "mu1", "sd1", "count")
out

}
```

## Try the heritEWAS package in your browser

Any scripts or data that you put into this service are public.

heritEWAS documentation built on July 1, 2020, 6:02 p.m.