#' @title Hamming distance
#' @description compute the Hamming distance between (binary) genotypes, as the
#' \eqn{\sum_{i} |xi - yi|}
#'
#' @param x,y binary vectors. Each entry in the vector indicates whether an
#' event has been observed (\code{1}) or not (\code{0})
hamming.dist <- function(x, y) {
dist <- sum(abs(x - y))
#dist <- sum(x != y)
return(as.integer(dist))
}
#' @param p number of events/mutations
possible.genotypes <- function(p) {
m <- 2^p
genotypes <- matrix(0, nrow=m, ncol=p)
for (i in 1:m) {
genotypes[i, ] <- gen.genotype(p, m - i + 1)
}
return(genotypes)
}
#' @param p number of events/mutations
#' @param idx index identifying a given observation/genotype
gen.genotype <- function(p, idx) {
g <- numeric(p)
for(i in p:1) {
g[i] <- idx %% 2
idx <- idx %/% 2
}
return(g)
}
#' @description write poset to file in the format expected by H-CBN
#'
#' @param poset a matrix containing the cover relations
#' @param filename name of the output file
#' @param datadir output directory
write.poset <- function(poset, filename, datadir) {
p <- nrow(poset)
outfile <- file.path(datadir, paste(filename, ".poset", sep=""))
# write header
write(p, outfile)
relations <- which(poset == 1)
i <- relations %% p
j <- (relations %/% p) + 1
write.table(cbind(i, j), outfile,row.names=FALSE, col.names=FALSE,
append=TRUE)
# write footer
write(0, outfile, append=TRUE)
}
#' @title Complete-data Log-Likelihood
#' @description compute the complete-data log-likelihood or, equivalently, the
#' hidden log-likelihood
#'
#' @param lambdas vector of the rate parameters
#' @param eps error rate
#' @param Tdiff matrix of expected time differences
#' @param dist vector of expected Hamming distances
complete.loglikelihood_ <- function(lambdas, eps, Tdiff, dist) {
p <- length(lambdas)
N <- length(dist)
llhood <- N * sum(log(lambdas)) - sum(lambdas * t(Tdiff))
if (eps == 0) {
llhood <-
ifelse(all(dist == 0), llhood,
llhood + log(eps + .Machine$double.eps) * sum(dist) +
log(1 - (eps + .Machine$double.eps)) * sum(p - dist))
} else {
llhood <- llhood + log(eps) * sum(dist) + log(1-eps) * sum(p - dist)
}
return(llhood)
}
#' @title Observed Log-Likelihood
#' @description compute the observed log-likelihood
#'
#' @param obs.events a matrix containing observations or genotypes: each row
#' correponds to a genotype vector whose entries indicate whether an event has
#' been observed (\code{1}) or not (\code{0})
#' @param poset a matrix containing the cover relations
#' @param lambdas vector of the rate parameters
#' @param lambda.s rate of the sampling process
#' @param eps error rate
#' @param sampling.times an optional vector of sampling times per observation
#' @param L number of samples to be drawn from the proposal. Defaults to
#' \code{1000}
#' @param seed seed for reproducibility
#' @param neighborhood.dist an integer value indicating the Hamming distance
#' between the \code{'genotype'} and the samples generated by \code{"backward"}
#' sampling. This option is used if \code{sampling} is set to \code{"backward"}.
#' Defaults to \code{1}
obs.loglikelihood_ <- function(
obs.events, poset, lambdas, lambda.s, eps, sampling.times=NULL, L=1000,
sampling=c('forward', 'add-remove', 'backward', 'bernoulli', 'pool'),
perturb.prob=0.8, version="3", neighborhood.dist=1L, exact=FALSE,
seed=NULL) {
sampling <- match.arg(sampling)
if (is.null(seed))
seed <- sample.int(1e9, 1)
set.seed(seed, kind="L'Ecuyer-CMRG")
seeds <- sample.int(3e4, N)
# p: number of events / mutations
# N: number of observations / genotypes
if (exact) {
### **TODO** ### this is not correct
# p <- length(lambdas)
# N <- nrow(obs.events)
# llhood <- 0
# for (j in 1:p) {
# mutated <- sum(obs.events[, j] == 1)
# Pr_X_0 <- lambda.s/(lambda.s + lambdas[j])
# Pr_X_1 <- lambdas[j]/(lambda.s + lambdas[j])
# llhood <- llhood + mutated * log(eps * Pr_X_0 + (1-eps) * Pr_X_1)
# llhood <- llhood + (N - mutated) * log((1-eps) * Pr_X_0 + eps * Pr_X_1)
# }
} else {
sampling.pool <- NULL
weight.remove <- numeric(0)
if (sampling == "add-remove") {
weight.remove <- .Call("_scale_path_to_mutation",
matrix(as.integer(poset), nrow=p, ncol=p), lambdas)
} else if (sampling == "pool") {
K <- p * L;
sampling.pool <- sample.times(K, poset, lambdas, seed=seed)
}
prob_Y <- foreach(i=1:N, .combine='c', .packages="mccbn") %dopar% {
prob <-
prob.importance.sampling(
genotype=obs.events[i, ], L=L, poset=poset, lambdas=lambdas,
lambda.s=lambda.s, eps=eps, sampling.time=sampling.times[i],
sampling=sampling, perturb.prob=perturb.prob, version=version,
weight.remove=weight.remove, sampling.pool=sampling.pool,
neighborhood.dist=as.integer(neighborhood.dist), seed=seeds[i])
return(prob)
}
llhood <- sum(log(prob_Y))
}
return(llhood)
}
#' @description empirical computation of the probability of a genotype
#' according to the model
#'
#' @param N number of samples
#' @param poset a matrix containing the cover relations
#' @param lambdas vector of the rate parameters
#' @param lambda.s rate of the sampling process
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param eps error rate, \eqn{\epsilon}
probY.empirical <- function(N, poset, lambdas, lambda.s, genotype, eps) {
# TODO: use sample.genotypes
simGenotypes <-
sample_genotypes(N, poset, sampling_param=lambda.s, lambdas=lambdas,
eps=eps)
return(sum(apply(simGenotypes$obs_events, 1,
function(x, y) all(x == y), y=genotype)) / N)
}
#' @description compute the occurrence times expirically for a given genotype
#'
#' @inheritParams probY.empirical
tdiff.empirical <- function(N, poset, lambdas, lambda.s, genotype, eps) {
simGenotypes <-
sample_genotypes(N, poset, sampling_param=lambda.s, lambdas=lambdas,
eps=eps)
idx <- which(apply(simGenotypes$obs_events, 1, function(x, y) all(x == y),
y=genotype))
return(apply(simGenotypes$T_events[idx, ], 2, mean))
}
#' @description compute the average distance between genotype Y (subject to
#' noise) and N possible true genotypes, X
#'
#' @inheritParams probY.empirical
dist.empirical <- function(N, poset, lambdas, lambda.s, genotype, eps) {
p <- ncol(poset)
simGenotypes <-
sample_genotypes(N, poset, sampling_param=lambda.s, lambdas=lambdas,
eps=eps)
idx <-
which(apply(simGenotypes$obs_events, 1, function(x, y) all(x == y),
y=genotype))
return(sum(
apply(simGenotypes$hidden_genotypes[idx, ], 1, hamming.dist, y=genotype)) /
length(idx))
}
#' @description identify the predecessors (parents) for each event
#'
#' @param poset a matrix containing the cover relations
#' @return list of parents per event
get.parents <- function(poset) {
parents <- apply(poset, 2, function(x) which(x == 1))
# Following step is need for the empty poset
if (length(parents) == 0) {
parents <- vector("list", length=nrow(poset))
}
return(parents)
}
#' @description identify the successors (children) for each event
#'
#' @param poset a matrix containing the cover relations
#' @return list of children per event
get.children <- function(poset) {
children <- apply(poset, 1, function(x) which(x == 1))
# Following step is need for the empty poset
if (length(children) == 0) {
children <- vector("list", length=nrow(poset))
}
return(children)
}
#' @description add events - look for parent nodes which have not occurred, but
#' any of their children had, and insert a 1 (i.e., replace `0` in the parent
#' node by `1`)
#'
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param children list of children per event. It should have been computed on
#' the transitively closed poset
add.relations <- function(genotype, children) {
p <- length(genotype)
return(sapply(1:p,
function(i, obs, c)
ifelse(obs[i] == 0, ifelse(any(obs[c[[i]]] == 1), 1, obs[i]),
obs[i]), obs=genotype, c=children))
}
#' @description remove events - look for nodes which have occurred, but any of
#' their parents have not occurred, and remove it (i.e., replace \code{1} in
#' the child node by \code{0})
#'
#' @inheritParams add.relations
#' @param parents list of parents per node. It should have been computed on the
#' transitively closed poset
remove.relations <- function(genotype, parents) {
p = length(genotype)
return(sapply(1:p,
function(i, obs, p)
ifelse(obs[i] == 1,
ifelse(any(obs[p[[i]]] == 0), 0, obs[i]), obs[i]),
obs=genotype, p=parents))
}
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param parents list of parents per node. It should have been computed on the
#' transitively closed poset
#' @param children list of children per event. It should have been computed on
#' the transitively closed poset
get.possible.moves <- function(genotype, parents, children) {
### **TODO** ### vectorize get.possible.moves
# Get the size of the Exit set - mutations that can happen next
set.size.add <- 0
idxs.add <- NA
idx <- which(genotype == 0)
if (length(idx) > 0) {
idxs.add <- idx[sapply(idx, function(i, x, p) all(x[p[[i]]] == 1),
x=genotype, p=parents)]
set.size.add <- length(idxs.add)
}
# Get the size of the set of mutations that happened last - can be removed
# and the genotype remains compatible with the poset
set.size.remove <- 0
idxs.remove <- NA
idx <- which(genotype == 1)
if (length(idx) > 0) {
idxs.remove <- idx[sapply(idx, function(i, x, c) all(x[c[[i]]] == 0),
x=genotype, c=children)]
set.size.remove <- length(idxs.remove)
}
return(list("set.size"= c("add"=set.size.add, "remove"=set.size.remove),
"idxs.add"=idxs.add, "idxs.remove"=idxs.remove))
}
#' @description perturb genotypes. Draw (uniformily) a random number \code{x}
#' between \code{0} and \code{1}. If \code{x} is less than \code{perturb.prob},
#' then perturb the genotype by adding or removing an event, but ensuring that
#' the resulting observation remains compatible with the poset
#'
#' @param events a matrix of genotypes, each row correponds to a vector
#' indicating whether an event has been observed (\code{1}) or not (\code{0})
#' @param parents list of parents per node. It should have been computed on the
#' transitively closed poset
#' @param children list of children per event. It should have been computed on
#' the transitively closed poset
#' @param perturb.prob probability of perturbing a genotype. Defaults to
#' \code{0.8}
#' @param compatible logical. If \code{TRUE} indicates that the observed
#' genotype, \code{Y}, is comptible with the current poset. Defaults to
#' \code{FALSE}
#' @param version indicates which version of the \code{"add-remove"} sampling
#' scheme to use
#' @return List with following named arguments: (1) \code{X} - perturbed
#' version of compatible genotypes, which remain compatible with the current
#' poset, (2) \code{option.set.size} size of the exit set or set of
#' observations which can be removed. \code{NA} is returned when the sample was
#' not perturbed
perturb <- function(events, parents, children, perturb.prob=0.8,
compatible=FALSE, version=c("1", "2", "3")) {
version <- match.arg(version)
L <- nrow(events) # number of samples
p <- ncol(events) # number of events/mutations
new.obs <- events
option.set.size <- rep(NA, L)
perturb <- ifelse(runif(L, 0, 1) < perturb.prob, 1, 0)
idxs.perturb <- which(perturb == 1)
# When observation Y was already compatible, all rows are equivalent
if (compatible || var(as.vector(events)) == 0) {
# Get indices of mutations that can be either added or removed
idxs <- get.possible.moves(events[1, ], parents, children)
}
# Deal with special cases: wild type and resistant type
if (all(events == 0)) {
if (version == "1") {
add <- runif(length(idxs.perturb), 0, 1) < 0.5
} else if (version == "2" || version == "3") {
add <- rep(TRUE, length(idxs.perturb))
}
mutation.idx <- sample(idxs$idxs.add, sum(add), replace=TRUE)
option.set.size[idxs.perturb[add]] <- idxs$set.size["add"]
if (sum(add) == 1) {
new.obs[idxs.perturb[add], mutation.idx] = 1
} else {
### **TODO** ### vectorize it
for (i in 1:sum(add)) {
new.obs[idxs.perturb[add][i], mutation.idx[i]] = 1
}
}
} else if (all(events == 1)) {
if (version == "1") {
add <- runif(length(idxs.perturb), 0, 1) < 0.5
} else if (version == "2" || version == "3") {
add <- rep(FALSE, length(idxs.perturb))
}
mutation.idx <- sample(idxs$idxs.remove, sum(!add), replace=TRUE)
option.set.size[idxs.perturb[!add]] <- idxs$set.size["remove"]
if (sum(!add) == 1) {
new.obs[idxs.perturb[!add], mutation.idx] <- 0
} else {
### **TODO** ### vectorize it
for (i in 1:sum(!add)) {
new.obs[idxs.perturb[!add][i], mutation.idx[i]] <- 0
}
}
} else {
for (i in idxs.perturb) {
# Get indices of mutations that can be either added or removed
if (!compatible) {
idxs <- get.possible.moves(events[i, ], parents, children)
}
if (version == "1") {
# Add or remove one observation, both moves have equal probability.
# Draw a random number between 0 and 1. If this number is > 0.5, then
# add an observation
add <- runif(1, 0, 1) < 0.5
if (add) {
mutation.idx <-
ifelse(idxs$set.size["add"] > 1, sample(idxs$idxs.add, 1), idxs$idxs.add)
option.set.size[i] <- idxs$set.size["add"]
new.obs[i, mutation.idx] <- 1
} else {
mutation.idx <-
ifelse(idxs$set.size["remove"] > 1, sample(idxs$idxs.remove, 1),
idxs$idxs.remove)
option.set.size[i] <- idxs$set.size["remove"]
new.obs[i, mutation.idx] <- 0
}
} else if (version == "2" || version == "3") {
option.set.size[i] <- sum(idxs$set.size)
# Choose one index randomly
idx <- sample.int(option.set.size[i], 1)
mutation.idx <- na.omit(c(idxs$idxs.add, idxs$idxs.remove))[idx]
# Check whether move corresponds to an adding move
add <- ifelse(idx <= idxs$set.size["add"], TRUE, FALSE)
new.obs[i, mutation.idx] <- ifelse(add, 1, 0)
}
}
}
return(list("X"=new.obs, "option.set.size"=option.set.size))
}
#' @description draw L samples from the proposal. Two-steps proposal: (1) make
#' compatible and (2) perturb genotype
#'
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param L number of samples to be drawn from the proposal
#' @param parents list of parents per node. It should have been computed on the
#' transitively closed poset
#' @param children list of children per event. It should have been computed on
#' the transitively closed poset
#' @param compatible variable indicating whether the genotype is compatible
#' (\code{1}) with current poset or not (\code{0})
#' @param eps error rate
#' @param perturb.prob probability of perturbing a genotype. Genotypes are
#' perturbed in order to learn the error rate, \eqn{\epsilon}. Defaults to
#' \code{0.8}
#' @param version indicates which version of the \code{"add-remove"} sampling
#' scheme to use
draw.samples <- function(genotype, L, parents, children, compatible, eps,
perturb.prob=0.8, version=c("1", "2", "3")) {
version <- match.arg(version)
p <- length(genotype) # number of events/mutations
if (compatible) {
# if genotype is compatible with poset, we can inmediately go to step 2:
# perturb
compatible.obs <- matrix(genotype, nrow=L, ncol=p, byrow=TRUE)
make.compatible.prob <- NA
} else {
# if genotype is NOT compatible with poset, we need to generate L compatible
# versions by adding or removing observations
compatible.by.adding <- add.relations(genotype, children)
compatible.by.removing <- remove.relations(genotype, parents)
if (version == "1" | version == "2") {
# Both moves, add and remove, have equal probability
# Move add (remove) wouldn't be applicable if genotype is the resistant
# type (wild type). However, such genotypes are always compatible with
# the poset
add <- ifelse(runif(L, 0, 1) < 0.5, 1, 0)
make.compatible.prob <- rep(0.5, L)
} else if (version == "3") {
# Choose to add or remove observations according to the number of
# modifications
dist.add <- hamming.dist(genotype, compatible.by.adding)
dist.remove <- hamming.dist(genotype, compatible.by.removing)
add.prob <- eps^dist.add / (eps^dist.add + eps^dist.remove)
add <- ifelse(runif(L, 0, 1) < add.prob, 1, 0)
make.compatible.prob <- ifelse(add == 1, add.prob, 1 - add.prob)
}
compatible.obs <- matrix(0, L, p)
if (all(add == 1)) {
compatible.obs <- matrix(compatible.by.adding, nrow=L, ncol=p, byrow=TRUE)
} else if (all(add == 0)) {
compatible.obs <-
matrix(compatible.by.removing, nrow=L, ncol=p, byrow=TRUE)
} else {
idxs.add <- which(add == 1)
compatible.obs[idxs.add, ] = matrix(compatible.by.adding, nrow=sum(add),
ncol=p, byrow=TRUE)
compatible.obs[-idxs.add, ] = matrix(compatible.by.removing, nrow=L-sum(add),
ncol=p, byrow=TRUE)
}
}
samples <-
perturb(compatible.obs, parents, children, perturb.prob, compatible,
version=version)
return(list("samples"=samples, "make.compatible.prob"=make.compatible.prob))
}
#' @description random generation for the truncated exponential distribution
#'
#' @param x upper bound on the observation
#' @param rate rate of the exponential distribution (i.e. mean 1/`rate`)
rtexp <- function(x, rate) {
rand_num <- runif(1, 0, 1)
Z <- ifelse(x > 0, -log(1 - rand_num * (1 - exp(-rate * x))) / rate, 0)
return(Z)
}
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param poset a matrix containing the cover relations
#' @param lambdas a vector of the rate parameters
#' @param lambda.s rate of the sampling process. Defaults to \code{1.0}
#' @param sampling.time optional argument specifying the sampling time
sample.mutation.times <- function(genotype, poset, lambdas, lambda.s=1.0,
sampling.time=NULL) {
p <- length(genotype) # number of events/mutations
parents <- get.parents(poset)
if (is.null(sampling.time))
sampling.time = rexp(1, rate=lambda.s)
Tdiff <- numeric(p)
Tsum <- numeric(p)
dens <- 0
topo.path <- my.topological.sort(poset)
for (j in topo.path) {
max.time.parents <- 0
if (length(parents[[j]]) > 0) {
for (u in 1:length(parents[[j]])) {
if(Tsum[parents[[j]][u]] > max.time.parents)
max.time.parents <- Tsum[parents[[j]][u]]
}
}
if (genotype[j] == 1) {
# If mutation is observed,
# Z ~ TExp(lambda, 0, sampling.time - max.time.parents)
# When a lot of mutations have occurred max.time.parents approaches the
# 'sampling.time'. Due to numerical precision problems,
# 'sampling.time' < 'max.time.parents'. Therefore, we have to make sure
# that when it happens rtexp returns 0
# dexp: P(Z)
# pexp: P(Z <= sampling.time - max.time.parents)
Z <- rtexp(sampling.time - max.time.parents, rate=lambdas[j])
Tsum[j] <- max.time.parents + Z
dens <- dens + dexp(Z, rate=lambdas[j], log=TRUE) -
pexp(sampling.time - max.time.parents, rate=lambdas[j], log.p=TRUE)
} else {
# If mutation is not observed, Z ~ Exp(lambda)
Z <- rexp(1, rate=lambdas[j])
Tsum[j] <- max(sampling.time, max.time.parents) + Z
dens <- dens + dexp(Z, rate=lambdas[j], log=TRUE)
}
Tdiff[j] <- Tsum[j] - max.time.parents
}
return(c("density"=dens, "Tdiff"=Tdiff))
}
#' @param Tdiff matrix of expected time differences
#' @param rate vector of exponential rates
log.cbn.density <- function(Tdiff, rate) {
dens <- sum(log(rate)) - sum(rate * Tdiff)
return(dens)
}
#' @param genotype a binary vector indicating whether an event has been
#' observed (\code{1}) or not (\code{0})
#' @param L number of samples to be drawn from the proposal
#' @param poset a matrix containing the cover relations
#' @param lambdas a vector of exponential rates
#' @param lambda.s rate of the sampling process
#' @param eps error rate
#' @param sampling.time an optional argument specifying the sampling time
#' @param sampling type of sampling scheme. OPTIONS: \code{"forward"},
#' \code{"add-remove"}, \code{"backward"}, \code{"bernoulli"}, or \code{"pool"}
#' @param perturb.prob probability of perturbing a genotype. Genotypes are
#' perturbed in order to learn the error rate, \eqn{\epsilon}. Defaults to
#' \code{0.8}
#' @param version an optional argument indicating which version of the
#' \code{"add-remove"} sampling scheme to use. Defaults to \code{3}
#' @param dist.pool Hamming distance between \code{genotype} and the genotype
#' pool. This option is used if \code{sampling} is set to \code{"add-remove"}
#' @param seed seed for reproducibility
#' @return returns a list containing the importance weights and the sufficient
#' statistics. If \code{sampling} is set to \code{"pool"}, it returns the
#' importance weights and the indices of choosed observations from the genotype
#' pool
importance.weight_ <- function(
genotype, L, poset, lambdas, lambda.s, eps, sampling.time=NULL,
sampling=c('forward', 'add-remove', 'pool'), perturb.prob=0.8, version="3",
dist.pool=NULL, seed=NULL) {
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="L'Ecuyer-CMRG")
p <- ncol(poset) # number of events/mutations
if (sampling == 'forward') {
# Generate L samples from poset with parameters 'lambdas' and 'lambda.s'
# In particular, epsilon is zero (default value) - because the idea is to
# generate samples of X (underlying true)
### **TODO** ### what to do when sampling times available. At the moment,
# not considered to generate samples - incorporate sampling times in
# function sample_genotypes
if (!is.null(sampling.time)) {
warning("Forward proposal doesn't account for sampling times")
}
samples <- sample_genotypes(L, poset, sampling_param=lambda.s, lambdas=lambdas)
d <- apply(samples$hidden_genotypes, 1, hamming.dist, y=genotype)
prob.Y.X <- eps^d * (1-eps)^(p-d)
return(list("w"=prob.Y.X, "time.differences"=samples$T_events, "dist"=d))
} else if (sampling == 'add-remove') {
poset.trans.closed <- trans_closure(poset)
parents <- get.parents(poset.trans.closed)
children <- get.children(poset.trans.closed)
compatible <- is_compatible(genotype, poset)
# Generate L samples according to proposal
return.list <-
draw.samples(genotype, L, parents, children, compatible, eps,
perturb.prob, version)
make.compatible.prob <- return.list$make.compatible.prob
samples <- return.list$samples
# Generate mutation times Tj from sample i
Tdiff <- apply(samples$X, 1, sample.mutation.times, poset=poset,
lambdas=lambdas, lambda.s=lambda.s,
sampling.time=sampling.time)
log.proposal.X <- Tdiff["density", ]
Tdiff <- t(tail(Tdiff, n=-1))
# Hamming distance bewteen samples and genotype
dist <- apply(samples$X, 1, hamming.dist, y=genotype)
# Computing log(Pr(Y|X))
if (eps == 0) {
# NOTE: If all observations are compatible with poset and pertub_prob == 0
# (which can happen because noisy observations can be compatible),
# then eps can be 0 and dist 0
log.Pr.Y.X <-
ifelse(dist == 0, 0,
dist * log(eps + .Machine$double.eps) +
(p - dist) * log(1 - (eps + .Machine$double.eps)))
} else {
log.Pr.Y.X <- dist*log(eps) + (p - dist) * log(1 - eps)
}
# Computing log(Pr(X))
log.Pr.X <- apply(Tdiff, 1, log.cbn.density, rate=lambdas)
# Computing density of the proposal - for correction
# proposal : weight accounting for making genotype compatible + weight
# accounting for choosing a possible mutation for perturbing
# the sample + weight accounting for sampling times from a
# truncated exponential
num.options <- samples$option.set.size
if (version == "1") {
log.proposal.Y.X <-
ifelse(is.na(num.options) | num.options == 0, log(1 - perturb.prob),
log(perturb.prob) + log(0.5) + log(1/num.options))
} else if (version == "2" || version == "3") {
stopifnot(all(na.omit(num.options) != 0))
log.proposal.Y.X <- ifelse(is.na(num.options), log(1 - perturb.prob),
log(perturb.prob) + log(1 / num.options))
}
if (!compatible)
log.proposal.Y.X <- log.proposal.Y.X + log(make.compatible.prob)
log.proposal <- log.proposal.Y.X + log.proposal.X
importance.weight <- exp(log.Pr.X + log.Pr.Y.X - log.proposal)
return(list("w"=importance.weight, "time.differences"=Tdiff, "dist"=dist))
} else if (sampling == 'pool') {
### **TODO** ### what to do when sampling times available. At the moment,
# not considered to generate samples - incorporate sampling times in
# function sample_genotypes
if (is.null(dist.pool)) {
stop("Vector of distances expected for pool sampling")
}
# if (is.null(genotype_pool)) {
# stop("Pool of genotypes are expected for pool sampling")
# }
K <- length(dist.pool)
# compatible = is_compatible(genotype, poset)
aux <- eps^(dist.pool) * (1-eps) ^ (p - dist.pool)
# if (compatible) {
# genotype_pool <- rbind(genotype_pool, genotype)
# q.prob <- c(aux, 1 - sum(aux))
# idxs <- sample(seq(1, K + 1), L, replace=TRUE, prob=q.prob)
# } else {
# q.prob <- aux/sum(aux)
# idxs <- sample(seq(1, K), L, replace=TRUE, prob=q.prob)
# }
q.prob <- aux / sum(aux)
idxs <- sample(seq(1, K), L, replace=TRUE, prob=q.prob)
# Generate mutation times Tj from sample i
# Tdiff <-
# apply(genotype_pool[idxs, ], 1, sample.mutation.times, poset=poset,
# lambdas=lambdas, lambda.s=lambda.s, sampling.time=sampling.time)
#
# log.proposal.X = Tdiff["density", ]
# Tdiff = t(tail(Tdiff, n=-1))
# Hamming distance bewteen samples and genotype
dist <- dist.pool[idxs]
# Computing log(Pr(Y|X))
if (eps == 0) {
# NOTE: If all obs ervations are compatible with poset and pertub_prob == 0
# (which can happen because noisy observations can be compatible),
# then eps can be 0 and dist 0
log.Pr.Y.X <-
ifelse(dist == 0, 0,
dist * log(eps + .Machine$double.eps) +
(p - dist) * log(1 - (eps + .Machine$double.eps)))
} else {
log.Pr.Y.X <- dist * log(eps) + (p - dist) * log(1 - eps)
}
# Computing log(Pr(X))
# log.Pr.X <- apply(Tdiff, 1, log.cbn.density, rate=lambdas)
# Computing density of the proposal - for correction
log.proposal <- log(q.prob[idxs]) + log(K) #+ log.proposal.X
# importance.weight <- exp(log.Pr.X + log.Pr.Y.X - log.proposal)
importance.weight <- exp(log.Pr.Y.X - log.proposal)
# return(list("w"=importance.weight, "time.differences"=Tdiff, "dist"=dist))
return(list("w"=importance.weight, "idxs"=idxs))
}
}
#' @description compute Pr(Y) using Monte Carlo sampling, where `Y` corresponds
#' to the observed genotype
#'
#' @inheritParams importance.weight_
#' @param weight.remove a numeric vector of length \code{p} containing the
#' weights for choosing events to be removed. This option is used if
#' \code{sampling} is set to \code{"add-remove"}
#' @param sampling.pool an optional list containing mutation times and
#' occurrence (cumulative mutation) times. Otherwise the size of the pool. This
#' option is used if \code{sampling} is set to \code{"pool"}
#' @param neighborhood.dist an integer value indicating the Hamming distance
#' between the \code{'genotype'} and the samples generated by \code{"backward"}
#' sampling. This option is used if \code{sampling} is set to \code{"backward"}.
#' Defaults to \code{1}
prob.importance.sampling <- function(
genotype, L, poset, lambdas, lambda.s, eps, sampling.time=NULL,
sampling=c('forward', 'add-remove', 'backward', 'bernoulli', 'pool'),
perturb.prob=0.8, version="3", weight.remove=numeric(0), sampling.pool=200,
neighborhood.dist=1L, seed=NULL) {
sampling <- match.arg(sampling)
if (is.null(seed))
seed <- sample.int(3e4, 1)
if (sampling == "add-remove" & version != "4") {
probs <-
importance.weight_(genotype, L, poset, lambdas, lambda.s, eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version, seed=seed)
} else {
dist.pool <- integer(0)
Tdiff.pool <- matrix(0)
if (sampling == "pool") {
if (is.numeric(sampling.pool)) {
K <- sampling.pool
sampling.pool <- sample.times(K, poset, lambdas, seed=seed)
}
# NOTE: A new seed is required, to restart draws of the
# 'std::exponential_distribution'
set.seed(seed, kind="L'Ecuyer-CMRG")
new.seed <- sample.int(3e4, 1)
genotype.pool <-
generate.genotypes(sampling.pool$mutation_times, poset,
sampling.time=sampling.time, lambda.s=lambda.s,
seed=new.seed)
dist.pool <- apply(genotype.pool$samples, 1, hamming.dist, y=genotype)
Tdiff.pool <- sampling.pool$Tdiff
}
probs <-
importance.weight(genotype, L, poset, lambdas, eps, time=sampling.time,
sampling=sampling, weight.remove=weight.remove,
dist.pool=dist.pool, Tdiff.pool=Tdiff.pool,
neighborhood.dist=as.integer(neighborhood.dist),
lambda.s=lambda.s, seed=as.integer(seed))
}
probs <- probs$w
if (sampling == "backward") {
probs <- probs[probs > 0]
L <- sum(probs > 0)
}
return(sum(probs) / L)
}
#' @description compute expected time differences for a given genotype using
#' Monte Carlo sampling
#'
#' @inheritParams importance.weight_
tdiff.importance.sampling <- function(
genotype, L, poset, lambdas, lambda.s, eps, sampling.time=NULL,
sampling=c('forward', 'add-remove', 'pool'), perturb.prob=0.8,
version="3", dist.pool=NULL, seed=NULL) {
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="default")
importance.weights <-
importance.weight_(genotype, L, poset, lambdas, lambda.s, eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version,
dist.pool=dist.pool)
return(colSums(importance.weights$w * importance.weights$time.differences) /
sum(importance.weights$w))
}
#' @description compute expected Hamming distance between a given observation
#' (genotype) and the underlying/true genotype using Monte Carlo sampling
#'
#' @inheritParams importance.weight_
dist.importance.sampling <- function(
genotype, L, poset, lambdas, lambda.s, eps, sampling.time=NULL,
sampling=c('forward', 'add-remove', 'pool'), perturb.prob=0.8,
version="3", dist.pool=NULL, seed=NULL) {
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="default")
importance.weights <-
importance.weight_(genotype, L, poset, lambdas, lambda.s, eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version,
dist.pool=dist.pool)
return(sum(importance.weights$dist * importance.weights$w) /
sum(importance.weights$w))
}
#' @param events a matrix of observations or a vector containing a single
#' genotype, if \code{one.genotype} is set to \code{TRUE}
#' @param L number of samples to be drawn from the proposal
#' @param poset a matrix containing the cover relations
#' @param lambdas a vector of exponential rates
#' @param lambda.s rate of the sampling process
#' @param eps error rate
#' @param rep an optional argument indicating the number of repetitions. This
#' option is used if \code{one.genotype} is set to \code{TRUE}
#' @param one.genotype a boolean variable indicating whether only one genotype
#' is provided
#' @param sampling.times an optional vector of sampling times per observation
#' @param sampling type of sampling scheme. OPTIONS: \code{"forward"},
#' \code{"add-remove"}, \code{"backward"}, \code{"bernoulli"}, or \code{"pool"}
#' @param perturb.prob probability of perturbing a genotype. This option is
#' used if \code{sampling} is set to \code{"add-remove"}. Defaults to
#' \code{0.8}
#' @param version an optional agrument indicating which version of the
#' \code{"add-remove"} sampling scheme to use. Defaults to \code{3}
#' @param sampling.pool an optional list containing mutation times and
#' occurrence (cumulative mutation) times. Otherwise the size of the pool. This
#' option is used if \code{sampling} is set to \code{"pool"}
#' @param outdir an optional argument indicating the path to the output
#' directory
#' @param outname an optional argument indicating the suffix to include in the
#' output file name
#' @param binwidth an optional argument indicating the width of the histrogram
#' bins. This option is used if \code{one.genotype} is set to \code{TRUE}
#' @param seed seed for reproducibility
prob.empirical_vs_sampling <- function(
events, L, poset, lambdas, lambda.s, eps, rep=NULL, one.genotype=FALSE,
sampling.times=NULL,
sampling=c('forward', 'add-remove', 'backward', 'bernoulli', 'pool'),
perturb.prob=0.8, version="3", sampling.pool=200, neighborhood.dist=1L,
outdir=NULL, outname="", binwidth=0.01, seed=NULL) {
sampling <- match.arg(sampling)
if (is.null(seed))
seed <- sample.int(1e9, 1)
if (one.genotype) {
if (length(sampling.times) > 1) {
warning("Only one sampling time was expected. First entry of vector ",
"\'sampling.times\' is used.")
sampling.times <- sampling.times[1]
}
genotype <- events
sampling.time <- sampling.times
N <- rep
} else {
N <- nrow(events)
}
set.seed(seed, kind="L'Ecuyer-CMRG")
seeds <- sample.int(3e4, N)
if (sampling == "add-remove" & version == "4") {
weight.remove <- .Call("_scale_path_to_mutation",
matrix(as.integer(poset), nrow=p, ncol=p), lambdas)
} else {
weight.remove <- numeric(0)
}
probs <- foreach(i=1:N, .combine='rbind', .packages="mccbn") %dopar% {
if (!one.genotype) {
genotype <- events[i, ]
sampling.time <- sampling.times[i]
}
prob.empirical <-
probY.empirical(N=100000, poset, lambdas, lambda.s, genotype, eps=eps)
prob.sampling <-
prob.importance.sampling(genotype, L, poset, lambdas, lambda.s, eps=eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version,
weight.remove=weight.remove,
sampling.pool=sampling.pool,
neighborhood.dist=as.integer(neighborhood.dist),
seed=seeds[i])
return(c(prob.empirical, prob.sampling))
}
if (!is.null(outdir)) {
outdir <- file.path(outdir, paste("L", L, "_", sampling, sep=""))
if (!dir.exists(outdir))
dir.create(outdir)
outname <- file.path(outdir, paste("probability_Y_empirical_vs_sampling",
outname, ".pdf", sep=""))
cat("Saving plot at \'", outname,"\'\n", sep="")
if (one.genotype) {
xlab <- expression(P(Y))
ylab <- ""
} else {
xlab <- expression(P[empirical](Y))
ylab <- expression(widehat(P)(Y))
}
pl.empirical_vs_sampling(
empirical=probs[, 1], sampling=probs[, 2], xlab=xlab, ylab=ylab, N=N,
one.genotype=one.genotype, outname=outname, binwidth=binwidth)
}
return(list("empirical"=probs[, 1], "sampling"=probs[, 2]))
}
#' @inheritParams prob.empirical_vs_sampling
tdiff.empirical_vs_sampling <- function(
events, L, poset, lambdas, lambda.s, eps, rep=NULL, one.genotype=FALSE,
sampling.times=NULL, sampling=c('forward', 'add-remove', 'pool'),
perturb.prob=0.8, version="3", genotype.pool=NULL, outdir=NULL, outname="",
binwidth=0.01, seed=NULL) {
# NOTE: If 'forward' sampling is employed, sampling times are not used.
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="default")
if (one.genotype) {
if (length(sampling.times) > 1) {
warning("Only one sampling time was expected. First entry of vector ",
"\'sampling.times\' is used.")
sampling.times <- sampling.times[1]
}
genotype <- events
sampling.time <- sampling.times
p <- length(genotype)
N <- rep
} else {
N <- nrow(events)
p <- ncol(events)
}
time.diff.empirical <- matrix(0, ncol=p, nrow=N)
time.diff.sampling <- matrix(0, ncol=p, nrow=N)
for (i in 1:N) {
if (!one.genotype) {
genotype <- events[i, ]
sampling.time <- sampling.times[i]
}
time.diff.empirical[i, ] <-
tdiff.empirical(N=100000, poset, lambdas, lambda.s, genotype, eps=eps)
time.diff.sampling[i, ] <-
tdiff.importance.sampling(genotype, L=L, poset, lambdas, lambda.s, eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version)
}
if (!is.null(outdir)) {
outdir <- file.path(outdir, paste("L", L, "_", sampling, sep=""))
if (!dir.exists(outdir))
dir.create(outdir)
for (j in 1:p) {
pl.name <- file.path(outdir, paste("time_diff_empirical_vs_sampling",
outname, "_j", j, ".pdf", sep=""))
if (one.genotype) {
xlab = expression(Z[j])
ylab = ""
} else {
xlab = substitute(paste(Z[j]), list(j=j))
ylab = substitute(paste(widehat(Z)[j]), list(j=j))
}
pl.empirical_vs_sampling(empirical=time_diff_empirical[, j],
sampling=time.diff.sampling[, j],
xlab=xlab, ylab=ylab, N=N,
one.genotype=one.genotype, outname=pl.name,
binwidth=binwidth)
}
}
return(list("empirical"=time.diff.empirical, "sampling"=time.diff.sampling))
}
#' @inheritParams prob.empirical_vs_sampling
dist.empirical_vs_sampling <- function(
events, L, poset, lambdas, lambda.s, eps, rep=NULL, one.genotype=FALSE,
sampling.times=NULL, sampling=c('forward', 'add-remove', 'pool'),
perturb.prob=0.8, version="3", genotype_pool=NULL, outdir=NULL, outname="",
binwidth=0.01, seed=NULL) {
# NOTE: If 'forward' sampling is employed, sampling times are not used.
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="default")
if (one.genotype) {
if (length(sampling.times) > 1) {
warning("Only one sampling time was expected. First entry of vector ",
"\'sampling.times\' is used.")
sampling.times <- sampling.times[1]
}
genotype <- events
sampling.time <- sampling.times
N <- rep
} else {
N <- nrow(events)
}
dist.empirical <- numeric(N)
dist.sampling <- numeric(N)
for (i in 1:N) {
if (!one.genotype) {
genotype <- events[i, ]
sampling.time <- sampling.times[i]
}
dist.empirical[i] <-
dist.empirical(N=100000, poset, lambdas, lambda.s, genotype, eps=eps)
dist.sampling[i] <-
dist.importance.sampling(genotype, L=L, poset, lambdas, lambda.s, eps,
sampling.time=sampling.time, sampling=sampling,
perturb.prob=perturb.prob, version=version)
}
if (!is.null(outdir)) {
outdir <- file.path(outdir, paste("L", L, "_", sampling, sep=""))
if (!dir.exists(outdir))
dir.create(outdir)
outname <- file.path(outdir, paste("hamming_dist_empirical_vs_sampling",
outname, ".pdf", sep=""))
if (one.genotype) {
xlab <- expression(d(X,Y))
ylab <- ""
} else {
xlab <- expression(d[empirical](X,Y))
ylab <- expression(widehat(d)(X,Y))
}
pl.empirical_vs_sampling(empirical=d.empirical, sampling=d.sampling,
xlab=xlab, ylab=ylab, N=N,
one.genotype=one.genotype, outname=outname,
binwidth=binwidth)
}
return(list("empirical"=dist.empirical, "sampling"=dist.sampling))
}
#' @param empirical,sampling numeric vectors corresponding to the x and y axes,
#' respectively
#' @param xlab a title for the x axis
#' @param ylab an optional title for the y axis
#' @param truth an optional argument indicating the exact value
#' @param N an optional argument indicating the number of repetitions. This
#' option is used if \code{one.genotype} is set to \code{TRUE}
#' @param one.genotype a boolean variable indicating whether only one genotype
#' is provided
#' @param outname an optional argument indicating the output file name
#' @param binwidth an optional argument indicating the width of the histrogram
#' bins. This option is used if \code{one.genotype} is set to \code{TRUE}
pl.empirical_vs_sampling <- function(
empirical, sampling, xlab, ylab="", truth=NULL, N=NULL, one.genotype=FALSE,
outname=NULL, binwidth=0.01) {
if (one.genotype) {
df <- data.frame(x=c(empirical, sampling),
method=c(rep("empirical", N), rep("sampling", N)))
pl <- ggplot(df, aes(x = x, fill = method))
pl <- pl + geom_histogram(binwidth=binwidth, alpha=0.5, position="identity") +
geom_vline(xintercept=truth, colour="#BB0000") +
labs(x=xlab, y="Frequency") +
theme_bw() + theme(text=element_text(size=14))
if (is.null(outname)) {
return(pl)
} else {
ggsave(outname, pl, width=4, height=2.5)
}
} else {
df <- data.frame(x=empirical, y=sampling)
pl <- ggplot(df, aes(x=x, y=y))
pl <- pl + geom_point() +
geom_abline(intercept=0, slope=1, colour="blue") +
labs(x=xlab, y=ylab) +
theme_bw() + theme(text=element_text(size=14))
if (is.null(outname)) {
return(pl)
} else {
ggsave(outname, pl, width=3, height=2)
}
}
}
#' @description initialization of the rate parameters
#'
#' @param obs.events a matrix of observations, where each row correponds to a
#' vector indicating whether an event has been observed (\code{1}) or not
#' (\code{0})
#' @param poset a matrix containing the cover relations
#' @param lambda.s rate of the sampling process
#' @param verbose an optional argument indicating whether to output logging
#' information
initialize.lambda <- function(obs.events, poset, lambda.s, verbose=FALSE) {
p <- nrow(poset)
lambda <- rep(0, p)
for (i in 1:p) {
parents <- which(poset[, i] == 1)
if (length(parents) == 0) {
aux <- sum(obs.events[, i]) / nrow(obs.events)
lambda[i] <- aux * lambda.s / (1 - aux)
} else {
idxs <-
which(apply(obs.events[, parents, drop=FALSE], 1,
function(x) all(x == 1)))
if (length(idxs) == 0) {
if (verbose) {
cat("No enough evidence for mutation", i, "\n")
}
lambda[i] <- 1e-7
} else {
aux <- sum(obs.events[idxs, i]) / length(idxs)
lambda[i] <- aux * lambda.s / (1 - aux)
}
}
}
lambda[is.infinite(lambda)] <- 5 * max(lambda[!is.infinite(lambda)])
return(lambda)
}
#' @title Monte Carlo Expectation Maximization
#' @description parameter estimation for the hidden conjunctive Bayesian network
#' model (H-CBN) via importance sampling
#'
#' @param poset a matrix containing the cover relations
#' @param obs.events a matrix of observations, where each row correponds to a
#' vector indicating whether an event has been observed (\code{1}) or not
#' (\code{0})
#' @param sampling.times an optional vector of sampling times per observation/
#' genotype
#' @param lambda.s rate of the sampling process. Defaults to \code{1.0}
#' @param max.iter the maximum number of EM iterations. Defaults to \code{100}
#' iterations
#' @param L number of samples to be drawn from the proposal in the E-step
#' @param sampling type of sampling scheme. OPTIONS: \code{"forward"} -
#' generate occurrence times according to current rate parameters, and, from
#' them, generate the genotypes; \code{"add-remove"} - generate genotypes from
#' observed genotypes using a two-steps proposal. First, make genotypes
#' compatible with the poset by either adding or removing mutations. Second,
#' perturb this version by adding or removing a mutation while yielding a
#' compatible observation; \code{"pool"} - generate a pool of compatible
#' genotypes according to current rate parameters and sample \code{K}
#' observations proportional to the Hamming distance.
#' @param max.lambda.val an optional upper bound on the value of the rate
#' parameters. Defaults to \code{1e6}
#' @param perturb.prob probability of perturbing a genotype. Genotypes are
#' perturbed in order to learn the error rate, \eqn{\epsilon}. A genotype is
#' perturbed, if after drawing a random number, this is smaller than
#' \code{perturb.prob}, otherwise \eqn{X_start = X}. This option is used if
#' \code{sampling} is set to \code{"add-remove"}. Defaults to \code{0.3}
#' @param version an optional argument indicating which version of the
#' \code{"add-remove"} sampling scheme to use. Defaults to \code{3}
#' @param parallel boolean variable indicating whether sampling should be
#' executed sequentially (\code{0}) or in parallel (\code{1}). Option is used
#' if \code{sampling} is set to \code{"forward"}
#' @param verbose an optional argument indicating whether to output logging
#' information
#' @param seed seed for reproducibility
MCEM.hcbn_ <- function(
poset, obs.events, sampling.times=NULL, lambda.s=1.0, max.iter=100,
burn_in=0.8, L=100, sampling=c('forward', 'add-remove', 'pool'),
max.lambda.val=1e6, perturb.prob=0.3, version="3", parallel=TRUE,
verbose=TRUE, seed=NULL) {
# NOTE: If 'forward' sampling is employed, sampling times are not used.
sampling <- match.arg(sampling)
if (!is.null(seed))
set.seed(seed, kind="L'Ecuyer-CMRG")
p <- ncol(poset) # number of events/mutations
N <- nrow(obs.events) # number of observations/genotypes
if (is.null(sampling.times)) {
avg.sampling.t <- lambda.s
} else {
avg.sampling.t <- mean(sampling.times)
if (sampling == 'forward') {
warning("Forward proposal doesn't account for sampling times")
}
}
# initialize lambdas
lambdas <-
initialize.lambda(obs.events=obs.events, poset=poset, lambda.s=lambda.s)
# NOTE: function initialize_lambda should be modify to account for cases where
# sum(obs.events[parents_i == 1, i]) is zero.
lambdas[lambdas == 0] <- 1e-7
compatible.obs <- compatible_genotypes(obs.events, poset)
lambdas <-
estimate_mutation_rates(
poset, obs.events[compatible.obs$compatible_indexes, ],
sampling.times, ilambda=lambdas, verbose=verbose)
lambdas <- lambdas$lambda
# initialize epsilon
eps <- 1 - compatible.obs$fraction
eps <- ifelse(eps == 0, runif(1, 0, 0.0001), eps)
avg.eps <- 0
avg.lambdas <- numeric(p)
avg.llhood <- 0
record.iter <- max(as.integer(burn_in * max.iter), 1)
if (sampling == 'add-remove') {
poset_trans_closed <- trans_closure(poset)
parents <- get.parents(poset_trans_closed)
children <- get.children(poset_trans_closed)
### **TODO** ### at the moment, compatibility test is performed for each genotype
#idx_compatible = apply(obs.events, 1, is_compatible, poset=poset)
}
llhood <- numeric()
iter <- 1
while(iter <= max.iter) {
if (iter > 2) {
if (abs(llhood[iter - 1] - llhood[iter - 2]) < 1e-4)
break
}
# E step
# Compute for each event j, expected.Tdiff[j] = E[T_j - max_{u \in pa(j)} T_u | Y]
# Compute expected.dist
if (sampling == 'forward') {
if (!parallel) {
expected.dist = numeric(N)
expected.Tdiff = matrix(0, nrow=N, ncol=p)
for (i in 1:N) {
# Draw L samples from poset with parameters 'lambdas' and 'lambda.s'
e.step <-
importance.weight_(genotype=obs.events[i, ], L=L, poset=poset,
lambdas=lambdas, lambda.s=lambda.s, eps=eps,
sampling.time=NULL, sampling='forward')
# Conditional expectation of the sufficient statistic d(X, Y)
expected.dist[i] = sum(e.step$w * e.step$dist) / sum(e.step$w)
# Contitional expectation of the sufficient statistic Z_j
# Z_j = t_j - max_{u \in pa(j)} t_u
expected.Tdiff[i, ] = colSums(e.step$w * e.step$time.differences) /
sum(e.step$w)
}
} else {
ret <- foreach(i=1:N, .combine='rbind', .packages="mccbn") %dopar% {
# Draw L samples from poset with parameters 'lambdas' and 'lambda.s'
e.step <-
importance.weight_(genotype=obs.events[i, ], L=L, poset=poset,
lambdas=lambdas, lambda.s=lambda.s, eps=eps,
sampling.time=NULL, sampling='forward')
# Conditional expectation of the sufficient statistic d(X, Y)
expected.dist = sum(e.step$w * e.step$dist) / sum(e.step$w)
# Contitional expectation of the sufficient statistic Z_j
# Z_j = t_j - max_{u \in pa(j)} t_u
expected.Tdiff = colSums(e.step$w * e.step$time.differences) /
sum(e.step$w)
return(c(expected.dist, expected.Tdiff))
}
expected.dist <- ret[, 1]
expected.Tdiff <- ret[, -1]
colnames(expected.Tdiff) <- NULL
}
} else if (sampling == 'add-remove') {
if (verbose) cat("E-step - ", iter, "\n")
ret <- foreach(i=1:N, .combine='rbind', .packages="mccbn") %dopar% {
# In each iteration and for each observation, draw L samples from proposal
# two-steps proposal: make compatible and perturb
# 1. Make genotypes compatible by adding or removing 1's
# 2. Perturb version of previous genotypes, but ensure it remains
# compatible with current poset
e.step <-
importance.weight_(genotype=obs.events[i, ], L=L, poset=poset,
lambdas=lambdas, lambda.s=lambda.s, eps=eps,
sampling.time=sampling.times[i],
sampling='add-remove', perturb.prob=perturb.prob,
version=version)
# Conditional expectation of the sufficient statistic d(X, Y)
expected.dist <- sum(e.step$w * e.step$dist) / sum(e.step$w)
# Contitional expectation of the sufficient statistic Z_j
# Z_j = t_j - max_{u \in pa(j)} t_u
expected.Tdiff <- colSums(e.step$w * e.step$time.differences) /
sum(e.step$w)
return(c(expected.dist, expected.Tdiff))
}
expected.dist <- ret[, 1]
expected.Tdiff <- ret[, -1]
colnames(expected.Tdiff) <- NULL
} else if (sampling == 'pool') {
K <- min(max(2^(p + 1), 2*L), 1e8 / (8 * p))
genotype_pool <-
sample_genotypes(K, poset, sampling_param=lambda.s, lambdas=lambdas)
ret <- foreach(i=1:N, .combine='rbind', .packages="mccbn") %dopar% {
# In each iteration and for each observation, draw L samples from proposal
# pool sampling: sample X genotypes according to the epsilon and
# Hamming distance to the observed genotype, Y
d_pool <-
apply(genotype_pool$obs_events, 1, hamming.dist, y=obs.events[i, ])
e.step <-
importance.weight_(genotype=obs.events[i, ], L=L, poset=poset,
lambdas=lambdas, lambda.s=lambda.s, eps=eps,
sampling.time=NULL, sampling='pool',
genotype.pool=genotype_pool$obs_events,
dist.pool=d_pool)
# Conditional expectation of the sufficient statistic d(X, Y)
expected.dist <- sum(e.step$w * d_pool[e.step$idxs]) / sum(e.step$w)
# Contitional expectation of the sufficient statistic Z_j
# Z_j = t_j - max_{u \in pa(j)} t_u
expected.Tdiff <-
colSums(e.step$w * genotype_pool$T_events[e.step$idxs, ]) /
sum(e.step$w)
return(c(expected.dist, expected.Tdiff))
}
expected.dist <- ret[, 1]
expected.Tdiff <- ret[, -1]
colnames(expected.Tdiff) <- NULL
}
if (any(is.na(expected.Tdiff))) {
cat("At iteration", iter, "unexpected value for expected time differences\n")
save(expected.Tdiff,
file=paste("expected.Tdiff_p", p, "_iter", iter, ".RData", sep=""))
}
if (any(is.na(expected.dist))) {
cat("At iteration", iter, "unexpected value for expected distance\n")
save(expected.dist,
file=paste("expected.dist_p", p, "_iter", iter, ".RData", sep=""))
}
# M step
eps <- mean(expected.dist) / p
lambdas <- 1 / apply(expected.Tdiff, 2, mean)
if (verbose) cat("M-step - ", iter, "epsilon: ", eps, "\n")
if (any(lambdas > max.lambda.val)) {
idx <- which(lambdas > max.lambda.val)
lambdas[idx] <- max.lambda.val
}
llhood <- c(llhood,
complete.loglikelihood_(lambdas, eps, expected.Tdiff,
expected.dist))
if (verbose)
cat("Iteration:", iter, "- Log-likelihood: ", llhood[iter], "\n")
if (iter > record.iter) {
if (verbose)
cat("Recording parameters .. \n")
avg.eps <- avg.eps + eps
avg.lambdas <- avg.lambdas + lambdas
avg.llhood <- avg.llhood + llhood[iter]
}
iter <- iter + 1
}
avg.eps <- avg.eps / (max.iter - record.iter)
avg.lambdas <- avg.lambdas / (max.iter - record.iter)
avg.llhood <- avg.llhood / (max.iter - record.iter)
return(list("lambdas"=lambdas, "eps"=eps, "llhood"=llhood,
"avg.lambdas"=avg.lambdas, "avg.eps"=avg.eps,
"avg.llhood"=avg.llhood))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.