### y is an nx3 matrix, where n is the number of trios
### 1st column = M, 2nd column = F, 3rd column = Offspring
### S is the number of gibbs sampler scans
### mu is the mean of the normal prior put on each class mean
### tau is the standard deviation (not variance) of the same distribution
### Put a Gamma(nu/2, nu/2 * lambda^2) on the class precisions
### Assume prior of dirichlet(a,b,c) for the joint prior on the
### probabilities of class membership
### This model assumes no independence of offspring copy number state
### to parent copy number state
#' A Gibbs Sampler for genotyping copy number in trios
#'
#' This function allows you to obtain a posterior probability distribution for copy number genotyping on Illumina arrays for an fixed genetic model where tau1=1, tau2=0.5 and tau3=0
#' @param y Data$response where data is as generated by the cn.sim.data function.
#' @param S The number of iterations for the Gibbs Sampler (suggest default 10000).
#' @param mu Defaults to -.02.
#' @param xi Defaults to sqrt(200).
#' @param nu Defaults to 3.
#' @param lambda Defaults to 0.05.
#' @param alpha Alpha1 and alpha2 and psi refer to HWE parameters as specified in Cardin et al (alpha=w, psi=lambda).
#' @keywords marimba
#'
initialize_chains <- function(gmodel, params) {
N <- params$N
K <- length(gmodel$theta)
S <- params$iter
C <- list()
for (k in 1:K){
cn.k <- attributes(table(gmodel$cn))$dimnames[[1]][k]
label <- as.vector(paste0("C",cn.k))
c <- matrix(0, N, 3)
colnames(c) <- c("m", "f", "o")
C[[length(C)+1]] <- c
names(C)[k] <- label
}
theta <- matrix(0, S, K)
sigma <- matrix(0, S, K)
tau <- matrix(0, S, 1)
logll <- rep(0, S)
##tau <- rep(0, S)
## each row of p corresponds to scan, each column is posterior probability of
## copy number 0, 1, and 2 respectively
pi <- matrix(0, nrow = S, ncol = K)
pi.child <- matrix(0, nrow = S, ncol = K)
colnames(pi) <- names(C)
colnames(pi.child) <- names(C)
list(C=C,
theta=theta,
sigma=sigma,
logll=logll,
tau=tau,
pi=pi,
pi.child=pi.child)
}
gMendelian <- function(tau.one, tau.two, tau.three, err=0){
tau1 <- tau.one-err/2
tau2 <- tau.two-err/2
tau3 <- tau.three
mendelian.probs <- array(dim=c(3, 3, 3))
genotypes <- c("BB", "AB", "AA")
dimnames(mendelian.probs) <- list(paste0("O_", genotypes),
paste0("F_", genotypes),
paste0("M_", genotypes))
mendelian.probs[, 1, 1] <- c((1 - tau3)^2, 2 * tau3 * (1 - tau3), tau3^2)
mendelian.probs[, 2, 1] <- c((1 - tau2) * (1 - tau3), tau2 * (1 - tau3) + (1 - tau2) * tau3, tau2 * tau3)
mendelian.probs[, 3, 1] <- c((1 - tau1) * (1 - tau3), tau1 * (1 - tau3) + tau3 * (1 - tau1), tau1 * tau3)
mendelian.probs[, 1, 2] <- c((1 - tau2) * (1 - tau3), tau2 * (1 - tau3) + (1 - tau2) * tau3, tau2 * tau3)
mendelian.probs[, 2, 2] <- c((1 - tau2)^2, 2 * tau2 * (1 - tau2), tau2^2)
mendelian.probs[, 3, 2] <- c((1 - tau1) * (1 - tau2), tau1 * (1 - tau2) + tau2 * (1 - tau1), tau1 * tau2)
mendelian.probs[, 1, 3] <- c((1 - tau1) * (1 - tau3), tau1 * (1 - tau3) + tau3 * (1 - tau1), tau1 * tau3)
mendelian.probs[, 2, 3] <- c((1 - tau1) * (1 - tau2), tau1 * (1 - tau2) + tau2 * (1 - tau1), tau1 * tau2)
mendelian.probs[, 3, 3] <- c((1 - tau1)^2, 2 * tau1 * (1 - tau1), tau1^2)
mendelian.probs
}
kmean_clusters <- function(dat, params){
K <- params$K
y.mf <- dat[, 1:2]
y.mfdf <- melt(y.mf)
y.mfresponse <- y.mfdf$value
kmeans.result <- kmeans(y.mfresponse, K)
kmeans.center <- kmeans.result$centers
kmeans.order <- order(kmeans.center)
kmeans.clust <- kmeans.result$cluster
kmeans.clust <- sapply(kmeans.clust, function(x) which(kmeans.order==x))
y.mfdf$cn <- kmeans.clust - 1
y.mfdf
}
kmean_clusters.add <- function(kmean.df){
# use y.mfdf as input from kmean_clusters
kmean.clust<-kmean.df$cn
kmean.df$cn<-kmean.clust+1
kmean.df
}
p_offspring <- function(cn, mendelian.probs, theta){
t(apply(cn, 1, lookup_prob, mendelian.probs=mendelian.probs, theta=theta))
}
lookup_prob <- function(cn, mendelian.probs, theta) {
att <- names(theta)[1]
if((as.numeric(att))==0) {
mendelian.probs[, cn[[2]] + 1, cn[[1]] + 1]
} else if((as.numeric(att))==1) {
mendelian.probs[, cn[[2]], cn[[1]]]
} else {
mendelian.probs[, cn[[2]] - 1, cn[[1]] - 1]
}
}
cnProb <- function(p, y, theta, sigma){
K <- length(theta)
N <- length(y)
##
## Repeat each y K times so that each y is evaluated for the K different values of theta and sigma
##
y.tmp <- rep(y, each=K)
d.y <- matrix(dnorm(y.tmp, mean=theta, sd=sigma), N, K, byrow=TRUE)
if(nrow(p) != nrow(d.y)) stop()
cn.prob <- p * d.y
cn.denom <- rowSums(cn.prob, na.rm=TRUE)
if(any(cn.denom==0)){
##
## we simulated parental states that are incompatible with mendelian transmission
##
any_incomp <- TRUE
i <- which(cn.denom==0)
cn.denom[i] <- 1
} else any_incomp <- FALSE
cn.prob <- cn.prob/cn.denom
if(any_incomp){
## let data drive (?)
cn.prob[i, ] <- d.y[i, ]/(rowSums(d.y[i, , drop=FALSE]))
}
colnames(cn.prob) <- NULL
cn.prob <- as.matrix(cn.prob)
cn.prob
}
cnProb2 <- function(p, y, theta, sigma){
N <- length(y)
K <- length(theta)
p <- matrix(p, N, K, byrow=TRUE)
cnProb(p, y, theta, sigma)
}
multi_K <- function(dat, params){
K <- 3
params$K <- K
y.mf.k3 <- initialize_gmodel_setup(dat, params)
# k2a has cn components 0,1
# k2b has cn components 1,2
K <- 2
params$K <- K
y.mf.k2a <- initialize_gmodel_setup(dat, params)
y.mf.k2b <- initialize_gmodel_setup.alt(dat, params)
#k1a has cn component 1
#k1b has cn component 2
K <- 1
params$K <- K
y.mf.k1a <- initialize_gmodel_setup.alt(dat, params)
y.mf.k1b <- initialize_gmodel_setup.alt2(dat, params)
return(list(y.mf.k3=y.mf.k3,
y.mf.k2a=y.mf.k2a,
y.mf.k2b=y.mf.k2b,
y.mf.k1a=y.mf.k1a,
y.mf.k1b=y.mf.k1b))
}
initialize_gmodel_setup <- function(dat, params){
K <- params$K
N <- params$N
y.mf <- kmean_clusters(dat[, c("m", "f")], params)
}
initialize_gmodel_setup.alt <- function(dat, params){
K <- params$K
N <- params$N
y.mf <- kmean_clusters(dat[, c("m", "f")], params)
y.mf <- kmean_clusters.add(y.mf)
}
initialize_gmodel_setup.alt2 <- function(dat, params){
K <- params$K
N <- params$N
y.mf <- kmean_clusters(dat[, c("m", "f")], params)
y.mf <- kmean_clusters.add(y.mf)
y.mf <- kmean_clusters.add(y.mf)
}
dim.reduct <- function(offspring.probs, comp){
if (comp==2){
offspring.probs <- offspring.probs[,1:2]
} else if (comp==3){
offspring.probs <- offspring.probs[,2:3]
} else if (comp==4) {
offspring.probs <- as.matrix(offspring.probs[,2])
} else if (comp==5) {
offspring.probs <- as.matrix(offspring.probs[,3])
} else offspring.probs <- offspring.probs
return (offspring.probs)
}
init.offspring.cn <- function(cn.prob, cn.mf, comp, y.odf){
if (comp < 3){
cn.o <- rMultinom(cn.prob, 1)[, 1]
y.odf$cn <- cn.o - 1
} else {
if (comp==3){
cn.o <- rMultinom(cn.prob, 1)[, 1]
y.odf$cn <- cn.o
} else {
cn.o <- cn.mf$m
y.odf$cn <- cn.o
}
}
return (y.odf)
}
compute_loglik <- function(gmodel){
y <- as.numeric(gmodel$y)
att <- attributes(gmodel$theta)
if(as.numeric(att$names[1])==0) {
z <- gmodel$cn + 1
} else if (as.numeric(att$names[1])==1) {
z <- gmodel$cn
} else {
z <- gmodel$cn - 1
}
theta <- gmodel$theta[z]
sigma <- gmodel$sigma[z]
sum(dnorm(y, theta, sigma, log=TRUE))
}
componentStats <- function(y, cn){
yy <- as.numeric(y)
cn2 <- as.numeric(cn)
ylist <- split(yy, cn2)
means <- sapply(ylist, mean, na.rm=TRUE)
sds <- sapply(ylist, sd, na.rm=TRUE)
## record just the parents for the mix probs
y.p <- as.numeric(y[, 1:2])
cn.p <- as.numeric(cn[, 1:2])
y.p.list <- split(y.p, cn.p)
ns <- sapply(y.p.list, length)
list(means=means, Ns=ns, sds=sds)
}
update_cn <- function(gmodel, params){
N <- params$N
pi <- gmodel$pi
gmodel <- update_mendel(gmodel)
mendel.prob <- gmodel$m.probs
y <- gmodel$y
sigma <- gmodel$sigma
theta <- gmodel$theta
K <- length(theta)
## parents
cn.prob.m <- cnProb2(pi, y[, "m"], theta, sigma)
cn.prob.f <- cnProb2(pi, y[, "f"], theta, sigma)
cn.mf <- update_cn.parents(cn.prob.m, cn.prob.f, gmodel, params)
# cn.mf <- replaceIfMissing2(cn.mf)
##
## offspring
##
cn.o <- update_cn.child(gmodel, params, cn.mf, mendel.prob)
cn <- cbind(cn.mf, cn.o)
colnames(cn) <- c("m", "f", "o")
cn
}
update_cn.parents <- function(probs.m, probs.f, gmodel, params){
N <- params$N
att <- attributes(gmodel$theta)
cn.prob.m <- probs.m
cn.prob.f <- probs.f
cn.mf <- cbind(rMultinom(cn.prob.m, 1)[, 1],
rMultinom(cn.prob.f, 1)[, 1])
cn.mf <- if((att$names[1])==1) cn.mf else cn.mf-1
cn.mf <- if((att$names[1])==2) cn.mf+1 else cn.mf
cn.mf
}
update_cn.child <- function (gmodel, params, cn.parents, probs) {
N <- params$N
y <- gmodel$y
theta <- gmodel$theta
sigma <- gmodel$sigma
# note for future, if mendel.prob only one prob - nulll dimension, may throw error here.
p <- p_offspring(cn.parents, probs, theta)
p.star <- cnProb(p, y[, "o"], theta, sigma)
if(FALSE){
delta <- abs(p - p.star)
index <- which(rowSums(delta > 0.5) > 0)
p[index, ]
p.star[index, ]
}
cn.o <- update_cn.child.cno(gmodel, p.star)
cn.o
}
update_cn.child.cno <- function (gmodel, probs) {
p.star <- probs
cn.o <- tryCatch(rMultinom(p.star, 1)[, 1] , error=function(e) NULL)
if(is.null(cn.o)) browser()
att<-attributes(gmodel$theta)
cn.o <- if((att$names[1])==1) cn.o else cn.o-1
cn.o <- if((att$names[1])==2) cn.o+1 else cn.o
cn.o
}
missingCnState <- function(cn.states, expected=as.character(c(0, 1, 2))){
observed.cnstates <- names(table(cn.states))
not.observed <- expected[ !expected %in% observed.cnstates ]
not.two <- names(table(cn.states)[table(cn.states)==1])
not.observed <- c(not.observed, not.two)
not.observed <- unique(not.observed)
}
replaceIfMissing <- function(cn.states, expected=as.character(c(0, 1, 2))){
missing.states <- missingCnState(cn.states)
if(length(missing.states) > 0){
cn.states[sample(seq_along(cn.states), 4)] <- as.numeric(missing.states)
}
cn.states
}
replaceIfMissing2 <- function(cn.states, expected=as.character(c(0, 1, 2))){
cn.states <- as.matrix(cn.states)
missing.states <- missingCnState(cn.states)
if(length(missing.states) > 0){
cn.states[sample(seq_along(cn.states), 2)] <- as.numeric(missing.states)
}
cn.states
}
balance.cn.all <- function(gmodel, params){
params <- params
N <- params$N
cn.states <- gmodel$cn
result <- replaceIfMissing(cn.states)
result
}
update_theta <- function(ymns, sigma, ns, params){
mu.0 <- params$mu
xi <- params$xi ## prior variance
## let phi denote inverse variance
data.precision <- ns * 1/sigma^2
prior.precision <- 1/xi
post.prec <- prior.precision + data.precision
mu.n <- (mu.0 * prior.precision + ymns * data.precision) / post.prec
K <- length(ymns)
rnorm(K, mu.n, sqrt(1/post.prec))
}
ybar <- function(gmodel){
yy <- as.numeric(gmodel$y)
cn <- factor(as.numeric(gmodel$cn))
mns <- tapply(yy, cn, mean, na.rm=TRUE)
if(any(is.na(mns))) stop("NAs in variances")
mns
}
yvar <- function(gmodel){
yy <- as.numeric(gmodel$y)
cn <- factor(as.numeric(gmodel$cn))
vars <- tapply(yy, cn, var, na.rm=TRUE)
if(any(is.na(vars))) stop("NAs in variances")
vars
}
n <- function(gmodel){
cn <- as.numeric(table(as.numeric(gmodel$cn)))
cn
}
n_parents <- function(gmodel){
yy <- as.numeric(gmodel$y[, c("m", "f")])
cn <- as.numeric(gmodel$cn[, 1:2])
tab <- tapply(yy, cn, length)
freq <- setNames(rep(0L, length(gmodel$theta)),
attributes(tab)$dimnames[[1]])
freq[names(tab)] <- tab
freq
}
n_child <- function(gmodel){
yy <- as.numeric(gmodel$y[, c("o")])
cn <- as.numeric(gmodel$cn[, 3])
tab <- tapply(yy, cn, length)
freq <- setNames(rep(0L, length(gmodel$theta)),
attributes(tab)$dimnames[[1]])
freq[names(tab)] <- tab
freq
}
nu <- function(params) params$nu
## mean: shape/rate
## var: shape/rate^2
update_sigma <- function(params, ns, theta, ymns, yvars){
nu.0 <- params$nu
s2.0 <- params$sigma2.0
nu.n <- nu.0 + ns
s2.n <- 1/nu.n * (nu.0*s2.0 + (ns-1)*yvars + ns*(ymns-theta)^2)
K <- length(ns)
prec <- rgamma(K, nu.n/2, nu.n*s2.n/2)
sqrt(1/prec)
}
# key module for different models
gmodel_oneiter <- function(gmodel, params){
N <- params$N
y <- gmodel$y
pi <- gmodel$pi
pi.child <- gmodel$pi.child
theta <- gmodel$theta
sigma <- gmodel$sigma
K <- length(theta)
##
## update component labels
##
if(TRUE){
cn <- update_cn(gmodel, params)
} else cn <- gmodel$cn
gmodel$cn <- cn
if(TRUE){
gmodel$cn <- balance.cn.all(gmodel, params)
}
##
## update theta
##
ymeans <- ybar(gmodel)
yvars <- yvar(gmodel)
ns <- n(gmodel)
if(TRUE){
theta <- update_theta(params=params,
ymns=ymeans,
sigma=sigma,
ns=ns)
} else theta <- gmodel$theta
gmodel$theta[] <- theta
##
## update sigma
##
if(TRUE){
sigma <- update_sigma(params=params,
ns=ns,
theta=theta,
ymns=ymeans,
yvars=yvars)
} else sigma <- gmodel$sigma
gmodel$sigma[] <- sigma
## update mixture probabilities
## -- only for parents
if(TRUE){
pi <- update_pi(gmodel, params)
} else pi <- gmodel$pi
gmodel$pi[] <- pi
if(TRUE){
pi.child <- update_pi.child(gmodel, params)
} else pi.child <- gmodel$pi.child
gmodel$pi.child[] <- pi.child
# update transmission matrix
if(TRUE){
gmodel <- updateTransmissionProb(gmodel, params)
}
# update log likelihood
gmodel$logll <- compute_loglik(gmodel)
gmodel
}
update_pi <- function(gmodel, params){
a <- params$a
ns <- n_parents(gmodel)
d <- length(ns)
a <- a[1:d]
alpha.n <- ns + a
p <- tryCatch(rdirichlet(1, alpha.n), error=function(e) NULL)
if(is.null(p)) browser()
p
}
update_pi.child <- function(gmodel, params){
a <- params$a
ns <- n_child(gmodel)
d <- length(ns)
a <- a[1:d]
alpha.n <- ns + a
p <- tryCatch(rdirichlet(1, alpha.n), error=function(e) NULL)
if(is.null(p)) browser()
p
}
update_tau <- function(gmodel, params){
a <- params$a
N <- params$N
ns <- n_parents(gmodel)
d <- length(ns) - 1
bvec0 <- a[1:d]
nvec <- as.numeric(n_child(gmodel))
allele.freq <- DirichSampHWE(nvec,bvec0, 1)
pvec <- allele.freq$pvec[2]
}
update_tau.env <- function(gmodel, params){
a <- params$a
N <- params$N
ns <- n_parents(gmodel)
d <- length(ns) - 1
bvec0 <- a[1:d]
nvec <- as.numeric(n_child(gmodel))
# the line below will throw an error if balance.cn.all is not working properly to balance 3 components
# bvec0 should always be c(1,1) and nvec a length 3 vector for 3 component model
allele.freq <- DirichSampHWE(nvec,bvec0, 1)
# to generalise to multi-components, the line below will need to be revised.
pvec <- allele.freq$pvec[2]
denom <- 2 * ns[[3]] + ns[[2]]
numer <- pvec * 2* N
tau.env <- numer / denom
gmodel$tau[] <- tau.env
tau <- gmodel$tau
tau
}
update_tau.intermed <- function(gmodel, params){
# see notes in update_tau.env
a <- params$a
N <- params$N
ns <- n_parents(gmodel)
d <- length(ns) - 1
bvec0 <- a[1:d]
nvec <- as.numeric(n_child(gmodel))
allele.freq <- DirichSampHWE(nvec,bvec0, 1)
pvec <- allele.freq$pvec[2]
denom <- ns[[2]]
# numerator here is just for heterozygotes in kids so calculated accordingly
numer <- pvec * 2 * N * (nvec[2] / (nvec[2] + 2 * nvec[3]))
tau.intermed <- numer / denom
gmodel$tau[] <- tau.intermed
tau <- gmodel$tau
tau
}
update_mendel <- function(gmodel){
m.probs <- gmodel$m.probs
att <- attributes(gmodel$theta)
dim <- as.numeric(att$names) + 1
dim <- sort(dim)
#dim.l <- length(dim)
dim.max <- max(dim)
dim.min <- min(dim)
subset <- c(dim.min:dim.max)
m.probs <- m.probs[subset, subset, subset]
gmodel$m.probs <- m.probs
gmodel
}
# this module updates both the Mendelian transmission matrix as well as the taus where relecant
updateTransmissionProb <- function(gmodel, params){
#ncp <- params$ncp
if(params$model == "Genetic") {
m.probs <- gMendelian(tau.one<-params$tau.one, tau.two<-params$tau.two, tau.three<-params$tau.three, err<-params$error)
gmodel$m.probs <- m.probs
gmodel
} else if(params$model == "Environmental") {
tau <- update_tau.env(gmodel, params)
gmodel$tau <- tau
m.probs <- gMendelian(tau.one<-gmodel$tau, tau.two<-gmodel$tau, tau.three<-gmodel$tau, err<-params$error)
gmodel$m.probs <- m.probs
gmodel
} else {
tau <- update_tau.intermed(gmodel, params)
gmodel$tau <- tau
m.probs <- gMendelian(tau.one<-params$tau.one, tau.two<-gmodel$tau, tau.three<-params$tau.three, err<-params$error)
gmodel$m.probs <- m.probs
gmodel
}
return (gmodel)
}
# move_chains <- function(chains, gmodel, i){
#generalised chain movement
#K <- length(gmodel$theta)
#chains$C$C0 <- chains$C$C0 + 1L * (gmodel$cn == 0)
#chains$C$C1 <- chains$C$C1 + 1L * (gmodel$cn == 1)
#chains$C$C2 <- chains$C$C2 + 1L * (gmodel$cn == 2)
##
##
#chains$theta[i, ] <- gmodel$theta
#chains$sigma[i, ] <- gmodel$sigma
#chains$logll[i] <- compute_loglik(gmodel)
#chains$pi[i, ] <- gmodel$pi
#chains$tau[i] <- gmodel$tau
#chains
#}
move_chains <- function(chains, gmodel, i){
##generalised chain movement
K <- length(gmodel$theta)
for (k in 1:K){
cn.k <- attributes(table(gmodel$cn))$dimnames[[1]][k]
cn.k <- as.numeric(cn.k)
chains$C[[k]] <- chains$C[[k]] + 1L * (gmodel$cn == cn.k)
}
##
chains$theta[i, ] <- gmodel$theta
chains$sigma[i, ] <- gmodel$sigma
chains$logll[i] <- compute_loglik(gmodel)
chains$pi[i, ] <- gmodel$pi
chains$pi.child[i,] <- gmodel$pi.child
chains$tau[i] <- gmodel$tau
chains
}
.gibbs_burnin <- function(S, gmod, params){
for(i in seq_len(S)){
gmod <- gmodel_oneiter(gmod, params)
}
gmod
}
##############################
###the customised functions
#############################
###########################
###original working well###
###########################
# wrapper <- _genetic <- mcmc <- oneiter (make changes to oneiter)
gmodel_mcmc <- function(gmod, params, chains){
S <- params$iter
Th <- params$thin
for(i in 2:S){
for(j in seq_len(Th)){
gmod <- gmodel_oneiter(gmod, params)
}
chains <- move_chains(chains, gmod, i)
}
list(gmodel=gmod, chains=chains)
}
# model comparison metrics
# calculation of BIC and DIC
model.comparison <- function (fit.model) {
BIC <- calc.BIC(fit.model)
DIC <- calc.DIC(fit.model)
list = c(model.BIC = BIC,
model.DIC = DIC)
}
calc.BIC <- function (fit.model) {
n.sample <- length(fit.model$gmodel$y)
npar <- length(fit.model$chains)
logll <- fit.model$chains$logll
bic.calc <- log(n.sample)*npar - 2*logll
bic.calc <- mean(bic.calc)
bic.calc
}
calc.DIC <- function (fit.model) {
S <- nrow(fit.model$chains$theta)
logll <- fit.model$chains$logll
logll.sum <- sum(fit.model$chains$logll)
npar <- 2*(logll - (1/S * logll.sum))
dic.calc <- -2*(logll-npar)
dic.calc <- mean(dic.calc)
dic.calc
}
# output results summary
gibbs.results <- function(sim.truth, gibbs.cnv){
N <- nrow(gibbs.cnv$trio.cn)
# compare the truth set with call set
truth <- sim.truth["cn"]
truth <- truth$cn
map.cn <- gibbs.cnv$trio.cn
pr.cn <- gibbs.cnv$trio.cn.probs
contig <- table(map.cn, truth)
index <- which(rowSums(map.cn != truth) > 0)
mistake.true.cn.trio <- truth[index, ]
mistake.cn.called.trio <- map.cn[index, ]
mistake.cn.probs.trio <- pr.cn[index, ]
index.2 <- map.cn != truth
mistake.type <- index.2 * 1
mistake.type <- data.frame(mistake.type)
mistake.type$num <- mistake.type$m * 1 + mistake.type$f * 10 + mistake.type$o * 100
mistake.typed <- table(mistake.type$num)
mistake.cn.true <- truth[index.2]
mistake.y.true <- sim.truth$y[index.2]
mistake.cn.called <- map.cn[index.2]
mistake.spec <- cbind(mistake.cn.true, mistake.cn.called, mistake.y.true)
colnames(mistake.spec) <- c("CN.true", "CN.estimated", "LRR.true")
# calculate $ misclassification splot into parents and children
map.cn.parents <- map.cn[,1:2]
map.cn.offspring <- map.cn[,3]
truth.parents <- truth[,1:2]
truth.offspring <- truth[,3]
contig.par <- table(map.cn.parents, truth.parents)
contig.off <- table(map.cn.offspring, truth.offspring)
parent.misclass.perc <- 1 - ((sum(map.cn.parents==truth.parents))/1000)
parent.misclass.perc.cn0 <- (contig.par[2]+contig.par[3])/sum(contig.par[,1])
parent.misclass.perc.cn1 <- (contig.par[4]+contig.par[6])/sum(contig.par[,2])
parent.misclass.perc.cn2 <- (contig.par[7]+contig.par[8])/sum(contig.par[,3])
parent.misclass.perc.cn <- c(parent.misclass.perc.cn0, parent.misclass.perc.cn1, parent.misclass.perc.cn2)
offspring.misclass.perc <- 1 - ((sum(map.cn.offspring==truth.offspring))/500)
child.misclass.perc.cn0 <- (contig.off[2]+contig.off[3])/sum(contig.off[,1])
child.misclass.perc.cn1 <- (contig.off[4]+contig.off[6])/sum(contig.off[,2])
child.misclass.perc.cn2 <- (contig.off[7]+contig.off[8])/sum(contig.off[,3])
child.misclass.perc.cn <- c(child.misclass.perc.cn0, child.misclass.perc.cn1, child.misclass.perc.cn2)
# effective size
p <- gibbs.cnv$post.pi
eff.size.pi <- effectiveSize(p)
p.mns.parents <- colMeans(p)
p <- gibbs.cnv$post.pi.child
eff.size.pi.child <- effectiveSize(p)
p.mns.child <- colMeans(p)
p <- gibbs.cnv$post.thetas
eff.size.thetas <- effectiveSize(p)
p <- gibbs.cnv$post.sigmas
eff.size.sigmas <- effectiveSize(p)
# compare posterior probabilities for different components
truth.p <- sim.truth$p
empirical.p.overall <- table(truth)/(N*3)
empirical.p.parents <- table(truth[,1:2]) / (N*2)
empirical.p.child <- table(truth[,3]) / N
attributes(p.mns.parents)$names <- c("0","1","2")
attributes(p.mns.child)$names <- c("0","1","2")
# calculate empirical and estimated thetas and sigmas
empirical.thetas <- ybar(sim.truth)
empirical.sigmas <- yvar(sim.truth)
estimated.thetas <- apply(gibbs.cnv$post.thetas, 2, mean)
estimated.sigmas <- apply(gibbs.cnv$post.sigmas, 2, mean)
# taus
posterior.tau <- median(gibbs.cnv$taus)
# taus from truth set
parents.count <- n_parents(sim.truth)
child.count <- n_child(sim.truth)
numer.env <- child.count[[2]] + 2 * child.count[[3]]
denom.env <- parents.count[[2]] + 2 * parents.count[[3]]
tau.env <- numer.env / denom.env
numer.intermed <- child.count[[2]]
denom.intermed <- parents.count[[2]]
tau.intermed <- numer.intermed / denom.intermed
# truth table
truth.tb <- smartbind(truth.p, empirical.p.parents, empirical.p.child, p.mns.parents, p.mns.child)
truth.tb[is.na(truth.tb)] <- 0
row.names(truth.tb) <- c("truth", "empirical.parents", "empirical.offspring", "estimated.parents", "estimated.offspring")
truth.tb.2 <- data.frame(cn=c("C0", "C1", "C2"),
truth=as.numeric(truth.p),
empirical=as.numeric(empirical.p.overall),
empirical.parents=as.numeric(empirical.p.parents),
empirical.offspring=as.numeric(empirical.p.child),
empirical.thetas=as.numeric(empirical.thetas),
empirical.sigmas=as.numeric((empirical.sigmas)^0.5),
estimated.parents = as.numeric(p.mns.parents),
estimated.offspring = as.numeric(p.mns.child),
estimated.thetas = as.numeric(estimated.thetas),
estimated.sigmas = as.numeric(estimated.sigmas))
return(list(truth.table = contig,
parent.misclassification.perc = parent.misclass.perc,
child.misclassification.perc = offspring.misclass.perc,
parent.misclassification.by.cn = parent.misclass.perc.cn,
child.misclassification.by.cn = child.misclass.perc.cn,
missed.true.cn.trio = mistake.true.cn.trio,
missed.called.cn.trio = mistake.cn.called.trio,
missed.called.cn.probs.trio = mistake.cn.probs.trio,
mistake.typed = mistake.typed,
missed.call.values = mistake.spec,
taus = posterior.tau,
true.tau.env = tau.env,
true.tau.intermed = tau.intermed,
effective.size.pi = eff.size.pi,
effective.size.pi.child = eff.size.pi.child,
effective.size.thetas = eff.size.thetas,
effective.size.sigmas = eff.size.sigmas,
mixture.prob.tb1 = truth.tb,
mixture.prob.tb2 = truth.tb.2))
}
## are the mixing probs are in agreement with the empirical values estimated from
## the simulated data?
gg_prob.comparison <- function (gibbs.cnv, gibbs.results) {
p.df <- melt(gibbs.cnv$post.pi)
colnames(p.df)[2] <- "cn"
ggplot(p.df, aes(Var1, value)) +
geom_line(color="gray") +
geom_hline(data=gibbs.results$mixture.prob.tb2,
mapping=aes(yintercept=empirical.parents)) +
facet_wrap(~cn)
}
gg_prob.child.comparison <- function (gibbs.cnv, gibbs.results) {
p.df <- melt(gibbs.cnv$post.pi.child)
colnames(p.df)[2] <- "cn"
ggplot(p.df, aes(Var1, value)) +
geom_line(color="gray") +
geom_hline(data=gibbs.results$mixture.prob.tb2,
mapping=aes(yintercept=empirical.offspring)) +
facet_wrap(~cn)
}
gg_theta.comparison <- function (gibbs.cnv) {
p.df <- melt(gibbs.cnv$post.thetas)
colnames(p.df)[2] <- "cn"
ggplot(p.df, aes(Var1, value)) +
geom_line(color="gray") +
facet_wrap(~cn)
}
gg_sigma.comparison <- function (gibbs.cnv) {
p.df <- melt(gibbs.cnv$post.sigmas)
colnames(p.df)[2] <- "cn"
ggplot(p.df, aes(Var1, value)) +
geom_line(color="gray") +
facet_wrap(~cn)
}
gg_inten.comparison1 <- function (sim.truth, gibbs.cnv) {
p <- cbind(as.numeric(gibbs.cnv$post.cn0), as.numeric(gibbs.cnv$post.cn1), as.numeric(gibbs.cnv$post.cn2))
cn.model <- apply(p, 1, which.max) - 1
cn.trio <- matrix(cn.model, nrow(gibbs.cnv$trio.cn), ncol(gibbs.cnv$trio.cn))
colnames(cn.trio) <- c("m", "f", "o")
df <- data.frame(y=as.numeric(sim.truth$y),
cn=factor(cn.model),
truth=factor(as.numeric(sim.truth$cn)),
concordant=cn.model==as.numeric(sim.truth$cn))
ggplot(df, aes(cn, y)) +
geom_jitter(width=0.1,
aes(color=concordant)) +
geom_boxplot(alpha=0.2, outlier.fill="transparent",
outlier.color="transparent")
}
gg_inten.comparison2 <- function (sim.truth, gibbs.cnv) {
p <- cbind(as.numeric(gibbs.cnv$post.cn0), as.numeric(gibbs.cnv$post.cn1), as.numeric(gibbs.cnv$post.cn2))
cn.model <- apply(p, 1, which.max) - 1
cn.trio <- matrix(cn.model, nrow(gibbs.cnv$trio.cn), ncol(gibbs.cnv$trio.cn))
colnames(cn.trio) <- c("m", "f", "o")
df <- data.frame(y=as.numeric(sim.truth$y),
cn=factor(cn.model),
truth=factor(as.numeric(sim.truth$cn)),
concordant=cn.model==as.numeric(sim.truth$cn))
ggplot(df, aes(truth, y)) +
geom_jitter(width=0.1,
aes(color=concordant)) +
geom_boxplot(alpha=0.2, outlier.fill="transparent",
outlier.color="transparent")
}
## We will use this function to simulate data
## We will simulate using assumed distributions, which is definitely not the
## actual case
## Output: a list of two n x 3 matrices, where each row is a trio,
## 1st column = M, 2nd column = F, 3rd column = O
## 1st list is of fluorescence values (what we would observe)
## 2nd list is of true CN states (for evaluation)
## Inputs:
## n -- the number of trios
## p -- a vector of length three giving the probabilities of having
## CN 0, 1, 2 respectively (for mother and father)
## theta -- a vector of length three giving the distribution means for
## CN 0, 1, 2 respectively
## sigma -- a vector of length three giving the distribution SDs for
## CN 0, 1, 2 respectively
simulate_data <- function(params, n, mendelian.probs){
p <- params[, "p"]
theta <- params[, "theta"]
sigma <- params[, "sigma"]
c_m <- sample(1:3, size = n, replace = TRUE, prob = p)
c_f <- sample(1:3, size = n, replace = TRUE, prob = p)
c_o <- rep(NA, length = n)
for(i in 1:n){
cn_m <- c_m[i]
cn_f <- c_f[i]
p.offspring <- mendelian.probs[, cn_m, cn_f]
c_o[i] <- sample(1:3, size = 1, prob = p.offspring)
}
y_m <- rnorm(n, mean = theta[c_m], sd = sigma[c_m])
y_f <- rnorm(n, mean = theta[c_f], sd = sigma[c_f])
y_o <- rnorm(n, mean = theta[c_o], sd = sigma[c_o])
y.mat <- cbind(y_m, y_f, y_o)
cn.mat <- cbind(c_m, c_f, c_o)
return(list(response = y.mat, cn = cn.mat))
}
statistics_1000g <- function(region){
path <- system.file("extdata", package="marimba")
vcf.del <- read.table(file.path(path, "parameter.p.1000gp.v2.txt"),
header=TRUE, sep="\t")
median.sd <- read.table(file.path(path, "medians.sd.txt"),
header=TRUE, sep="\t")
p <- as.numeric(vcf.del[region, c(9:11)])
theta <- as.numeric(median.sd[region, c(1:3)])
sigma <- as.numeric(median.sd[region, c(4:6)])
params <- cbind(p, theta, sigma)
colnames(params) <- c("p", "theta", "sigma")
params
}
#' Simulate data from simple deletion model - Genetic model
#'
#' @param region integer indexing the 1000G data
#' @param N number of trios to simulate
#' @param error probability of non-Mendelian event in offspring
#' @export
simulate_data2 <- function(params, N, error=0){
##mendelian.probs <- mendelianProb(epsilon=error)
mendelian.probs <- gMendelian(tau.one=1, tau.two=0.5, tau.three=0, err=0)
##stats <- statistics_1000g(region)
dat <- simulate_data(params, N, mendelian.probs)
dat$cn <- dat$cn - 1
colnames(dat$cn) <- c("m", "f", "o")
colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
names(dat) <- c("y", "cn")
dat$theta <- setNames(params[, "theta"], 0:2)
dat$sigma <- setNames(params[, "sigma"], 0:2)
dat$p <- setNames(params[, "p"], 0:2)
dat$logll <- compute_loglik(dat)
dat
}
# simulate from environmental model
# set taus to arbitary but the same value
simulate_data.env1 <- function(params, N, error=0){
##mendelian.probs <- mendelianProb(epsilon=error)
mendelian.probs <- gMendelian(tau.one=0.1, tau.two=0.1, tau.three=0.1, err=0)
##stats <- statistics_1000g(region)
dat <- simulate_data(params, N, mendelian.probs)
dat$cn <- dat$cn - 1
colnames(dat$cn) <- c("m", "f", "o")
colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
names(dat) <- c("y", "cn")
dat$theta <- setNames(params[, "theta"], 0:2)
dat$sigma <- setNames(params[, "sigma"], 0:2)
dat$p <- setNames(params[, "p"], 0:2)
dat$logll <- compute_loglik(dat)
dat
}
simulate_data.env2 <- function(params, N, error=0){
##mendelian.probs <- mendelianProb(epsilon=error)
mendelian.probs <- gMendelian(tau.one=0.8, tau.two=0.8, tau.three=0.8, err=0)
##stats <- statistics_1000g(region)
dat <- simulate_data(params, N, mendelian.probs)
dat$cn <- dat$cn - 1
colnames(dat$cn) <- c("m", "f", "o")
colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
names(dat) <- c("y", "cn")
dat$theta <- setNames(params[, "theta"], 0:2)
dat$sigma <- setNames(params[, "sigma"], 0:2)
dat$p <- setNames(params[, "p"], 0:2)
dat$logll <- compute_loglik(dat)
dat
}
simulate_data.int <- function(params, N, error=0){
##mendelian.probs <- mendelianProb(epsilon=error)
mendelian.probs <- gMendelian(tau.one=1, tau.two=0.3, tau.three=0, err=0)
##stats <- statistics_1000g(region)
dat <- simulate_data(params, N, mendelian.probs)
dat$cn <- dat$cn - 1
colnames(dat$cn) <- c("m", "f", "o")
colnames(dat$response) <- gsub("y_", "", colnames(dat$response))
names(dat) <- c("y", "cn")
dat$theta <- setNames(params[, "theta"], 0:2)
dat$sigma <- setNames(params[, "sigma"], 0:2)
dat$p <- setNames(params[, "p"], 0:2)
dat$logll <- compute_loglik(dat)
dat
}
mendelianProb <- function(epsilon=0){
## Mendelian probability array for child
## for ease of referencing of row/ column names, let (0,0) = BB, (0,1)= AB and (1,1) = AA
## here BB, AB, AA represent from mother dim for array creation
## the probability array is labelled with .c, .f and .m representing child, father, mother respectively
## BB <- matrix(c(1, 0, 0,
## 0.5, 0.5, 0,
## 0, 1, 0) ,nrow=3, ncol=3,
## dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
## AB <- matrix(c(0.5, 0.5, 0,
## 0.25, 0.5, 0.25,
## 0, 0.5, 0.5) ,nrow=3, ncol=3,
## dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
## AA <- matrix(c(0, 1, 0,
## 0, 0.5, 0.5,
## 0, 0, 1) ,nrow=3, ncol=3,
## dimnames=list(c("BB","AB","AA"),c("BB","AB","AA")))
## mendelian.probs <- abind(BB, AB, AA, along=3,
## new.names=list(c("BB.c","AB.c","AA.c"),
## c("BB.f","AB.f","AA.f"),
## c("BB.m","AB.m","AA.m")))
.Deprecated("use gMendelian")
mendelian.probs <- array(dim=c(3, 3, 3))
mendelian.probs[, 1, 1] <- c(1 - epsilon, epsilon/2, epsilon/2)
mendelian.probs[, 2, 1] <- c(.5 - epsilon/2, .5 - epsilon/2, epsilon)
mendelian.probs[, 3, 1] <- c(epsilon/2, 1 - epsilon, epsilon/2)
mendelian.probs[, 1, 2] <- c(.5 - epsilon/2 , .5 - epsilon/2, epsilon)
mendelian.probs[, 2, 2] <- c(.25, .5, .25)
mendelian.probs[, 3, 2] <- c(epsilon, .5 - epsilon/2, .5 - epsilon/2)
mendelian.probs[, 1, 3] <- c(epsilon/2, 1 - epsilon, epsilon/2)
mendelian.probs[, 2, 3] <- c(epsilon, .5 - epsilon/2, .5 - epsilon/2)
mendelian.probs[, 3, 3] <- c(epsilon/2, epsilon/2, 1 - epsilon)
mendelian.probs
}
gg_cnp <- function(dat){
response.df <- melt(dat$y)
cn.df <- melt(dat$cn)
df <- data.frame(logr=response.df$value,
cn=cn.df$value)
df$cn <- as.factor(df$cn)
p <- ggplot(df, aes(logr, ..count.., fill = cn)) +
geom_density(alpha = .5) + xlab("LRR")
p
}
gg_chains <- function(gibbs.cnv){
L <- nrow(gibbs.cnv$post.thetas)
K <- ncol(gibbs.cnv$post.thetas)
df <- tibble(post.thetas0=gibbs.cnv$post.thetas[, 1],
post.thetas1=gibbs.cnv$post.thetas[, 2],
post.thetas2=gibbs.cnv$post.thetas[, 3],
post.sigmas0=gibbs.cnv$post.sigmas[, 1],
post.sigmas1=gibbs.cnv$post.sigmas[, 2],
post.sigmas2=gibbs.cnv$post.sigmas[, 3],
post.pi0=gibbs.cnv$post.pi[, 1],
post.pi1=gibbs.cnv$post.pi[, 2],
post.pi2=gibbs.cnv$post.pi[, 3],
post.pi.child0=gibbs.cnv$post.pi.child[, 1],
post.pi.child1=gibbs.cnv$post.pi.child[, 2],
post.pi.child2=gibbs.cnv$post.pi.child[, 3],
# loglik=gibbs.cnv$logll,
iter=seq_len(L))
df2 <- gather(df, key=parameter, value=monte_carlo, -iter)
## df <- data.frame(post.thetas=as.numeric(gibbs.cnv$post.thetas),
## post.sigmas=as.numeric(gibbs.cnv$post.sigmas),
## post.pi=as.numeric(gibbs.cnv$post.pi),
## ##loglik=as.numeric(),
## cn=rep(c("0", "1", "2"), each=L),
## log_lik=as.numeric(gibbs.cnv$logll),
## iter=rep(seq_len(L), K))
## df2 <- melt(df, id.vars=c("iter", "cn"))
ggplot(df2, aes(iter, monte_carlo)) +
geom_line() +
facet_wrap(~parameter, scales="free_y")
}
# deprecate
#gg_chains <- function(chains){
# L <- nrow(chains$theta)
#K <- ncol(chains$theta)
#df <- tibble(theta0=chains$theta[, 1],
# theta1=chains$theta[, 2],
# theta2=chains$theta[, 3],
# sigma0=chains$sigma[, 1],
# sigma1=chains$sigma[, 2],
# sigma2=chains$sigma[, 3],
# pi0=chains$pi[, 1],
# pi1=chains$pi[, 2],
# pi2=chains$pi[, 3],
# pi.child0=chains$pi.child[, 1],
# pi.child1=chains$pi.child[, 2],
# pi.child2=chains$pi.child[, 3],
# loglik=chains$logll,
# iter=seq_len(L))
# df2 <- gather(df, key=parameter, value=monte_carlo, -iter)
## df <- data.frame(theta=as.numeric(chains$theta),
## sigma=as.numeric(chains$sigma),
## pi=as.numeric(chains$pi),
## ##loglik=as.numeric(),
## cn=rep(c("0", "1", "2"), each=L),
## log_lik=as.numeric(chains$logll),
## iter=rep(seq_len(L), K))
## df2 <- melt(df, id.vars=c("iter", "cn"))
#ggplot(df2, aes(iter, monte_carlo)) +
# geom_line() +
# facet_wrap(~parameter, scales="free_y")
# }
map_cn <- function(gmodel, ch, params){
S<-params$iter
K <- length(gmodel$theta)
pr.list<-list()
for (k in 1:K){
c <- ch[[k]]/S
pr.list[[length(pr.list)+1]]<-c
names(pr.list)[k]<-attributes(gmodel$pi)$names[k]
}
m <- do.call(cbind, lapply(pr.list, "[", , "m"))
f <- do.call(cbind, lapply(pr.list, "[", , "f"))
o <- do.call(cbind, lapply(pr.list, "[", , "o"))
cn.f <- apply(f, 1, which.max) - 1
cn.m <- apply(m, 1, which.max) - 1
cn.o <- apply(o, 1, which.max) - 1
probs <- cbind(rowMaxs(m),
rowMaxs(f),
rowMaxs(o))
cn <- cbind(cn.m, cn.f, cn.o)
colnames(probs) <- colnames(cn) <- c("m", "f", "o")
list(prob=probs,
cn=cn)
}
#' Return maximum a posterior copy number from a model
#'
#' @param model a fitted model
#' @return a vector of the maximum a posterior copy number
#' @examples
#' params <- statistics_1000g(region=192)
#' truth <- simulate_data2(params, N=81, error=0)
#' gparams <- geneticParams(N=81, K=3,
#' iter=50,
#' thin=2,
#' burnin=0)
#' set.seed(204)
#' comp <- 1
#' gmod <- initialize_gmodel(truth$y, gparams, comp)
#' fit <- gibbs_genetic(gmod, gparams)
#' cn <- map_cn2(fit)
map_cn2 <- function(model){
cn.list <- model$chains$C
iter <- nrow(model$chains$theta)
prob0 <- (cn.list$C0/iter) %>% as.tibble %>%
mutate(trio=paste0("trio", 1:nrow(.))) %>%
gather(key=family_membership, value=prob0, -trio)
prob1 <- (cn.list$C1/iter) %>% as.tibble %>%
mutate(trio=paste0("trio", 1:nrow(.))) %>%
gather(key=family_membership, value=prob1, -trio)
prob2 <- (cn.list$C2/iter) %>% as.tibble %>%
mutate(trio=paste0("trio", 1:nrow(.))) %>%
gather(key=family_membership, value=prob2, -trio)
left_join(prob0, prob1) %>%
left_join(prob2) %>%
select(c(prob0, prob1, prob2)) %>%
max.col %>%
(0:2)[.]
}
dnorm_quantiles <- function(y, mean, sd){
x <- list()
for(i in seq_along(mean)){
x[[i]] <- qnorm(c(0.001, 0.999), mean=mean[i], sd=sd[i])
}
xx <- unlist(x)
limits <- c(min(y, min(xx)),
max(y, max(xx)))
seq(limits[1], limits[2], length.out=500)
}
.dnorm_poly <- function(x, p, mean, sd){
y <- p*dnorm(x, mean=mean, sd=sd)
yy <- c(y, rep(0, length(y)))
xx <- c(x, rev(x))
tmp <- data.frame(y=yy, x=xx)
}
dnorm_model <- function(qtiles, p, mean, sd){
df.list <- list()
for(i in seq_along(mean)){
dat <- .dnorm_poly(qtiles, p[i], mean[i], sd[i])
if(i == 1){
overall <- dat$y
} else{
overall <- overall + dat$y
}
df.list[[i]] <- dat
}
df <- do.call(rbind, df.list)
L <- sapply(df.list, nrow)
df$component <- factor(rep(seq_along(mean), L))
df.overall <- data.frame(y=overall, x=dat$x)
df.overall$component <- "marginal"
df <- rbind(df, df.overall)
df$component <- factor(df$component, levels=c("marginal", seq_along(mean)))
if(length(mean) == 1){
df <- df[df$component != "overall", ]
df$component <- factor(df$component)
}
df
}
dnorm_poly <- function(model){
mixprob <- model$pi
means <- model$theta
sds <- model$sigma
qtiles <- dnorm_quantiles(model$y, means, sds)
df <- dnorm_model(qtiles, mixprob, means, sds)
df
}
gg_model <- function(model, bins){
colors <- c("#999999", "#56B4E9", "#E69F00", "#0072B2",
"#D55E00", "#CC79A7", "#009E73")
df.observed <- melt(gmodel$y)
colnames(df.observed)[3] <- "y"
if(missing(bins))
bins <- nrow(df.observed)/2
dat <- dnorm_poly(model)
component <- x <- y <- ..density.. <- NULL
ggplot(dat, aes(x, y, group=component)) +
geom_histogram(data=df.observed, aes(y, ..density..),
bins=bins,
inherit.aes=FALSE) +
geom_polygon(aes(fill=component, color=component), alpha=0.4) +
xlab("quantiles") + ylab("density") +
scale_color_manual(values=colors) +
scale_y_sqrt() +
scale_fill_manual(values=colors) +
guides(fill=guide_legend(""), color=guide_legend(""))
}
startAtTrueValues <- function(truth, gmodel){
gmodel$theta <- truth$theta
gmodel$sigma <- truth$sigma
gmodel$cn <- truth$cn
gmodel$pi <- truth$p
gmodel
}
# function to check if the real parameter is captured in the credible interval of the posterior chains
credible.interval <- function (gibbs.model, gibbs.result, real.params) {
ci.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
mixture.ci.parents <- data.frame(HPDinterval(mcmc(gibbs.model$post.pi), 0.95))
mixture.ci.offspring <- data.frame(HPDinterval(mcmc(gibbs.model$post.pi.child), 0.95))
theta.ci <- data.frame(HPDinterval(mcmc(gibbs.model$post.thetas), 0.95))
sigma.ci <- data.frame(HPDinterval(mcmc(gibbs.model$post.sigmas), 0.95))
#w <- real.params[,1]
#x <- real.params[,1]
#y <- real.params[,2]
#z <- real.params[,3]
w <- gibbs.result$mixture.prob.tb2$empirical.parents
x <- gibbs.result$mixture.prob.tb2$empirical.offspring
y <- gibbs.result$mixture.prob.tb2$empirical.thetas
z <- (gibbs.result$mixture.prob.tb2$empirical.sigmas)#^0.5
mix.parents.hit <- mixture.ci.parents %>%
filter(lower<=w, w<=upper)
mix.par <- mixture.ci.parents[,1] %in% mix.parents.hit[,1]
mix.offspring.hit <- mixture.ci.offspring %>%
filter(lower<=x, x<=upper)
mix.off <- mixture.ci.offspring[,1] %in% mix.offspring.hit[,1]
theta.hit <- theta.ci %>%
filter(lower<=y, y<=upper)
theta <- theta.ci[,1] %in% theta.hit[,1]
sigma.hit <- sigma.ci %>%
filter(lower<=z, z<=upper)
sigma <- sigma.ci[,1] %in% sigma.hit[,1]
col <- nrow(real.params)
ci.mat[1,1:col] <- mix.par
ci.mat[1,(col+1) : (2*col)] <- mix.off
ci.mat[1,(2*col+1) : (3*col)] <- theta
ci.mat[1,(3*col+1) : (4*col)] <- sigma
return (ci.mat)
}
credible.interval.quantile <- function (gibbs.model, gibbs.result, real.params) {
ci.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
mixture.ci.parents <- apply(gibbs.model$post.pi, 2, quantile, probs = c(0.025, 0.975))
mixture.ci.offspring <- apply(gibbs.model$post.pi.child, 2, quantile, probs = c(0.025, 0.975))
theta.ci <- apply(gibbs.model$post.thetas, 2, quantile, probs = c(0.025, 0.975))
sigma.ci <- apply(gibbs.model$post.sigmas, 2, quantile, probs = c(0.025, 0.975))
# reshape
mixture.ci.parents <- data.frame(t(mixture.ci.parents))
colnames(mixture.ci.parents) <- c("lower", "upper")
mixture.ci.offspring <- data.frame(t(mixture.ci.offspring))
colnames(mixture.ci.offspring) <- c("lower", "upper")
theta.ci <- data.frame(t(theta.ci))
colnames(theta.ci) <- c("lower", "upper")
sigma.ci <- data.frame(t(sigma.ci))
colnames(sigma.ci) <- c("lower", "upper")
w <- gibbs.result$mixture.prob.tb2$empirical.parents
x <- gibbs.result$mixture.prob.tb2$empirical.offspring
y <- gibbs.result$mixture.prob.tb2$empirical.thetas
z <- (gibbs.result$mixture.prob.tb2$empirical.sigmas)#^0.5
mix.parents.hit <- mixture.ci.parents %>%
filter(lower<=w, w<=upper)
mix.par <- mixture.ci.parents[,1] %in% mix.parents.hit[,1]
mix.offspring.hit <- mixture.ci.offspring %>%
filter(lower<=x, x<=upper)
mix.off <- mixture.ci.offspring[,1] %in% mix.offspring.hit[,1]
theta.hit <- theta.ci %>%
filter(lower<=y, y<=upper)
theta <- theta.ci[,1] %in% theta.hit[,1]
sigma.hit <- sigma.ci %>%
filter(lower<=z, z<=upper)
sigma <- sigma.ci[,1] %in% sigma.hit[,1]
col <- nrow(real.params)
ci.mat[1,1:col] <- mix.par
ci.mat[1,(col+1) : (2*col)] <- mix.off
ci.mat[1,(2*col+1) : (3*col)] <- theta
ci.mat[1,(3*col+1) : (4*col)] <- sigma
return (ci.mat)
}
init.pi.child <- function(dat.y, cn.mf, params, theta){
N.half <- params$N / 2
yy <- as.numeric(dat.y[, c("o")])
cn.mf.mat <- as.matrix(cn.mf)
cn.mf.mat <- cn.mf.mat[1:N.half,]
cn <- as.numeric(cn.mf.mat[, 1:2])
tab <- tapply(yy, cn, length)
freq <- setNames(rep(0L, length(theta)),
attributes(tab)$dimnames[[1]])
freq[names(tab)] <- tab
freq <- freq / params$N
}
ess.compile <- function(gibbs.result, real.params){
ess.mat <- matrix(0, ncol=nrow(real.params)*(ncol(real.params)+1), nrow=1)
col <- nrow(real.params)
ess.mat[1,1:col] <- gibbs.result$effective.size.pi
ess.mat[1,(col+1) : (2*col)] <- gibbs.result$effective.size.pi.child
ess.mat[1,(2*col+1) : (3*col)] <- gibbs.result$effective.size.thetas
ess.mat[1,(3*col+1) : (4*col)] <- gibbs.result$effective.size.sigmas
return(ess.mat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.