Nothing
.empty_trio_model <- function(hp, mp){
K <- k(hp)
B <- 0L
S <- iter(mp)
ch <- initialize_mcmcT(K, S, B, 0)
obj <- new("TrioBatchModel",
k=as.integer(K),
hyperparams=hp,
theta=matrix(NA, 0, K),
sigma2=matrix(NA, 0, K),
mu=numeric(K),
tau2=numeric(K),
nu.0=numeric(1),
sigma2.0=numeric(1),
pi=numeric(K),
pi_parents=numeric(K),
#data=numeric(K),
triodata=as_tibble(0),
mprob=matrix(NA, 0, 0),
#maplabel=numeric(K),
predictive=numeric(K*B),
zstar=integer(K*B),
data.mean=matrix(NA, B, K),
data.prec=matrix(NA, B, K),
z=integer(0),
zfreq=integer(K),
zfreq_parents=integer(K),
probz=matrix(0, S, K),
# note this assumes parents are 2/3 of N
probz_par=matrix(0, 2/3*S, K),
logprior=numeric(1),
loglik=numeric(1),
mcmc.chains=ch,
mcmc.params=mp,
batch=integer(0),
batchElements=integer(0),
label_switch=FALSE,
marginal_lik=as.numeric(NA),
.internal.constraint=5e-4,
.internal.counter=0L)
chains(obj) <- McmcChainsTrios(obj)
obj
}
.TBM <- function(triodata=as_tibble(),
hp=HyperparametersTrios(),
mp=McmcParams(iter=1000, thin=10,
burnin=1000, nStarts=1),
mprob=matrix(),
maplabel=maplabel){
## If the data is not ordered by batch,
## its a little harder to sort component labels
log_ratio <- triodata$log_ratio
batches <- triodata$batches
triodata <- select(triodata, -c(log_ratio, batches))
ub <- unique(batches)
nbatch <- setNames(as.integer(table(batches)), ub)
B <- length(ub)
N <- nrow(triodata)
## move to setValidity
if(nrow(triodata) != length(batches)) {
stop("batch vector must be the same length as data")
}
K <- k(hp)
##mprob <- mprob
##maplabel <- maplabel
## mu_k is the average across batches of the thetas for component k
## tau_k is the sd of the batch means for component k
mu <- sort(rnorm(k(hp), mu.0(hp), sqrt(tau2.0(hp))))
tau2 <- 1/rgamma(k(hp), 1/2*eta.0(hp), 1/2*eta.0(hp) * m2.0(hp))
p <- rdirichlet(1, alpha(hp))[1, ]
pp <- rdirichlet(1, alpha(hp))[1, ]
sim_theta <- function(mu, tau, B) sort(rnorm(B, mu, tau))
. <- NULL
thetas <- map2(mu, sqrt(tau2), sim_theta, B) %>%
do.call(cbind, .) %>%
apply(., 1, sort) %>%
t
if(K == 1) thetas <- t(thetas)
nu.0 <- 3.5
sigma2.0 <- 0.25
sigma2s <- 1/rgamma(k(hp) * B, 0.5 * nu.0, 0.5 * nu.0 * sigma2.0) %>%
matrix(B, k(hp))
u <- rchisq(nrow(triodata), hp@dfr)
S <- iter(mp)
T <- length(unique(triodata$id))
ch <- initialize_mcmcT(K, S, B, T)
index <- match(c("father", "mother"), colnames(mprob))
mprob2 <- mprob[, -index]
father <- mprob[, "father"]
mother <- mprob[, "mother"]
is_mendel <- rep(1L, T)
##zamp <- sample(seq_len(K), N, replace=TRUE)
##is_offspring <- model@triodata$family_member=="o"
# write R update z module
#zo <- update
##zamp[is_offspring] <- zo
obj <- new("TrioBatchModel",
k=as.integer(K),
hyperparams=hp,
theta=thetas,
sigma2=sigma2s,
mu=mu,
tau2=tau2,
nu.0=nu.0,
sigma2.0=sigma2.0,
pi=p,
pi_parents=pp,
data=log_ratio,
batch=batches,
triodata=triodata,
mprob=mprob2,
maplabel=maplabel,
u=u,
predictive=numeric(K*B),
zstar=integer(K*B),
data.mean=matrix(0, B, K),
data.prec=matrix(0, B, K),
z=sample(seq_len(K), N, replace=TRUE),
zfreq=integer(K),
zfreq_parents=integer(K),
probz=matrix(0, N, K),
# note this assumes parents are 2/3 of N
probz_par=matrix(0, 2/3*N, K),
logprior=numeric(1),
loglik=numeric(1),
mcmc.chains=ch,
mcmc.params=mp,
batchElements=nbatch,
label_switch=FALSE,
marginal_lik=as.numeric(NA),
father=as.integer(father),
mother=as.integer(mother),
is_mendelian=is_mendel,
.internal.constraint=5e-4,
.internal.counter=0L)
obj
}
triodata <- function(object, wide=FALSE, map=FALSE){
dat <- object@triodata
if(map){
dat$z <- map_z(object)
} else {
dat$z <- z(object)
}
dat$is_mendelian <- isMendelian(object)
dat$log_ratio <- y(object)
dat$batch <- batch(object)
if(wide){
dat2 <- dat %>%
select(c("id", "family_member", "z")) %>%
spread("family_member", "z")
dat3 <- dat %>%
select(c("id", "family_member", "log_ratio")) %>%
spread("family_member", "log_ratio") %>%
set_colnames(c("id", paste0("logr_", c("m", "f", "o"))))
## assumes that family members in a trio were processed in same batch
dat4 <- group_by(dat, id) %>%
summarize(batch=unique(batch))
dat5 <- left_join(dat2, dat3, by="id") %>%
left_join(dat4, by="id")
dat <- dat5
}
dat
}
setReplaceMethod("probzpar", "TrioBatchModel", function(object, value){
object@probz_par <- value
object
})
## TODO Dangerous to have accessor do something more than return the value of it
## slot. Further
## probzpar(object) <- probzpar(object)
## will not behave as expected
#' @rdname probzpar-method
#' @aliases probzpar,TrioBatchModel-method
setMethod("probzpar", "TrioBatchModel", function(object) {
## because first iteration not saved
object@probz_par/(iter(object)-1)
})
combine_batchTrios <- function(model.list, batches){
. <- NULL
ch.list <- map(model.list, chains)
th <- map(ch.list, theta) %>% do.call(rbind, .)
s2 <- map(ch.list, sigma2) %>% do.call(rbind, .)
ll <- map(ch.list, log_lik) %>% unlist
ppi <- map(ch.list, p) %>% do.call(rbind, .)
pp.par <- map(ch.list, pp) %>% do.call(rbind, .)
n0 <- map(ch.list, nu.0) %>% unlist
s2.0 <- map(ch.list, sigma2.0) %>% unlist
logp <- map(ch.list, logPrior) %>% unlist
.mu <- map(ch.list, mu) %>% do.call(rbind, .)
.tau2 <- map(ch.list, tau2) %>% do.call(rbind, .)
zfreq <- map(ch.list, zFreq) %>% do.call(rbind, .)
zfreqpar <- map(ch.list, zFreqPar) %>% do.call(rbind, .)
pred <- map(ch.list, predictive) %>% do.call(rbind, .)
zsta <- map(ch.list, zstar) %>% do.call(rbind, .)
mc <- new("McmcChainsTrios",
theta=data.matrix(th),
sigma2=data.matrix(s2),
pi=data.matrix(ppi),
pi_parents=data.matrix(pp.par),
mu=data.matrix(.mu),
tau2=data.matrix(.tau2),
nu.0=n0,
sigma2.0=s2.0,
zfreq=zfreq,
zfreq_parents=zfreqpar,
logprior=logp,
loglik=ll,
iter=nrow(th),
predictive=pred,
zstar=zsta,
k=k(model.list[[1]]),
B=length(unique(batches)))
hp <- hyperParams(model.list[[1]])
mp <- mcmcParams(model.list[[1]])
triodata <- model.list[[1]]@triodata
mprob <- model.list[[1]]@mprob
father <- model.list[[1]]@father
mother <- model.list[[1]]@mother
maplabel <- model.list[[1]]@maplabel
iter(mp) <- nrow(th)
B <- length(unique(batches))
K <- k(model.list[[1]])
pm.th <- matrix(colMeans(th), B, K)
pm.s2 <- matrix(colMeans(s2), B, K)
pm.p <- colMeans(ppi)
pm.par <- colMeans(pp.par)
pm.n0 <- median(n0)
pm.mu <- colMeans(.mu)
pm.tau2 <- colMeans(.tau2)
pm.s20 <- mean(s2.0)
pz <- map(model.list, probz) %>% Reduce("+", .)
pz <- pz/length(model.list)
## the accessor divides by number of iterations, so rescale
pz <- pz * (iter(mp) - 1)
zz <- max.col(pz)
yy <- y(model.list[[1]])
y_mns <- as.numeric(tapply(yy, zz, mean))
y_prec <- as.numeric(1/tapply(yy, zz, var))
zfreq <- as.integer(table(zz))
pzpar <- map(model.list, probzpar) %>% Reduce("+", .)
pzpar <- pzpar/length(model.list)
pzpar <- pzpar * (iter(mp) - 1)
zzpar <- max.col(pzpar)
#fam.class <- model.list[[1]]@triodata$family_member
#zzpar <- zzpar[fam.class!="o"]
zfreq_parents <- as.integer(table(zzpar))
any_label_swap <- any(map_lgl(model.list, label_switch))
## use mean marginal likelihood in combined model,
## or NA if marginal likelihood has not been estimated
ml <- map_dbl(model.list, marginal_lik)
if(all(is.na(ml))) {
ml <- as.numeric(NA)
} else ml <- mean(ml, na.rm=TRUE)
nbatch <- as.integer(table(batch(model.list[[1]])))
model <- new(class(model.list[[1]]),
triodata=triodata,
mprob=mprob,
father=father,
mother=mother,
maplabel=maplabel,
k=k(hp),
hyperparams=hp,
theta=pm.th,
sigma2=pm.s2,
mu=pm.mu,
tau2=pm.tau2,
nu.0=pm.n0,
sigma2.0=pm.s20,
pi=pm.p,
pi_parents=pm.par,
data=y(model.list[[1]]),
u=u(model.list[[1]]),
data.mean=y_mns,
data.prec=y_prec,
z=zz,
zfreq=zfreq,
zfreq_parents=zfreq_parents,
probz=pz,
probz_par=pzpar,
predictive=predictive(mc)[nrow(th), ],
zstar=zstar(mc)[nrow(th), ],
logprior=numeric(1),
loglik=numeric(1),
mcmc.chains=mc,
batch=batch(model.list[[1]]),
batchElements=nbatch,
modes=list(),
mcmc.params=mp,
label_switch=any_label_swap,
marginal_lik=ml,
.internal.constraint=5e-4,
.internal.counter=0L)
modes(model) <- computeModes(model)
log_lik(model) <- computeLoglik(model)
logPrior(model) <- computePrior(model)
model
}
#' Constructor for TrioBatchModel
#'
#' Initializes a TrioBatchModel, a container for storing data, parameters, and MCMC output for mixture models with batch- and component-specific means and variances.
#'
#' @param triodata the data for the simulation.
#' @param batches an integer-vector of the different batches
#' @param hp An object of class `Hyperparameters` used to specify the hyperparameters of the model.
#' @param mp An object of class 'McmcParams'
#' @return An object of class `TrioBatchModel`
#' @export
TrioBatchModel <- function(triodata=tibble(),
hp=HyperparametersTrios(),
mp=McmcParams(iter=1000, thin=10,
burnin=1000, nStarts=4),
mprob=mprob,
maplabel=maplabel
){
if(length(triodata) == 0){
return(.empty_trio_model(hp, mp))
}
iter <- 0
validZ <- FALSE
mp.tmp <- McmcParams(iter=0, burnin=burnin(mp), thin=1, nStarts=1)
while(!validZ){
##
## Burnin with TBM model
##
tbm <- .TBM(triodata, hp, mp.tmp, mprob, maplabel)
tbm <- runBurnin(tbm)
tabz1 <- table(batch(tbm), z(tbm))
tabz2 <- table(z(tbm))
validZ <- length(tabz2) == k(hp) && all(tabz1 > 1)
iter <- iter + 1
if(iter == 50) {
message("Trouble initializing a valid model. The number of components is likely too large")
return(NULL)
}
}
tbm2 <- sortComponentLabels(tbm)
mcmcParams(tbm2) <- mp
chains(tbm2) <- McmcChainsTrios(tbm2)
tbm2
}
#' @export
TBM <- TrioBatchModel
theta.fix <- function(model, values) {
# values is in the form of a [1,K] matrix
model@theta <- values
model
}
theta.fix2 <- function(model) {
modes(model) <- computeModes(model)
model@theta <- as.matrix(model@modes$theta)
model
}
mp.fix <- function(model, mp){
mcmcParams(model) <- mp
model
}
sigma2.fix <- function(model){
modes(model) <- computeModes(model)
model@sigma2 <- as.matrix(model@modes$sigma2)
model
}
pi.fix <- function(model){
modes(model) <- computeModes(model)
model@pi <- model@modes$mixprob
model
}
gibbs_trios_K2 <- function(hp,
mp,
k_range=c(1, 4),
dat,
batches,
maplabel,
mprob,
max_burnin=32000,
reduce_size=TRUE,
min_effsize=500,
theta.feed){
K <- seq(k_range[1], k_range[2])
hp.list <- map(K, updateK, hp)
model.list <- map(hp.list,
.gibbs_trios_mcmc2,
mp=mp,
dat=dat,
batches=batches,
maplabel=maplabel,
mprob=mprob,
max_burnin=max_burnin,
min_effsize=min_effsize,
theta.feed=theta.feed)
names(model.list) <- paste0("TBM", map_dbl(model.list, k))
## sort by marginal likelihood
##
## if(reduce_size) TODO: remove z chain, keep y in one object
##
ix <- order(map_dbl(model.list, marginal_lik), decreasing=TRUE)
models <- model.list[ix]
}
gibbs_trios2 <- function(model="TBM",
dat,
mp,
hp.list,
batches,
k_range=c(1, 4),
max_burnin=32e3,
top=2,
df=100,
min_effsize=500,
theta.feed){
#model <- unique(model)
max_burnin <- max(max_burnin, burnin(mp)) + 1
if(missing(hp.list)){
#note this line will have to be incorporated in gibbs.R when generalised
hp.list <- hpList(df=df)[["TBM"]]
}
message("Fitting TBM models")
tbm <- gibbs_trios_K2(hp.list,
k_range=k_range,
mp=mp,
dat=dat,
batches=rep(1L, length(dat)),
maplabel=maplabel,
mprob=mprob,
max_burnin=max_burnin,
min_effsize=min_effsize,
theta.feed=theta.feed)
}
par.fix <- function(model, values) {
# values is in the form of a model@z input
model@z <- values
model
}
gibbs_trios_K3 <- function(hp,
mp,
k_range=c(1, 4),
dat,
batches,
maplabel,
mprob,
max_burnin=32000,
reduce_size=TRUE,
min_effsize=500,
theta.feed, par.feed){
K <- seq(k_range[1], k_range[2])
hp.list <- map(K, updateK, hp)
model.list <- map(hp.list,
.gibbs_trios_mcmc3,
mp=mp,
dat=dat,
batches=batches,
maplabel=maplabel,
mprob=mprob,
max_burnin=max_burnin,
min_effsize=min_effsize,
theta.feed=theta.feed,
par.feed=par.feed)
names(model.list) <- paste0("TBM", map_dbl(model.list, k))
## sort by marginal likelihood
##
## if(reduce_size) TODO: remove z chain, keep y in one object
##
ix <- order(map_dbl(model.list, marginal_lik), decreasing=TRUE)
models <- model.list[ix]
}
gibbs_trios3 <- function(model="TBM",
dat,
mp,
hp.list,
batches,
k_range=c(1, 4),
max_burnin=32e3,
top=2,
df=100,
min_effsize=500,
theta.feed,
par.feed){
#model <- unique(model)
max_burnin <- max(max_burnin, burnin(mp)) + 1
if(missing(hp.list)){
#note this line will have to be incorporated in gibbs.R when generalised
hp.list <- hpList(df=df)[["TBM"]]
}
message("Fitting TBM models")
tbm <- gibbs_trios_K3(hp.list,
k_range=k_range,
mp=mp,
dat=dat,
batches=rep(1L, length(dat)),
maplabel=maplabel,
mprob=mprob,
max_burnin=max_burnin,
min_effsize=min_effsize,
theta.feed=theta.feed,
par.feed=par.feed)
}
gMendelian.multi <- function(tau=c(0.5, 0.5, 0.5)){
tau1 <- tau[1]
tau2 <- tau[2]
tau3 <- tau[3]
mendelian.probs <- array(dim=c(5, 5, 5))
genotypes <- c("CN0", "CN1", "CN2", "CN3", "CN4")
dimnames(mendelian.probs) <- list(paste0("O_", genotypes),
paste0("M_", genotypes),
paste0("F_", genotypes))
mendelian.probs[, 1, 1] <- c(1,0,0,0,0)
mendelian.probs[, 1, 2] <- c(tau1, 1 - tau1, 0,0,0)
mendelian.probs[, 1, 3] <- c(tau2 / 2, 0.5, (1 - tau2) / 2, 0,0)
mendelian.probs[, 1, 4] <- c(0, tau3, 1 - tau3, 0,0)
mendelian.probs[, 1, 5] <- c(0,0,1,0,0)
mendelian.probs[, 2, 1] <- c(tau1, 1 - tau1, 0,0,0)
mendelian.probs[, 2, 2] <- c((tau1^2), 2 * (tau1 * (1 - tau1)), ((1 - tau1)^2), 0,0)
mendelian.probs[, 2, 3] <- c((tau1 * tau2) / 2, (tau2 * (1 - tau1) + tau1) / 2, (tau1 * (1-tau2) + (1 - tau1)) / 2, ((1 - tau1) * (1 - tau2)) / 2, 0)
mendelian.probs[, 2, 4] <- c(0, tau1 * tau3, tau1 * (1 - tau3) + (1 - tau1) * tau3, (1- tau1) * (1 - tau3), 0)
mendelian.probs[, 2, 5] <- c(0, 0, tau1, (1 - tau1), 0)
mendelian.probs[, 3, 1] <- c(tau2 / 2, 0.5, (1 - tau2) / 2, 0, 0)
mendelian.probs[, 3, 2] <- c((tau1 * tau2) / 2, (tau1 + tau2 * (1 - tau1)) / 2, ((1 - tau1) + (tau1 * (1-tau2)) ) / 2, (1 - tau1) * (1 - tau2) / 2, 0)
mendelian.probs[, 3, 3] <- c(tau2^2 / 4, tau2 / 2, (0.5 + tau2 * (1 - tau2)) / 2, (1 - tau2) / 2, (1 - tau2)^2 / 4)
mendelian.probs[, 3, 4] <- c(0, tau2 * tau3 / 2, (tau3 + tau2 * (1 - tau3)) / 2, (((1 - tau3) + (1 - tau2) * tau3) / 2), (1 - tau2) * (1 - tau1) /2)
mendelian.probs[, 3, 5] <- c(0, 0, tau2 / 2, 0.5, (1 - tau2) / 2)
mendelian.probs[, 4, 1] <- c(0, tau3, (1-tau3), 0, 0)
mendelian.probs[, 4, 2] <- c(0, tau1 * tau3, tau1 * (1 - tau3) + (1 - tau1) * tau3, (1 - tau1) * (1 - tau3), 0)
mendelian.probs[, 4, 3] <- c(0, tau2 * tau3 / 2, (tau3 + tau2 * (1 - tau3)) / 2, ((1 - tau3) + (1 - tau2) * tau3) / 2, (1 - tau2) * (1 - tau3) / 2)
mendelian.probs[, 4, 4] <- c(0,0, tau3^2, 2 * tau3 * (1 - tau3), (1 - tau3)^2)
mendelian.probs[, 4, 5] <- c(0,0,0, tau3, 1-tau3)
mendelian.probs[, 5, 1] <- c(0,0,1,0,0)
mendelian.probs[, 5, 2] <- c(0,0, tau1, 1 - tau1, 0)
mendelian.probs[, 5, 3] <- c(0,0, tau2 / 2, 0.5, (1 - tau2) / 2)
mendelian.probs[, 5, 4] <- c(0,0,0, tau3, 1 - tau3)
mendelian.probs[, 5, 5] <- c(0,0,0,0,1)
mendelian.probs
}
mprob.matrix <- function(tau=c(0.5, 0.5, 0.5), maplabel, error){
# to avoid confusion, maplabel1 will always be unique applied throughout all functions
# and maplabel will not
# so line below will repeat as needed
maplabel1 <- unique(maplabel)
# these conditionals means if maplabel has only one component = 2,
# then it defaults to biallelic matrix
if (all(any(maplabel1 < 2) & any(maplabel1 > 2))) {
mprob.mat <- mprob.matrix.mallelic(tau, maplabel)
} else {
mprob.mat <- mprob.matrix.biallelic(tau, maplabel, error)
}
setDT(mprob.mat)[, c("father","mother") := tstrsplit(parents, "")]
mprob.mat <- mprob.mat[, -1] %>%
as.tibble() %>%
mutate(father=as.numeric(father),
mother=as.numeric(mother)) %>%
as.matrix
# makes sure transmission matrix same size as maplabel, including repeats CN states
mprob.mat2 <- mprob.mat.resize(mprob.mat, maplabel)
# makes sure each row adds to 1
mprob.mat3 <- mprob.balance(mprob.mat2, maplabel)
mprob.mat3
}
mprob.matrix.mallelic <- function (tau=c(0.5, 0.5, 0.5), maplabel){
tau1 <- tau[1]
tau2 <- tau[2]
tau3 <- tau[3]
mendelian.probs <- array(dim=c(25,6))
colnames(mendelian.probs) <- c("parents", "p(0|f,m)", "p(1|f,m)", "p(2|f,m)", "p(3|f,m)", "p(4|f,m)")
mendelian.probs[1, 2:6] <- c(1,0,0,0,0)
mendelian.probs[2, 2:6] <- c(tau1, 1 - tau1, 0,0,0)
mendelian.probs[3, 2:6] <- c(tau2 / 2, 0.5, (1 - tau2) / 2, 0,0)
mendelian.probs[4, 2:6] <- c(0, tau3, 1 - tau3, 0,0)
mendelian.probs[5, 2:6] <- c(0,0,1,0,0)
mendelian.probs[6, 2:6] <- c(tau1, 1 - tau1, 0,0,0)
mendelian.probs[7, 2:6] <- c((tau1^2), 2 * (tau1 * (1 - tau1)), ((1 - tau1)^2), 0,0)
mendelian.probs[8, 2:6] <- c((tau1 * tau2) / 2, (tau2 * (1 - tau1) + tau1) / 2, (tau1 * (1-tau2) + (1 - tau1)) / 2, ((1 - tau1) * (1 - tau2)) / 2, 0)
mendelian.probs[9, 2:6] <- c(0, tau1 * tau3, tau1 * (1 - tau3) + (1 - tau1) * tau3, (1- tau1) * (1 - tau3), 0)
mendelian.probs[10, 2:6] <- c(0, 0, tau1, (1 - tau1), 0)
mendelian.probs[11, 2:6] <- c(tau2 / 2, 0.5, (1 - tau2) / 2, 0, 0)
mendelian.probs[12, 2:6] <- c((tau1 * tau2) / 2, (tau1 + tau2 * (1 - tau1)) / 2, ((1 - tau1) + (tau1 * (1-tau2)) ) / 2, (1 - tau1) * (1 - tau2) / 2, 0)
mendelian.probs[13, 2:6] <- c(tau2^2 / 4, tau2 / 2, (0.5 + tau2 * (1 - tau2)) / 2, (1 - tau2) / 2, (1 - tau2)^2 / 4)
mendelian.probs[14, 2:6] <- c(0, tau2 * tau3 / 2, (tau3 + tau2 * (1 - tau3)) / 2, (((1 - tau3) + (1 - tau2) * tau3) / 2), (1 - tau2) * (1 - tau1) /2)
mendelian.probs[15, 2:6] <- c(0, 0, tau2 / 2, 0.5, (1 - tau2) / 2)
mendelian.probs[16, 2:6] <- c(0, tau3, (1-tau3), 0, 0)
mendelian.probs[17, 2:6] <- c(0, tau1 * tau3, tau1 * (1 - tau3) + (1 - tau1) * tau3, (1 - tau1) * (1 - tau3), 0)
mendelian.probs[18, 2:6] <- c(0, tau2 * tau3 / 2, (tau3 + tau2 * (1 - tau3)) / 2, ((1 - tau3) + (1 - tau2) * tau3) / 2, (1 - tau2) * (1 - tau3) / 2)
mendelian.probs[19, 2:6] <- c(0,0, tau3^2, 2 * tau3 * (1 - tau3), (1 - tau3)^2)
mendelian.probs[20, 2:6] <- c(0,0,0, tau3, 1-tau3)
mendelian.probs[21, 2:6] <- c(0,0,1,0,0)
mendelian.probs[22, 2:6] <- c(0,0, tau1, 1 - tau1, 0)
mendelian.probs[23, 2:6] <- c(0,0, tau2 / 2, 0.5, (1 - tau2) / 2)
mendelian.probs[24, 2:6] <- c(0,0,0, tau3, 1 - tau3)
mendelian.probs[25, 2:6] <- c(0,0,0,0,1)
if(all((rowSums(mendelian.probs, na.rm=T))==1)==F) stop("mendelian matrix is incorrect")
# must do as.tibble step as cbind converts all cells into one type i.e. characters otherwise
mprob.mat <- as.tibble(mendelian.probs)
ref.geno <- reference.genotype(maplabel)
mprob.mat[, 1] <- ref.geno
mprob.mat <- mprob.subset2(mprob.mat, maplabel)
mprob.mat
}
# error term is hard coded into bi-allelic matrix for now
mprob.matrix.biallelic <- function (tau=c(0.5, 0.5, 0.5), maplabel, error = 0.001){
# note in biallelic, only 1 tau value required
tau1 <- tau[1]
allele1 <- c(1 - error, 1-tau1-error, 0 + error)
allele2 <- 1 - allele1
# kronecker's product
a1a1 <- allele1 %x% allele1
a1a2 <- allele1 %x% allele2 + allele2 %x% allele1
a2a2 <- allele2 %x% allele2
ref.geno <- reference.genotype(maplabel)
# put the three vectors together for the biallelic transmission matrix
#for single autosomal locus
mendelian.probs <- array(dim=c(9,4))
offspring.geno <- cbind(a1a1, a1a2, a2a2)
mendelian.probs[,2:4] <- offspring.geno
mprob.mat <- as.tibble(mendelian.probs)
mprob.mat[,1] <- ref.geno
# correctly label the columns
maplabel1 <- unique(maplabel)
if (all(maplabel1 < 3)){
maplabel2 <- c(0,1,2)
} else {
maplabel2 <- c(2,3,4)
}
colnames.label <- vector(length = 3)
for (i in 1:3) {
cn <- maplabel2[i]
label <- paste0("p(",cn,"|f,m)")
colnames.label[i] <- label
}
colnames(mprob.mat)[1] <- "parents"
colnames(mprob.mat)[2:4] <- colnames.label
mprob.mat <- mprob.subset2(mprob.mat, maplabel)
mprob.mat
}
reference.genotype <- function(maplabel){
maplabel1 <- unique(maplabel)
if (all(any(maplabel1 < 2) & any(maplabel1 > 2))) {
ref.geno <- c("00", "01", "02", "03", "04",
"10", "11", "12", "13", "14",
"20", "21", "22", "23", "24",
"30", "31", "32", "33", "34",
"40", "41", "42", "43", "44")
} else {
if (all(maplabel1 < 3)){ # this is deletion matrix reference genotypes
ref.geno <- c("00", "01", "02",
"10", "11", "12",
"20", "21", "22")
} else { # this would be duplication matrix reference genotypes
ref.geno <- c("22", "23", "24",
"32", "33", "34",
"42", "43", "44")
}
}
}
mprob.subset2 <- function(mprob.mat, maplabel) {
maplabel1 <- unique(maplabel)
K <- length(maplabel1)
M <- ifelse(all(maplabel1 > 1), 0, 2)
col.a <- maplabel1[1] + M
col.b <- maplabel1[K] + M
# reference genotype is an index to enable the relevant rows of the
#transmission matrix to be subset
ref.geno <- reference.genotype(maplabel)
index <- mprob.label2(maplabel)
rows <- match(index, ref.geno)
mprob.subset <- mprob.mat[rows, c(col.a:col.b)]
mprob.rows <- mprob.mat[rows, 1]
mprob.subset <- cbind(mprob.rows, mprob.subset)
mprob.subset
}
mprob.label2 <- function(maplabel){
maplabel1 <- unique(maplabel)
n <- length(maplabel1)
combo <- permutations(n=n, r=2, v=maplabel1, repeats.allowed=T)
geno.combo <- paste0(combo[,1], combo[,2])
geno.combo
}
mprob.mat.resize <- function(mprob.matrix, maplabel){
maplabel1 <- unique(maplabel)
K <- ncol(mprob.matrix)
mprob.mat2 <- mprob.matrix[,1:(K-2)]
mprob.mat2t <- t(mprob.mat2)
mprob.mat3 <- cbind(mprob.mat2t, maplabel1)
mprob.mat3 <- data.frame(mprob.mat3)
maplabel.sort <- data.frame(sort(maplabel))
colnames(maplabel.sort) <- "maplabels"
mprob.sized <- left_join(maplabel.sort, mprob.mat3, by=c("maplabels"="maplabel1"))
sized.matrix <- as.matrix(t(mprob.sized))
sized.matrix <- sized.matrix[-1,]
rownames(sized.matrix) <- NULL
L <- length(maplabel)
colnames.label <- vector(length = L)
for (i in 1:L) {
cn <- maplabel[i]
label <- paste0("p(",cn,"|f,m)")
colnames.label[i] <- label
}
colnames(sized.matrix) <- colnames.label
mprob.parents <- mprob.matrix[,(K-1):K]
mprob.mat.sized <- cbind(sized.matrix, mprob.parents)
mprob.mat.sized
}
mprob.balance <- function (mprob.matrix, maplabel){
cols <- length(maplabel)
mprob2 <- mprob.matrix[,1:cols]
mprob2 <- (mprob2)/(rowSums(mprob2))
mprob.matrix[, 1:cols] <- mprob2
mprob.matrix
}
component_stats <- function(tbl){
tbl2 <- tbl %>% group_by(batches, copy_number) %>%
summarize(mean=mean(log_ratio),
sd=sd(log_ratio),
n=n())
k <- length(unique(tbl2$batches))
denom <- sum(tbl2$n)/k
tbl3 <- tbl2 %>% group_by(batches, copy_number) %>% mutate(p = n / denom)
tbl3
}
# sensitivity and specificity function
# deprecate the following simulation functions:
.simulate_data_multi2 <- function(params, n, mprob, maplabel){
K <- nrow(params)
p <- params$p
theta <- params$theta
sigma2 <- params$sigma2
sigma <- sqrt(sigma2)
c_mk <- sample(1:K, size = n, replace = TRUE, prob = p)
c_fk <- sample(1:K, size = n, replace = TRUE, prob = p)
c_m <- maplabel[c_mk]
c_f <- maplabel[c_fk]
c_ok <- rep(NA, length = n)
C <- ncol(mprob)
for(i in 1:n){
cn_ms <- c_m[i]
cn_fs <- c_f[i]
mprob2 <- mprob[mprob[,"mother"]==cn_ms,]
mprob2 <- mprob2[mprob2[,"father"]==cn_fs,]
mprob3 <- mprob2[1:(C-2)]
c_ok[i] <- sample(1:K, size = 1, prob = mprob3)
}
c_o <- maplabel[c_ok]
id.index <- formatC(seq_len(n), flag="0", width=3)
logr.tbl <- tibble(m=rnorm(n, mean = theta[c_mk], sd = sigma[c_mk]),
f=rnorm(n, mean = theta[c_fk], sd = sigma[c_fk]),
o=rnorm(n, mean = theta[c_ok], sd = sigma[c_ok]),
id=factor(paste0("trio_", id.index))) %>%
gather(key="family_member", value="log_ratio", -id)
cn.mat <- cbind(c_m, c_f, c_o)
colnames(cn.mat) <- c("m", "f", "o")
cn.tbl <- as.tibble(cn.mat) %>%
mutate(id=factor(paste0("trio_", id.index))) %>%
gather(key="family_member", value="copy_number", -id)
tbl <- left_join(logr.tbl, cn.tbl, by=c("id", "family_member")) %>%
mutate(family_member=factor(family_member, levels=c("m", "f", "o"))) %>%
arrange(id, family_member)
tbl
}
# this function ensures the simulation generates the requisite number of components regardless of MAF parameter (p)
simulate_data_multi2 <- function(params, N, batches,
error=0, mendelian.probs,
maplabel){
tbl <- .simulate_data_multi2(params, N, mendelian.probs, maplabel)
# append batch info (assumes ordering by trios and id)
if(missing(batches)) {
batches <- rep(1L, 3*N)
} else {
batches <- as.integer(factor(batches))
}
batches <- sort(batches)
tbl2 <- cbind(tbl, batches)
tbl3 <- as_tibble(tbl2)
##
## update parameters to be same as empirical values
##
stats <- component_stats(tbl3)
while(nrow(stats)!=length(maplabel)){
tbl <- .simulate_data_multi2(params, N, mendelian.probs, maplabel)
batches <- sort(batches)
tbl2 <- cbind(tbl, batches)
tbl3 <- as_tibble(tbl2)
stats <- component_stats(tbl3)
}
p <- stats %>% group_by(copy_number) %>% summarize(p=mean(p))
sd <- stats %>% group_by(copy_number) %>% summarize(sd=mean(sd))
mean <- stats %>% group_by(copy_number) %>% summarize(mean=mean(mean))
params$p <- p$p
params$sigma2 <- (sd$sd)^2
params$theta <- mean$mean
komponent <- frank(tbl$copy_number, ties.method=c("dense"))
loglik <- sum(dnorm(tbl$log_ratio, params$theta[komponent],
sqrt(params$sigma2[komponent]), log=TRUE))
truth <- list(data=tbl3, params=params, loglik=loglik)
truth
}
simulate_data_multi3 <- function(params, N, batches, error=0, mendelian.probs, maplabel){
tbl <- .simulate_data_multi2(params, N, mendelian.probs, maplabel)
# append batch info (assumes ordering by trios and id)
if(missing(batches)) {
batches <- rep(1L, 3*N)
} else {
batches <- as.integer(factor(batches))
}
batches <- sort(batches)
tbl2 <- cbind(tbl, batches)
tbl3 <- as_tibble(tbl2)
##
## update parameters to be same as empirical values
##
stats <- component_stats(tbl3)
p <- stats %>% group_by(copy_number) %>% summarize(p=mean(p))
sd <- stats %>% group_by(copy_number) %>% summarize(sd=mean(sd))
mean <- stats %>% group_by(copy_number) %>% summarize(mean=mean(mean))
params$p <- p$p
params$sigma2 <- (sd$sd)^2
params$theta <- mean$mean
komponent <- frank(tbl$copy_number, ties.method=c("dense"))
loglik <- sum(dnorm(tbl$log_ratio, params$theta[komponent],
sqrt(params$sigma2[komponent]), log=TRUE))
truth <- list(data=tbl3, params=params, loglik=loglik)
truth
}
#
startAtTrueValues2 <- function(model, truth_stats, data){
rows <- length(unique(truth_stats$batches))
cols <- length(unique(truth_stats$copy_number))
theta(model) <- matrix(truth_stats$mean, nrow=rows, ncol=cols, byrow=T)
sigma2(model) <- matrix((truth_stats$sd)^2, nrow=rows, ncol=cols, byrow=T)
p(model) <- data$params$p
z(model) <- as.integer(data$data$copy_number)
zFreq1 <- truth_stats %>% group_by(copy_number) %>% summarize(zfreq=sum(n))
zFreq(model) <- zFreq1$zfreq
model
}
setMethod("triodata_lrr", "TrioBatchModel", function(object){
object@triodata$log_ratio
}
)
setMethod("updateZ", "TrioBatchModel", function(object){
update_z(object)
})
setMethod("computeMeans", "TrioBatchModel", function(object){
compute_means(object)
})
setMethod("computePrec", "TrioBatchModel", function(object){
compute_prec(object)
})
setMethod("computeModes", "TrioBatchModel", function(object){
.computeModesBatch(object)
})
componentVariances <- function(y, z) v <- sapply(split(y, z), var)
setMethod("computeVars", "TrioBatchModel", function(object){
compute_vars(object)
})
setMethod("show", "TrioBatchModel", function(object){
##callNextMethod()
cls <- class(object)
cat(paste0("An object of class ", cls), "\n")
cat(" n. obs :", length(y(object)), "\n")
cat(" n. batches :", nBatch(object), "\n")
cat(" k :", k(object), "\n")
cat(" nobs/batch :", table(batch(object)), "\n")
cat(" loglik (s) :", round(log_lik(object), 1), "\n")
cat(" logprior (s):", round(logPrior(object), 1), "\n")
})
#' @rdname k-method
#' @aliases k,TrioBatchModel-method
setMethod("k", "TrioBatchModel", function(object) object@k)
#' @rdname k-method
#' @aliases k<-,TrioBatchModel-method
setReplaceMethod("k", "TrioBatchModel",
function(object, value) {
k <- as.integer(value)
hypp <- hyperParams(object)
hypp@k <- k
hypp@alpha <- rep(1, k)
hyperParams(object) <- hypp
object@k <- k
object@pi <- rep(1/k, k)
object@probz <- matrix(0, length(y(object)), k)
object <- startingValues(object)
object
}
)
#' Retrieve mixture proportions of parents.
#'
#' @examples
#' \dontrun{
#' pp(TrioBatchModelExample)
#' }
#' @param object an object of class TrioBatchModel
#' @return A vector of length the number of components
#' @export
pp <- function(object) object@pi_parents
setReplaceMethod("pp", "TrioBatchModel", function(object, value){
object@pi_parents <- value
object
})
# adapted from old SNR script
# MC: will modify variables to be more self-explanatory later
snr.calc.median<-function(count.num, sd.num, deltas, lrr.list){
loop.length<-ifelse(length(lrr.list)!=1, length(lrr.list)-1, 1)
variance<-rep(NA,loop.length)
for (i in 1:loop.length){
var.calc<-((count.num[[i]]-1)*(sd.num[[i]]^2)+(count.num[[i+1]]-1)*(sd.num[[i+1]]^2))/(count.num[i]+count.num[i+1]-2)
variance[i]<-var.calc
}
snr<-deltas/(variance^0.5)
median.snr<-median(snr)
return(median.snr)
}
# wrapper function
snr.calc <- function(model){
lrr<-as.numeric(y(model))
calls<-as.numeric(z(model))
avglrr.list <- split(lrr, calls)
comp.means <- sapply(avglrr.list, mean)
comp.means <- sort(comp.means)
deltas <- diff(comp.means)
sds <- sapply(avglrr.list, sd)
count<-sapply(avglrr.list,length)
SNR<-snr.calc.median(count, sds, deltas, avglrr.list)
SNR
}
snr.calc.mean<-function(count.num, sd.num, deltas, lrr.list){
loop.length<-ifelse(length(lrr.list)!=1, length(lrr.list)-1, 1)
variance<-rep(NA,loop.length)
for (i in 1:loop.length){
var.calc<-((count.num[[i]]-1)*(sd.num[[i]]^2)+(count.num[[i+1]]-1)*(sd.num[[i+1]]^2))/(count.num[i]+count.num[i+1]-2)
variance[i]<-var.calc
}
snr<-deltas/(variance^0.5)
mean.snr<-mean(snr)
return(mean.snr)
}
snr.calc2 <- function(model){
lrr<-as.numeric(y(model))
calls<-as.numeric(z(model))
avglrr.list <- split(lrr, calls)
comp.means <- sapply(avglrr.list, mean)
comp.means <- sort(comp.means)
deltas <- diff(comp.means)
sds <- sapply(avglrr.list, sd)
count<-sapply(avglrr.list,length)
SNR<-snr.calc.mean(count, sds, deltas, avglrr.list)
SNR
}
# cross table in the same format as caret package
# specifically for 3 state cross table
modified.sens.spec.calc <- function(cross.table, cn.type=c("DEL", "DUP")){
# sensitivity = TP / TP+ FN
# specificity = TN / TN + FP
if(cn.type == "DEL"){
tp <- cross.table[2,2] + cross.table[1,1]
fn <- cross.table[3,1] + cross.table[3,2]
tn <- cross.table[3,3]
fp <- cross.table[1,2] + cross.table[1,3] + cross.table[2,1] + cross.table[2,3]
sens <- tp / (tp + fn)
spec <- tn / (tn + fp)
} else{
tp <- cross.table[2,2] + cross.table[3,3]
fn <- cross.table[1,2] + cross.table[1,3]
tn <- cross.table[1,1]
fp <- cross.table[2,1] + cross.table[2,3] + cross.table[3,1] + cross.table[3,2]
sens <- tp / (tp + fn)
spec <- tn / (tn + fp)
}
sens.spec <- list("sensitivity" = sens, "specificity" = spec)
return(sens.spec)
}
ci95.wrap <- function(vector) {
test <- t.test(vector)
ci95 <- test$conf.int
ci95
}
setMethod("isMendelian", "TrioBatchModel", function(object) {
object@is_mendelian
})
nTrios <- function(object) {
dat <- triodata(object)
length(unique(dat$id))
}
simulateTrioData <- function(theta=c(-1.2, 0.3, 1.7), maplabel=c(0,1,2),
error=0){
set.seed(123)
p <- c(0.24, 0.43, 0.33)
sigma2 <- c(0.05, 0.05, 0.05)
params <- data.frame(cbind(p, theta, sigma2))
mp <- McmcParams(iter=50, burnin=5)
nbatch <- 1
N <- 300
maplabel <- c(0,1,2)
mprob <- mprob.matrix(tau=c(0.5, 0.5, 0.5), maplabel, error=error)
dat2 <- simulate_data_multi2(params,
N=N,
batches = rep(c(1:nbatch),
length.out = 3*N),
error=0, mprob, maplabel)
hp <- HyperparametersTrios(k = 3)
model <- TBM(triodata=dat2$data,
hp=hp,
mp=mp,
mprob=mprob,
maplabel=maplabel)
dat <- dat2$data
list(model=model, true_z=as.integer(dat$copy_number) + 1L)
}
#' @rdname mcmcParams-method
#' @aliases mcmcParams,MixtureModel-method
setReplaceMethod("mcmcParams", "TrioBatchModel", function(object, value){
S <- iter(value)
B <- numBatch(object)
K <- k(object)
T <- nTrios(object)
if(iter(object) != S){
if(S > iter(object)){
object@mcmc.params <- value
## create a new chain
mcmc_chains <- initialize_mcmcT(K, S, B, T)
} else {
object@mcmc.params <- value
index <- seq_len(S)
mcmc_chains <- chains(object)[index, ]
mcmc_chains@iter <- S
mcmc_chains@B <- B
mcmc_chains@k <- K
}
chains(object) <- mcmc_chains
return(object)
}
## if we've got to this point, it must be safe to update mcmc.params
## (i.e., size of chains is not effected)
object@mcmc.params <- value
object
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.