#' Bayesian multinomial mixture model estimation using MCMC
#'
#' This function and model are under development. Do not use them, contact the
#' author if interested.
#'
#' There are essentially 4 variants of the model implemented by \code{bmmix}:
#' \itemize{ \item estimate mixture only (default)only the mixture coefficients
#' are estimated; frequencies (phi) are fixed to their maximum likelihood
#' estimate from the data; this model has 'K' parameters (where 'K' is the
#' number of putative origins, i.e. the number of columns in 'x').
#'
#' \item estimate mixture and frequenciesboth mixture coefficients and
#' frequencies for each group and origin are estimated; this model has (N+1)K
#' parameters (N being the number of rows in 'x'); to use this model, specify
#' \code{move.phi=TRUE}.
#'
#' \item estimate mixture, allowing unsampled originmixture coefficients are
#' estimated with an additional 'unsampled' origin whose frequencies are
#' estimated; this model has K+N+1 + parameters (N being the number of rows in
#' 'x'); to use this model, specify \code{move.phi=FALSE,model.unsampled=TRUE};
#' this is the only practical model allowing unsampled origin for medium-sized
#' or large datasets..
#'
#' \item estimate mixture, frequencies, and allow unsampled originthis is the
#' most complex model; in addition to the previous one, an unsampled origin is
#' allowed, and its frequencies are estimated; this model therefore has
#' (N+1)(K+1) parameters; to use this model, specify \code{move.phi=TRUE} and
#' \code{model.unsampled=TRUE}; note that if frequencies are not estimated
#' (\code{move.phi=FALSE}), the frequencies in the unsampled origin will be
#' fixed to their initial value in which all groups have the same frequency;
#' this model quickly becomes hard to fit for medium-sized to large datasets.
#'
#' }
#'
#' @aliases bmmix
#'
#' @param x a matrix containing multinomial data in columns used to compose the
#' mixture (i.e., each column is an 'origin').
#' @param y a vector of the same length as the number of rows of \code{x}
#' containing the response variable.
#' @param n the length of the MCMC.
#' @param sample.every an integer indicating the frequency at which to save
#' MCMC samples.
#' @param move.alpha a logical indicating whether the mixture coefficients
#' (alpha) should be estimated.
#' @param move.phi a logical indicating whether the frequencies in \code{x}
#' (phi) should be estimated; see details.
#' @param sd.alpha the standard deviation of the normal distribution used as
#' proposal for alpha.
#' @param sd.phi the standard deviation of the normal distribution used as
#' proposal for phi.
#' @param move.phi.every the frequency at which values of phi should be moved.
#' @param model.unsampled a logical indicating whether an 'unsampled origin'
#' should be allowed; if TRUE, then \code{move.phi} should be TRUE as well, to
#' allow for frequencies in this group to be estimated.
#' @param prior.unsampled.contrib the mean of the exponential distribution used
#' as prior for the contribution of the unsampled origin in the mixture; all
#' other origins have flat priors.
#' @param min.ini.freq the default minimum frequency of unobserved items in
#' \code{x} used for the initial frequency estimate.
#' @param file.out the name of the file used to store the outputs.
#' @param quiet a logical indicating whether output messages should be hidden.
#' @return A data.frame with class 'bmmix', containing the MCMC outputs: step,
#' log-posterior, log- likelihood, log-prior, alpha values (mixture
#' coefficients), and optionally frequencies for each group and origin (phi).
#'
#' @importFrom gtools rdirichlet ddirichlet
#'
#' @export bmmix
#'
#' @author Thibaut Jombart \email{t.jombart@@imperial.ac.uk}
#'
#' @examples
#'
#' ## GENERATE TOY DATA ##
#' ## ST frequencies in 3 origins:
#' ## dogs, cows, asymptotic cases in human
#' f.dogs <- c(.5, .3, .1, .1, 0)
#' f.cows <- c(.6, .1, .1, .1, .1)
#' f.asymp <- c(0, .1, .2, 0, .7)
#'
#' ## mixture (y would be symptomatic cases)
#' f.y <- .1*f.dogs + .1*f.cows + .8*f.asymp
#'
#' set.seed(1)
#' dogs <- rmultinom(1, 30, f.dogs)
#' cows <- rmultinom(1, 50, f.cows)
#' asymp <- rmultinom(1, 35, f.asymp)
#' X <- data.frame(dogs, cows, asymp,
#' row.names=paste("ST", letters[1:5]))
#' X
#' y <- rmultinom(1, 40, f.y)
#' y
#'
#' cbind(X,y)
#'
#'
#' ## RUN BMMIX ##
#'
#' ## BASIC MODEL
#' ## note: small n for this example only!
#' set.seed(1)
#' res <- bmmix(X,y, n=3e4)
#' head(res)
#'
#'
#' ## VISUALIZE RESULTS ##
#' if(require("ggplot2") && require("reshape2")){
#'
#' ## manually ##
#' ## chech log-posterior
#' ggplot(dat=res) + geom_line(aes(x=step, y=post)) +
#' labs(title="Trace of log-posterior values")
#'
#' ## check mixture coefficients
#' fig.dat <- melt(res, id=1:4)
#'
#' ggplot(dat=fig.dat, aes(x=step)) +
#' geom_line(aes(y=value, colour=variable)) +
#' labs(title="Trace of mixture coefficients")
#'
#'
#' ## with process.bmmix ##
#' ## mixture coefficients
#' temp <- process.bmmix(res, "alpha")
#' names(temp)
#' temp$alpha # values
#' temp$trace # graphics: trace
#' temp$hist # graphics: histograms
#' temp$dens # graphics: densities
#' temp$violin # graphics: violinplot
#'
#' }
#'
#'
#' \dontrun{
#' ## MODEL WITH ESTIMATED FREQUENCIES
#' set.seed(1)
#' res <- bmmix(X,y, move.phi=TRUE)
#' head(res)
#'
#' ## VISUALIZE RESULTS
#' if(require("ggplot2") && require("reshape2")){
#'
#' ## chech log-posterior
#' ggplot(dat=res) + geom_line(aes(x=step, y=post)) +
#' labs(title="Trace of log-posterior values")
#'
#' fig.dat <- melt(res[,1:7], id=1:4)
#'
#' ## check mixture coefficients
#' ggplot(dat=fig.dat, aes(x=step)) +
#' geom_line(aes(y=value, colour=variable)) +
#' labs(title="Trace of mixture coefficients")
#'
#' ## check ST frequencies, i.e. in dogs:
#' fig.dat <- melt(res[,c(1,grep("dogs", names(res))[-1])], id=1)
#'
#' ggplot(dat=fig.dat) +
#' geom_line(aes(x=step, y=value, colour=variable)) +
#' labs(title="Estimate of ST frequencies in dogs")
#'
#' ggplot(dat=fig.dat) +
#' geom_density(aes(x=value, fill=variable),alpha=.2) +
#' labs(title="Estimate of ST frequencies in dogs")
#' }
#' }
##########
## bmmix
##########
bmmix <- function(x, y, n=5e4, sample.every=200,
move.alpha=TRUE, move.phi=FALSE,
sd.alpha=0.1, sd.phi=0.05, move.phi.every=10,
model.unsampled=FALSE, prior.unsampled.contrib=0.1,
min.ini.freq=0.01,
file.out="mcmc.txt", quiet=FALSE){
## CHECKS ##
NEARZERO <- 1e-20
if(n/sample.every < 10) warning("less than 10 samples are going to be produced")
x <- as.matrix(x)
if(model.unsampled){
x <- cbind(x, unsampled=rep(0, nrow(x)))
rate.alpha.prior <- 1/prior.unsampled.contrib
}
K <- ncol(x)
N <- nrow(x)
if(N != length(y)) stop("The number of rows in x does not match the length of y")
## MAKE SURE THERE'S NO EMPTY SOURCE
if (any(!is.finite(x))) {
stop("Non-finite values detected in 'x'")
}
if (any(!is.finite(y))) {
stop("Non-finite values detected in 'y'")
}
sources_sums <- colSums(x)
if (any(sources_sums < 1)) {
stop("Some of the sources have no observation")
}
## LIKELIHOOD FUNCTIONS ##
## LIKELIHOOD OF CASE DATA 'Y' ##
## y is a vector of N numbers
## phi is a NxK matrix of numbers
## alpha is a vector of K mixture coefficients
LL.y <- function(y, phi, alpha){
phi.y <- phi %*% (alpha/sum(alpha))
return(dmultinom(y, prob=phi.y, log=TRUE))
}
## LIKELIHOOD OF PUTATIVE ORIGIN DATA 'x'
## y is a vector of N numbers
## phi is a NxK matrix of numbers
## alpha is a vector of K mixture coefficients
LL.x <- function(x, phi){
return(sum(sapply(1:ncol(x), function(i) dmultinom(x[,i], prob=phi[,i], log=TRUE))))
}
## LIKELIHOOD OF ALL DATA ##
LL.all <- function(y, x, phi, alpha){
return(LL.y(y, phi, alpha) + LL.x(x, phi))
}
## PRIOR FUNCTIONS ##
if(model.unsampled){
LPrior.alpha <- function(alpha){
return(dexp((alpha/sum(alpha))[K], rate=rate.alpha.prior, log=TRUE))
}
} else {
LPrior.alpha <- function(alpha){
return(0)
}
}
## POSTERIOR FUNCTIONS ##
LPost.all <- function(y, x, phi, alpha){
return(LL.all(y,x, phi, alpha) + LPrior.alpha(alpha))
}
## MOVEMENT FUNCTIONS ##
## MOVE ALPHA
ALPHA.ACC <- 0
ALPHA.REJ <- 0
SCALE.ALPHA <- 10
alpha.move <- function(alpha, sigma=sd.alpha){
## propose new vector ##
newval <- as.vector(rdirichlet(1, alpha*SCALE.ALPHA))
## correction factor for MH ##
## LL.corr = log(p(new->old)) - log(p(old->new))
LL.correc <- log(ddirichlet(alpha, newval*SCALE.ALPHA)) - log(ddirichlet(newval, alpha*SCALE.ALPHA))
if(all(newval>=0 & newval<=1)){
metro.ratio <- LL.y(y, phi, newval) - LL.y(y, phi, alpha) +
LPrior.alpha(newval) - LPrior.alpha(alpha) +
LL.correc
if((r <- log(runif(1))) <= metro.ratio){
alpha <- newval # accept
ALPHA.ACC <<- ALPHA.ACC+1
} else {
ALPHA.REJ <<- ALPHA.REJ+1
}
} else {
ALPHA.REJ <<- ALPHA.REJ+1
}
## return moved vector
return(alpha)
}
## MOVE PHI
PHI.ACC <- 0
PHI.REJ <- 0
SCALE.PHI <- 10
phi.move <- function(phi, sigma=sd.phi){
## check which one must move
if(move.phi) {
idx.toMove <- 1:K
} else if(model.unsampled){
idx.toMove <- K
} else return(phi)
## for all frequencies to move...
for(tomove in idx.toMove){
## generate all proposals ##
## newval <- rnorm(n=nrow(phi), mean=phi[,tomove], sd=sigma)
## newval <- newval/sum(newval)
newval <- as.vector(rdirichlet(1, phi[,tomove]*SCALE.PHI))
newphi <- phi
newphi[,tomove] <- newval
## correction factor for MH ##
## LL.corr = log(p(new->old)) - log(p(old->new))
LL.correc <- log(ddirichlet(phi[,tomove], newval*SCALE.PHI)) - log(ddirichlet(newval, phi[,tomove]*SCALE.PHI))
if(all(newval>=0 & newval<=1)){
if((r <- log(runif(1))) <= (LL.all(y, x, newphi, alpha) - LL.all(y, x, phi, alpha))){
phi <- newphi # accept
PHI.ACC <<- PHI.ACC+1
} else {
PHI.REJ <<- PHI.REJ+1
}
} else {
PHI.REJ <<- PHI.REJ+1
}
}
## return moved vector
return(phi)
}
## MAIN MCMC FUNCTION ##
## INITIALIZE MCMC
## initial alpha
alpha <- rep(1/K, K)
## initial phi
if(model.unsampled){
phi <- cbind(prop.table(x[,-K],2), "unsampled"=rep(1/nrow(x), nrow(x)))
} else {
phi <- prop.table(x,2)
}
## handle 'zero' replacement
nb.toreplace <- apply(phi,2, function(e) sum(e<NEARZERO))
replace.freq <- min.ini.freq/nb.toreplace
freq.tosubstract <- 0.01/(nrow(x)-nb.toreplace)
for(j in 1:ncol(phi)){
## replace zeros with small freq
phi.arezero <- phi[,j] < NEARZERO
phi[phi.arezero,j] <- replace.freq[j]
phi[!phi.arezero,j] <- phi[!phi.arezero,j] - freq.tosubstract[j]
}
## ADD HEADER TO THE OUTPUT FILE
## basic header
header <- "step\tpost\tlikelihood\tprior"
## header for alpha
if(move.alpha) header <- c(header, paste("alpha", colnames(x), sep=".", collapse="\t"))
## header for phi
if(move.phi) {
annot.phi <- paste("phi",
rownames(phi)[as.vector(row(phi))],
colnames(phi)[as.vector(col(phi))],
sep=".", collapse="\t")
header <- c(header, annot.phi)
} else if(model.unsampled){
annot.phi <- paste("phi",
rownames(phi),
"unsampled",
sep=".", collapse="\t")
header <- c(header, annot.phi)
}
## collapse everything and write to file
header <- paste(header, collapse="\t")
cat(header, file=file.out)
## add first line
## temp: c(loglike, logprior)
temp <- c(LL.all(y, x, phi, alpha), LPrior.alpha(alpha))
## check that initial LL is not -Inf
if(!is.finite(temp[1])) warning("Initial likelihood is zero")
## write to file
cat("\n", file=file.out, append=TRUE)
cat(c(1, sum(temp), temp), sep="\t", append=TRUE, file=file.out)
if(move.alpha) cat("", alpha/sum(alpha), sep="\t", append=TRUE, file=file.out)
if(move.phi){
cat("", as.vector(phi), sep="\t", append=TRUE, file=file.out)
} else if(model.unsampled){
cat("", as.vector(phi[,K]), sep="\t", append=TRUE, file=file.out)
}
if(!quiet) cat("\nStarting MCMC: 1")
## mcmc ##
for(i in 1:n){
## move stuff ##
## move alpha if needed
if(move.alpha) alpha <- alpha.move(alpha, sd.alpha)
## move phi if needed (phi.move makes the necessary moves)
if((move.phi|model.unsampled) && (i %% move.phi.every == 0)) phi <- phi.move(phi, sd.phi)
## if retain this sample ##
if(i %% sample.every ==0){
temp <- c(LL.all(y, x, phi, alpha), LPrior.alpha(alpha))
cat("\n", file=file.out, append=TRUE)
cat(c(i, sum(temp), temp), sep="\t", append=TRUE, file=file.out)
if(move.alpha) cat("", alpha/sum(alpha), sep="\t", append=TRUE, file=file.out)
if(move.phi) {
cat("", as.vector(phi), sep="\t", append=TRUE, file=file.out)
} else if(model.unsampled){
cat("", as.vector(phi[,K]), sep="\t", append=TRUE, file=file.out)
}
if(!quiet) cat("..",i)
}
}
if(!quiet) cat("..done!\nResults were saved in file:",file.out,"\n")
## re-read output file ##
out <- read.table(file.out, header=TRUE, colClasses="numeric", sep="\t")
out$step <- as.integer(out$step)
## format using coda ##
if(!quiet){
## acceptance rates for alpha
if(move.alpha){
cat("\nacceptance rate for alpha: ", ALPHA.ACC/(ALPHA.ACC+ALPHA.REJ))
cat("\naccepted: ", ALPHA.ACC)
cat("\nreject: ", ALPHA.REJ)
cat("\n")
}
## acceptance rates for phi
if(move.phi || model.unsampled){
cat("\nacceptance rate for phi: ", PHI.ACC/(PHI.ACC+PHI.REJ))
cat("\naccepted: ", PHI.ACC)
cat("\nreject: ", PHI.REJ)
cat("\n")
}
}
class(out) <- c("data.frame", "bmmix")
return(out)
} # end bmmix
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.