#' Minus log-likelihood of the finite mixture model.
#'
#' @keywords internal
# This function returns minus the log-likelihood of the set of parameters "mu1",
# "sigma1", "mu2" and "sigma2", given dataset "data", distributions "D1" and
# "D2", and parameter "lambda".
mLL <- function(mu1,sigma1,mu2,sigma2,lambda,data,D1,D2) {
# "mu1", "mu2", "sigma1" and "sigma2" are the log of the parameter values.
# "lambda" and "data" are two numeric vectors of the same length.
# "D1" and "D2" are two functions of probability density.
params <- c(mu1,sigma1,mu2,sigma2)
names(params) <- c("mu1","sigma1","mu2","sigma2")
with(as.list(exp(params)), {
out <- lambda*D1(data,mu1,sigma1)
out <- out+(1-lambda)*D2(data,mu2,sigma2)
return(-sum(log(out)))
})
}
#-------------
#' Starting values of the parameter of a finite mixture model.
#'
#' @keywords internal
# This function calculates the starting values of parameter "lambda".
startval <- function(data,D1,D2) {
# require(tree) # for "tree".
thresh <- tree::tree(data~data)$frame$yval[1]
sel <- data<thresh
data1 <- data[sel]
data2 <- data[!sel]
lambda <- length(data1)/length(data)
param1 <- MASS::fitdistr(data1,D1)$est
param2 <- MASS::fitdistr(data2,D2)$est
out <- c(param1,param2,lambda)
names(out) <- c("mu1","sigma1","mu2","sigma2","lambda")
return(out)
}
#-------------
#' Expectation-Maximization estimation of a finite mixture model
#'
#' \code{em} returns points estimations of the parameters of a finite mixture
#' model using the Expectation-Maximization (E-M) algorithm.
#'
#' The finite mixture model considered in this function is a mixture of two
#' probability distributions that are one of the following: normal, log-normal,
#' gamma or Weibull. Each of these distributions is defined by two parameters:
#' a location and a scale parameter:
#' \tabular{lcc}{
#' \tab location \tab scale \cr
#' normal \tab mean \tab sd \cr
#' log-normal \tab meanlog \tab sdlog \cr
#' gamma \tab shape \tab rate \cr
#' Weibull \tab shape \tab scale
#' }
#' These parameters, together with the mixture parameter, are estimated by the
#' Expection-Maximization algorithm.
#'
#' @param data A vector of real numbers, the data to model with a finite mixture
#' model.
#' @param D1,D2 Probability distributions constituting the finite mixture model.
#' See Details.
#' @param t A numerical scalar indicating the value below which the E-M
#' algorithm should stop.
#' @return A list with class \code{em} containing the following components:
#' \item{lambda}{a numerical vector of length \code{length(data)} containing,
#' for each datum, the probability to belong to distribution \code{D1}.}
#' \item{param}{the location (mu) and scale (sigma) parameters of the
#' probability distributions \code{D1} and \code{D2}.}
#' \code{mu2} and \code{lambda2}.}
#' \item{D1,D2}{character scalars containing the names of the probability
#' distributions used in the finite mixture model.}
#' \item{data}{the numerical vector of data used as input.}
#' \item{data_name}{character scalar containing the name of the dataset used as
#' input.}
#' \item{out}{an object of class \code{mle2} that contains the
#' maximum-likelihood estimates of parameters \code{mu1}, \code{lambda1},
#' \item{t}{the input \code{t} argument value.}
#' @references
#' Chuong B. Do and Serafim Batzoglou (2008) What is the expectation
#' maximization algorithm? Nature Biotechnology 26(8): 897-899.\cr
#' \cr
#' Peter Schlattmann (2009) Medical Applications of Finite Mixture Models.
#' Springer-Verlag, Berlin.
#' @seealso \code{\link{confint.em}} method for calculating the confidence
#' intervals of the parameters and \code{\link{cutoff}} for deriving a
#' cut-off value.
#' @examples
#' # Measles IgG concentration data:
#' length(measles)
#' range(measles)
#' # Plotting the data:
#' hist(measles,100,F,xlab="concentration",ylab="density",ylim=c(0,.55),
#' main=NULL,col="grey")
#' # The kernel density:
#' lines(density(measles),lwd=1.5,col="blue")
#' # Estimating the parameters of the finite mixture model:
#' (measles_out <- em(measles,"normal","normal"))
#' # The confidence interval of the parameter estimates:
#' confint(measles_out,t=1e-64,nb=100,level=.95)
#' # Adding the E-M estimated finite mixture model:
#' lines(measles_out,lwd=1.5,col="red")
#' # The legend:
#' legend("topleft",leg=c("non-parametric","E-M"),col=c("blue","red"),
#' lty=1,lwd=1.5,bty="n")
#' @export
# This function uses the EM algorithm to calculates parameters "lambda"
# (E step), "mu1", "sigma1", "mu2" and "sigma2" (M step).
em <- function(data,D1,D2,t=1e-64) {
data_name <- unlist(strsplit(deparse(match.call()),"="))[2]
data_name <- sub(",.*$","",gsub(" ","",data_name))
start <- as.list(startval(data,D1,D2))
hash <- c(normal=dnorm,"log-normal"=dlnorm,gamma=dgamma,weibull=dweibull)
D1b <- hash[[D1]]
D2b <- hash[[D2]]
lambda0 <- 0 # the previous value of lambda (scalar).
with(start, {
while(abs(lambda0-mean(lambda))>t) {
lambda <- mean(lambda)
lambda0 <- lambda
# Expectation step:
distr1 <- lambda*D1b(data,mu1,sigma1)
distr2 <- (1-lambda)*D2b(data,mu2,sigma2)
lambda <- distr1/(distr1+distr2) # lambda is a vector.
# Minimization step (maximum-likelihood parameters estimations):
mLL2 <- function(mu1,sigma1,mu2,sigma2)
return(mLL(mu1,sigma1,mu2,sigma2,lambda,data,D1b,D2b))
start <- as.list(log(c(mu1=mu1,sigma1=sigma1,mu2=mu2,sigma2=sigma2)))
out <- bbmle::mle2(mLL2,start,"Nelder-Mead")
# The following 4 lines assign the MLE values to the corresponding parameters:
coef <- out@coef
coef_n <- names(coef)
names(coef) <- NULL
for(i in 1:4) assign(coef_n[i],exp(coef[i]))
}
# Put in shape and return the output:
out <- list(
lambda=lambda,param=exp(out@coef),D1=D1,D2=D2,deviance=out@min,
data=data,data_name=data_name,out=out,t=t)
class(out) <- "em"
return(out)
})
}
#-------------
#' Print method of S3-class "em".
#'
#' @S3method print em
print.em <- function(object) {
hash <- list(
normal=c("mean","sd"),
"log-normal"=c("meanlog","sdlog"),
gamma=c("shape","rate"),
weibull=c("shape","scale")
)
param <- as.list(object$param)
digits <- unlist(options("digits"))
cat(paste0("\n Finite mixture model fitting to dataset \"",
object$data_name,"\":\n\n"))
cat(paste("probability to belong to distribution 1 =",
round(mean(object$lambda),digits),"\n"))
cat(paste("distribution 1:",object$D1,"\n"))
cat(paste0(" location (",hash[[object$D1]][1],") = ",
round(param$mu1,digits),"\n"))
cat(paste0(" scale (",hash[[object$D1]][2],") = ",
round(param$sigma1,digits),"\n"))
cat(paste("distribution 2:",object$D2,"\n"))
cat(paste0(" location (",hash[[object$D2]][1],") = ",
round(param$mu2,digits),"\n"))
cat(paste0(" scale (",hash[[object$D2]][2],") = ",
round(param$sigma2,digits),"\n"))
cat(paste("deviance of the fitted model:",
round(object$deviance,digits),"\n\n"))
}
#-------------
#' Lines method of S3-class "em".
#'
#' @S3method lines em
lines.em <- function(object,...) {
# ...: parameter passed to the "line" function.
with(object,with(as.list(param), {
lambda <- mean(lambda)
curve(dnorm(x,mu1,sigma1),add=T,n=512,lty=2,...)
curve(dnorm(x,mu2,sigma2),add=T,n=512,lty=2,...)
curve(lambda*dnorm(x,mu1,sigma1)+
(1-lambda)*dnorm(x,mu2,sigma2),add=T,n=512,...)
}))
}
#-------------
#' Confint method of S3-class \code{em}.
#'
#' This method of the S3 \code{em} class calculates the confidence intervals of
#' the parameters of the fitted finite mixture model.
#'
#' Confidence intervals of the parameters of probability distributions are
#' calculated by the \code{confint} method of the S4 \code{confint} class of the
#' \code{bbmle} package with default values. See the help of this method for
#' technical details. The confidence interval of the \code{lambda} parameter is
#' calculated thanks to the information-based method of Oakes (1999). The
#' possible non-independance between lambda and the parameters of the probability
#' distributions is accounted for by Monte Carlo simulations where each iteration
#' consists in (i) sampling values of theses parameters in a multinormal
#' distribution, and (ii) applying the method of Oakes (1999). Samplings in the
#' multinormal distribution is performed by the \code{rmultinormal} function of
#' the \code{mc2d} package.
#' @inheritParams em
#' @param nb Number of Monte Carlo simulations.
#' @param level The confidence level required.
#' @return A dataframe containing point estimates (first column) and confidence
#' intervals (second and third columns) of each of the five parameters of the
#' finite mixture model (five row, one per parameter).
#' @references David Oakes (1999) Direct calculation of the information matrix
#' via the EM algorithm. J R Statist Soc B, 61: 479-482.
#' @S3method confint em
# This function returns the parameter values and their confidence
# intervals from an output of the "em" function.
confint.em <- function(object,t=1e-64,nb=10,level=.95) {
a <- coef_ci(object,level)
b <- lambda_ci(object,t,nb,level)
return(rbind(a,b))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.