# ==================================================== #
# Maximum likelihood computation for nonparametric and #
# semiparametric mixture model #
# ==================================================== #
valid = function(x, beta, mix) UseMethod("valid")
suppspace = function(x, beta) UseMethod("suppspace")
gridpoints = function(x, beta, grid) UseMethod("gridpoints")
initial = function(x, beta, mix, kmax) UseMethod("initial")
logd = function(x, beta, pt, which) UseMethod("logd")
llex = function(x, beta, mix) UseMethod("llex")
llexdb = function(x, beta, mix) UseMethod("llexdb")
weight = function(x, beta) UseMethod("weight")
# Other generic functions needed: length
# print?
valid.default = function(x, beta, mix) TRUE
suppspace.default = function(x, beta) c(-Inf, Inf)
llex.default = function(x, beta, mix) 0
llexdb.default = function(x, beta, mix) 0
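# A minimal sketch (not part of the package) of how a user-defined family
# could implement these generics, shown here for a hypothetical exponential
# mixture class "npexp" that mixes over the rate parameter theta. All names
# and choices below are illustrative only (an initial.npexp method supplying
# a default mix and beta would also be needed):
#   npexp = function(v) structure(list(v = v), class = "npexp")
#   length.npexp = function(x) length(x$v)
#   weight.npexp = function(x, beta) 1
#   suppspace.npexp = function(x, beta) c(0, Inf)
#   gridpoints.npexp = function(x, beta, grid)
#     seq(0.01, 10 / mean(x$v), length = grid)
#   logd.npexp = function(x, beta, pt, which) {   # which = (ld, db, dt)
#     r = list()
#     if(which[1] == 1) r$ld = outer(x$v, pt, function(v, t) log(t) - t * v)
#     if(which[3] == 1) r$dt = outer(x$v, pt, function(v, t) 1 / t - v)
#     r   # no structural parameter here, so which[2] (db) is ignored
#   }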
# Compute the NPMLE or mixing proportions only
# model = prop, computes the mixing proportions
# = npmle, computes the NPMLE
##'Maximum Likelihood Estimation of a Nonparametric Mixture Model
##'
##'
##'Function \code{cnm} can be used to compute the maximum likelihood estimate
##'of a nonparametric mixing distribution (NPMLE) that has a one-dimensional
##'mixing parameter, or simply the mixing proportions with the support points
##'held fixed.
##'
##'
##'A finite mixture model has a density of the form
##'
##'\deqn{f(x; \pi, \theta, \beta) = \sum_{j=1}^k \pi_j f(x; \theta_j,
##'\beta).}{f(x; pi, theta, beta) = sum_{j=1}^k pi_j f(x; theta_j, beta),}
##'
##'where \eqn{\pi_j \ge 0}{pi_j >= 0} and \eqn{\sum_{j=1}^k \pi_j = 1}{sum_{j=1}^k pi_j = 1}.
##'
##'A nonparametric mixture model has a density of the form
##'
##'\deqn{f(x; G) = \int f(x; \theta) d G(\theta),}{f(x; G) = Integral f(x;
##'theta) d G(theta),} where \eqn{G} is a mixing distribution that is
##'completely unspecified. The maximum likelihood estimate of the nonparametric
##'\eqn{G}, or the NPMLE of \eqn{G}, is known to be a discrete distribution
##'function.
##'
##'Function \code{cnm} implements the CNM algorithm that is proposed in Wang
##'(2007) and the hierarchical CNM algorithm of Wang and Taylor (2013). The
##'implementation is generic using S3 object-oriented programming, in the sense
##'that it works for an arbitrary family of mixture models defined by the user.
##'The user, however, needs to supply the implementations of the following
##'functions for their self-defined family of mixture models, as they are
##'needed internally by function \code{cnm}:
##'
##'\code{initial(x, beta, mix, kmax)}
##'
##'\code{valid(x, beta, mix)}
##'
##'\code{logd(x, beta, pt, which)}
##'
##'\code{gridpoints(x, beta, grid)}
##'
##'\code{suppspace(x, beta)}
##'
##'\code{length(x)}
##'
##'\code{print(x, ...)}
##'
##'\code{weight(x, ...)}
##'
##'While not needed by the algorithm for finding the solution, one may also
##'implement
##'
##'\code{plot(x, mix, beta, ...)}
##'
##'so that the fitted model can be shown graphically in a user-defined way.
##'Inside \code{cnm}, it is used when \code{plot="probability"} so that the
##'convergence of the algorithm can be graphically monitored.
##'
##'For creating a new class, the user may consult the implementations of these
##'functions for the families of mixture models included in the package, e.g.,
##'\code{npnorm} and \code{nppois}.
##'
##'@param x a data object of some class that is fully defined by the user. The
##'user needs to supply certain functions as described below.
##'@param init list of user-provided initial values for the mixing distribution
##'\code{mix} and the structural parameter \code{beta}.
##'@param model the type of model to be estimated: the nonparametric MLE (if
##'\code{npmle}), or the mixing proportions only (if \code{proportions}).
##'@param maxit maximum number of iterations.
##'@param tol a tolerance value used to terminate the algorithm.
##'Specifically, the algorithm is terminated if the increase of the
##'log-likelihood value after an iteration is less than \code{tol}.
##'@param grid number of grid points that are used by the algorithm to locate
##'all the local maxima of the gradient function. A larger number increases the
##'chance of locating all local maxima, at the expense of an increased
##'computational cost. The locations of the grid points are determined by the
##'function \code{gridpoints} provided by each individual mixture family, and
##'they do not have to be equally spaced. If needed, a \code{gridpoints}
##'function may choose to return a different number of grid points than
##'specified by \code{grid}.
##'@param plot whether a plot is produced at each iteration, which is useful
##'for monitoring the convergence of the algorithm. If \code{"null"}, no plot
##'is produced; if \code{"gradient"}, the gradient curves are plotted; and if
##'\code{"probability"}, the \code{plot} function defined by the user for the
##'class is used.
##'@param verbose verbosity level for printing intermediate results in each
##'iteration, including none (= 0), the log-likelihood value (= 1), the maximum
##'gradient (= 2), the support points of the mixing distribution (= 3), the
##'mixing proportions (= 4), and if available, the value of the structural
##'parameter beta (= 5).
##'@return
##'
##'\item{family}{the name of the mixture family that is used to fit the
##'data.}
##'
##'\item{num.iterations}{number of iterations required by the algorithm}
##'
##'\item{max.gradient}{maximum value of the gradient function, evaluated at the
##'beginning of the final iteration}
##'
##'\item{convergence}{convergence code. \code{=0} means success, and
##'\code{=1} means that the maximum number of iterations was reached.}
##'
##'\item{ll}{log-likelihood value at convergence}
##'
##'\item{mix}{MLE of the mixing distribution, being an object of the class
##'\code{disc} for discrete distributions.}
##'
##'\item{beta}{value of the structural parameter, which is held fixed
##'throughout the computation.}
##'@author Yong Wang <yongwang@@auckland.ac.nz>
##'@seealso \code{\link{nnls}}, \code{\link{npnorm}}, \code{\link{nppois}},
##'\code{\link{cnmms}}.
##'@references
##'
##'Wang, Y. (2007). On fast computation of the non-parametric maximum
##'likelihood estimate of a mixing distribution. \emph{Journal of the Royal
##'Statistical Society, Ser. B}, \bold{69}, 185-198.
##'
##'Wang, Y. (2010). Maximum likelihood computation for fitting semiparametric
##'mixture models. \emph{Statistics and Computing}, \bold{20}, 75-86.
##'
##'Wang, Y. and Taylor, S. M. (2013). Efficient computation of nonparametric
##'survival functions via a hierarchical mixture formulation. \emph{Statistics
##'and Computing}, \bold{23}, 713-725.
##'@keywords function
##'@examples
##'
##'## Simulated data
##'x = rnppois(1000, disc(c(1,4), c(0.7,0.3))) # Poisson mixture
##'(r = cnm(x))
##'plot(r, x)
##'
##'x = rnpnorm(1000, disc(c(0,4), c(0.3,0.7)), sd=1) # Normal mixture
##'plot(cnm(x), x) # sd = 1
##'plot(cnm(x, init=list(beta=0.5)), x) # sd = 0.5
##'mix0 = disc(seq(min(x$v),max(x$v), len=100)) # over a finite grid
##'plot(cnm(x, init=list(beta=0.5, mix=mix0), model="p"),
##' x, add=TRUE, col="blue") # An approximate NPMLE
##'
##'## Real-world data
##'data(thai)
##'plot(cnm(x <- nppois(thai)), x) # Poisson mixture
##'
##'data(brca)
##'plot(cnm(x <- npnorm(brca)), x) # Normal mixture
##'
##'
##'@export cnm
cnm = function(x, init=NULL, model=c("npmle","proportions"), maxit=100, tol=1e-6,
grid=100, plot=c("null", "gradient", "probability"),
verbose=0) {
if(is.null(class(x))) stop("Data belongs to no class")
plot = match.arg(plot)
model = match.arg(model)
k = length(x)
if(model == "proportions") {
if(is.null(init$mix))
stop("init$mix must be provided for model=='proportions'")
pt0 = sort(unique(init$mix$pt))
m0 = length(pt0)
ind = unique(round(seq(1, length(pt0), len=100)))
init$mix = disc(pt0[ind])
}
init = initial.snpmle(x, init)
beta = init$beta
nb = length(beta)
mix = init$mix
attr(mix, "ll") = init$ll
convergence = 1
gp = gridpoints(x, beta, grid) # computed once; beta is held fixed in cnm()
w = weight(x, beta)
wr = sqrt(w)
if(maxit == 0)
return( structure(list(family=class(x)[1], mix=mix, beta=beta,
num.iterations=0, ll=attr(mix, "ll")[1],
max.gradient=max(maxgrad(x, beta, mix, tol=-5)$grad),
convergence=1),
class = "nspmix") )
for(i in 1:maxit) {
ll = attr(mix, "ll")
dmix = attr(ll, "dmix")
ma = attr(ll, "logd") - log(dmix)
switch(plot,
"gradient" = plotgrad(x, mix, beta, pch=1),
"probability" = plot(x, mix, beta) )
mix1 = mix
if(model == "npmle") {
g = maxgrad(x, beta, mix, grid=gp, tol=-5)
if(plot=="gradient") points(g$pt, g$grad, col="red", pch=20)
gmax = g$gmax
mix = disc(c(mix$pt,g$pt), c(mix$pr,rep(0,length(g$pt))))
}
else {
gpt0 = grad(pt0, x, beta, mix)$d0
ind = indx(pt0, mix$pt)
ind[ind == 0] = 1
s = split(gpt0, ind)
imax = sapply(s, which.max)
gmax = max(gpt0[imax])
imax = imax[imax > 1]
csum = c(0,cumsum(sapply(s, length)))
ipt = csum[as.numeric(names(imax))] + imax
mix = disc(c(mix$pt, pt0[ipt]), c(mix$pr, double(length(ipt))))
}
lpt = logd(x, beta, mix$pt, which=c(1,0,0))$ld
D = pmin(exp(lpt - ma), 1e100)
if(verbose > 0)
print.snpmle(verbose, attr(mix1, "ll")[1], mix1, beta, gmax, i-1)
r = hcnm(D, mix$pr, w=w, maxit=3)
mix$pr = r$pf
# attr(mix, "ll") = r$ll + sum(w * ma)
mix = collapse.snpmle(mix, beta, x, tol=0)
if( attr(mix, "ll")[1] <= attr(mix1, "ll")[1] + tol )
{convergence = 0; break}
}
if(model == "npmle")
mix = collapse.snpmle(mix, beta, x,
tol=max(tol*.1, abs(attr(mix, "ll")[1])*1e-16))
ll = attr(mix,"ll")[1]
attr(mix,"ll") = NULL
structure(list(family=class(x)[1], num.iterations=i, max.gradient=gmax,
convergence=convergence, ll=ll, mix=mix, beta=beta),
class="nspmix")
}
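# An illustrative sketch of inspecting a cnm() fit, using the package's
# npnorm family (values are not authoritative):
#   x = rnpnorm(500, disc(c(0, 4), c(0.3, 0.7)), sd = 1)
#   r = cnm(x, verbose = 1)   # print the log-likelihood at each iteration
#   r$convergence             # 0 on success
#   r$mix                     # discrete NPMLE of the mixing distribution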
## Hierarchical CNM
## S = D / P
hcnm = function(D, p0, w=1, maxit=3, tol=1e-6,
blockpar=NULL, recurs.maxit=2, depth=1) {
nx = nrow(D)
w = rep(w, length=nx)
wr = sqrt(w)
n = sum(w)
converge = FALSE
m = ncol(D)
j = 1:m
nblocks = 1
## maxdepth = depth
if(length(p <- p0) != m) stop("Argument 'p0' is the wrong length.")
p = p / sum(p)
P = drop(D %*% p)
ll = sum(w * log(P)) # log-likelihood relative to D
evenstep = FALSE
for(iter in 1:maxit) {
p.old = p
ll.old = ll
S = D / pmax(P, 1e-100)
g = colSums(w * S)
dmax = max(g) - n
sj = m
if(is.null(blockpar) || is.na(blockpar))
iter.blockpar = ifelse(sj < 30, 0,
1 - log(max(20,10*log(sj/100)))/log(sj))
else iter.blockpar = blockpar
if(iter.blockpar==0 | sj < 30) {
nblocks = 1
BW = matrix(1, nrow=sj, ncol=1)
}
else {
nblocks = max(1, if(iter.blockpar>1) round(sj/iter.blockpar)
else floor(min(sj/2, sj^iter.blockpar)))
i = seq(0, nblocks, length=sj+1)[-1]
if(evenstep) {
nblocks = nblocks + 1
BW = outer(round(i)+1, 1:nblocks, "==")
}
else BW = outer(ceiling(i), 1:nblocks, "==")
storage.mode(BW) = "numeric"
}
for(block in 1:nblocks) {
jj = logical(m)
jj[j] = BW[,block] > 0
sjj = sum(jj)
if (sjj > 1 && (delta <- sum(p.old[jj])) > 0) {
Sj = S[,jj,drop=FALSE]
res = pnnls(wr * Sj, wr * drop(Sj %*% p.old[jj]) + wr, sum=delta)
## if (res$mode > 1) warning("Problem in pnnls(a,b)")
p[jj] = p[jj] + BW[jj[j],block] *
(res$x * (delta / sum(res$x)) - p.old[jj])
}
}
p.gap = p - p.old
ll.rise.gap = sum(g * p.gap)
alpha = 1
p.alpha = p
ll.rise.alpha = ll.rise.gap
repeat {
P = drop(D %*% p.alpha)
ll = sum(w * log(P))
if(ll >= ll.old && ll + ll.rise.alpha <= ll.old) {
p = p.alpha
converge = TRUE
break
}
if(ll > ll.old && ll >= ll.old + ll.rise.alpha * .33) {
p = p.alpha
break
}
if((alpha <- alpha * 0.5) < 1e-10) {
p = p.old
P = drop(D %*% p)
ll = ll.old
converge = TRUE
break
}
p.alpha = p.old + alpha * p.gap
ll.rise.alpha = alpha * ll.rise.gap
}
if(converge) break
if (nblocks > 1) {
Q = sweep(BW,1,p[j],"*")
q = colSums(Q)
Q = sweep(D[,j] %*% Q, 2, q, "/")
if( any(q == 0) ) {
## warning("A block has zero probability!")
}
else {
res = hcnm(D=Q, p0=q, w=w, blockpar=iter.blockpar,
maxit=recurs.maxit, recurs.maxit=recurs.maxit,
depth=depth+1)
## maxdepth = max(maxdepth, res$maxdepth)
if (res$ll > ll) {
p[j] = p[j] * (BW %*% (res$pf / q))
P = drop(D %*% p)
ll = sum(w * log(P))
}
}
}
if(iter > 2) if( ll <= ll.old + tol ) {converge=TRUE; break}
evenstep = !evenstep
}
list(pf=p, convergence=converge, method="hcnm", ll=ll,
maxgrad=max(crossprod(w/P, D))-n, numiter=iter)
}
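# An illustrative direct use of the internal hcnm(): given a fixed support,
# it updates the mixing proportions from the component-density matrix
# D[i,j] = f(x_i; theta_j). A minimal sketch with Poisson components,
# assuming only base R:
#   x = rpois(500, sample(c(1, 4), 500, replace = TRUE, prob = c(.7, .3)))
#   pt = seq(0.5, 6, by = 0.5)         # fixed support points
#   D = outer(x, pt, dpois)            # n x m component-density matrix
#   r = hcnm(D, p0 = rep(1 / length(pt), length(pt)), maxit = 50)
#   round(r$pf, 3)                     # fitted mixing proportions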
# Updates mix using CNM and then (pi, theta, beta) using BFGS,
# modifying the support set along the way.
##'Maximum Likelihood Estimation of a Semiparametric Mixture Model
##'
##'
##'Functions \code{cnmms}, \code{cnmpl} and \code{cnmap} can be used to compute
##'the maximum likelihood estimate of a semiparametric mixture model that has a
##'one-dimensional mixing parameter. The types of mixture models that can be
##'computed include finite, nonparametric and semiparametric ones.
##'
##'Function \code{cnmms} can also be used to compute the maximum likelihood
##'estimate of a finite or nonparametric mixture model.
##'
##'A finite mixture model has a density of the form
##'
##'\deqn{f(x; \pi, \theta, \beta) = \sum_{j=1}^k \pi_j f(x; \theta_j,
##'\beta).}{f(x; pi, theta, beta) = sum_{j=1}^k pi_j f(x; theta_j, beta),}
##'
##'where \eqn{\pi_j \ge 0}{pi_j >= 0} and \eqn{\sum_{j=1}^k \pi_j = 1}{sum_{j=1}^k pi_j = 1}.
##'
##'A nonparametric mixture model has a density of the form
##'
##'\deqn{f(x; G) = \int f(x; \theta) d G(\theta),}{f(x; G) = Integral f(x;
##'theta) d G(theta),} where \eqn{G} is a mixing distribution that is
##'completely unspecified. The maximum likelihood estimate of the nonparametric
##'\eqn{G}, or the NPMLE of \eqn{G}, is known to be a discrete distribution
##'function.
##'
##'A semiparametric mixture model has a density of the form
##'
##'\deqn{f(x; G, \beta) = \int f(x; \theta, \beta) d G(\theta),}{f(x; G, beta)
##'= Int f(x; theta, beta) d G(theta),}
##'
##'where \eqn{G} is a mixing distribution that is completely unspecified and
##'\eqn{\beta}{beta} is the structural parameter.
##'
##'Of the three functions, \code{cnmms} is recommended for most problems; see
##'Wang (2010).
##'
##'Functions \code{cnmms}, \code{cnmpl} and \code{cnmap} implement the
##'algorithms CNM-MS, CNM-PL and CNM-AP that are described in Wang (2010).
##'Their implementations are generic using S3 object-oriented programming, in
##'the sense that they can work for an arbitrary family of mixture models that
##'is defined by the user. The user, however, needs to supply the
##'implementations of the following functions for their self-defined family of
##'mixture models, as they are needed internally by the functions above:
##'
##'\code{initial(x, beta, mix, kmax)}
##'
##'\code{valid(x, beta, mix)}
##'
##'\code{logd(x, beta, pt, which)}
##'
##'\code{gridpoints(x, beta, grid)}
##'
##'\code{suppspace(x, beta)}
##'
##'\code{length(x)}
##'
##'\code{print(x, ...)}
##'
##'\code{weight(x, ...)}
##'
##'While not needed by the algorithms, one may also implement
##'
##'\code{plot(x, mix, beta, ...)}
##'
##'so that the fitted model can be shown graphically in a way that the user
##'desires.
##'
##'For creating a new class, the user may consult the implementations of these
##'functions for the families of mixture models included in the package, e.g.,
##'\code{cvp} and \code{mlogit}.
##'
##'@aliases cnmms cnmpl cnmap
##'@param x a data object of some class that can be defined fully by the user
##'@param init list of user-provided initial values for the mixing distribution
##'\code{mix} and the structural parameter \code{beta}
##'@param model the type of model to be estimated: the nonparametric MLE
##'(\code{npmle}) or the semiparametric MLE (\code{spmle}).
##'@param maxit maximum number of iterations
##'@param tol a tolerance value used to terminate the algorithm.
##'Specifically, the algorithm is terminated if the relative increase of the
##'log-likelihood value after an iteration is less than \code{tol}. If an
##'algorithm converges rapidly enough, then \code{-log10(tol)} is roughly the
##'number of accurate digits in the log-likelihood.
##'@param tol.npmle a tolerance value used to terminate the internal
##'computation of the NPMLE.
##'@param grid number of grid points that are used by the algorithm to locate
##'all the local maxima of the gradient function. A larger number increases the
##'chance of locating all local maxima, at the expense of an increased
##'computational cost. The locations of the grid points are determined by the
##'function \code{gridpoints} provided by each individual mixture family, and
##'they do not have to be equally spaced. If needed, an individual
##'\code{gridpoints} function may return a different number of grid points than
##'specified by \code{grid}.
##'@param kmax upper bound on the number of support points. This is
##'particularly useful for fitting a finite mixture model.
##'@param plot whether a plot is produced at each iteration, which is useful
##'for monitoring the convergence of the algorithm. If \code{"null"}, no plot
##'is produced; if \code{"gradient"}, the gradient curves are plotted; and if
##'\code{"probability"}, the \code{plot} function defined by the user for the
##'class is used.
##'@param verbose verbosity level for printing intermediate results in each
##'iteration, including none (= 0), the log-likelihood value (= 1), the maximum
##'gradient (= 2), the support points of the mixing distribution (= 3), the
##'mixing proportions (= 4), and if available, the value of the structural
##'parameter beta (= 5).
##'@return
##'
##'\item{family}{the class of the mixture family that is used to fit the
##'data.}
##'
##'\item{num.iterations}{Number of iterations required by the algorithm}
##'
##'\item{grad}{For \code{cnmms}, it contains the values of the gradient
##'function at the support points and the first derivatives of the
##'log-likelihood with respect to theta and beta. For \code{cnmpl}, it contains
##'only the first derivatives of the log-likelihood with respect to beta. For
##'\code{cnmap}, it contains only the gradient of \code{beta}.}
##'
##'\item{max.gradient}{Maximum value of the gradient function, evaluated at the
##'beginning of the final iteration. It is only given by function
##'\code{cnmap}.}
##'
##'\item{convergence}{convergence code. \code{=0} means success, and
##'\code{=1} means that the maximum number of iterations was reached.}
##'
##'\item{ll}{log-likelihood value at convergence}
##'
##'\item{mix}{MLE of the mixing distribution, being an object of the class
##'\code{disc} for discrete distributions}
##'
##'\item{beta}{MLE of the structural parameter}
##'@author Yong Wang <yongwang@@auckland.ac.nz>
##'@seealso \code{\link{nnls}}, \code{\link{cnm}}, \code{\link{cvp}},
##'\code{\link{cvps}}, \code{\link{mlogit}}.
##'@references
##'
##'Wang, Y. (2007). On fast computation of the non-parametric maximum
##'likelihood estimate of a mixing distribution. \emph{Journal of the Royal
##'Statistical Society, Ser. B}, \bold{69}, 185-198.
##'
##'Wang, Y. (2010). Maximum likelihood computation for fitting semiparametric
##'mixture models. \emph{Statistics and Computing}, \bold{20}, 75-86.
##'@keywords function
##'@examples
##'
##'## Compute the MLE of a finite mixture
##'x = rnpnorm(100, disc(c(0,4), c(0.7,0.3)), sd=1)
##'for(k in 1:6) plot(cnmms(x, kmax=k), x, add=(k>1), comp="null", col=k+1,
##' main="Finite Normal Mixtures")
##'legend("topright", 0.3, leg=paste0("k = ",1:6), lty=1, lwd=2, col=2:7)
##'
##'## Compute a semiparametric MLE
##'# Common variance problem
##'x = rcvps(k=50, ni=5:10, mu=c(0,4), pr=c(0.7,0.3), sd=3)
##'cnmms(x) # CNM-MS algorithm
##'cnmpl(x) # CNM-PL algorithm
##'cnmap(x) # CNM-AP algorithm
##'
##'# Logistic regression with a random intercept
##'x = rmlogit(k=30, gi=3:5, ni=6:10, pt=c(0,4), pr=c(0.7,0.3),
##' beta=c(0,3))
##'cnmms(x)
##'
##'data(toxo) # k = 136
##'cnmms(mlogit(toxo))
##'
##'@usage
##'cnmms(x, init=NULL, maxit=1000, model=c("spmle","npmle"), tol=1e-6,
##' grid=100, kmax=Inf, plot=c("null", "gradient", "probability"),
##' verbose=0)
##'cnmpl(x, init=NULL, tol=1e-6, tol.npmle=tol*1e-4, grid=100, maxit=1000,
##' plot=c("null", "gradient", "probability"), verbose=0)
##'cnmap(x, init=NULL, maxit=1000, tol=1e-6, grid=100, plot=c("null",
##' "gradient"), verbose=0)
##'@export cnmms
##'@export cnmpl
##'@export cnmap
cnmms = function(x, init=NULL, maxit=1000,
model=c("spmle","npmle"), tol=1e-6, grid=100, kmax=Inf,
plot=c("null", "gradient", "probability"), verbose=0) {
if(is.null(class(x))) stop("Data belongs to no class")
plot = match.arg(plot)
model = match.arg(model)
init = initial.snpmle(x, init, kmax=kmax)
beta = init$beta
if(is.null(beta) || any(is.na(beta))) model = "npmle"
nb = length(beta)
mix = init$mix
attr(mix, "ll") = init$ll
gradient = "Not computed"
switch(plot,
"gradient" = plotgrad(x, mix, beta, pch=1),
"probability" = plot(x, mix, beta) )
## ll = -Inf # logLik.snpmle(x, beta, mix, attr=FALSE)
if(maxit == 0)
return( structure(list(family=class(x)[1], mix=mix, beta=beta,
num.iterations=0, ll=attr(mix, "ll")[1],
max.gradient=max(maxgrad(x, beta, mix, tol=-5)$grad),
convergence=1),
class="nspmix") )
convergence = 1
for(i in 1:maxit) {
mix1 = mix
if(length(mix$pt) < kmax) {
gp = gridpoints(x, beta, grid)
g = maxgrad(x, beta, mix, grid=gp, tol=-Inf)
gmax = g$gmax
kpt = min(kmax - length(mix$pt), length(g$pt))
jpt = order(g$grad, decreasing=TRUE)
mix = disc(c(mix$pt,g$pt[jpt][1:kpt]), c(mix$pr,rep(0,kpt)))
if(plot=="gradient") points(g$pt, g$grad, col="red", pch=20)
}
if(verbose > 0)
print.snpmle(verbose, attr(mix1, "ll")[1], mix1, beta, gmax, i-1)
lpt = logd(x, beta, mix$pt, which=c(1,0,0))$ld
S = pmin(exp(lpt - attr(attr(mix1,"ll"),"logd")), 1e100)
w = weight(x, beta)
wr = sqrt(w)
grad.support = colSums(w * (S - 1))
r = pnnls(wr * S, wr * 2, sum=1)
sol = r$x / sum(r$x)
r = lsch(mix, beta, disc(mix$pt,sol), beta, x, which=c(1,0,0))
mix = r$mix
attr(mix, "ll") = logLik.snpmle(x, beta, mix)
mix = collapse.snpmle(mix, beta, x, tol=0)
if(max(grad.support) < 1e10) { # 1e20
r = switch(model,
spmle = bfgs(mix, beta, x, which=c(1,1,1)),
npmle = bfgs(mix, beta, x, which=c(1,1,0)) )
beta = r$beta
mix = collapse.snpmle(r$mix, beta, x, tol=0)
}
switch(plot,
"gradient" = plotgrad(x, mix, beta, pch=1),
"probability" = plot(x, mix, beta) )
if(attr(mix,"ll")[1] <= attr(mix1,"ll")[1] + tol) {convergence = 0; break}
}
mix = collapse.snpmle(mix, beta, x,
tol=max(tol*.1, abs(attr(mix, "ll")[1])*1e-16))
m = length(mix$pt)
if(m < length(r$mix$pt)) {
d = dll.snpmle(x, mix, beta, which=c(0,1,1,1))
grad = c(d$dp, d$dt, d$db)
names(grad) = c(paste0("pr",1:m), paste0("pt",1:m),
if(is.null(d$db)) NULL else paste0("beta",1:length(beta)) )
}
else grad = r$grad
grad[1:m] = grad[1:m] - sum(rep(weight(x, beta), len=length(x)))
ll = attr(mix,"ll")[1]
attr(mix,"ll") = NULL
structure(list(family=class(x)[1], num.iterations=i, grad=grad,
convergence=convergence, ll=ll, mix=mix, beta=beta),
class="nspmix")
}
# Use the BFGS method to maximize the profile likelihood function
cnmpl = function(x, init=NULL, tol=1e-6,
tol.npmle=tol*1e-4, grid=100, maxit=1000,
plot=c("null", "gradient", "probability"), verbose=0) {
if(is.null(class(x))) stop("Data belongs to no class")
plot = match.arg(plot)
init = initial.snpmle(x, init)
beta = init$beta
mix = init$mix
nb = length(beta)
if(is.null(beta) || any(is.na(beta)))
stop("No proper initial value for beta. Use cnm() for NPMLE problems.")
switch(plot,
"gradient" = plotgrad(x, mix, beta, pch=1),
"probability" = plot(x, mix, beta) )
r = pll(beta, mix, x, tol=tol.npmle, grid=grid)
D = diag(-1, nrow=nb)
convergence = 1
num.npmle = 1
if(maxit == 0) {
r = list(family=class(x)[1], mix=mix, beta=beta, num.iterations=0,
ll=logLik.snpmle(x, beta, mix),
max.gradient=max(maxgrad(x, beta, mix, tol=-5)$grad), convergence=1)
class(r) = "nspmix"
return(r)
}
for(i in 1:maxit) {
old.r = r
beta2 = drop(r$beta - D %*% r$grad)
r = lsch.pll(old.r, beta2, x, tol=tol, tol.npmle=tol.npmle,
grid=grid, brkt=TRUE)
switch(plot,
"gradient" = plotgrad(x, r$mix, r$beta, pch=1),
"probability" = plot(x, r$mix, r$beta) )
num.npmle = num.npmle + r$num.npmle
if( r$ll <= old.r$ll + tol ) {convergence = 0; break}
g = r$grad - old.r$grad
d = r$beta - old.r$beta
dg = sum(d * g)
if( dg < 0 ) {
nd = length(d)
dod = d * rep(d, rep(nd, nd))
dim(dod) = c(nd,nd)
dog = d * rep(g, rep(nd, nd))
dim(dog) = c(nd,nd)
dogD = dog %*% D
D = D + (1 + drop(t(g) %*% D %*% g) / dg) * dod / dg -
(dogD + t(dogD)) / dg
}
if(verbose > 0) print.snpmle(verbose, r$ll[1], r$mix, r$beta, r$max.grad, i)
}
structure(list(family=class(x)[1], num.iterations=i, grad=r$grad,
max.gradient=r$max.gradient, # num.cnm.calls=num.npmle,
convergence=convergence, ll=r$ll, mix=r$mix, beta=r$beta),
class="nspmix")
}
# Updates mix using one iteration of CNM and then updates beta using BFGS.
# It is almost the standard cyclic method, except only one iteration of CNM
# is used.
cnmap = function(x, init=NULL, maxit=1000, tol=1e-6, grid=100,
plot=c("null", "gradient"), verbose=0) {
if(is.null(class(x))) stop("Data belongs to no class")
plot = match.arg(plot)
k = length(x)
init = initial.snpmle(x, init)
beta = init$beta
if(is.null(beta) || any(is.na(beta)))
stop("No initial value for beta. Use cnm() for NPMLE problems.")
nb = length(beta)
mix = init$mix
attr(mix, "ll") = init$ll
convergence = 1
for(i in 1:maxit) {
mix1 = mix
switch(plot, "gradient" = plotgrad(x, mix, beta) )
gp = gridpoints(x, beta, grid)
g = maxgrad(x, beta, mix, grid=gp, tol=-Inf)
gmax = g$gmax
if(verbose > 0)
print.snpmle(verbose, attr(mix, "ll")[1], mix, beta, gmax, i-1)
mix = disc(c(mix$pt,g$pt), c(mix$pr,rep(0,length(g$pt))))
lpt = logd(x, beta, mix$pt, which=c(1,0,0))$ld
## ll1 = attr(mix1, "ll")
# logd.mix1 = log(attr(ll1,"dmix")) + attr(ll1,"ma")
## S = pmin(exp(lpt - logd.mix1), 1e100)
S = pmin(exp(lpt - attr(attr(mix1,"ll"),"logd")), 1e100)
wr = sqrt(weight(x, beta))
r = pnnls(wr * S, wr * 2, sum=1)
sol = r$x / sum(r$x)
r = lsch(mix, beta, disc(mix$pt,sol), beta, x, which=c(1,0,0))
mix = collapse.snpmle(r$mix, beta, x, tol=0)
r = bfgs(mix, beta, x, which=c(0,0,1))
beta = r$beta
mix = collapse.snpmle(r$mix, beta, x, tol=0)
if( attr(mix, "ll")[1] <= attr(mix1, "ll")[1] + tol ) {convergence = 0; break}
}
mix = collapse.snpmle(mix, beta, x,
tol=max(tol*.1, abs(attr(mix, "ll")[1])*1e-16))
m = length(mix$pt)
d = dll.snpmle(x, mix, beta, which=c(0,1,1,1))
grad = c(d$dp, d$dt, d$db)
names(grad) = c(paste0("pr",1:m), paste0("pt",1:m),
if(is.null(d$db)) NULL else paste0("beta",1:length(beta)) )
grad[1:m] = grad[1:m] - sum(rep(weight(x, beta), len=length(x)))
ll = attr(mix,"ll")[1]
attr(mix,"ll") = NULL
structure(list(family=class(x)[1], num.iterations=i, grad=grad,
max.gradient=gmax, convergence=convergence,
ll=ll, mix=r$mix, beta=r$beta),
class="nspmix")
}
# The BFGS quasi-Newton method for updating pi, theta and beta
# The default negative identity matrix for D may be badly scaled for a
# given problem, which can result in failure to converge. In such a case,
# it is better to use cnmpl().
# beta a real- or vector-valued parameter that has no constraints
# which which of pi, theta and beta are to be updated
bfgs = function(mix, beta, x, tol=1e-15, maxit=1000, which=c(1,1,1), D=NULL) {
k1 = if(which[1]) length(mix$pr) else 0
k2 = if(which[2]) length(mix$pt) else 0
k3 = if(which[3]) length(beta) else 0
if( sum(which) == 0 ) stop("No parameter specified for updating in bfgs()")
dl = dll.snpmle(x, mix, beta, which=c(1,which), ind=TRUE)
w = weight(x, beta)
ll = llex(x, beta, mix) + sum(w * dl$ll)
grad = c(if(which[1]) colSums(w * dl$dp) else NULL,
if(which[2]) colSums(w * dl$dt) else NULL,
if(which[3]) llexdb(x,beta,mix) + colSums(w * dl$db) else NULL)
r = list(mix=mix, beta=beta, ll=ll, grad=grad, convergence=1)
par = c(if(which[1]) r$mix$pr else NULL,
if(which[2]) r$mix$pt else NULL,
if(which[3]) r$beta else NULL)
# if(is.null(D)) D = diag(c(rep(-1,k1+k2),rep(-1,k3)))
if(is.null(D)) D = diag(-1, nrow=k1+k2+k3)
else if(nrow(D) != k1+k2+k3) stop("D provided has incompatible dimensions")
# D = - solve(crossprod(sqrt(w) * cbind(dl$dp, dl$dt, dl$db))) # Gauss-Newton
# D = - diag(1/colSums(w * cbind(dl$dp, dl$dt, dl$db)^2), nrow=k1+k2+k3)
# Note: it may also depend on llexdb
bs = suppspace(x, beta)
for(i in 1:maxit) {
old.r = r
old.par = par
d1 = - drop(D %*% r$grad)
if(which[1]) { # correction for unity restriction (Wu, 1978)
D1 = D[,1:k1,drop=FALSE]
D11 = D1[1:k1,,drop=FALSE]
lambda = - sum(r$grad * rowSums(D1)) / sum(D11)
d1 = d1 - lambda * rowSums(D1)
}
par2 = par + d1
if(which[2]) {
theta1 = par[k1+1:k2]
theta2 = par2[k1+1:k2]
dtheta = d1[k1+1:k2]
if(any(theta2 < bs[1], theta2 > bs[2])) { # New support points outside
out = pmax(c(bs[1]-theta2, theta2-bs[2]), 0)
j = out > 0 & (theta1 == bs[1] | theta1 == bs[2])
if(any(j)) { # When old ones on boundary
jj = (which(j)-1) %% k2 + 1
D[k1+jj,] = D[,k1+jj] = 0 # Remove new ones outside
d1 = - drop(D %*% r$grad) # Re-calculate
if(which[1]) {
D1 = D[,1:k1,drop=FALSE]
D11 = D1[1:k1,,drop=FALSE]
lambda = - sum(r$grad * rowSums(D1)) / sum(D11)
d1 = d1 - lambda * rowSums(D1)
}
par2 = par + d1
}
else { # Backtrack to boundary when all old ones inside
ratio = out / abs(dtheta)
ratio[dtheta==0] = 0
jmax = which.max(ratio)
alpha = 1 - ratio[jmax]
d1 = alpha * d1
par2 = par + d1
if(jmax <= k2) par2[k1 + jmax] = bs[1]
else par2[k1 + jmax - k2] = bs[2]
}
}
}
if(which[1]) {
if(any(par2[1:k1] < 0)) {
step = d1[1:k1]
ratio = pmax(-par2[1:k1], 0) / abs(step)
jmax = which.max(ratio)
alpha = 1 - ratio[jmax]
d1 = alpha * d1
par2 = par + d1
par2[jmax] = 0
}
}
mix2 = disc(if(which[2]) par2[k1+1:k2] else r$mix$pt,
if(which[1]) par2[1:k1] else r$mix$pr)
beta2 = if(which[3]) par2[k1+k2+1:k3] else r$beta
r = lsch(r$mix, r$beta, mix2, beta2, x, which=which, brkt=TRUE)
if( any(r$mix$pr == 0) ) {
j = r$mix$pr == 0
r$mix = disc(r$mix$pt[!j], r$mix$pr[!j])
j2 = which(j)
if(which[2]) j2 = c(j2,k1+j2)
D = D[-j2,-j2]
return( bfgs(r$mix, r$beta, x, tol, maxit, which, D=D) )
}
if( r$conv != 0 ) {
# cat("lsch() failed in bfgs(): convergence =", r$conv, "\n");
break}
# if(r$ll >= old.r$ll - tol * abs(r$ll) && r$ll <= old.r$ll + tol * abs(r$ll))
if(r$ll >= old.r$ll - tol && r$ll <= old.r$ll + tol)
{convergence = 0; break}
g = r$grad - old.r$grad
par = c(if(which[1]) r$mix$pr else NULL,
if(which[2]) r$mix$pt else NULL,
if(which[3]) r$beta else NULL)
d = par - old.par
dg = sum(d * g)
if( dg < 0 ) {
nd = length(d)
dod = d * rep(d, rep(nd, nd))
dim(dod) = c(nd,nd)
dog = d * rep(g, rep(nd, nd))
dim(dog) = c(nd,nd)
dogD = dog %*% D
D = D + (1 + drop(t(g) %*% D %*% g) / dg) * dod / dg -
(dogD + t(dogD)) / dg
}
}
r$num.iterations = i
r
}
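# The inverse-Hessian update inside bfgs() (and cnmpl()) is the standard
# BFGS formula
#   D <- D + (1 + g'Dg / d'g) * dd' / d'g - (dg'D + Dgd') / d'g,
# applied only when d'g < 0, which keeps D negative definite for this
# maximization. A standalone sketch of one update step on the toy objective
# -sum(b^2) (illustrative only, base R):
#   f.grad = function(b) -2 * b                  # gradient of -sum(b^2)
#   b0 = c(1, 1); D = diag(-1, 2)                # start from -identity
#   b1 = b0 - drop(D %*% f.grad(b0))             # quasi-Newton step
#   d = b1 - b0; g = f.grad(b1) - f.grad(b0); dg = sum(d * g)
#   D = D + (1 + drop(t(g) %*% D %*% g) / dg) * tcrossprod(d) / dg -
#       (tcrossprod(d, g) %*% D + D %*% tcrossprod(g, d)) / dg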
# Line search for beta and mix
# If brkt is TRUE, (mix1, beta1) must be an interior point
# Two conditions must be satisfied at termination: (a) the Armijo rule;
# (b) the slope (curvature) condition when bracketing is used.
# mix1 Current disc object
# beta1 Current structural parameters
# mix2 Next disc object, of the same size as mix1
# beta2 Next structural parameters
# x Data
# maxit Maximum number of iterations
# which Indicators for pi, theta and beta, specifying for which of them the
# first derivatives of the log-likelihood should be computed and examined
# brkt Bracketing first?
# Output:
# mix Estimated mixing distribution
# beta Estimated beta
# ll Log-likelihood value at the estimate
# convergence = 0, converged successfully; = 1, failed
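# In the code below, with phi(alpha) denoting the log-likelihood along the
# segment from (mix1, beta1) to (mix2, beta2), termination requires the
# Armijo condition phi(alpha) >= phi(0) + 0.33 * alpha * phi'(0) and, when
# bracketing, the slope condition |phi'(alpha)| <= 0.5 * phi'(0).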
lsch = function(mix1, beta1, mix2, beta2, x,
maxit=100, which=c(1,1,1), brkt=FALSE ) {
k = length(mix1$pt)
je = mix1$pt == mix2$pt # support points identical in mix1 and mix2
convergence = 1
dl1 = dll.snpmle(x, mix1, beta1, which=c(1,which))
lla = ll1 = dl1$ll
names.grad = c(if(which[1]) paste0("pr",1:k) else NULL,
if(which[2]) paste0("pt",1:k) else NULL,
if(which[3]) paste0("beta",1:length(beta1)) else NULL)
grad1 = c(if(which[1]) dl1$dp else NULL,
if(which[2]) dl1$dt else NULL,
if(which[3]) dl1$db else NULL)
names(grad1) = names.grad
d1 = c(if(which[1]) mix2$pr - mix1$pr else NULL,
if(which[2]) mix2$pt - mix1$pt else NULL,
if(which[3]) beta2 - beta1 else NULL)
d1.norm = sqrt(sum(d1*d1))
s = d1 / d1.norm
g1d1 = sum(grad1 * d1)
dla = g1s = g1d1 / d1.norm
if(d1.norm == 0 || g1s <= 0) {
# cat("Warning in lsch(): d1.norm =", d1.norm, " g1s =", g1s, "\n")
return( list(mix=mix1, beta=beta1, grad=grad1, ll=ll1, convergence=3) )
}
a = 0
b = 1
if(which[1] && any(mix2$pr == 0)) brkt = FALSE
for(i in 1:maxit) {
for(j in 1:1000) {
m = disc( (1-b) * mix1$pt + b * mix2$pt, (1-b) * mix1$pr + b * mix2$pr)
m$pt[je] = mix1$pt[je]
# beta = if(is.null(beta1)) NULL else (1-b) * beta1 + b * beta2
beta = if(which[3]) (1-b) * beta1 + b * beta2 else beta1
if( valid.snpmle(x, beta, m) ) break
brkt = FALSE
b = 0.5 * a + 0.5 * b
}
if(j == 1000) stop("Cannot produce a valid interior point in lsch()")
dl = dll.snpmle(x, m, beta, which=c(1,which))
ll = dl$ll
grad = c(if(which[1]) dl$dp else NULL,
if(which[2]) dl$dt else NULL,
if(which[3]) dl$db else NULL)
gs = sum(grad * s)
# With the following condition, "ll(a)" stays above the Armijo line and
# "ll(b)" either fails to meet the gradient condition or falls below
# the Armijo line. Therefore, bracketing must have succeeded
if( brkt && gs > g1s * .5 && ll >= ll1 + g1d1 * b * .33 )
{a = b; b = 2 * b; lla = ll; dla = gs}
else break
}
if(i == maxit) brkt = FALSE
alpha = b; llb = ll; dlb = gs
for(i in 1:maxit) {
g1d = g1d1 * alpha
if(is.na(ll)) ll = -Inf
if( ll >= ll1 - 1e-15 * abs(ll1) && g1d <= 1e-15 * abs(ll)) # Flat land
{ # cat("Warning in lsch(): flat land reached.\n");
convergence=2; break}
if( brkt ) { # Armijo rule + slope test
if( ll >= ll1 + g1d * .33 && abs(gs) <= g1s * .5)
{convergence=0; break}
if( ll >= ll1 + g1d * .33 && gs > g1s * .5 )
{ a = alpha; lla = ll; dla = gs }
else { b = alpha; llb = ll; dlb = gs }
}
else { # Armijo rule
if( ll >= ll1 + g1d * .33 ) {convergence=0; break}
else { b = alpha; llb = ll; dlb = gs }
}
alpha = (a + b) * .5
m = disc( (1-alpha) * mix1$pt + alpha * mix2$pt,
(1-alpha) * mix1$pr + alpha * mix2$pr)
m$pt[je] = mix1$pt[je]
beta = if(which[3]) (1-alpha) * beta1 + alpha * beta2 else beta1
# beta = if(is.null(beta1)) NULL else (1-alpha) * beta1 + alpha * beta2
dl = dll.snpmle(x, m, beta, which=c(1,which))
ll = dl$ll
grad = c(if(which[1]) dl$dp else NULL,
if(which[2]) dl$dt else NULL,
if(which[3]) dl$db else NULL)
gs = sum(grad * s)
}
# if(i == 100) { print("i = 100 in lsch()"); stop() } # should never happen
names(grad) = names.grad
beta = if(which[3]) (1-alpha) * beta1 + alpha * beta2 else beta1
mix = disc((1-alpha)*mix1$pt+alpha*mix2$pt, (1-alpha)*mix1$pr+alpha*mix2$pr)
mix$pt[je] = mix1$pt[je]
list(mix=mix, beta=beta, grad=grad,
ll=ll, convergence=convergence, num.iterations=i)
}
# The Wolfe-Powell conditions are satisfied after the line search.
# An exact line search is perhaps unnecessary, since BFGS has superlinear
# convergence.
# Input:
#
# r A list containing "mix" and "beta", for the current estimates
# of the mixing distribution and beta, respectively
# beta2 Tentative new beta
# x Data
# tol Tolerance level
# ... Arguments passed to pll
# Output:
#
# outputs from function pll, plus
# convergence 0, converged successfully;
# 1, failed (likely due to precision limit reached)
# num.npmle number of NPMLEs computed
lsch.pll = function(r, beta2, x, tol=1e-15, maxit=100, tol.npmle=tol*1e-4,
brkt=FALSE, ...) {
convergence = 0
num.npmle = 0
lla = ll1 = r$ll
d1 = beta2 - r$beta
d1.norm = sqrt(sum(d1*d1))
s = d1 / d1.norm
dla = g1s = sum(r$grad * s)
g1d1 = sum(r$grad * d1)
# if(d1.norm == 0 || g1s <= 0) return(r)
a = 0
alpha = 1
for(i in 1:maxit) {
repeat {
beta = (1-alpha) * r$beta + alpha * beta2
if( valid.snpmle(x, beta, r$mix) ) break
alpha = a + 0.9 * (alpha - a)
brkt = FALSE
}
r.alpha = pll(beta, r$mix, x, tol=tol.npmle, ...)
num.npmle = num.npmle + 1
d = beta - r$beta
gs = sum(r.alpha$grad * s)
if( brkt && gs > g1s * .5 && r.alpha$ll >= ll1 + g1d1 * alpha * .33)
{a = alpha; alpha = 2 * alpha; lla = r.alpha$ll; dla = gs}
else break
# if( sum(r.alpha$grad * r.alpha$grad) < 1e-12 ) {brkt=FALSE; break}
}
if(i == maxit) brkt = FALSE
b = alpha; llb = r.alpha$ll; dlb = gs
for(i in 1:maxit) {
if(b - a <= 1e-10) {convergence=1; break}
# if(r.alpha$ll >= r$ll - tol * abs(r.alpha$ll) &&
# r.alpha$ll <= r$ll + tol * abs(r.alpha$ll)) break
if(r.alpha$ll >= r$ll - tol && r.alpha$ll <= r$ll + tol) break
g1d = g1d1 * alpha
if( brkt ) {
if( r.alpha$ll >= r$ll + g1d * .33 & abs(gs) <= g1s * .5 ) break
if( r.alpha$ll >= r$ll + g1d * .33 & gs > g1s * .5 )
{a = alpha; lla = r.alpha$ll; dla = gs}
else {b = alpha; llb = r.alpha$ll; dlb = gs}
}
else {
if( r.alpha$ll >= r$ll + g1d * .33 ) {convergence=0; break}
else {b = alpha; llb = r.alpha$ll; dlb = gs}
}
alpha = (a + b) * .5
beta = (1-alpha) * r$beta + alpha * beta2
r.alpha = pll(beta, r$mix, x, tol=tol.npmle, ...)
num.npmle = num.npmle + 1
d = beta - r$beta
gs = sum(r.alpha$grad * s)
}
r.alpha$convergence = convergence
r.alpha$num.npmle = num.npmle
r.alpha
}
logLik.snpmle = function(x, beta, mix) {
ld = logd(x, beta, mix$pt, which=c(1,0,0))$ld
ma = matMaxs(ld)
dmix = drop(exp(ld - ma) %*% mix$pr) + 1e-100
logd = log(dmix) + ma
ll = llex(x, beta, mix) + sum( weight(x, beta) * logd )
attr(ll, "dmix") = dmix
attr(ll, "logd") = logd # log(mixture density)
ll
}
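# logLik.snpmle() guards against numerical underflow with the usual
# log-sum-exp shift, subtracting the row maxima 'ma' before exponentiating.
# A tiny base-R illustration of why the shift matters:
#   ld = c(-800, -805)            # log densities that underflow directly
#   log(sum(exp(ld)))             # -Inf, since exp(-800) underflows to 0
#   ma = max(ld)
#   ma + log(sum(exp(ld - ma)))   # about -799.9933, computed stably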
density.snpmle = function(x, beta, mix) {
rowSums(exp(logd(x, beta, mix$pt, which=c(1,0,0))$ld +
rep(log(mix$pr), rep(length(x), length(mix$pr)))))
}
# Profile log-likelihood
pll = function(beta, mix0, x, tol=1e-15, grid=100, ...) {
r = cnm(x, init=list(beta=beta, mix=mix0), tol=tol, grid=grid, maxit=50, ...)
mix = r$mix
dl = logd(x, beta, mix$pt, which=c(1,1,0))
lpt = dl$ld
ma = matMaxs(lpt)
dmix = drop(exp(lpt - ma) %*% mix$pr) + 1e-100
D = pmin(exp(lpt - ma), 1e100)
p = weight(x, beta) * D/dmix * rep(mix$pr, rep(nrow(D), length(mix$pr)))
db = apply(sweep(dl$db, c(1,2), p, "*"), c(1,3), sum)
g = llexdb(x,beta,mix) + colSums(db)
list(ll=r$ll, beta=r$beta, mix=mix, grad=g, max.gradient=r$max.gradient,
num.iterations=r$num.iter)
}
## Compute all local maxima of the gradient function using a hybrid
## secant-bisection method. No need for the second derivative of the
## gradient function.
## O(n*m) + #iteration * O(n*mhat)
## n - sample size
## m - grid points
## mhat - number of local maxima
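## Each bracket [left, right] keeps d'(left) > 0 and d'(right) < 0; a
## secant-type step pt - d1/d2 is attempted, with bisection as the fallback
## whenever the step is undefined or leaves the bracket.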
maxgrad = function(x, beta, mix, grid=100, tol=-Inf, maxit=100) {
if( length(grid) == 1 ) grid = gridpoints(x, beta, grid)
dg = grad(grid, x, beta, mix, order=1)$d1
j = is.na(dg)
grid = grid[!j]
tol.pt = 1e-14 * diff(range(grid))
dg = dg[!j]
np = length(grid)
jmax = which(dg[-np] > 0 & dg[-1] < 0)
if( length(jmax) < 1 )
return( list(pt=NULL, grad=NULL, num.iterations=0, gmax=NULL) )
left = grid[jmax]
right = grid[jmax+1]
pt = (left + right) * .5
if(length(pt) != 0) {
pt.old = left
d1.old = dg[jmax]
d2 = rep(-1, length(pt))
for(i in 1:maxit) {
d1 = grad(pt, x, beta, mix, order=1)$d1
d2t = (d1 - d1.old) / (pt - pt.old)
jd = !is.na(d2t) & d2t < 0
d2[jd] = d2t[jd]
left[d1>0] = pt[d1>0]
right[d1<0] = pt[d1<0]
pt.old = pt
d1.old = d1
pt = pt - d1 / d2
j = is.na(pt) | pt < left | pt > right # those out of brackets
pt[j] = (left[j] + right[j]) * .5
if( max(abs(pt - pt.old)) <= tol.pt) break
}
}
else i = 0
if(dg[np] >= 0) pt = c(pt, grid[np])
if(dg[1] <= 0) pt = c(grid[1], pt)
if(length(pt) == 0) stop("no new support point found") # should never happen
g = grad(pt, x, beta, mix, order=0)$d0
gmax = max(g)
names(pt) = names(g) = NULL
j = g >= tol
list(pt=pt[j], grad=g[j], num.iterations=i, gmax=gmax)
}
# gradient function, with O(n*m)
grad = function(pt, x, beta, mix, order=0) {
w = weight(x, beta)
if(length(w) != length(x)) w = rep(w, length=length(x))
if(is.null(attr(mix, "ll")) || is.null(attr(attr(mix, "ll"), "dmix")))
attr(mix, "ll") = logLik.snpmle(x, beta, mix)
logd.mix = attr(attr(mix, "ll"), "logd")
g = vector("list", length(order))
names(g) = paste0("d", order)
which = c(1,0,0)
if(any(order >= 1)) which[3:max(order+2)] = 1
dl = logd(x, beta, pt, which=which)
ws = pmin(exp(dl$ld + (log(w) - logd.mix)), 1e100)
if(0 %in% order) g$d0 = colSums(ws) - sum(w)
if(1 %in% order) g$d1 = colSums(ws * dl$dt)
g
}
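# For order 0, grad() returns the gradient function of the log-likelihood
# at the mixing distribution G,
#   d(theta; G) = sum_i w_i * (f(x_i; theta) / f(x_i; G) - 1),
# which is <= 0 for all theta exactly when G is the NPMLE. An illustrative
# check with the package's nppois family (which ignores beta):
#   x = rnppois(200, disc(c(1, 4), c(0.7, 0.3)))
#   r = cnm(x)
#   grad(c(1, 2, 4), x, NULL, r$mix)$d0   # all approximately <= 0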
##'Plot the Gradient Function
##'
##'
##'Function \code{plotgrad} plots the gradient function, or its first
##'derivative, of a nonparametric mixture.
##'
##'
##'\code{x} must belong to a mixture family, as specified by its class.
##'
##'The support points are shown on the horizontal line of gradient 0. The
##'vertical lines going downwards at the support points are proportional to the
##'mixing proportions at these points.
##'
##'@param x a data object of a mixture model class.
##'@param mix an object of class 'disc', for a discrete mixing distribution.
##'@param beta the structural parameter.
##'@param len number of points used to plot the smooth curve.
##'@param order the order of the derivative of the gradient function to be
##'plotted. If 0, it is the gradient function itself.
##'@param col color for the curve.
##'@param col2 color for the support points.
##'@param add if \code{FALSE}, create a new plot; if \code{TRUE}, add the curve
##'and points to the current one.
##'@param main,xlab,ylab,cex,pch,lwd,xlim,ylim arguments for graphical
##'parameters (see \code{par}).
##'@param ... arguments passed on to function \code{plot}.
##'@author Yong Wang <yongwang@@auckland.ac.nz>
##'@seealso \code{\link{plot.nspmix}}, \code{\link{nnls}}, \code{\link{cnm}},
##'\code{\link{cnmms}}, \code{\link{npnorm}}, \code{\link{nppois}}.
##'@references
##'
##'Wang, Y. (2007). On fast computation of the non-parametric maximum
##'likelihood estimate of a mixing distribution. \emph{Journal of the Royal
##'Statistical Society, Ser. B}, \bold{69}, 185-198.
##'
##'Wang, Y. (2010). Maximum likelihood computation for fitting semiparametric
##'mixture models. \emph{Statistics and Computing}, \bold{20}, 75-86.
##'@keywords function
##'@examples
##'
##'## Poisson mixture
##'x = rnppois(200, disc(c(1,4), c(0.7,0.3)))
##'r = cnm(x)
##'plotgrad(x, r$mix)
##'
##'## Normal mixture
##'x = rnpnorm(200, disc(c(0,4), c(0.3,0.7)), sd=1)
##'r = cnm(x, init=list(beta=0.5)) # sd = 0.5
##'plotgrad(x, r$mix, r$beta)
##'
##'
##'@export plotgrad
plotgrad = function(x, mix, beta, len=500, order=0, col="blue",
col2="red", add=FALSE,
main=paste0("Class: ",class(x)),
xlab=expression(theta),
ylab=paste0("Gradient (order = ",order,")"), cex=1,
pch=1, lwd=1, xlim, ylim, ...) {
if(order > 1) stop("order > 1")
if( missing(xlim) ) xlim = range(gridpoints(x, beta, grid=5))
pt = seq(xlim[1], xlim[2], len=len)
if(class(mix) == "disc") pt = sort(c(mix$pt, pt))
g = grad(pt, x, beta, mix, order=order)[[1]]
if(missing(ylim)) ylim = range(0,g,na.rm=TRUE)
if(!add) plot(xlim, c(0,0), type="l", col="black", ylim=ylim, main=main,
xlab=xlab, ylab=ylab, cex = cex, cex.axis = cex, cex.lab = cex, ...)
lines(pt, g, col=col, lwd=lwd)
if(class(mix) == "disc") {
j = mix$pr > 0
points(mix$pt[j], rep(0,length(mix$pt[j])), pch=pch, col=col2)
lines(mix$pt[j], mix$pr[j]*min(g), type="h", col=col2)
}
invisible(list(pt=pt,g=g))
}
# The function examines whether the initial values are proper. If not,
# proper ones are provided by employing the user-supplied function "initial".
initial.snpmle = function(x, init=NULL, kmax=NULL) {
if(!is.null(kmax) && kmax == Inf) kmax = NULL
init = initial(x, init$beta, init$mix, kmax=kmax)
init$ll = logLik.snpmle(x, init$beta, init$mix)
# ll = logLik.snpmle(x, init0$beta, init$mix)
# if(ll > init0$ll) init = list(beta=init0$beta, mix=init$mix, ll=ll)
# else init = init0
if(! valid.snpmle(x, init$beta, init$mix)) stop("Invalid initial values!")
init
}
# Compute the first derivatives of the log-likelihood function of pi,
# theta and beta
# INPUT:
#
# which A 4-vector of 0s and 1s, indicating which quantities are to be
# computed, in the order of ll, pi, theta and beta
# ind = TRUE, return the derivatives with respect to individual
# observations (to be combined with weights by the caller)
# = FALSE, return the weighted sums of the derivatives
#
# OUTPUT: a list with ll, dp, dt, db, as specified by which
dll.snpmle = function(x, mix, beta, which=c(1,0,0,0), ind=FALSE) {
w = weight(x, beta)
r = list()
dl = logd(x, beta, mix$pt, which=c(1,which[4:3]))
lpt = dl$ld
ma = matMaxs(lpt)
if(which[1] == 1) {
r$ll = log(drop(exp(lpt - ma) %*% mix$pr)) + ma
if(!ind) r$ll = llex(x, beta, mix) + sum(w * r$ll)
}
if(sum(which[2:4]) == 0) return(r)
dmix = drop(exp(lpt - ma) %*% mix$pr) + 1e-100
S = pmin(exp(lpt - (ma + log(dmix))), 1e100)
# dp = D / dmix
if(which[2] == 1) {
if(ind) r$dp = S
else r$dp = colSums(w * S)
}
if(sum(which[3:4]) == 0) return(r)
p = S * rep(mix$pr, rep(nrow(S), length(mix$pr)))
if(which[3] == 1) {
r$dt = p * dl$dt
if(!ind) r$dt = colSums(w * r$dt)
}
if(which[4] == 0) return(r)
dl1 = dl$db
if(is.null(dl1)) r$db = NULL
else {
r$db = apply(sweep(dl1, c(1,2), p, "*"), c(1,3), sum)
if(!ind) r$db = llexdb(x, beta, mix) + colSums(w * r$db)
}
r
}
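# Illustrative uses of the 'which' convention:
#   dll.snpmle(x, mix, beta, which=c(1,0,0,0))   # log-likelihood only
#   dll.snpmle(x, mix, beta, which=c(1,1,1,1))   # ll, dp, dt and db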
valid.snpmle = function(x, beta, mix) {
bs = suppspace(x, beta)
bs = bs + 1e-14 * c(-1,1) * abs(bs) # tolerance
valid(x, beta, mix) && all(mix$pr >= 0, mix$pt >= bs[1], mix$pt <= bs[2])
}
collapse.snpmle = function(mix, beta, x, tol=1e-15) {
j = mix$pr == 0
if(any(j)) {
mix = disc(mix$pt[!j], mix$pr[!j])
attr(mix, "ll") = logLik.snpmle(x, beta, mix)
}
else { if( is.null(attr(mix, "ll")) || is.null(attr(attr(mix, "ll"), "dmix")) )
attr(mix, "ll") = logLik.snpmle(x, beta, mix) }
## if( any(mix$pr < tol.p) ) {
## j = mix$pr >= tol.p
## mixt = mix
## mixt$pt = mixt$pt[j]
## mixt$pr = mixt$pr[j] / sum(mix$pr[j])
## attr(mixt, "ll") = logLik.snpmle(x, beta, mixt)
## if(attr(mixt,"ll")[1] >= attr(mix,"ll")[1] - tol) mix = mixt
## }
## p = attr(attr(mix, "ll"), "p")
## pj = colSums(weight(x, beta) * p)
## if( any(pj == 0) ) {
## j = pj != 0
## mix$pt = mix$pt[j]
## mix$pr = mix$pr[j] / sum(mix$pr[j])
## attr(mix, "ll") = logLik.snpmle(x, beta, mix)
## }
if(tol > 0) {
repeat {
if(length(mix$pt) <= 1) break
prec = 10 * min(diff(mix$pt)) ## one digit at a time
## if(prec > diff(range(gridpoints(x, beta, grid=2))) * .01) break
mixt = unique(mix, prec=c(prec,0))
attr(mixt, "ll") = logLik.snpmle(x, beta, mixt)
# if(attr(mixt,"ll")[1] >= attr(mix,"ll")[1] - tol * abs(attr(mix,"ll")[1]))
if(attr(mixt,"ll")[1] >= attr(mix,"ll")[1] - tol) mix = mixt
else break
}
}
mix
}
print.snpmle = function(verbose, ll, mix, beta, maxdg, i) {
if(verbose > 0)
cat("## Iteration ", i, ":\n Log-likelihood = ", as.character(ll), "\n", sep="")
if(verbose > 1) cat(" Max gradient =", maxdg, "\n")
if(verbose > 2) cat(" theta =", signif(mix$pt,6), "\n")
if(verbose > 3) cat(" Proportions =", signif(mix$pr,6), "\n")
if(verbose > 4)
cat(" beta =", if(is.null(beta)) "NULL" else signif(beta,6), "\n")
}
plot.nspmix = function(x, data, type=c("probability","gradient"), ...) {
type = match.arg(type)
if(missing(data)) data = NULL
else {
if(is.null(class(data))) stop("Data belongs to no class")
if(class(data)[1] != x$family)
stop(paste0("Class of data does not match the mixture family: ",
class(data)[1], " != ", x$family))
}
plotf = getFunction(paste0("plot.",x$family))
switch(type,
"probability" = plotf(data, x$mix, x$beta, ...),
"gradient" = plotgrad(data, x$mix, x$beta, pch=1, ...) )
invisible(x)
}