R/cmp-reg.R

Defines functions parametric.bootstrap.cmpfit predict.cmpfit residuals.cmpfit deviance.cmpfit leverage.cmpfit equitest.cmpfit vcov.cmpfit sdev.cmpfit nu.cmpfit coef.cmpfit BIC.cmpfit AIC.cmpfit logLik.cmpfit print.cmpfit fitted_cmp_internal summary.cmpfit

Documented in AIC.cmpfit BIC.cmpfit coef.cmpfit deviance.cmpfit equitest.cmpfit leverage.cmpfit logLik.cmpfit nu.cmpfit parametric.bootstrap.cmpfit predict.cmpfit print.cmpfit residuals.cmpfit sdev.cmpfit summary.cmpfit vcov.cmpfit

#' Supporting Functions for COM-Poisson Regression
#' 
#' @param object object of type \code{cmp}.
#' @param x object of type \code{cmp}.
#' @param k Penalty per parameter to be used in AIC calculation.
#' @param newdata New covariates to be used for prediction.
#' @param type Specifies quantity to be computed. See details.
#' @param reps Number of bootstrap repetitions.
#' @param report.period Report progress every \code{report.period} iterations.
#' @param ... other arguments, such as \code{subset} and \code{na.action}.
#' 
#' @details
#' The function \code{residuals} returns raw residuals when
#' \code{type = "raw"}  and quantile residuals when
#' \code{type = "quantile"}.
#' 
#' The function \code{predict} returns expected values of the outcomes,
#' eveluated at the computed estimates, when \code{type = "response"}. When
#' \code{type = "link"}, a \code{data.frame} is instead returned with
#' columns corresponding to estimates of \code{lambda} and \code{nu}.
#' 
#' The function \code{coef} returns a vector of coefficient estimates in
#' the form \code{c(beta, gamma)} when \code{type = "vector"}. When
#' \code{type = "list"}, the estimates are returned as a list with named
#' elements \code{beta} and \code{gamma}.
#' 
#' The \code{type} argument behaves the same for the \code{sdev} function
#' as it does for \code{coef}.
#' 
#' @name glm.cmp, CMP support
NULL

#' @name glm.cmp, CMP support
#' @export
summary.cmpfit = function(object, ...)
{
	n = nrow(object$X)
	d1 = ncol(object$X)
	d2 = ncol(object$S)
	qq = d1 + d2

	# We need the indices of the fixed coefficients and the ones included in
	# optimization.
	fixed = object$fixed
	unfixed = object$unfixed
	idx.par1 = seq_along(unfixed$beta)
	idx.par2 = seq_along(unfixed$gamma) + length(unfixed$beta)

	V = vcov(object)
	est = coef(object)

	# In the vector of SEs, include an NA entry if the variable was fixed
	# This NA will propagate to the corresponding z-value and p-value as well.
	se = rep(NA, qq)
	se[unfixed$beta] = sdev(object)[idx.par1]
	se[unfixed$gamma + d1] = sdev(object)[idx.par2]

	z.val = est / se
	p.val = 2*pnorm(-abs(z.val))

	DF = data.frame(
		Estimate = round(est, 4),
		SE = round(se, 4),
		z.value = round(z.val, 4),
		p.value = sprintf("%0.4g", p.val)
	)
	colnames(DF) = c("Estimate", "SE", "z-value", "p-value")
	rownames(DF) = c(
		sprintf("X:%s", colnames(object$X)),
		sprintf("S:%s", colnames(object$S))
	)

	# Add a column to indicate fixed components if anything is fixed
	if (length(unlist(fixed)) > 0) {
		fixed.beta = rep("F", d1)
		fixed.gamma = rep("F", d2)
		fixed.beta[fixed$beta] = "T"
		fixed.gamma[fixed$gamma] = "T"
		DF$Fixed = c(fixed.beta, fixed.gamma)
	}

	# If X, S, or W are intercept only, compute results for non-regression parameters
	DF.lambda = NULL
	DF.nu = NULL

	# In each block below, make sure to consider only the non-fixed variables
	# for the Jacobian and Hessian. If one of the intercepts was fixed, it should
	# result in an SE of zero.

	if (is.intercept.only(object$X) && is.zero.matrix(object$offset$x)) {
		if (length(fixed$beta) > 0) {
			se = 0
		} else {
			J = c(exp(object$beta), numeric(length(idx.par2)))
			se = sqrt(t(J) %*% V %*% J)
		}
		est = exp(object$beta)
		DF.lambda = data.frame(
			Estimate = round(est, 4),
			SE = round(se, 4)
		)
		rownames(DF.lambda) = "lambda"
	}

	if (is.intercept.only(object$S) && is.zero.matrix(object$offset$s)) {
		if (length(fixed$gamma) > 0) {
			se = 0
		} else {
			J = c(numeric(length(idx.par1)), exp(object$gamma))
			se = sqrt(t(J) %*% V %*% J)
		}
		est = exp(object$gamma)
		DF.nu = data.frame(
			Estimate = round(est, 4),
			SE = round(se, 4)
		)
		rownames(DF.nu) = "nu"
	}

	list(DF = DF, DF.lambda = DF.lambda, DF.nu = DF.nu,
		n = n,
		loglik = logLik(object),
		aic = AIC(object),
		bic = BIC(object),
		optim.method = object$control$optim.method,
		opt.message = object$opt.res$message,
		opt.convergence = object$opt.res$convergence,
		elapsed.sec = object$elapsed.sec
	)
}

fitted_cmp_internal = function(X, S, beta, gamma, off.x, off.s)
{
	n = nrow(X)
	stopifnot(n == nrow(S))
	stopifnot(n == length(off.x))
	stopifnot(n == length(off.s))
	stopifnot(ncol(X) == length(beta))
	stopifnot(ncol(S) == length(gamma))

	if (length(beta) > 0) {
		lambda = as.numeric(exp(X %*% beta + off.x))
	} else {
		lambda = rep(0, n)
	}

	if (length(gamma) > 0) {
		nu = as.numeric(exp(S %*% gamma + off.s))
	} else {
		nu = rep(0, n)
	}

	list(lambda = lambda, nu = nu)
}

#' @name glm.cmp, CMP support
#' @export
print.cmpfit = function(x, ...)
{
	printf("CMP coefficients\n")
	s = summary(x)
	print(s$DF)

	if (length(x$fixed$gamma) > 0) {
		tt = paste(collapse = " ", c(
			"Some elements of gamma were fixed.",
			"Chi-squared test for equidispersion not defined."))
	} else {
		tt = equitest(x)
	}

	if (!is.null(s$DF.lambda) || !is.null(s$DF.nu)) {
		printf("--\n")
		printf("Transformed intercept-only parameters\n")
		print(rbind(s$DF.lambda, s$DF.nu))
	}
	if (is.character(tt)) {
		printf("--\n")
		cat(paste(tt, collapse = "\n"))
		printf("\n")
	} else {
		printf("--\n")
		printf("Chi-squared test for equidispersion\n")
		printf("X^2 = %0.4f, df = %d, ", tt$teststat, tt$df)
		printf("p-value = %0.4e\n", tt$pvalue)
	}
	printf("--\n")
	printf("Elapsed: %s   ", format_elapsed(s$elapsed.sec))
	printf("Sample size: %d   ", s$n)
	printf("%s interface\n", x$interface)
	printf("LogLik: %0.4f   ", s$loglik)
	printf("AIC: %0.4f   ", s$aic)
	printf("BIC: %0.4f   ", s$bic)
	printf("\n")
	printf("Optimization Method: %s   ", s$optim.method)
	printf("Converged status: %d\n", s$opt.convergence)
	printf("Message: %s\n", s$opt.message)
}

#' @name glm.cmp, CMP support
#' @export
logLik.cmpfit = function(object, ...)
{
	object$loglik
}

#' @name glm.cmp, CMP support
#' @export
AIC.cmpfit= function(object, ..., k=2)
{
	d = length(unlist(object$unfixed))
	-2*object$loglik + 2*d
}

#' @name glm.cmp, CMP support
#' @export
BIC.cmpfit = function(object, ...)
{
	n = length(object$y)
	d = length(unlist(object$unfixed))
	-2*object$loglik + log(n)*d
}

#' @name glm.cmp, CMP support
#' @export
coef.cmpfit = function(object, type = c("vector", "list"), ...)
{
	switch(match.arg(type),
		vector = c(object$beta, object$gamma),
		list = list(beta = object$beta, gamma = object$gamma)
	)
}

#' @name glm.cmp, CMP support
#' @export
nu.cmpfit = function(object, ...)
{
	# This function is deprecated - use predict instead
	.Deprecated("predict(object, type = \"link\")")
	link = predict(object, type = "link")
	link$nu
}

#' @name glm.cmp, CMP support
#' @export
sdev.cmpfit = function(object, type = c("vector", "list"), ...)
{
	d1 = ncol(object$X)
	d2 = ncol(object$S)
	fixed = object$fixed
	unfixed = object$unfixed
	idx.par1 = seq_along(unfixed$beta)
	idx.par2 = seq_along(unfixed$gamma) + length(unfixed$beta)

	sd.hat = sqrt(diag(vcov(object)))

	if (match.arg(type) == "vector") {
		out = sd.hat
	} else if (match.arg(type) == "list") {
		sd.beta = rep(NA, d1)
		sd.gamma = rep(NA, d2)
		sd.beta[unfixed$beta] = sd.hat[idx.par1]
		sd.gamma[unfixed$gamma] = sd.hat[idx.par2]
		out = list(beta = sd.beta, gamma = sd.gamma)
	} else {
		stop("Unrecognized type")
	}

	return(out)
}

#' @name glm.cmp, CMP support
#' @export
vcov.cmpfit = function(object, ...)
{
	# Compute the covariance via Hessian from optimizer
	-solve(object$H)
}

#' @name glm.cmp, CMP support
#' @export
equitest.cmpfit = function(object, ...)
{
	if ("equitest" %in% names(object)) {
		return(object$equitest)
	}

	y = object$y
	X = object$X
	S = object$S
	init = object$init
	offset = object$offset
	fixed = object$fixed
	ll = object$loglik

	# If any elements of gamma have been fixed, an "equidispersion" test no
	# longer makes sense. Unless the values were fixed at zeroes. But let's
	# avoid this complication.
	if (length(fixed$gamma) > 0) {
		msg = c("Some elements of gamma were fixed,",
			"chi-squared test for equidispersion not defined")
		stop(paste(msg, collapse = " "))
	}

	n = length(y)
	d2 = ncol(S)

	# Null model is CMP with nu determined by the offset off.s. If off.s happens
	# to be zeros, this simplifies to a Poisson regression.
	fit0.out = fit.cmp.reg(y, X, S, offset = offset,
		init = get.init(beta = object$beta, gamma = numeric(d2), zeta = numeric(0)),
		fixed = get.fixed(beta = fixed$beta, gamma = seq_len(d2), zeta = fixed$zeta),
		control = object$control)
	ll0 = fit0.out$loglik

	teststat = -2 * (ll0 - ll)
	pvalue = pchisq(teststat, df = d2, lower.tail = FALSE)
	list(teststat = teststat, pvalue = pvalue, df = d2)
}

#' @name glm.cmp, CMP support
#' @export
leverage.cmpfit = function(object, ...)
{
	y = object$y
	n = length(y)

	out = fitted_cmp_internal(object$X, object$S, object$beta, object$gamma,
		object$offset$x, object$offset$s)

	# 1) Some quantities corresponding to parameters lambda and nu: Normalizing
	# constants, expected values, variances, and truncation values.
	z.hat = ncmp(out$lambda, out$nu, control = object$control)
	E.y = ecmp(out$lambda, out$nu, control = object$control)
	V.y = vcmp(out$lambda, out$nu, control = object$control)
	y.trunc = max(tcmp(out$lambda, out$nu, control = object$control))

	# Note that z_prodlogj uses truncation, even if we would approximate z 
	# using the asymptotic expression using some of the given lambda and nu
	# pairs. This might be something that could be improved later.

	#    and X matrix (in Appendix)
	E.logfacty = z_prodlogj(out$lambda, out$nu, max = y.trunc) / z.hat
	extravec = (E.logfacty - lgamma(y+1)) / (y - E.y)
	curlyX.mat = cbind(object$X, extravec)

	# 2) to compute H using equation (12)  on p. 11
	if (FALSE) {
		# Here is a more readable version, but it creates n x n matrices so may
		# have problems with large inputs.
		WW = diag(V.y)
		H1 = t(curlyX.mat) %*% sqrt(WW)
		H2 = solve(t(curlyX.mat) %*% WW %*% curlyX.mat)
		H = t(H1) %*% H2 %*% H1
		diag.H = diag(H)
	} else {
		# This version avoids creating large matrices, but is a bit harder to read.
		diag.H = numeric(n)
		H1 = t(curlyX.mat * sqrt(V.y))
		H2 = solve(t(curlyX.mat * V.y) %*% curlyX.mat)
		for (i in 1:n) {
			diag.H[i] = t(H1[,i]) %*% H2 %*% H1[,i]
		}
	}

	return(diag.H)
}

#' @name glm.cmp, CMP support
#' @export
deviance.cmpfit = function(object, ...)
{
	# Compute the COM-Poisson deviances exactly
	y = object$y
	X = object$X
	S = object$S
	# init = object$init
	fixed = object$fixed
	offset = object$offset
	control = object$control
	n = length(y)
	d2 = ncol(S)

	# Compute optimal log likelihood value for given nu-hat value
	# beta.init = object$beta
	# d1 = length(beta.init)
	ll.star = numeric(n)

	for (i in 1:n) {
		# Maximize loglik for ith obs
		glm.out = fit.cmp.reg(y[i],
			X = X[i,,drop = FALSE],
			S = S[i,,drop = FALSE],
			init = get.init(beta = object$beta, gamma = numeric(d2)),
			offset = get.offset(x = offset$x[i], s = offset$s[i], w = offset$w[i]),
			fixed = get.fixed(beta = fixed$beta, gamma = seq_len(d2), zeta = fixed$zeta),
			control = control)
		ll.star[i] = glm.out$opt.res$value
	}

	# Compute exact deviances
	ll = numeric(n)
	out = fitted_cmp_internal(X, S, object$beta, object$gamma, offset$x, offset$s)
	for (i in 1:n) {
		ll[i] = dcmp(y[i], out$lambda[i], out$nu[i], log = TRUE, control = control)
	}

	dev = -2*(ll - ll.star)
	lev = leverage(object)
	cmpdev = dev / sqrt(1 - lev)
	return(cmpdev)
}

#' @name glm.cmp, CMP support
#' @export
residuals.cmpfit = function(object, type = c("raw", "quantile"), ...)
{
	out = fitted_cmp_internal(object$X, object$S, object$beta, object$gamma,
		object$offset$x, object$offset$s)

	type = match.arg(type)
	if (type == "raw") {
		y.hat = ecmp(out$lambda, out$nu, control = object$control)
		res = object$y - y.hat
	} else if (type == "quantile") {
		res = rqres.cmp(object$y, lambda = out$lambda, nu = out$nu,
			control = object$control)
	} else {
		stop("Unsupported residual type")
	}

	return(as.numeric(res))
}

#' @name glm.cmp, CMP support
#' @export
predict.cmpfit = function(object, newdata = NULL, type = c("response", "link"), ...)
{
	if (is.null(newdata)) {
		# If newdata is NULL, reuse data from model fit
		X = object$X
		S = object$S
		off.x = object$offset$x
		off.s = object$offset$s
	} else if (object$interface == "formula") {
		# If the model was fit with the formula interface, attempt to process
		# newdata as a data.frame
		newdata = as.data.frame(newdata)

		# If the response was not included in newdata, add a column with zeros
		response.name = all.vars(object$formula.lambda)[1]
		if (is.null(newdata[[response.name]])) {
			newdata[[response.name]] = 0
		}

		raw = formula2raw(object$formula.lambda, object$formula.nu,
			object$formula.p, data = newdata, ...)
		X = raw$X
		S = raw$S
		off.x = raw$offset$x
		off.s = raw$offset$s
	} else if (object$interface == "raw") {
		# If the model was fit with the raw interface, attempt to process
		# newdata as a list
		stopifnot("COMPoissonReg.modelmatrix" %in% class(newdata))
		X = newdata$X
		S = newdata$S
		off.x = newdata$offset$x
		off.s = newdata$offset$s
	} else {
		stop("Don't recognize value of interface")
	}

	link = fitted_cmp_internal(X, S, object$beta, object$gamma, off.x, off.s)
	switch(match.arg(type),
		response = ecmp(link$lambda, link$nu, control = object$control),
		link = data.frame(lambda = link$lambda, nu = link$nu)
	)
}

#' @name glm.cmp, CMP support
#' @export
parametric.bootstrap.cmpfit = function(object, reps = 1000, report.period = reps+1, ...)
{
	n = nrow(object$X)
	d1 = ncol(object$X)
	d2 = ncol(object$S)
	qq = d1 + d2

	out = matrix(NA, nrow = reps, ncol = qq)
	colnames(out) = c(colnames(object$X), colnames(object$S), recursive=TRUE)

	fitted.out = fitted_cmp_internal(object$X, object$S, object$beta, object$gamma,
		object$offset$x, object$offset$s)
	lambda.hat = fitted.out$lambda
	nu.hat = fitted.out$nu

	init = get.init(beta = object$beta, gamma = object$gamma)

	for (r in 1:reps){
		if (r %% report.period == 0) {
			logger("Starting bootstrap rep %d\n", r)
		}

		# Generate bootstrap samples of the full dataset using MLE
		y.boot = rcmp(n, lambda.hat, nu.hat, control = object$control)

		# Take each of the bootstrap samples and fit model to generate bootstrap
		# estimates
		tryCatch({
			fit.boot = fit.cmp.reg(y = y.boot, X = object$X, S = object$S,
				init = init, offset = object$offset, fixed = object$fixed,
				control = object$control)
			out[r,] = unlist(fit.boot$theta.hat)
		}, error = function(e) {
			# Do nothing now; emit a warning later
		})
	}

	cnt = sum(rowSums(is.na(out)) > 0)
	if (cnt > 0) {
		warning(sprintf("%d out of %d bootstrap iterations failed", cnt, reps))
	}

	return(out)
}
lotze/COMPoissonReg documentation built on Feb. 11, 2024, 12:03 p.m.