#' Fitting frequentist STAR linear model via EM algorithm
#'
#' Compute the MLEs and log-likelihood for the STAR linear model.
#' The regression coefficients are estimated using least squares within
#' an EM algorithm.
#'
#' Standard function calls including
#' \code{\link{coefficients}}, \code{\link{fitted}}, and \code{\link{residuals}} apply. Fitted values are the expectation
#' at the MLEs, and as such are not necessarily count-valued.
#'
#' @param formula an object of class "\code{\link{formula}}" (see \code{\link{lm}} for details on model specification)
#' @param data an optional data frame, list or environment (or object coercible by as.data.frame to a data frame)
#' containing the variables in the model; like \code{\link{lm}}, if not found in data,
#' the variables are taken from \code{environment(formula)}
#' @param transformation transformation to use for the latent data; must be one of
#' \itemize{
#' \item "identity" (identity transformation)
#' \item "log" (log transformation)
#' \item "sqrt" (square root transformation)
#' \item "np" (nonparametric transformation estimated from empirical CDF)
#' \item "pois" (transformation for moment-matched marginal Poisson CDF)
#' \item "neg-bin" (transformation for moment-matched marginal Negative Binomial CDF)
#' \item "box-cox" (box-cox transformation with learned parameter)
#' }
#' @param y_max a fixed and known upper bound for all observations; default is \code{Inf}
#' @param sd_init add random noise for EM algorithm initialization scaled by \code{sd_init}
#' times the Gaussian MLE standard deviation; default is 10
#' @param tol tolerance for stopping the EM algorithm; default is 10^-10;
#' @param max_iters maximum number of EM iterations before stopping; default is 1000
#' @return an object of \code{class} "lmstar", which is a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the MLEs of the coefficients
#' \item \code{fitted.values} the fitted values at the MLEs
#' \item \code{g.hat} a function containing the (known or estimated) transformation
#' \item \code{ginv.hat} a function containing the inverse of the transformation
#' \item \code{sigma.hat} the MLE of the standard deviation
#' \item \code{mu.hat} the MLE of the conditional mean (on the transformed scale)
#' \item \code{z.hat} the estimated latent data (on the transformed scale) at the MLEs
#' \item \code{residuals} the Dunn-Smyth residuals (randomized)
#' \item \code{residuals_rep} the Dunn-Smyth residuals (randomized) for 10 replicates
#' \item \code{logLik} the log-likelihood at the MLEs
#' \item \code{logLik0} the log-likelihood at the MLEs for the *unrounded* initialization
#' \item \code{lambda} the Box-Cox nonlinear parameter
#' \item and other parameters that
#' (1) track the parameters across EM iterations and
#' (2) record the model specifications
#' }
#'
#' @note Infinite latent data values may occur when the transformed
#' Gaussian model is highly inadequate. In that case, the function returns
#' the *indices* of the data points with infinite latent values, which are
#' significant outliers under the model. Deletion of these indices and
#' re-running the model is one option, but care must be taken to ensure
#' that (i) it is appropriate to treat these observations as outliers and
#' (ii) the model is adequate for the remaining data points.
#'
#' @references
#' Kowal, D. R., & Wu, B. (2021).
#' Semiparametric count data regression for self-reported mental health.
#' \emph{Biometrics}. \doi{10.1111/biom.13617}
#'
#' @examples
#' # Simulate data with count-valued response y:
#' sim_dat = simulate_nb_lm(n = 100, p = 3)
#' y = sim_dat$y; X = sim_dat$X
#'
#' # Fit model
#' fit_em = lm_star(y~X)
#'
#' # Fitted coefficients:
#' coef(fit_em)
#' # Fitted values:
#' y_hat = fitted(fit_em)
#' plot(y_hat, y);
#'
#' # Residuals:
#' plot(residuals(fit_em))
#' qqnorm(residuals(fit_em)); qqline(residuals(fit_em))
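#'
#' # Log-likelihood at the MLEs (returned as an element of the fitted object):
#' fit_em$logLik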
#' @export
lm_star = function(formula, data=NULL, transformation = 'np',
y_max = Inf,
sd_init = 10,
tol = 10^-10,
max_iters = 1000){
#Get model matrix and response from formula and data
if(is.null(data)){
mf <- lm(formula, method = "model.frame")
} else{
mf <- lm(formula, data, method = "model.frame")
}
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
X <- model.matrix(mt, mf)
# Check: currently implemented for nonnegative integers only
if(any(y < 0) || any(y != floor(y)))
stop('Response must be nonnegative counts')
# Check: y_max must be a true upper bound
if(any(y > y_max))
stop('Response must not exceed y_max')
# Check: does the transformation make sense?
transformation = tolower(transformation);
if(!is.element(transformation, c("identity", "log", "sqrt", "np", "pois", "neg-bin", "box-cox")))
stop("The transformation must be one of 'identity', 'log', 'sqrt', 'np', 'pois', 'neg-bin', or 'box-cox'")
# Assign a family for the transformation: Box-Cox or CDF?
transform_family = ifelse(
test = is.element(transformation, c("identity", "log", "sqrt", "box-cox")),
yes = 'bc', no = 'cdf'
)
# Define estimator
estimator = function(w){
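# Refit the linear model with the current latent response w (replace the response column of the model frame)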
mf[[1]] <- w
lm(mf)
}
# Number of observations:
n = length(y)
# Define the transformation:
if(transform_family == 'bc'){
# Lambda value for each Box-Cox argument:
if(transformation == 'identity') lambda = 1
if(transformation == 'log') lambda = 0
if(transformation == 'sqrt') lambda = 1/2
if(transformation == 'box-cox') lambda = runif(n = 1) # random init on (0,1)
# Transformation function:
g = function(t) g_bc(t,lambda = lambda)
# Inverse transformation function:
g_inv = function(s) g_inv_bc(s,lambda = lambda)
# Sum of log-derivatives (for initial log-likelihood):
#g_deriv = function(t) t^(lambda - 1)
sum_log_deriv = (lambda - 1)*sum(log(y+1))
}
if(transform_family == 'cdf'){
# Transformation function:
g = g_cdf(y = y, distribution = transformation)
# Define the grid for approximations using equally-spaced + quantile points:
t_grid = sort(unique(round(c(
seq(0, min(2*max(y), y_max), length.out = 250),
quantile(unique(y[y < y_max] + 1), seq(0, 1, length.out = 250))), 8)))
# Inverse transformation function:
g_inv = g_inv_approx(g = g, t_grid = t_grid)
# Sum of log-derivatives (for initial log-likelihood):
sum_log_deriv = sum(log(pmax(g(y+1, deriv = 1), 0.01)))
# No Box-Cox transformation:
lambda = NULL
}
# Initialize the parameters: add 1 in case of zeros
z_hat = g(y + 1)
fit = estimator(z_hat);
# Check: does the estimator make sense?
if(is.null(fit$fitted.values) || is.null(fit$coefficients))
stop("The estimator() function must return 'fitted.values' and 'coefficients'")
# (Initial) Fitted values:
mu_hat = fit$fitted.values
# (Initial) Coefficients:
theta_hat = fit$coefficients
# (Initial) observation SD:
sigma_hat = sd(z_hat - mu_hat)
# (Initial) log-likelihood:
logLik0 = logLik_em0 =
sum_log_deriv + sum(dnorm(z_hat, mean = mu_hat, sd = sigma_hat, log = TRUE))
# Randomize for EM initialization:
if(sd_init > 0){
z_hat = g(y + 1) + sd_init*sigma_hat*rnorm(n = n)
fit = estimator(z_hat);
mu_hat = fit$fitted.values;
theta_hat = fit$coefficients;
sigma_hat = sd(z_hat - mu_hat)
}
# Number of parameters (excluding sigma)
p = length(theta_hat)
# Lower and upper intervals:
a_y = a_j(y, y_max = y_max); a_yp1 = a_j(y + 1, y_max = y_max)
z_lower = g(a_y); z_upper = g(a_yp1)
# Store the EM trajectories:
mu_all = zhat_all = array(0, c(max_iters, n))
theta_all = array(0, c(max_iters, p)) # Parameters (coefficients)
sigma_all = numeric(max_iters) # SD
logLik_all = numeric(max_iters) # Log-likelihood
for(s in 1:max_iters){
# ----------------------------------
## E-step: impute the latent data
# ----------------------------------
# First and second moments of latent variables:
z_mom = truncnorm_mom(a = z_lower, b = z_upper, mu = mu_hat, sig = sigma_hat)
z_hat = z_mom$m1; z2_hat= z_mom$m2;
# Check: if any infinite z_hat values, return these indices and stop
if(any(is.infinite(z_hat))){
warning('Infinite z_hat values: returning the problematic indices')
return(list(error_inds = which(is.infinite(z_hat))))
}
# ----------------------------------
## M-step: estimation
# ----------------------------------
fit = estimator(z_hat)
mu_hat = fit$fitted.values
theta_hat = fit$coefficients
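# Closed-form M-step for the SD, using the truncated-normal moments E[z] and E[z^2]: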
sigma_hat = sqrt((sum(z2_hat) + sum(mu_hat^2) - 2*sum(z_hat*mu_hat))/n)
# If estimating lambda:
if(transformation == 'box-cox'){
# Negative log-likelihood function
ff <- function(l_bc){
sapply(l_bc, function(l_bc){
-logLikeRcpp(g_a_j = g_bc(a_y, lambda = l_bc),
g_a_jp1 = g_bc(a_yp1, lambda = l_bc),
mu = mu_hat,
sigma = rep(sigma_hat, n))})
}
# Set the search interval
a = 0; b = 1.0;
# The Brent method will error if the function value is infinite;
# a simple (but not rigorous) way to restrict the search interval:
while (ff(b) == Inf){
b = b * 0.8
}
# Tune the tolerance if needed
lambda = BrentMethod(a, b, fcn = ff, tol = .Machine$double.eps^0.2)$x
# Update the transformation and inverse transformation function:
g = function(t) g_bc(t, lambda = lambda)
g_inv = function(s) g_inv_bc(s, lambda = lambda)
# Update the lower and upper limits:
z_lower = g(a_y); z_upper = g(a_yp1)
}
# Update log-likelihood:
logLik_em = logLikeRcpp(g_a_j = z_lower,
g_a_jp1 = z_upper,
mu = mu_hat,
sigma = rep(sigma_hat, n))
# Storage:
mu_all[s,] = mu_hat; theta_all[s,] = theta_hat; sigma_all[s] = sigma_hat; logLik_all[s] = logLik_em; zhat_all[s,] = z_hat
# Check whether to stop:
if((logLik_em - logLik_em0)^2 < tol) break
logLik_em0 = logLik_em
}
# Subset trajectory to the estimated values:
mu_all = mu_all[1:s,]; theta_all = theta_all[1:s,]; sigma_all = sigma_all[1:s];
logLik_all = logLik_all[1:s]; zhat_all = zhat_all[1:s,]
# Also the expected value (fitted values)
# First, estimate an upper bound for the (infinite) summation:
if(y_max < Inf){
Jmax = rep(y_max + 1, n)
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu_hat, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction:
y_hat = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu_hat, sigma = rep(sigma_hat, n),
Jmax = Jmax)
# Dunn-Smyth residuals:
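# (randomized PIT: draw uniformly between the latent-normal CDF values at the interval endpoints, then map through qnorm)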
resids_ds = qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
# Replicates of Dunn-Smyth residuals:
resids_ds_rep = sapply(1:10, function(...)
qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
)
# Return:
z <- list(coefficients = theta_hat,
fitted.values = y_hat,
g.hat = g,
ginv.hat = g_inv,
sigma.hat = sigma_hat,
mu.hat = mu_hat,
z.hat = z_hat,
residuals = resids_ds,
residuals_rep = resids_ds_rep,
logLik = logLik_em,
logLik0 = logLik0,
lambda = lambda,
mu_all = mu_all, theta_all = theta_all, sigma_all = sigma_all, logLik_all = logLik_all, zhat_all = zhat_all, # EM trajectory
y = y, X=X, lmfit = fit, transformation = transformation, y_max = y_max, tol = tol, max_iters = max_iters) # And return the info about the model as well
class(z) <- c("lmstar")
return(z)
}
#' Predict method for response in STAR linear model
#'
#' Outputs predicted values based on an lmstar fit and optionally prediction intervals based
#' on the (plug-in) predictive distribution for the STAR linear model
#'
#' @param object Object of class "lmstar" as output by \code{\link{lm_star}}
#' @param newdata An optional matrix/data frame in which to look for variables with which to predict.
#' If omitted, the fitted values are used.
#' @param interval logical; whether or not to include prediction intervals (default FALSE)
#' @param level Level for prediction intervals
#' @param N number of Monte Carlo samples from the (plug-in) predictive distribution
#' used to approximate intervals; default is 1000
#' @param ... Ignored
#'
#' @return Either a vector of predictions (if interval=FALSE) or a matrix of predictions and
#' bounds with column names fit, lwr, and upr
#'
#' @details
#' If interval=TRUE, then \code{predict.lmstar} uses a Monte Carlo approach to estimating the (plug-in)
#' predictive distribution for the STAR linear model. The algorithm iteratively samples
#' (i) the latent data given the observed data, (ii) the latent predictive data given the latent data from (i),
#' and (iii) (inverse) transforms and rounds the latent predictive data to obtain a
#' draw from the integer-valued predictive distribution.
#'
#' The appropriate quantiles of these Monte Carlo draws are computed and reported as the prediction interval.
#'
#' @note
#' The "plug-in" predictive distribution is a crude approximation. Better
#' approaches are available using the Bayesian models, e.g. \code{\link{blm_star}}, which provide samples
#' from the posterior predictive distribution.
#'
#' For highly skewed responses, prediction intervals, especially at lower levels, may
#' not include the predicted value itself, since the mean is often much larger than the median.
#'
#'
#' @examples
#' # Simulate data with count-valued response y:
#' x = seq(0, 1, length.out = 100)
#' y = rpois(n = length(x), lambda = exp(1.5 + 5*(x -.5)^2))
#'
#' # Estimate model--assume a quadratic effect (better for illustration purposes)
#' fit = lm_star(y~x+I(x^2), transformation = 'sqrt')
#'
#' #Compute the predictive draws for the test points (same as observed points here)
#' #Also compute intervals using plug-in predictive distribution
#' y_pred = predict(fit, interval=TRUE)
#'
#' # Plot the results
#' plot(x, y, ylim = range(y, y_pred), main = 'STAR: Predictions and 95% PI')
#' lines(x,y_pred[,"fit"], col='black', type='s', lwd=4)
#' lines(x, y_pred[,"lwr"], col='darkgray', type='s', lwd=4)
#' lines(x, y_pred[,"upr"], col='darkgray', type='s', lwd=4)
#'
#' @export
predict.lmstar <- function(object, newdata = NULL, interval=FALSE, level=0.95, N=1000, ...){
if (!inherits(object, "lmstar"))
stop("Not a lmstar object")
#Get data from object
y <- object$y
y_max <- object$y_max
X <- object$X
g <- object$g.hat
g_inv <- object$ginv.hat
sigma_hat <- object$sigma.hat
fit <- object$lmfit
# Sample size:
n = length(y)
# If no test set, use the original design points:
if (missing(newdata) || is.null(newdata)) {
X.test = X
} else {
Terms <- delete.response(terms(fit))
m = model.frame(Terms, newdata, na.action = na.pass, xlev = fit$xlevels)
X.test <- model.matrix(Terms, m, contrasts.arg = fit$contrasts)
}
# Number of test points:
m = nrow(X.test)
# Check:
if(ncol(X.test) != ncol(X))
stop('newdata must have the same (number of) predictors as X')
# Number of predictors:
p = ncol(X)
# Get fitted values at new points
#First calculate new mu_hat
mu_hat = X.test%*%object$coefficients
mu_hat = mu_hat[,,drop=TRUE]
#Estimate an upper bound for the (infinite) summation:
if(y_max < Inf){
Jmax = rep(y_max + 1, m)
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu_hat, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction:
y_hat = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu_hat, sigma = rep(sigma_hat, m),
Jmax = Jmax)
y_hat <- y_hat[,,drop=TRUE]
names(y_hat) <- 1:length(y_hat)
if(interval){
# Define the grid for approximations using equally-spaced + quantile points:
t_grid = sort(unique(round(c(
seq(0, min(2*max(y), y_max), length.out = 250),
quantile(unique(y[y < y_max] + 1), seq(0, 1, length.out = 250))), 8)))
# (Approximate) inverse transformation function:
g_inv = g_inv_approx(g = g, t_grid = t_grid)
# Sample z_star from the posterior distribution:
y_pred = matrix(NA, nrow = N, ncol = m)
# Recurring terms:
a_y = a_j(y, y_max = y_max); a_yp1 = a_j(y + 1, y_max = y_max)
XtXinv = chol2inv(chol(crossprod(X)))
cholS0 = chol(diag(m) + tcrossprod(X.test%*%XtXinv, X.test))
for(nsi in 1:N){
# sample [z* | y]
z_star_s = rtruncnormRcpp(y_lower = g(a_y),
y_upper = g(a_yp1),
mu = object$mu.hat,
sigma = rep(object$sigma.hat, n),
u_rand = runif(n = n))
# sample [z~ | z*]:
# first, estimate the lm using z* as data:
beta_hat = XtXinv%*%crossprod(X, z_star_s)
mu_hat = X.test%*%beta_hat
s_hat = sqrt(sum((z_star_s - X%*%beta_hat)^2/(n - p - 1)))
# next, sample z~:
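# (a scaled multivariate-t draw, which propagates coefficient and scale uncertainty)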
z_tilde = mu_hat +
s_hat*crossprod(cholS0, rnorm(m))/sqrt(rchisq(n = 1, df = n - p - 1)/(n - p - 1))
# save the (inverse) transformed and rounded sims:
y_pred[nsi,] = round_floor(g_inv(z_tilde), y_max = y_max)
}
alpha=1-level
quants = t(apply(y_pred, 2, quantile, c(alpha/2, 1-(alpha/2))))
result = cbind(y_hat, quants)
colnames(result) = c("fit", "lwr", "upr")
return(result)
} else {
return(y_hat)
}
}
#' Compute asymptotic confidence intervals for STAR linear regression
#'
#' For a linear regression model within the STAR framework,
#' compute (asymptotic) confidence intervals for a regression coefficient of interest.
#' Confidence intervals are computed by inverting the likelihood ratio test and
#' profiling the log-likelihood.
#'
#' @param object Object of class "lmstar" as output by \code{\link{lm_star}}
#' @param parm a specification of which parameters are to be given confidence intervals,
#' either a vector of numbers or a vector of names. If missing, all parameters are considered.
#' @param level confidence level; default is 0.95
#' @param ... Ignored
#'
#' @return A matrix (or vector) with columns giving lower and upper confidence limits for each parameter.
#' These will be labelled as (1-level)/2 and 1 - (1-level)/2 in % (by default 2.5% and 97.5%).
#'
#' @examples
#' #Simulate data with count-valued response y:
#' sim_dat = simulate_nb_lm(n = 100, p = 2)
#' y = sim_dat$y; X = sim_dat$X
#'
#' #Select a transformation:
#' transformation = 'np'
#'
#' #Estimate model
#' fit = lm_star(y~X, transformation=transformation)
#'
#' #Confidence interval for all parameters
#' confint(fit)
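#'
#' #Confidence interval for a single parameter, selected by position or name:
#' confint(fit, parm = 2)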
#'
#' @export
confint.lmstar = function(object, parm,
level = 0.95,
...){
if (!inherits(object, "lmstar"))
stop("Not a lmstar object")
# Get values from object
transformation = object$transformation
g = object$g.hat
y= object$y
X = object$X
y_max = object$y_max
z_hat = object$z.hat
theta_all = object$theta_all
max_iters = object$max_iters
tol = object$tol
# Dimensions:
n = length(y);
#Format parm so that it's numeric
pnames <- colnames(X)
if (missing(parm))
parm <- 1:ncol(X)
else if (!is.numeric(parm))
parm <- match(parm, pnames)
# Initialization:
z2_hat = z_hat^2 # Second moment
# Lower and upper intervals:
z_lower = g(a_j(y, y_max = y_max))
z_upper = g(a_j(y + 1, y_max = y_max))
#Format output (code borrowed from confint.lm)
a <- (1 - level)/2
a <- c(a, 1 - a)
pct <- paste(format(100 * a, trim = TRUE, scientific = FALSE, digits = 3),"%")
ci <- array(NA_real_, dim = c(length(parm), 2L),
dimnames = list(pnames[parm], pct))
#Run through loop for length(parm) predictors
for(i in 1:length(parm)){
# For s=1 comparison:
mu_hat0 = rep(0,n); # This will be updated to a warm-start within the loop
j=parm[i]
# Construct a sequence of theta values for predictor j:
n_coarse = 50 # Length of sequence
# Max distance from the MLE in the EM sequence:
d_max = max(coef(object)[j] - min(theta_all[,j]),
max(theta_all[,j]) - coef(object)[j])
theta_seq_coarse = seq(from = coef(object)[j] - 2*d_max - 2*diff(range(theta_all[,j])),
to = coef(object)[j] + 2*d_max + 2*diff(range(theta_all[,j])),
length.out = n_coarse)
# Store the profile log-likelihood:
prof.logLik = numeric(n_coarse)
# theta_j's with log-like's that exceed this threshold will belong to the confidence set
conf_thresh = object$logLik - qchisq(1 - level, df = 1, lower.tail = FALSE)/2
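# (equivalently, theta_j is retained iff 2*(logLik at the MLE - profile log-likelihood) <= qchisq(level, df = 1))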
ng = 1;
while(ng <= n_coarse){
# theta_j is fixed:
theta_j = theta_seq_coarse[ng]
for(s in 1:max_iters){
# Estimation (with the jth coefficient fixed at theta_j)
fit = lm(z_hat ~ -1 + X[,-j] + offset(theta_j*X[,j]))
mu_hat = fit$fitted.values
sigma_hat = sqrt((sum(z2_hat) + sum(mu_hat^2) - 2*sum(z_hat*mu_hat))/n)
# First and second moments of latent variables:
z_mom = truncnorm_mom(a = z_lower, b = z_upper, mu = mu_hat, sig = sigma_hat)
z_hat = z_mom$m1; z2_hat= z_mom$m2;
# Check whether to stop:
if(mean((mu_hat - mu_hat0)^2) < tol) break
mu_hat0 = mu_hat
}
prof.logLik[ng] = logLikeRcpp(g_a_j = z_lower,
g_a_jp1 = z_upper,
mu = mu_hat,
sigma = rep(sigma_hat,n))
# Check at the final iteration:
if(ng == n_coarse){
# Bad lower endpoint:
if(prof.logLik[which.min(theta_seq_coarse)] >= conf_thresh - 5){
# Expand theta downward:
theta_seq_coarse = c(theta_seq_coarse,
min(theta_seq_coarse) - 2*median(abs(diff(theta_seq_coarse))))
}
# Bad upper endpoint:
if(prof.logLik[which.max(theta_seq_coarse)] >= conf_thresh - 5){
# Expand theta upward:
theta_seq_coarse = c(theta_seq_coarse,
max(theta_seq_coarse) + 2*median(abs(diff(theta_seq_coarse))))
}
# Update: lengthen prof.logLik and increase n_coarse accordingly
temp = prof.logLik;
prof.logLik = numeric(length(theta_seq_coarse));
prof.logLik[1:n_coarse] = temp
n_coarse = length(theta_seq_coarse)
}
ng = ng + 1
}
# Smooth on a finer grid:
theta_seq_fine = seq(min(theta_seq_coarse), max(theta_seq_coarse), length.out = 10^3)
prof.logLik_hat = splinefun(theta_seq_coarse, prof.logLik)(theta_seq_fine)
ci_all = theta_seq_fine[prof.logLik_hat > conf_thresh]
ci[i,] <- range(ci_all)
}
return(ci)
}
#' Compute coefficient p-values for STAR linear regression using likelihood ratio test
#'
#' For a linear regression model within the STAR framework,
#' compute p-values for regression coefficients using a likelihood ratio test.
#' It also computes a p-value for excluding all predictors, akin to a (partial)
#' F test.
#'
#' @param object Object of class "lmstar" as output by \code{\link{lm_star}}
#' @return a named vector of p+1 p-values, one for each predictor as well as the joint
#' p-value excluding all predictors
#'
#' @examples
#' # Simulate data with count-valued response y:
#' sim_dat = simulate_nb_lm(n = 100, p = 2)
#' y = sim_dat$y; X = sim_dat$X
#'
#' # Select a transformation:
#' transformation = 'np'
#'
#' #Estimate model
#' fit = lm_star(y~X, transformation = transformation)
#'
#' #Compute p-values
#' pvals(fit)
#'
#' @export
pvals <- function(object){
if (!inherits(object, "lmstar"))
stop("Not a lmstar object")
X <- object$X
y <- object$y
p = ncol(X)
logLik = object$logLik
pvals = array(0, p+1)
for(i in 1:p){
#If there's an intercept
if(all(X[,i]==1)){
fit_curr <- lm_star(y ~ X[,-i]-1, transformation = object$transformation, y_max=object$y_max,
tol = object$tol, max_iters=object$max_iters)
} else{ #All others
fit_curr <- lm_star(y ~ X[,-i], transformation = object$transformation, y_max=object$y_max,
tol = object$tol, max_iters=object$max_iters)
}
pvals[i] = pchisq(-2*(fit_curr$logLik - logLik), df = 1, lower.tail = FALSE)
}
#Get p-value for any effects
fit_curr <- lm_star(y ~ 1, transformation = object$transformation, y_max=object$y_max,
tol = object$tol, max_iters=object$max_iters)
pvals[p+1] = pchisq(-2*(fit_curr$logLik - logLik), df = p-1, lower.tail = FALSE)
names(pvals) = c(colnames(X), "Any linear effects")
return(pvals)
}
#' Fit Random Forest STAR with EM algorithm
#'
#' Compute the MLEs and log-likelihood for the Random Forest STAR model.
#' The STAR model requires a *transformation* and an *estimation function* for the conditional mean
#' given observed data. The transformation can be known (e.g., log or sqrt) or unknown
#' (Box-Cox or estimated nonparametrically) for greater flexibility.
#' The estimator in this case is a random forest.
#' Standard function calls including \code{\link{fitted}} and \code{\link{residuals}} apply.
#'
#' @param y \code{n x 1} vector of observed counts
#' @param X \code{n x p} matrix of predictors
#' @param X.test \code{m x p} matrix of out-of-sample predictors
#' @param transformation transformation to use for the latent data; must be one of
#' \itemize{
#' \item "identity" (identity transformation)
#' \item "log" (log transformation)
#' \item "sqrt" (square root transformation)
#' \item "np" (nonparametric transformation estimated from empirical CDF)
#' \item "pois" (transformation for moment-matched marginal Poisson CDF)
#' \item "neg-bin" (transformation for moment-matched marginal Negative Binomial CDF)
#' \item "box-cox" (box-cox transformation with learned parameter)
#' }
#' @param y_max a fixed and known upper bound for all observations; default is \code{Inf}
#' @param sd_init add random noise for EM algorithm initialization scaled by \code{sd_init}
#' times the Gaussian MLE standard deviation; default is 10
#' @param tol tolerance for stopping the EM algorithm; default is 10^-10;
#' @param max_iters maximum number of EM iterations before stopping; default is 1000
#' @param ntree Number of trees to grow.
#' This should not be set to too small a number, to ensure that every input row gets predicted at least a few times.
#' Default is 500.
#' @param mtry Number of variables randomly sampled as candidates at each split.
#' Default is p/3.
#' @param nodesize Minimum size of terminal nodes. Setting this number larger causes smaller trees to be grown (and thus take less time).
#' Default is 5.
#' @return a list with the following elements:
#' \itemize{
#' \item \code{fitted.values}: the fitted values at the MLEs based on out-of-bag samples (training)
#' \item \code{fitted.values.test}: the fitted values at the MLEs (testing)
#' \item \code{g.hat} a function containing the (known or estimated) transformation
#' \item \code{sigma.hat} the MLE of the standard deviation
#' \item \code{mu.hat} the MLE of the conditional mean (on the transformed scale)
#' \item \code{z.hat} the estimated latent data (on the transformed scale) at the MLEs
#' \item \code{residuals} the Dunn-Smyth residuals (randomized)
#' \item \code{residuals_rep} the Dunn-Smyth residuals (randomized) for 10 replicates
#' \item \code{logLik} the log-likelihood at the MLEs
#' \item \code{logLik0} the log-likelihood at the MLEs for the *unrounded* initialization
#' \item \code{lambda} the Box-Cox nonlinear parameter
#' \item \code{rfObj}: the object returned by randomForest() at the MLEs
#' \item and other parameters that
#' (1) track the parameters across EM iterations and
#' (2) record the model specifications
#' }
#'
#' @details STAR defines a count-valued probability model by
#' (1) specifying a Gaussian model for continuous *latent* data and
#' (2) connecting the latent data to the observed data via a
#' *transformation and rounding* operation.
#'
#' The expectation-maximization (EM) algorithm is used to produce
#' maximum likelihood estimators (MLEs) for the parameters that define the
#' Gaussian model for the continuous latent data, with the conditional mean
#' estimated by a random forest. The fitted values are computed using out-of-bag samples.
#' As a result, the log-likelihood is based on out-of-bag prediction, and it is similarly
#' straightforward to compute out-of-bag squared and absolute errors (see the examples).
#'
#' @note Since the random forest produces random predictions, the EM algorithm
#' will never converge exactly.
#'
#' @note Infinite latent data values may occur when the transformed
#' Gaussian model is highly inadequate. In that case, the function returns
#' the *indices* of the data points with infinite latent values, which are
#' significant outliers under the model. Deletion of these indices and
#' re-running the model is one option, but care must be taken to ensure
#' that (i) it is appropriate to treat these observations as outliers and
#' (ii) the model is adequate for the remaining data points.
#'
#' @references
#' Kowal, D. R., & Wu, B. (2021).
#' Semiparametric count data regression for self-reported mental health.
#' \emph{Biometrics}. \doi{10.1111/biom.13617}
#'
#' @examples
#' \donttest{
#' # Simulate data with count-valued response y:
#' sim_dat = simulate_nb_friedman(n = 100, p = 10)
#' y = sim_dat$y; X = sim_dat$X
#'
#' # EM algorithm for STAR (using the log-link)
#' fit_em = randomForest_star(y = y, X = X,
#' transformation = 'log',
#' max_iters = 100)
#'
#' # Fitted values (out-of-bag)
#' y_hat = fitted(fit_em)
#' plot(y_hat, y);
#'
#' # Residuals:
#' plot(residuals(fit_em))
#' qqnorm(residuals(fit_em)); qqline(residuals(fit_em))
#'
#' # Log-likelihood at MLEs (out-of-bag):
#' fit_em$logLik
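#'
#' # Out-of-bag squared and absolute errors (the fitted values are based on
#' # out-of-bag samples, so these are straightforward to compute):
#' mean((y - y_hat)^2)
#' mean(abs(y - y_hat))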
#' }
#'
#' @import randomForest
#' @export
randomForest_star = function(y, X, X.test = NULL,
transformation = 'np',
y_max = Inf,
sd_init = 10,
tol = 10^-10,
max_iters = 1000,
ntree=500,
mtry= max(floor(ncol(X)/3), 1),
nodesize = 5){
# Check: currently implemented for nonnegative integers
if(any(y < 0) || any(y != floor(y)))
stop('y must be nonnegative counts')
# Check: y_max must be a true upper bound
if(any(y > y_max))
stop('y must not exceed y_max')
# Check: does the transformation make sense?
transformation = tolower(transformation);
if(!is.element(transformation, c("identity", "log", "sqrt", "np", "pois", "neg-bin", "box-cox")))
stop("The transformation must be one of 'identity', 'log', 'sqrt', 'np', 'pois', 'neg-bin', or 'box-cox'")
# Assign a family for the transformation: Box-Cox or CDF?
transform_family = ifelse(
test = is.element(transformation, c("identity", "log", "sqrt", "box-cox")),
yes = 'bc', no = 'cdf'
)
# Number of observations:
n = length(y)
# Define the transformation:
if(transform_family == 'bc'){
# Lambda value for each Box-Cox argument:
if(transformation == 'identity') lambda = 1
if(transformation == 'log') lambda = 0
if(transformation == 'sqrt') lambda = 1/2
if(transformation == 'box-cox') lambda = runif(n = 1) # random init on (0,1)
# Transformation function:
g = function(t) g_bc(t,lambda = lambda)
# Inverse transformation function:
g_inv = function(s) g_inv_bc(s,lambda = lambda)
# Sum of log-derivatives (for initial log-likelihood):
#g_deriv = function(t) t^(lambda - 1)
sum_log_deriv = (lambda - 1)*sum(log(y+1))
}
if(transform_family == 'cdf'){
# Transformation function:
g = g_cdf(y = y, distribution = transformation)
# Define the grid for approximations using equally-spaced + quantile points:
t_grid = sort(unique(round(c(
seq(0, min(2*max(y), y_max), length.out = 250),
quantile(unique(y[y < y_max] + 1), seq(0, 1, length.out = 250))), 8)))
# Inverse transformation function:
g_inv = g_inv_approx(g = g, t_grid = t_grid)
# Sum of log-derivatives (for initial log-likelihood):
sum_log_deriv = sum(log(pmax(g(y+1, deriv = 1), 0.01)))
# No Box-Cox transformation:
lambda = NULL
}
# Initialize the parameters: add 1 in case of zeros
z_hat = g(y + 1)
fit = randomForest(x = X, y = z_hat,
ntree = ntree, mtry = mtry, nodesize = nodesize)
# (Initial) Fitted values:
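# (randomForest's $predicted component gives out-of-bag predictions for the training data)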
mu_hat = fit$predicted
# (Initial) observation SD:
sigma_hat = sd(z_hat - mu_hat)
# (Initial) log-likelihood:
logLik0 = logLik_em0 =
sum_log_deriv + sum(dnorm(z_hat, mean = mu_hat, sd = sigma_hat, log = TRUE))
# Randomize for EM initialization:
if(sd_init > 0){
z_hat = g(y + 1) + sd_init*sigma_hat*rnorm(n = n)
fit = randomForest(x = X, y = z_hat,
ntree = ntree, mtry = mtry, nodesize = nodesize)
mu_hat = fit$predicted; sigma_hat = sd(z_hat - mu_hat)
}
# Lower and upper intervals:
a_y = a_j(y, y_max = y_max); a_yp1 = a_j(y + 1, y_max = y_max)
z_lower = g(a_y); z_upper = g(a_yp1)
# Store the EM trajectories:
mu_all = zhat_all = array(0, c(max_iters, n))
sigma_all = numeric(max_iters) # SD
logLik_all = numeric(max_iters) # Log-likelihood
for(s in 1:max_iters){
# ----------------------------------
## E-step: impute the latent data
# ----------------------------------
# First and second moments of latent variables:
z_mom = truncnorm_mom(a = z_lower, b = z_upper, mu = mu_hat, sig = sigma_hat)
z_hat = z_mom$m1; z2_hat= z_mom$m2;
# Check: if any infinite z_hat values, return these indices and stop
if(any(is.infinite(z_hat))){
warning('Infinite z_hat values: returning the problematic indices')
return(list(error_inds = which(is.infinite(z_hat))))
}
# ----------------------------------
## M-step: estimation
# ----------------------------------
fit = randomForest(x = X, y = z_hat,
ntree = ntree, mtry = mtry, nodesize = nodesize)
mu_hat = fit$predicted
sigma_hat = sqrt((sum(z2_hat) + sum(mu_hat^2) - 2*sum(z_hat*mu_hat))/n)
# If estimating lambda:
if(transformation == 'box-cox'){
# Negative log-likelihood function
ff <- function(l_bc){
sapply(l_bc, function(l_bc){
-logLikeRcpp(g_a_j = g_bc(a_y, lambda = l_bc),
g_a_jp1 = g_bc(a_yp1, lambda = l_bc),
mu = mu_hat,
sigma = rep(sigma_hat, n))})
}
# Set the search interval
a = 0; b = 1.0;
# The Brent method will error if the function value is infinite;
# a simple (but not rigorous) way to restrict the search interval:
while (ff(b) == Inf){
b = b * 0.8
}
# Tune the tolerance if needed
lambda = BrentMethod(a, b, fcn = ff, tol = .Machine$double.eps^0.2)$x
# Update the transformation and inverse transformation function:
g = function(t) g_bc(t, lambda = lambda)
g_inv = function(s) g_inv_bc(s, lambda = lambda)
# Update the lower and upper limits:
z_lower = g(a_y); z_upper = g(a_yp1)
}
# Update log-likelihood:
logLik_em = logLikeRcpp(g_a_j = z_lower,
g_a_jp1 = z_upper,
mu = mu_hat,
sigma = rep(sigma_hat, n))
# Storage:
mu_all[s,] = mu_hat; sigma_all[s] = sigma_hat; logLik_all[s] = logLik_em; zhat_all[s,] = z_hat
# Check whether to stop:
if((logLik_em - logLik_em0)^2 < tol) break
logLik_em0 = logLik_em
}
# Subset trajectory to the estimated values:
mu_all = mu_all[1:s,]; sigma_all = sigma_all[1:s]; logLik_all = logLik_all[1:s]; zhat_all = zhat_all[1:s,]
# Also the expected value (fitted values)
# First, estimate an upper bound for the (infinite) summation:
if(y_max < Inf){
Jmax = rep(y_max + 1, n)
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu_hat, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction:
y_hat = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu_hat, sigma = rep(sigma_hat, n),
Jmax = Jmax)
# Dunn-Smyth residuals:
resids_ds = qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
# Replicates of Dunn-Smyth residuals:
resids_ds_rep = sapply(1:10, function(...)
qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
)
# Predictive quantities, if desired:
if(!is.null(X.test)){
# Fitted values on transformed-scale at test points:
mu.test = predict(fit, X.test)
# Conditional expectation at test points:
if(y_max < Inf){
Jmax = rep(y_max + 1, nrow(X.test))
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu.test, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction at test points:
fitted.values.test = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu.test, sigma = rep(sigma_hat, nrow(X.test)),
Jmax = Jmax)
} else {
fitted.values.test = NULL
}
# Return:
list(fitted.values = y_hat,
fitted.values.test = fitted.values.test,
g.hat = g,
sigma.hat = sigma_hat,
mu.hat = mu_hat,
z.hat = z_hat,
residuals = resids_ds,
residuals_rep = resids_ds_rep,
logLik = logLik_em,
logLik0 = logLik0,
lambda = lambda,
rfObj = fit,
mu_all = mu_all, sigma_all = sigma_all, logLik_all = logLik_all, zhat_all = zhat_all, # EM trajectory
transformation = transformation, y_max = y_max, tol = tol, max_iters = max_iters) # And return the info about the model as well
}
#' Fitting STAR Gradient Boosting Machines via EM algorithm
#'
#' Compute the MLEs and log-likelihood for the Gradient Boosting Machines (GBM) STAR model.
#' The STAR model requires a *transformation* and an *estimation function* for the conditional mean
#' given observed data. The transformation can be known (e.g., log or sqrt) or unknown
#' (Box-Cox or estimated nonparametrically) for greater flexibility.
#' The estimator in this case is a GBM.
#' Standard function calls including \code{\link{fitted}} and \code{\link{residuals}} apply.
#'
#' @param y \code{n x 1} vector of observed counts
#' @param X \code{n x p} matrix of predictors
#' @param X.test \code{m x p} matrix of out-of-sample predictors
#' @param transformation transformation to use for the latent data; must be one of
#' \itemize{
#' \item "identity" (identity transformation)
#' \item "log" (log transformation)
#' \item "sqrt" (square root transformation)
#' \item "np" (nonparametric transformation estimated from empirical CDF)
#' \item "pois" (transformation for moment-matched marginal Poisson CDF)
#' \item "neg-bin" (transformation for moment-matched marginal Negative Binomial CDF)
#' \item "box-cox" (box-cox transformation with learned parameter)
#' }
#' @param y_max a fixed and known upper bound for all observations; default is \code{Inf}
#' @param sd_init add random noise for EM algorithm initialization scaled by \code{sd_init}
#' times the Gaussian MLE standard deviation; default is 10
#' @param tol tolerance for stopping the EM algorithm; default is 10^-10;
#' @param max_iters maximum number of EM iterations before stopping; default is 1000
#' @param n.trees Integer specifying the total number of trees to fit.
#' This is equivalent to the number of iterations and the number of basis functions in the additive expansion.
#' Default is 100.
#' @param interaction.depth Integer specifying the maximum depth of each tree
#' (i.e., the highest level of variable interactions allowed).
#' A value of 1 implies an additive model, a value of 2 implies a model with up to 2-way interactions, etc.
#' Default is 1.
#' @param shrinkage a shrinkage parameter applied to each tree in the expansion.
#' Also known as the learning rate or step-size reduction; 0.001 to 0.1 usually work, but a smaller learning rate typically requires more trees.
#' Default is 0.1.
#' @param bag.fraction the fraction of the training set observations randomly selected to propose the next tree in the expansion.
#' This introduces randomness into the model fit. If bag.fraction < 1, then running the same model twice will result in similar but different fits.
#' Default is 1 (for a deterministic prediction).
#'
#' @return a list with the following elements:
#' \itemize{
#' \item \code{fitted.values}: the fitted values at the MLEs (training)
#' \item \code{fitted.values.test}: the fitted values at the MLEs (testing)
#' \item \code{g.hat} a function containing the (known or estimated) transformation
#' \item \code{sigma.hat} the MLE of the standard deviation
#' \item \code{mu.hat} the MLE of the conditional mean (on the transformed scale)
#' \item \code{z.hat} the estimated latent data (on the transformed scale) at the MLEs
#' \item \code{residuals} the Dunn-Smyth residuals (randomized)
#' \item \code{residuals_rep} the Dunn-Smyth residuals (randomized) for 10 replicates
#' \item \code{logLik} the log-likelihood at the MLEs
#' \item \code{logLik0} the log-likelihood at the MLEs for the *unrounded* initialization
#' \item \code{lambda} the Box-Cox nonlinear parameter
#' \item \code{gbmObj}: the object returned by gbm() at the MLEs
#' \item and other parameters that
#' (1) track the parameters across EM iterations and
#' (2) record the model specifications
#' }
#'
#' @details STAR defines a count-valued probability model by
#' (1) specifying a Gaussian model for continuous *latent* data and
#' (2) connecting the latent data to the observed data via a
#' *transformation and rounding* operation. The Gaussian model in
#' this case is a GBM.
#'
#' @note Infinite latent data values may occur when the transformed
#' Gaussian model is highly inadequate. In that case, the function returns
#' the *indices* of the data points with infinite latent values, which are
#' significant outliers under the model. Deletion of these indices and
#' re-running the model is one option, but care must be taken to ensure
#' that (i) it is appropriate to treat these observations as outliers and
#' (ii) the model is adequate for the remaining data points.
#'
#' @references
#' Kowal, D. R., & Wu, B. (2021).
#' Semiparametric count data regression for self-reported mental health.
#' \emph{Biometrics}. \doi{10.1111/biom.13617}
#'
#' @examples
#' # Simulate data with count-valued response y:
#' sim_dat = simulate_nb_friedman(n = 100, p = 10)
#' y = sim_dat$y; X = sim_dat$X
#'
#' # EM algorithm for STAR (using the log-link)
#' fit_em = gbm_star(y = y, X = X,
#' transformation = 'log')
#'
#' # Evaluate convergence:
#' plot(fit_em$logLik_all, type='l', main = 'GBM-STAR-log', xlab = 'Iteration', ylab = 'log-lik')
#'
#' # Fitted values:
#' y_hat = fitted(fit_em)
#' plot(y_hat, y);
#'
#' # Residuals:
#' plot(residuals(fit_em))
#' qqnorm(residuals(fit_em)); qqline(residuals(fit_em))
#'
#' # Log-likelihood at MLEs:
#' fit_em$logLik
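#'
#' # A minimal sketch of out-of-sample prediction via X.test (the training points
#' # are reused as "test" points here purely for illustration):
#' fit_test = gbm_star(y = y, X = X, X.test = X, transformation = 'log')
#' plot(fit_test$fitted.values.test, y)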
#'
#' @import gbm
#' @export
gbm_star = function(y, X, X.test = NULL,
transformation = 'np',
y_max = Inf,
sd_init = 10,
tol = 10^-10,
max_iters = 1000,
n.trees = 100,
interaction.depth = 1,
shrinkage = 0.1,
bag.fraction = 1){
# Check: currently implemented for nonnegative integers
if(any(y < 0) || any(y != floor(y)))
stop('y must be nonnegative counts')
# Check: y_max must be a true upper bound
if(any(y > y_max))
stop('y must not exceed y_max')
# Check: does the transformation make sense?
transformation = tolower(transformation);
if(!is.element(transformation, c("identity", "log", "sqrt", "np", "pois", "neg-bin", "box-cox")))
stop("The transformation must be one of 'identity', 'log', 'sqrt', 'np', 'pois', 'neg-bin', or 'box-cox'")
# Assign a family for the transformation: Box-Cox or CDF?
transform_family = ifelse(
test = is.element(transformation, c("identity", "log", "sqrt", "box-cox")),
yes = 'bc', no = 'cdf'
)
# Number of observations:
n = length(y)
# Define the transformation:
if(transform_family == 'bc'){
# Lambda value for each Box-Cox argument:
if(transformation == 'identity') lambda = 1
if(transformation == 'log') lambda = 0
if(transformation == 'sqrt') lambda = 1/2
if(transformation == 'box-cox') lambda = runif(n = 1) # random init on (0,1)
# Transformation function:
g = function(t) g_bc(t,lambda = lambda)
# Inverse transformation function:
g_inv = function(s) g_inv_bc(s,lambda = lambda)
# Sum of log-derivatives (for initial log-likelihood):
#g_deriv = function(t) t^(lambda - 1)
sum_log_deriv = (lambda - 1)*sum(log(y+1))
}
if(transform_family == 'cdf'){
# Transformation function:
g = g_cdf(y = y, distribution = transformation)
# Define the grid for approximations using equally-spaced + quantile points:
t_grid = sort(unique(round(c(
seq(0, min(2*max(y), y_max), length.out = 250),
quantile(unique(y[y < y_max] + 1), seq(0, 1, length.out = 250))), 8)))
# Inverse transformation function:
g_inv = g_inv_approx(g = g, t_grid = t_grid)
# Sum of log-derivatives (for initial log-likelihood):
sum_log_deriv = sum(log(pmax(g(y+1, deriv = 1), 0.01)))
# No Box-Cox transformation:
lambda = NULL
}
# Initialize the parameters: add 1 in case of zeros
z_hat = g(y + 1)
fit = gbm(y ~ ., data = data.frame(y = z_hat, X = X),
distribution = "gaussian", # Squared error loss
n.trees = n.trees,
interaction.depth = interaction.depth,
shrinkage = shrinkage,
bag.fraction = bag.fraction
)
# (Initial) Fitted values:
mu_hat = fit$fit
# (Initial) observation SD:
sigma_hat = sd(z_hat - mu_hat)
# (Initial) log-likelihood:
logLik0 = logLik_em0 =
sum_log_deriv + sum(dnorm(z_hat, mean = mu_hat, sd = sigma_hat, log = TRUE))
# Randomize for EM initialization:
if(sd_init > 0){
z_hat = g(y + 1) + sd_init*sigma_hat*rnorm(n = n)
fit = gbm(y ~ ., data = data.frame(y = z_hat, X = X),
distribution = "gaussian", # Squared error loss
n.trees = n.trees,
interaction.depth = interaction.depth,
shrinkage = shrinkage,
bag.fraction = bag.fraction
)
mu_hat = fit$fit; sigma_hat = sd(z_hat - mu_hat)
}
# Lower and upper intervals:
a_y = a_j(y, y_max = y_max); a_yp1 = a_j(y + 1, y_max = y_max)
z_lower = g(a_y); z_upper = g(a_yp1)
# Store the EM trajectories:
mu_all = zhat_all = array(0, c(max_iters, n))
sigma_all = numeric(max_iters) # SD
logLik_all = numeric(max_iters) # Log-likelihood
for(s in 1:max_iters){
# ----------------------------------
## E-step: impute the latent data
# ----------------------------------
# First and second moments of latent variables:
z_mom = truncnorm_mom(a = z_lower, b = z_upper, mu = mu_hat, sig = sigma_hat)
z_hat = z_mom$m1; z2_hat= z_mom$m2;
# Check: if any infinite z_hat values, return these indices and stop
if(any(is.infinite(z_hat))){
warning('Infinite z_hat values: returning the problematic indices')
return(list(error_inds = which(is.infinite(z_hat))))
}
# ----------------------------------
## M-step: estimation
# ----------------------------------
fit = gbm(y ~ ., data = data.frame(y = z_hat, X = X),
distribution = "gaussian", # Squared error loss
n.trees = n.trees,
interaction.depth = interaction.depth,
shrinkage = shrinkage,
bag.fraction = bag.fraction
)
mu_hat = fit$fit
sigma_hat = sqrt((sum(z2_hat) + sum(mu_hat^2) - 2*sum(z_hat*mu_hat))/n)
# If estimating lambda:
if(transformation == 'box-cox'){
# Negative log-likelihood function
ff <- function(l_bc){
sapply(l_bc, function(l_bc){
-logLikeRcpp(g_a_j = g_bc(a_y, lambda = l_bc),
g_a_jp1 = g_bc(a_yp1, lambda = l_bc),
mu = mu_hat,
sigma = rep(sigma_hat, n))})
}
# Set the search interval
a = 0; b = 1.0;
# The Brent method will error if the function value is infinite;
# a simple (but not rigorous) way to restrict the search interval:
while (ff(b) == Inf){
b = b * 0.8
}
# Tune the tolerance if needed
lambda = BrentMethod(a, b, fcn = ff, tol = .Machine$double.eps^0.2)$x
# Update the transformation and inverse transformation function:
g = function(t) g_bc(t, lambda = lambda)
g_inv = function(s) g_inv_bc(s, lambda = lambda)
# Update the lower and upper limits:
z_lower = g(a_y); z_upper = g(a_yp1)
}
# Update log-likelihood:
logLik_em = logLikeRcpp(g_a_j = z_lower,
g_a_jp1 = z_upper,
mu = mu_hat,
sigma = rep(sigma_hat, n))
# Storage:
mu_all[s,] = mu_hat; sigma_all[s] = sigma_hat; logLik_all[s] = logLik_em; zhat_all[s,] = z_hat
# Check whether to stop:
if((logLik_em - logLik_em0)^2 < tol) break
logLik_em0 = logLik_em
}
# Subset trajectory to the estimated values:
mu_all = mu_all[1:s,]; sigma_all = sigma_all[1:s]; logLik_all = logLik_all[1:s]; zhat_all = zhat_all[1:s,]
# Also the expected value (fitted values)
# First, estimate an upper bound for the (infinite) summation:
if(y_max < Inf){
Jmax = rep(y_max + 1, n)
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu_hat, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction:
y_hat = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu_hat, sigma = rep(sigma_hat, n),
Jmax = Jmax)
# Dunn-Smyth residuals:
resids_ds = qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
# Replicates of Dunn-Smyth residuals:
resids_ds_rep = sapply(1:10, function(...)
qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
)
# Predictive quantities, if desired:
if(!is.null(X.test)){
# Fitted values on transformed-scale at test points:
mu.test = predict(fit, data.frame(X = X.test), n.trees = n.trees)
# Conditional expectation at test points:
if(y_max < Inf){
Jmax = rep(y_max + 1, nrow(X.test))
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu.test, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction at test points:
fitted.values.test = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu.test, sigma = rep(sigma_hat, nrow(X.test)),
Jmax = Jmax)
} else {
fitted.values.test = NULL
}
# Return:
list(fitted.values = y_hat,
fitted.values.test = fitted.values.test,
g.hat = g,
sigma.hat = sigma_hat,
mu.hat = mu_hat,
z.hat = z_hat,
residuals = resids_ds,
residuals_rep = resids_ds_rep,
logLik = logLik_em,
logLik0 = logLik0,
lambda = lambda,
gbmObj = fit,
mu_all = mu_all, sigma_all = sigma_all, logLik_all = logLik_all, zhat_all = zhat_all, # EM trajectory
transformation = transformation, y_max = y_max, tol = tol, max_iters = max_iters) # And return the info about the model as well
}
#' Generalized EM estimation for STAR
#'
#' Compute MLEs and log-likelihood for a generalized STAR model. The STAR model requires
#' a *transformation* and an *estimation function* for the conditional mean
#' given observed data. The transformation can be known (e.g., log or sqrt) or unknown
#' (Box-Cox or estimated nonparametrically) for greater flexibility.
#' The estimator can be any least squares estimator, including nonlinear models.
#' Standard function calls including
#' \code{coefficients()}, \code{fitted()}, and \code{residuals()} apply.
#'
#' @param y \code{n x 1} vector of observed counts
#' @param estimator a function that inputs data \code{y} and outputs a list with two elements:
#' \enumerate{
#' \item The fitted values \code{fitted.values}
#' \item The parameter estimates \code{coefficients}
#' }
#' @param transformation transformation to use for the latent data; must be one of
#' \itemize{
#' \item "identity" (identity transformation)
#' \item "log" (log transformation)
#' \item "sqrt" (square root transformation)
#' \item "np" (nonparametric transformation estimated from empirical CDF)
#' \item "pois" (transformation for moment-matched marginal Poisson CDF)
#' \item "neg-bin" (transformation for moment-matched marginal Negative Binomial CDF)
#' \item "box-cox" (box-cox transformation with learned parameter)
#' }
#' @param y_max a fixed and known upper bound for all observations; default is \code{Inf}
#' @param sd_init add random noise for EM algorithm initialization scaled by \code{sd_init}
#' times the Gaussian MLE standard deviation; default is 10
#' @param tol tolerance for stopping the EM algorithm; default is 10^-10;
#' @param max_iters maximum number of EM iterations before stopping; default is 1000
#' @return a list with the following elements:
#' \itemize{
#' \item \code{coefficients} the MLEs of the coefficients
#' \item \code{fitted.values} the fitted values at the MLEs
#' \item \code{g.hat} a function containing the (known or estimated) transformation
#' \item \code{sigma.hat} the MLE of the standard deviation
#' \item \code{mu.hat} the MLE of the conditional mean (on the transformed scale)
#' \item \code{z.hat} the estimated latent data (on the transformed scale) at the MLEs
#' \item \code{residuals} the Dunn-Smyth residuals (randomized)
#' \item \code{residuals_rep} the Dunn-Smyth residuals (randomized) for 10 replicates
#' \item \code{logLik} the log-likelihood at the MLEs
#' \item \code{logLik0} the log-likelihood at the MLEs for the *unrounded* initialization
#' \item \code{lambda} the Box-Cox nonlinear parameter
#' \item and other parameters that
#' (1) track the parameters across EM iterations and
#' (2) record the model specifications
#' }
#'
#' @details STAR defines a count-valued probability model by
#' (1) specifying a Gaussian model for continuous *latent* data and
#' (2) connecting the latent data to the observed data via a
#' *transformation and rounding* operation.
#'
#' The expectation-maximization (EM) algorithm is used to produce
#' maximum likelihood estimators (MLEs) for the parameters defined in the
#' \code{estimator} function, such as linear regression coefficients,
#' which define the Gaussian model for the continuous latent data.
#' Fitted values (point predictions), residuals, and log-likelihood values
#' are also available. Inference for the estimators proceeds via classical maximum likelihood.
#' Initialization of the EM algorithm can be randomized to monitor convergence.
#' However, the log-likelihood is concave for all transformations (except 'box-cox'),
#' so global convergence is guaranteed.
#'
#' There are several options for the transformation. First, the transformation
#' can belong to the *Box-Cox* family, which includes the known transformations
#' 'identity', 'log', and 'sqrt', as well as a version in which the Box-Cox parameter
#' is estimated within the EM algorithm ('box-cox'). Second, the transformation
#' can be estimated (before model fitting) using the empirical distribution of the
#' data \code{y}. Options in this case include the empirical cumulative
#' distribution function (CDF), which is fully nonparametric ('np'), or the parametric
#' alternatives based on Poisson ('pois') or Negative-Binomial ('neg-bin')
#' distributions. For the parametric distributions, the parameters of the distribution
#' are estimated using moments (means and variances) of \code{y}.
#'
#' @note Infinite latent data values may occur when the transformed
#' Gaussian model is highly inadequate. In that case, the function returns
#' the *indices* of the data points with infinite latent values, which are
#' significant outliers under the model. Deletion of these indices and
#' re-running the model is one option, but care must be taken to ensure
#' that (i) it is appropriate to treat these observations as outliers and
#' (ii) the model is adequate for the remaining data points.
#'
#' @references
#' Kowal, D. R., & Wu, B. (2021).
#' Semiparametric count data regression for self-reported mental health.
#' \emph{Biometrics}. \doi{10.1111/biom.13617}
#'
#' @examples
#' # Simulate data with count-valued response y:
#' sim_dat = simulate_nb_friedman(n = 100, p = 5)
#' y = sim_dat$y; X = sim_dat$X
#'
#' # Select a transformation:
#' transformation = 'np'
#'
#' # Example using GAM as underlying estimator (for illustration purposes only)
#' if(require("mgcv")){
#' fit_em = genEM_star(y = y,
#' estimator = function(y) gam(y ~ s(X1)+s(X2),
#' data=data.frame(y,X)),
#' transformation = transformation)
#' }
#'
#' # Fitted coefficients:
#' coef(fit_em)
#'
#' # Fitted values:
#' y_hat = fitted(fit_em)
#' plot(y_hat, y);
#'
#' # Log-likelihood at MLEs:
#' fit_em$logLik
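#'
#' # A minimal sketch of a custom estimator: any function of the latent response
#' # that returns 'fitted.values' and 'coefficients' works, e.g. ordinary least squares:
#' fit_lm = genEM_star(y = y,
#'                     estimator = function(y) lm(y ~ X),
#'                     transformation = transformation)
#' coef(fit_lm)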
#'
#' @export
genEM_star = function(y,
estimator,
transformation = 'np',
y_max = Inf,
sd_init = 10,
tol = 10^-10,
max_iters = 1000){
# Check: currently implemented for nonnegative integers
if(any(y < 0) || any(y != floor(y)))
stop('y must be nonnegative counts')
# Check: y_max must be a true upper bound
if(any(y > y_max))
stop('y must not exceed y_max')
# Check: does the transformation make sense?
transformation = tolower(transformation);
if(!is.element(transformation, c("identity", "log", "sqrt", "np", "pois", "neg-bin", "box-cox")))
stop("The transformation must be one of 'identity', 'log', 'sqrt', 'np', 'pois', 'neg-bin', or 'box-cox'")
# Assign a family for the transformation: Box-Cox or CDF?
transform_family = ifelse(
test = is.element(transformation, c("identity", "log", "sqrt", "box-cox")),
yes = 'bc', no = 'cdf'
)
# Number of observations:
n = length(y)
# Define the transformation:
if(transform_family == 'bc'){
# Lambda value for each Box-Cox argument:
if(transformation == 'identity') lambda = 1
if(transformation == 'log') lambda = 0
if(transformation == 'sqrt') lambda = 1/2
if(transformation == 'box-cox') lambda = runif(n = 1) # random init on (0,1)
# Transformation function:
g = function(t) g_bc(t,lambda = lambda)
# Inverse transformation function:
g_inv = function(s) g_inv_bc(s,lambda = lambda)
# Sum of log-derivatives (for initial log-likelihood):
#g_deriv = function(t) t^(lambda - 1)
sum_log_deriv = (lambda - 1)*sum(log(y+1))
}
if(transform_family == 'cdf'){
# Transformation function:
g = g_cdf(y = y, distribution = transformation)
# Define the grid for approximations using equally-spaced + quantile points:
t_grid = sort(unique(round(c(
seq(0, min(2*max(y), y_max), length.out = 250),
quantile(unique(y[y < y_max] + 1), seq(0, 1, length.out = 250))), 8)))
# Inverse transformation function:
g_inv = g_inv_approx(g = g, t_grid = t_grid)
# Sum of log-derivatives (for initial log-likelihood):
sum_log_deriv = sum(log(pmax(g(y+1, deriv = 1), 0.01)))
# No Box-Cox transformation:
lambda = NULL
}
# Initialize the parameters: add 1 in case of zeros
z_hat = g(y + 1)
fit = estimator(z_hat);
# Check: does the estimator make sense?
if(is.null(fit$fitted.values) || is.null(fit$coefficients))
stop("The estimator() function must return 'fitted.values' and 'coefficients'")
# (Initial) Fitted values:
mu_hat = fit$fitted.values
# (Initial) Coefficients:
theta_hat = fit$coefficients
# (Initial) observation SD:
sigma_hat = sd(z_hat - mu_hat)
# (Initial) log-likelihood:
logLik0 = logLik_em0 =
sum_log_deriv + sum(dnorm(z_hat, mean = mu_hat, sd = sigma_hat, log = TRUE))
# Randomize for EM initialization:
if(sd_init > 0){
z_hat = g(y + 1) + sd_init*sigma_hat*rnorm(n = n)
fit = estimator(z_hat);
mu_hat = fit$fitted.values;
theta_hat = fit$coefficients;
sigma_hat = sd(z_hat - mu_hat)
}
# Number of parameters (excluding sigma)
p = length(theta_hat)
# Lower and upper intervals:
a_y = a_j(y, y_max = y_max); a_yp1 = a_j(y + 1, y_max = y_max)
z_lower = g(a_y); z_upper = g(a_yp1)
# Store the EM trajectories:
mu_all = zhat_all = array(0, c(max_iters, n))
theta_all = array(0, c(max_iters, p)) # Parameters (coefficients)
sigma_all = numeric(max_iters) # SD
logLik_all = numeric(max_iters) # Log-likelihood
for(s in 1:max_iters){
# ----------------------------------
## E-step: impute the latent data
# ----------------------------------
# First and second moments of latent variables:
z_mom = truncnorm_mom(a = z_lower, b = z_upper, mu = mu_hat, sig = sigma_hat)
z_hat = z_mom$m1; z2_hat= z_mom$m2;
# Check: if any infinite z_hat values, return these indices and stop
if(any(is.infinite(z_hat))){
warning('Infinite z_hat values: returning the problematic indices')
return(list(error_inds = which(is.infinite(z_hat))))
}
# ----------------------------------
## M-step: estimation
# ----------------------------------
fit = estimator(z_hat)
mu_hat = fit$fitted.values
theta_hat = fit$coefficients
sigma_hat = sqrt((sum(z2_hat) + sum(mu_hat^2) - 2*sum(z_hat*mu_hat))/n)
# If estimating lambda:
if(transformation == 'box-cox'){
# Negative log-likelihood function
ff <- function(l_bc){
sapply(l_bc, function(l_bc){
-logLikeRcpp(g_a_j = g_bc(a_y, lambda = l_bc),
g_a_jp1 = g_bc(a_yp1, lambda = l_bc),
mu = mu_hat,
sigma = rep(sigma_hat, n))})
}
# Set the search interval
a = 0; b = 1.0;
# The Brent method will error if the function value is infinite;
# a simple (but not rigorous) way to restrict the search interval:
while (ff(b) == Inf){
b = b * 0.8
}
# Tune the tolerance if needed
lambda = BrentMethod(a, b, fcn = ff, tol = .Machine$double.eps^0.2)$x
# Update the transformation and inverse transformation function:
g = function(t) g_bc(t, lambda = lambda)
g_inv = function(s) g_inv_bc(s, lambda = lambda)
# Update the lower and upper limits:
z_lower = g(a_y); z_upper = g(a_yp1)
}
# Update log-likelihood:
logLik_em = logLikeRcpp(g_a_j = z_lower,
g_a_jp1 = z_upper,
mu = mu_hat,
sigma = rep(sigma_hat, n))
# Storage:
mu_all[s,] = mu_hat; theta_all[s,] = theta_hat; sigma_all[s] = sigma_hat; logLik_all[s] = logLik_em; zhat_all[s,] = z_hat
# Check whether to stop:
if((logLik_em - logLik_em0)^2 < tol) break
logLik_em0 = logLik_em
}
# Subset trajectory to the estimated values:
mu_all = mu_all[1:s,]; theta_all = theta_all[1:s,]; sigma_all = sigma_all[1:s]; logLik_all = logLik_all[1:s]; zhat_all = zhat_all[1:s,]
# Also the expected value (fitted values)
# First, estimate an upper bound for the (infinite) summation:
if(y_max < Inf){
Jmax = rep(y_max + 1, n)
} else {
Jmax = round_floor(g_inv(qnorm(0.9999, mean = mu_hat, sd = sigma_hat)), y_max = y_max)
Jmax[Jmax > 2*max(y)] = 2*max(y) # cap at 2*max(y) to avoid excessive computations
}
Jmaxmax = max(Jmax) # overall max
# Point prediction:
y_hat = expectation_gRcpp(g_a_j = g(a_j(0:Jmaxmax, y_max = y_max)),
g_a_jp1 = g(a_j(1:(Jmaxmax + 1), y_max = y_max)),
mu = mu_hat, sigma = rep(sigma_hat, n),
Jmax = Jmax)
# Dunn-Smyth residuals:
resids_ds = qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
# Replicates of Dunn-Smyth residuals:
resids_ds_rep = sapply(1:10, function(...)
qnorm(runif(n)*(pnorm((z_upper - mu_hat)/sigma_hat) -
pnorm((z_lower - mu_hat)/sigma_hat)) +
pnorm((z_lower - mu_hat)/sigma_hat))
)
# Return:
list(coefficients = theta_hat,
fitted.values = y_hat,
g.hat = g,
sigma.hat = sigma_hat,
mu.hat = mu_hat,
z.hat = z_hat,
residuals = resids_ds,
residuals_rep = resids_ds_rep,
logLik = logLik_em,
logLik0 = logLik0,
lambda = lambda,
mu_all = mu_all, theta_all = theta_all, sigma_all = sigma_all, logLik_all = logLik_all, zhat_all = zhat_all, # EM trajectory
y = y, estimator = estimator, transformation = transformation, y_max = y_max, tol = tol, max_iters = max_iters) # And return the info about the model as well
}