#' Generalized Principal Components Regression
#'
#' @description Principal Components Regression utilizes PCA to decompose a model matrix of numeric predictors
#' into its principal components and then fits a linear model to the components. Afterwards, the coefficients
#' are projected back to obtain the coefficients for the original set of variables. Here this process is
#' extended to generalized linear models, allowing for non-Gaussian outcomes in the exponential family.
#' \cr
#' Principal Components Regression, like any method, has both advantages and disadvantages. One advantage is that coefficients
#' reflecting the original predictors are returned, while still only having to perform regression on k principal components.
#' This obviates the sometimes difficult and awkward task of interpreting the principal components. The selection of the
#' leading k principal components is based on the assumption that components which explain a lot of variance in the design
#' matrix will also explain variance in the outcome variable. This is not always the case. Furthermore as a regularization
#' method, PCR lacks a smooth relationship of the coefficients to the coefficients of the unpenalized model when compared
#' to methods like ridge regression. \cr
#' \cr
#' An alternative method known as projection to latent structures (or partial least squares), available in this package
#' via the function \code{\link{gpls}}, is an approach more closely related to factor analysis than to PCA. Other good
#' alternatives include any of the functions available for sufficient dimension reduction (see \code{\link{sdr}}) and the
#' envelope methods available in \code{\link{ENV}} and \code{\link{GLENV}}. \cr
#' @param formula a model formula
#' @param data a data frame
#' @param ncomp The number of principal components to retain. If left as NULL the minimum of P-1 and the number of non-zero
#' eigenvalues will be returned.
#' @param family one of the glm families supported by R's glm function.
#' @return an object with classes "gpcr".
#' @export
#' @examples gpcr(prog ~ ., diabetes)
gpcr <- function(formula, data, ncomp = NULL, family = gaussian()){
this.call <- match.call()
family <- family
no_dispersion <- family$family %in% c("poisson", "binomial")
mf <- model.frame(formula, data); xnames <- colnames(mf)[-1]
terms <- terms(mf)
y <- mf[,1]; rm(mf)
mm <- model.matrix(formula, data)[,-1]
if (is.null(ncomp)){
ncomp <- min(c(sum(wsvd(mm)$d > 0), ncol(mm)-1))
}
if (ncomp==1){
SVD <- svd(sweep(mm, 2, colMeans(mm)))
pca.results <- list()
pca.results$values <- SVD$d^2 / nrow(mm)
pca.results$vectors <- matrix(SVD$v[,1], ncol = 1)
pca.results$scores <- matrix(mm %*% pca.results$vectors, ncol = 1)
}
else{
pca.results = cvreg::pca(mm, ncomp = ncomp)
}
W <- pca.results$scores
fit <- list()
fit0 <- glm(y ~ ., cbind.data.frame(y=y, W), family = family)
pc.coef <- as.vector(fit0$coefficients)[-1]; intercept <- fit0$coefficients[1]
coef <- drop(as.matrix(pca.results$vectors) %*% matrix(pc.coef, ncol = 1))
fit <- list()
fit$coefficients <- c(intercept, coef)
fit$linear.predictors <- fit0$linear.predictors
fit$fitted.values <- fit0$fitted.values
fit$residuals <- fit0$residuals
fit$terms <- terms
reconstruction <- as.matrix(pca.results$scores) %*% as.matrix(pseudoinverse(pca.results$vectors))
colnames(reconstruction) <- xnames
fit$qr <- qr(reconstruction)
fit$R <- qr.R(fit$qr)
fit$prior.weights <- fit0$prior.weights
fit$weights <- fit0$weights
fit$control <- fit0$control
fit$method <- fit0$contrasts
fit$contrasts <- fit0$contrasts
fit$rank <- fit0$rank
fit$deviance.residuals <- family$dev.res(y, fit$fitted.values, rep(1, length(y)))
fit$df.residual <- fit0$df.residual
fit$df.null <- fit0$df.null
fit$deviance <- fit0$deviance
fit$null.deviance <- fit0$null.deviance
fit$aic <- AIC(fit0)
fit$bic <- BIC(fit0)
if (fit$df.residual == 0 & !no_dispersion) {
fit$dispersion <- NA_real_
}
else if (fit$df.residual > 0 & !no_dispersion){
fit$dispersion <- sum((fit$weights * fit$residuals^2))/fit$df.residual
}
else if (no_dispersion){
fit$disperson <- 1
}
fit$vcov.unscaled <- solve(cvreg:::.ridgeinv(pseudoinverse(crossprod(cbind(1, reconstruction)))))
fit$vcov <- fit$vcov.unscaled * fit$dispersion
fit$std.err <- sqrt(diag(fit$vcov))
fit$family <- family$family
fit$link <- family$link
fit$loadings <- pca.results$loadings
fit$vectors <- pca.results$vectors
fit$scores <- pca.results$scores
fit$call <- this.call
class(fit) <- "gpcr"
return(fit)
}
#' Predict Method for gpcr Fits
#'
#' @param fit a fitted object of class "gpcr"
#' @param newdata optionally, a data frame in which to look for variables with
#' which to predict. If omitted, the fitted linear predictors are used.
#' @param type the type of prediction required. the default is on the scale
#' of the linear predictors; the alternative "response" is on the scale of
#' the response variable. Thus for a default binomial model the default
#' predictions are of log-odds (probabilities on logit scale) and
#' type = "response" gives the predicted probabilities.
#' @method predict gpcr
#' @return a vector of predictions
#' @export
#' @keywords internal
predict.gpcr <- function(fit, newdata = NULL, type = c("link", "response")){
type <- match.arg(type)
class(fit) <- "glm"
suppressMessages(suppressWarnings(stats::predict.glm(fit, newdata = newdata, type = type)))
}
#' Summary method for gpcr models
#'
#' @param model a fitted model object
#' @method summary gpcr
#' @return a summary
#' @export
#' @keywords internal
summary.gpcr <- function(model){
purple = crayon::make_style(rgb(0.565, 0, 0.769))
cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 4), round(model$coefficients/model$std.err, 3), round(2 * pnorm(abs(model$coefficients/model$std.err), lower.tail = F), 5))
colnames(results) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
return(results)
}
#' Print method for gpcr models
#'
#' @param model the model
#' @method print gpcr
#' @return none
#' @export
#' @keywords internal
print.gpcr <- function(model){
purple = crayon::make_style(rgb(0.565, 0, 0.769))
cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 4), round(model$coefficients/model$std.err, 3), round(2 * pnorm(abs(model$coefficients/model$std.err), lower.tail = F), 5))
colnames(results) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
orange = crayon::make_style(rgb(1, 0.314, 0))
cat(orange("\nCoefficients: \n\n"))
print(results)
}
#' Estimate number of principal components to retain for gpcr
#'
#' @param formula model formula
#' @param data data frame. should only contain numeric covariates!
#' @param max.ncomp maximum number of components to try
#' @param family a glm family. defaults to gaussian()
#' @param crit one of "bic" or "aic". defaults to "bic" as the BIC tends
#' to be more parsimonious than AIC.
#' @param print should the results be printed to the console? defaults to TRUE.
#' @param plot should the results be visualized in a plot? defaults to TRUE.
#'
#' @return invisibly returns an integer for the number of factors.
#' @export
rankest.gpcr <- function(formula, data, max.ncomp = NULL, family = gaussian(), crit = c("bic", "aic"), print = TRUE, plot = TRUE){
crit <- match.arg(crit)
mm <- model.matrix(formula, data)[,-1]
if (is.null(max.ncomp)){
max.ncomp <- min(c(sum(wsvd(mm)$d > 0), ncol(mm)-1))
}
values <- eigen(cor(mm), T, T)$values
if (crit == "bic")
ic <- sapply(1:max.ncomp, function(n) gpcr(formula, data, n, family)$bic)
else
ic <- sapply(1:max.ncomp, function(n) gpcr(formula, data, n, family)$aic)
rank.est <- seq(1:max.ncomp)[which.min(ic)]
if (print) cat(cvreg::.crimson(paste0("Optimal Rank: ", rank.est, " (" , ifelse(crit=="bic", "BIC = ", "AIC = "), round(min(ic),3) , ") ")))
if (plot){
par(mfrow)
plot(x = 1:max.ncomp, y = ic, xlab = "number of principal components", ylab = ifelse(crit=="bic", "BIC", "AIC"), type = "b", pch = 21)
points(rank.est, min(ic), col = "red", bg = "red", pch = 21)
}
invisible(rank.est)
}
#' Bootstrap Generalized Principal Components Regression Coefficients
#'
#' @description This function provides a means of obtaining bias corrected
#' and accelerated bootstrap confidence intervals for the regression coefficients
#' (reflecting the original full set of predictors) as well as exact empirical
#' p-values.
#'
#' @param formula a model formula
#' @param data a data frame
#' @param ncomp The number of principal components to retain. If left as NULL the minimum of P-1 and the number of non-zero
#' eigenvalues will be returned.
#' @param family one of the glm families supported by R's glm function.
#' @param nsamp number of bootstrap samples
#' @param parallel whether to use parallel processing for bootstrapping
#' @param ncores if parallel=TRUE, the number of cores. defaults to 4.
#' @param conf.level the confidence level for the confidence intervals. defaults to 0.95
#' @return an object with class "gpcrBoot".
#' @export
#'
gpcr.boot <- function(formula, data, ncomp = NULL, family = gaussian(), nsamp = 1000, parallel = T, ncores = 4, conf.level = 0.95){
this.call <- match.call()
out0 <- gpcr(formula, data, ncomp, family)
make.bootfun <- function(formula, ncomp, family){
function(data){
varnames <- colnames(data)
kern <- c("epan", "tri", "optco", "biweight")
kern <- kern[sample(1:4, 1)]
if (family$family != "binomial")
data <- as.data.frame(rmvk(nrow(data), y = data, kernel = kern, shrinked = T))
else
rowid <- sample(1:nrow(data), size = nrow(data), replace = T)
data <- data[rowid, ]
colnames(data) <- varnames
gpcr(formula, data, ncomp, family)$coefficients
}
}
boot.fun <- make.bootfun(formula, ncomp, family)
suppressPackageStartupMessages(require(parallel))
suppressPackageStartupMessages(require(doParallel))
suppressPackageStartupMessages(require(foreach))
if (parallel){
if (is.null(ncores)){ncores <- parallel::detectCores()}
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
boot = foreach(i = 1:nsamp, .combine = rbind, .packages= c("kernelboot","cvreg")) %dopar%
boot.fun(data)
parallel::stopCluster(cl)
}
else{
boot = sapply(1:nsamp, function(n) boot.fun(data))
boot = t(boot)
}
out <- list(samples = boot)
colnames(out$samples) <- names(out0$coefficients)
confidence.intervals <- sapply(1:ncol(out$samples),function(i) bci(out$samples[,i], conf.level))
out$p.values <- sapply(1:ncol(out$samples),function(i) cvreg:::exact_pvalue(out$samples[,i]))
confidence.intervals <- t(confidence.intervals)
colnames(confidence.intervals) <- c(paste0("lower ", 100*conf.level, "% CI" ), paste0("upper ", 100*conf.level, "% CI"))
out$coefficients <- out0$coefficients
out$std.err <- sapply(1:ncol(out$samples), function(x) sqrt(mean((out$samples[,x]-mean(out$samples[,x]))^2)))
out$confidence.intervals <- confidence.intervals
out$call <- this.call
out$family <- out0$family
return(structure(out, class = "gpcrBoot"))
}
#' Print method for gpcrBoot models
#'
#' @param model the model
#' @method print gpcrBoot
#' @return none
#' @export
#' @keywords internal
print.gpcrBoot <- function(model){
purple = crayon::make_style(rgb(0.565, 0, 0.769))
cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 3), round(model$p.val, 4), round(model$confidence.intervals, 3))
colnames(results) <- c("Estimate", "Std. Error", "Pr(>|t|)", "Lower", "Upper")
orange = crayon::make_style(rgb(1, 0.314, 0))
cat(orange("\nCoefficients: \n\n"))
print(results)
}
#' Summary method for gpcrBoot models
#'
#' @param model the model
#' @method summary gpcrBoot
#' @return a data frame
#' @export
#' @keywords internal
summary.gpcrBoot <- function(model){
results = cbind.data.frame(coefs = round(model$coefficients, 6), round(model$std.err, 6), model$p.val, model$confidence.intervals)
colnames(results) <- c("Estimate", "Std. Error", "Pr(>|t|)", "Lower", "Upper")
return(as.data.frame(results))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.