R/gpcr.R

Defines functions summary.gpcrBoot print.gpcrBoot gpcr.boot rankest.gpcr print.gpcr summary.gpcr predict.gpcr gpcr

Documented in gpcr gpcr.boot predict.gpcr print.gpcr print.gpcrBoot rankest.gpcr summary.gpcr summary.gpcrBoot

#' Generalized Principal Components Regression
#'
#' @description Principal Components Regression utilizes PCA to decompose a model matrix of numeric predictors
#' into its principal components and then fits a linear model to the components. Afterwards, the coefficients
#' are projected back to obtain the coefficients for the original set of variables. Here this process is
#' extended to generalized linear models, allowing for non-Gaussian outcomes in the exponential family.
#' \cr
#' Principal Components Regression, like any method, has both advantages and disadvantages. One advantage is that coefficients
#' reflecting the original predictors are returned, while still only having to perform regression on k principal components.
#' This obviates the sometimes difficult and awkward task of interpreting the principal components. The selection of the
#' leading k principal components is based on the assumption that components which explain a lot of variance in the design
#' matrix will also explain variance in the outcome variable. This is not always the case. Furthermore as a regularization
#' method, PCR lacks a smooth relationship of the coefficients to the coefficients of the unpenalized model when compared
#' to methods like ridge regression. \cr
#' \cr
#' An alternative method known as projection to latent structures (or partial least squares), available in this package
#' via the function \code{\link{gpls}}, is an approach more closely related to factor analysis than to PCA. Other good
#' alternatives include any of the functions available for sufficient dimension reduction (see \code{\link{sdr}}) and the
#' envelope methods available in \code{\link{ENV}} and \code{\link{GLENV}}. \cr
#' @param formula a model formula
#' @param data a data frame
#' @param ncomp The number of principal components to retain. If left as NULL the minimum of P-1 and the number of non-zero
#' eigenvalues will be returned.
#' @param family one of the glm families supported by R's glm function.
#' @return an object with classes "gpcr".
#' @export
#' @examples gpcr(prog ~ ., diabetes)
gpcr <- function(formula, data, ncomp = NULL, family = gaussian()){

  this.call <- match.call()
  family <- family
  no_dispersion <- family$family %in% c("poisson", "binomial")
  mf <- model.frame(formula, data); xnames <- colnames(mf)[-1]
  terms <- terms(mf)
  y <- mf[,1]; rm(mf)
  mm <- model.matrix(formula, data)[,-1]

  if (is.null(ncomp)){
    ncomp <- min(c(sum(wsvd(mm)$d > 0), ncol(mm)-1))
  }
  if (ncomp==1){
    SVD <- svd(sweep(mm, 2, colMeans(mm)))
    pca.results <- list()
    pca.results$values <- SVD$d^2 / nrow(mm)
    pca.results$vectors <- matrix(SVD$v[,1], ncol = 1)
    pca.results$scores <- matrix(mm %*% pca.results$vectors, ncol = 1)
  }
  else{
    pca.results = cvreg::pca(mm, ncomp = ncomp)
  }
  W <- pca.results$scores
  fit <- list()
  fit0 <- glm(y ~ ., cbind.data.frame(y=y, W), family = family)
  pc.coef <- as.vector(fit0$coefficients)[-1]; intercept <- fit0$coefficients[1]
  coef <- drop(as.matrix(pca.results$vectors) %*% matrix(pc.coef, ncol = 1))
  fit <- list()
  fit$coefficients <- c(intercept, coef)
  fit$linear.predictors <- fit0$linear.predictors
  fit$fitted.values <- fit0$fitted.values
  fit$residuals <- fit0$residuals
  fit$terms <- terms
  reconstruction <- as.matrix(pca.results$scores) %*% as.matrix(pseudoinverse(pca.results$vectors))
  colnames(reconstruction) <- xnames
  fit$qr <- qr(reconstruction)
  fit$R <- qr.R(fit$qr)
  fit$prior.weights <- fit0$prior.weights
  fit$weights <- fit0$weights
  fit$control <- fit0$control
  fit$method <- fit0$contrasts
  fit$contrasts <- fit0$contrasts
  fit$rank <- fit0$rank
  fit$deviance.residuals <- family$dev.res(y, fit$fitted.values, rep(1, length(y)))
  fit$df.residual <- fit0$df.residual
  fit$df.null <- fit0$df.null
  fit$deviance <- fit0$deviance
  fit$null.deviance <- fit0$null.deviance
  fit$aic <- AIC(fit0)
  fit$bic <- BIC(fit0)

  if (fit$df.residual == 0 & !no_dispersion) {
    fit$dispersion <- NA_real_
  }
  else if (fit$df.residual > 0 & !no_dispersion){
    fit$dispersion <- sum((fit$weights * fit$residuals^2))/fit$df.residual
  }
  else if (no_dispersion){
    fit$disperson <- 1
  }

  fit$vcov.unscaled <- solve(cvreg:::.ridgeinv(pseudoinverse(crossprod(cbind(1, reconstruction)))))
  fit$vcov <- fit$vcov.unscaled * fit$dispersion
  fit$std.err <- sqrt(diag(fit$vcov))
  fit$family <- family$family
  fit$link <- family$link
  fit$loadings <- pca.results$loadings
  fit$vectors  <- pca.results$vectors
  fit$scores <- pca.results$scores
  fit$call <- this.call
  class(fit) <-  "gpcr"
  return(fit)
}



#' Predict Method for gpcr Fits
#'
#' @param fit a fitted object of class "gpcr"
#' @param newdata optionally, a data frame in which to look for variables with
#' which to predict. If omitted, the fitted linear predictors are used.
#' @param type the type of prediction required. the default is on the scale
#' of the linear predictors; the alternative "response" is on the scale of
#' the response variable. Thus for a default binomial model the default
#' predictions are of log-odds (probabilities on logit scale) and
#' type = "response" gives the predicted probabilities.
#' @method predict gpcr
#' @return a vector of predictions
#' @export
#' @keywords internal
predict.gpcr <- function(fit, newdata = NULL, type = c("link", "response")){
  type <- match.arg(type)
  class(fit) <- "glm"
  suppressMessages(suppressWarnings(stats::predict.glm(fit, newdata = newdata, type = type)))
}



#' Summary method for gpcr models
#'
#' @param model a fitted model object
#' @method summary gpcr
#' @return a summary
#' @export
#' @keywords internal
summary.gpcr <- function(model){
  purple = crayon::make_style(rgb(0.565, 0, 0.769))
  cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
  cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
  results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 4), round(model$coefficients/model$std.err, 3), round(2 * pnorm(abs(model$coefficients/model$std.err), lower.tail = F), 5))
  colnames(results) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
  return(results)
}

#' Print method for gpcr models
#'
#' @param model the model
#' @method print gpcr
#' @return none
#' @export
#' @keywords internal
print.gpcr <- function(model){
  purple = crayon::make_style(rgb(0.565, 0, 0.769))
  cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
  cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
  results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 4), round(model$coefficients/model$std.err, 3), round(2 * pnorm(abs(model$coefficients/model$std.err), lower.tail = F), 5))
  colnames(results) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
  orange = crayon::make_style(rgb(1, 0.314, 0))
  cat(orange("\nCoefficients: \n\n"))
  print(results)
}



#' Estimate number of principal components to retain for gpcr
#'
#' @param formula model formula
#' @param data data frame. should only contain numeric covariates!
#' @param max.ncomp maximum number of components to try
#' @param family a glm family. defaults to gaussian()
#' @param crit one of "bic" or "aic". defaults to "bic" as the BIC tends
#' to be more parsimonious than AIC.
#' @param print should the results be printed to the console? defaults to TRUE.
#' @param plot should the results be visualized in a plot? defaults to TRUE.
#'
#' @return invisibly returns an integer for the number of factors.
#' @export
rankest.gpcr <- function(formula, data, max.ncomp = NULL, family = gaussian(), crit = c("bic", "aic"), print = TRUE, plot = TRUE){
  crit <- match.arg(crit)
  mm <- model.matrix(formula, data)[,-1]
  if (is.null(max.ncomp)){
    max.ncomp <- min(c(sum(wsvd(mm)$d > 0), ncol(mm)-1))
  }
  values <- eigen(cor(mm), T, T)$values

  if (crit == "bic")
    ic <- sapply(1:max.ncomp, function(n) gpcr(formula, data, n, family)$bic)
  else
    ic <- sapply(1:max.ncomp, function(n) gpcr(formula, data, n, family)$aic)

  rank.est <- seq(1:max.ncomp)[which.min(ic)]
  if (print) cat(cvreg::.crimson(paste0("Optimal Rank: ", rank.est, " (" , ifelse(crit=="bic", "BIC = ", "AIC = "), round(min(ic),3) , ") ")))
  if (plot){
    par(mfrow)
    plot(x = 1:max.ncomp, y = ic, xlab = "number of principal components", ylab = ifelse(crit=="bic", "BIC", "AIC"), type = "b", pch = 21)
    points(rank.est, min(ic), col = "red", bg = "red", pch = 21)
  }
  invisible(rank.est)
}


#' Bootstrap Generalized Principal Components Regression Coefficients
#'
#' @description This function provides a means of obtaining bias corrected
#' and accelerated bootstrap confidence intervals for the regression coefficients
#' (reflecting the original full set of predictors) as well as exact empirical
#' p-values.
#'
#' @param formula a model formula
#' @param data a data frame
#' @param ncomp The number of principal components to retain. If left as NULL the minimum of P-1 and the number of non-zero
#' eigenvalues will be returned.
#' @param family one of the glm families supported by R's glm function.
#' @param nsamp number of bootstrap samples
#' @param parallel whether to use parallel processing for bootstrapping
#' @param ncores if parallel=TRUE, the number of cores. defaults to 4.
#' @param conf.level the confidence level for the confidence intervals. defaults to 0.95
#' @return an object with class "gpcrBoot".
#' @export
#'
gpcr.boot <- function(formula, data, ncomp = NULL, family = gaussian(), nsamp = 1000, parallel = T, ncores = 4, conf.level = 0.95){
  this.call <- match.call()
  out0 <- gpcr(formula, data, ncomp, family)

  make.bootfun <- function(formula, ncomp, family){
    function(data){
      varnames <- colnames(data)
      kern <- c("epan", "tri", "optco", "biweight")
      kern <- kern[sample(1:4, 1)]
      if (family$family != "binomial")
        data <- as.data.frame(rmvk(nrow(data), y = data, kernel = kern, shrinked = T))
      else
        rowid <- sample(1:nrow(data), size = nrow(data), replace = T)
        data <- data[rowid, ]

      colnames(data) <- varnames
      gpcr(formula, data, ncomp, family)$coefficients
    }
  }
  boot.fun <- make.bootfun(formula, ncomp, family)
  suppressPackageStartupMessages(require(parallel))
  suppressPackageStartupMessages(require(doParallel))
  suppressPackageStartupMessages(require(foreach))
  if (parallel){
    if (is.null(ncores)){ncores <- parallel::detectCores()}
    cl <- parallel::makeCluster(ncores)
    doParallel::registerDoParallel(cl)
    boot = foreach(i = 1:nsamp, .combine = rbind, .packages= c("kernelboot","cvreg")) %dopar%
      boot.fun(data)
    parallel::stopCluster(cl)
  }
  else{
    boot = sapply(1:nsamp, function(n) boot.fun(data))
    boot = t(boot)
  }
  out <- list(samples = boot)
  colnames(out$samples) <- names(out0$coefficients)
  confidence.intervals <- sapply(1:ncol(out$samples),function(i) bci(out$samples[,i], conf.level))
  out$p.values <- sapply(1:ncol(out$samples),function(i) cvreg:::exact_pvalue(out$samples[,i]))
  confidence.intervals <- t(confidence.intervals)
  colnames(confidence.intervals) <- c(paste0("lower ", 100*conf.level, "% CI" ), paste0("upper ", 100*conf.level, "% CI"))
  out$coefficients <- out0$coefficients
  out$std.err <- sapply(1:ncol(out$samples), function(x) sqrt(mean((out$samples[,x]-mean(out$samples[,x]))^2)))
  out$confidence.intervals <- confidence.intervals
  out$call <- this.call
  out$family <- out0$family
  return(structure(out, class = "gpcrBoot"))
}

#' Print method for gpcrBoot models
#'
#' @param model the model
#' @method print gpcrBoot
#' @return none
#' @export
#' @keywords internal
print.gpcrBoot <- function(model){
  purple = crayon::make_style(rgb(0.565, 0, 0.769))
  cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
  cat(crayon::green("\nFamily: "), noquote(model$family), "\n", sep = "")
  results = cbind.data.frame(coefs = round(model$coefficients, 3), round(model$std.err, 3), round(model$p.val, 4), round(model$confidence.intervals, 3))
  colnames(results) <- c("Estimate", "Std. Error", "Pr(>|t|)", "Lower", "Upper")
  orange = crayon::make_style(rgb(1, 0.314, 0))
  cat(orange("\nCoefficients: \n\n"))
  print(results)
}


#' Summary method for gpcrBoot models
#'
#' @param model the model
#' @method summary gpcrBoot
#' @return a data frame
#' @export
#' @keywords internal
summary.gpcrBoot <- function(model){
  results = cbind.data.frame(coefs = round(model$coefficients, 6), round(model$std.err, 6), model$p.val,  model$confidence.intervals)
  colnames(results) <- c("Estimate", "Std. Error", "Pr(>|t|)", "Lower", "Upper")
  return(as.data.frame(results))
}
abnormally-distributed/cvreg documentation built on May 3, 2020, 3:45 p.m.