pcsstools: Tools for Regression Using Pre-Computed Summary Statistics

Documented in calculate_lm pcsslm

#' Approximate a linear model using PCSS
#' 
#' \code{pcsslm} approximates a linear model of a combination of variables using
#'   precomputed summary statistics.
#' 
#' \code{pcsslm} parses the input \code{formula}'s dependent variable for 
#'   functions such as sums (\code{+}), products (\code{*}), or logical 
#'   operators (\code{|} and \code{&}).
#'   It then identifies models the combination of variables using one of
#'   \code{\link{model_combo}}, \code{\link{model_product}}, 
#'   \code{\link{model_or}}, \code{\link{model_and}}, or 
#'   \code{\link{model_prcomp}}.
#' 
#'   Different precomputed summary statistics are needed inside \code{pcss}
#'   depending on the function that combines the dependent variable.
#'   \itemize{
#'     \item For linear combinations (and principal component analysis), only
#'           \code{n}, \code{means}, and \code{covs} are required
#'     \item For products and logical combinations, the additional items
#'           \code{predictors} and \code{responses} are required.
#'           These are named lists of objects of class \code{predictor}
#'           generated by \code{\link{new_predictor}}, with a \code{predictor}
#'           object for each independent variable in \code{predictors} and
#'           each dependent variable in \code{responses}.
#'           However, if only modeling the product or logical combination of 
#'           only two variables, \code{responses} can be \code{NULL} without
#'           consequence.
#'   }
#'   
#'   If modeling a principal component score of a set of variables, include 
#'   the argument \code{comp} where \code{comp} is an integer indicating which 
#'   principal component score to analyze. Optional logical arguments 
#'   \code{center} and \code{standardize} determine if responses should be
#'   centered and standardized before principal components are calculated. 
#'   
#'   If modeling a linear combination, include the argument \code{phi}, a named
#'   vector of linear weights for each variable in the dependent variable in 
#'   formula.
#'   
#'   If modeling a product, include the argument \code{response}, a character
#'   equal to either \code{"continuous"} or \code{"binary"}. If \code{"binary"},
#'   specialized approximations are performed to estimate means and variances.
#' 
#' @param formula an object of class formula whose dependent variable is a 
#'     combination of variables and logical | operators. 
#'     All model terms must have appropriate PCSS in \code{pcss}.
#' @param pcss a list of precomputed summary statistics. In all cases, this
#'     should include \code{n}: the sample size, \code{means}: a named vector of
#'     predictor and response means, and \code{covs}: a named covariance matrix
#'     including all predictors and responses. See Details for more information.
#' @param ... additional arguments. See Details for more information.
#' 
#' @seealso \code{\link{model_combo}}, \code{\link{model_product}}, 
#'   \code{\link{model_or}}, \code{\link{model_and}}, and 
#'   \code{\link{model_prcomp}}.
#' 
#' @return an object of class \code{"pcsslm"}.
#' 
#'   An object of class \code{"pcsslm"} is a list containing at least the 
#'   following components:
#'     \item{call}{the matched call}
#'     \item{terms}{the \code{terms} object used}
#'     \item{coefficients}{a \eqn{p x 4} matrix with columns for the 
#'       estimated coefficient, its standard error, t-statistic and
#'       corresponding (two-sided) p-value.}
#'     \item{sigma}{the square root of the estimated variance of the random
#'       error.}
#'     \item{df}{degrees of freedom, a 3-vector \eqn{p, n-p, p*}, the
#'       first being the number of non-aliased coefficients, the last being
#'       the total number of coefficients.}
#'     \item{fstatistic}{a 3-vector with the value of the F-statistic with its
#'       numerator and denominator degrees of freedom.}
#'     \item{r.squared}{\eqn{R^2}, the 'fraction of variance explained by the 
#'       model'.}
#'     \item{adj.r.squared}{the above \eqn{R^2} statistic \emph{'adjusted'},
#'       penalizing for higher \eqn{p}.}
#'     \item{cov.unscaled}{a \eqn{p x p} matrix of (unscaled) covariances of the
#'       \eqn{coef[j], j=1,...p}.}
#'     \item{Sum Sq}{a 3-vector with the model's Sum of Squares Regression 
#'       (SSR), Sum of Squares Error (SSE), and Sum of Squares Total (SST).}
#'   
#' @importFrom Rdpack reprompt
#' 
#' @examples 
#' ## Principal Component Analysis
#' ex_data <- pcsstools_example[c("g1", "x1", "y1", "y2", "y3")]
#' pcss <- list(
#'   means = colMeans(ex_data),
#'   covs = cov(ex_data),
#'   n = nrow(ex_data)
#' )
#' 
#' pcsslm(y1 + y2 + y3 ~ g1 + x1, pcss = pcss, comp = 1)
#' 
#' ## Linear combination of variables
#' ex_data <- pcsstools_example[c("g1", "g2", "y1", "y2")]
#' pcss <- list(
#'   means = colMeans(ex_data),
#'   covs = cov(ex_data),
#'   n = nrow(ex_data)
#' )
#' 
#' pcsslm(y1 + y2 ~ g1 + g2, pcss = pcss, phi = c(1, -1))
#' summary(lm(y1 - y2 ~ g1 + g2, data = ex_data))
#' 
#' ## Product of variables
#' ex_data <- pcsstools_example[c("g1", "x1", "y4", "y5", "y6")]
#' 
#' pcss <- list(
#'   means = colMeans(ex_data),
#'   covs = cov(ex_data),
#'   n = nrow(ex_data),
#'   predictors = list(
#'     g1 = new_predictor_snp(maf = mean(ex_data$g1) / 2),
#'     x1 = new_predictor_normal(mean = mean(ex_data$x1), sd = sd(ex_data$x1))
#'   ),
#'   responses = lapply(
#'     colMeans(ex_data)[3:length(colMeans(ex_data))], 
#'     new_predictor_binary
#'   )
#' )
#'
#' pcsslm(y4 * y5 * y6 ~ g1 + x1, pcss = pcss, response = "binary")
#' summary(lm(y4 * y5 * y6 ~ g1 + x1, data = ex_data))
#' 
#' ## Disjunct (OR statement) of variables
#' ex_data <- pcsstools_example[c("g1", "x1", "y4", "y5")]
#' 
#' pcss <- list(
#'   means = colMeans(ex_data),
#'   covs = cov(ex_data),
#'   n = nrow(ex_data),
#'   predictors = list(
#'     g1 = new_predictor_snp(maf = mean(ex_data$g1) / 2),
#'     x1 = new_predictor_normal(mean = mean(ex_data$x1), sd = sd(ex_data$x1))
#'   )
#' )
#' pcsslm(y4 | y5 ~ g1 + x1, pcss = pcss) 
#' summary(lm(y4 | y5 ~ g1 + x1, data = ex_data))
#' 
#' @references{
#' 
#'   \insertRef{wolf_using_2021}{pcsstools}
#'   
#'   \insertRef{wolf_computationally_2020}{pcsstools}
#'   
#'   \insertRef{gasdaska_leveraging_2019}{pcsstools}
#' 
#' }
#'   
#' @export
pcsslm <- function(formula, pcss = list(), ...) {
  cl <- match.call()
  
  args <- c(append(list(formula = formula), pcss), list(...))

  # First check if argument comp provided. If so, assume this is PCA
  if ("comp" %in% names(list(...))) {
    model_func <- "model_prcomp"
  } else {
    model_func <- guess_response(extract_response(formula))
  }
  
  re <- do.call(model_func, args)
  re$call <- cl
  
  class(re) <- "pcsslm"
  
  return(re)
}


#' Calculate a linear model using PCSS
#'
#' \code{calculate_lm} describes the linear model of the last listed variable
#' in \code{means} and \code{covs} as a function of all other variables in
#' \code{means} and \code{covs}.
#'
#' @param means a vector of means of all model predictors and the response with
#'   the last element the response mean.
#' @param covs a matrix of the covariance of all model predictors and the
#'   response with the order of rows/columns corresponding to the order of
#'   \code{means}.
#' @param n sample size
#' @param add_intercept logical. If \code{TRUE} adds an intercept to the model.
#' @param keep_pcss logical. If \code{TRUE}, returns \code{means} and 
#'   \code{covs}.
#' @param terms terms
#'
#' @importFrom stats pt
#'
#' @references{
#' 
#'   \insertRef{wolf_using_2021}{pcsstools}
#'   
#'   \insertRef{wolf_computationally_2020}{pcsstools}
#'   
#'   \insertRef{gasdaska_leveraging_2019}{pcsstools}
#' 
#' }
#'@inherit pcsslm return
calculate_lm <- function(means, covs, n, add_intercept = FALSE, 
                         keep_pcss = FALSE, terms = NULL) {
  cl <- match.call()
  p <- ncol(covs) - 1
  if (add_intercept) {
    p <- p + 1
    covs <- rbind(0, cbind(0, covs))
    means <- c(1, means)
    if (!is.null(dimnames(covs)) & !is.null(dimnames(covs))) {
      names(means)[1] <- "(Intercept)"
      colnames(covs)[1] <- "(Intercept)"
      rownames(covs)[1] <- "(Intercept)"
    }
  }

  covX <- covs[-(p + 1), -(p + 1)]
  meanX <- means[-(p + 1)]
  covXY <- covs[p + 1, -(p + 1)]
  varY <- covs[p + 1, p + 1]
  meanY <- means[p + 1]
  
  yty <- (n - 1) * varY + n * meanY^2
  
  aliased <- rep(FALSE, p)
  names(aliased) <- names(meanX)
  while(TRUE) {
  
    XtX <- (n - 1) * covX + n * meanX %*% t(meanX)
    Xty <- (n - 1) * covXY + n * meanX * means[p + 1]
    
    # Check if XtX can be inverted
    XtX1 <- try(solve(XtX), silent = T)
    if ("try-error" %in% class(XtX1)) {
      rankifremoved <- sapply(1:ncol(XtX), function (x) qr(XtX[,-x])$rank)
      # Remove last column with linear dependence
      lind.col <- max(which(rankifremoved == max(rankifremoved)))
      covX <- covX[-lind.col, -lind.col]
      meanX <- meanX[-lind.col]
      covXY <- covXY[-lind.col]
      aliased[lind.col] <- TRUE
    }
    else{
      break()
    }
  }
  
  p <- length(aliased[!aliased])

  beta <- drop(XtX1 %*% Xty)

  sigma2 <- drop((yty - t(beta) %*% Xty) / (n - p))

  var_beta <- diag(sigma2 * XtX1)
  sd_beta <- sqrt(var_beta)

  t_stat <- beta / sd_beta
  p_val <- 2 * pt(abs(t_stat), df = n - p, lower.tail = F)

  coefficients <- cbind(beta, sd_beta, t_stat, p_val)
  colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")

  sigma <- sqrt(sigma2)

  df <- c(p, n - p, length(aliased))


  SSE <- drop(yty - t(Xty) %*% XtX1 %*% Xty)
  SST <- (n - 1) * varY
  SSR <- SST - SSE

  SS <- c(SSR = SSR, SSE = SSE, SST = SST)

  MSR <- SSR / (p - 1)
  MSE <- SSE / (n - p)

  r.squared <- 1 - (SSE / SST)
  adj.r.squared <- 1 - (1 - r.squared) * (n - 1) / (n - p)

  fstatistic <- c(value = MSR / MSE, numdf = p - 1, dendf = n - p)

  cov.unscaled <- XtX1

  re <- list(
    call = cl,
    terms = terms,
    coefficients = coefficients,
    aliased = aliased,
    sigma = sigma,
    df = df,
    r.squared = r.squared,
    adj.r.squared = adj.r.squared,
    fstatistic = fstatistic,
    cov.unscaled = cov.unscaled,
    `Sum Sq` = SS
  )
  
  if (keep_pcss) {
    re$means <- means
    re$covs <- covs
  }
  
  class(re) <- "pcsslm"

  return(re)
}