#' Approximate a linear model using PCSS
#'
#' \code{pcsslm} approximates a linear model of a combination of variables using
#' precomputed summary statistics.
#'
#' \code{pcsslm} parses the input \code{formula}'s dependent variable for
#' functions such as sums (\code{+}), products (\code{*}), or logical
#' operators (\code{|} and \code{&}).
#' It then identifies models the combination of variables using one of
#' \code{\link{model_combo}}, \code{\link{model_product}},
#' \code{\link{model_or}}, \code{\link{model_and}}, or
#' \code{\link{model_prcomp}}.
#'
#' Different precomputed summary statistics are needed inside \code{pcss}
#' depending on the function that combines the dependent variable.
#' \itemize{
#' \item For linear combinations (and principal component analysis), only
#' \code{n}, \code{means}, and \code{covs} are required
#' \item For products and logical combinations, the additional items
#' \code{predictors} and \code{responses} are required.
#' These are named lists of objects of class \code{predictor}
#' generated by \code{\link{new_predictor}}, with a \code{predictor}
#' object for each independent variable in \code{predictors} and
#' each dependent variable in \code{responses}.
#' However, if only modeling the product or logical combination of
#' only two variables, \code{responses} can be \code{NULL} without
#' consequence.
#' }
#'
#' If modeling a principal component score of a set of variables, include
#' the argument \code{comp} where \code{comp} is an integer indicating which
#' principal component score to analyze. Optional logical arguments
#' \code{center} and \code{standardize} determine if responses should be
#' centered and standardized before principal components are calculated.
#'
#' If modeling a linear combination, include the argument \code{phi}, a named
#' vector of linear weights for each variable in the dependent variable in
#' formula.
#'
#' If modeling a product, include the argument \code{response}, a character
#' equal to either \code{"continuous"} or \code{"binary"}. If \code{"binary"},
#' specialized approximations are performed to estimate means and variances.
#'
#' @param formula an object of class formula whose dependent variable is a
#' combination of variables and logical | operators.
#' All model terms must have appropriate PCSS in \code{pcss}.
#' @param pcss a list of precomputed summary statistics. In all cases, this
#' should include \code{n}: the sample size, \code{means}: a named vector of
#' predictor and response means, and \code{covs}: a named covariance matrix
#' including all predictors and responses. See Details for more information.
#' @param ... additional arguments. See Details for more information.
#'
#' @seealso \code{\link{model_combo}}, \code{\link{model_product}},
#' \code{\link{model_or}}, \code{\link{model_and}}, and
#' \code{\link{model_prcomp}}.
#'
#' @return an object of class \code{"pcsslm"}.
#'
#' An object of class \code{"pcsslm"} is a list containing at least the
#' following components:
#' \item{call}{the matched call}
#' \item{terms}{the \code{terms} object used}
#' \item{coefficients}{a \eqn{p x 4} matrix with columns for the
#' estimated coefficient, its standard error, t-statistic and
#' corresponding (two-sided) p-value.}
#' \item{sigma}{the square root of the estimated variance of the random
#' error.}
#' \item{df}{degrees of freedom, a 3-vector \eqn{p, n-p, p*}, the
#' first being the number of non-aliased coefficients, the last being
#' the total number of coefficients.}
#' \item{fstatistic}{a 3-vector with the value of the F-statistic with its
#' numerator and denominator degrees of freedom.}
#' \item{r.squared}{\eqn{R^2}, the 'fraction of variance explained by the
#' model'.}
#' \item{adj.r.squared}{the above \eqn{R^2} statistic \emph{'adjusted'},
#' penalizing for higher \eqn{p}.}
#' \item{cov.unscaled}{a \eqn{p x p} matrix of (unscaled) covariances of the
#' \eqn{coef[j], j=1,...p}.}
#' \item{Sum Sq}{a 3-vector with the model's Sum of Squares Regression
#' (SSR), Sum of Squares Error (SSE), and Sum of Squares Total (SST).}
#'
#' @importFrom Rdpack reprompt
#'
#' @examples
#' ## Principal Component Analysis
#' ex_data <- pcsstools_example[c("g1", "x1", "y1", "y2", "y3")]
#' pcss <- list(
#' means = colMeans(ex_data),
#' covs = cov(ex_data),
#' n = nrow(ex_data)
#' )
#'
#' pcsslm(y1 + y2 + y3 ~ g1 + x1, pcss = pcss, comp = 1)
#'
#' ## Linear combination of variables
#' ex_data <- pcsstools_example[c("g1", "g2", "y1", "y2")]
#' pcss <- list(
#' means = colMeans(ex_data),
#' covs = cov(ex_data),
#' n = nrow(ex_data)
#' )
#'
#' pcsslm(y1 + y2 ~ g1 + g2, pcss = pcss, phi = c(1, -1))
#' summary(lm(y1 - y2 ~ g1 + g2, data = ex_data))
#'
#' ## Product of variables
#' ex_data <- pcsstools_example[c("g1", "x1", "y4", "y5", "y6")]
#'
#' pcss <- list(
#' means = colMeans(ex_data),
#' covs = cov(ex_data),
#' n = nrow(ex_data),
#' predictors = list(
#' g1 = new_predictor_snp(maf = mean(ex_data$g1) / 2),
#' x1 = new_predictor_normal(mean = mean(ex_data$x1), sd = sd(ex_data$x1))
#' ),
#' responses = lapply(
#' colMeans(ex_data)[3:length(colMeans(ex_data))],
#' new_predictor_binary
#' )
#' )
#'
#' pcsslm(y4 * y5 * y6 ~ g1 + x1, pcss = pcss, response = "binary")
#' summary(lm(y4 * y5 * y6 ~ g1 + x1, data = ex_data))
#'
#' ## Disjunct (OR statement) of variables
#' ex_data <- pcsstools_example[c("g1", "x1", "y4", "y5")]
#'
#' pcss <- list(
#' means = colMeans(ex_data),
#' covs = cov(ex_data),
#' n = nrow(ex_data),
#' predictors = list(
#' g1 = new_predictor_snp(maf = mean(ex_data$g1) / 2),
#' x1 = new_predictor_normal(mean = mean(ex_data$x1), sd = sd(ex_data$x1))
#' )
#' )
#' pcsslm(y4 | y5 ~ g1 + x1, pcss = pcss)
#' summary(lm(y4 | y5 ~ g1 + x1, data = ex_data))
#'
#' @references{
#'
#' \insertRef{wolf_using_2021}{pcsstools}
#'
#' \insertRef{wolf_computationally_2020}{pcsstools}
#'
#' \insertRef{gasdaska_leveraging_2019}{pcsstools}
#'
#' }
#'
#' @export
pcsslm <- function(formula, pcss = list(), ...) {
cl <- match.call()
args <- c(append(list(formula = formula), pcss), list(...))
# First check if argument comp provided. If so, assume this is PCA
if ("comp" %in% names(list(...))) {
model_func <- "model_prcomp"
} else {
model_func <- guess_response(extract_response(formula))
}
re <- do.call(model_func, args)
re$call <- cl
class(re) <- "pcsslm"
return(re)
}
#' Calculate a linear model using PCSS
#'
#' \code{calculate_lm} describes the linear model of the last listed variable
#' in \code{means} and \code{covs} as a function of all other variables in
#' \code{means} and \code{covs}.
#'
#' @param means a vector of means of all model predictors and the response with
#' the last element the response mean.
#' @param covs a matrix of the covariance of all model predictors and the
#' response with the order of rows/columns corresponding to the order of
#' \code{means}.
#' @param n sample size
#' @param add_intercept logical. If \code{TRUE} adds an intercept to the model.
#' @param keep_pcss logical. If \code{TRUE}, returns \code{means} and
#' \code{covs}.
#' @param terms terms
#'
#' @importFrom stats pt
#'
#' @references{
#'
#' \insertRef{wolf_using_2021}{pcsstools}
#'
#' \insertRef{wolf_computationally_2020}{pcsstools}
#'
#' \insertRef{gasdaska_leveraging_2019}{pcsstools}
#'
#' }
#'@inherit pcsslm return
calculate_lm <- function(means, covs, n, add_intercept = FALSE,
keep_pcss = FALSE, terms = NULL) {
cl <- match.call()
p <- ncol(covs) - 1
if (add_intercept) {
p <- p + 1
covs <- rbind(0, cbind(0, covs))
means <- c(1, means)
if (!is.null(dimnames(covs)) & !is.null(dimnames(covs))) {
names(means)[1] <- "(Intercept)"
colnames(covs)[1] <- "(Intercept)"
rownames(covs)[1] <- "(Intercept)"
}
}
covX <- covs[-(p + 1), -(p + 1)]
meanX <- means[-(p + 1)]
covXY <- covs[p + 1, -(p + 1)]
varY <- covs[p + 1, p + 1]
meanY <- means[p + 1]
yty <- (n - 1) * varY + n * meanY^2
aliased <- rep(FALSE, p)
names(aliased) <- names(meanX)
while(TRUE) {
XtX <- (n - 1) * covX + n * meanX %*% t(meanX)
Xty <- (n - 1) * covXY + n * meanX * means[p + 1]
# Check if XtX can be inverted
XtX1 <- try(solve(XtX), silent = T)
if ("try-error" %in% class(XtX1)) {
rankifremoved <- sapply(1:ncol(XtX), function (x) qr(XtX[,-x])$rank)
# Remove last column with linear dependence
lind.col <- max(which(rankifremoved == max(rankifremoved)))
covX <- covX[-lind.col, -lind.col]
meanX <- meanX[-lind.col]
covXY <- covXY[-lind.col]
aliased[lind.col] <- TRUE
}
else{
break()
}
}
p <- length(aliased[!aliased])
beta <- drop(XtX1 %*% Xty)
sigma2 <- drop((yty - t(beta) %*% Xty) / (n - p))
var_beta <- diag(sigma2 * XtX1)
sd_beta <- sqrt(var_beta)
t_stat <- beta / sd_beta
p_val <- 2 * pt(abs(t_stat), df = n - p, lower.tail = F)
coefficients <- cbind(beta, sd_beta, t_stat, p_val)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
sigma <- sqrt(sigma2)
df <- c(p, n - p, length(aliased))
SSE <- drop(yty - t(Xty) %*% XtX1 %*% Xty)
SST <- (n - 1) * varY
SSR <- SST - SSE
SS <- c(SSR = SSR, SSE = SSE, SST = SST)
MSR <- SSR / (p - 1)
MSE <- SSE / (n - p)
r.squared <- 1 - (SSE / SST)
adj.r.squared <- 1 - (1 - r.squared) * (n - 1) / (n - p)
fstatistic <- c(value = MSR / MSE, numdf = p - 1, dendf = n - p)
cov.unscaled <- XtX1
re <- list(
call = cl,
terms = terms,
coefficients = coefficients,
aliased = aliased,
sigma = sigma,
df = df,
r.squared = r.squared,
adj.r.squared = adj.r.squared,
fstatistic = fstatistic,
cov.unscaled = cov.unscaled,
`Sum Sq` = SS
)
if (keep_pcss) {
re$means <- means
re$covs <- covs
}
class(re) <- "pcsslm"
return(re)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.