#' Approximate a linear model for a product using PCSS
#'
#' \code{model_product} approximates the linear model for the product
#' of m phenotypes as a function of a set of predictors.
#'
#' @param formula an object of class \code{formula} whose dependent variable is
#' a combination of variables and \code{*} operators. All model terms
#' must be accounted for in \code{means} and \code{covs}.
#' @param n sample size.
#' @param means named vector of predictor and response means.
#' @param covs named matrix of the covariance of all model predictors and the
#' responses.
#' @param predictors named list of objects of class \code{predictor}
#' @param responses named list of objects of class \code{predictor}
#' corresponding to all terms being multiplied in the response. Can be
#' left \code{NULL} if only multiplying two terms
#' @param response character. Describe distribution of all product terms.
#' Either \code{"continuous"} or \code{"binary"}. If \code{"binary"}
#' different approximations of product means and variances are used.
#' @param ... additional arguments
#'
#' @inherit pcsslm return
#'
#' @references{
#'
#' \insertRef{wolf_using_2021}{pcsstools}
#'
#' }
#'
#' @examples
#' ex_data <- pcsstools_example[c("g1", "g2", "g3", "x1", "y4", "y5", "y6")]
#' head(ex_data)
#' means <- colMeans(ex_data)
#' covs <- cov(ex_data)
#' n <- nrow(ex_data)
#' predictors <- list(
#' g1 = new_predictor_snp(maf = mean(ex_data$g1) / 2),
#' g2 = new_predictor_snp(maf = mean(ex_data$g2) / 2),
#' g3 = new_predictor_snp(maf = mean(ex_data$g3) / 2),
#' x1 = new_predictor_normal(mean = mean(ex_data$x1), sd = sd(ex_data$x1))
#' )
#' responses <- lapply(means[c("y4", "y5", "y6")], new_predictor_binary)
#'
#' model_product(
#' y4 * y5 * y6 ~ g1 + g2 + g3 + x1,
#' means = means, covs = covs, n = n,
#' predictors = predictors, responses = responses, response = "binary"
#' )
#'
#' summary(lm(y4 * y5 * y6 ~ g1 + g2 + g3 + x1, data = ex_data))
#' @export
#'
model_product <- function(formula, n, means, covs, predictors, responses = NULL,
response = "continuous", ...) {
cl <- match.call()
terms <- terms(formula)
xterms <- extract_predictors(formula)
yterms <- parse_product(extract_response(formula))
check_terms_product(xterms$predictors, yterms, means, covs, predictors,
responses)
# Re-arrange means, covs, and predictors to match given formula
means0 <- means[c(xterms$predictors, yterms)]
covs0 <- covs[c(xterms$predictors, yterms), c(xterms$predictors, yterms)]
predictors0 <- predictors[xterms$predictors]
add_intercept <- xterms$add_intercept
approx0 <- approx_mult_prod(
means = means0, covs = covs0, n = n,
response = response, responses = responses,
predictors = predictors, ...
)
re <- calculate_lm(
means = approx0$means, covs = approx0$covs, add_intercept = add_intercept,
n = n, terms = terms
)
re$call <- cl
class(re) <- "pcsslm"
return(re)
}
#' Approximate the covariance of a set of predictors and a product of responses
#'
#' \code{approx_mult_prod} recursively estimates the covariances and means of a
#' set of responses. Estimates are approximated using all unique response
#' orderings and aggregated.
#'
#' @param means a vector of predictor and response means with all response means
#' at the end of the vector.
#' @param covs covariance matrix of all predictors and responses with column
#' and row order corresponding to the order of \code{means}.
#' @param n sample size (an integer).
#' @param response a string. Currently supports \code{"binary"} or
#' \code{"continuous"}.
#' @param predictors,responses lists of objects of class \code{predictor} where
#' each entry corresponds to one predictor/response variable.
#' @param verbose logical.
#'
#' @importFrom stats median integrate cov2cor
#'
#' @references{
#'
#' \insertRef{wolf_using_2021}{pcsstools}
#'
#' }
#' @return A list containing the following elements:
#' \item{means}{a vector of the (approximated) means of all predictors
#' and the product of responses}
#' \item{covs}{a matrix of (approximated) covariances between all predictors
#' and the product of responses}
#'
approx_mult_prod <- function(means, covs, n, response, predictors, responses,
verbose = FALSE) {
# Number of responses
m <- length(means) - length(predictors)
p <- length(means) - m
if (m > 3) {
warning("Estimation with products of >3 responses has not yet been tested for accuracy.")
}
perms <- lapply(make_permutations(m), rev)
approx <-
lapply(perms, function(order0) {
# Re-arrange order of means and covs to match order0
means0 <- means[c(1:p, p + order0)]
covs0 <- covs[c(1:p, p + order0), c(1:p, p + order0)]
# Print estimation order
if (verbose & m > 2) {
resp_names <- names(means0)[(p + 1):(p + m)]
if (any(is.null(resp_names)) | "" %in% resp_names) {
order_chr <- paste(rev(order0), collapse = " * ")
message(paste("Approximating with response columns order: ",
order_chr, "\n"))
} else {
y_names <- paste(rev(resp_names), collapse = " * ")
message(paste("Approximating with responses ordered as: ",
y_names, "\n"))
}
}
# Estimate covariances and means recursively according to order0
approx0 <-
approx_prod_recursive(
means = means0, covs = covs0, n = n,
response = response, predictors = predictors,
responses = responses[order0]
)
})
# Take pairwise medians of estimates
pred_mean <- median(
sapply(approx, function(.) .$means[p + 1])
)
pred_covs <- apply(
sapply(approx, function(.) .$covs[, p + 1]),
MARGIN = 1, FUN = median
)
means_out <- c(means[1:p], pred_mean)
covs_out <- rbind(cbind(covs[1:p, 1:p], pred_covs[1:p]), pred_covs)
# Prepare names
prod_name <- paste(names(means)[(p + 1):(p + m)], collapse = "")
names(means_out)[p + 1] <- prod_name
colnames(covs_out)[p + 1] <- prod_name
rownames(covs_out)[p + 1] <- prod_name
return(list(means = means_out, covs = covs_out))
}
# Recursively approximate covariances with a product of responses
approx_prod_recursive <- function(means, covs, n, response, predictors,
responses) {
# Number of responses and predictors
m <- length(means) - length(predictors)
p <- length(means) - m
if (m == 2) {
cov_out <- approx_prod_stats(
means = means, covs = covs, n = n,
response = response, predictors = predictors
)
return(cov_out)
} else if (m > 2) {
# mean of y_m, covariance with each predictor, and variance
mean_m <- means[p + 1]
covs_m <- covs[1:(p + 1), p + 1]
# mean of w_(m-1), covariance with each predictor, and variance
ids2 <- -(p + 1)
approx_w <- approx_prod_recursive(
means = means[ids2],
covs = covs[ids2, ids2],
n = n, response = response, predictors = predictors,
responses = responses[-1]
)
# Covariance of y_m and w_(m-1)
# Location of the responses in means and covs
resp_ids <- (p + 1):(p + m)
w_cov <- approx_response_cov_recursive(
ids = 1:m, r_means = means[resp_ids], r_covs = covs[resp_ids, resp_ids],
n = n, response = response, responses = responses
)
means_in <- c(approx_w$means, mean_m)
cov_in <- c(covs_m[1:p], w_cov[["cov"]], covs_m[p + 1])
covs_in <- rbind(
cbind(approx_w$covs, cov_in[1:(length(cov_in) - 1)]),
cov_in
)
cov_out <- approx_prod_stats(
means = means_in, covs = covs_in, n = n,
response = response, predictors = predictors
)
return(cov_out)
} else {
stop("Enter at least 2 responses.")
}
}
#' Approximate the covariance of one response with an arbitrary product of
#' responses.
#' @param ids Column ids of responses to use. First is taken alone while 2nd to
#' last are to be multiplied
#' @param r_covs Response covariance matrix
#' @param r_means Response means (vector)
#' @param n Sample size
#' @param response Character, Either "binary" or "continuous"
#' @param responses List of lists with elements of class predictor
#' @param verbose logical
#'
#' @return A vector with the approximated covariance, and approximated mean and
#' variance of the product
#'
#'
approx_response_cov_recursive <- function(ids, r_covs, r_means, n, responses,
response, verbose = FALSE) {
if (length(ids) == 3) {
if (verbose) {
if (!any(is.null(names(r_means)))) {
resp_names <- names(r_means)
} else {
resp_names <- 1:(length(r_means))
}
message(
paste(
"Estimating covariance of", resp_names[ids[1]], "and",
paste(resp_names[ids[2:length(ids)]], collapse = "")
)
)
}
covs0 <- r_covs[ids, ids]
means0 <- r_means[ids]
cov_out <- do.call(
approx_cov,
c(
list(means = means0, covs = covs0, n = n, response = response),
responses[[ids[1]]]
)
)
return(cov_out)
} else if (length(ids) > 3) {
if (verbose) {
message(paste("More than 3 ids input, approximating recursively"))
}
cov_12 <- r_covs[ids[1], ids[2]]
mean_1 <- r_means[ids[1]]
var_1 <- r_covs[ids[1], ids[1]]
mean_2 <- r_means[ids[2]]
var_2 <- r_covs[ids[2], ids[2]]
# Approx cov of 1st id and product of 3rd to END
ids0 <- c(ids[1], ids[3:length(ids)])
approx_13 <- approx_response_cov_recursive(
ids = ids0, r_covs = r_covs, r_means = r_means, n = n,
responses = responses, response = response, verbose = verbose
)
cov_13 <- approx_13[["cov"]]
# Approx cov of 2nd id and product of 3rd to END and mean of product of 3rd
# to END
ids0 <- c(ids[2], ids[3:length(ids)])
approx_23 <- approx_response_cov_recursive(
ids = ids0, r_covs = r_covs, r_means = r_means, n = n,
responses = responses, response = response, verbose = verbose
)
cov_23 <- approx_23[["cov"]]
var_3 <- median(c(approx_13[["var"]], approx_23[["var"]]))
mean_3 <- median(c(approx_13[["mean"]], approx_23[["mean"]]))
means0 <- c(mean_1, mean_2, mean_3)
covs0 <- matrix(c(
var_1, cov_12, cov_13,
cov_12, var_2, cov_23,
cov_13, cov_23, var_3
), ncol = 3)
cov_out <- do.call(
approx_cov,
c(
list(means = means0, covs = covs0, n = n, response = response),
responses[[ids[1]]]
)
)
return(cov_out)
} else {
stop("At least three id values are required.")
}
}
#' Approximate summary statistics for a product of phenotypes and a set of
#' predictors
#'
#' @param means Vector of means of predictors and the two phenotypes to be
#' multiplied
#' @param covs Covariance matrix of all predictors and the two phenotypes
#' @param n Sample size
#' @param response character. Either "binary" or "continuous".
#' @param predictors a list of elements of class predictor
#' @return A list with the predicted covariance matrix of all predictors and
#' the product and the means of all predictors and the product.
#'
approx_prod_stats <- function(means, covs, n, response, predictors) {
n_preds <- length(means) - 2
preds <- list()
for (j in 1:n_preds) {
params <- list(
means = means[c(j, n_preds + 1:2)],
covs = covs[c(j, n_preds + 1:2), c(j, n_preds + 1:2)],
response = response,
n = n
)
preds[[j]] <- do.call(approx_cov, c(params, predictors[[j]]))
}
prod_covs <- sapply(preds, function(.) .[["cov"]])
prod_vars <- sapply(preds, function(.) .[["var"]])
prod_means <- sapply(preds, function(.) .[["mean"]])
prod_var <- median(prod_vars)
prod_mean <- prod_means[[1]]
means_out <- c(means[1:n_preds], prod_mean)
cov_out <- rbind(
cbind(covs[1:n_preds, 1:n_preds], unname(prod_covs)),
c(prod_covs, prod_var)
)
# Put names on
prod_name <- paste(names(means)[n_preds + 1:2], collapse = ":")
names(means_out)[n_preds + 1] <- prod_name
colnames(cov_out)[n_preds + 1] <- prod_name
rownames(cov_out)[n_preds + 1] <- prod_name
colnames(cov_out)[1:n_preds] <- names(means)[1:n_preds]
rownames(cov_out)[1:n_preds] <- names(means)[1:n_preds]
return(list(means = means_out, covs = cov_out))
}
#' Approximate the mean of Y conditional on X
#' @param means Vector of the mean of X and the mean of Y
#' @param covs Matrix of covariances for X and Y
#' @param response Character. If "binary" truncates means to interval [0, 1].
#' If "continuous" does not restrict.
#' @param n Sample size
#' @return A list of length 2 consisting of 2 functions that give the
#' estimated conditional mean and conditional variance of Y as a function of X
#'
approx_conditional <- function(means, covs, response, n) {
# OLS coefficients for Y = a + bX
b <- covs[1, 2] / covs[1, 1]
a <- means[2] - b * means[1]
if (response == "binary") {
p_mu <- function(x0) {
p <- a + b * x0
p[p > 1] <- 1
p[p < 0] <- 0
return(p)
}
p_var <- function(x0) p_mu(x0) * (1 - p_mu(x0))
} else if (response == "continuous") {
p_mu <- function(x0) {
p <- a + b * x0
return(p)
}
# Approximate the conditional variance of a continuous Y given X under the
# OLS assumption of homoscedasticity.
p_s2 <- (n * means[2]^2 + (n - 1) * covs[2, 2] - a * n * means[2] -
b * (n * means[1] * means[2] + (n - 1) * covs[1, 2])) / (n - 2)
p_var <- function(x0) p_s2
}
return(list(c_mu = p_mu, c_var = p_var))
}
#' Approximate the partial correlation of Y and Z given X
#' @param covs Covariance matrix of X, Y, and Z.
#' @param cors Correlation matrix of X, Y, and Z.
#' @return Approximated partial correlation of the later two terms given the
#' first
get_pcor <- function(covs, cors = cov2cor(covs)) {
if (isTRUE(all.equal(cors[1, 2], 1)) | isTRUE(all.equal(cors[1, 3], 1))) {
rho <- 0
} else {
rho <- (cors[2, 3] - cors[1, 2] * cors[1, 3]) /
(sqrt(1 - cors[1, 2]^2) * sqrt(1 - cors[1, 3]^2))
}
return(rho)
}
# Approximate the covariance of X and Y*Z as well as the mean and variance of
# Y * Z
approx_cov <- function(means, covs, predictor_type, response, n, f, ...) {
# MEAN ##
pred_mean <- get_mean(means = means[2:3], covs = covs[2:3, 2:3], n = n)
# COVARIANCE ##
# Conditional means / variances for phenotype 1 and 2
c_1 <- approx_conditional(
means = means[c(1, 2)], covs = covs[c(1, 2), c(1, 2)],
response = response, n = n
)
c_2 <- approx_conditional(
means = means[c(1, 3)], covs = covs[c(1, 3), c(1, 3)],
response = response, n = n
)
# Partial correlation
rho <- get_pcor(covs)
# Predicted product conditional mean
c_prod_mean <- function(x0) {
c_1$c_mu(x0) * c_2$c_mu(x0) + rho * sqrt(c_1$c_var(x0) * c_2$c_var(x0)) *
(n - 1) / n
}
if (predictor_type == "discrete") {
# Estimated product covariance
pred_cov <- approx_cov_discrete(
c_prod_mean = c_prod_mean,
predictor_mean = means[1],
f = f, ...
)
} else if (predictor_type == "continuous") {
pred_cov <- approx_cov_continuous(
c_prod_mean = c_prod_mean,
predictor_mean = means[1],
f = f, ...
)
} else {
stop("Invalid predictor_type argument to approx_cov")
}
## VARIANCE ##
if (response == "binary") {
pred_var <- pred_mean * (1 - pred_mean) * n / (n - 1)
} else if (response == "continuous") {
# Estimate the conditional variance
c_prod_var <- function(x0) {
(c_1$c_mu(x0)^2 / c_1$c_var(x0) + c_2$c_mu(x0)^2 / c_2$c_var(x0) +
2 * rho * c_1$c_mu(x0) * c_2$c_mu(x0) /
sqrt(c_1$c_var(x0) * c_2$c_var(x0)) +
1 + rho^2) * c_1$c_var(x0) * c_2$c_var(x0)
}
# Take the expectation(?) of the conditional variance of
if (predictor_type == "discrete") {
pred_var <- approx_var_discrete(
c_prod_mean = c_prod_mean, c_prod_var = c_prod_var,
prod_mean = pred_mean, n = n, f = f, ...
)
} else if (predictor_type == "continuous") {
pred_var <- approx_var_continuous(
c_prod_mean = c_prod_mean, c_prod_var = c_prod_var,
prod_mean = pred_mean, n = n, f = f, ...
)
}
}
return(c(cov = unname(pred_cov),
mean = unname(pred_mean),
var = unname(pred_var)))
}
approx_cov_discrete <- function(c_prod_mean, predictor_mean, f, support) {
g <- function(x0) f(x0) * (x0 - predictor_mean) * c_prod_mean(x0)
pred_cov <- sum(g(support))
return(pred_cov)
}
approx_cov_continuous <- function(c_prod_mean, predictor_mean, f, lb, ub) {
g <- function(x0) f(x0) * (x0 - predictor_mean) * c_prod_mean(x0)
pred_cov <- integrate(g, lower = lb, upper = ub, stop.on.error = FALSE)$value
return(pred_cov)
}
# Calculate the mean of the product of X and Y
get_mean <- function(means, covs, n) {
prod_mean <- means[1] * means[2] + covs[1, 2] * (n - 1) / n
return(prod_mean)
}
approx_var_discrete <- function(c_prod_mean, c_prod_var, prod_mean, n, f,
support) {
g <- function(x0) {
(n * f(x0) - 1) * c_prod_var(x0) + n * f(x0) *
(c_prod_mean(x0) - prod_mean)^2
}
pred_var <- sum(g(support)) / (n - 1)
return(pred_var)
}
approx_var_continuous <- function(c_prod_mean, c_prod_var, prod_mean, n, f,
lb, ub) {
g <- function(x0) {
# f(x0) * c_prod_var(x0) + f(x0) * c_prod_mean(x0) - prod_mean^2
(n * f(x0) - 1) * c_prod_var(x0) + n * f(x0) *
(c_prod_mean(x0) - prod_mean)^2
}
pred_var <- integrate(g, lower = lb, upper = ub,
stop.on.error = FALSE)$value / (n - 1)
return(pred_var)
}
#' List all permutations of a sequence of integers
#'
#' Lists all permutations of 1,2,...,m unique up to the first two elements
#' @param m number of elements to permute
#' @importFrom gtools permutations
#' @return A list of vectors of permutations of 1,2,...,m.
make_permutations <- function(m) {
# All possible permutations. We only need half of these as the order of the
# first two columns does not matter.
if (m < 3) {
re_perms <- list(1:m)
return(re_perms)
}
all_perms <- gtools::permutations(m, m)
dups <- which(duplicated(as.matrix(all_perms[, 3:ncol(all_perms)])))
re_perms <- all_perms[-dups, ]
re_perms <- split(re_perms, row(re_perms))
return(re_perms)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.