Nothing
#' Compute Gower's centered similarity matrix from a distance matrix
#'
#' Compute Gower's centered similarity matrix \eqn{G}, which is the matrix
#' decomposed by the MDMR test statistic.
#'
#' @param d.mat Symmetric distance matrix (or R distance object) computed from
#' the outcome data to be used in MDMR.
#'
#' @return G Gower's centered dissimilarity matrix computed from D.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Gower, J. C. (1966). Some distance properties of latent root and
#' vector methods used in multivariate analysis. Biometrika, 53(3-4), 325-338.
#'
#' @export
gower <- function(d.mat) {
# Convert distance object to matrix form
d.mat <- as.matrix(d.mat)
# Dimensionality of distance matrix
n <- nrow(d.mat)
# Create Gower's symmetry matrix (Gower, 1966)
A <- -0.5 * d.mat^2
# Subtract column means As = (I - 1/n * 11")A
As <- A - rep(colMeans(A), rep.int(n, n))
# Subtract row means G = As(I- 1/n * 11")
# the transpose is just to deal with how "matrix minus vector" operations work
# in R
return(t(As) - rep(rowMeans(As), rep.int(n, n)))
}
#' Function to compute analytic p-values using Davies (1980)
#'
#' Compute analytic MDMR p-values
#'
#' @param q Test statistic
#' @param lambda Centered distance matrix eigenvalues
#' @param k Numerator degrees of freedom for this test statistic
#' @param p Total number of predictors in the design matrix (except for the
#' intercept)
#' @param n Sample size
#' @param lim Maximum number of integration terms. Realistic values for "lim"
#' range from 1,000 if the procedure is to be called repeatedly up to 50,000 if
#' it is to be called only occasionally
#' @param acc Error bound. Suitable values for "acc" range from 0.001 to 0.00005
#' which should be adequate for most statistical purposes.
#'
#' @return Output of CompQuadForm::davies()
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Davies, R. B. (1980). The Distribution of a Linear Combination of
#' chi-square Random Variables. Journal of the Royal Statistical Society.
#' Series C (Applied Statistics), 29(3), 323-333.
#'
#' Duchesne, P., & De Micheaux, P. L. (2010). Computing the distribution of
#' quadratic forms: Further comparisons between the Liu-Tang-Zhang
#' approximation and exact methods. Computational Statistics and Data
#' Analysis, 54(4), 858-862.
#'
#' McArtor, D. B., Lubke, G. H., & Bergeman, C. S. (2017). Extending
#' multivariate distance matrix regression with an effect size measure and the
#' distribution of the test statistic. Psychometrika, 82, 1052-1077.
.pmdmr <- function(q, lambda, k, p, n = length(lambda),
lim = 50000, acc = 1e-20) {
# Use the eigenvalues of G and the test statistic q to make a vector
# corresponding to all of the weights for the composite chi-square variables
# See Equation 12 in McArtor et al.
gamma <- c(lambda, -q * lambda)
# Aggregate the degrees of freedom for each composite chi-square variable
# See Equation 12 in McArtor et al.
nu <- c(rep(k, length(lambda)), rep(n-p-1, length(lambda)))
# Call the Davies function at zero using the given weights and df, along
# with the starting values of the metaparameters of the algorithm
pv <- CompQuadForm::davies(0, lambda = gamma, h = nu, lim = lim, acc = acc)
# Check error status. If there was an error or if the p-value was below zero,
# return entire davies object
if(pv$ifault != 0 || pv$Qq < 0) {
return(pv)
} else {
return(pv$Qq)
}
return(pv)
}
#' Function to the hat matrix
.hat <- function(X) {
tcrossprod(tcrossprod(X, solve(crossprod(X))), X)
}
#' Function to compute test statistics for a permuted design matrix
.mdmr.permstats <- function(X, n, vg, trG, px, test.inds) {
# Permute rows of X
perm.indices <- sample(1:n, size = n, replace = F)
X.perm <- X[perm.indices,]
# Omnibus test statistic
H.perm <- .hat(X.perm)
vh.perm <- c(H.perm)
numer.perm <- as.numeric(crossprod(vh.perm, vg))
denom.perm <- trG - numer.perm
f.omni.perm <- numer.perm / denom.perm
# Conditional test statistics
Hs.perm <- lapply(1:px, function(k) {
x.rm <- which(test.inds == k)
Xs.perm <- X.perm[,-x.rm]
Hs.perm <- .hat(Xs.perm)
vh.perm - c(Hs.perm)
})
numer.x.perm <- unlist(lapply(Hs.perm, function(vhs.perm) {
as.numeric(crossprod(vhs.perm, vg))
}))
f.x.perm <- numer.x.perm / denom.perm
# OUTPUT
return(c(f.omni.perm, f.x.perm))
}
#' function to compute pseudo R-square
.pseudo.r2 <- function(G, vh, vhs, x.inds, unique.xnames = NULL) {
# =========================== Omnibus Test =============================== #
# Computational trick: H is idempotent, so H = HH. tr(ABC) = tr(CAB), so
# tr(HGH) = tr(HHG) = tr(HG). Also, tr(AB) = vec(A)"vec(B), so
vg <- c(G)
numer <- crossprod(vh, vg)
# pseudo-R2 is defined as tr(HGH)/tr(G), so
denom <- sum(diag(G))
# Omnibus pseudo R-square
res <- numer / denom
# ===================== Tests of Each Predictor ========================== #
numer.x <- unlist(lapply(1:length(vhs), function(k) {
num <- NA
if(k %in% (x.inds)) {
num <- crossprod(vhs[[k]], vg)
}
return(num)
}))
r2.x <- numer.x / denom
# ====================== Return Results =========================== #
res <- c(res, r2.x)
names(res) <- c("(Omnibus)", unique.xnames)
return(res)
}
#' Conduct MDMR with analytic p-values
#'
#' \code{mdmr} (multivariate distance matrix regression) is used to regress a
#' distance matrix onto a set of predictors. It returns the test statistic,
#' pseudo R-square statistic, and analytic p-values for all predictors
#' jointly and for each predictor individually, conditioned on the rest.
#'
#' This function is the fastest approach to conducting MDMR. It uses the
#' fastest known computational strategy to compute the MDMR test statistic (see
#' Appendix A of McArtor et al., 2017), and it uses fast, analytic p-values.
#'
#' The slowest part of conducting MDMR is now the necessary eigendecomposition
#' of the \code{G} matrix, whose computation time is a function of
#' \eqn{n^3}. If MDMR is to be conducted multiple times on the same
#' distance matrix, it is recommended to compute eigenvalues of \code{G} in
#' advance and pass them to the function rather than computing them every
#' time \code{mdmr} is called, as is the case if the argument \code{lambda}
#' is left \code{NULL}.
#'
#' The distance matrix \code{D} can be passed to \code{mdmr} as either a
#' distance object or a symmetric matrix.
#'
#' @param X A \eqn{n x p} matrix or data frame of predictors. Unordered factors
#' will be tested with contrast-codes by default, and ordered factors will be
#' tested with polynomial contrasts. For finer control of how categorical
#' predictors are handled, or if higher-order effects are desired, the output
#' from a call to \code{model.matrix()} can be supplied to this argument as
#' well.
#' @param D Distance matrix computed on the outcome data. Can be either a
#' matrix or an R \code{\link{dist}} object. Either \code{D} or \code{G}
#' must be passed to \code{mdmr()}.
#' @param G Gower's centered similarity matrix computed from \code{D}.
#' Either \code{D} or \code{G} must be passed to \code{mdmr}.
#' @param lambda Optional argument: Eigenvalues of \code{G}.
#' Eigendecomposition of large \code{G} matrices can be somewhat time
#' consuming, and the theoretical p-values require the eigenvalues of
#' \code{G}. If MDMR is to be conducted multiple times on one distance
#' matrix, it is advised to conduct the eigendecomposition once and pass the
#' eigenvalues to \code{mdmr()} directly each time.
#' @param return.lambda Logical; indicates whether or not the eigenvalues of
#' \code{G} should be returned, if calculated. Default is \code{FALSE}.
#' @param start.acc Starting accuracy of the Davies (1980) algorithm
#' implemented in the \code{\link{davies}} function in the \code{CompQuadForm}
#' package (Duchesne & De Micheaux, 2010) that \code{mdmr()} uses to compute
#' MDMR p-values.
#' @param ncores Integer; if \code{ncores} > 1, the \code{\link{parallel}}
#' package is used to speed computation. Note: Windows users must set
#' \code{ncores = 1} because the \code{parallel} pacakge relies on forking. See
#' \code{mc.cores} in the \code{\link{mclapply}} function in the
#' \code{parallel} pacakge for more details.
#' @param perm.p Logical: should permutation-based p-values be computed instead
#' of analytic p-values? Default behavior is \code{TRUE} if \code{n < 200} and
#' \code{FALSE} otherwise because the anlytic p-values depend on asymptotics.
#' for \code{n > 200} and "permutation" otherwise.
#' @param nperm Number of permutations to use if permutation-based p-values are
#' to be computed.
#' @param seed Random seed to use to generate the permutation null distribution.
#' Defaults to a random seed.
#'
#' @return An object with six elements and a summary function. Calling
#' \code{summary(mdmr.res)} produces a data frame comprised of:
#' \item{Statistic}{Value of the corresponding MDMR test statistic}
#' \item{Numer DF}{Numerator degrees of freedom for the corresponding effect}
#' \item{Pseudo R2}{Size of the corresponding effect on the
#' distance matrix}
#' \item{p-value}{The p-value for each effect.}
#' In addition to the information in the three columns comprising
#' \code{summary(res)}, the \code{res} object also contains:
#'
#' \item{p.prec}{A data.frame reporting the precision of each p-value. If
#' analytic p-values were computed, these are the maximum error bound of the
#' p-values reported by the \code{davies} function in \code{CompQuadForm}. If
#' permutation p-values were computed, it is the standard error of each
#' permutation p-value.}
#' \item{lambda}{A vector of the eigenvalues of \code{G} (if
#' \code{return.lambda = T}).}
#' \item{nperm}{Number of permutations used. Will read \code{NA} if analytic
#' p-values were computed}
#'
#' Note that the printed output of \code{summary(res)} will truncate p-values
#' to the smallest trustworthy values, but the object returned by
#' \code{summary(res)} will contain the p-values as computed. The reason for
#' this truncation differs for analytic and permutation p-values. For an
#' analytic p-value, if the error bound of the Davies algorithm is larger than
#' the p-value, the only conclusion that can be drawn with certainty is that
#' the p-value is smaller than (or equal to) the error bound. For a permutation
#' test, the estimated p-value will be zero if no permuted test statistics are
#' greater than the observed statistic, but the zero p-value is only a product
#' of the finite number of permutations conduted. The only conclusion that can
#' be drawn is that the p-value is smaller than \code{1/nperm}.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Davies, R. B. (1980). The Distribution of a Linear Combination of
#' chi-square Random Variables. Journal of the Royal Statistical Society.
#' Series C (Applied Statistics), 29(3), 323-333.
#'
#' Duchesne, P., & De Micheaux, P. L. (2010). Computing the distribution of
#' quadratic forms: Further comparisons between the Liu-Tang-Zhang
#' approximation and exact methods. Computational Statistics and Data
#' Analysis, 54(4), 858-862.
#'
#' McArtor, D. B., Lubke, G. H., & Bergeman, C. S. (2017). Extending
#' multivariate distance matrix regression with an effect size measure and the
#' distribution of the test statistic. Psychometrika, 82, 1052-1077.
#'
#' @examples
#'# --- The following two approaches yield equivalent results --- #
#'# Approach 1
#'data(mdmrdata)
#'D <- dist(Y.mdmr, method = "euclidean")
#'res1 <- mdmr(X = X.mdmr, D = D)
#'summary(res1)
#'
#'# Approach 2
#'data(mdmrdata)
#'D <- dist(Y.mdmr, method = "euclidean")
#'G <- gower(D)
#'res2 <- mdmr(X = X.mdmr, G = G)
#'summary(res2)
#'
#' @importFrom CompQuadForm davies
#' @importFrom parallel mclapply
#' @export
mdmr <- function(X, D = NULL, G = NULL, lambda = NULL, return.lambda = F,
start.acc = 1e-20, ncores = 1,
perm.p = (nrow(as.matrix(X)) < 200),
nperm = 500, seed = NULL) {
##############################################################################
## INPUT HANDLING
##############################################################################
# If X is a vector, convert it to a matrix
if(is.vector(X)) {
X <- as.matrix(X)
}
# Get names of the columns in X
unique.xnames <- colnames(X)
if(is.null(unique.xnames)) {
colnames(X) <- unique.xnames <- paste0("X", 1:ncol(X))
}
# Make sure "D" is not interpreted as the D function
if(is.function(D)) {
stop(paste0("Please provide either a distance matrix or a ",
"Gower-centered distance matrix."))
}
# Make sure either D or G was provided
if(is.null(D) & is.null(G)) {
stop(paste0("Please provide either a distance matrix ",
"or a Gower-centered distance matrix."))
}
# Make sure D is a distance matrix
if(!is.null(D)) {
if(nrow(as.matrix(D)) != ncol(as.matrix(D))) {
stop(paste0("Please provide either a distance matrix ",
"or a Gower-centered distance matrix."))
}
}
# If G was not provided, compute it from D
if(is.null(G)) {
G <- gower(D)
}
# Get test indices of variables comprising a model matrix (e.g. which contrast
# codes need to be tested jointly to assess a factor if a model.matrix was
# passed as X). If X isn"t a model matrix, do them all one-at-a-time.
test.inds <- attr(X, "assign")
if(!is.null(test.inds)) {
is.model.mat <- T
}
if(is.null(test.inds)) {
is.model.mat <- F
}
# Remove observations that are misxing on X
X.na <- which(rowSums(is.na(as.matrix(X))) > 0)
if(length(X.na) > 0) {
X <- X[-X.na,]
G <- G[-X.na, -X.na]
warning(paste0(length(X.na),
" observations removed due to missingness on X."))
}
# Handle potential factors if X is not a model.matrix
if(!is.model.mat) {
contr.list <- lapply(1:ncol(X), FUN = function(k) {
contr.type <- NULL
if(is.factor(X[,k])) {
contr.type <- "contr.sum"
}
if(is.ordered(X[,k])) {
contr.type <- "contr.poly"
}
return(contr.type)
})
names(contr.list) <- colnames(X)
contr.list <- contr.list[!unlist(lapply(contr.list, is.null))]
# Case 1: at least some categorical predictors
if(length(contr.list) > 0) {
X <- stats::model.matrix(~ . , data = as.data.frame(X),
contrasts.arg = contr.list)
}
# Case 2: all numeric predictors
if(length(contr.list) == 0) {
X <- stats::model.matrix(~ . , data = as.data.frame(X))
}
test.inds <- attr(X, "assign")
}
xnames <- colnames(X)
unique.xnames <- lapply(1:max(test.inds), FUN = function(kk) {
hold.names <- varname <- xnames[test.inds == kk]
if(length(varname) > 1) {
constant.char <-
which(unlist(lapply(1:nchar(hold.names[1]), FUN = function(ind) {
chars <- unlist(lapply(hold.names, FUN = function(var.name) {
substr(x = var.name, start = ind, stop = ind)
}))
stats::var(as.numeric(as.factor(chars))) == 0
})))
varname <- paste(unlist(lapply(constant.char, FUN = function(cc) {
substr(x = hold.names[1], start = cc, stop = cc)
})), collapse = "")
}
varname
})
unique.xnames <- unlist(unique.xnames)
# Record the number of items and sample size
p <- ncol(X)-1
px <- length(unique.xnames)
n <- nrow(X)
##############################################################################
## COMPUTE OMNIBUS TEST STATISTIC
##############################################################################
# Compute hat matrix
H <- .hat(X)
# Computational trick: H is idempotent, so H = HH. tr(ABC) = tr(CAB), so
# tr(HGH) = tr(HHG) = tr(HG). Also, tr(AB) = vec(A)"vec(B), so
vh <- c(H)
vg <- c(G)
numer <- as.numeric(crossprod(vh, vg))
# Numerical trick: tr((I-H)G(I-H)) = tr(G) - tr(HGH), so
trG <- sum(diag(G))
denom <- trG - numer
# Save omnibus F
pr2.omni <- as.numeric(numer / trG)
f.omni <- as.numeric(numer / denom)
##############################################################################
## COMPUTE CONDITIONAL TEST STATISTICS
##############################################################################
# Get vectorized hat matrices for each conditional effect
Hs <- parallel::mclapply(1:px, mc.cores = ncores, FUN = function(k) {
x.rm <- which(test.inds == k)
Xs <- X[,-x.rm]
Hs <- .hat(Xs)
c(H - Hs)
})
# Get DF of each test
df <- unlist(parallel::mclapply(1:px, mc.cores = ncores, FUN = function(k) {
x.rm <- which(test.inds == k)
length(x.rm)
}))
# Compute SSD due to conditional effect
numer.x <- unlist(
parallel::mclapply(Hs, mc.cores = ncores, FUN = function(vhs) {
crossprod(vhs, vg)
}))
# Rescale to get either test statistic or pseudo r-square
f.x <- numer.x / denom
pr2.x <- numer.x / trG
# Combine test statistics and pseudo R-squares
stat <- data.frame("stat" = c(f.omni, f.x),
row.names = c("(Omnibus)", unique.xnames))
df <- data.frame("df" = c(p, df),
row.names = c("(Omnibus)", unique.xnames))
pr2 <- data.frame("pseudo.Rsq" = c(pr2.omni, pr2.x),
row.names = c("(Omnibus)", unique.xnames))
##############################################################################
## p-values: ANALYTIC APPROACH
##############################################################################
if(!perm.p) {
# Omnibus p-value
# Compute the eigenvalues of G if they were not provided
if(is.null(lambda)) {
lambda <- eigen(G, only.values = T)$values
}
# Compute adjusted sample size
n.tilde <- (n-p-1) * lambda[1] / sum(lambda)
if(n.tilde < 75) {
warning(
paste0("Adjusted sample size = ", round(n.tilde), "\n",
"Asymptotic properties of the null distribution may not hold.\n",
"This can result in overly conservative p-values.\n",
"Permutation tests are recommended."))
}
# Initiailze accuracy of the Davies algorithm
acc.omni <- start.acc
pv.omni <- .pmdmr(f.omni, lambda, k = p, p = p, n = n, acc = acc.omni)
# If the davies procedure threw an error, decrease the accuracy
while(length(pv.omni) > 1) {
acc.omni <- acc.omni * 10
pv.omni <- .pmdmr(q = f.omni, lambda = lambda, k = p, p = p, n = n,
acc = acc.omni)
}
# Get p-values
p.res <- parallel::mclapply(1:px, mc.cores = ncores, FUN = function(k) {
item.acc <- start.acc
pv <- .pmdmr(q = f.x[k], lambda = lambda, k = df[k+1,1],
p = p, n = n, acc = item.acc)
# If the davies procedure threw an error, decrease the accuracy
while(length(pv) > 1) {
item.acc <- item.acc * 10
pv <- .pmdmr(q = f.x[k], lambda = lambda, k = df[k+1,1],
p = p, n = n, acc = item.acc)
}
c(pv.x = pv, acc = item.acc)
})
pv.x <- unlist(lapply(p.res, function(p) {p[[1]]}))
acc.x <- unlist(lapply(p.res, function(p) {p[[2]]}))
pv <- data.frame("analytic.pvals" = c(pv.omni, pv.x),
row.names = c("(Omnibus)", unique.xnames))
pv.acc <- data.frame("p.max.error" = c(acc.omni, acc.x),
row.names = c("(Omnibus)", unique.xnames))
# Overwrite nperm to show no permutations were done
nperm <- NA
}
##############################################################################
## p-values: PERMUTATION APPROACH
##############################################################################
if(perm.p) {
# Set up chunk size
perm.nchunk <- ceiling(n * nperm / 1e6)
perm.chunkSize <- ceiling(nperm / perm.nchunk)
nperm <- perm.nchunk * perm.chunkSize
perm.chunkStart <- seq(1, nperm, by = perm.chunkSize)
perm.chunkEnd <- perm.chunkStart + perm.chunkSize - 1
perm.chunkEnd[perm.nchunk] <- nperm
# Set seed if not specified: make sure it doesn"t overflow
if(is.null(seed)) {
max.int <- .Machine$integer.max
seed.max <- floor((max.int - perm.chunkSize) / perm.nchunk)
seed <- round(stats::runif(1,0,1)*seed.max)
}
# Initialize counter for number of times each permuted test statistic is
# larger than the observed test statistic
perm.geq.obs <- rep(0, px + 1)
# Compute permutation p-values
for(i in 1:perm.nchunk) {
res <- parallel::mclapply(
1:perm.chunkSize, mc.cores = ncores, function(xx) {
set.seed(seed * i + xx)
.mdmr.permstats(X, n, vg, trG, px, test.inds) >= stat
})
perm.geq.obs <- perm.geq.obs +
colSums(
matrix(unlist(res), nrow = perm.chunkSize, ncol = px +1, byrow = T))
cat((perm.chunkEnd[i]/nperm)*100,
"% of permutation test statistics computed.",
fill = T)
}
# Compute p-values
pv.omni <- perm.geq.obs[1] / nperm
pv.x <- perm.geq.obs[-1] / nperm
pv <- data.frame("perm.pvals" = c(pv.omni, pv.x),
row.names = c("(Omnibus)", unique.xnames))
# Compute p-value accuracy
# Use max of p-values and 1/nperm so that we don"t get zero SE's
pv.omni.hold <- max(pv.omni, 1/nperm)
pv.x.hold <- pmax(pv.x, 1/nperm)
acc.omni <- sqrt((pv.omni.hold * (1-pv.omni.hold)) / nperm)
acc.x <- sqrt((pv.x.hold * (1-pv.x.hold)) / nperm)
pv.acc <- data.frame("perm.p.SE" = c(acc.omni, acc.x),
row.names = c("(Omnibus)", unique.xnames))
rm(pv.omni.hold, pv.x.hold)
# Fill in LAMBDA with a note if it was requested
if(return.lambda) {
lambda <- "Eigenvalues are not used in the permutation approach"
}
}
##############################################################################
## COMBINE AND RETURN OUTPUT
##############################################################################
if(return.lambda) {
out <- list("stat" = stat, "pr.sq" = pr2, "pv" = pv, "p.prec" = pv.acc,
"df" = df, lambda = lambda, nperm = nperm)
} else {
out <- list("stat" = stat, "pr.sq" = pr2, "pv" = pv, "p.prec" = pv.acc,
"df" = df, lambda = NULL, nperm = nperm)
}
class(out) <- c("mdmr", class(out))
return(out)
}
#' Print MDMR Object
#'
#' \code{print} method for class \code{mdmr}
#'
#' @param x Output from \code{mdmr}
#' @param ... Further arguments passed to or from other methods.
#'
#' @return
#' \item{p-value}{Analytic p-values for the omnibus test and each predictor}
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#'
#' @export
print.mdmr <- function(x, ...) {
if(is.na(x$nperm)) {
pv.name <- "Analytic p-value"
# If it's analytic, we can only say it's below davies error
out <- rep(NA, nrow(x$pv))
for(i in 1:length(out)) {
out[i] <- format.pval(x$pv[i,1], eps = x$p.prec[i,1])
}
out <- data.frame(out,row.names = rownames(x$pv))
names(out) <- pv.name
print(out)
}
if(!is.na(x$nperm)) {
pv.name <- "Permutation p-value"
# If it's a permutation test, we can only say it's below 1/nperm
out <- data.frame(format.pval(x$pv, eps = 1/x$nperm),
row.names = rownames(x$pv))
names(out) <- pv.name
print(out)
}
}
#' Summarizing MDMR Results
#'
#' \code{summary} method for class \code{mdmr}
#'
#' @param object Output from \code{mdmr}
#' @param ... Further arguments passed to or from other methods.
#'
#' @return Calling
#' \code{summary(mdmr.res)} produces a data frame comprised of:
#' \item{Statistic}{Value of the corresponding MDMR test statistic}
#' \item{Pseudo R2}{Size of the corresponding effect on the
#' distance matrix}
#' \item{p-value}{The p-value for each effect.}
#' In addition to the information in the three columns comprising
#' \code{summary(res)}, the \code{res} object also contains:
#'
#' \item{p.prec}{A data.frame reporting the precision of each p-value. If
#' analytic p-values were computed, these are the maximum error bound of the
#' p-values reported by the \code{davies} function in \code{CompQuadForm}. If
#' permutation p-values were computed, it is the standard error of each
#' permutation p-value.}
#' \item{lambda}{A vector of the eigenvalues of \code{G} (if
#' \code{return.lambda = T}).}
#' \item{nperm}{Number of permutations used. Will read \code{NA} if analytic
#' p-values were computed}
#'
#' Note that the printed output of \code{summary(res)} will truncate p-values
#' to the smallest trustworthy values, but the object returned by
#' \code{summary(res)} will contain the p-values as computed. The reason for
#' this truncation differs for analytic and permutation p-values. For an
#' analytic p-value, if the error bound of the Davies algorithm is larger than
#' the p-value, the only conclusion that can be drawn with certainty is that
#' the p-value is smaller than (or equal to) the error bound. For a permutation
#' test, the estimated p-value will be zero if no permuted test statistics are
#' greater than the observed statistic, but the zero p-value is only a product
#' of the finite number of permutations conduted. The only conclusion that can
#' be drawn is that the p-value is smaller than \code{1/nperm}.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Davies, R. B. (1980). The Distribution of a Linear Combination of
#' chi-square Random Variables. Journal of the Royal Statistical Society.
#' Series C (Applied Statistics), 29(3), 323-333.
#'
#' Duchesne, P., & De Micheaux, P. L. (2010). Computing the distribution of
#' quadratic forms: Further comparisons between the Liu-Tang-Zhang
#' approximation and exact methods. Computational Statistics and Data
#' Analysis, 54(4), 858-862.
#'
#' McArtor, D. B., Lubke, G. H., & Bergeman, C. S. (2017). Extending
#' multivariate distance matrix regression with an effect size measure and the
#' distribution of the test statistic. Psychometrika, 82, 1052-1077.
#'
#' @examples
#'# --- The following two approaches yield equivalent results --- #
#'# Approach 1
#'data(mdmrdata)
#'D <- dist(Y.mdmr, method = "euclidean")
#'mdmr.res <- mdmr(X = X.mdmr, D = D)
#'summary(mdmr.res)
#'
#' @export
summary.mdmr <- function(object, ...) {
if(is.na(object$nperm)) {
pv.name <- "Analytic p-value"
# If it's analytic, we can only say it's below davies error
pv.print <- rep(NA, nrow(object$pv))
for(i in 1:length(pv.print)) {
pv.print[i] <- format.pval(object$pv[i,1], eps = object$p.prec[i,1])
}
print.res <- data.frame("Statistic" =
format(object$stat, digits = 3),
"Numer DF" = object$df,
"Pseudo R2" = format.pval(object$pr.sq, digits = 3),
"p-value" = pv.print)
out.res <- data.frame("Statistic" = object$stat,
"Numer DF" = object$df,
"Pseudo R2" = object$pr.sq,
"p-value" = object$pv)
names(print.res) <- names(out.res) <- c("Statistic", "Numer DF",
"Pseudo R2", pv.name)
}
if(!is.na(object$nperm)) {
pv.name <- "Permutation p-value"
# If it's a permutation test, we can only say it's below 1/nperm
pv.print <- format.pval(object$pv, eps = 1/object$nperm)
print.res <- data.frame("Statistic" =
format(object$stat, digits = 3),
"Numer DF" = object$df,
"Pseudo R2" = format.pval(object$pr.sq, digits = 3),
"p-value" = pv.print)
out.res <- data.frame("Statistic" = object$stat,
"Numer DF" = object$df,
"Pseudo R2" = object$pr.sq,
"p-value" = object$pv)
names(print.res) <- names(out.res) <- c("Statistic", "Numer DF",
"Pseudo R2", pv.name)
}
# Add significance codes to p-values
print.res <- data.frame(print.res, NA)
for(k in 1:5) {
print.res[,k] <- paste(print.res[,k])
}
names(print.res)[5] <- ""
for(l in 1:nrow(print.res)) {
if(object$pv[l,1] > 0.1) {
print.res[l,5] <- " "
}
if((object$pv[l,1] <= 0.1) & (object$pv[l,1] > 0.05)) {
print.res[l,5] <- ". "
}
if((object$pv[l,1] <= 0.05) & (object$pv[l,1] > 0.01)) {
print.res[l,5] <- "* "
}
if((object$pv[l,1] <= 0.01) & (object$pv[l,1] > 0.001)) {
print.res[l,5] <- "** "
}
if(object$pv[l,1] <= 0.001) {
print.res[l,5] <- "***"
}
}
print(print.res)
cat("---", fill = T)
cat(paste0("Signif. codes: 0 \"***\" 0.001 \"**\" 0.01 \"*\" 0.05 ",
"\".\" 0.1 \" \" 1"))
invisible(out.res)
}
#' Compute univariate MDMR effect sizes
#'
#' \code{delta} computes permutation-based effect sizes on individual items
#' comprising the distance matrix outcome used in multivariate distance matrix
#' regression. It returns the omnibus estimates of delta (i.e. effect size of
#' the entire design matrix on each outcome) as well as estimates of each
#' pair-wise effect size (i.e. the effect of each predictor on each outcome
#' variable, conditional on the rest of the predictors).
#'
#' See McArtor et al. (2017) for a detailed description of how delta is
#' computed. Note that it is a relative measure of effect, quantifying which
#' effects are strong (high values of delta) and weak (low values of delta)
#' within a single analysis, but estimates of delta cannot be directly compared
#' across different datasets.
#'
#' There are two options for using this function. The first option is to
#' specify the predictor matrix \code{X}, the outcome matrix \code{Y}, the
#' distance type \code{dtype} (supported by "dist" in R), and number of
#' iterations \code{niter}. This option conducts the permutation of each Y-item
#' \code{niter} times (to average out random association in each permutation)
#' and reports the median estimates of delta over the \code{niter} reps.
#'
#' The second option is to specify \code{X}, \code{G}, and \code{G.list}, a
#' list of G matrices where the permutation has already been done for each item
#' comprising Y. The names of the elements in \code{G.list} should correspond
#' to the names of the variables that were permuted. This option is implemented
#' so that delta can be computed when MDMR is being used in conjunction with
#' distance metrics not supported by \code{dist}.
#'
#' @param X A \eqn{n x p} matrix or data frame of predictors. Unordered factors
#' will be tested with contrast-codes by default, and ordered factors will be
#' tested with polynomial contrasts. For finer control of how categorical
#' predictors are handled, or if higher-order effects are desired, the output
#' from a call to \code{model.matrix()} can be supplied to this argument as
#' well.
#' @param Y Outcome data: \eqn{n x q} matrix of scores along the
#' dependent variables.
#' @param dtype Measure of dissimilarity that will be used by \code{\link{dist}}
#' to compute the distance matrix based on \code{Y}. As is the case when calling
#' \code{dist} directly, this must be one of \code{"euclidean"},
#' \code{"maximum"}, \code{"manhattan"}, \code{"canberra"}, \code{"binary"} or
#' \code{"minkowski"}, and unambiguous substring can be given.
#' @param niter Number of times to permute each outcome item in the procedure
#' to compute delta. The final result is the average of all \code{niter}
#' iterations. Higher values of \code{niter} require more computation time, but
#' result in more precise estimates.
#' @param x.inds Vector indicating which columns of X should have their
#' conditional effect sizes computed. Default value of \code{NULL} results in
#' all effects being computed, and a value of \code{0} results in no conditional
#' effects being computed, such that only the omnibus effect sizes will be
#' reported.
#' @param y.inds Vector indicating which columns of Y effect sizes should be
#' computed on. Default value of \code{NULL} results in all columns being
#' used.
#' @param G Gower's centered similarity matrix computed from \code{D}.
#' Either \code{D} or \code{G} must be passed to \code{mdmr()}.
#' @param G.list List of length \eqn{q} where the \eqn{i^{th}}
#' element contains the \code{G} matrix computed from distance a matrix that
#' was computed on a version of \code{Y} where the \eqn{i^{th}}
#' column has been randomly permuted.
#' @param ncores Integer; if \code{ncores} > 1, the \code{\link{parallel}}
#' package is used to speed computation. Note: Windows users must set
#' \code{ncores = 1} because the \code{parallel} pacakge relies on forking. See
#' \code{mc.cores} in the \code{\link{mclapply}} function in the
#' \code{parallel} pacakge for more details.
#' @param seed Integer; sets seed for the permutations of each variable
#' comprising Y so that results can be replicated.
#' @param plot.res Logical; Indicates whether or not a heat-map of the results
#' should be plotted.
#' @param grayscale Logical; Indicates whether or not the heat-map should be
#' plotted in grayscale.
#' @param cex Multiplier for cex.axis, cex.lab, cex.main, and cex that are
#' passed to the plotted result.
#' @param y.las Orientation of labels for the outcome items. Defaults to
#' vertical (2). Value of 1 prints horizontal labels, and is only recommended
#' if the multivariate outcome is comprised of few variables.
#'
#' @return A data frame whose rows correspond to the omnibus effects and the
#' effect of each individual predictor (conditional on the rest), and whose
#' columns correspond to each outcome variable whose effect sizes are being
#' quantified. If \code{plot.res = TRUE}, a heat-map is plotted of this data
#' frame to easily identify the strongest effects. Note that the heatmap is
#' partitioned into the omnibus effect (first row) and pair-wise effects
#' (remaining rows), because otherwise the omnibus effect would dominate the
#' heatmap.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references
#' McArtor, D. B., Lubke, G. H., & Bergeman, C. S. (2017). Extending
#' multivariate distance matrix regression with an effect size measure and the
#' distribution of the test statistic. Psychometrika, 82, 1052-1077.
#'
#' @examples
#' data(mdmrdata)
#' # --- Method 1 --- #
#' delta(X.mdmr, Y = Y.mdmr, dtype = "euclidean", niter = 1, seed = 12345)
#'
#' # --- Method 2 --- #
#' D <- dist(Y.mdmr, method = "euclidean")
#' G <- gower(D)
#' q <- ncol(Y.mdmr)
#' G.list <- vector(mode = "list", length = q)
#' names(G.list) <- names(Y.mdmr)
#' for(i in 1:q) {
#' Y.shuf <- Y.mdmr
#' Y.shuf[,i] <- sample(Y.shuf[,i])
#' G.list[[i]] <- gower(dist(Y.shuf, method = "euclidean"))
#' }
#' delta(X.mdmr, G = G, G.list = G.list)
#'
#' @export
#' @importFrom parallel mclapply
delta <- function(X, Y = NULL, dtype = NULL, niter = 10,
x.inds = NULL, y.inds = NULL,
G = NULL, G.list = NULL, ncores = 1, seed = NULL,
plot.res = F, grayscale = F, cex = 1, y.las = 2) {
##############################################################################
## Step 1: Check input type
##############################################################################
# If a list of distance matrices is not provided....
if(is.null(G.list)) {
# Make sure raw data and distance METRIC is provided
if(any(is.null(Y), is.null(dtype))) {
stop(paste0("Input either Y and the distance type ",
"or a list of distance matrices"))
}
# Set the method to raw (see below)
method <- "raw"
}
# If a list of distance matrices is provided...
if(!is.null(G.list)) {
# Set the method to list (see below)
method <- "list"
if(!is.null(y.inds)) {
warning("y.inds is ignored if G.list is supplied.")
}
}
# Get test indices of variables comprising a model matrix (e.g. which contrast
# codes need to be tested jointly to assess a factor if a model.matrix was
# passed as X). If X isn"t a model matrix, do them all one-at-a-time.
test.inds <- attr(X, "assign")
if(!is.null(test.inds)) {
is.model.mat <- T
}
if(is.null(test.inds)) {
is.model.mat <- F
}
# Remove observations that are misxing on X
X.na <- which(rowSums(is.na(as.matrix(X))) > 0)
if(length(X.na) > 0) {
X <- X[-X.na,]
G <- G[-X.na, -X.na]
warning(paste0(length(X.na),
" observations removed due to missingness on X."))
}
# Handle potential factors if X is not a model.matrix
if(!is.model.mat) {
contr.list <- lapply(1:ncol(X), FUN = function(k) {
contr.type <- NULL
if(is.factor(X[,k])) {
contr.type <- "contr.sum"
}
if(is.ordered(X[,k])) {
contr.type <- "contr.poly"
}
return(contr.type)
})
names(contr.list) <- colnames(X)
contr.list <- contr.list[!unlist(lapply(contr.list, is.null))]
# Case 1: at least some categorical predictors
if(length(contr.list) > 0) {
X <- stats::model.matrix(~ . , data = as.data.frame(X),
contrasts.arg = contr.list)
}
# Case 2: all numeric predictors
if(length(contr.list) == 0) {
X <- stats::model.matrix(~ . , data = as.data.frame(X))
}
test.inds <- attr(X, "assign")
}
xnames <- colnames(X)
unique.xnames <- lapply(1:max(test.inds), FUN = function(kk) {
hold.names <- varname <- xnames[test.inds == kk]
if(length(varname) > 1) {
constant.char <-
which(unlist(lapply(1:nchar(hold.names[1]), FUN = function(ind) {
chars <- unlist(lapply(hold.names, FUN = function(var.name) {
substr(x = var.name, start = ind, stop = ind)
}))
stats::var(as.numeric(as.factor(chars))) == 0
})))
varname <- paste(unlist(lapply(constant.char, FUN = function(cc) {
substr(x = hold.names[1], start = cc, stop = cc)
})), collapse = "")
}
varname
})
unique.xnames <- unlist(unique.xnames)
# Record the number of items and sample size
p <- ncol(X) - 1
px <- length(unique.xnames)
n <- nrow(X)
# Which variables are to be tested?
if(is.null(x.inds)) {x.inds <- 1:px}
if(any(x.inds > p)) {stop(paste0("x.inds must be between 1 and ncol(X)"))}
##############################################################################
## Step 2: Compute hat matrices
##############################################################################
# Do all the computations that are required every time pseudo.r2 is called
# Overall hat matrix
H <- .hat(X)
vh <- c(H)
# Conditional hat matrices
vhs <- parallel::mclapply(1:px, mc.cores = ncores, FUN = function(k) {
hh <- NA
if(k %in% x.inds) {
x.rm <- which(test.inds == k)
Xs <- X[,-x.rm]
hh <- H - .hat(Xs)
}
c(hh)
})
##############################################################################
## Step 3: Computation if raw data are provided
##############################################################################
if(method == "raw") {
# ----- Manage Input ----- #
Y <- as.matrix(Y)
q <- ncol(Y)
if(is.null(y.inds)) {y.inds <- 1:q}
if(any(y.inds > q)) {stop(paste0("y.inds must be between 1 and ncol(Y)"))}
ynames <- colnames(data.frame(Y))
if(all(ynames == paste0("X", 1:q))) {
ynames <- paste0("Y", 1:q)
}
G <- gower(stats::dist(Y, method = dtype))
# ----- Populate delta matrices using jackknife procedure ----- #
# Get the "real" pseudo R-square
pr2 <- .pseudo.r2(G, vh, vhs, x.inds, unique.xnames)
# If no seed is specified by the user, generate a random one - mclapply
# requires one, which is why "seed" is an argument rather than using the
# standard approach of just setting a seed prior to running the function.
# Make sure the seed doesn"t overflow.
if(is.null(seed)) {
max.int <- .Machine$integer.max
max.seed <- floor(max.int - 1 - niter*10)
seed <- round(stats::runif(1,0,1) * max.seed)
}
# Compute pseudo R-square with each item jackknifed "niter" times
jack.pr2 <-
parallel::mclapply(1:niter, mc.cores = ncores, function(i) {
set.seed(seed+i)
jackknifed.y <- lapply(1:q, function(k) {
y.jack <- Y
y.jack[,k] <- sample(y.jack[,k], size = n,
replace = F)
y.jack
})
res <- lapply(1:q, function(k) {
pr2.jack <- rep(NA, p+1)
if(k %in% y.inds) {
GG <- gower(stats::dist(jackknifed.y[[k]], method = dtype))
pr2.jack <- .pseudo.r2(GG, vh, vhs, x.inds, unique.xnames)
}
pr2.jack
})
res <- matrix(unlist(res), nrow = px+1, ncol = q)
dimnames(res) <- list(c("(Omnibus)", unique.xnames),
paste0(ynames, ".jack"))
res
})
# Subtract the jackknifed pseudo R-squares from the real pseudo R-squares
# to get the delta statistics for each rep
jack.pr2 <- parallel::mclapply(jack.pr2, mc.cores = ncores, function(jack) {
apply(jack, 2, function(x) {pr2 - x})
})
# Compute Median of Delta
delta.med <- matrix(0, nrow = px + 1, ncol = q)
dimnames(delta.med) <- list(c("(Omnibus)", unique.xnames),
paste0(ynames, ".jack"))
for(i in 1:nrow(delta.med)) {
for(j in 1:ncol(delta.med)) {
delta.med[i,j] <- stats::median(
unlist(lapply(jack.pr2, function(x) {x[i,j]})))
}
}
# Trim out results to only the requested X and Y variables
delta.med <- delta.med[c(1, x.inds + 1), y.inds, drop = F]
}
##############################################################################
## Step 4: Computation if list of distance matrices is provided
##############################################################################
if(method == "list") {
# Manage Input
X <- as.matrix(X)
xnames <- colnames(data.frame(X))
q <- length(G.list)
ynames <- names(G.list)
if(is.null(ynames)) {
ynames <- paste0("Y", 1:q)
}
# Populate delta matrices using jackknife procedure
# Get the "real" pseudo R-square
pr2 <- .pseudo.r2(G, vh, vhs, x.inds, unique.xnames)
# Get each permuted pseudo R-square
# If no seed is specified by the user, generate a random one - mclapply
# requires one, which is why "seed" is an argument rather than using the
# standard approach of just setting a seed prior to running the function
if(is.null(seed)) {
seed <- round(stats::runif(1,0,1) * 1e5)
}
# Compute pseudo R-square with each item jackknifed "niter" times
jack.pr2 <-
matrix(unlist(
parallel::mclapply(G.list, function(GG) {
.pseudo.r2(GG, vh, vhs, x.inds, unique.xnames)
})), nrow = px+1, ncol = q)
# Subtract the jackknifed pseudo R-squares from the real pseudo R-squares
# to get the delta statistics for each rep
jack.pr2 <- apply(jack.pr2, 2, function(x) {pr2 - x})
dimnames(jack.pr2) <- list(c("(Omnibus)", unique.xnames),
paste0(ynames, ".jack"))
# In this case, there's only one rep, so the median is the single rep
delta.med <- jack.pr2
# Trim out results to only the requested X and Y variables
delta.med <- delta.med[c(1, x.inds + 1), , drop = F]
}
##############################################################################
## Step 5: Plot
##############################################################################
if(plot.res) {
# Number of outcome items to display
q <- length(y.inds)
if(q == 0) {
q <- length(G.list)
}
# colors
red <- 1
green <- 1
blue <- 1
if(grayscale) {
red <- green <- blue <- 0
}
# Case 1: Univaraite X
if(px == 1) {
if(q == 1) {
warning(paste0("Plotting results is uninformative ",
"when X and Y are unidimensional."))
}
if(q > 1) {
graphics::plot(NA, xlim = c(0.5, q+0.5), ylim = c(0.5,px+0.5),
xaxt = "n",
yaxt = "n", xlab = "", ylab = "", bty = "n",
main = "MDMR Effect Sizes",
cex.axis = cex, cex.lab = cex, cex = cex, cex.main = cex)
graphics::axis(1, at = 1:q, labels = c(ynames[y.inds]), las = y.las,
cex.axis = cex, cex.lab = cex)
graphics::axis(2, at = (px):1, labels = c("(Omnibus)"), las = 1,
cex.axis = cex, cex.lab = cex)
# --- Convert to z scores for shading --- #
z.scores <- matrix(scale(c(delta.med[1,])), nrow = px, ncol = q)
z.scores[delta.med[1,] < 0] <- -9999
omni.cols <- stats::pnorm(z.scores[1,])
for(j in 1:ncol(delta.med)) {
x.low <- j-0.5
y.low <- px-0.5
x.up <- j+0.5
y.up <- px+0.5
# Y importances
graphics::rect(x.low, y.low, x.up, y.up,
col = grDevices::rgb(0*red, 0*green, 1*blue,
omni.cols[j]))
# Effect Size text
graphics::text(x = j, y = 1, col = "white",
labels = formatC(delta.med[1,j], format = "g",
digits = 2), cex = cex * 0.8)
}
}
}
# Case 2: Multivariate X
if(px > 1) {
# Number of conditional effects to display
px <- length(x.inds)
if(all(x.inds == 0)) {
px <- 0
unique.xnames <- NULL
}
graphics::plot(NA, xlim = c(0.5, q+0.5), ylim = c(0.5,px+0.5+1),
xaxt = "n",
yaxt = "n", xlab = "", ylab = "", bty = "n",
main = "MDMR Effect Sizes",
cex.axis = cex, cex.lab = cex, cex = cex, cex.main = cex)
graphics::axis(1, at = 1:q, labels = c(ynames[y.inds]), las = y.las,
cex.axis = cex, cex.lab = cex)
graphics::axis(2, at = (px+1):1, labels = c("(Omnibus)",
unique.xnames[x.inds]),
las = 1, cex.axis = cex, cex.lab = cex)
# Convert to z scores for shading
z.scores <- matrix(scale(c(delta.med)), nrow = px+1, ncol = q)
z.scores[which(delta.med < 0, arr.ind = T)] <- -9999
omni.cols <- stats::pnorm(z.scores[1,])
pairwise.cols <- stats::pnorm(z.scores[-1,,drop=F])
for(i in 1:nrow(delta.med)) {
for(j in 1:ncol(delta.med)) {
x.low <- j-0.5
y.low <- px-i+1-0.5+1
x.up <- j+0.5
y.up <- px-i+1+0.5+1
if(i == 1) {
# Y importances
graphics::rect(x.low, y.low, x.up, y.up,
col = grDevices::rgb(0*red, 0*green, 1*blue,
omni.cols[j]))
}
if(i > 1) {
# XY Importances
if(px > 0) {
graphics::rect(x.low, y.low, x.up, y.up,
col = grDevices::rgb(0*red, 0.75*green, 0*blue,
pairwise.cols[i-1,j]))
}
}
# Effect Size text
graphics::text(x = j, y = px - i + 1 + 1, col = "white",
labels =
formatC(delta.med[i,j], format = "g", digits = 2),
cex = 0.8*cex)
}
}
}
}
return(delta.med)
}
#' Simulated predictor data to illustrate the MDMR package.
#'
#' See package vignette by calling \code{vignette("mdmr-vignette")}.
"X.mdmr"
#' Simulated outcome data to illustrate the MDMR package.
#'
#' See package vignette by calling \code{vignette("mdmr-vignette")}.
"Y.mdmr"
#' Simulated clustered predictor data to illustrate the Mixed-MDMR function
#'
#' See \code{\link{mixed.mdmr}}.
"X.clust"
#' Simulated clustered outcome data to illustrate the Mixed-MDMR function
#'
#' See \code{\link{mixed.mdmr}}.
"Y.clust"
#' Fit Mixed-MDMR models
#'
#' \code{mixed.mdmr} allows users to conduct multivariate distance matrix
#' regression (MDMR) in the context of a (hierarchically) clustered sample
#' without inflating Type-I error rates as a result of the violation of the
#' independence assumption. This is done by invoking a mixed-effects modeling
#' framework, in which clustering/grouping variables are specified as random
#' effects and the covariate effects of interest are fixed effects. The input
#' to \code{mixed.mdmr} largely reflects the input of the \code{\link{lmer}}
#' function from the package \code{\link{lme4}} insofar as the specification of
#' random and fixed effects are concerned (see Arguments for details). Note that
#' this function simply controls for the random effects in order to test the
#' fixed effects; it does not facilitate point estimation or inference on the
#' random effects.
#'
#' @param fmla A one-sided linear formula object describing both the fixed-effects and random-effects part of the model, beginning with an ~ operator, which is followed by the terms to include in the model, separated by + operators. Random-effects terms are distinguished by vertical bars (|) separating expressions for design matrices from grouping factors. Two vertical bars (||) can be used to specify multiple uncorrelated random effects for the same grouping variable.
#' @param data A mandatory data frame containing the variables named in formula.
#' @param D Distance matrix computed on the outcome data. Can be either a
#' matrix or an R \code{\link{dist}} object. Either \code{D} or \code{G}
#' must be passed to \code{mdmr()}.
#' @param G Gower's centered similarity matrix computed from \code{D}.
#' Either \code{D} or \code{G} must be passed to \code{mdmr}.
#' @param use.ssd The proportion of the total sum of squared distances (SSD)
#' that will be targeted in the modeling process. In the case of non-Euclidean
#' distances, specifying \code{use.ssd} to be slightly smaller than 1.00 (e.g.,
#' 0.99) can substantially lower the computational burden of \code{mixed.mdmr}
#' while maintaining well-controlled Type-I error rates and only sacrificing
#' a trivial amount of power. In the case of Euclidean distances the
#' computational burden of \code{mixed.mdmr} is small, so \code{use.ssd} should
#' be set to 1.00.
#' @param start.acc Starting accuracy of the Davies (1980) algorithm
#' implemented in the \code{\link{davies}} function in the \code{CompQuadForm}
#' package (Duchesne & De Micheaux, 2010) that \code{mdmr()} uses to compute
#' MDMR p-values.
#' @param ncores Integer; if \code{ncores} > 1, the \code{\link{parallel}}
#' package is used to speed computation. Note: Windows users must set
#' \code{ncores = 1} because the \code{parallel} pacakge relies on forking. See
#' \code{mc.cores} in the \code{\link{mclapply}} function in the
#' \code{parallel} pacakge for more details.
#'
#' @return An object with six elements and a summary function. Calling
#' \code{summary(mixed.mdmr.res)} produces a data frame comprised of:
#' \item{Statistic}{Value of the corresponding MDMR test statistic}
#' \item{Numer DF}{Numerator degrees of freedom for the corresponding effect}
#' \item{p-value}{The p-value for each effect.}
#' In addition to the information in the three columns comprising
#' \code{summary(res)}, the \code{res} object also contains:
#'
#' \item{p.prec}{A data.frame reporting the precision of each p-value. If
#' analytic p-values were computed, these are the maximum error bound of the
#' p-values reported by the \code{davies} function in \code{CompQuadForm}. If
#' permutation p-values were computed, it is the standard error of each
#' permutation p-value.}
#'
#' Note that the printed output of \code{summary(res)} will truncate p-values
#' to the smallest trustworthy values, but the object returned by
#' \code{summary(res)} will contain the p-values as computed. The reason for
#' this truncation differs for analytic and permutation p-values. For an
#' analytic p-value, if the error bound of the Davies algorithm is larger than
#' the p-value, the only conclusion that can be drawn with certainty is that
#' the p-value is smaller than (or equal to) the error bound.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Davies, R. B. (1980). The Distribution of a Linear Combination of
#' chi-square Random Variables. Journal of the Royal Statistical Society.
#' Series C (Applied Statistics), 29(3), 323-333.
#'
#' Duchesne, P., & De Micheaux, P. L. (2010). Computing the distribution of
#' quadratic forms: Further comparisons between the Liu-Tang-Zhang
#' approximation and exact methods. Computational Statistics and Data
#' Analysis, 54(4), 858-862.
#'
#' McArtor, D. B. (2017). Extending a distance-based approach to multivariate
#' multiple regression (Doctoral Dissertation).
#'
#' @examples
# # Source data
#' data("clustmdmrdata")
#'
#' # Get distance matrix
#' D <- dist(Y.clust)
#'
#' # Regular MDMR without the grouping variable
#' mdmr.res <- mdmr(X = X.clust[,1:2], D = D, perm.p = FALSE)
#'
#' # Results look significant
#' summary(mdmr.res)
#'
#' # Account for grouping variable
#' mixed.res <- mixed.mdmr(~ x1 + x2 + (x1 + x2 | grp),
#' data = X.clust, D = D)
#'
#' # Signifance was due to the grouping variable
#' summary(mixed.res)
#'
#'
#' @importFrom CompQuadForm davies
#' @importFrom parallel mclapply
#' @importFrom lme4 lmer lmerControl
#' @importFrom car Anova
#' @importFrom utils tail
#' @import stats
#' @export
mixed.mdmr <- function(fmla, data,
D = NULL, G = NULL,
use.ssd = 1,
start.acc = 1e-20, ncores = 1) {
##############################################################################
## Step 1: Handle input
##############################################################################
# ============================================================================
# Step 1a: Manage the input formula
# ============================================================================
# Get RHS of formula
fmla <- paste(fmla)
if(length(fmla > 1)) {
fmla <- tail(fmla, 1)
}
# Get omnibus reduced model formula (all fixed effects removed)
fmla.redux <- strsplit(fmla, split = "\\(")[[1]]
rand.terms <- grep(x = fmla.redux, pattern = "\\)")
fixed.terms <- (1:length(fmla.redux))[-rand.terms]
fmla.fixed <- paste(fmla.redux[fixed.terms])
fmla.fixed <- unlist(strsplit(fmla.fixed, split = "\\+"))
fmla.fixed <- paste0(fmla.fixed[fmla.fixed != " "], collapse = "+")
fmla.redux <- fmla.redux[rand.terms]
for(k in 1:length(fmla.redux)) {
fmla.redux[k] <- paste0("(", fmla.redux)
}
if(length(fmla.redux) > 0) {
fmla.redux <- paste(fmla.redux, collapse = "+")
}
if(length(fmla.redux) == 0) {
fmla.redux <- "1"
}
# Get the fixed portion of the formula for handling the input data in Step 1b
fmla.fixed <- as.formula(paste0("~ ", fmla.fixed))
# ============================================================================
# Step 1b: Covariate management
# ============================================================================
# ----------------------------------------------------------------------------
# Get the data
# ----------------------------------------------------------------------------
# Model matrix formulation of the predictors in the model
X <- model.matrix(fmla.fixed, data = data)
# Number of predictors that will be tested
p <- length(unique(attr(X, "assign")))
# Sample size
n <- nrow(X)
# Numerator DF for the omnibus test
df.omni <- ncol(X) - 1
# Numerator DF for each conditional test of a predictor
df.x <- unlist(lapply(unique(attr(X, "assign")), FUN = function(k) {
sum(attr(X, "assign") == k)
}))
# Combine degrees of freedom
df.all <- c(df.omni, df.x)
# ============================================================================
# Step 1c: Outcome management
# ============================================================================
if(class(D) != "dist") {
if(class(D) != "matrix") {
stop("Please pass a distance object or an n x n matrix to D")
}
if(class(D) == "matrix") {
if(nrow(D) != ncol(D)) {
stop("Please pass a distance object or an n x n matrix to D")
}
}
}
# Get eigenvalues and eigenvectors of G
D <- as.matrix(D)
if(nrow(D) != n) {
stop("The dimensionality of D does not match nrow(data)")
}
G <- gower(D)
eig <- eigen(G)
U <- eig$vectors
lambda <- eig$values
##############################################################################
## Step 2: Reduce number of dimensions of D to aid computation time
##############################################################################
# ============================================================================
# Step 2a: Record dimensionality; remove trivially small roots
# ============================================================================
# Original dimensionality of U
qq.srt <- length(lambda)
# Make lambda sum to 1 so that we can elimitate the relatively near-zero roots
lambda <- lambda / sum(lambda)
names(lambda) <- 1:qq.srt
# Get rid of dims that explain less than (1e-10 / 1) of the total SSD
keep.inds <- which(abs(lambda) > 1e-10)
U <- U[,keep.inds,drop=F]
lambda <- lambda[keep.inds]
qq <- length(lambda)
keep.lambda <- rep(T, qq)
keep.ssd <- sum(lambda * keep.lambda)
# ============================================================================
# Step 2b: Optionally, keep only the dimension that explain "use.ssd" of the
# total SSD
# ============================================================================
if(use.ssd < 1) {
# Start out keeping just the largest eigenvalue
keep.lambda <- rep(F, qq)
keep.lambda[1] <- T
# Compute the total ssd based on only the retained dimensions
keep.ssd <- sum(lambda * keep.lambda)
# Until the retained SSD is within tol=0.001 of the target SSD %, keep more
# dimensions
while(abs(keep.ssd - use.ssd) > 0.001) {
# If it's too small, add a real dimension
if(keep.ssd < use.ssd) {
keep.lambda[which(!keep.lambda)[1]] <- T
}
# If it's too large, add an imaginary dimension
if(keep.ssd > use.ssd) {
keep.lambda[tail(which(!keep.lambda), 1)] <- T
}
# Recompute the total ssd based on currently retained dimesions
keep.ssd <- sum(lambda * keep.lambda)
# Break if we"ve kept all dimensions, that is, if the dimensionality can"t
# be reduced to satisfy the criteria keep.ssd and "tol"
if(all(keep.lambda)) {break()}
}
# Trim the dimensions
U <- U[,keep.lambda,drop=F]
lambda <- lambda[keep.lambda]
qq <- length(lambda)
}
##############################################################################
## Step 3: Get LRT statistics assessing all fixed effects
##############################################################################
# Iterate over all retained MDS axes
chisqs <- parallel::mclapply(1:qq, mc.cores = ncores, FUN = function(k) {
# Define the outcome variable for this axis
u.hold <- U[,k]
# --------------------------------------------------------------------------
# Tests of individual predictors
# --------------------------------------------------------------------------
# Formula for the full model for this outcome
fmla.full <- as.formula(paste0("u.hold ~ ", fmla))
# Fit full model
lmer.full <- lmer(fmla.full, data = data, REML = F,
lmerControl(calc.derivs = F))
# Get test statistics for individual X using car::Anova()
chisq.x.res <- Anova(lmer.full, type = "III", test.statistic = "Chisq")
chisq.x <- chisq.x.res$Chisq
names(chisq.x) <- rownames(chisq.x.res)
rm(chisq.x.res)
# --------------------------------------------------------------------------
# Omnibus test
# --------------------------------------------------------------------------
# Fit model
fmla.redux <- as.formula(paste0("u.hold ~ ", fmla.redux))
lmer.redux <- lmer(fmla.redux, data = data, REML = F,
lmerControl(calc.derivs = F))
# Get test statisitc via model comparison approach
chisq.omni <- anova(lmer.redux, lmer.full)$Chisq[2]
names(chisq.omni) <- "Omnibus"
# --------------------------------------------------------------------------
# Return results
# --------------------------------------------------------------------------
return(c(chisq.omni, chisq.x))
})
# Get names of tests
nn <- names(chisqs[[1]])
##############################################################################
## Step 4: Compute overall test statistics and p-values
##############################################################################
# ============================================================================
# Step 4a: Compute overall test statistics
# ============================================================================
tilde.l <- unlist(parallel::mclapply(
1:length(nn), mc.cores = ncores, FUN = function(k) {
sum(unlist(lapply(chisqs, "[[", k)) * lambda)}))
names(tilde.l) <- nn
# ============================================================================
# Step 4b: p-values
# ============================================================================
pv <-
matrix(unlist(parallel::mclapply(
1:length(nn), mc.cores = ncores, FUN = function(k) {
df <- df.all[k]
# ----------------------------------------------------------------------
# Compute p-value
# ----------------------------------------------------------------------
acc <- 1e-20
pp <- CompQuadForm::davies(tilde.l[k], lambda = lambda,
h = rep(df, qq), lim = 50000, acc = acc)
# Error-check
err <- any(pp$ifault != 0, pp$Qq > 1, pp$Qq < 0)
while(err) {
acc <- acc * 10
if(acc > 0.01) {
warning(paste0("Unable to compute p-value for ", nn[k]))
return(c(NA, df, acc = acc))
}
pp <- CompQuadForm::davies(tilde.l[k], lambda = lambda,
h = rep(df, qq), lim = 50000, acc = acc)
err <- any(pp$ifault != 0, pp$Qq > 1, pp$Qq < 0)
}
return(c(pp$Qq, df, acc = acc))
})), ncol = 3, byrow = T)
##############################################################################
## Step 5: Output
##############################################################################
df <- pv[,2]
acc <- pv[,3]
pv <- pv[,1]
names(tilde.l) <- names(pv) <- names(acc) <- names(df) <- nn
out <- list("stat" = tilde.l,"pv" = pv, "p.prec" = acc,
"df" = df, ssd.used = keep.ssd)
class(out) <- c("mixed.mdmr", class(out))
return(out)
}
#' Print Mixed MDMR Object
#'
#' \code{print} method for class \code{mixed.mdmr}
#'
#' @param x Output from \code{\link{mixed.mdmr}}
#' @param ... Further arguments passed to or from other methods.
#'
#' @return
#' \item{p-value}{Analytic p-values for the omnibus test and each predictor}
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#'
#' @export
print.mixed.mdmr <- function(x, ...) {
pv.name <- "p-value"
# If it's analytic, we can only say it's below davies error
out <- rep(NA, length(x$pv))
for(i in 1:length(out)) {
out[i] <- format.pval(x$pv[i], eps = x$p.prec[i])
}
out <- data.frame(out, row.names = names(x$pv))
names(out) <- pv.name
print(out)
}
#' Summarizing Mixed MDMR Results
#'
#' \code{summary} method for class \code{mixed.mdmr}
#'
#' @param object Output from \code{\link{mixed.mdmr}}
#' @param ... Further arguments passed to or from other methods.
#'
#' @return Calling
#' \code{summary(mdmr.res)} produces a data frame comprised of:
#' \item{Statistic}{Value of the corresponding MDMR test statistic}
#' \item{p-value}{The p-value for each effect.}
#' In addition to the information in the three columns comprising
#' \code{summary(res)}, the \code{res} object also contains:
#'
#' \item{p.prec}{A data.frame reporting the precision of each p-value. If
#' analytic p-values were computed, these are the maximum error bound of the
#' p-values reported by the \code{davies} function in \code{CompQuadForm}. If
#' permutation p-values were computed, it is the standard error of each
#' permutation p-value.}
#'
#' Note that the printed output of \code{summary(res)} will truncate p-values
#' to the smallest trustworthy values, but the object returned by
#' \code{summary(res)} will contain the p-values as computed. The reason for
#' this truncation differs for analytic and permutation p-values. For an
#' analytic p-value, if the error bound of the Davies algorithm is larger than
#' the p-value, the only conclusion that can be drawn with certainty is that
#' the p-value is smaller than (or equal to) the error bound.
#'
#' @author Daniel B. McArtor (dmcartor@gmail.com) [aut, cre]
#'
#' @references Davies, R. B. (1980). The Distribution of a Linear Combination of
#' chi-square Random Variables. Journal of the Royal Statistical Society.
#' Series C (Applied Statistics), 29(3), 323-333.
#'
#' Duchesne, P., & De Micheaux, P. L. (2010). Computing the distribution of
#' quadratic forms: Further comparisons between the Liu-Tang-Zhang
#' approximation and exact methods. Computational Statistics and Data
#' Analysis, 54(4), 858-862.
#'
#' McArtor, D. B. (2017). Extending a distance-based approach to multivariate
#' multiple regression (Doctoral Dissertation).
#'
#' @examples
# # Source data
#' data("clustmdmrdata")
#'
#' # Get distance matrix
#' D <- dist(Y.clust)
#'
#' # Regular MDMR without the grouping variable
#' mdmr.res <- mdmr(X = X.clust[,1:2], D = D, perm.p = FALSE)
#'
#' # Results look significant
#' summary(mdmr.res)
#'
#'
#' # Account for grouping variable
#' mixed.res <- mixed.mdmr(~ x1 + x2 + (x1 + x2 | grp),
#' data = X.clust, D = D)
#'
#' # Signifance was due to the grouping variable
#' summary(mixed.res)
#'
#' @export
summary.mixed.mdmr <- function(object, ...) {
pv.name <- "p-value"
pv.print <- rep(NA, length(object$pv))
for(i in 1:length(pv.print)) {
pv.print[i] <- format.pval(object$pv[i], eps = object$p.prec[i])
}
print.res <- data.frame("Statistic" =
format(object$stat, digits = 3),
"Numer DF" = object$df,
"p-value" = pv.print)
out.res <- data.frame("Statistic" = object$stat,
"Numer DF" = object$df,
"p-value" = object$pv)
names(print.res) <- names(out.res) <- c("Statistic", "Numer DF", pv.name)
# Add significance codes to p-values
print.res <- data.frame(print.res, NA)
for(k in 1:3) {
print.res[,k] <- paste(print.res[,k])
}
names(print.res)[4] <- ""
for(l in 1:nrow(print.res)) {
if(object$pv[l] > 0.1) {
print.res[l,4] <- " "
}
if((object$pv[l] <= 0.1) & (object$pv[l] > 0.05)) {
print.res[l,4] <- ". "
}
if((object$pv[l] <= 0.05) & (object$pv[l] > 0.01)) {
print.res[l,4] <- "* "
}
if((object$pv[l] <= 0.01) & (object$pv[l] > 0.001)) {
print.res[l,4] <- "** "
}
if(object$pv[l] <= 0.001) {
print.res[l,4] <- "***"
}
}
print(print.res)
cat("---", fill = T)
cat(paste0("Signif. codes: 0 \"***\" 0.001 \"**\" 0.01 \"*\" 0.05 ",
"\".\" 0.1 \" \" 1"))
invisible(out.res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.