################################
# @author Zachary Kurtz
# @date 8/14/2013
## Methods for Partial Least Squares regression for compositional data
#' Partial Least Squares Discriminant Analysis
#'
#' PLS regression to discriminate classes (via a logistic model).
#' This is essentially a wrapper for the \code{plsda} function in the caret package,
#' but with a default setup for dealing with unbalanced classes (via the prior option; see details).
#' See \code{\link[caret]{plsda}} for implementation details.
#'
#' Use this function directly if you do not need to fit parameters by cross-validation.
#'
#' @title plsDA partial least squares discriminant analysis
#' @param x data with samples in rows, features are columns (not necessarily compositional data)
#' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
#' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
#' @param K number of components in the PLS model (default: number of classes - 1)
#' @param plsfun the underlying PLS-DA fitting function (default: \code{caret::plsda})
#' @param ... additional arguments passed to \code{plsfun}
#' @return a fitted \code{plsda} model
#' @seealso \code{\link{plsDA_main}}, \code{\link[caret]{plsda}}
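#' @examples
#' ## minimal usage sketch on simulated (non-compositional) data;
#' ## variable names here are illustrative, not from the package:
#' \dontrun{
#' set.seed(10010)
#' x <- matrix(rnorm(100 * 20), nrow=100)
#' grouping <- factor(rep(c("A", "B"), each=50))
#' mod <- plsDA(x, grouping, K=1)
#' predict(mod, x[1:5, ])
#' }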
#' @rdname plsDA
#' @export
plsDA <- function(x, grouping, K, usePriors=FALSE, plsfun=caret::plsda, ...) {
    # wrapper function for caret::plsda
    if (length(grouping) != nrow(x))
        stop('length of grouping vector does not equal the number of samples')
    args <- list(...)
    if (!('probMethod' %in% names(args))) args <- c(args, list(probMethod='Bayes'))
    if (usePriors) {
        # set priors to 1 - (class frequency): rare classes get a heavier penalty
        prior <- 1 - (table(grouping) / length(grouping))
        args <- c(args, list(prior=as.vector(prior)))
    }
    y <- as.factor(grouping)
    if (missing(K)) {
        # default: one fewer component than the number of classes
        K <- length(levels(y)) - 1
    }
    args <- c(args, ncomp=K)
    do.call(plsfun, c(list(x, y), args))
}
#' The main wrapper for full Partial Least Squares discriminant analysis,
#' performing cross-validation to tune model parameters (here, the number of components)
#' and permutation/bootstrap tests to obtain pseudo p-value estimates for the model coefficients.
#'
#' @title plsDA_main partial least squares discriminant analysis
#' @param x data with samples in rows, features are columns (not necessarily compositional data)
#' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
#' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
#' @param K numeric vector of candidate numbers of components for the PLS model (tuned by cross-validation when more than one value is supplied)
#' @param fold number of partitions to randomly subsample for cross-validation
#' @param nboots number of bootstraps/permutations for estimating coefficient p-values
#' @param n.core number of cores for parallelization of bootstraps
#' @param noise for very sparse components, some subsamples may have zero variance. Optionally, add some Gaussian noise to avoid PLS errors
#' @param ... additional arguments passed to \code{plsDA}
#'
#' @return a \code{plsDA} object that contains: the fitted model (\code{plsda}), the matrix of pseudo p-values (\code{pvals}), the original data (\code{x}) and the groupings (\code{y})
#' @seealso \code{\link{plsDA}}
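#' @examples
#' ## minimal usage sketch on simulated data (small nboots for speed;
#' ## variable names are illustrative, not from the package):
#' \dontrun{
#' set.seed(10010)
#' x <- matrix(rnorm(60 * 12), nrow=60)
#' grouping <- factor(rep(c("A", "B", "C"), each=20))
#' fit <- plsDA_main(x, grouping, K=1:2, nboots=99, n.core=1)
#' fit$pvals
#' }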
#' @rdname plsDA_main
#' @export
plsDA_main <- function(x, grouping, K, usePriors=FALSE, fold=5, nboots=999, n.core=4, noise=0, ...) {
    # if several candidate K values are supplied, pick one by cross-validation
    if (length(K) > 1)
        K <- cv.plsDA.fit(x, grouping, K=K, noise=noise, fold=fold, n.core=n.core, usePriors=usePriors)$K.opt
    # bootstrap statistic: coefficients from a model refit to a resampled data set
    .bstat <- function(data, indices, ...) plsDA(data[indices,,drop=FALSE], ...)$coefficients
    # permutation statistic: coefficients after independently permuting each feature
    .pstat <- function(data, indices, ...) plsDA(apply(data[indices,,drop=FALSE], 2, function(x) sample(x)), ...)$coefficients
    if (nboots > 1) {
        sboots <- plsdaboot(x, .bstat, .pstat, R=nboots, n.core, K=K, grouping=grouping, ...)
        pmat <- pval(sboots)
    } else {
        pmat <- NULL
    }
    plsmod <- plsDA(x, grouping, usePriors=usePriors, K=K, ...)
    structure(list(plsda=plsmod, pvals=pmat, x=x, y=grouping), class="plsDA")
}
plsdaboot <- function(data, statisticboot, statisticperm, R, ncpus=1, ...) {
    # bootstrap the coefficients and, separately, build a permutation null distribution
    res <- boot::boot(data, statisticboot, R=R, parallel="multicore", ncpus=ncpus, noise=0, ...)
    null_av <- boot::boot(data, statisticperm, sim='permutation', R=R, parallel="multicore", ncpus=ncpus, ...)
    class(res) <- 'list'
    structure(c(res, list(null_av=null_av)), class='plsdaboot')
}
pval.plsdaboot <- function(x, sided='both', mar=2) {
    # calculate two-sided pseudo p-values from a plsdaboot object
    if (sided != "both") stop("only two-sided currently supported")
    x$t0 <- as.matrix(x$t0)
    nparams <- ncol(x$t)
    tmeans <- colMeans(x$null_av$t)
    # flag unstable coefficients: the point estimate falls outside the
    # central 95% of its own bootstrap distribution
    niters <- nrow(x$t)
    ind95 <- max(1, round(.025*niters)):round(.975*niters)
    boot_ord <- apply(x$t, 2, sort)
    boot_ord95 <- boot_ord[ind95,]
    outofrange <- unlist(lapply(1:length(x$t0), function(i) {
        betas <- x$t0[i]
        range <- range(boot_ord95[,i])
        range[1] > betas || range[2] < betas
    }))
    # determine whether the bootstrap mass lies above or below the permutation-null mean
    bs_above <- unlist(lapply(1:nparams, function(i)
        length(which(x$t[, i] > tmeans[i]))))
    is_above <- bs_above > x$R/2
    # two-sided pseudo p-value: twice the smaller tail proportion, capped at 1
    pvals <- ifelse(is_above, 2*(1 - bs_above/x$R), 2*bs_above/x$R)
    pvals[pvals > 1] <- 1
    pvals[outofrange] <- NaN
    pmat <- matrix(pvals, ncol=ncol(x$t0))
    rownames(pmat) <- rownames(x$t0)
    colnames(pmat) <- colnames(x$t0)
    pmat
}
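# Toy illustration of the two-sided pseudo p-value rule above (standalone
# sketch with hypothetical numbers, not package output):
#   tstar <- rnorm(1000, mean=0.5)  # bootstrap draws of one coefficient
#   bs_above <- sum(tstar > 0)      # draws above the permutation-null mean
#   min(1, if (bs_above > 500) 2*(1 - bs_above/1000) else 2*bs_above/1000)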
cv.plsDA.fit <- function(x, grouping, K, noise=0, fold=5, n.core, ...) {
    # choose the number of components K by cross-validation; optionally jitter x
    # with Gaussian noise so that sparse subsamples don't have zero variance
    x <- x + matrix(rnorm(prod(dim(x)), 0, noise), ncol=ncol(x))
    capture.output(out <- .cv.plsDA(x, grouping, fold=fold, K=K, plot.it=FALSE, n.core=n.core, ...))
    out
}
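# Example (not run; variable names are illustrative):
#   cvout <- cv.plsDA.fit(x, grouping, K=1:3, noise=0.01, fold=5, n.core=1)
#   cvout$K.opt  # selected number of components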
#' @keywords internal
.cv.plsDA <- function (x, y, fold = 10, K, kappa = 0.5, classifier = c("lda",
    "logistic"), scale.x = TRUE, n.core = 4, plot.it=FALSE, ...)
{
    # cross-validate the number of PLS components, K
    # (adapted from spls::cv.splsda; kappa, classifier and scale.x are currently unused)
    foldi <- .cv.split(y, fold)
    x <- as.matrix(x)
    .fit.plsda <- function(K.val) {
        # mean misclassification rate over the CV folds for a given K
        mspemati <- rep(0, fold)
        Ai <- rep(0, fold)
        for (k in 1:fold) {
            omit <- foldi[[k]]
            train.x <- x[-omit, ]
            train.y <- y[-omit]
            test.x <- x[omit, ]
            test.y <- y[omit]
            plsda.fit <- plsDA(train.x, train.y, K = K.val, ...)
            pred <- predict(plsda.fit, test.x)
            mspemati[k] <- mean(as.numeric(pred != test.y))
            Ai[k] <- mean(length(plsda.fit$A))  # model size (leftover from spls; 0 for caret models)
        }
        # columns: mean misclassification rate, mean model size, eta placeholder, K
        c(mean(mspemati), mean(Ai), 1, K.val)
    }
    if (.Platform$OS.type == "unix") {
        result.list <- parallel::mclapply(K, function(k) .fit.plsda(k), mc.cores = n.core)
    }
    else {
        result.list <- lapply(K, function(k) .fit.plsda(k))
    }
    result.mat <- do.call(rbind, result.list)
    mspemat <- matrix(result.mat[, 1], 1, length(K))
    colnames(mspemat) <- paste("K =", K)
    # pick the K with the lowest misclassification rate, breaking ties by
    # smaller model size and then by smaller K
    cands <- result.mat[result.mat[, 1] == min(result.mat[, 1]),
        , drop = FALSE]
    cands <- cands[cands[, 2] == min(cands[, 2]), , drop = FALSE]
    cands <- cands[cands[, 4] == min(cands[, 4]), , drop = FALSE]
    K.opt <- cands[, 4]
    cv <- list(mspemat = mspemat, K.opt = K.opt)
    invisible(cv)
}
#' @keywords internal
.cv.split <- function (y, fold) {
    # assign samples to folds, stratified by class: indices are shuffled
    # within each class and then dealt out cyclically across the folds
    n <- length(y)
    group <- table(y)
    x <- c()
    for (i in 1:length(group)) {
        x.group <- c(1:n)[y == names(group)[i]]
        x <- c(x, sample(x.group))
    }
    foldi <- split(x, rep(1:fold, length = n))
    return(foldi)
}
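# Example (not run): five stratified folds for a three-class grouping
#   y <- factor(rep(c("A", "B", "C"), each=10))
#   folds <- .cv.split(y, fold=5)
#   sapply(folds, function(i) table(y[i]))  # each class appears in every fold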