#pass cross-validation defaults if not specified by user
variTrainParam <- function(...) {
opt <- list(...)
if (is.null(opt$iter)) { opt$iter <- 3L }
if (is.null(opt$tol)) { opt$tol <- 1e-6 }
if (is.null(opt$cdmax)) { opt$cdmax <- 1e7 }
if (is.null(opt$sridge)) { opt$sridge <- 0.0 }
if (is.null(opt$weight)) { opt$weight <- NULL }
if (is.null(opt$remWeight)) { opt$remWeight <- NULL }
if (is.null(opt$sig)) { opt$sig <- NULL }
if (is.null(opt$rho)) { opt$rho <- NULL }
#hedge against multicolinearity, for numerical stability
if (is.null(opt$refitRidge)) { opt$refitRidge <- 0.01 }
opt
}
#######################
trainModel <- function(train, method, tuneGrid, refit = TRUE, fold_id, tune_id, resPath,
givenX, Xindex, Yindex, aszero, ...) {
opt <- variTrainParam(...)
#feedback on ols refit assumption of model sparsity
sparseFit <- TRUE
requireNamespace("Matrix")
if (method == "spacemap") {
fit <- spacemap(Y = train$Y, X = train$X,
lam1 = tuneGrid$lam1, sridge = opt$sridge,
lam2 = tuneGrid$lam2, lam3 = tuneGrid$lam3,
sig=opt$sig, rho=opt$rho,
weight=opt$weight, remWeight = opt$remWeight,
iter= opt$iter, tol = opt$tol, cdmax = opt$cdmax,
iscale = FALSE)
#zero out those below tolerance.
fit$ParCor[abs(fit$ParCor) <= aszero ] <- 0.0
fit$Gamma[abs(fit$Gamma) <= aszero ] <- 0.0
#reject models that did not converge
if(!fit$convergence) return(list(fit = fit, sparseFit = FALSE))
#recompute ols estimates based on training set: note change fit object by reference
if(refit) {
sparseFit <- olsRefitSpacemap(Y = train$Y, X = train$X,
ParCor = fit$ParCor, Gamma = fit$Gamma,
sigma = fit$sig.fit, RSS = fit$rss, tol = opt$tol,
ridge = opt$refitRidge)
}
#write to file
out <- list(ThetaXY = Matrix(fit$Gamma), ThetaYY = Matrix(fit$ParCor), sig = fit$sig.fit,
tune_id = tune_id, fold_id = fold_id, convergence = fit$convergence, sparseFit = sparseFit)
outfile <- file.path(resPath, paste0("tuneid_", sprintf("%03d", tune_id),
"_fold_", sprintf("%02d", fold_id),
".rds"))
saveRDS(out, file = outfile)
} else if (method == "space") {
fit <- spacemap::space(Y = train$XY, lam1 = tuneGrid, sridge = opt$sridge, iter = opt$iter,
cdmax = opt$cdmax, tol = opt$tol, iscale = FALSE,
sig = opt$sig, rho = opt$rho)
#zero out those below tolerance.
fit$ParCor[abs(fit$ParCor) <= aszero] <- 0.0
if(!fit$convergence) return(list(fit = fit, sparseFit = FALSE))
if(refit) {
sparseFit <- olsRefitSpace(Y = train$XY,
ParCor = fit$ParCor,
sigma = fit$sig.fit, RSS = fit$rss, tol = opt$tol,
ridge = opt$refitRidge)
}
#write to file
out <- list(sig = fit$sig.fit,
tune_id = tune_id, fold_id = fold_id,
convergence = fit$convergence, sparseFit = sparseFit)
if (givenX) {
xy <- fit$ParCor[Xindex,Yindex]
yy <- fit$ParCor[Yindex,Yindex]
out$ThetaXY = Matrix(xy)
out$ThetaYY = Matrix(yy)
} else {
out$ThetaYY = Matrix(fit$ParCor)
}
outfile <- file.path(resPath, paste0("tuneid_", sprintf("%03d", tune_id),
"_fold_", sprintf("%02d", fold_id),
".rds"))
saveRDS(out, file = outfile)
}
list(fit = fit, sparseFit = sparseFit)
}
#######################
testSpacemap <- function(test, fit, refit) {
resid <- test$Y - (test$Y%*%fit$Beta + test$X%*%fit$Gamma)
rssScore <- sum(resid*resid)
#convergence tolerance already accounted for train step
tol <- 0.0
dfParCor <- nonZeroUpper(fit$ParCor, tol)
dfGamma <- nonZeroWhole(fit$Gamma, tol)
degFree <- dfParCor + dfGamma
matrix(c(rssScore, degFree, dfParCor, dfGamma, fit$deltaMax),
nrow = 1, ncol = 5)
}
#######################
testSpace <- function(test, fit, refit, givenX, Xindex, Yindex) {
#convergence tolerance already accounted for in train step
tol <- 0.0
if (givenX) {
resid <- test$Y - (test$Y %*% fit$Beta[Yindex,Yindex] - test$X %*% fit$Beta[Xindex,Yindex])
dfParCor <- nonZeroUpper(fit$ParCor[Yindex, Yindex], tol)
dfGamma <- nonZeroWhole(fit$ParCor[Xindex, Yindex], tol)
degFree <- nonZeroUpper(fit$ParCor, tol)
} else {
resid <- test$XY - test$XY%*%fit$Beta
dfParCor <- nonZeroUpper(fit$ParCor, 0.0)
dfGamma <- 0.0
degFree <- dfParCor
}
rssScore <- sum(resid*resid)
matrix(c(rssScore, degFree, dfParCor, dfGamma, fit$deltaMax),
nrow = 1, ncol = 5)
}
#######################
testModel <- function(test, fit, sparseFit, method, refit, givenX, Xindex, Yindex) {
if (sparseFit) {
#currenlty the 0LS refit is returning the partial correlation, not the \Beta's
fit$Beta <- Beta.coef(fit$ParCor[upper.tri(fit$ParCor)], fit$sig.fit)
#do not regress on self
diag(fit$Beta) <- 0
if (method == "spacemap") {
cvScore <- testSpacemap(test = test, fit = fit, refit = refit)
} else if (method == "space") {
cvScore <- testSpace(test = test, fit = fit, refit = refit,
givenX = givenX, Xindex = Xindex, Yindex = Yindex)
}
}
else {
cvScore <- matrix(NA, nrow = 1, ncol = 5)
cvScore[1,5] <- fit$deltaMax
}
cvScore
}
dataPart <- function(f, trainIds, testIds, data,
method = c("spacemap", "space"), givenX) {
#define training and test data sets.
train <- list(); test <- list();
if (method == "spacemap") {
train$X <- data$X[trainIds[[f]],]; train$Y <- data$Y[trainIds[[f]],];
test$X <- data$X[testIds[[f]],]; test$Y <- data$Y[testIds[[f]],];
} else if (method == "space") {
train$XY <- data$XY[trainIds[[f]],];
if(givenX) {
test$X <- data$X[testIds[[f]],]; test$Y <- data$Y[testIds[[f]],];
} else {
test$XY <- data$XY[testIds[[f]],];
}
}
list(train=train, test=test)
}
structureScores <- function(cvScores, fold, method) {
metrics <- c("rss", "df", "dfParCor", "dfGamma", "deltaMax")
requireNamespace("foreach")
#for R CMD check NOTE passing
m <- NULL; f <- NULL;
metricScores <- foreach(m = seq_along(metrics)) %:%
foreach(f = seq_len(fold), .combine = 'cbind') %do% {
cvScores[[f]][,m]
}
names(metricScores) <- metrics
metricScores
}
averageScores <- function(metricScores, testIds) {
##Average cross-validated scores across folds
#while accounting for non-convergent folds
testSetLen <- sapply(testIds, length)
total <- sum(testSetLen)
foldconvmat <- !is.na(metricScores$rss)
nfoldconv <- rowSums(foldconvmat)
ntestconv <- apply(X = foldconvmat, MARGIN = 1, FUN = function(not_na) sum(testSetLen[not_na]))
i <- 0L #for R CMD check
metricScoresAvg <- foreach(i = seq_along(metricScores), .combine = 'cbind') %do% {
score <- metricScores[[i]]
stype <- names(metricScores)[[i]]
msum <- apply(X = score , MARGIN = 1, FUN = sum, na.rm = TRUE)
###Selection of tuning grid restricted to case when majority of test hold outs
###were evaluated (chunked by folds)
# divide rss by number of test sets with corresponding trained convergence
#divide other metrics by the number of converged folds
if (stype == "rss") {
ifelse(ntestconv > 0.5*total, msum / ntestconv, Inf)
} else {
ifelse(ntestconv > 0.5*total, msum / nfoldconv, Inf)
}
}
colnames(metricScoresAvg) <- names(metricScores)
metricScoresAvg
}
############################
# Find the min score index
minScoreIndex <- function(cvScoresAvg) {
#if there are multiple equally good scores, take the most sparse model.
minRssIds <- base::which(cvScoresAvg[,"rss"] == min(cvScoresAvg[,"rss"]))
minRssModelIds <- base::which.min(cvScoresAvg[minRssIds,"df"])
c(rss = minRssIds[minRssModelIds])
}
#' Cross validation (CV.Vote) for spacemap and space models.
#'
#' Selects and returns best-tuned model under CV.Vote.
#'
#' @inheritParams spacemap
#' @param trainIds List of integer vectors, where each integer vector contains
#' a split of training sample indices pertaining to \code{Y, X}.
#' @param testIds List of integer vectors, where each integer vector contains
#' a split of test sample indices pertaining to \code{Y, X}. Required to
#' be of the same length as \code{trainIds}.
#' @param method Character vector indicates network inference with function
#' \code{\link{spacemap}} when \code{method = "spacemap"} or function
#' \code{\link{space}} when \code{method = "space"}. If \code{X} is
#' non-null and \code{method = "space"}, then \code{space} will
#' infer (x--x, x--y, y--y) edges but only report (x--y, y--y) edges.
#' @param tuneGrid Named with columns \code{lam1, lam2, lam3}
#' when \code{method = "spacemap"}. Each row in the data.frame corresponds to a
#' tuning parameter set that is input into \code{\link{spacemap}}.
#' When \code{method = "space"}, supply a data.frame with only one column
#' being \code{lam1}.
#' @param resPath Character vector specifying the directory where each
#' model fit is written to file through serialization by \code{saveRDS}.
#' Defaults to temporary directory that will be deleted at end of the R session.
#' It is recommended to specify a directory where results can be stored permanently.
#' @param refit Logical indicates to refit the model after convergence to
#' reduce bias induced by penalty terms. Default to TRUE. The refit step
#' defaults to a ridge regression with small penalty of 0.01 to
#' encourage numerical stability. The user can change the ridge
#' penalty by adding an additional parameter \code{refitRidge}.
#' @param thresh Numerical threshold between 0 and 1 (defaults to 0.5 or majority vote)
#' indicating the minimum proportion of times () a given edge must be represented
#' in the trained models to be reported in the final CV.Vote model. For example,
#' If 0.5 is specified, and there are 10 training splits, then an edge in the final
#' model must be reported in 6 of the 10 traiing models.
#' @param aszero Positive numeric value (defaults to 1e-6) indicating at what point to consider extremely
#' small parameter estimates of \eqn{\Gamma} and \eqn{\rho} as zero.
#' @param ... Additional arguments for \code{\link{spacemap}} or \code{\link{space}}
#' to change their default settings (e.g. setting \code{tol = 1e-4}).
#'
#' @return A list containing
#' \itemize{
#' \item A list called \code{cvVote} with two elements:
#' \itemize{
#' \item \code{xy}, an adjacency matrix where \eqn{xy(p,q)} element
#' is 1 for an edge between \eqn{x_p} and \eqn{y_q} and 0 otherwise; and
#' \item \code{yy} Adjacency matrix where \eqn{yy(q,l)} element
#' is 1 for an edge between \eqn{y_q} and \eqn{y_l} and 0 otherwise.
#' }
#' \item \code{minTune} List containing the optimal tuning penalty set.
#' \item \code{minIndex} Integer specifying the index of \code{minTune} in \code{tuneGrid}.
#' \item \code{metricScores} Data.frame for input to \code{\link{tuneVis}} for
#' inspecting the CV score curve and model size as a function of the tuning penalties.
#' }
#' @import RcppArmadillo
#' @import Rcpp
#' @export
#' @examples
#'library(spacemap)
#'data(sim1)
#'##########################
#'#DEFINE TRAINING/TEST SETS
#'library(caret)
#'#sample size
#'N <- nrow(sim1$X)
#'#number of folds
#'K <- 5L
#'set.seed(265616L)
#'#no special population structure, but create randomized dummy structure of A and B
#'testSets <- createFolds(y = sample(x = c("A", "B"), size = N, replace = TRUE), k = K)
#'trainSets <- lapply(testSets, function(s) setdiff(seq_len(N), s))
#'nsplits <- sapply(testSets, length)
#'##########################
#'#SPACE (Y input)
#'tsp <- expand.grid(lam1 = seq(65, 75, length = 3))
#'cvspace <- cvVote(Y = sim1$Y,
#' trainIds = trainSets, testIds = testSets,
#' method = "space", tuneGrid = tsp)
#'##########################
#'# SPACEMAP (Y and X input)
#'tmap <- expand.grid(lam1 = seq(65, 75, length = 2),
#' lam2 = seq(21, 35, length = 2),
#' lam3 = seq(10, 40, length = 2))
#'cvsmap <- cvVote(Y = sim1$Y, X = sim1$X,
#' trainIds = trainSets, testIds = testSets,
#' method = "spacemap", tuneGrid = tmap)
#'
cvVote <- function(Y, X = NULL, trainIds, testIds,
method = c("spacemap", "space"), tuneGrid,
resPath = tempdir(), refit = TRUE,
thresh = 0.5, iscale = TRUE, aszero = 1e-6, ...) {
################
#check arguments
################
#paired test/train splits
stopifnot(length(trainIds) == length(testIds))
fold <- length(trainIds)
method <- match.arg(method)
if(!is.matrix(Y)) stop("Y is not a matrix.")
givenX <- !is.null(X)
Xindex <- NULL
Yindex <- NULL
if(method == "space" & givenX) {
if(!is.matrix(X)) stop("X is not a matrix.")
if(iscale) {
X <- scale(X)
Y <- scale(Y)
}
data <- list(XY = cbind(X, Y), Y = Y, X = X)
Xindex <- seq_len(ncol(X))
Yindex <- (ncol(X) + 1):(ncol(X) + ncol(Y))
} else if(method == "space" & !givenX){
if(iscale) {
Y <- scale(Y)
}
data <- list(XY = Y)
} else if (method == "spacemap") {
if(!is.matrix(X)) stop("X is not a matrix.")
if(iscale) {
X <- scale(X)
Y <- scale(Y)
}
data <- list(Y = Y, X = X)
}
## compute cross-validated metric scores for each fold and for each tune set
requireNamespace("foreach")
message("Computing CV scores over the grid...")
iscale <- FALSE
#for R CMD check NOTE passing
f <- NULL; l <- NULL;
foldScores <- foreach(f = seq_len(fold)) %:%
foreach(l = seq_len(nrow(tuneGrid)), .combine = 'rbind', .packages = "spacemap") %dopar% {
parts <- dataPart(f = f, trainIds = trainIds, testIds = testIds,
data = data, method = method, givenX = givenX)
trained <- trainModel(train = parts$train, method = method, tuneGrid = tuneGrid[l,], refit = refit,
fold_id = f, tune_id = l, resPath = resPath,
givenX = givenX, Xindex = Xindex, Yindex = Yindex, aszero = aszero, ...)
tmp <- testModel(test = parts$test, fit = trained$fit, sparseFit = trained$sparseFit,
method = method, refit = refit,
givenX = givenX, Xindex = Xindex, Yindex = Yindex)
tmp
}
##restructure list$fold[tune index, metric index ] to be list$metric$[tune index, fold index ]
metricScores <- structureScores(cvScores = foldScores, fold = fold, method = method)
#average across folds
metricScoresAvg <- averageScores(metricScores = metricScores, testIds = testIds)
##Find the minimizing tuning parameter set
minIndex <- minScoreIndex(metricScoresAvg)
if (metricScoresAvg[minIndex[1],"rss"] == Inf) {
stop("No convergence for any fold or any tuning parameter selection.")
}
#obtain model fit files from best tune index
files <- list.files(path = resPath,
pattern = paste0("tuneid_", sprintf("%03d", minIndex), "_fold"),
full.names = TRUE)
voteFit <- cvVoteRDS(files = files, tol = 0.0,
thresh = thresh, method = method,
givenX = givenX)
if (method == "spacemap") {
mtg <- tuneGrid[minIndex,]
minTune <- list(lam1 = mtg$lam1, lam2 = mtg$lam2, lam3 = mtg$lam3)
} else if (method == "space") {
minTune <- list(lam1 = tuneGrid[minIndex,])
}
list(cvVote = voteFit, minTune = minTune, minIndex = minIndex,
logcvScore = log(metricScoresAvg[minIndex,"rss"]),
metricScores = metricScores, bestFoldFiles = files)
}
##################
#PERFORMANCE
##################
#' CV.Vote Model Performance
#'
#' Convenience function for evaluating edge detection performance of
#' CV.Vote model fits.
#'
#' @inheritParams reportJointPerf
#' @param cvOut Output from \code{\link{cvVote}}.
#' @param method Character vector indicates \code{cvOut} came from
#' spaceMap or SPACE model. See parameter \code{method} in \code{\link{cvVote}}.
#' @param givenX Logical (defaults to FALSE). Set to TRUE for \code{method == "space"}
#' when \code{cvOut} had non-null \code{X} input to \code{\link{cvVote}}.
#' @param Xindex Integer vector of indices of X variables in partial correlation matrix. Defaults to
#' NULL, but must be non-null when \code{method == "space" & givenX = TRUE}.
#' @param Yindex Integer vector of indices of Y variables in partial correlation matrix. Defaults to
#' NULL, but must be non-null when \code{method == "space" & givenX = TRUE}.
#' @param aszero A numeric value specifying the point at which a parameter estimate should
#' be effectively considered of zero value.
#' @seealso \code{\link{reportJointPerf}}, \code{\link{reportPerf}}
#' @export
cvPerf <- function(cvOut, truth, method, givenX = FALSE,
Xindex=NULL, Yindex=NULL, aszero = 1e-6) {
if (method == "spacemap") {
perf <- spacemap::reportJointPerf(fit = list(xy = cvOut$cvVote$xy,
yy = cvOut$cvVote$yy),
truth = truth, aszero = aszero,
verbose = FALSE)
} else if (method == "space") {
if (givenX) {
if(is.null(Xindex) | is.null(Yindex)) {
stop("Must specify Xindex and Yindex; cannot be NULL.")
}
perf <- spacemap::reportJointPerf(fit = list(xy = cvOut$cvVote$yy[Xindex,Yindex],
yy = cvOut$cvVote$yy[Yindex,Yindex]),
truth = truth, aszero = aszero,
verbose = FALSE)
} else {
perf <- spacemap::reportPerf(cvOut$cvVote$yy,
truth$yy, YY = TRUE, aszero = aszero,
verbose = FALSE)
}
}
perf
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.