Nothing
# --------------------------------------
# Author: Andreas Alfons
# Erasmus Universiteit Rotterdam
# --------------------------------------
## workhorse function for groupwise LARS
#' @import parallel
grouplars <- function(x, y, sMax = NA, assign, robust = FALSE,
centerFun = mean, scaleFun = sd,
regFun = lm.fit, regArgs = list(),
combine = c("min", "euclidean", "mahalanobis"),
winsorize = FALSE, const = 2, prob = 0.95,
fit = TRUE, s = c(0, sMax), crit = c("BIC", "PE"),
splits = foldControl(), cost = rmspe, costArgs = list(),
selectBest = c("hastie", "min"), seFactor = 1,
ncores = 1, cl = NULL, seed = NULL, model = TRUE,
call = NULL) {
## initializations
n <- length(y)
assignList <- split(seq_len(length(assign)), assign) # indices per group
m <- length(assignList) # number of groups
p <- sapply(assignList, length) # number of variables in each group
adjust <- length(unique(p)) > 1 # adjust for group length?
if(isTRUE(is.numeric(sMax) && length(sMax) == 2)) {
# ensure backwards compatibility concerning the number of steps
s <- c(0, sMax[2])
sMax <- s[1]
}
robust <- isTRUE(robust)
sMax <- checkSMax(sMax, n, p, groupwise=TRUE) # number of groups to sequence
if(robust) {
# if possible, do not use formula interface
regControl <- getRegControl(regFun)
regFun <- regControl$fun
callRegFun <- getCallFun(regArgs)
# check how data cleaning weights should be combined
combine <- match.arg(combine)
# alternatively use winsorization
winsorize <- isTRUE(winsorize)
}
if(is.na(ncores)) ncores <- detectCores() # use all available cores
if(!is.numeric(ncores) || is.infinite(ncores) || ncores < 1) {
ncores <- 1 # use default value
warning("invalid value of 'ncores'; using default value")
} else ncores <- as.integer(ncores)
## check whether submodels along the sequence should be computed
## if yes, check whether the final model should be found via resampling-based
## prediction error estimation
fit <- isTRUE(fit)
if(fit) {
s <- checkSRange(s, sMax=sMax) # check range of steps along the sequence
crit <- if(!is.na(s[2]) && s[1] == s[2]) "none" else match.arg(crit)
if(crit == "PE") {
# further checks for steps along the sequence
if(is.na(s[2])) {
s[2] <- min(sMax, floor(n/2))
if(s[1] > sMax) s[1] <- sMax
}
# set up function call to be passed to perryFit()
remove <- c("x", "y", "crit", "splits", "cost", "costArgs",
"selectBest", "seFactor", "ncores", "cl", "seed")
remove <- match(remove, names(call), nomatch=0)
funCall <- call[-remove]
funCall$sMax <- sMax
funCall$s <- s
# make sure function call is evaluated in the correct environment
parentEnv <- parent.frame(2)
# call function perryFit() to perform prediction error estimation
s <- seq(from=s[1], to=s[2])
selectBest <- match.arg(selectBest)
out <- perryFit(funCall, x=x, y=y, splits=splits,
predictArgs=list(s=s, recycle=TRUE), cost=cost,
costArgs=costArgs, envir=parentEnv, ncores=ncores,
cl=cl, seed=seed)
out <- perryReshape(out, selectBest=selectBest, seFactor=seFactor)
fits(out) <- s
# fit final model
funCall$x <- call$x
funCall$y <- call$y
funCall$s <- s[out$best]
funCall$ncores <- call$ncores
funCall$cl <- cl
out$finalModel <- eval(funCall, envir=parentEnv)
out$call <- call
# assign class and return object
class(out) <- c("perrySeqModel", class(out))
return(out)
}
}
## prepare the data
if(!is.null(seed)) set.seed(seed)
if(fit || (robust && !winsorize)) {
# check whether parallel computing should be used
haveCl <- inherits(cl, "cluster")
haveNcores <- !haveCl && ncores > 1
useParallel <- haveNcores || haveCl
# set up multicore or snow cluster if not supplied
if(haveNcores) {
if(.Platform$OS.type == "windows") {
cl <- makePSOCKcluster(rep.int("localhost", ncores))
} else cl <- makeForkCluster(ncores)
on.exit(stopCluster(cl))
}
if(useParallel) {
# set seed of the random number stream
if(!is.null(seed)) clusterSetRNGStream(cl, iseed=seed)
else if(haveNcores) clusterSetRNGStream(cl)
}
}
if(robust) {
# use fallback mode (standardization with mean/SD) for dummies
z <- robStandardize(y, centerFun, scaleFun)
xs <- robStandardize(x, centerFun, scaleFun, fallback=TRUE)
muY <- attr(z, "center")
sigmaY <- attr(z, "scale")
muX <- attr(xs, "center")
sigmaX <- attr(xs, "scale")
if(winsorize) {
if(is.null(const)) const <- 2
# obtain data cleaning weights from winsorization
w <- winsorize(cbind(z, xs), standardized=TRUE, const=const,
prob=prob, return="weights")
} else {
# clean data in a limited sense: there may still be correlation
# outliers between the groups, but these should not be a problem
# compute weights from robust regression for each group
# intercept column needs to be used in robust short regressions
if(combine == "min") {
# define function to compute weights for each predictor group
getWeights <- function(i, x, y) {
x <- x[, i, drop=FALSE]
if(regControl$useFormula) {
fit <- callRegFun(y ~ x, fun=regFun, args=regArgs)
} else {
x <- cbind(rep.int(1, n), x)
fit <- callRegFun(x, y, fun=regFun, args=regArgs)
}
weights(fit)
}
# compute weights for each predictor group
if(useParallel) {
w <- parSapply(cl, assignList, getWeights, xs, z)
} else w <- sapply(assignList, getWeights, xs, z)
w <- sqrt(apply(w, 1, min)) # square root of smallest weights
# observations can have zero weight, in which case the number
# of observations needs to be adjusted
n <- length(which(w > 0))
} else {
# define function to compute scaled residuals for each
# predictor group
getResiduals <- function(i, x, y) {
x <- x[, i, drop=FALSE]
if(regControl$useFormula) {
fit <- callRegFun(y ~ x, fun=regFun, args=regArgs)
} else {
x <- cbind(rep.int(1, n), x)
fit <- callRegFun(x, y, fun=regFun, args=regArgs)
}
residuals(fit) / getScale(fit)
}
# compute scaled residuals for each predictor group
if(useParallel) {
residuals <- parSapply(cl, assignList, getResiduals, xs, z)
} else residuals <- sapply(assignList, getResiduals, xs, z)
# obtain weights from scaled residuals
if(combine == "euclidean") {
# assume diagonal structure of the residual correlation matrix
# and compute weights based on resulting mahalanobis distances
d <- qchisq(prob, df=m) # quantile of the chi-squared distribution
w <- pmin(sqrt(d/rowSums(residuals^2)), 1)
} else {
# obtain weights from multivariate winsorization
w <- winsorize(residuals, standardized=TRUE, const=const,
prob=prob, return="weights")
}
}
}
z <- standardize(w*z) # standardize cleaned response
xs <- standardize(w*xs)
# center and scale of response
muY <- muY + attr(z, "center")
sigmaY <- sigmaY * attr(z, "scale")
# center and scale of candidate predictor variables
muX <- muX + attr(xs, "center")
sigmaX <- sigmaX * attr(xs, "scale")
} else {
z <- standardize(y) # standardize response
xs <- standardize(x)
# center and scale of response
muY <- attr(z, "center")
sigmaY <- attr(z, "scale")
# center and scale of candidate predictor variables
muX <- attr(xs, "center")
sigmaX <- attr(xs, "scale")
}
# ## find first ranked predictor group
# zHat <- sapply(assignList,
# function(i, x, y) {
# x <- x[, i, drop=FALSE]
# model <- lm.fit(x, y)
# fitted(model)
# }, xs, z)
# corZ <- unname(apply(zHat, 2, sd))
# # if necessary, compute the denominators for adjustment w.r.t. the number
# # of variables in the group, and adjust the correlations of the fitted
# # values with the response
# if(adjust) {
# adjustment <- sqrt(p)
# corZ <- corZ / adjustment
# }
# # first active group
# active <- which.max(corZ)
# r <- c(corZ[active], rep.int(NA, sMax[1]-1))
# # not yet sequenced groups
# inactive <- seq_len(m)[-active]
# corZ <- corZ[-active]
#
# ## update active set
# R <- diag(1, sMax[1])
# a <- 1
# if(adjust) a <- a / adjustment[active]
# w <- 1
# sigma <- 1
# # start iterative computations
# for(k in seq_len(sMax[1]-1)) {
# # standardize fitted values for k-th group
# zHat[, active[k]] <- standardize(zHat[, active[k]])
# # compute the equiangular vector
# if(k == 1) u <- zHat[, active] # equiangular vector equals first direction
# else {
# # compute correlations between fitted values for active groups
# R[k, seq_len(k-1)] <- R[seq_len(k-1), k] <-
# apply(zHat[, active[seq_len(k-1)], drop=FALSE], 2, cor, zHat[, active[k]])
# # other computations according to algorithm
# invR <- solve(R[seq_len(k), seq_len(k), drop=FALSE])
# q <- if(adjust) adjustment[active] else rep.int(1, k)
# # compute the correlation of the fitted values from the active
# # predictor groups with the equiangular vector (adjustment for
# # unequal group size is considered via vector 'q')
# a <- c(t(q) %*% invR %*% q)^(-1/2)
# # compute the equiangular vector
# w <- a * c(invR %*% q)
# u <- c(zHat[, active, drop=FALSE] %*% w) # equiangular vector
# }
# # compute the fitted values of the equiangular vector for each
# # inactive predictor group, as well as the correlations involving the
# # inactive predictor groups and the equiangular vector
# uHat <- sapply(assignList[inactive],
# function(i, x, u) {
# x <- x[, i, drop=FALSE]
# model <- lm.fit(x, u)
# fitted(model)
# }, xs, u)
# corU <- apply(zHat[, inactive, drop=FALSE], 2, cor, u)
# tau <- apply(uHat, 2, sd)
# # adjustment for unequal group size if necessary
# if(adjust) {
# tmp <- adjustment[inactive]
# corU <- corU / tmp
# tau <- tau / tmp
# }
# # compute the step size by solving the quadratic equation
# gammas <- findStepSizes(r[k], a, corZ, corU, tau)
# whichMin <- which.min(gammas)
# gamma <- gammas[whichMin]
# # the following computations are not necessary in the last iteration
# if(k < sMax[1]-1) {
# # update the scale of the current response
# sigma <- sqrt(1 - 2*gamma*r[k]/a + gamma^2)
# # update the fitted values from the new or not yet sequenced
# # predictor groups
# zHat[, inactive] <- (zHat[, inactive] - gamma * uHat) / sigma
# r[k+1] <- (r[k] - gamma * a) / sigma
# corZ <- corZ[-whichMin]
# corU <- corU[-whichMin]
# tau <- tau[-whichMin]
# corZ = sqrt(corZ^2 - 2*gamma*corU*corZ + gamma^2 * tau^2) / sigma
# }
# # update active set and not yet sequenced groups
# active <- c(active, inactive[whichMin]) # update active set
# inactive <- inactive[-whichMin] # update not yet sequenced groups
# }
## call C++ function
active <- .Call("R_fastGrplars", R_x=xs, R_y=z, R_sMax=sMax,
R_assign=assignList, R_ncores=ncores,
PACKAGE="robustHD")
## choose optimal model according to specified criterion
if(fit) {
# add ones to matrix of predictors to account for intercept
x <- addIntercept(x)
# call function to fit models along the sequence
out <- seqModel(x, y, active=active, sMin=s[1], sMax=s[2],
assign=assignList, robust=robust, regFun=regFun,
useFormula=regControl$useFormula, regArgs=regArgs,
crit=crit, cl=cl)
# add center and scale estimates
out[c("muX", "sigmaX", "muY", "sigmaY")] <- list(muX, sigmaX, muY, sigmaY)
# add model data to result if requested
if(isTRUE(model)) out[c("x", "y")] <- list(x=x, y=y)
out$assign <- assign # this is helpful for some methods
out$robust <- robust
if(robust) out$w <- w # add data cleaning weights
out$call <- call # add call to return object
class(out) <- c("grplars", class(out))
out
} else active
}
# # find possible step sizes for groupwise LARS by solving quadratic equation
# findStepSizes <- function(r, a, corY, corU, tau) {
# mapply(function(corY, corU, tau, r, a) {
# # quadratic equation to be solved
# comp <- c(r^2 - corY^2, 2 * (corU*corY - a*r), a^2 - tau^2)
# # solution of quadratic equation
# gamma <- Re(polyroot(comp))
# solution <- min(gamma[gamma >= 0])
# solution
# }, corY, corU, tau, MoreArgs=list(r=r, a=a))
# }
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.