Nothing
# RAI: Outermost Function; Only Exported Function ------------------------------
#' @name RAI
#' @title Main function for Revisiting Alpha-Investing (RAI) regression.
#'
#' @description The function rai is a wrapper that creates and manages the
#' inputs and outputs of the \code{\link{runAuction}} function. Using
#' poly=FALSE is an efficient and statistically valid way to run and terminate
#' stepwise regression. The function prepareData is provided in order to make
#' generating predictions on test data easier: it is used by rai to process
#' the data prior to running, and is necessary to make column names and
#' information match in order to use the model object returned by rai.
#'
#' @details Missing values are treated as follows: all observations with missing
#' values in theResponse are removed; numeric columns in theData have missing
#' values imputed by the mean of the column and an indicator column is added
#' to note missingness; missing values in factor or binary columns are given
#' the value "NA", which creates an additional group for missing values. Note
#' that as rai is run using the output of model.matrix, it is not guaranteed
#' that all categories from a factor are included in the regression. Column
#' names may also be modified to be syntactically valid. The model object can
#' be used to generate predictions on test data. Note that if default
#' conversions were used when running rai, then they must be used again with
#' prepareData for the test data prior to producing predictions.
#'
#' @param theData matrix of covariates.
#' @param theResponse response vector or single column matrix.
#' @param alpha level of procedure.
#' @param alg algorithm can be one of "rai", "raiPlus", or "RH" (Revisiting
#' Holm).
#' @param r threshold parameter, with 0 < r < 1. RAI rejects tests which
#' increase remaining R^2 by a factor r^s, where s is the epoch. Larger values
#' of r yield a closer approximation to stepwise regression.
#' @param poly logical. Should the algorithm look for higher-order polynomials?
#' @param startDeg This is the starting degree for polynomial regression. It
#' allows the search to start with lower order polynomials such as square
#' roots. This alleviates some problems with high-dimensional polynomials as a
#' 4th degree polynomial where startDeg=1/2 is only a quadratic on the
#' original scale.
#' @param searchType A character string specifying the prioritization of
#' higher-order polynomials. One of "breadth" (more base features) or "depth"
#' (higher orders).
#' @param m number of observations used in subsampling for variance inflation
#' factor estimate of r.squared. Set m=Inf to use full data.
#' @param sigma type of error estimate used; one of "ind" or "step". If "ind",
#' you must provide a numeric value for rmse and df.
#' @param rmse user provided value for rmse. Must be used with sigma="ind".
#' @param df degrees of freedom for user specified rmse. Must be used with
#' sigma="ind".
#' @param omega return from rejecting a test in Alpha-Investing (<= alpha).
#' @param reuse logical. Should repeated tests of the same covariate be
#' considered a test of the same hypothesis? reusing wealth isn't implemented
#' for RAI or RAIplus as the effect is negligible.
#' @param maxTest maximum number of tests.
#' @param verbose logical. Should auction output be printed?
#' @param save logical. Should the auction results be saved? If TRUE, returns a
#' summary matrix.
#' @param lmFit The core function that will be used to estimate linear model
#' fits. The default is .lm.fit, but other alternatives are possible. Note
#' that it does not use formula notation as this is costly. Another
#' recommended option is fastLmPure from RcppEigen or related packages.
#' @param x an R object.
#' @return A list which includes the following components: \item{y}{response.}
#' \item{X}{model matrix from final model.} \item{formula}{final model
#' formula.} \item{features}{list of interactions included in formula.}
#' \item{summary}{if save=TRUE, contains information on each test made by the
#' algorithm.} \item{time}{run time.} \item{options}{options given to RAI:
#' alg, searchType, poly, r, startDeg, alpha, omega, m.} \item{subData}{subset
#' of columns from theData that are used in the final model.}
#' \item{model}{linear model object using selected model} Summary and predict
#' methods are provided in order to generate further output and graphics.
#' @examples
#' data("CO2")
#' theResponse = CO2$uptake
#' theData = CO2[ ,-5]
#' rai_out = rai(theData, theResponse)
#' summary(rai_out) # summary information including graphs
#' @importFrom stats lm model.matrix pt qt resid var sd .lm.fit
#' @export
prepareData = function(theData, poly=TRUE, startDeg=1) {
if (any(is.na(theData))) {
warning("Missing values deteted; applying default conversions and exclusions.
See documentation for details or remove missing values manually.")
theData = modMissingData(as.data.frame(theData))
}
theData = model.matrix(~. - 1, data=as.data.frame(theData))
colnames(theData) = make.names(colnames(theData))
if (poly && startDeg!=1) {
theData = apply(theData, 2, function(col) col^startDeg)
}
theData
}
modMissingCol = function(col) {
if (is.numeric(col) && length(unique(col)) > 2) { # numeric
missing = is.na(col)
col[missing] = mean(col, na.rm=TRUE)
list(col, missing)
} else { # categorical
col[is.na(col)] = "NA"
list(col)
}
}
modMissingData = function(dataList) {
out = lapply(dataList,
function(el) if (any(is.na(el))) modMissingCol(el) else list(el))
lengths = lapply(out, length)
dataMod = as.data.frame(out)
dfNames = mapply(
function(len, name) if (len==1) name else c(name, paste0(name, "_NA")),
lengths, names(dataList)
)
names(dataMod) = unlist(dfNames)
dataMod
}
#' @name RAI
#' @export
is.rai = function(x) inherits(x, "rai")
#' @name RAI
#' @export
rai = function(theData, theResponse, alpha=.1, alg="rai", r=.8, poly=alg!="RH",
startDeg=1, searchType="breadth", m=500, sigma="step", rmse = NA,
df=NA, omega=alpha, reuse=(alg=="RH"), maxTest=Inf, verbose=FALSE,
save=TRUE, lmFit = .lm.fit) {
stopifnot(searchType %in% c("breadth", "depth") ||
sigma %in% c("ind", "step") ||
alg %in% c("rai", "raiPlus", "RH"))
if (poly && alg == "RH") { stop("Cannot do polynomial regression with RH.") }
if (reuse && alg != "RH") { stop("RAI does not reuse wealth.") }
missResponse = is.na(theResponse)
theResponse = as.matrix(theResponse[!missResponse], ncol=1)
theData = prepareData(theData[!missResponse, ], poly, startDeg)
if (sigma == "ind") {
stopifnot(is.numeric(rmse) && is.numeric(df))
} else {
rmse = sd(theResponse)
df = nrow(theResponse) - 1
}
gWealth = gWealthStep(alpha, alg, r, var(theResponse)*(nrow(theResponse)-1),
ncol(theData), reuse, rmse, df)
experts = list(makeStepwiseExpert(gWealth, ncol(theData)))
timeStart = Sys.time()
aucOut = runAuction(experts, gWealth, theData, theResponse,
alg, poly, searchType, m, sigma, omega, reuse, maxTest,
verbose, save, lmFit)
aucOut$time = Sys.time() - timeStart
aucOut$options = list(alg=alg, searchType=searchType, poly=poly, r=r,
startDeg=startDeg, alpha=alpha, omega=omega, m=m)
aucOut$subData = theData[,sort(unique(unlist(aucOut$features)))]
aucOut$model = lm(aucOut$formula, data.frame(y=theResponse, aucOut$subData))
class(aucOut) = "rai"
aucOut
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.