#' MAIL
#'
#' \code{MAIL} runs the Model-Averaged Inferential Learning method under different parameter settings
#'
#' The most important choice is whether or not use data splitting.
#' The advantage of data splitting is to mitigate post selection changes to inference.
#' The advantage of using all of the data is to reduce bias.
#'
#' @param XMat a n by p numeric matrix
#' @param yVec a n by 1 numeric vector
#' @param splitOption Mandatory - can take the values "Full" or "Split"
#' @param firstSOILWeightType Mandatory - can take values "AIC", "BIC" or "ARM"
#' @param smallestModelWeightType Mandatory - can take values "AIC", "BIC" or "ARM"
#' @param firstSOILPsi Mandatory - can take any value in [0,1]
#' @param smallestModelPsi Mandatory - can take any value in [0,1]
#' @param numSelectionIter Optional - defaults to 10, must be an integer >= 1
#' @param sigma2EstFunc Mandatory - this is a string of the function that will estimate the error variance using only XMat and yVec. We recommend using "LPM_AIC_CV_50Split". If the error variance is known, use "trueValue" here.
#' @param trueSD Optional unless "trueValue" has given to the previous argument. This is where the user gives the assumed error standard deviation.
#' @param verbose Optional: default is FALSE - set to TRUE if you want to see printed messages about MAIL's progress.
#'
#' @seealso \code{\link{MAIL_Full}} and \code{\link{MAIL_Split}} for specific versions
#' @seealso \code{\link{LPM_AIC_CV_50Split}} for the recommended variance estimation method
#'
#' @export
MAIL = function(XMat,yVec,
splitOption,
firstSOILWeightType,
smallestModelWeightType,
firstSOILPsi,
smallestModelPsi,
numSelectionIter=10,
sigma2EstFunc,
trueSD=NULL,
verbose=FALSE) {
# 1) select variables: high vs low value on the right dataset
# 2) calculate weights
# 3) construct a candidate matrix
if (verbose==TRUE) {
print("Step 1: Organize Data")
}
N = dim(XMat)[1]
p = dim(XMat)[2]
if (splitOption == "Split") {
expInds = sample(1:N,size=floor(N/2),replace=FALSE)
xExp = XMat[expInds,]
xCon = XMat[-1*expInds,]
yExp = yVec[expInds]
yCon = yVec[-1*expInds]
NExp = dim(xExp)[1]
pExp = dim(xExp)[2]
NCon = dim(xCon)[1]
pCon = dim(xCon)[2]
}
else {
xExp = XMat
xCon = XMat
yExp = yVec
yCon = yVec
NExp = dim(xExp)[1]
pExp = dim(xExp)[2]
NCon = dim(xCon)[1]
pCon = dim(xCon)[2]
}
if (verbose == TRUE) {
print("Step 2: Run First Model Average")
}
allSOILScores = rep(0,times=p) ##
for (i in 1:numSelectionIter) {
if (verbose == TRUE) {
print(sprintf("\tStep 2: Iteration %d",i))
}
if (firstSOILWeightType != "ARM") {
soilRes = SOIL(x=xExp,y=yExp,
weight_type=firstSOILWeightType,
psi=firstSOILPsi,family="gaussian",method="union")
}
else {
soilRes = SOIL(x=xExp,y=yExp,
weight_type = "ARM",
psi=firstSOILPsi,family="gaussian",method="union",
n_train = ceiling(NExp/2)+4)
}
allSOILScores = allSOILScores + as.numeric(soilRes$importance)
}
allSOILScores = allSOILScores / numSelectionIter
if (verbose == TRUE) {
print("Step 3: Select Variables for the Nested Candidate Set")
}
numModels = min(c(floor(dim(xExp)[1]/2),floor(dim(xExp)[2]/2)))
soilCutoff = sort(allSOILScores,decreasing=TRUE)[numModels]
selectedSet = which(allSOILScores >= soilCutoff)
if (length(selectedSet) > numModels) {
selectedSet = selectedSet[order(allSOILScores[selectedSet],decreasing=TRUE)[1:numModels]]
}
origSelectedSet = selectedSet
selectedSet = removeUnidentCols(selectedSet,xCon)
numSelected = length(selectedSet)
if (numSelected > 0) {
selectedSOILScores = allSOILScores[selectedSet]
selectedSetSorted = selectedSet[order(selectedSOILScores,decreasing=TRUE)]
### create the candidate matrix
if (verbose == TRUE) {
print("Step 4: Create Candidate Set, and Smallest Model")
}
candMat = matrix(0,nrow=numSelected,ncol=p)
tempVarSet = c()
for (i in 1:numSelected) {
tempVarSet = c(tempVarSet,selectedSetSorted[i])
candMat[i,tempVarSet] = 1
}
origCandMat <- candMat
reRunSOIL_SmallestModel = SOIL(x=xExp,y=yExp,family="gaussian",weight_type=smallestModelWeightType,
psi=smallestModelPsi,n_train_bound = numModels + 2,
n_train = numModels + 4,
candidate_models = candMat,method="customize")
minInd = which.max(reRunSOIL_SmallestModel$weight)
maxInd = dim(candMat)[1]
if (verbose == TRUE) {
print("Step 5: Estimate sigma^2")
}
##### Variance Estimation
if (sigma2EstFunc != "trueValue") {
tempSigma2Func = get(sigma2EstFunc)
estSigma2 = tempSigma2Func(XMat,yVec)
}
else {
estSigma2 = trueSD^2
}
### now that we have an estimate of sigma^2,
### we can throw out the obviously wrong models that are too large
### choose the number of variables as "largestIndex" -
### in other words choose min AIC-corrected as the cutoff
if (verbose == TRUE) {
print("Step 6: Estimate Final Weights")
}
candMat = candMat[minInd:maxInd,]
if (minInd == maxInd) {
candMat = matrix(candMat,nrow=1)
modelWeight = 1
}
else {
if (firstSOILWeightType != "ARM") {
finalSOIL = SOIL(x=xExp,y=yExp,family="gaussian",weight_type=firstSOILWeightType,
psi=firstSOILPsi,
candidate_models = candMat,method="customize")
}
else {
finalSOIL = SOIL(x=xExp,y=yExp,family="gaussian",weight_type="ARM",
psi=firstSOILPsi,n_train = ceiling(NExp/2)+4,
candidate_models = candMat,method="customize")
}
modelWeight = finalSOIL$weight
}
if (verbose == TRUE) {
print("Step 7: Get MAIL Estimates and CI's")
}
mailOutputs = mailStep7(candMat,selectedSet,xCon,yCon,modelWeight,estSigma2)
}
else { # no variables were selected
selectedSOILScores <- NULL
selectedSetSorted <- NULL
soilRes <- NULL
### create the candidate matrix
if (verbose == TRUE) {
print("Step 4: No Variables Selected, Return Empty Candidate Set")
}
candMat <- NULL
origCandMat <- NULL
reRunSOIL_SmallestModel <- NULL
if (verbose == TRUE) {
print("Step 5: Estimate sigma^2")
}
##### Variance Estimation
if (sigma2EstFunc != "trueValue") {
tempSigma2Func = get(sigma2EstFunc)
estSigma2 = tempSigma2Func(XMat,yVec)
}
else {
estSigma2 = trueSD^2
}
if (verbose == TRUE) {
print("Step 6: No Variables Selected, Do Not Estimate Final Weights")
}
modelWeight <- NULL
if (verbose == TRUE) {
print("Step 7: No Variables Selected, Do Not Calculate MAIL Estimates and CI's")
}
tempVarVec <- NULL
tempCI <- NULL
betaHatMA <- rep(0,times=p)
}
resList <- list(tempCI = mailOutputs$tempCI,
selectedSet = selectedSet,
margVar = mailOutputs$margVar,
betaHat = mailOutputs$betaHatMA,
modelWeight = modelWeight,
estSigma2 = estSigma2,
candMat = candMat,
origSelectedSet = origSelectedSet,
reRunSOIL_SmallestModel = reRunSOIL_SmallestModel,
origCandMat = origCandMat,
origSOILRes = soilRes)
return(resList)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.