#' read application labels off a split plan.
#'
#' @param nRow number of rows in original data.frame.
#' @param plan split plan
#' @return vector of labels
#'
#' @seealso \code{\link{kWayCrossValidation}}, \code{\link{kWayStratifiedY}}, and \code{\link{makekWayCrossValidationGroupedByColumn}}
#'
#' @examples
#'
#' plan <- kWayStratifiedY(3,2,NULL,NULL)
#' getSplitPlanAppLabels(3,plan)
#'
#' @export
getSplitPlanAppLabels <- function(nRow,plan) {
labels <- numeric(nRow)
for(i in seq_len(length(plan))) {
labels[plan[[i]]$app] <- i
}
labels
}
#' check if appPlan is a good carve-up of 1:nRows into nSplits groups
#'
#' @param nRows number of rows to carve-up
#' @param nSplits number of sets to carve-up into
#' @param appPlan carve-up to critique
#' @param strictCheck logical, if true expect application data to be a carve-up and training data to be a maximal partition and to match nSplits.
#' @return problem with carve-up (null if good)
#'
#' @seealso \code{\link{kWayCrossValidation}}, \code{\link{kWayStratifiedY}}, and \code{\link{makekWayCrossValidationGroupedByColumn}}
#'
#' @examples
#'
#' plan <- kWayStratifiedY(3,2,NULL,NULL)
#' problemAppPlan(3,3,plan,TRUE)
#'
#' @export
problemAppPlan <- function(nRows,nSplits,appPlan,strictCheck) {
if(is.null(appPlan)) {
return("appPlan was null")
}
if(!is.list(appPlan)) {
return("appPlan needs be a list")
}
if((strictCheck)&&(nRows>1)&&(!is.null(nSplits))) {
if(length(appPlan)!=nSplits) {
return("didn't get requested number of groups in appPlan")
}
}
fullSeq <- seq_len(nRows)
seen <- c()
for(i in seq_len(length(appPlan))) {
si <- appPlan[[i]]
if(!is.list(si)) {
return("non list element in app plan")
}
ti <- si$train
if(is.null(ti)) {
return("missing train slot")
}
if(length(setdiff(ti,fullSeq))!=0) {
return("unexpected symbols in train slot")
}
ai <- si$app
if(is.null(ai)) {
return("missing app slot")
}
if(length(setdiff(ai,fullSeq))!=0) {
return("unexpected symbols in application slot")
}
if(nRows>1) {
if(length(intersect(ti,ai))!=0) {
return("train and application slots overlap")
}
}
if(strictCheck) {
if(length(setdiff(fullSeq,union(ai,ti)))>0) {
return("non-maximal training set")
}
if(length(intersect(seen,ai))!=0) {
return("repeated application row")
}
}
seen <- union(seen,ai)
}
if(strictCheck) {
if(length(seen)!=nRows) {
return("not all rows appeared in application")
}
}
NULL
}
#' One way holdout, a splitFunction in the sense of vtreat::buildEvalSets.
#'
#' Note one way holdout can leak target expected values, so it should not
#' be preferred in nested modeling situations.
#' Also, doesn't respect nSplits.
#'
#' @param nRows number of rows to split (integer >1).
#' @param nSplits number of groups to split into (ignored).
#' @param dframe original data frame (ignored).
#' @param y numeric outcome variable (ignored).
#' @return split plan
#'
#' @examples
#'
#' oneWayHoldout(3,NULL,NULL,NULL)
#'
#' @export
oneWayHoldout <- function(nRows,nSplits,dframe,y) {
if(nRows<=1) {
return(NULL)
}
fullSeq <- seq_len(nRows)
evalSets <- lapply(as.list(fullSeq),
function(appi) {
list(train=setdiff(fullSeq,appi),app=appi)
})
attr(evalSets,'splitmethod') <- 'oneway'
evalSets
}
#' k-fold cross validation, a splitFunction in the sense of vtreat::buildEvalSets
#'
#' @param nRows number of rows to split (>1).
#' @param nSplits number of groups to split into (>1,<=nRows).
#' @param dframe original data frame (ignored).
#' @param y numeric outcome variable (ignored).
#' @return split plan
#'
#' @examples
#'
#' kWayCrossValidation(7,2,NULL,NULL)
#'
#' @export
kWayCrossValidation <- function(nRows,nSplits,dframe,y) {
if((nRows<=1)||(nSplits<=1)||(nSplits>nRows)) {
return(NULL)
}
fullSeq <- seq_len(nRows)
perm <- sample.int(nRows,nRows,replace=FALSE)
evalSets <- lapply(split(perm,1 + (fullSeq %% nSplits)),
function(appi) {
list(train=setdiff(fullSeq,appi),app=appi)
})
names(evalSets) <- NULL
attr(evalSets,'splitmethod') <- 'kwaycross'
evalSets
}
#' k-fold cross validation stratified on y, a splitFunction in the sense of vtreat::buildEvalSets
#'
#' @param nRows number of rows to split (>1)
#' @param nSplits number of groups to split into (<nRows,>1).
#' @param dframe original data frame (ignored).
#' @param y numeric outcome variable try to have equidistributed in each split.
#' @return split plan
#'
#' @examples
#'
#' set.seed(23255)
#' d <- data.frame(y=sin(1:100))
#' pStrat <- kWayStratifiedY(nrow(d),5,d,d$y)
#' problemAppPlan(nrow(d),5,pStrat,TRUE)
#' d$stratGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pStrat)
#' pSimple <- kWayCrossValidation(nrow(d),5,d,d$y)
#' problemAppPlan(nrow(d),5,pSimple,TRUE)
#' d$simpleGroup <- vtreat::getSplitPlanAppLabels(nrow(d),pSimple)
#' summary(tapply(d$y,d$simpleGroup,mean))
#' summary(tapply(d$y,d$stratGroup,mean))
#'
#'
#'
#' @export
kWayStratifiedY <- function(nRows,nSplits,dframe,y) {
if((nRows<=1)||(nSplits<=1)||(nSplits>nRows)) {
return(NULL)
}
if(is.null(y)||(length(unique(y))<=1)) {
return(kWayCrossValidation(nRows,nSplits,NULL,NULL))
}
fullSeq <- seq_len(nRows)
d <- data.frame(index=fullSeq,y=y)
# initial permutation in case y has large constant blocks
d <- d[order(sample.int(nRows,nRows,replace=FALSE)),]
# order by y
d <- d[order(d$y),]
# mix within order segments
rows_per_split <- ceiling(nRows/nSplits)
mix_idx <- vector("list", rows_per_split)
for(si in seq_len(rows_per_split)) {
leftI <- si*nSplits - (nSplits-1)
rightI <- min(si*nSplits,nRows)
widthI <- 1+rightI-leftI
if(widthI>1) {
oldIndices <- leftI:rightI
mix_idx[[si]] <- oldIndices[sample.int(widthI,widthI,replace=FALSE)]
} else {
mix_idx[[si]] <- leftI:rightI
}
}
d[unlist(mix_idx),] <- d
d$group <- (fullSeq %% nSplits) + 1
carveUp <- split(d$index,d$group)
evalSets <- lapply(carveUp,
function(appi) {
list(train=setdiff(fullSeq,appi),app=appi)
})
names(evalSets) <- NULL
attr(evalSets,'splitmethod') <- 'kwaycrossystratified'
evalSets
}
#' k-fold cross validation stratified with replacement on y, a splitFunction in the sense of vtreat::buildEvalSets .
#'
#' Build a k-fold cross validation sample where training sets are the same size as the original data,
#' and built by sampling disjoint from test/application sets (sampled with replacement).
#'
#' @param nRows number of rows to split (>1)
#' @param nSplits number of groups to split into (<nRows,>1).
#' @param dframe original data frame (ignored).
#' @param y numeric outcome variable try to have equidistributed in each split.
#' @return split plan
#'
#' @examples
#'
#' set.seed(23255)
#' d <- data.frame(y=sin(1:100))
#' pStrat <- kWayStratifiedYReplace(nrow(d),5,d,d$y)
#'
#' @export
kWayStratifiedYReplace <- function(nRows,nSplits,dframe,y) {
if((nRows<=1)||(nSplits<=1)||(nSplits>nRows)) {
return(NULL)
}
if(is.null(y)||(length(unique(y))<=1)) {
return(kWayCrossValidation(nRows,nSplits,NULL,NULL))
}
fullSeq <- seq_len(nRows)
d <- data.frame(index=fullSeq,y=y)
# initial permutation in case y has large constant blocks
d <- d[order(sample.int(nRows,nRows,replace=FALSE)),]
# order by y
d <- d[order(d$y),]
# mix within order segments
rows_per_split <- ceiling(nRows/nSplits)
mix_idx <- vector("list", rows_per_split)
for(si in seq_len(rows_per_split)) {
leftI <- si*nSplits - (nSplits-1)
rightI <- min(si*nSplits,nRows)
widthI <- 1+rightI-leftI
if(widthI>1) {
oldIndices <- leftI:rightI
mix_idx[[si]] <- oldIndices[sample.int(widthI,widthI,replace=FALSE)]
} else {
mix_idx[[si]] <- leftI:rightI
}
}
d[unlist(mix_idx),] <- d
d$group <- (fullSeq %% nSplits) + 1
carveUp <- split(d$index,d$group)
evalSets <- lapply(carveUp,
function(appi) {
trainIdxs <- setdiff(fullSeq,appi)
si <- sample.int(length(trainIdxs), nRows, replace=TRUE)
list(train=trainIdxs[si], app=appi)
})
names(evalSets) <- NULL
attr(evalSets,'splitmethod') <- 'kwaycrossystratifiedreplace'
evalSets
}
#' Build a k-fold cross validation splitter, respecting (never splitting) groupingColumn.
#'
#' @param groupingColumnName name of column to group by.
#' @return splitting function in the sense of vtreat::buildEvalSets.
#'
#' @examples
#'
#' d <- data.frame(y=sin(1:100))
#' d$group <- floor(seq_len(nrow(d))/5)
#' splitter <- makekWayCrossValidationGroupedByColumn('group')
#' split <- splitter(nrow(d),5,d,d$y)
#' d$splitLabel <- vtreat::getSplitPlanAppLabels(nrow(d),split)
#' rowSums(table(d$group,d$splitLabel)>0)
#'
#' @export
makekWayCrossValidationGroupedByColumn <- function(groupingColumnName) {
force(groupingColumnName)
function(nRows,nSplits,dframe,y) {
if((nRows<=1)||(nSplits<=1)||(nSplits>nRows)) {
return(NULL)
}
d <- data.frame(index=seq_len(nRows),
group=as.character(dframe[[groupingColumnName]]),
stringsAsFactors=FALSE)
groups <- sort(unique(d$group))
groupedPlan <- NULL
if((!is.null(y))&&(length(unique(y))>1)) { # try for y-stratification
d$y <- y
groupedY <- aggregate(y~group,data=d,FUN=mean)$y
groupedPlan <- kWayStratifiedY(length(groups),nSplits,NULL,groupedY)
}
if(is.null(groupedPlan)) {
groupedPlan <- kWayCrossValidation(length(groups),nSplits,NULL,NULL)
}
if(is.null(groupedPlan)) {
return(NULL)
}
splitmethod <- attr(groupedPlan,'splitmethod')
carveUp <- lapply(groupedPlan,
function(gi) {
d$index[d$group %in% groups[gi$app]]
})
fullSeq <- seq_len(nRows)
evalSets <- lapply(carveUp,
function(appi) {
list(train=setdiff(fullSeq,appi),app=appi)
})
names(evalSets) <- NULL
attr(evalSets,'splitmethod') <- paste0(splitmethod,'grouped')
evalSets
}
}
#' Build set carve-up for out-of sample evaluation.
#'
#' Return a carve-up of seq_len(nRows). Very useful for any sort of
#' nested model situation (such as data prep, stacking, or super-learning).
#'
#' Also sets attribute "splitmethod" on return value that describes how the split was performed.
#' attr(returnValue,'splitmethod') is one of: 'notsplit' (data was not split; corner cases
#' like single row data sets), 'oneway' (leave one out holdout), 'kwaycross' (a simple
#' partition), 'userfunction' (user supplied function was actually used), or a user specified attribute.
#' Any user
#' desired properties (such as stratification on y, or preservation of groups designated by
#' original data row numbers) may not apply unless you see that 'userfunction' has been
#' used.
#'
#' The intent is the user splitFunction only needs to handle "easy cases"
#' and maintain user invariants. If the user splitFunction returns NULL,
#' throws, or returns an unacceptable carve-up then vtreat::buildEvalSets
#' returns its own eval set plan. The signature of splitFunction should
#' be splitFunction(nRows,nSplits,dframe,y) where nSplits is the number of
#' pieces we want in the carve-up, nRows is the number of rows to split,
#' dframe is the original dataframe (useful for any group control variables),
#' and y is a numeric vector representing outcome (useful for outcome stratification).
#'
#' Note that buildEvalSets may not always return a partition (such
#' as one row dataframes), or if the user split function chooses to make rows eligible for
#' application a different number of times.
#'
#' @param nRows scalar, >=1 number of rows to sample from.
#' @param ... no additional arguments, declared to forced named binding of later arguments.
#' @param dframe (optional) original data.frame, passed to user splitFunction.
#' @param y (optional) numeric vector, outcome variable (possibly to stratify on), passed to user splitFunction.
#' @param splitFunction (optional) function taking arguments nSplits,nRows,dframe, and y; returning a user desired split.
#' @param nSplits integer, target number of splits.
#' @return list of lists where the app portion of the sub-lists is a disjoint carve-up of seq_len(nRows) and each list as a train portion disjoint from app.
#'
#' @seealso \code{\link{kWayCrossValidation}}, \code{\link{kWayStratifiedY}}, and \code{\link{makekWayCrossValidationGroupedByColumn}}
#'
#' @examples
#'
#' # use
#' buildEvalSets(200)
#'
#' # longer example
#' # helper fns
#' # fit models using experiment plan to estimate out of sample behavior
#' fitModelAndApply <- function(trainData,applicaitonData) {
#' model <- lm(y~x,data=trainData)
#' predict(model,newdata=applicaitonData)
#' }
#' simulateOutOfSampleTrainEval <- function(d,fitApplyFn) {
#' eSets <- buildEvalSets(nrow(d))
#' evals <- lapply(eSets,
#' function(ei) { fitApplyFn(d[ei$train,],d[ei$app,]) })
#' pred <- numeric(nrow(d))
#' for(eii in seq_len(length(eSets))) {
#' pred[eSets[[eii]]$app] <- evals[[eii]]
#' }
#' pred
#' }
#'
#' # run the experiment
#' set.seed(2352356)
#' # example data
#' d <- data.frame(x=rnorm(5),y=rnorm(5),
#' outOfSampleEst=NA,inSampleEst=NA)
#'
#' # fit model on all data
#' d$inSampleEst <- fitModelAndApply(d,d)
#' # compute in-sample R^2 (above zero, falsely shows a
#' # relation until we adjust for degrees of freedom)
#' 1-sum((d$y-d$inSampleEst)^2)/sum((d$y-mean(d$y))^2)
#'
#' d$outOfSampleEst <- simulateOutOfSampleTrainEval(d,fitModelAndApply)
#' # compute out-sample R^2 (not positive,
#' # evidence of no relation)
#' 1-sum((d$y-d$outOfSampleEst)^2)/sum((d$y-mean(d$y))^2)
#'
#' @export
buildEvalSets <- function(nRows,...,
dframe=NULL,
y=NULL,
splitFunction=NULL,
nSplits=3) {
wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::buildEvalSets")
# check args
if(nSplits<2) {
stop("vtreat::buildEvalSets must have nSplits>=2")
}
if(!is.null(y)) {
if(!is.numeric(y)) {
stop('vtreat:buildEvalSets must have is.numeric(y)')
}
if(length(y)!=nRows) {
stop('vtreat:buildEvalSets must have length(y)==nRows')
}
}
if(!is.null(dframe)) {
if(!is.data.frame(dframe)) {
stop('vtreat:buildEvalSets must have is.data.frame(dframe)')
}
if(nrow(dframe)!=nRows) {
stop('must have nrow(dframe)==nRows')
}
}
# try user carve-up function
if(!is.null(splitFunction)) {
tryCatch({
evalSets <- splitFunction(nRows=nRows,nSplits=nSplits,dframe=dframe,y=y)
problem <- problemAppPlan(nRows,nSplits,evalSets,FALSE)
if(is.null(problem)) {
if(is.null(attr(evalSets,'splitmethod'))) {
attr(evalSets,'splitmethod') <- 'userfunction'
}
return(evalSets)
} else {
warning(paste("vtreat::buildEvalSets user carve-up rejected: ",problem))
}
},
error = function(e) warning(paste('vtreat::buildEvalSets caught ',
as.character(e),'from user splitFunction')
))
}
# deal with it ourselves if we have to
fullSeq <- seq_len(nRows)
# okay, we will carve-up on our own
if(2*nSplits>nRows) {
# one corner case
if(nRows<=1) {
# no split plan possible
evalSets <- list(list(train=fullSeq,app=fullSeq))
attr(evalSets,'splitmethod') <- 'notsplit'
} else {
# not necissarilly number of splits the user requested
evalSets <- kWayCrossValidation(nRows=nRows,nSplits= min(nSplits, nRows),dframe=NULL,y=NULL)
problem <- problemAppPlan(nRows,min(nSplits, nRows),evalSets,TRUE)
if(!is.null(problem)) {
stop(paste("problem with vtreat::buildEvalSets",problem))
}
}
} else {
# know 2*nSplits<=nRows
if((!is.null(y))&&(max(y)>min(y))) {
# Try for full y-stratified k-way cross val
evalSets <- kWayStratifiedY(nRows=nRows,nSplits=nSplits,dframe=NULL,y=y)
} else {
evalSets <- kWayCrossValidation(nRows=nRows,nSplits=nSplits,dframe=NULL,y=NULL)
}
problem <- problemAppPlan(nRows,nSplits,evalSets,TRUE)
if(!is.null(problem)) {
stop(paste("problem with vtreat::buildEvalSets",problem))
}
}
evalSets
}
# make a "cross frame" that is a frame where each treated row was treated only
# by a treatment plan not involving the given row
.mkCrossFrame <- function(
...,
dframe,
referenceTreatments,
varlist, newVarsS, outcomename, zoY,
zC, zTarget,
weights,
minFraction, smFactor,
rareCount, rareSig,
collarProb,
codeRestriction,
customCoders,
scale, doCollar,
splitFunction, nSplits,
catScaling,
parallelCluster = NULL,
use_parallel = TRUE,
verbose = FALSE,
missingness_imputation, imputation_map) {
wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::.mkCrossFrame")
dsub <- dframe[,c(varlist,outcomename),drop=FALSE]
# build a carve-up plan
evalSets <- buildEvalSets(length(zoY),dframe=dframe,y=zoY,
splitFunction=splitFunction,nSplits=nSplits)
crossFrameList <- vector('list',length(evalSets))
wtList <- vector('list',length(evalSets))
rList <- vector('list',length(evalSets))
foundCols <- c() # can include outcome
for(ei in seq_len(length(evalSets))) {
evalIndices <- evalSets[[ei]]$app
buildIndices <- evalSets[[ei]]$train
dsubiEval <- dsub[evalIndices, , drop=FALSE]
dsubiBuild <- dsub[buildIndices, , drop=FALSE]
zoYBuild <- zoY[buildIndices]
zCBuild <- c()
if(!is.null(zC)) {
zCBuild <- zC[buildIndices]
}
wBuild <- weights[buildIndices]
ti <- .designTreatmentsXS(
dframe = dsubiBuild,
varlist = varlist,
outcomename = outcomename,
zoY = zoYBuild,
zC = zCBuild,
zTarget = zTarget,
weights = wBuild,
minFraction = minFraction,
smFactor = smFactor,
rareCount = rareCount,
rareSig = rareSig,
collarProb = collarProb,
codeRestriction = codeRestriction,
customCoders = customCoders,
justWantTreatments = TRUE,
catScaling = catScaling,
verbose = verbose,
parallelCluster = parallelCluster,
use_parallel = use_parallel,
missingness_imputation = missingness_imputation, imputation_map = imputation_map)
fi <- .vtreatList(ti,dsubiEval,newVarsS,
scale = scale,
doCollar = doCollar,
parallelCluster = parallelCluster,
use_parallel = use_parallel)
# fill in missing columns (a data leak potential, but a necessary step)
droppedColumns <- setdiff(newVarsS,c(outcomename,colnames(fi)))
if(length(droppedColumns)>0) {
repFrame <- NULL
if(!is.null(referenceTreatments)) {
repFrame <- prepare(referenceTreatments,dsubiEval,
pruneSig=NULL,
varRestriction=droppedColumns,
scale=scale,doCollar=doCollar,
parallelCluster=parallelCluster,
use_parallel = use_parallel)
}
for(v in droppedColumns) {
fi[[v]] <- 0.0
if((!is.null(repFrame)) && (v %in% colnames(repFrame))) {
fi[[v]] <- repFrame[[v]]
}
}
}
# make sure each frame has the same column structure (again a data leak)
fi[[outcomename]] <- dsubiEval[[outcomename]]
if(ei<=1) {
foundCols <- colnames(fi)
} else {
foundCols <- intersect(foundCols,colnames(fi))
}
crossFrameList[[ei]] <- fi
wtList[[ei]] <- weights[evalIndices]
rList[[ei]] <- evalIndices
}
# make sure each frame has the same column structure
lostVars <- setdiff(newVarsS,foundCols)
if(length(lostVars)>0) {
warning(paste('cross frame procedures lost variables: ',
paste(lostVars,collapse=', '),
'(likely did not vary on one data carve-up)'))
}
crossFrameList <- lapply(crossFrameList,
function(fi) {
fi[,foundCols,drop=FALSE]
})
# assemble frame
crossFrame <- .rbindListOfFrames(crossFrameList)
scoreWeights <- unlist(wtList)
rowList <- unlist(rList)
if((length(rowList)==nrow(dframe))&&
all(sort(rowList)==(1:nrow(dframe)))&&
(!all(rowList==(1:nrow(dframe))))) {
# undo permuation
crossFrame[rowList,] <- crossFrame
scoreWeights[rowList] <- scoreWeights[rowList]
}
list(crossFrame=crossFrame,crossWeights=scoreWeights,
method=attr(evalSets,'splitmethod'),
evalSets=evalSets,
foundCols=foundCols,
lostVars=lostVars)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.