Conditioner

Documented in designTreatmentsC designTreatmentsN designTreatmentsZ format.vtreatment mkCrossFrameCExperiment mkCrossFrameNExperiment novel_value_summary prepare prepare.treatmentplan print.treatmentplan print.vtreatment track_values vnames vorig

# variable treatments type def: list { origvar, newvars, f(col,args), args, treatmentName, scales } can share orig var



#' vtreat: A Statistically Sound 'data.frame' Processor/Conditioner
#'
#' A 'data.frame' processor/conditioner that prepares real-world data for predictive modeling in a statistically sound manner.
#' 'vtreat' prepares variables so that data has fewer exceptional cases, making
#' it easier to safely use models in production. Common problems 'vtreat' defends
#' against: 'Inf', 'NA', too many categorical levels, rare categorical levels, and new
#' categorical levels (levels seen during application, but not during training).
#' 'vtreat::prepare' should be used as you would use 'model.matrix'.
#'
#'
#'For more information:
#' \itemize{
#'   \item \code{vignette('vtreat', package='vtreat')}
#'   \item \code{vignette(package='vtreat')}
#'   \item Website: \url{https://github.com/WinVector/vtreat} }
#'
"_PACKAGE"


#' @importFrom stats aggregate anova as.formula binomial chisq.test fisher.test glm lm lm.wfit pchisq pf quantile
#' @importFrom utils packageVersion
NULL




#'
#' Original variable name from a treatmentplan$treatment item.
#' @param x vtreatment item.
#' @seealso \code{\link{designTreatmentsC}} \code{\link{designTreatmentsN}} \code{\link{designTreatmentsZ}}
#' @export
#' 
vorig <- function(x) { x$origvar }


#'
#' New treated variable names from a treatmentplan$treatment item.
#' @param x vtreatment item
#' @seealso \code{\link{designTreatmentsC}} \code{\link{designTreatmentsN}} \code{\link{designTreatmentsZ}}
#' @export
vnames <- function(x) { x$newvars }

#'
#' Display treatment plan.
#' @param x treatment plan
#' @param ... additional args (to match general signature).
#' @export
format.vtreatment <- function(x, ...) { 
  paste(
    'vtreat \'',x$treatmentName,
    '\'(\'',x$origvar,'\'(',x$origType,',',x$origClass,')->',
    x$convertedColClass,'->\'',
    paste(x$newvars,collapse='\',\''),
    '\')',sep='') 
}

#' @export
as.character.vtreatment <- function (x, ...) {
  format(x, ...)
}

#'
#' Print treatmentplan.
#' @param x treatmentplan
#' @param ... additional args (to match general signature).
#' @seealso \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsN}}, \code{\link{designTreatmentsZ}}, \code{\link{prepare.treatmentplan}}
#' @export
print.vtreatment <- function(x, ...) { 
  print(format(x), ...) 
}





#' @export
format.treatmentplan <- function(x, ...) { 
  sf <- x$scoreFrame
  cols <- c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees', 'recommended')
  cols <- intersect(cols, colnames(sf))
  format(sf[ , 
                       cols, 
                       drop = FALSE])
}

#' @export
as.character.treatmentplan <- function (x, ...) {
  format(x, ...)
}

#'
#' Print treatmentplan.
#' @param x treatmentplan
#' @param ... additional args (to match general signature).
#' @seealso \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsN}}, \code{\link{designTreatmentsZ}}, \code{\link{prepare.treatmentplan}}
#' @export
print.treatmentplan <- function(x, ...) { 
  print(class(x))
  print(format(x), ...) 
}




# add in the recommendation column
augment_score_frame <- function(score_frame) {
  n_treatment_types <- length(unique(score_frame$code))
  code_counts <- table(score_frame$code)
  vcount <- code_counts[score_frame$code]
  score_frame$default_threshold <- 1/(n_treatment_types * vcount)
  score_frame$recommended <- score_frame$varMoves & (score_frame$sig < score_frame$default_threshold)
  score_frame
}





#' Build all treatments for a data frame to predict a categorical outcome.
#' 
#' Function to design variable treatments for binary prediction of a
#' categorical outcome.  Data frame is assumed to have only atomic columns
#' except for dates (which are converted to numeric). Note: re-encoding high cardinality
#' categorical variables can introduce undesirable nested model bias, for such data consider
#' using \code{\link{mkCrossFrameCExperiment}}.
#' 
#' The main fields are mostly vectors with names (all with the same names in the same order):
#' 
#' - vars : (character array without names) names of variables (in same order as names on the other diagnostic vectors)
#' - varMoves : logical TRUE if the variable varied during hold out scoring, only variables that move will be in the treated frame
#' - #' - sig : an estimate significance of effect
#'
#' See the vtreat vignette for a bit more detail and a worked example.
#' 
#' Columns that do not vary are not passed through.
#' 
#' Note: re-encoding high cardinality on training data can introduce nested model bias, consider using \code{mkCrossFrameCExperiment} instead.
#'
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @param outcomename Name of column holding outcome variable. dframe[[outcomename]] must be only finite non-missing values.
#' @param outcometarget Value/level of outcome to be considered "success",  and there must be a cut such that dframe[[outcomename]]==outcometarget at least twice and dframe[[outcomename]]!=outcometarget at least twice.
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param weights optional training weights for each row
#' @param minFraction optional minimum frequency a categorical level must have to be converted to an indicator column.
#' @param smFactor optional smoothing factor for impact coding models.
#' @param rareCount optional integer, allow levels with this count or below to be pooled into a shared rare-level.  Defaults to 0 or off.
#' @param rareSig optional numeric, suppress levels from pooling at this significance value greater.  Defaults to NULL or off.
#' @param collarProb what fraction of the data (pseudo-probability) to collar data at if doCollar is set during \code{\link{prepare.treatmentplan}}.
#' @param codeRestriction what types of variables to produce (character array of level codes, NULL means no restriction).
#' @param customCoders map from code names to custom categorical variable encoding functions (please see \url{https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md}).
#' @param splitFunction (optional) see vtreat::buildEvalSets .
#' @param ncross optional scalar >=2 number of cross validation splits use in rescoring complex variables.
#' @param forceSplit logical, if TRUE force cross-validated significance calculations on all variables.
#' @param catScaling optional, if TRUE use glm() linkspace, if FALSE use lm() for scaling.
#' @param verbose if TRUE print progress.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods (when parallel cluster is set).
#' @param missingness_imputation function of signature f(values: numeric, weights: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
#' @return treatment plan (for use with prepare)
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{designTreatmentsN}}, \code{\link{designTreatmentsZ}}, \code{\link{mkCrossFrameCExperiment}}
#' 
#' @examples
#' 
#' dTrainC <- data.frame(x=c('a','a','a','b','b','b'),
#'    z=c(1,2,3,4,5,6),
#'    y=c(FALSE,FALSE,TRUE,FALSE,TRUE,TRUE))
#' dTestC <- data.frame(x=c('a','b','c',NA),
#'    z=c(10,20,30,NA))
#' treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
#' dTestCTreated <- prepare(treatmentsC,dTestC,pruneSig=0.99)
#' 
#' @export
designTreatmentsC <- function(dframe,varlist,
                              outcomename, outcometarget = TRUE,
                              ...,
                              weights=c(),
                              minFraction=0.02,smFactor=0.0,
                              rareCount=0,rareSig=NULL,
                              collarProb=0.00,
                              codeRestriction=NULL,
                              customCoders=NULL, 
                              splitFunction=NULL,ncross=3,
                              forceSplit=FALSE,
                              catScaling=TRUE,
                              verbose=TRUE,
                              parallelCluster=NULL,
                              use_parallel= TRUE,
                              missingness_imputation = NULL, imputation_map = NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::designTreatmentsC")
  .checkArgs(dframe=dframe,varlist=varlist,outcomename=outcomename)
  if(!(outcomename %in% colnames(dframe))) {
    stop("outcomename must be a column name of dframe")
  }
  if(any(is.na(dframe[[outcomename]]))) {
    stop("There are missing values in the outcome column, can not apply designTreatmentsC.")
  }
  zoY <- ifelse(dframe[[outcomename]]==outcometarget,1.0,0.0)
  if(min(zoY)>=max(zoY)) {
    stop("dframe[[outcomename]]==outcometarget must vary")
  }
  treatments <- .designTreatmentsX(
    dframe = dframe,
    varlist = varlist,
    outcomename = outcomename,
    zoY = zoY,
    zC = dframe[[outcomename]],
    zTarget = outcometarget,
    weights = weights,
    minFraction = minFraction,
    smFactor = smFactor,
    rareCount = rareCount,
    rareSig = rareSig,
    collarProb = collarProb,
    codeRestriction = codeRestriction,
    customCoders = customCoders,
    splitFunction = splitFunction,
    ncross = ncross,
    forceSplit = forceSplit,
    catScaling = catScaling,
    verbose = verbose,
    parallelCluster = parallelCluster,
    use_parallel = use_parallel,
    missingness_imputation = missingness_imputation, imputation_map = imputation_map)
  treatments$outcomeTarget <- outcometarget
  treatments$outcomeType <- 'Binary'
  treatments$fit_obj_id <- id_f(dframe)
  treatments$scoreFrame <- augment_score_frame(treatments$scoreFrame)
  treatments
}




#' build all treatments for a data frame to predict a numeric outcome
#' 
#' Function to design variable treatments for binary prediction of a
#' numeric outcome.  Data frame is assumed to have only atomic columns
#' except for dates (which are converted to numeric).
#' Note: each column is processed independently of all others. 
#' Note: re-encoding high cardinality on training data
#' categorical variables can introduce undesirable nested model bias, for such data consider
#' using \code{\link{mkCrossFrameNExperiment}}.
#' 
#' The main fields are mostly vectors with names (all with the same names in the same order):
#' 
#' - vars : (character array without names) names of variables (in same order as names on the other diagnostic vectors)
#' - varMoves : logical TRUE if the variable varied during hold out scoring, only variables that move will be in the treated frame
#' - sig : an estimate significance of effect
#'
#' See the vtreat vignette for a bit more detail and a worked example.
#' 
#' Columns that do not vary are not passed through.
#' 
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @param outcomename Name of column holding outcome variable. dframe[[outcomename]] must be only finite non-missing values and there must be a cut such that dframe[[outcomename]] is both above the cut at least twice and below the cut at least twice.
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param weights optional training weights for each row
#' @param minFraction optional minimum frequency a categorical level must have to be converted to an indicator column.
#' @param smFactor optional smoothing factor for impact coding models.
#' @param rareCount optional integer, allow levels with this count or below to be pooled into a shared rare-level.  Defaults to 0 or off.
#' @param rareSig optional numeric, suppress levels from pooling at this significance value greater.  Defaults to NULL or off.
#' @param collarProb what fraction of the data (pseudo-probability) to collar data at if doCollar is set during \code{\link{prepare.treatmentplan}}.
#' @param codeRestriction what types of variables to produce (character array of level codes, NULL means no restriction).
#' @param customCoders map from code names to custom categorical variable encoding functions (please see \url{https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md}).
#' @param splitFunction (optional) see vtreat::buildEvalSets .
#' @param ncross optional scalar >=2 number of cross validation splits use in rescoring complex variables.
#' @param forceSplit logical, if TRUE force cross-validated significance calculations on all variables.
#' @param verbose if TRUE print progress.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods (when parallel cluster is set).
#' @param missingness_imputation function of signature f(values: numeric, weights: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
#' @return treatment plan (for use with prepare)
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsZ}}, \code{\link{mkCrossFrameNExperiment}}
#' @examples
#' 
#' dTrainN <- data.frame(x=c('a','a','a','a','b','b','b'),
#'     z=c(1,2,3,4,5,6,7),y=c(0,0,0,1,0,1,1))
#' dTestN <- data.frame(x=c('a','b','c',NA),
#'     z=c(10,20,30,NA))
#' treatmentsN = designTreatmentsN(dTrainN,colnames(dTrainN),'y')
#' dTestNTreated <- prepare(treatmentsN,dTestN,pruneSig=0.99)
#' 
#' @export
designTreatmentsN <- function(dframe,varlist,outcomename,
                              ...,
                              weights=c(),
                              minFraction=0.02,smFactor=0.0,
                              rareCount=0,rareSig=NULL,
                              collarProb=0.00,
                              codeRestriction=NULL,
                              customCoders=NULL,
                              splitFunction=NULL,ncross=3,
                              forceSplit=FALSE,
                              verbose=TRUE,
                              parallelCluster=NULL,
                              use_parallel= TRUE,
                              missingness_imputation = NULL, imputation_map = NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::designTreatmentsN")
  .checkArgs(dframe=dframe,varlist=varlist,outcomename=outcomename)
  if(!(outcomename %in% colnames(dframe))) {
    stop("outcomename must be a column name of dframe")
  }
  if(any(is.na(dframe[[outcomename]]))) {
    stop("There are missing values in the outcome column, can not apply designTreatmentsN.")
  }
  ycol <- dframe[[outcomename]]
  if(min(ycol)>=max(ycol)) {
    stop("dframe[[outcomename]] must vary")
  }
  catScaling=FALSE
  treatments <- .designTreatmentsX(
    dframe = dframe,
    varlist = varlist,
    outcomename = outcomename,
    zoY = ycol,
    zC = c(),
    zTarget = c(),
    weights = weights,
    minFraction = minFraction,
    smFactor = smFactor,
    rareCount = rareCount,
    rareSig = rareSig,
    collarProb = collarProb,
    codeRestriction = codeRestriction, 
    customCoders = customCoders,
    splitFunction = splitFunction,
    ncross = ncross,
    forceSplit = forceSplit,
    catScaling = catScaling,
    verbose = verbose,
    parallelCluster = parallelCluster,
    use_parallel = use_parallel,
    missingness_imputation = missingness_imputation, imputation_map = imputation_map)
  treatments$outcomeType <- 'Numeric'
  treatments$fit_obj_id <- id_f(dframe)
  treatments$scoreFrame <- augment_score_frame(treatments$scoreFrame)
  treatments
}




#' Design variable treatments with no outcome variable.
#' 
#' Data frame is assumed to have only atomic columns
#' except for dates (which are converted to numeric).
#' Note: each column is processed independently of all others.
#' 
#' The main fields are mostly vectors with names (all with the same names in the same order):
#' 
#' - vars : (character array without names) names of variables (in same order as names on the other diagnostic vectors)
#' - varMoves : logical TRUE if the variable varied during hold out scoring, only variables that move will be in the treated frame
#'
#' See the vtreat vignette for a bit more detail and a worked example.
#' 
#' Columns that do not vary are not passed through.
#' 
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param weights optional training weights for each row
#' @param minFraction optional minimum frequency a categorical level must have to be converted to an indicator column.
#' @param rareCount optional integer, allow levels with this count or below to be pooled into a shared rare-level.  Defaults to 0 or off.
#' @param collarProb what fraction of the data (pseudo-probability) to collar data at if doCollar is set during \code{\link{prepare.treatmentplan}}.
#' @param codeRestriction what types of variables to produce (character array of level codes, NULL means no restriction).
#' @param customCoders map from code names to custom categorical variable encoding functions (please see \url{https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md}).
#' @param verbose if TRUE print progress.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods (if parallel cluster is set).
#' @param missingness_imputation function of signature f(values: numeric, weights: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
#' @return treatment plan (for use with prepare)
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsN}} 
#' @examples
#' 
#' dTrainZ <- data.frame(x=c('a','a','a','a','b','b',NA,'e','e'),
#'     z=c(1,2,3,4,5,6,7,NA,9))
#' dTestZ <- data.frame(x=c('a','x','c',NA),
#'     z=c(10,20,30,NA))
#' treatmentsZ = designTreatmentsZ(dTrainZ, colnames(dTrainZ),
#'   rareCount=0)
#' dTrainZTreated <- prepare(treatmentsZ, dTrainZ)
#' dTestZTreated <- prepare(treatmentsZ, dTestZ)
#' 
#' @export
designTreatmentsZ <- function(dframe,varlist,
                              ...,
                              minFraction=0.0,
                              weights=c(),
                              rareCount=0,
                              collarProb=0.0,
                              codeRestriction=NULL,
                              customCoders=NULL,
                              verbose=TRUE,
                              parallelCluster=NULL,
                              use_parallel= TRUE,
                              missingness_imputation = NULL, imputation_map = NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::designTreatmentsZ")
  # build a name disjoint from column names
  outcomename <- setdiff(paste('VTREATTEMPCOL',
                               seq_len(ncol(dframe) + length(varlist) + 1), 
                               sep='_'),
                         c(colnames(dframe),varlist))[[1]]
  catScaling <- FALSE
  dframe[[outcomename]] <- 0
  .checkArgs(dframe=dframe,varlist=varlist,outcomename=outcomename)
  ycol <- dframe[[outcomename]]
  treatments <- .designTreatmentsX(
    dframe = dframe,
    varlist = varlist,
    outcomename = outcomename,
    zoY = ycol,
    zC = c(),
    zTarget = c(),
    weights = weights,
    minFraction = minFraction,
    smFactor = 0,
    rareCount = rareCount,
    rareSig = 1,
    collarProb = collarProb,
    codeRestriction = codeRestriction, 
    customCoders = customCoders, 
    splitFunction = NULL,
    ncross = 3,
    forceSplit = FALSE,
    catScaling = catScaling,
    verbose = verbose,
    parallelCluster = parallelCluster,
    use_parallel = use_parallel,
    missingness_imputation = missingness_imputation, imputation_map = imputation_map)
  treatments$outcomeType <- 'None'
  treatments$meanY <- NA
  treatments
}



#' Track unique character values for variables.
#' 
#' Builds lists of observed unique character values of varlist variables from the data frame.
#'  
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @return named list of values seen.
#' 
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{novel_value_summary}}
#' 
#' @examples
#' 
#' set.seed(23525)
#' zip <- c(NA, paste('z', 1:100, sep = "_"))
#' N <- 500
#' d <- data.frame(zip = sample(zip, N, replace=TRUE),
#'                 zip2 = sample(zip, N, replace=TRUE),
#'                 y = runif(N))
#' dSample <- d[1:300, , drop = FALSE]
#' tplan <- designTreatmentsN(dSample, 
#'                            c("zip", "zip2"), "y", 
#'                            verbose = FALSE)
#' trackedValues <- track_values(dSample, c("zip", "zip2"))
#' # don't normally want to catch warnings,
#' # doing it here as this is an example 
#' # and must not have unhandled warnings.
#' tryCatch(
#'   prepare(tplan, d, trackedValues = trackedValues),
#'   warning = function(w) { cat(paste(w, collapse = "\n")) })
#' 
#' @export
#' 
track_values <- function(dframe, varlist) {
  observed_values <- lapply(varlist, 
                            function(vi) {
                              unique(as.character(dframe[[vi]]))
                            })
  names(observed_values) <- varlist
  observed_values
}

#' Report new/novel appearances of character values.
#'  
#' @param dframe Data frame to inspect.
#' @param trackedValues optional named list mapping variables to know values, allows warnings upon novel level appearances (see \code{\link{track_values}})
#' @return frame of novel occurrences
#' 
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{track_values}}
#' 
#' @examples
#' 
#' set.seed(23525)
#' zip <- c(NA, paste('z', 1:10, sep = "_"))
#' N <- 10
#' d <- data.frame(zip = sample(zip, N, replace=TRUE),
#'                 zip2 = sample(zip, N, replace=TRUE),
#'                 y = runif(N))
#' dSample <- d[1:5, , drop = FALSE]
#' trackedValues <- track_values(dSample, c("zip", "zip2"))
#' novel_value_summary(d, trackedValues)
#' 
#' @export
#' 
novel_value_summary <- function(dframe, trackedValues) {
  novel <- data.frame(row_index = 1, column = "", value = "",
                      stringsAsFactors = FALSE)
  novel <- novel[c(), , drop = FALSE]
  novels <- lapply(sort(intersect(names(trackedValues),
                                  colnames(dframe))),
                   function(v) {
                     newstuff <- !(dframe[[v]] %in% trackedValues[[v]])
                     if(sum(newstuff)>0) {
                       idxs <- which(newstuff)
                       vals <- as.character(dframe[[v]][idxs])
                       return(data.frame(row_index = idxs,
                                         column = v,
                                         value = vals,
                                         stringsAsFactors = FALSE))
                     } 
                     NULL
                   })
  novels <- c(list(novel), novels)
  novels <- novels[!is.null(novels)]
  .rbindListOfFrames(novels)
}

#' Apply treatments and restrict to useful variables.
#' 
#' @param treatmentplan Plan built by designTreantmentsC() or designTreatmentsN()
#' @param dframe Data frame to be treated
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' 
#' @seealso \code{\link{prepare.treatmentplan}}, \code{\link{prepare.simple_plan}}, \code{\link{prepare.multinomial_plan}}
#' 
#' @export
prepare <- function(treatmentplan, dframe,
                    ...) {
  UseMethod("prepare")
}



#' Apply treatments and restrict to useful variables.
#' 
#' Use a treatment plan to prepare a data frame for analysis.  The
#' resulting frame will have new effective variables that are numeric
#' and free of NaN/NA.  If the outcome column is present it will be copied over.
#' The intent is that these frames are compatible with more machine learning
#' techniques, and avoid a lot of corner cases (NA,NaN, novel levels, too many levels).
#' Note: each column is processed independently of all others.  Also copies over outcome if present.
#' Note: treatmentplan's are not meant for long-term storage, a warning is issued if the version of
#' vtreat that produced the plan differs from the version running \code{prepare()}.
#' 
#' @param treatmentplan Plan built by designTreantmentsC() or designTreatmentsN()
#' @param dframe Data frame to be treated
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param pruneSig suppress variables with significance above this level
#' @param scale optional if TRUE replace numeric variables with single variable model regressions ("move to outcome-scale").  These have mean zero and (for variables with significant less than 1) slope 1 when regressed  (lm for regression problems/glm for classification problems) against outcome.
#' @param doCollar optional if TRUE collar numeric variables by cutting off after a tail-probability specified by collarProb during treatment design.
#' @param varRestriction optional list of treated variable names to restrict to
#' @param codeRestriction optional list of treated variable codes to restrict to
#' @param trackedValues optional named list mapping variables to know values, allows warnings upon novel level appearances (see \code{\link{track_values}})
#' @param extracols extra columns to copy.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods.
#' @param check_for_duplicate_frames logical, if TRUE check if we called prepare on same data.frame as design step.
#' @return treated data frame (all columns numeric- without NA, NaN)
#' 
#' @seealso \code{\link{mkCrossFrameCExperiment}}, \code{\link{mkCrossFrameNExperiment}}, \code{\link{designTreatmentsC}} \code{\link{designTreatmentsN}} \code{\link{designTreatmentsZ}}, \code{\link{prepare}}
#' 
#' @examples
#' 
#' # categorical example
#' set.seed(23525)
#' 
#' # we set up our raw training and application data
#' dTrainC <- data.frame(
#'   x = c('a', 'a', 'a', 'b', 'b', NA, NA),
#'   z = c(1, 2, 3, 4, NA, 6, NA),
#'   y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE))
#' 
#' dTestC <- data.frame(
#'   x = c('a', 'b', 'c', NA), 
#'   z = c(10, 20, 30, NA))
#' 
#' # we perform a vtreat cross frame experiment
#' # and unpack the results into treatmentsC
#' # and dTrainCTreated
#' unpack[
#'   treatmentsC = treatments,
#'   dTrainCTreated = crossFrame
#'   ] <- mkCrossFrameCExperiment(
#'     dframe = dTrainC,
#'     varlist = setdiff(colnames(dTrainC), 'y'),
#'     outcomename = 'y',
#'     outcometarget = TRUE,
#'     verbose = FALSE)
#' 
#' # the treatments include a score frame relating new
#' # derived variables to original columns
#' treatmentsC$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')] %.>%
#'   print(.)
#' 
#' # the treated frame is a "cross frame" which
#' # is a transform of the training data built 
#' # as if the treatment were learned on a different
#' # disjoint training set to avoid nested model
#' # bias and over-fit.
#' dTrainCTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' # Any future application data is prepared with
#' # the prepare method.
#' dTestCTreated <- prepare(treatmentsC, dTestC, pruneSig=NULL)
#' 
#' dTestCTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' @export
prepare.treatmentplan <- function(treatmentplan, dframe,
                                  ...,
                                  pruneSig= NULL,
                                  scale= FALSE,
                                  doCollar= FALSE,
                                  varRestriction= NULL,
                                  codeRestriction= NULL,
                                  trackedValues= NULL,
                                  extracols= NULL,
                                  parallelCluster= NULL,
                                  use_parallel= TRUE,
                                  check_for_duplicate_frames= TRUE) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::prepare")
  .checkArgs1(dframe=dframe)
  if(!('treatmentplan' %in% class(treatmentplan))) {
    stop("treatmentplan must be of class treatmentplan")
  }
  vtreatVersion <- packageVersion('vtreat')
  if(is.null(treatmentplan$vtreatVersion) ||
     (treatmentplan$vtreatVersion!=vtreatVersion)) {
    warning(paste('treatments designed with vtreat version',
                  treatmentplan$vtreatVersion,
                  'and preparing data.frame with vtreat version',
                  vtreatVersion))
  }
  if(!is.data.frame(dframe)) {
    stop("dframe must be a data frame")
  }
  if(nrow(dframe)<=0) {
    stop("no rows")
  }
  old_fit_obj_id <- treatmentplan$fit_obj_id
  if(check_for_duplicate_frames && (!is.null(old_fit_obj_id))) {
    fit_obj_id <- id_f(dframe)
    if(!is.null(fit_obj_id)) {
      if(fit_obj_id == old_fit_obj_id) {
        warning("possibly called prepare() on same data frame as designTreatments*()/mkCrossFrame*Experiment(), this can lead to over-fit.  To avoid this, please use mkCrossFrame*Experiment$crossFrame.")
      }
    }
  }
  if(!is.null(trackedValues)) {
    for(v in sort(intersect(names(trackedValues),
                            colnames(dframe)))) {
      new_values <- setdiff(dframe[[v]], trackedValues[[v]])
      if(length(new_values)>0) {
        if(length(new_values)>5) {
          vsample <- paste(new_values[1:5], collapse = ", ")
          vsample <- paste0(vsample, ", ...")
        } else {
          vsample <- paste(new_values, collapse = ", ")
        }
        wmsg <- paste0("vtreat::prepare: column \"", v, "\" has ",
                       length(new_values), 
                       " previously unseen values:",
                       vsample, " .")
        warning(wmsg)
      }
    }
  }
  if(treatmentplan$outcomeType=='None') {
    pruneSig <- NULL
  }
  useable <- treatmentplan$scoreFrame$varMoves
  if(!is.null(pruneSig)) {
    useable <- useable & (treatmentplan$scoreFrame$sig<=pruneSig)
  }
  useableVars <- treatmentplan$scoreFrame$varName[useable]
  if(!is.null(varRestriction)) {
    useableVars <- intersect(useableVars,varRestriction)
  }
  if(!is.null(codeRestriction)) {
    hasSelectedCode <- treatmentplan$scoreFrame$code %in% codeRestriction
    useableVars <- intersect(useableVars, 
                             treatmentplan$scoreFrame$varName[hasSelectedCode])
  }
  if(length(useableVars)<=0) {
    stop('no useable vars')
  }
  vars_we_warned_on <- list()
  for(ti in treatmentplan$treatments) {
    if(length(intersect(ti$newvars,useableVars))>0) {
      newType <- paste(typeof(dframe[[ti$origvar]]), collapse = " ")
      newClass <- paste(class(dframe[[ti$origvar]]), collapse = " ")
      if((ti$origType!=newType) || (ti$origClass!=newClass)) {
        if(is.null(vars_we_warned_on[[ti$origvar]])) {
          warning(paste('variable',ti$origvar,'expected type/class',
                        ti$origType,ti$origClass,
                        'saw ',newType,newClass))
          vars_we_warned_on[ti$origvar] <- 1
        }
      }
    }
  }
  treated <- .vtreatList(treatmentplan$treatments,dframe,useableVars,scale,doCollar,
                         parallelCluster = parallelCluster,
                         use_parallel = use_parallel)
  # copy outcome and extracols over when present
  for(ci in unique(c(treatmentplan$outcomename, extracols))) {
    if(ci %in% colnames(dframe)) {
      treated[[ci]] <- dframe[[ci]]
    }
  }
  treated
}




#' Run categorical cross-frame experiment.
#' 
#' Builds a \code{\link{designTreatmentsC}} treatment plan and a data frame prepared 
#' from \code{dframe} that is "cross" in the sense each row is treated using a treatment
#' plan built from a subset of dframe disjoint from the given row.
#' The goal is to try to and supply a method of breaking nested model bias other than splitting
#' into calibration, training, test sets.
#' 
#'
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @param outcomename Name of column holding outcome variable. dframe[[outcomename]] must be only finite non-missing values.
#' @param outcometarget Value/level of outcome to be considered "success",  and there must be a cut such that dframe[[outcomename]]==outcometarget at least twice and dframe[[outcomename]]!=outcometarget at least twice.
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param weights optional training weights for each row
#' @param minFraction optional minimum frequency a categorical level must have to be converted to an indicator column.
#' @param smFactor optional smoothing factor for impact coding models.
#' @param rareCount optional integer, allow levels with this count or below to be pooled into a shared rare-level.  Defaults to 0 or off.
#' @param rareSig optional numeric, suppress levels from pooling at this significance value greater.  Defaults to NULL or off.
#' @param collarProb what fraction of the data (pseudo-probability) to collar data at if doCollar is set during \code{\link{prepare.treatmentplan}}.
#' @param codeRestriction what types of variables to produce (character array of level codes, NULL means no restriction).
#' @param customCoders map from code names to custom categorical variable encoding functions (please see \url{https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md}).
#' @param scale optional if TRUE replace numeric variables with regression ("move to outcome-scale").
#' @param doCollar optional if TRUE collar numeric variables by cutting off after a tail-probability specified by collarProb during treatment design.
#' @param splitFunction (optional) see vtreat::buildEvalSets .
#' @param ncross optional scalar>=2 number of cross-validation rounds to design.
#' @param forceSplit logical, if TRUE force cross-validated significance calculations on all variables.
#' @param catScaling optional, if TRUE use glm() linkspace, if FALSE use lm() for scaling.
#' @param verbose if TRUE print progress.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods.
#' @param missingness_imputation function of signature f(values: numeric, weights: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
#' @return named list containing: treatments, crossFrame, crossWeights, method, and evalSets 
#' 
#' @seealso \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsN}}, \code{\link{prepare.treatmentplan}}
#' 
#' @examples
#' 
#' # categorical example
#' set.seed(23525)
#' 
#' # we set up our raw training and application data
#' dTrainC <- data.frame(
#'   x = c('a', 'a', 'a', 'b', 'b', NA, NA),
#'   z = c(1, 2, 3, 4, NA, 6, NA),
#'   y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE))
#' 
#' dTestC <- data.frame(
#'   x = c('a', 'b', 'c', NA), 
#'   z = c(10, 20, 30, NA))
#' 
#' # we perform a vtreat cross frame experiment
#' # and unpack the results into treatmentsC
#' # and dTrainCTreated
#' unpack[
#'   treatmentsC = treatments,
#'   dTrainCTreated = crossFrame
#'   ] <- mkCrossFrameCExperiment(
#'     dframe = dTrainC,
#'     varlist = setdiff(colnames(dTrainC), 'y'),
#'     outcomename = 'y',
#'     outcometarget = TRUE,
#'     verbose = FALSE)
#' 
#' # the treatments include a score frame relating new
#' # derived variables to original columns
#' treatmentsC$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')] %.>%
#'   print(.)
#' 
#' # the treated frame is a "cross frame" which
#' # is a transform of the training data built 
#' # as if the treatment were learned on a different
#' # disjoint training set to avoid nested model
#' # bias and over-fit.
#' dTrainCTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' # Any future application data is prepared with
#' # the prepare method.
#' dTestCTreated <- prepare(treatmentsC, dTestC, pruneSig=NULL)
#' 
#' dTestCTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' @export
mkCrossFrameCExperiment <- function(dframe,varlist,
                                    outcomename,outcometarget,
                                    ...,
                                    weights=c(),
                                    minFraction=0.02,smFactor=0.0,
                                    rareCount=0,rareSig=1,
                                    collarProb=0.00,
                                    codeRestriction=NULL,
                                    customCoders=NULL,
                                    scale=FALSE,doCollar=FALSE,
                                    splitFunction=NULL,ncross=3,
                                    forceSplit = FALSE,
                                    catScaling=TRUE,
                                    verbose= TRUE,
                                    parallelCluster=NULL,
                                    use_parallel = TRUE,
                                    missingness_imputation = NULL,
                                    imputation_map = NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::mkCrossFrameCExperiment")
  .checkArgs(dframe=dframe,varlist=varlist,outcomename=outcomename)
  if(!is.data.frame(dframe)) {
    stop("dframe must be a data frame")
  }
  if(collarProb>=0.5) {
    stop("collarProb must be < 0.5")
  }
  if(nrow(dframe)<1) {
    stop("most have rows")
  }
  if(!(outcomename %in% colnames(dframe))) {
    stop("outcomename must be a column name of dframe")
  }
  if(any(is.na(dframe[[outcomename]]))) {
    stop("There are missing values in the outcome column, can not run mkCrossFrameCExperiment.")
  }
  if(is.null(weights)) {
    weights <- rep(1.0,nrow(dframe))
  }
  if(verbose) {
    print(paste("vtreat", 
                packageVersion("vtreat"),
                "start initial treatment design", date()))
  }
  treatments <- designTreatmentsC(dframe,varlist,outcomename,outcometarget,
                                  weights=weights,
                                  minFraction=minFraction,smFactor=smFactor,
                                  rareCount=rareCount,rareSig=rareSig,
                                  collarProb=collarProb,
                                  codeRestriction=codeRestriction,
                                  customCoders=customCoders,
                                  splitFunction=splitFunction,ncross=ncross,
                                  forceSplit = forceSplit,
                                  catScaling=catScaling,
                                  verbose=FALSE,
                                  parallelCluster=parallelCluster,
                                  use_parallel = use_parallel,
                                  missingness_imputation = missingness_imputation,
                                  imputation_map = imputation_map)
  zC <- dframe[[outcomename]]
  zoY <- ifelse(zC==outcometarget,1,0)
  newVarsS <- treatments$scoreFrame$varName[(treatments$scoreFrame$varMoves) &
                                              (treatments$scoreFrame$sig<1)]
  if(verbose) {
    print(paste(" start cross frame work", date()))
  }
  crossDat <- .mkCrossFrame(
    dframe = dframe,
    referenceTreatments = treatments,
    varlist = varlist,
    newVarsS = newVarsS,
    outcomename = outcomename,
    zoY = zoY,
    zC = zC,
    zTarget = outcometarget,
    weights = weights,
    minFraction = minFraction,
    smFactor = smFactor,
    rareCount = rareCount,
    rareSig = rareSig,
    collarProb = collarProb,
    codeRestriction = codeRestriction,
    customCoders = customCoders,
    scale = scale,
    doCollar = doCollar,
    splitFunction = splitFunction,
    nSplits = ncross,
    catScaling = catScaling,
    parallelCluster = parallelCluster,
    use_parallel = use_parallel,
    verbose = FALSE,
    missingness_imputation = missingness_imputation,
    imputation_map = imputation_map)
  crossFrame <- crossDat$crossFrame
  newVarsS <- intersect(newVarsS,colnames(crossFrame))
  goodVars <- newVarsS[vapply(newVarsS,
                              function(v) {
                                min(crossFrame[[v]])<max(crossFrame[[v]])
                              },
                              logical(1))]
  # Make sure scoreFrame and crossFrame are consistent in variables mentioned
  treatments$scoreFrame <- treatments$scoreFrame[treatments$scoreFrame$varName %in% goodVars,]
  treatments$scoreFrame <- augment_score_frame(treatments$scoreFrame)
  crossFrame <- crossFrame[,colnames(crossFrame) %in% c(goodVars,outcomename),drop=FALSE]
  if(verbose) {
    print(paste(" vtreat::mkCrossFrameCExperiment done", date()))
  }
  res <- list(treatments=treatments,
              crossFrame=crossFrame,
              crossWeights=crossDat$crossWeights,
              method=crossDat$method,
              evalSets=crossDat$evalSets)
  class(res) <- "vtreat_cross_frame_experiment"
  res
}


#' Run a numeric cross frame experiment.
#' 
#' Builds a \code{\link{designTreatmentsN}} treatment plan and a data frame prepared 
#' from \code{dframe} that is "cross" in the sense each row is treated using a treatment
#' plan built from a subset of dframe disjoint from the given row.
#' The goal is to try to and supply a method of breaking nested model bias other than splitting
#' into calibration, training, test sets.
#'  
#' @param dframe Data frame to learn treatments from (training data), must have at least 1 row.
#' @param varlist Names of columns to treat (effective variables).
#' @param outcomename Name of column holding outcome variable. dframe[[outcomename]] must be only finite non-missing values and there must be a cut such that dframe[[outcomename]] is both above the cut at least twice and below the cut at least twice.
#' @param ... no additional arguments, declared to forced named binding of later arguments
#' @param weights optional training weights for each row
#' @param minFraction optional minimum frequency a categorical level must have to be converted to an indicator column.
#' @param smFactor optional smoothing factor for impact coding models.
#' @param rareCount optional integer, allow levels with this count or below to be pooled into a shared rare-level.  Defaults to 0 or off.
#' @param rareSig optional numeric, suppress levels from pooling at this significance value greater.  Defaults to NULL or off.
#' @param collarProb what fraction of the data (pseudo-probability) to collar data at if doCollar is set during \code{\link{prepare.treatmentplan}}.
#' @param codeRestriction what types of variables to produce (character array of level codes, NULL means no restriction).
#' @param customCoders map from code names to custom categorical variable encoding functions (please see \url{https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md}).
#' @param scale optional if TRUE replace numeric variables with regression ("move to outcome-scale").
#' @param doCollar optional if TRUE collar numeric variables by cutting off after a tail-probability specified by collarProb during treatment design.
#' @param splitFunction (optional) see vtreat::buildEvalSets .
#' @param ncross optional scalar>=2 number of cross-validation rounds to design.
#' @param forceSplit logical, if TRUE force cross-validated significance calculations on all variables.
#' @param verbose if TRUE print progress.
#' @param parallelCluster (optional) a cluster object created by package parallel or package snow.
#' @param use_parallel logical, if TRUE use parallel methods.
#' @param missingness_imputation function of signature f(values: numeric, weights: numeric), simple missing value imputer.
#' @param imputation_map map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
#' @return named list containing: treatments, crossFrame, crossWeights, method, and evalSets 
#' 
#' @seealso \code{\link{designTreatmentsC}}, \code{\link{designTreatmentsN}}, \code{\link{prepare.treatmentplan}}
#' 
#' @examples
#' 
#' # numeric example
#' set.seed(23525)
#' 
#' # we set up our raw training and application data
#' dTrainN <- data.frame(
#'   x = c('a', 'a', 'a', 'a', 'b', 'b', NA, NA),
#'   z = c(1, 2, 3, 4, 5, NA, 7, NA), 
#'   y = c(0, 0, 0, 1, 0, 1, 1, 1))
#' 
#' dTestN <- data.frame(
#'   x = c('a', 'b', 'c', NA), 
#'   z = c(10, 20, 30, NA))
#' 
#' # we perform a vtreat cross frame experiment
#' # and unpack the results into treatmentsN
#' # and dTrainNTreated
#' unpack[
#'   treatmentsN = treatments,
#'   dTrainNTreated = crossFrame
#'   ] <- mkCrossFrameNExperiment(
#'     dframe = dTrainN,
#'     varlist = setdiff(colnames(dTrainN), 'y'),
#'     outcomename = 'y',
#'     verbose = FALSE)
#' 
#' # the treatments include a score frame relating new
#' # derived variables to original columns
#' treatmentsN$scoreFrame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'extraModelDegrees')] %.>%
#'   print(.)
#' 
#' # the treated frame is a "cross frame" which
#' # is a transform of the training data built 
#' # as if the treatment were learned on a different
#' # disjoint training set to avoid nested model
#' # bias and over-fit.
#' dTrainNTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' # Any future application data is prepared with
#' # the prepare method.
#' dTestNTreated <- prepare(treatmentsN, dTestN, pruneSig=NULL)
#' 
#' dTestNTreated %.>%
#'   head(.) %.>%
#'   print(.)
#' 
#' @export
#' 
mkCrossFrameNExperiment <- function(dframe,varlist,outcomename,
                                    ...,
                                    weights=c(),
                                    minFraction=0.02,smFactor=0.0,
                                    rareCount=0,rareSig=1,
                                    collarProb=0.00,
                                    codeRestriction=NULL,
                                    customCoders=NULL,
                                    scale=FALSE,doCollar=FALSE,
                                    splitFunction=NULL,ncross=3,
                                    forceSplit=FALSE,
                                    verbose= TRUE,
                                    parallelCluster=NULL,
                                    use_parallel = TRUE,
                                    missingness_imputation = NULL, imputation_map=NULL) {
  wrapr::stop_if_dot_args(substitute(list(...)), "vtreat::mkCrossFrameNExperiment")
  .checkArgs(dframe=dframe,varlist=varlist,outcomename=outcomename)
  if(!is.data.frame(dframe)) {
    stop("dframe must be a data frame")
  }
  if(collarProb>=0.5) {
    stop("collarProb must be < 0.5")
  }
  if(nrow(dframe)<1) {
    stop("most have rows")
  }
  if(!(outcomename %in% colnames(dframe))) {
    stop("outcomename must be a column name of dframe")
  }
  if(any(is.na(dframe[[outcomename]]))) {
    stop("There are missing values in the outcome column, can not run mkCrossFrameNExperiment.")
  }
  catScaling=FALSE
  if(is.null(weights)) {
    weights <- rep(1.0,nrow(dframe))
  }
  if(verbose) {
    print(paste("vtreat", 
                packageVersion("vtreat"),
                "start initial treatment design", date()))
  }
  treatments <- designTreatmentsN(dframe,varlist,outcomename,
                                  weights=weights,
                                  minFraction=minFraction,smFactor=smFactor,
                                  rareCount=rareCount,rareSig=rareSig,
                                  collarProb=collarProb,
                                  codeRestriction = codeRestriction,
                                  customCoders = customCoders,
                                  splitFunction=splitFunction,ncross=ncross,
                                  forceSplit = forceSplit,
                                  verbose=FALSE,
                                  parallelCluster=parallelCluster,
                                  use_parallel = use_parallel,
                                  missingness_imputation = missingness_imputation, imputation_map=imputation_map)
  zC <- NULL
  zoY <- dframe[[outcomename]]
  newVarsS <- treatments$scoreFrame$varName[(treatments$scoreFrame$varMoves) &
                                              (treatments$scoreFrame$sig<1)]
  if(verbose) {
    print(paste(" start cross frame work", date()))
  }
  crossDat <- .mkCrossFrame(
    dframe = dframe,
    referenceTreatments = treatments,
    varlist = varlist,
    newVarsS = newVarsS,
    outcomename = outcomename,
    zoY = zoY,
    zC = zC,
    zTarget = NULL,
    weights = weights,
    minFraction = minFraction,
    smFactor = smFactor,
    rareCount = rareCount,
    rareSig = rareSig,
    collarProb = collarProb,
    codeRestriction = codeRestriction, 
    customCoders = customCoders,
    scale = scale,
    doCollar = doCollar,
    splitFunction = splitFunction,
    nSplits = ncross,
    catScaling = catScaling,
    parallelCluster = parallelCluster,
    use_parallel = use_parallel,
    verbose = FALSE,
    missingness_imputation = missingness_imputation, imputation_map=imputation_map)
  crossFrame <- crossDat$crossFrame
  newVarsS <- intersect(newVarsS,colnames(crossFrame))
  goodVars <- newVarsS[vapply(newVarsS,
                              function(v) {
                                min(crossFrame[[v]])<max(crossFrame[[v]])
                              },
                              logical(1))]
  # Make sure scoreFrame and crossFrame are consistent in variables mentioned
  treatments$scoreFrame <- treatments$scoreFrame[treatments$scoreFrame$varName %in% goodVars,]
  treatments$scoreFrame <- augment_score_frame(treatments$scoreFrame)
  crossFrame <- crossFrame[,colnames(crossFrame) %in% c(goodVars,outcomename),drop=FALSE]
  if(verbose) {
    print(paste(" vtreat::mkCrossFrameNExperiment done", date()))
  }
  res <- list(treatments=treatments,
              crossFrame=crossFrame,
              crossWeights=crossDat$crossWeights,
              method=crossDat$method,
              evalSets=crossDat$evalSets)
  class(res) <- "vtreat_cross_frame_experiment"
  res
}

#' @export
format.vtreat_cross_frame_experiment <- function(x, ...) {
  format(x$treatments)
}

#' @export
print.vtreat_cross_frame_experiment <- function(x, ...) {
  print(format(x))
  invisible(x)
}