Nothing
#' @include makeCPO.R auxiliary.R fauxCPOConstructor.R
#' @title Apply a Function Element-Wise
#'
#' @template cpo_doc_intro
#'
#' @description
#' The function must either vectorize over the given data,
#' or will be applied to each data element on its own.
#'
#' It must not change the type of the data, i.e. numeric
#' data must remain numeric etc.
#'
#' If the function can only handle a subset of the given columns,
#' e.g. only a certain type, use \code{affect.*} arguments.
#'
#' @param fun [\code{function}]\cr
#' The function to apply. If
#' \code{vectorize} is \code{TRUE}, the argument is a vector of the
#' whole column, \code{fun} must vectorize over it and return
#' a vector of the same length;
#' otherwise, the function gets called once for
#' every data item, and both the function argument and
#' the return value must have length 1.
#'
#' The function must take one or two arguments. If it takes
#' two arguments, the second argument will be \code{param}.
#' @param param [any]\cr
#' Optional argument to be given to \code{fun}. If \code{fun}
#' only takes one argument, this is ignored. Default is \code{NULL}.
#' @param vectorize [\code{logical(1)}]\cr
#' Whether to call \code{fun} once for each column, or
#' once for each element. If \code{fun} vectorizes,
#' it is recommended to have this set to \code{TRUE}
#' for better performance. Default is \code{TRUE}.
#' @param make.factors [\code{logical(1)}]\cr
#' Whether to turn resulting \code{logical} and \code{character}
#' columns into \code{factor} columns (which are preferred by
#' \code{mlr}). Default is \code{TRUE}.
#'
#' @section CPOTrained State:
#' The created state is empty.
#'
#' @template cpo_doc_outro
#' @export
cpoApplyFun = makeCPO("fun.apply", # nolint
pSS(fun: funct, param = NULL: untyped, vectorize = TRUE: logical, make.factors = TRUE: logical),
export.params = "param",
properties.adding = paste0(cpo.dataproperties, ".sometimes"),
properties.needed = paste0(cpo.dataproperties, ".sometimes"),
dataformat = "df.features",
cpo.train = NULL,
cpo.retrafo = {
fun = augmentFun(fun, 1, param)
if (!vectorize) {
fun = vectorizeFun(fun, 1, "cpoApplyFun")
}
outerfun = function(col) {
result = fun(col)
if (length(result) != nrow(data)) {
stop("cpoApplyFun 'fun' return value had the wrong length.")
}
if (!is.atomic(result)) {
stop("cpoApplyFun 'fun' did not return values that simplified to an atomic vector.")
}
if (make.factors && (is.character(result) || is.logical(result))) {
result = factor(result)
}
result
}
as.data.frame(lapply(data, outerfun), stringsAsFactors = FALSE, row.names = rownames(data))
})
#' @title Transform a Regression Target Variable
#'
#' @template cpo_doc_intro
#'
#' @description
#' Apply a given function to the target column of a regression \code{\link{Task}}.
#'
#' @section Details:
#' When both \code{mean} and \code{se} prediction is available, it may be possible to
#' make more accurate mean inversion than for the \code{response} \code{predict.type},
#' using integrals or approximations like the \emph{delta method}. In such cases it may be
#' advisable to prepend this \code{\link{CPO}} with the \code{\link{cpoResponseFromSE}}
#' \code{\link{CPO}}.
#'
#' Note when \code{trafo} or \code{invert.response} take more than one argument, the
#' second argument will be set to the value of \code{param}. This may lead to unexpected
#' results when using functions with rarely used parameters, e.g. \code{\link[base:Log]{log}}.
#' In these cases, it may be necessary to wrap the function:
#' \code{trafo = function(x) log(x)}.
#'
#' @param trafo [\code{function}]\cr
#' A function transforming the target column. If \code{vectorize} is \code{TRUE},
#' the argument is a vector of the whole column, \code{trafo} must vectorize over it
#' and return a vector of the same length; otherwise, the function gets called once
#' for every data item, and both the function argument and the return value
#' must have length 1.
#'
#' The function must take one or two arguments. If it takes two arguments, the second argument
#' will be \code{param}.
#' @param invert.response [\code{function}]\cr
#' If a model is trained on data that was transformed by \code{trafo}, this function
#' should invert a prediction made by this model back to the space of the original data.
#' In most cases, this will be the inverse of \code{trafo}, so that \code{invert.response(trafo(x)) == x}.
#'
#' Similarly to \code{trafo}, this function takes / produces single elements or the whole
#' column, depending on \code{vectorize}. The return value should be a \code{numeric}
#' in both cases.
#'
#' This can also be \code{NULL}, in which case using this \code{\link{CPO}} for
#' \code{\link{invert}} with \code{predict.type = "response"} is not possible.
#'
#' Default is \code{NULL}.
#' @param invert.se [\code{function}]\cr
#' Similarly to \code{invert.response}, this is a function that inverts a \code{"se"}
#' prediction made after training on \code{trafo}'d data. This function should take
#' at least two arguments, \code{mean} and \code{se}, and return a numeric vector of length
#' 2 if \code{vectorize} is \code{FALSE}, or a \code{data.frame} or \code{matrix} with
#' two numeric columns if \code{vectorize} is \code{TRUE}. The function may also take a third
#' argument, which will be set to \code{param}.
#'
#' \code{invert.se} may also be \code{NULL}, in which case \dQuote{se} inversion is done
#' by numeric integration using Gauss-Hermite quadrature.
#'
#' Default is \code{NULL}.
#' @param param [any]\cr
#' Optional argument to be given to \code{trafo} and / or \code{invert}. If both of
#' them only take one argument, this is ignored. Default is \code{NULL}.
#' @param vectorize [\code{logical(1)}]\cr
#' Whether to call \code{trafo}, \code{invert.response} and \code{invert.se} once
#' with the whole data column (or response \emph{and} se column if \code{predict.type == "se"}),
#' or once for each element. If the functions vectorize, it is recommended to have this
#' set to \code{TRUE} for better performance. Default is \code{TRUE}.
#' @param gauss.points [\code{numeric(1)}]\cr
#' Number of points at which to evaluate \code{invert.response} for Gauss-Hermite quadrature integration.
#' Only used if \code{invert.se} is \code{NULL}. Default is \code{23}.
#' @template cpo_doc_outro
#' @export
cpoApplyFunRegrTarget = makeCPOTargetOp("fun.apply.regr.target", # nolint
pSS(trafo: funct,
invert.response = NULL: funct [[special.vals = list(NULL)]],
invert.se = NULL: funct [[special.vals = list(NULL)]],
param = NULL: untyped, vectorize = TRUE: logical,
gauss.points = 23: integer[3, ]),
properties.target = "regr", predict.type.map = c(response = "response", se = "se"),
export.params = "param",
constant.invert = TRUE,
cpo.train = NULL, cpo.train.invert = NULL,
cpo.retrafo = {
trafo = augmentFun(trafo, 1, param)
if (!vectorize) {
trafo = vectorizeFun(trafo, 1, "cpoApplyFunRegrTarget")
}
if (!is.null(invert.response)) augmentFun(invert.response, 1, param) # just for checking
if (!is.null(invert.se)) augmentFun(invert.se, 2, param) # just for checking
targettrans = trafo(target[[1]])
if (!is.numeric(targettrans)) {
stop("cpoApplyFunRegrTarget trafo did not return a numeric.\n To convert between Task types, use makeCPOTargetOp.")
}
if (length(targettrans) != nrow(target)) {
stop("trafo may not change length")
}
target[[1]] = targettrans
target
},
cpo.invert = {
if (predict.type == "se") {
inlength = nrow(target)
if (is.null(invert.se)) {
if (is.null(invert.response)) {
stop("cpoApplyFunRegrTarget: cannot predict 'se', since invert.response or invert.se must be non-NULL")
}
invert.response = augmentFun(invert.response, 1, param)
invert.se = function(mu, sigma, param) {
invertNormalMuSigma(invert.response, mu, sigma, gauss.points, vectorize)
}
} else {
invert.se = augmentFun(invert.se, 2, param)
}
meancol = target[, 1, drop = TRUE]
secol = target[, 2, drop = TRUE]
if (!vectorize) {
invert.se = vectorizeFun(invert.se, 2, "cpoApplyFunRegrTarget")
target = t(invert.se(meancol, secol))
} else {
target = invert.se(meancol, secol)
if (is.data.frame(target)) {
if (!all(vlapply(target, is.numeric))) {
stop("invert.se returned data.frame that had non-numeric columns")
}
target = as.matrix(target)
}
}
if (!is.matrix(target) || ncol(target) != 2) {
stop("invert.se returned data needs to have two columns")
}
if (!is.numeric(target)) {
stop("cpoAppluFunRegrTarget invert.se did not return a numeric.")
}
if (nrow(target) != inlength) {
stop("invert.se output had length different from input length.")
}
} else {
assert(predict.type == "response")
if (is.null(invert.response)) {
stop("cpoApplyFunRegrTarget cannot predict 'response', since invert.response was NULL")
}
inlength = length(target)
invert.response = augmentFun(invert.response, 1, param)
if (!vectorize) {
invert.response = vectorizeFun(invert.response, 1, "cpoApplyFunRegrTarget")
}
target = invert.response(target)
if (!is.numeric(target)) {
stop("cpoAppluFunRegrTarget invert.response did not return a numeric.")
}
if (length(target) != inlength) {
stop("invert.response output length different from input length.")
}
}
target
})
# check function and possibly curry 'param'
#
# If 'fun' takes fewer than minparam params, throw an error.
# If it takes more, or has "...", return a function that
# automatically calls 'fun' with 'param'.
#
# @param fun [function] the function to check / augment
# @param minparams [numeric(1)] min number of params
# @param param [any] the param to augment the function by.
augmentFun = function(fun, minparams, param) {
force(fun)
fun.name = as.character(substitute(fun))
if (is.primitive(fun)) {
# formals() works different on primitive functions like e.g. 'as.character'
forms = formals(args(fun))
} else {
forms = formals(fun)
}
hasdots = "..." %in% names(forms)
if (!length(forms) || (length(forms) < minparams && !hasdots)) {
stopf("%s must take at least %s arguments", fun.name, minparams)
}
if (hasdots || length(forms) > minparams) {
function(...) fun(..., param)
} else {
fun
}
}
# sapply the fun to column(s), check the return
#
# Return a function that vectorizes over any number of columns
# and returns a data.frame with `numreturns` return values.
vectorizeFun = function(fun, numreturns, cponame) {
force(fun)
fun.name = as.character(substitute(fun))
function(...) {
mapply(function(...) {
ret = fun(...)
if (length(ret) != numreturns) {
stopf("%s '%s' did not return a result with length %s", cponame, fun.name, numreturns)
}
if (!is.atomic(ret)) {
stopf("%s '%s' did not return values that simplified to an atomic vector.", cponame, fun.name)
}
ret
}, ...)
}
}
#' @title Log-Transform a Regression Target Variable.
#'
#' @template cpo_doc_intro
#'
#' @description
#' Log-transforms the regression \code{\link[mlr:Task]{Task}}'s target variable.
#'
#' If \code{predict.type} is \dQuote{response} for inversion, the model's prediction is
#' exponentiated.
#'
#' If \code{predict.type} = \dQuote{se} prediction is performed, the model's prediction
#' is taken as the parameters of a lognormal random variable; the inverted prediction is then
#' \code{mean = exp(mean + se^2 / 2)}, \code{se = sqrt((exp(se^2) - 1) * exp(2 * mean + se^2))}.
#'
#' It is therefore recommended to use \dQuote{se} prediction, possibly with the help of
#' \code{\link{cpoResponseFromSE}}.
#'
#' @template cpo_doc_outro
#' @export
cpoLogTrafoRegr = function(id) {
cpo = cpoApplyFunRegrTarget(trafo = log, invert.response = exp,
invert.se = function(mean, se) { cbind(mean = exp(mean + se^2 / 2), se = sqrt((exp(se^2) - 1) * exp(2 * mean + se^2))) },
param = exp(1),
export = "export.none")
if (!missing(id)) {
cpo = setCPOId(id)
}
cpo
}
cpoLogTrafoRegr = wrapFauxCPOConstructor(cpoLogTrafoRegr) # nolint
registerCPO(cpoApplyFun(fun = identity), "data", "general data preprocessing", "Apply an arbitrary function column-wise.")
registerCPO(cpoApplyFunRegrTarget(trafo = identity), "target", "general target transformation", "Apply an arbitrary function to a regression target.")
registerCPO(cpoLogTrafoRegr(), "target", "target transformation", "Log-transform a regression target.")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.