# NOTE(review): stray token "Nothing" commented out -- a bare symbol here would error on source().
# FormatCheck.R is central within the CPO framework. It checks that
# incoming & outgoing data conforms with the properties declared by
# a CPO and with requirements implicit within the CPO framework
# (Task column not changed in Feature Operating CPOs, number of rows
# not changed unless Training Only CPO, data for retrafo is the same
# as data for trafo).
# In the same stride, data format is changed to match the required
# format of the CPO functions -- see `dataformat` argument of
# makeCPO.
##################################
### Externals ###
##################################
# do the preparation before calling trafo:
# - check the data is in an acceptable format (task or df)
# - check the properties are fulfilled
# - split the data
# - get a shape info object
# @param indata [Task | data.frame] incoming data to be fed to the CPO trafo function
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param allowed.properties [character] allowed properties of `indata`
# @param subset.selector [list] information about 'affect.*' parameters that determine which subset of 'indata' is affected
# @param capture.factors [logical(1)] whether to save factor levels of input data in shapeinfo data structure. This is only used if the CPO has 'fix.factors' set to TRUE
# @param operating.type [character(1)] one of 'target', 'feature', 'retrafoless': whether target data, feature data, or both (but only during trafo) may be changed
# @param name [character(1)] name of the cpo, for message printing
# @return [list] the data to feed to the CPO trafo function, as well as meta-information:
# list(indata = list(data, target, data.reduced, target.reduced), shapeinfo, properties, private)
# 'private' is a list containing some fields used by `handleTrafoOutput`.
prepareTrafoInput = function(indata, dataformat, strict.factors, allowed.properties, subset.selector, capture.factors, operating.type, name) {
  assert(checkClass(indata, "data.frame"), checkClass(indata, "Task"))
  # restrict the data to the columns selected by the 'affect.*' parameters and
  # verify that the (subsetted) input satisfies `allowed.properties`
  subsetted = subsetIndata(indata, subset.selector, allowed.properties, "trafo", name)
  indata = subsetted$indata
  # record column names / types (and, with fix.factors, factor levels) of the
  # data as the trafo sees it, so retrafo can later check for conforming data
  shapeinfo = makeInputShapeInfo(indata, capture.factors)
  subset.selector$data = NULL  # drop the data reference before storing the selector
  shapeinfo$subset.selector = subset.selector
  # bring the data into the format requested by the CPO's `dataformat`
  splitted = splitIndata(indata, dataformat, strict.factors, TRUE)
  list(indata = splitted$indata,
    shapeinfo = shapeinfo, properties = subsetted$properties,
    private = list(tempdata = splitted$tempdata, subset.index = subsetted$subset.index,
      dataformat = dataformat, strict.factors = strict.factors,
      origdata = subsetted$origdata, name = name, operating.type = operating.type))
}
# do the preparation before calling retrafo:
# - check data is in an acceptable format (task or df)
# - check the properties are fulfilled
# - check the shape is the same as during trafo
# - split the data
# --> return
# how does mlr predict handle this stuff? they just drop target columns by name
# @param indata [Task | data.frame] incoming data to be fed to the CPO retrafo function
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param allowed.properties [character] allowed properties of `indata`
# @param shapeinfo.input [InputShapeInfo] information about the data shape used to train the CPO
# @param operating.type [character(1)] one of 'target', 'feature', 'retrafoless': whether target data, feature data, or both (but only during trafo) may be changed
# @param name [character(1)] name of the cpo, for message printing
# @return [list] the data to feed to the CPO retrafo function, as well as meta-information:
# list(indata = data in a shape fit to be fed into retrafo, properties, task, private)
# 'task' is the reconstructed task, if any
# 'private' is a list containing some fields used by `handleTrafoOutput`.
prepareRetrafoInput = function(indata, dataformat, strict.factors, allowed.properties, shapeinfo.input, operating.type, name) {
  origdata = indata
  targetop = operating.type == "target"
  # check that input column names and general types match (num / fac, or num / fac / ordered if strict.factors)
  if ("factor.levels" %in% names(shapeinfo.input)) {
    # factor levels were captured during trafo ('fix.factors'): coerce the new
    # data's factor columns onto exactly the levels seen during training
    indata = fixFactors(indata, shapeinfo.input$factor.levels)
  }
  task = NULL
  if ("Task" %in% class(indata)) {
    origdatatype = "Task"
    if (length(shapeinfo.input$target) && length(getTaskTargetNames(indata))) {
      # NOTE: this might be too strict: maybe the user wants to retrafo a Task with the target having a different name?
      # HOWEVER, then either the training indata's task didn't matter (and they should have trained with a data.set?), or it
      # DID matter, in which case it is probably important to have the same data type <==> target name
      assertSetEqual(getTaskTargetNames(indata), shapeinfo.input$target, .var.name = sprintf("Target names of Task %s", getTaskId(indata)))
    }
    traintype = shapeinfo.input$type
    curtype = getTaskDesc(indata)$type
    if (traintype != curtype && traintype != "cluster" && curtype != "cluster") {
      # if either the training or the current task is of type "cluster" (could also mean
      # the training was done with a data.frame) we forgive this here.
      stopf("CPO trained with task of type %s cannot operate on task of type %s.",
        traintype, curtype)
    }
    if (curtype != "cluster" || traintype == "cluster") {
      # if the current task is a cluster, we only do target
      # op stuff if the training type was also a cluster.
      task = indata
    }
    # split the Task into target columns and the feature data.frame
    target = getTaskData(indata, features = character(0))
    indata = getTaskData(indata, target.extra = TRUE)$data
  } else if (is.data.frame(indata)) {
    origdatatype = "data.frame"
    if (any(shapeinfo.input$target %in% names(indata)) || shapeinfo.input$type == "cluster") {
      if (!all(shapeinfo.input$target %in% names(indata))) {
        # only a strict subset of the training target columns is present: refuse
        badcols = intersect(shapeinfo.input$target, names(indata))
        stopf("Some, but not all target columns of training data found in new data. This is probably an error.\n%s%s: %s",
          "Offending column", ifelse(length(badcols) > 1, "s", ""), collapse(badcols, sep = ", "))
      }
      if (targetop) {
        # rebuild a Task from the data.frame; used below for target-operation handling
        task = constructTask(indata, shapeinfo.input$target, shapeinfo.input$type, "[CPO CONSTRUCTED]")
      }
      target = indata[shapeinfo.input$target]
      indata = dropNamed(indata, shapeinfo.input$target)
    } else {
      # no target columns present: treat the data.frame as target-less
      target = indata[character(0)]
      shapeinfo.input$target = NULL
    }
  } else {
    stopf("Data fed into CPO %s retrafo is not a Task or data.frame.", name)
  }
  if (!is.null(task)) {
    # subset the Task according to the 'affect.*' selector recorded during trafo,
    # then verify the feature columns conform to the training shape
    subset.info = subsetIndata(task, shapeinfo.input$subset.selector,
      allowed.properties, "retrafo", name)
    origdata = subset.info$origdata
    indata = subset.info$indata
    assertShapeConform(getTaskData(indata, target.extra = TRUE)$data, shapeinfo.input, strict.factors, name)
  } else {
    # every kind of input looks like 'cluster' here
    allowed.properties %c=% "cluster"
    subset.info = subsetIndata(indata, shapeinfo.input$subset.selector,
      allowed.properties, "retrafo", name)
    indata = subset.info$indata
    assertShapeConform(indata, shapeinfo.input, strict.factors, name)
  }
  reducing = dataformat %in% c("task", "df.all")
  if (targetop && !is.null(task)) {
    # target operation with a Task available: keep both full and reduced data
    split.data = splitIndata(indata, dataformat, strict.factors, TRUE)
    indata = split.data$indata
    if (reducing) {
      indata = list(data = indata$data.reduced,
        target = indata$data)
    } else {
      indata = indata[c("data", "target")]
    }
  } else {
    # feature / retrafoless operation: target is split off entirely
    split.data = splitIndata(indata, if (reducing) "df.features" else dataformat, strict.factors, FALSE)
    indata = split.data$indata
    indata["target"] = list(NULL)  # we want a $target slot with value NULL
  }
  list(indata = indata, properties = subset.info$properties, task = task,
    private = list(tempdata = split.data$tempdata, subset.index = subset.info$subset.index,
      origdata = origdata, dataformat = dataformat, strict.factors = strict.factors,
      name = name, operating.type = operating.type, origdatatype = origdatatype,
      targetnames = names(target) %??% character(0)))
}
# Do the check of the trafo's return value
# - check the data is in an acceptable format (task, df, split dfs)
# - recombine into a task / df
# - check properties are allowed
# - get a shape info object
# @param outdata [Task | data.frame | matrix] data returned by CPO trafo function
# @param prepared.input [list] return object of `prepareTrafoInput`
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
#   Therefore, these properties may be present in `outdata` even though they were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
# to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param convertto [character(1)] only if the operating.type is 'target': type of the new task
# @param simpleresult [character(1)] whether even for df.all / task dataformat the return value will be formatted
#   according to df.features
# @return [list] the data resulting from the CPO operation, as well as meta-information: list(outdata, shapeinfo)
handleTrafoOutput = function(outdata, prepared.input, properties.needed, properties.adding, convertto, simpleresult) {
  ppr = prepared.input$private
  olddata = ppr$origdata  # incoming data that was already given to prepareTrafoInput as 'indata'
  dataformat = ppr$dataformat
  strict.factors = ppr$strict.factors
  operating.type = ppr$operating.type
  name = ppr$name
  subset.index = ppr$subset.index  # index into olddata columns: the columns actually selected by 'affect.*' parameters
  if (dataformat == "task" && !simpleresult) {
    # the user-supplied trafo returned a Task; verify it is internally consistent
    assertTask(outdata, "trafo", name)
  }
  if (operating.type != "target") {
    # for dataformat 'factor', 'numeric' etc, combine the return object of the function with
    # all the columns that were not originally given to the function
    outdata = rebuildOutdata(outdata, ppr$tempdata, dataformat)
  }
  dataformat = getLLDataformat(dataformat)
  if (dataformat %in% c("df.all", "task") && simpleresult) {
    assert(operating.type == "feature")
    dataformat = "df.features"
  }
  # reassemble a complete Task / data.frame from the CPO output and the parts of
  # the original data that were not affected; the recombination strategy depends
  # on the operating type and on whether the original input was a data.frame
  recombined = if (operating.type == "target") {
    recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, TRUE, convertto, name)
  } else if (operating.type == "retrafoless") {
    recombineRetrafolessResult(olddata, outdata, prepared.input$shapeinfo, dataformat, strict.factors, subset.index, name)
  } else if (is.data.frame(olddata)) {
    # implied operating.type == "feature"
    recombinedf(olddata, outdata, dataformat, strict.factors, subset.index, character(0), name)
  } else {
    # implied operating.type == "feature"
    recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, FALSE, name = name)
  }
  # verify the output only has properties that are allowed (properties.needed plus
  # whatever was already present) and none of those declared in properties.adding
  checkOutputProperties(outdata, recombined, prepared.input$shapeinfo$target, prepared.input$properties, properties.needed, properties.adding, operating.type, "trafo", name)
  # record the shape of the trafo output so handleRetrafoOutput can check conformity later
  shapeinfo = makeOutputShapeInfo(outdata)
  if ("Task" %in% class(recombined)) {
    shapeinfo$target = getTaskTargetNames(recombined)
  }
  list(outdata = recombined, shapeinfo = shapeinfo)
}
# do the check of the retrafo's return value
# - check data is in an acceptable format (task, df, split dfs)
# - recombine into a task / df
# - check the properties are fulfilled
# - check the shape is the same as during trafo
# @param outdata [Task | data.frame | matrix] data returned by CPO retrafo function
# @param prepared.input [list] return object of `prepareRetrafoInput`
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
#   Therefore, these properties may be present in `outdata` even though they were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
# to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param convertto [character(1)] type of task to convert to, for target operation cpo
# @param shapeinfo.output [OutputShapeInfo] ShapeInfo describing the shape of the data returned by the CPO `trafo` function when it was called.
# This imposes the same structure on the retrafo return value.
# @return [list] the data resulting from the CPO retrafo operation
handleRetrafoOutput = function(outdata, prepared.input, properties.needed, properties.adding, convertto, shapeinfo.output) {
  ppr = prepared.input$private
  olddata = ppr$origdata  # incoming data that was already given to prepareRetrafoInput as 'indata'
  dataformat = ppr$dataformat
  strict.factors = ppr$strict.factors
  subset.index = ppr$subset.index  # index into olddata columns: the columns actually selected by 'affect.*' parameters
  operating.type = ppr$operating.type
  targetnames = ppr$targetnames
  name = ppr$name
  if (operating.type == "target" && dataformat == "task") {
    # the user-supplied retrafo returned a Task; verify it is internally consistent
    assertTask(outdata, "retrafo", name)
  }
  # whether to ignore target columns of shapeinfo.output
  # (needed when dataformat differs between trafo & retrafo)
  drop.shapeinfo.target = dataformat %in% c("df.all", "task") && operating.type != "target"
  if (operating.type != "target") {
    # tempdata: incoming data that was given to prepareRetrafoInput as 'indata', after subsetting according to 'affect.*' parameters
    outdata = rebuildOutdata(outdata, ppr$tempdata, dataformat)
  }
  dataformat = getLLDataformat(dataformat)
  if (operating.type == "target") {
    # this won't get called at all if operating.type is target and there were no target
    # columns to rebuild. Therefore `olddata` will always be a Task here.
    recombined = recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, TRUE, convertto, name)
  } else {
    if (dataformat %in% c("df.all", "task")) {
      # target is always split off during retrafo
      dataformat = "df.features"
    }
    if (ppr$origdatatype == "data.frame") {
      if (any(targetnames %in% names(olddata))) {
        assert(all(targetnames %in% names(olddata)))  # we also check this in prepareRetrafoInput
      }
      recombined = recombinedf(olddata, outdata, dataformat, strict.factors, subset.index, targetnames, name)
    } else {
      recombined = recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, FALSE, name = name)
    }
  }
  # verify the output only has allowed properties and none declared in properties.adding
  checkOutputProperties(outdata, recombined, targetnames, prepared.input$properties, properties.needed, properties.adding, operating.type, "retrafo", name)
  # check the shape of outdata is as expected (i.e. conforms to the trafo output's shape)
  if (dataformat == "split" && operating.type != "target") {
    # 'split' format: shapeinfo.output is a list of ShapeInfo objects, one per constituent
    assertSetEqual(names(outdata), setdiff(names(shapeinfo.output), "target"))
    for (n in names(outdata)) {
      assertShapeConform(outdata[[n]], shapeinfo.output[[n]], strict.factors, name)
    }
  } else {
    if (operating.type == "target" && dataformat == "task") {
      outdata = getTaskData(outdata, target.extra = TRUE)$data
    }
    assertShapeConform(outdata, shapeinfo.output, strict.factors, name, ignore.target = drop.shapeinfo.target)
  }
  assertTargetShapeConform(recombined, shapeinfo.output, operating.type, name)
  if (operating.type == "target" && ppr$origdatatype == "data.frame") {
    # input was a data.frame (with target columns), so we return a data.frame (with target columns)
    recombined = getTaskData(recombined)
  }
  recombined
}
##################################
### Shape & Properties ###
##################################
# make sure that the factor levels of data.frame 'data' are as described by 'levels'.
# @param data [data.frame | Task] data / task to check / modify
# @param levels [list of character] levels of `data` columns, indexed by `data` column names
# @return [data.frame | Task] the modified `data`
fixFactors = function(data, levels) {
  # S3 generic; dispatches on the class of `data`: see fixFactors.Task and fixFactors.default
  UseMethod("fixFactors")
}
fixFactors.default = function(data, levels) {
  # every column named in `levels` must actually exist in `data`
  assertSubset(names(levels), names(data))
  # re-encode each listed column as a factor with exactly the recorded levels
  for (cname in names(levels)) {
    data[[cname]] = factor(data[[cname]], levels = levels[[cname]])
  }
  data
}
fixFactors.Task = function(data, levels) {
  # fix the levels on the underlying data.frame, then put it back into the Task
  fixed = fixFactors(getTaskData(data), levels)
  changeData(data, fixed)
}
# calculate the properties of the data (only feature types & missings)
# data can be a task or data.frame
# @param data [data.frame | Task] the data to check
# @param targetnames [character] only if `data` is a data.frame: the target columns, which will be ignored
# @return [character] a subset of c("numerics", "factors", "ordered", "missings")
getDataProperties = function(data, targetnames) {
  # obtain a task-description object that holds feature-type counts and a missings flag
  if (is.data.frame(data)) {
    desc = makeTaskDescInternal(NULL, NULL, data, targetnames, NULL, NULL, FALSE)
  } else {
    assertClass(data, "Task")
    desc = getTaskDesc(data)
  }
  counts = desc$n.feat
  # names of feature types with nonzero count, plus "missings" when NAs are present
  c(names(counts)[counts > 0], if (desc$has.missings) "missings")
}
# calculate the properties of the data, as if it were a task.
# If data is a data.frame, we give it the property 'cluster'
# otherwise, we give it the propertye of the task type. If
# applicable, we also set oneclass, multiclass, etc (any from
# the variable 'cpo.targetproperties')
# @param data [data.frame | Task] the data to check
# @return [character] a subset of c("numerics", "factors", "ordered", "missings", "cluster", "classif", "multilabel", "regr", "surv", "oneclass", "twoclass", "multiclass")
getTaskProperties = function(data) {
  props = getDataProperties(data, character(0))
  if (is.data.frame(data)) {
    # a plain data.frame is treated like a 'cluster' task
    return(c(props, "cluster"))
  }
  desc = getTaskDesc(data)
  others = NULL
  if (desc$type == "classif") {
    # one of oneclass / twoclass / multiclass, depending on the number of class levels
    nlev = length(desc$class.levels)
    others = if (nlev == 1) "oneclass" else if (nlev == 2) "twoclass" else "multiclass"
  }
  c(props, desc$type, others)
}
# calculate properties of a general data object.
#
# This may be a Task, data.frame, or list of data.frames
# (as used with dataformat "split").
# @param data [list | data.frame | Task] The data to get properties of
# @param ignore.cols [character] names of columns to ignore, only for data.frame
# @return [character] same as of getTaskProperties
getGeneralDataProperties = function(data, ignore.cols = character(0)) {
  if (is.data.frame(data)) {
    # plain data.frame: drop the ignored columns before inspecting
    getTaskProperties(dropNamed(data, ignore.cols))
  } else if ("Task" %in% class(data)) {
    getTaskProperties(data)
  } else {
    # list of data.frames (dataformat "split"): union over all constituents
    unique(unlist(lapply(data, getTaskProperties)))
  }
}
# give error when shape is different than dictated by shapeinfo.
#
# @param df [data.frame] the data to check
# @param shapeinfo [ShapeInfo] the shape which `df` must conform to
# @param strict.factors [logical(1)] whether to check for 'ordered' as a type differing from 'factor'
# @param name [character(1)] name of the CPO currently being run, for error and debug printing
# @param retrafoless [logical(1)] whether this is the trafo result of a retrafoless CPO. Default FALSE.
# @param ignore.target [logical(1)] whether to ignore columns that have the same name as the target
# column(s) declared in the $target slot. Default FALSE.
# @return [invisible(NULL)]
assertShapeConform = function(df, shapeinfo, strict.factors, name, retrafoless = FALSE,
  ignore.target = FALSE) {
  if (ignore.target && !is.null(shapeinfo$target)) {
    # drop target columns from the expected shape before comparing
    shapeinfo$colnames = setdiff(shapeinfo$colnames, shapeinfo$target)
    shapeinfo$coltypes = dropNamed(shapeinfo$coltypes, shapeinfo$target)
  }
  # column names must match exactly, including order
  if (!identical(names2(df), shapeinfo$colnames)) {
    errmsg = if (retrafoless) {
      "Error in CPO %s: columns may not be changed by cpo.trafo.\nInput was %s, output is %s."
    } else {
      "Error in CPO %s: column name mismatch between training and test data.\nWas %s during training, is %s now."
    }
    stopf(errmsg, name, collapse(shapeinfo$colnames, sep = ", "), collapse(names(df), sep = ", "))
  }
  indata = df[shapeinfo$colnames]
  # equivalence classes of column types: integer ~ numeric always;
  # factor ~ ordered only when strict.factors is FALSE
  typesmatch = if (strict.factors) {
    list(
      c("integer", "numeric"),
      "factor", "ordered")
  } else {
    list(
      c("integer", "numeric"),
      c("factor", "ordered"))
  }
  newcoltypes = vcapply(indata, function(x) class(x)[1])
  for (t in typesmatch) {
    typemismatch = (newcoltypes %in% t) != (shapeinfo$coltypes %in% t)
    if (any(typemismatch)) {
      # FIX: was scalar ifelse(); plain if/else is the correct idiom for a
      # length-1 condition and avoids ifelse()'s attribute-stripping behavior
      plural = sum(typemismatch) > 1
      plurs = if (plural) "s" else ""
      singes = if (plural) "" else "es"
      stopf("Error in CPO %s: Type%s of column%s %s mismatch%s between training and test data.", name,
        plurs, plurs, collapse(names(indata)[typemismatch], sep = ", "), singes)
    }
  }
  invisible(NULL)
}
# give error when shape recorded 'target' differs from task target
#
# only target column names are compared. This is needed for target
# operation CPOs changing target names. If data is not a Task
# or the operating.type is 'target', this does nothing.
#
# @param data [Task | data.frame] the Task.
# @param shapeinfo [ShapeInfo] a ShapeInfo
# @param operating.type [character(1)] operating type: "target", "feature", or "retrafoless"
# @param name [character(1)] name of the CPO, for error message printing
# @return [invisible(NULL)]
assertTargetShapeConform = function(data, shapeinfo, operating.type, name) {
  # only relevant for target operation CPOs that produced a Task; no-op otherwise
  if ("Task" %nin% class(data) || operating.type != "target") {
    return(invisible(NULL))
  }
  newtarget = getTaskTargetNames(data)
  if (!identical(newtarget, shapeinfo$target)) {
    # BUG FIX: the sprintf arguments were swapped -- "Was" must report the
    # target names recorded after trafo (shapeinfo$target), "is now" the
    # current (retrafo) target names.
    stopf("Error in CPO %s: Target name(s) after retrafo differ(s) from target name(s) after trafo. Was '%s', is now '%s'",
      name, collapse(shapeinfo$target, "', '"), collapse(newtarget, "', '"))
  }
  invisible(NULL)
}
# prepare some information about the data shape, so retrafo can check that
# it gets the kind of data it expects
# this needs to be checked both for input and for output
# @param data [data.frame] the data for which the shape is to be created
# @return [ShapeInfo] a simple datastructure that contains information about data column names and types
makeShapeInfo = function(data) {
  # colnames() can be NULL (e.g. zero-column data); normalize to character(0)
  cns = colnames(data) %??% character(0)
  # remember only the primary class of each column
  cts = vcapply(data, function(col) class(col)[1])
  makeS3Obj("ShapeInfo", colnames = cns, coltypes = cts)
}
# like makeShapeInfo, but additionally get the target names and possibly factor levels
# @param indata [data.frame | Task] data for which the shape is to be created
# @param capture.factors [logical(1)] whether to capture factor levels
# @return [InputShapeInfo] a datastructure extending `ShapeInfo` containing information about the data shape
makeInputShapeInfo = function(indata, capture.factors) {
  is.task = "Task" %in% class(indata)
  # for a Task, the shape is taken from the feature columns only
  data = if (is.task) getTaskData(indata, target.extra = TRUE)$data else indata
  ret = makeShapeInfo(data)
  if (is.task) {
    desc = getTaskDesc(indata)
    ret$target = getTaskTargetNames(indata)
    ret$type = desc$type
    if (desc$type == "classif") {
      ret$positive = desc$positive
    }
  } else {
    # a plain data.frame has no target and is treated as a 'cluster' task
    ret$target = character(0)
    ret$type = "cluster"
  }
  if (capture.factors) {
    # remember levels of all factor-like columns (levels() is NULL for others)
    ret$factor.levels = Filter(Negate(is.null), lapply(data, levels))
  }
  addClasses(ret, "InputShapeInfo")
}
# creates shape info for data coming out of trafo, so retrafo can check that the data generated
# by it conforms to the data returned by trafo earlier.
# This does not do certain tests about the form of `outdata`, so it is recommended to call this
# after `recombinetask` was called (but with the *original* data, not the recombined data).
# @param outdata [data.frame | Task | list of data.frame] data returned by `trafo` function of which the shape is to be covered
# @return [OutputShapeInfo] This either extends `ShapeInfo` (if outdata is `data.frame` or `Task`) or is a list of `ShapeInfo` objects.
makeOutputShapeInfo = function(outdata) {
  if ("Task" %in% class(outdata)) {
    # shape of the feature columns, plus target names and task type
    res = makeShapeInfo(getTaskData(outdata, target.extra = TRUE)$data)
    res$target = getTaskTargetNames(outdata)
    res$type = getTaskDesc(outdata)$type
  } else if (is.data.frame(outdata)) {
    res = makeShapeInfo(outdata)
  } else {
    # data is split by type, so we get the shape of each of the constituents
    res = lapply(outdata, makeShapeInfo)
  }
  addClasses(res, "OutputShapeInfo")
}
# check properties of data returned by trafo or retrafo function
# @param outdata [data.frame | Task | list] data returned by (re)trafo function (after rebuildOutdata)
# @param recombined [data.frame | Task] recombined data as will be returned to the user
# @param target.names [character] names of target columns
# @param input.properties [character] input properties as determined by prepare***Input
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
# Therefore, these properties may be present in `outdata` even though there were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
# to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param operating.type [character(1)] operating type of cpo, one of 'target', 'feature', 'retrafoless'
# @param whichfun [character(1)] name of the CPO stage
# @param name [character(1)] name of the CPO
# @return [invisible(NULL)]
checkOutputProperties = function(outdata, recombined, target.names, input.properties, properties.needed, properties.adding, operating.type, whichfun, name) {
  # allowed.properties: allowed properties of `outdata`. That is the union of the
  # CPO's 'properties.needed' field and the properties already present in the input
  allowed.properties = union(input.properties, properties.needed)
  if (operating.type == "feature") {
    present.properties = getGeneralDataProperties(outdata, target.names)
    # getGeneralDataProperties also picks up properties of the target,
    # which are not relevant for a feature operating CPO; drop them.
    present.properties = setdiff(present.properties, cpo.all.target.properties)
  } else {
    present.properties = getTaskProperties(recombined)
    if (operating.type == "target") {
      # target operating CPOs can not change feature properties, but
      # affect.*-subsetting may have hidden some feature properties from
      # 'prepared.input$properties' while they still show up in
      # 'present.properties'; remove all feature properties here.
      present.properties = setdiff(present.properties, cpo.dataproperties)
    }
  }
  assertPropertiesOk(present.properties, allowed.properties, whichfun, "out", name)
  assertPropertiesOk(present.properties, setdiff(allowed.properties, properties.adding), whichfun, "adding", name)
}
# give userfriendly error message when data has properties it is not allowed to have.
# @param present.properties [character] properties that were found in a given data object
# @param allowed.properties [character] the properties that the data object is allowed to have
# @param whichfun [character(1)] name of the CPO stage
# @param direction [character(1)] either "in" (data is being sent into CPO), "out" (data was returned by CPO function, some
# properties are present that were *not* present in the input data, but the given properties were not declared as
# 'properties.needed'), or "adding" (data was returned by CPO function, but the given properties *were* declared as
# 'properties.adding' and hence must not be present)
# @return [invisible(NULL)]
assertPropertiesOk = function(present.properties, allowed.properties, whichfun, direction, name) {
  if (!isPropertyStrict()) {
    # property checking is globally disabled
    return(invisible(NULL))
  }
  badprops = setdiff(present.properties, allowed.properties)
  if (!length(badprops)) {
    return(invisible(NULL))
  }
  # FIX: was three scalar ifelse() calls; a single if/else computed once is the
  # correct idiom for a length-1 condition
  propword = if (length(badprops) > 1) "ies" else "y"
  if (direction == "in") {
    stopf("Data going into CPO %s has propert%s %s that %s can not handle.",
      whichfun, propword,
      collapse(badprops, sep = ", "), name)
  } else if (direction == "out") {
    stopf("Data returned by CPO %s has propert%s %s that %s did not declare in .properties.needed.",
      whichfun, propword,
      collapse(badprops, sep = ", "), name)
  } else {
    # 'adding' properties may not be present during output, but the error message
    # would be confusing if we used the 'out' message for this.
    assert(direction == "adding")
    stopf("Data returned by CPO %s has propert%s %s that %s declared in .properties.adding.\n%s",
      whichfun, propword,
      collapse(badprops, sep = ", "), name,
      paste("properties in .properties.adding may not be present in", whichfun, "output."))
  }
}
# Check that the given task does not lie about itself.
#
# This is used on user-returned tasks. Check that task.desc$size equals the row number, that
# the target names occur in the task, etc.
# @param task [Task] the task.
# @param whichfun [character(1)] which function returned the task: trafo, retrafo
# @param name [character(1)] name of the cpo
# @return [invisible(NULL)]
assertTask = function(task, whichfun, name) {
  # minimal structural sanity check before touching any slots
  if (!is.list(task) || !is.list(task$task.desc) || "id" %nin% names(task$task.desc)) {
    stopf("Object returned by %s %s was not a task.", whichfun, name)
  }
  taskdesignator = function() sprintf("Task %s returned by %s %s", task$task.desc$id, whichfun, name)
  if (!is.environment(task$env)) {
    stopf("%s had no environment in its '$env' slot.", taskdesignator())
  }
  task.desc = task$task.desc
  target = task.desc$target
  # expected class hierarchy for each task type
  required.classes = switch(task.desc$type,
    classif = c("ClassifTask", "SupervisedTask"),
    regr = c("RegrTask", "SupervisedTask"),
    cluster = c("ClusterTask", "UnsupervisedTask"),
    surv = c("SurvTask", "SupervisedTask"),
    multilabel = c("MultilabelTask", "SupervisedTask"),
    stopf("%s task type must be one of classif, regr, cluster, multilabel, surv", taskdesignator()))
  if (!identical(task$type, task.desc$type)) {
    stopf("%s task type and task.desc type must be the same", taskdesignator())
  }
  required.classes = c(required.classes, "Task")
  if (!identical(required.classes, class(task))) {
    stopf("%s must have classes %s", taskdesignator(), collapse(required.classes, ", "))
  }
  required.classes = paste0(required.classes, "Desc")
  if (!identical(required.classes, class(task.desc))) {
    stopf("%s task.desc must have classes %s", taskdesignator(), collapse(required.classes, ", "))
  }
  # basic slot type checks; names are used as error messages below
  checks = c(
    `id must be a character(1)` = testString(task.desc$id),
    `data must be a data.frame with unique column names` = testDataFrame(task$env$data, col.names = "unique"),
    `target must be a character` = testCharacter(target),
    `task.desc must have numeric 'n.feat' slot` = testNumeric(task.desc$n.feat))
  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }
  # compare integer-like counts without being strict about integer vs double storage
  identIntLikeNum = function(x, y) identical(as.numeric(x), as.numeric(y))
  # count feature columns (target columns excluded) by their primary class
  cl = table(dropNamed(vcapply(task$env$data, function(x) class(x)[1]), target))
  # consistency between the task.desc bookkeeping and the actual data
  checks = c(
    `target must be a subset of task columns` = testSubset(target, colnames(task$env$data)),
    `number of 'numerics' features listed in task.desc is wrong` =
      identIntLikeNum(sum(cl[c("integer", "numeric")], na.rm = TRUE), task.desc$n.feat["numerics"]),
    `number of 'factors' features listed in task.desc is wrong` =
      identIntLikeNum(sum(cl["factor"], na.rm = TRUE), task.desc$n.feat["factors"]),
    `number of 'ordered' features listed in task.desc is wrong` =
      identIntLikeNum(sum(cl["ordered"], na.rm = TRUE), task.desc$n.feat["ordered"]),
    `'has.missings' slot in task.desc is wrong` =
      identical(anyMissing(task$env$data), task.desc$has.missings),
    `'size' slot in task.desc is wrong` =
      identIntLikeNum(nrow(task$env$data), task.desc$size),
    `'has.weights' slot in task.desc is wrong` =
      identical(!is.null(task$weights), task.desc$has.weights),
    # BUG FIX: check name had a stray leading quote (''has.blocking'), which
    # garbled the error message produced from names(checks)
    `'has.blocking' slot in task.desc is wrong` =
      identical(!is.null(task$blocking), task.desc$has.blocking))
  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }
  # target arity depends on the task type
  if (task.desc$type %in% c("classif", "regr") && length(target) != 1) {
    stopf("%s is of type %s but has %s targets.", taskdesignator(), task.desc$type,
      length(target))
  } else if (task.desc$type == "surv" && length(target) != 2) {
    stopf("%s is of type surv and must have exactly two targets.", taskdesignator())
  } else if (task.desc$type == "multilabel" && length(target) < 2) {
    stopf("%s is of type multilabel and must have more than one target.", taskdesignator())
  }
  # type-specific consistency checks
  checks = switch(task.desc$type,
    classif = c(
      `class levels in task.desc are not the factor levels of the target column` =
        testSetEqual(levels(task$env$data[[target]]), task.desc$class.levels),
      `task.desc 'positive' and 'negative' slots must be NA for multiclass tasks` =
        length(task.desc$class.levels) <= 2 || (is.na(task.desc$positive) && is.na(task.desc$negative)),
      `task.desc 'positive' and 'negative' slots must be both class levels of the target` =
        length(task.desc$class.levels) != 2 || (
          testString(task.desc$positive) &&
          testString(task.desc$negative) &&
          testSetEqual(c(task.desc$positive, task.desc$negative), task.desc$class.levels)),
      `task.desc 'positive' slot must be the class level, 'negative' slot must be not_<positive>` =
        length(task.desc$class.levels) != 1 || (
          testString(task.desc$positive) &&
          testString(task.desc$negative) &&
          identical(task.desc$positive, task.desc$class.levels) &&
          identical(task.desc$negative, paste0("not_", task.desc$class.levels)))),
    regr = TRUE,
    cluster = TRUE,
    surv = c(
      `time column must be numeric` = testNumeric(task$env$data[[target[1]]]),
      `event column must be logical` = testLogical(task$env$data[[target[2]]])),
    multilabel = c(
      `class.levels in task.desc must equal target names.` =
        testSetEqual(task.desc$class.levels, target)),
    stop("Unexpected error: task.desc$type was bad."))
  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }
  invisible(NULL)
}
##################################
### Task Splitting ###
##################################
# Get the *indices* of columns of 'data' that are referenced by affect.* params.
# E.g. if 'affect.type == "numeric"', the indices of all numeric columns are returned.
#
# All of the parameters are just the relevant 'affect.*' parameters as given to the CPO constructor, with the
# exception of 'data'. `getColIndices` can therefore be called using `do.call(getColIndices, insert(affect.param.list, list(data = DATA)))`
# @param data [data.frame] The data to get indices from
# @param type [character] subset of `c("numeric", "factor", "ordered", "other")`: all columns of the given type are included
# @param index [integer] index into data columns to include. Order is preserved, and they are ordered before all other matches
# @param names [character] names of data columns to include. Order is preserved, and they are ordered before other matches, except `index`
# @param pattern [character(1)] `grepl` pattern. Data columns that match the pattern are included
# @param invert [logical(1)] If TRUE, all matches are inverted, i.e. only columns that do not match any of the criteria are returned
# @param pattern.ignore.case [logical(1)] the `ignore.case` parameter of `grepl`: ignore case of `pattern`.
# @param pattern.perl [logical(1)] the `perl` parameter of `grepl`: use perl regex syntax
# @param pattern.fixed [logical(1)] the `fixed` parameter of `grepl`: don't interpret pattern as regex, but as fixed pattern.
# @return [integer]: index into `data` columns for selected columns.
getColIndices = function(data, type, index, names, pattern, invert, pattern.ignore.case, pattern.perl, pattern.fixed) {
  # classify each column: integers count as numeric; everything that is not
  # numeric / factor / ordered is lumped into "other".
  coltypes = vcapply(data, function(x) class(x)[1])
  coltypes[coltypes == "integer"] = "numeric"
  coltypes[!coltypes %in% c("numeric", "factor", "ordered")] = "other"
  matchcols = coltypes %in% type
  if (!is.null(pattern)) {
    # name the grepl() arguments explicitly: passing ignore.case / perl / fixed
    # positionally is fragile and easy to misread.
    matchcols = matchcols | grepl(pattern, colnames(data), ignore.case = pattern.ignore.case,
      perl = pattern.perl, fixed = pattern.fixed)
  }
  badnames = names[!names %in% names(data)]
  if (length(badnames)) {
    # scalar condition: plain if () ... else is preferable to ifelse() here
    stopf("Column%s not found: %s", if (length(badnames) > 1) "s" else "", collapse(badnames, sep = ", "))
  }
  # resulting order: explicit 'index' entries first, then 'names' matches,
  # then type / pattern matches; duplicates are dropped via setdiff.
  index %c=% setdiff(match(names, names(data)), index)
  index %c=% setdiff(which(matchcols), index)
  if (invert) {
    index = setdiff(seq_along(data), index)
  }
  index
}
# Translate the 'dataformat' option for internal use ("low level" dataformat) to a simplified version.
#
# most of CPOFormatCheck's split / recombine logic doesn't care about "factor", "onlyfactor", "ordered" or "numeric"
# and just treats it as "most" or "all" dataformat, subsetting the resulting data. This significantly
# simplifies the "splitting" and "recombining" of input / output data.
# E.g. if dataformat is "factor":
# (1) the data is split according to "most" -- this is translated by 'getLLDataformat'
# (2) the data that is handed to the cpo.trafo function is gotten by 'getIndata' which takes the '$factor' slot of the split data, in this case
# (3) cpo.trafo returns its output. This output is put back together with the other data using 'rebuildOutdata'
# (4) all checks are then done as if cpo.trafo had used the "most" dataformat and not touched any but the '$factor' slots.
# @param dataformat [character(1)] the dataformat to translate
# @return [character(1)] a simplified dataformat option
getLLDataformat = function(dataformat) {
  # the column-type-specific formats are all implemented on top of "split";
  # every other dataformat passes through unchanged
  switch(dataformat,
    factor = ,
    numeric = ,
    ordered = "split",
    dataformat)
}
# Get element of data according to dataformat.
#
# This is the complementary operation (category theoretically the quotient object) of `getLLDataformat`.
# With 'indata' being split according to dataformat "factor", "onlyfactor", "ordered", or "numeric", get the relevant subitem
# from the indata after it was split according to "most" or "all".
# If dataformat is none of these, this is a noop.
# @param indata [list of data.frame | data.frame | Task] the result of splitting incoming data according to `getLLDataformat(dataformat)`.
# @param dataformat [character(1)] one of the possible dataformat options
# @return [data.frame | Task] data formatted according to dataformat, to be fed into a trafo / retrafo function
getIndata = function(indata, dataformat) {
  # type-specific dataformats only hand the matching slot of the split data
  # to the trafo / retrafo function; all other formats are a noop.
  is.typed = dataformat %in% c("factor", "ordered", "numeric")
  if (!is.typed) {
    return(indata)
  }
  indata[[dataformat]]
}
# Reassemble data that was split according to some of the `dataformat` options.
#
# If dataformat is one of "factor", "onlyfactor", "ordered", or "numeric", then
# the data returned by trafo / retrafo (only the modified factors / etc) needs
# to be integrated with the remaining unmodified columns. With 'outdata' being a
# data slice according to dataformat, this function puts the returned data back
# into the "tempdata" block from which the input was taken.
#
# If dataformat is none of these, this is a noop.
# @param outdata [data.frame | list of data.frame | Task] the data returned by a trafo / retrafo function
# @param tempdata [data.frame | list of data.frame | Task] the original data, split according to `getLLDataformat(dataformat)`.
# @param dataformat [character(1)] the dataformat option of the current CPO
# @return [data.frame | list of data.frame | Task] `outdata`, possibly embedded into `tempdata`.
rebuildOutdata = function(outdata, tempdata, dataformat) {
  # for type-specific dataformats the CPO returned only one slot of the split
  # data; embed that slot back into the full split structure from 'tempdata'
  typed.formats = c("factor", "ordered", "numeric")
  if (dataformat %in% typed.formats) {
    tempdata[[dataformat]] = outdata
    outdata = tempdata
  }
  # a trafo may legitimately return a matrix for the numeric part;
  # normalize it to a data.frame so downstream code sees a uniform type
  if (dataformat %in% c("numeric", "split") && is.matrix(outdata$numeric)) {
    outdata$numeric = as.data.frame(outdata$numeric)
  }
  outdata
}
# split 'outdata' into subsets given by 'which'. If 'which' does not contain "ordered", then
# 'ordered' columns are put together with 'factor' columns.
# @param which [character] subset of `c("numeric", "factor", "ordered", "other")`: by which types to split
# @param data [data.frame | any] data to split. This can also be any other list or vector if `types` is given
# @param types [character | NULL] types of columns / elements of `data`. If this is not provided, it is
# determined from `data`. This is useful if `data` is not a data.frame but e.g. only a vector of column names.
# @return [list of data.frame | list of any] a list of subsets of `data`, named according to `which`.
splitColsByType = function(which = c("numeric", "factor", "ordered", "other"), data, types = NULL) {
  # 'types' may be supplied explicitly, in which case 'data' can be any
  # list / vector (e.g. a vector of column names); otherwise derive the
  # types from the columns of 'data'.
  if (is.null(types)) {
    types = vcapply(data, function(x) class(x)[1])
  }
  match.arg(which, several.ok = TRUE)
  # when "ordered" is not requested as its own group, ordered columns
  # are counted among the factors
  factor.types = if ("ordered" %in% which) "factor" else c("factor", "ordered")
  group.types = list(
    numeric = c("integer", "numeric"),
    factor = factor.types,
    ordered = "ordered")
  result = lapply(which, function(grp) {
    if (grp == "other") {
      data[!types %in% c("integer", "numeric", "factor", "ordered")]
    } else {
      data[types %in% group.types[[grp]]]
    }
  })
  names(result) = which
  result
}
# Convenience function for 'dataformat' splitting.
#
# calls `splitdf` or `splittask`, depending on datatype of `data`.
#
# This performs no checks. possibly need to check that properties are adhered to
# in retrafo, must also check if the format is the same as during training
# 'possibly' here means: if not attached to a learner
#
# @param data [Task | data.frame] the data to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
# a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splitX = function(data, dataformat = c("df.features", "split", "df.all", "task"), strict.factors) {
  # dispatch on the input type: data.frames and Tasks are split differently
  dataformat = match.arg(dataformat)
  splitter = if (is.data.frame(data)) splitdf else splittask
  splitter(data, dataformat, strict.factors)
}
# check whether the first level of a classif target is not the positive level
#
# @param task [Task] the task to check
# @return [logical(1)] TRUE when the first level of a classif target is not the positive level, FALSE otherwise, and
# for non-classif task.
isLevelFlipped = function(task) {
  # only (binary) classification tasks can have a flipped level order
  if (getTaskType(task) != "classif") {
    return(FALSE)
  }
  pos = getTaskDesc(task)$positive
  assert(!is.null(pos))
  if (is.na(pos)) {
    # no positive class defined (e.g. multiclass): never flipped
    return(FALSE)
  }
  target = getTaskData(task, target.extra = TRUE)$target
  assert(length(levels(target)) <= 2)
  flipped = !identical(levels(target)[1], pos)
  if (flipped) {
    # sanity check: the positive level must then be the second one
    assert(identical(levels(target)[2], pos))
  }
  flipped
}
# reorder the levels of a classif target to make the positive level the first one
#
# @param data [data.frame] the data frame containing the target
# @param target [character(1) | numeric(1)] the name or index of the target column. Is assumed to be a factor with two levels.
# @return [data.frame] the input data with the two levels of the target column flipped.
flipTaskTarget = function(data, target) {
  # rebuild the target factor with the level order reversed;
  # the actual values stay unchanged
  col = data[[target]]
  data[[target]] = factor(col, levels = rev(levels(col)))
  data
}
# reorder levels of classif target if the original task was level flipped
#
# @param data [data.frame] the data frame containing the target to be flipped
# @param task [Task] the original task. The target column of this task must be the one of the data.frame
# @return [data.frame] the input data, potentially with the two levels of the target column flipped.
unflipTarget = function(data, task) {
  # nothing to do unless the original task had its positive level second
  if (!isLevelFlipped(task)) {
    return(data)
  }
  flipTaskTarget(data, getTaskTargetNames(task))
}
# This does the 'dataformat' splitting up of Tasks.
#
# This is the sister of `splitdf` which gets applied to `data.frame`.
# @param task [Task] the task to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
# a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splittask = function(task, dataformat, strict.factors) {
  if (dataformat == "task") {
    # hand the task through unchanged
    return(list(data = task, target = getTaskTargetNames(task)))
  }
  if (dataformat == "df.all") {
    data = getTaskData(task)
    target = getTaskTargetNames(task)
    # restore the user-visible level order for flipped classif tasks
    data = unflipTarget(data, task)
    return(list(data = data, target = target))
  }
  # "df.features" / "split": separate features from target columns.
  # The target is always handed on as a data.frame.
  features = getTaskData(task, target.extra = TRUE)$data
  trg = getTaskData(task, features = character(0))
  if (isLevelFlipped(task)) {
    trg = flipTaskTarget(trg, 1)
  }
  if (dataformat == "df.features") {
    return(list(data = features, target = trg))
  }
  colsplit = c("numeric", "factor", if (strict.factors) "ordered", "other")
  list(data = splitColsByType(colsplit, features), target = trg)
}
# This does the 'dataformat' splitting up of data.frames.
#
# When creating a `Task` from a `data.frame` for `dataformat == "task"`, a `ClusterTask` is generated.
# @param df [data.frame] the data to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
# a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splitdf = function(df, dataformat, strict.factors) {
  # a data.frame never carries target columns; the zero-column slice below
  # serves as the (empty) target data.frame where one is expected
  empty.target = df[, character(0), drop = FALSE]
  if (dataformat == "task") {
    # wrap the data into a (target-free) ClusterTask
    return(list(data = makeClusterTask("[CPO CONSTRUCTED]", data = df, fixup.data = "no", check.data = FALSE), target = character(0)))
  }
  if (dataformat == "df.all") {
    return(list(data = df, target = character(0)))
  }
  if (dataformat == "df.features") {
    return(list(data = df, target = empty.target))
  }
  if (dataformat == "split") {
    colsplit = c("numeric", "factor", if (strict.factors) "ordered", "other")
    return(list(data = splitColsByType(colsplit, df), target = empty.target))
  }
}
# Take subset of data according to 'affect.*' parameters
#
# @param indata [Task | data.frame]
# @param subset.selector [list] information about 'affect.*' parameters that determine which subset of 'indata' is affected
# @param allowed.properties [character] allowed properties of `indata`
# @param whichfun [character(1)] name of the CPO stage
# @param cpo.name [character(1)] name of the CPO
# @return [list] list(origdata, indata, subset.index, properties)
subsetIndata = function(indata, subset.selector, allowed.properties, whichfun, cpo.name) {
# keep a reference to the unmodified input; it is returned alongside the subset
origdata = indata
if ("Task" %in% class(indata)) {
# Tasks: the affect.* selection refers to feature columns only, so hand
# only the feature data to getColIndices
subset.selector$data = getTaskData(indata, target.extra = TRUE)$data
subset.index = do.call(getColIndices, subset.selector)
# subsetTask, but keep everything in order
new.subset.index = featIndexToTaskIndex(subset.index, indata)
indata.data = getTaskData(indata)
# avoid rebuilding the task when the selection is in fact the full task
if (!identical(as.integer(new.subset.index), seq_along(indata.data))) {
indata = changeData(indata, indata.data[new.subset.index])
}
} else {
# data.frames: the affect.* selection applies to all columns directly
subset.selector$data = indata
subset.index = do.call(getColIndices, subset.selector)
indata = indata[subset.index]
}
# verify that the (subset) data only exhibits properties the CPO declares
# it can handle; errors out with the CPO name otherwise
present.properties = getTaskProperties(indata)
assertPropertiesOk(present.properties, allowed.properties, whichfun, "in", cpo.name)
list(origdata = origdata, indata = indata, subset.index = subset.index,
properties = present.properties)
}
# Split cpo input data according to dataformat
#
# Creates also 'tempdata', the data after the split but before
# subsetting (useful for dataformat 'numeric', 'factors' etc)
# and possibly 'reduced.indata', which reduces df.all and task into df.features.
# @param data [data.frame | Task] the input data
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param create.reduced [logical(1)] whether to create 'reduced' indata
# @return [list] list(indata, tempdata). indata is the proper input for the CPO function,
# a list(data, target [, data.reduced, target.reduced]). tempdata the data after split before subsetting.
splitIndata = function(data, dataformat, strict.factors, create.reduced) {
  # split according to the low-level dataformat; remember the full split
  # structure ('tempdata') for later recombination
  indata = splitX(data, getLLDataformat(dataformat), strict.factors)
  tempdata = indata$data
  indata$data = getIndata(indata$data, dataformat)
  if (create.reduced) {
    # the "reduced" view always presents plain features / target, even when
    # the CPO's own dataformat is "task" or "df.all"
    if (dataformat %in% c("task", "df.all")) {
      reduced.indata = splitX(data, "df.features", strict.factors)
    } else {
      reduced.indata = indata
    }
    names(reduced.indata) = paste0(names(reduced.indata), ".reduced")
    indata %c=% reduced.indata
  }
  list(indata = indata, tempdata = tempdata)
}
##################################
### Task Recombination ###
##################################
# Task / Data recombination entails checking that data / target was only modified if allowed by the CPO type,
# checking that the number of rows didn't change, and relevant properties didn't change.
# Recombine the data previously split up by `splitdf` / `splittask` with `dataformat` being "most" or "all",
# and after the CPO trafo / retrafo function performed its operations on it.
#
# recombineLL is called by both recombinetask and recombinedf, and does the checking (e.g. number or rows did not change)
# that is common to both.
#
# 'LL' meaning 'low level'
# @param olddata [list of data.frame] data as fed to the CPO, for reference of correct row number etc.
# @param newdata [list of data.frame] data as returned by trafo / retrafo
# @param targetnames [character] names of the target columns within `olddata`; these are reattached unchanged
# @param strict.factors [logical(1)] whether 'ordered' columns are treated as a type separate from 'factor'
# @param subset.index [integer] subset of 'task' features that were selected by 'affect.*' parameters
# @param name [character(1)] CPO name for pretty debug printing
# @return [data.frame] the data in `newdata` combined into a single data.frame.
recombineLL = function(olddata, newdata, targetnames, strict.factors, subset.index, name) {
  allnames = names(olddata)
  # the returned list must contain exactly the slots the data was split into
  needednames = c("numeric", "factor", "other", if (strict.factors) "ordered")
  if (!isTRUE(checkSetEqual(names(newdata), needednames))) {
    stopf('CPO %s gave bad return. The returned value must be a list with names {"%s"}.',
      name, collapse(needednames, sep = '", "'))
  }
  # set aside target columns and columns that were not handed to the CPO
  # (not in subset.index); both are reattached unchanged at the end
  targetdata = olddata[targetnames]
  olddata = dropNamed(olddata, targetnames)
  unsubsetdata = olddata[-subset.index]
  olddata = olddata[subset.index]
  dfs = vlapply(newdata, is.data.frame)
  if (any(!dfs)) {
    is.plur = sum(!dfs) > 1
    # scalar condition: use if () ... else rather than ifelse()
    stopf("Return of %s element%s %s %s not a data.frame.", name, if (is.plur) "s" else "",
      collapse(names(dfs)[!dfs], sep = ", "), if (is.plur) "are" else "is")
  }
  # check no new names clash with other names
  # this kind of sucks when a CPO just happens to change the names to something thats already there
  # but we also don't want to surprise the user about us unilaterally changing names, so he needs to
  # take care of that.
  jointargetnames = c(targetnames, names(unsubsetdata), unlist(lapply(newdata, names)))
  if (any(duplicated(jointargetnames))) {
    # BUGFIX: 'sep = ", "' was previously passed to unique() (where it was
    # silently swallowed by '...') instead of collapse(), so duplicate names
    # were joined with collapse()'s default separator.
    stopf("CPO %s gave bad result\nduplicate column names %s", name,
      collapse(unique(jointargetnames[duplicated(jointargetnames)]), sep = ", "))
  }
  types = vcapply(olddata, function(x) class(x)[1])
  splitargetnames = splitColsByType(names(newdata), names(olddata), types) # list(numeric = [colnames], factor = [colnames]...
  numrows = nrow(olddata)
  namesorder = allnames
  for (splittype in names(splitargetnames)) {
    # the row count of every returned slot must match the input
    if (numrows != nrow(newdata[[splittype]])) {
      stopf("Number of rows of %s data returned by %s did not match input\nCPO must not change row number.",
        splittype, name)
    }
    # if the CPO renamed columns within this type group, move those columns
    # to the end of the overall column ordering
    if (!identical(splitargetnames[[splittype]], names(newdata[[splittype]]))) {
      namesorder = setdiff(namesorder, splitargetnames[[splittype]])
      namesorder %c=% names(newdata[[splittype]])
    }
  }
  # reassemble: untouched columns + CPO output + target columns, then restore order
  newdata = cbind(unsubsetdata, do.call(cbind, unname(newdata)), targetdata)
  assertSetEqual(names(newdata), namesorder)
  newdata[namesorder]
}
# Recombine a task that was previously (potentially) split up according to `dataformat` and then changed by trafo / retrafo.
#
# This is used when the split up data was created from a task, and if (therefore) the result of the
# CPO is again expected to be a task.
#
# this checks that the result of trafo / retrafo has the proper type, that target and type didn't change,
# (if dataformat == "task"), and that the number of rows is the same. It then reconstructs the complete task that
# will be output by the CPO.
# @param task [Task] old task, used for input, for comparison
# @param newdata [Task | data.frame | list of data.frame] output of cpo.trafo / cpo.retrafo. This has the same format
# as `splittask(task, dataformat)`
# @param dataformat [character(1)] the dataformat used, this is `getLLDataformat` applied to the CPO's dataformat parameter.
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param subset.index [integer] subset of 'task' features that were selected by 'affect.*' parameters
# @param targetbound [logical(1)] TRUE for target operating CPO, FALSE for feature operating CPO.
# @param newtasktype [character(1)] only if `targetbound`, type of new task. Give even if no task conversion happens.
# @param name [character(1)] CPO name for pretty debug printing
# @return [Task] the task incorporating the changes done by the CPO to `newdata`.
recombinetask = function(task, newdata, dataformat = c("df.all", "task", "df.features", "split"),
strict.factors, subset.index, targetbound, newtasktype, name) {
dataformat = match.arg(dataformat)
if (is.data.frame(task)) {
# only if 'targetbound': wrap a data.frame input into a ClusterTask so the
# task-based recombination logic below can be reused
task = makeClusterTask(id = "[CPO CONSTRUCTED]", data = task, fixup.data = "no", check.data = FALSE)
}
if (dataformat %in% c("df.features", "split")) {
if (targetbound) {
# return is just 'target' in a df.
if (!is.data.frame(newdata)) {
stopf("CPO %s gave bad result\nmust return a data.frame containing the target.",
name)
}
olddata = getTaskData(task)
oldtnames = getTaskTargetNames(task)
newtnames = names(newdata)
# three cases for how the returned target relates to the old target columns:
if (setequal(newtnames, oldtnames)) {
# (1) same target names: overwrite the target columns in place
olddata[newtnames] = newdata
newdata = olddata
} else if (length(oldtnames) == 1 && length(newdata) == 1) {
# (2) a single target column was renamed: replace its contents, then rename
assert(length(oldtnames) == 1)
# note that this can NOT be combined with
# the olddata[newtnames] block above!
# also note the double brackets [[ ]].
olddata[[oldtnames]] = newdata[[1]]
names(olddata)[names(olddata) == oldtnames] = names(newdata)
newdata = olddata
} else {
# (3) number of target columns changed: drop old targets, append new ones
newdata = cbind(dropNamed(olddata, oldtnames), newdata)
}
if (anyDuplicated(colnames(newdata))) {
stopf("CPO %s introduced duplicate column names", name)
}
if (newtasktype == "classif") {
# restore the user-visible level order if the input task was level flipped
newdata = unflipTarget(newdata, task)
}
return(constructTask(newdata, newtnames, newtasktype, getTaskId(task), isLevelFlipped(task)))
} else {
# feature operating CPO: recombine the feature data.frame, keep the task shell
return(changeData(task, recombinedf(getTaskData(task), newdata, dataformat, strict.factors, subset.index, getTaskTargetNames(task), name)))
}
}
if (dataformat == "df.all") {
# validate the returned data.frame, then convert it back into a Task
checkDFBasics(task, newdata, targetbound, name)
if (!targetbound) {
newdata = unflipTarget(newdata, task)
newdata = changeData(task, newdata)
} else {
if (newtasktype == "classif") {
newdata = unflipTarget(newdata, task)
}
newdata = constructTask(newdata, getTaskTargetNames(task), newtasktype, getTaskId(task), isLevelFlipped(task))
}
}
# from here on 'newdata' is a Task (dataformat "task", or converted above)
if (nrow(getTaskData(task)) != nrow(getTaskData(newdata))) {
stopf("CPO %s must not change number of rows", name)
}
new.subset.index = featIndexToTaskIndex(subset.index, task)
if (targetbound) {
# everything may change except size, n.feat and missings
fulldata = recombinedf(getTaskData(task), getTaskData(newdata), "df.all", strict.factors, new.subset.index, character(0), name)
fulltask = constructTask(fulldata, getTaskTargetNames(newdata), newtasktype, getTaskId(newdata), isLevelFlipped(newdata))
# a target operating CPO must leave the affected feature columns untouched
checkColumnsEqual(getTaskData(task, target.extra = TRUE)$data[subset.index],
getTaskData(newdata, target.extra = TRUE)$data, "non-target column", name)
checkTaskBasics(task, fulltask, setdiff(names(getTaskDesc(task)), c("n.feat", "has.missings", "has.blocking", "has.weights")), name)
return(fulltask)
}
#check type didn't change
assert(getTaskType(task) == getTaskType(newdata))
assertSetEqual(names(getTaskDesc(task)), names(getTaskDesc(newdata)))
# check target didn't change
checkColumnsEqual(getTaskData(task, features = character(0)),
getTaskData(newdata, features = character(0)), "target column", name)
checkTaskBasics(subsetTask(task, features = subset.index), newdata, c("id", "n.feat", "has.missings"), name)
changeData(task, recombinedf(getTaskData(task), getTaskData(newdata), "df.all", strict.factors, new.subset.index, character(0), name))
}
# convert an index of feature columns to an index w.r.t. the whole task
#
# A column index that references columns with respect the data
# columns only is converted to the column index with respect the
# whole task data.frame (including target columns).
#
# Target columns are included in this index. If feat.index
# is a sorted numeric, the target columns just get sorted into
# the feat.index; otherwise they are put at the beginning.
# @param feat.index [numeric] index of columns with respect to feature cols only
# @param task [Task] the task
# @return [numeric] index w.r.t. the whole task df. Includes target cols.
featIndexToTaskIndex = function(feat.index, task) {
  task.data = getTaskData(task)
  # logical mask marking the target columns within the full task data
  is.target = names(task.data) %in% getTaskTargetNames(task)
  # translate feature-relative positions to full-task positions
  feature.positions = seq_along(task.data)[!is.target]
  translated = feature.positions[feat.index]
  target.positions = which(is.target)
  if (all(translated == sort(translated))) {
    # feat.index was sorted: merge the target columns into sorted order
    sort(c(target.positions, translated))
  } else {
    # custom ordering requested: keep it, with target columns first
    c(target.positions, translated)
  }
}
# Recombine a data.frame that was previously (potentially) split up according to `dataformat` and then changed by trafo / retrafo.
#
# recombine data.frame after checking for match of rows etc., see 'recombinetask'.
# @param df [data.frame] old data.frame, used for input, for comparison
# @param newdata [Task | data.frame | list of data.frame] output of cpo.trafo / cpo.retrafo. This has the same format
# as `splitdf(df, dataformat)`
# @param dataformat [character(1)] the dataformat used, this is `getLLDataformat` applied to the CPO's dataformat parameter.
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param subset.index [integer] subset of 'df' features that were selected by 'affect.*' parameters
# @param targetcols [character] names of target columns; this is relevant for retrafo when cpo.trafo was trained with a Task that
# contains target columns, and cpo.retrafo is fed with a data.frame that contains columns with the same name.
# @param name [character(1)] CPO name for pretty debug printing
# @return [data.frame] the data.frame incorporating the changes done by the CPO to `newdata`
recombinedf = function(df, newdata, dataformat = c("df.features", "split", "df.all", "task"), strict.factors, subset.index, targetcols, name) {
  # 'targetcols' contains the columns removed from the DF because they were target columns.
  dataformat = match.arg(dataformat)
  if (dataformat == "split") {
    # the "split" case has its own recombination logic
    return(recombineLL(df, newdata, targetcols, strict.factors, subset.index, name))
  } else if (dataformat == "task") {
    # unwrap a returned Task into its data.frame and fall through
    assertClass(newdata, "Task")
    newdata = getTaskData(newdata)
  }
  if (!is.data.frame(newdata)) {
    stopf("CPO %s gave bad result\nmust return a data.frame.", name)
  }
  if (nrow(df) != nrow(newdata)) {
    stopf("CPO %s must not change number of rows.", name)
  }
  # columns that were not handed to the CPO (neither target nor in subset.index)
  outsetcols = dropNamed(df, targetcols)
  if (length(subset.index)) {
    outsetcols = outsetcols[-subset.index]
  }
  fullnames = c(names(newdata), names(outsetcols), targetcols)
  dubs = duplicated(fullnames)
  if (any(dubs)) {
    # BUGFIX: 'sep = ", "' was previously passed to unique() (swallowed by its
    # '...') instead of collapse(), so duplicate names were joined with
    # collapse()'s default separator.
    stopf("CPO %s gave bad result\ncolumn names %s duplicated (possibly with target)", name,
      collapse(unique(fullnames[dubs]), sep = ", "))
  }
  datanames = names(newdata)
  # reassemble: untouched columns + CPO output + target columns
  newdata = cbind(outsetcols, newdata, df[targetcols])
  if (identical(datanames, setdiff(names(df), targetcols)[subset.index])) {
    # names didn't change, so we preserve column order
    newdata = newdata[names(df)]
    names(newdata) = names(df)
  }
  # keep the original row names
  row.names(newdata) = attr(df, "row.names")
  newdata
}
# Check that columns in `old.relevants` and `new.relevants` are identical.
#
# This is mostly a helper function for pretty error messages. Depending on what
# a CPO operates on, it must not change target OR data columns. These are the "relevant"
# columns. If this rule is violated, an error message tells the user that a CPO must not
# change target / data columns.
# @param old.relevants [data.frame] subset of the old data that must stay constant
# @param new.relevants [data.frame] subset of modified data, which is checked for equality with old.relevants
# @param relevant.name [character(1)] should be something like 'targets' or 'non-target features'
# @param name [character(1)] name of the CPO for debug purposes
# @return [invisible(NULL)]
checkColumnsEqual = function(old.relevants, new.relevants, relevant.name, name) {
  # first: the set of column names must be unchanged
  if (!isTRUE(checkSetEqual(names(old.relevants), names(new.relevants)))) {
    stopf("CPO %s must not change %s names.", name, relevant.name)
  }
  # second: every column must be identical, content-wise
  for (colname in names(old.relevants)) {
    if (!identical(old.relevants[[colname]], new.relevants[[colname]])) {
      stopf("CPO %s must not change %ss, but changed %s.", name, relevant.name, colname)
    }
  }
}
# general function that builds a task of type 'type' and with id 'id', using
# the given data.
#
# @param data [data.frame] the data to be used in the new task
# @param target [character] name of target columns inside `data`
# @param type [character(1)] type of the task to be created
# @param id [character(1)] id of the newly created task
# @param flip [logical(1)] whether, for binary classif task, to put the 2nd level on 'positive'
# @return [Task] a new task of type `type`, with id `id`, data `data`, and other meta information from `oldtask`.
constructTask = function(data, target, type, id, flip = FALSE) {
  if (type == "cluster") {
    # cluster tasks are the only task type without target columns
    if (length(target)) {
      stop("Cluster task cannot have target columns")
    }
    return(makeClusterTask(id = id, data = data, fixup.data = "no", check.data = FALSE))
  }
  if (type == "classif") {
    assertString(target)
    targetcol = data[[target]]
    if (!is.factor(targetcol)) {
      stop("ClassifTask target must be a factor column!")
    }
    # for flipped binary tasks, declare the second factor level as positive
    if (flip && length(target) == 1 && length(levels(targetcol)) == 2) {
      return(makeClassifTask(id = id, data = data, target = target,
        positive = levels(targetcol)[2], fixup.data = "no", check.data = FALSE))
    }
  }
  # default path: dispatch to the appropriate mlr task constructor
  constructor = switch(type,
    classif = makeClassifTask,
    multilabel = makeMultilabelTask,
    regr = makeRegrTask,
    surv = makeSurvTask)
  constructor(id = id, data = data, target = target, fixup.data = "no", check.data = FALSE)
}
# check that newdata is a task, and that it agrees with the
# old 'task' on everything except 'allowed.td.changes'
#
# @param task [Task] the task to compare newdata to
# @param newdata [Task] the task to check
# @param allowed.td.changes [character] slots of 'task.desc' that the tasks may disagree on
# @param name [character(1)] name of the CPO to use in the error message
# @return [invisible(NULL)]
checkTaskBasics = function(task, newdata, allowed.td.changes, name) {
  if (!"Task" %in% class(newdata)) {
    stopf("CPO %s must return a Task", name)
  }
  old.td = getTaskDesc(task)
  new.td = getTaskDesc(newdata)
  # the row count may only differ when "size" changes are explicitly allowed
  if ("size" %nin% allowed.td.changes && old.td$size != new.td$size) {
    stopf("CPO %s must not change number of rows", name)
  }
  # all remaining task description slots must agree between old and new task
  for (slot in setdiff(names(old.td), allowed.td.changes)) {
    complaint = all.equal(old.td[[slot]], new.td[[slot]])
    if (!isTRUE(complaint)) {
      stopf("CPO %s changed task description item %s:\n%s", name, slot, complaint)
    }
  }
}
# check that newdata is a data.frame that fits 'task's format (size, no overlap in target column names)
# @param task [Task] the task to compare newdata to
# @param newdata [data.frame] the data.frame to check
# @param targetbound [logical(1)] whether the CPO is allowed to operate on target columns
# @param name [character(1)] name of the CPO to use in the error message
# @return [invisible(NULL)]
checkDFBasics = function(task, newdata, targetbound, name) {
  if (!is.data.frame(newdata)) {
    stopf("CPO %s cpo.trafo gave bad result\ncpo.trafo must return a data.frame.", name)
  }
  # (a redundant assertClass(newdata, "data.frame") was removed here: the
  # is.data.frame() check above already guarantees it can never fire)
  tnames = getTaskTargetNames(task)
  missingt = tnames[!tnames %in% names(newdata)]
  if (length(missingt)) {
    addendum = ""
    if (targetbound) {
      addendum = paste("\nIf you want to change names or number of target columns in targetbound CPOs",
        'you must use other dataformat values, e.g. "df.features".', sep = "\n")
    }
    # scalar condition: plain if () ... else is preferable to ifelse() here
    stopf("CPO %s cpo.trafo gave bad result\ndata.frame did not contain target column%s %s.%s",
      name, if (length(missingt) > 1) "s" else "", collapse(missingt, ", "), addendum)
  }
}
# Checks and recombines data returned by a 'retrafoless' CPO, which is allowed to operate on both
# data and target columns.
#
# perform basic checks that a retrafoless cpo returned the kind of task / data.frame that it should;
# then convert, if necessary.
# @param olddata [Task | data.frame] the original input data
# @param newdata [Task | data.frame] the data returned by the CPO trafo function
# @param shapeinfo [ShapeInfo] The input shape which `df` must conform to
# @param dataformat [character(1)] the result of `getLLDataformat` applied to the CPO's dataformat parameter
# @param strict.factors [logical(1)] whether to check for 'ordered' as a type differing from 'factor'
# @param subset.index [integer] index into olddata columns: the columns actually selected by 'affect.*' parameters
# @param name [character(1)] the CPO name used in error messages
# @return [Task | data.frame] the recombined data from newdata
recombineRetrafolessResult = function(olddata, newdata, shapeinfo.input, dataformat, strict.factors, subset.index, name) {
  # retrafoless CPOs must see the complete data, so the 'affect.*' subset must
  # be the identity selection over all columns.
  assert(identical(subset.index, seq_along(subset.index)))
  # retrafoless CPOs only support these two dataformat values
  assertSubset(dataformat, c("df.all", "task"))
  if (is.data.frame(olddata)) {
    # Input was a plain data.frame. Depending on dataformat, the CPO was handed
    # either the data.frame itself or a ClusterTask wrapping it, and must
    # return the same kind of object.
    if (dataformat == "df.all") {
      assertClass(newdata, "data.frame")
    } else { # dataformat == "task"
      # unwrap the returned ClusterTask back into a data.frame
      assertClass(newdata, "ClusterTask")
      newdata = getTaskData(newdata)
    }
    assertShapeConform(newdata, shapeinfo.input, strict.factors, name, TRUE)
  } else {
    # Input was a Task.
    if (dataformat == "df.all") {
      # The CPO saw (and returned) a data.frame holding features and target
      # columns together; validate target integrity before re-wrapping it.
      assertClass(newdata, "data.frame")
      # target columns must all be present, and appear in their original order
      if (!all(getTaskTargetNames(olddata) %in% names(newdata)) ||
          !all(names(newdata)[names(newdata) %in% getTaskTargetNames(olddata)] == getTaskTargetNames(olddata))) {
        stopf("retrafoless CPO %s must not change target names.", name)
      }
      if (getTaskType(olddata) == "classif") {
        tname = getTaskTargetNames(olddata)
        # if the original task's target levels were flipped, flip the returned
        # target column the same way BEFORE comparing levels below
        if (isLevelFlipped(olddata)) {
          newdata = flipTaskTarget(newdata, tname)
        }
        if (!identical(levels(getTaskData(olddata, target.extra = TRUE)$target),
            levels(newdata[[tname]]))) {
          stopf("retrafoless CPO %s must not change target class levels.", name)
        }
      }
      # re-wrap the checked data.frame in the original task
      newdata = changeData(olddata, newdata)
    } else { # dataformat == "task"
      # The CPO returned a Task directly; it must be the same task type with
      # unchanged target names / class levels / level order.
      if (!identical(class(newdata), class(olddata))) {
        stopf("retrafoless CPO %s must not change task type.", name)
      }
      if (!all(getTaskTargetNames(olddata) == getTaskTargetNames(newdata))) {
        stopf("retrafoless CPO %s must not change target names.", name)
      }
      if (getTaskType(olddata) == "classif") {
        if (isLevelFlipped(olddata) != isLevelFlipped(newdata)) {
          stopf("CPO %s changed task target feature order.", name)
        }
        if (!identical(levels(getTaskData(olddata, target.extra = TRUE)$target),
            levels(getTaskData(newdata, target.extra = TRUE)$target))) {
          stopf("retrafoless CPO %s must not change target class levels.", name)
        }
      }
      # remaining task-description items must be unchanged, except the listed
      # ones (retrafoless CPOs may change rows, hence size etc. may differ)
      checkTaskBasics(olddata, newdata, c("has.missings", "size", "class.distribution"), name)
    }
    # feature columns of the resulting task must conform to the input shape
    assertShapeConform(getTaskData(newdata, target.extra = TRUE)$data, shapeinfo.input, strict.factors, name, TRUE)
  }
  newdata
}
# (removed: rdrr.io page boilerplate accidentally pasted into the source; it was not valid R code)