mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

# FormatCheck.R is central within the CPO framework. It checks that
# incoming & outgoing data conforms with the properties declared by
# a CPO and with requirements implicit within the CPO framework
# (Task column not changed in Feature Operating CPOs, number of rows
# not changed unless Training Only CPO, data for retrafo is the same
# as data for trafo).
# In the same stride, data format is changed to match the required
# format of the CPO functions -- see `dataformat` argument of
# makeCPO.


##################################
### Externals                  ###
##################################

# do the preparation before calling trafo:
#  - check the data is in an acceptable format (task or df)
#  - check the properties are fulfilled
#  - split the data
#  - get a shape info object
# @param indata [Task | data.frame] incoming data to be fed to the CPO trafo function
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param allowed.properties [character] allowed properties of `indata`
# @param subset.selector [list] information about 'affect.*' parameters that determine which subset of 'indata' is affected
# @param capture.factors [logical(1)] whether to save factor levels of input data in shapeinfo data structure. This is only used if the CPO has 'fix.factors' set to TRUE
# @param operating.type [character(1)] one of 'target', 'feature', 'retrafoless': whether target data, feature data, or both (but only during trafo) may be changed
# @param name [character(1)] name of the cpo, for message printing
# @return [list] the data to feed to the CPO trafo function, as well as meta-information:
#   list(indata = list(data, target, data.reduced, target.reduced), shapeinfo, properties, private)
#   'private' is a list containing some fields used by `handleTrafoOutput`.
prepareTrafoInput = function(indata, dataformat, strict.factors, allowed.properties, subset.selector, capture.factors, operating.type, name) {
  assert(checkClass(indata, "data.frame"), checkClass(indata, "Task"))

  subset.info = subsetIndata(indata, subset.selector, allowed.properties, "trafo", name)
  indata = subset.info$indata

  shapeinfo = makeInputShapeInfo(indata, capture.factors)
  subset.selector$data = NULL
  shapeinfo$subset.selector = subset.selector

  split.data = splitIndata(indata, dataformat, strict.factors, TRUE)


  list(indata = split.data$indata,
    shapeinfo = shapeinfo, properties = subset.info$properties,
    private = list(tempdata = split.data$tempdata, subset.index = subset.info$subset.index,
      dataformat = dataformat, strict.factors = strict.factors,
      origdata = subset.info$origdata, name = name, operating.type = operating.type))
}

# do the preparation before calling retrafo:
#  - check data is in an acceptable format (task or df)
#  - check the properties are fulfilled
#  - check the shape is the same as during trafo
#  - split the data
#  --> return
# how does mlr predict handle this stuff? they just drop target columns by name
# @param indata [Task | data.frame] incoming data to be fed to the CPO retrafo function
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param allowed.properties [character] allowed properties of `indata`
# @param shapeinfo.input [InputShapeInfo] information about the data shape used to train the CPO
# @param operating.type [character(1)] one of 'target', 'feature', 'retrafoless': whether target data, feature data, or both (but only during trafo) may be changed
# @param name [character(1)] name of the cpo, for message printing
# @return [list] the data to feed to the CPO retrafo function, as well as meta-information:
#   list(indata = data in a shape fit to be fed into retrafo, properties, task, private)
#   'task' is the reconstructed task, if any
#   'private' is a list containing some fields used by `handleTrafoOutput`.
prepareRetrafoInput = function(indata, dataformat, strict.factors, allowed.properties, shapeinfo.input, operating.type, name) {
  origdata = indata
  targetop = operating.type == "target"
  # check that input column names and general types match (num / fac, or num/fac/ordered if strict.factors

  if ("factor.levels" %in% names(shapeinfo.input)) {
    indata = fixFactors(indata, shapeinfo.input$factor.levels)
  }
  task = NULL
  if ("Task" %in% class(indata)) {
    origdatatype = "Task"
    if (length(shapeinfo.input$target) && length(getTaskTargetNames(indata))) {
      # NOTE: this might be too strict: maybe the user wants to retrafo a Task with the target having a different name?
      # HOWEVER, then either the training indata's task didnt matter (and he should have trained with a data.set?), or it
      #  DID matter, in which case it is probably important to have the same data type <==> target name
      assertSetEqual(getTaskTargetNames(indata), shapeinfo.input$target, .var.name = sprintf("Target names of Task %s", getTaskId(indata)))
    }
    traintype = shapeinfo.input$type
    curtype = getTaskDesc(indata)$type
    if (traintype != curtype && traintype != "cluster" && curtype != "cluster") {
      # if either the training or the current task is of type "cluster" (could also mean
      # the training was done with a data.frame) we forgive this here.
      stopf("CPO trained with task of type %s cannot operate on task of type %s.",
        traintype, curtype)
    }
    if (curtype != "cluster" || traintype == "cluster") {
      # if the current task is a cluster, we only do target
      # op stuff if the training type was also a cluster.
      task = indata
    }
    target = getTaskData(indata, features = character(0))
    indata = getTaskData(indata, target.extra = TRUE)$data
  } else if (is.data.frame(indata)) {
    origdatatype = "data.frame"
    if (any(shapeinfo.input$target %in% names(indata)) || shapeinfo.input$type == "cluster") {
      if (!all(shapeinfo.input$target %in% names(indata))) {
        badcols = intersect(shapeinfo.input$target, names(indata))
        stopf("Some, but not all target columns of training data found in new data. This is probably an error.\n%s%s: %s",
          "Offending column", ifelse(length(badcols) > 1, "s", ""), collapse(badcols, sep = ", "))
      }
      if (targetop) {
        task = constructTask(indata, shapeinfo.input$target, shapeinfo.input$type, "[CPO CONSTRUCTED]")
      }
      target = indata[shapeinfo.input$target]
      indata = dropNamed(indata, shapeinfo.input$target)
    } else {
      target = indata[character(0)]
      shapeinfo.input$target = NULL
    }
  } else {
    stopf("Data fed into CPO %s retrafo is not a Task or data.frame.", name)
  }

  if (!is.null(task)) {
    subset.info = subsetIndata(task, shapeinfo.input$subset.selector,
      allowed.properties, "retrafo", name)
    origdata = subset.info$origdata
    indata = subset.info$indata
    assertShapeConform(getTaskData(indata, target.extra = TRUE)$data, shapeinfo.input, strict.factors, name)
  } else {
    # every kind of input looks like 'cluster' here
    allowed.properties %c=% "cluster"

    subset.info = subsetIndata(indata, shapeinfo.input$subset.selector,
      allowed.properties, "retrafo", name)
    indata = subset.info$indata
    assertShapeConform(indata, shapeinfo.input, strict.factors, name)
  }

  reducing = dataformat %in% c("task", "df.all")
  if (targetop && !is.null(task)) {
    split.data = splitIndata(indata, dataformat, strict.factors, TRUE)
    indata = split.data$indata
    if (reducing) {
      indata = list(data = indata$data.reduced,
        target = indata$data)
    } else {
      indata = indata[c("data", "target")]
    }
  } else {
    split.data = splitIndata(indata, if (reducing) "df.features" else dataformat, strict.factors, FALSE)
    indata = split.data$indata
    indata["target"] = list(NULL)  # we want a $target slot with value NULL
  }

  list(indata = indata, properties = subset.info$properties, task = task,
    private = list(tempdata = split.data$tempdata, subset.index = subset.info$subset.index,
      origdata = origdata, dataformat = dataformat, strict.factors = strict.factors,
      name = name, operating.type = operating.type, origdatatype = origdatatype,
      targetnames = names(target) %??% character(0)))
}

# Do the check of the trafo's return value
#  - check the data is in an acceptable format (task, df, split dfs)
#  - recombine into a task / df
#  - check properties are allowed
#  - get a shape info object
# @param outdata [Task | data.frame | matrix] data returned by CPO trafo function
# @param prepared.input [list] return object of `prepareTrafoInput`
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
#   Therefore, these properties may be present in `outdata` even though there were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
#   to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param convertto [character(1)] only if the operating.type is 'target': type of the new task
# @param simpleresult [character(1)] whethre even for df.all / task dataformat the return value will be formatted
#   according to df.features
# @return [list] the data resulting from the CPO operation, as well as meta-information: list(outdata, shapeinfo)
handleTrafoOutput = function(outdata, prepared.input, properties.needed, properties.adding, convertto, simpleresult) {
  ppr = prepared.input$private
  olddata = ppr$origdata  # incoming data that was already given to prepareTrafoInput as 'indata'
  dataformat = ppr$dataformat
  strict.factors = ppr$strict.factors
  operating.type = ppr$operating.type
  name = ppr$name
  subset.index = ppr$subset.index  # index into olddata columns: the columns actually selected by 'affect.*' parameters

  if (dataformat == "task" && !simpleresult) {
    assertTask(outdata, "trafo", name)
  }

  if (operating.type != "target") {
    # for dataformat 'factor', 'numeric' etc, combine the return object of the function with
    # all the columns that were not originally given to the function
    outdata = rebuildOutdata(outdata, ppr$tempdata, dataformat)
  }
  dataformat = getLLDataformat(dataformat)

  if (dataformat %in% c("df.all", "task") && simpleresult) {
    assert(operating.type == "feature")
    dataformat = "df.features"
  }

  recombined = if (operating.type == "target") {
    recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, TRUE, convertto, name)
  } else if (operating.type == "retrafoless") {
    recombineRetrafolessResult(olddata, outdata, prepared.input$shapeinfo, dataformat, strict.factors, subset.index, name)
  } else  if (is.data.frame(olddata)) {
    # implied operating.type == "feature"
    recombinedf(olddata, outdata, dataformat, strict.factors, subset.index, character(0), name)
  } else {
    # implied operating.type == "feature"
    recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, FALSE, name = name)
  }

  checkOutputProperties(outdata, recombined, prepared.input$shapeinfo$target, prepared.input$properties, properties.needed, properties.adding, operating.type, "trafo", name)

  shapeinfo = makeOutputShapeInfo(outdata)

  if ("Task" %in% class(recombined)) {
    shapeinfo$target = getTaskTargetNames(recombined)
  }

  list(outdata = recombined, shapeinfo = shapeinfo)
}

# do the check of the retrafo's return value
#  - check data is in an acceptable format (task, df, split dfs)
#  - recombine into a task / df
#  - check the properties are fulfilled
#  - check the shape is the same as during trafo
# @param outdata [Task | data.frame | matrix] data returned by CPO retrafo function
# @param prepared.input [list] return object of `prepareTrafoInput`
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
#   Therefore, these properties may be present in `outdata` even though there were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
#   to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param convertto [character(1)] type of task to convert to, for target operation cpo
# @param shapeinfo.output [OutputShapeInfo] ShapeInfo describing the shape of the data returned by the CPO `trafo` function when it was called.
#        This imposes the same structure on the retrafo return value.
# @return [list] the data resulting from the CPO retrafo operation
handleRetrafoOutput = function(outdata, prepared.input, properties.needed, properties.adding, convertto, shapeinfo.output) {
  ppr = prepared.input$private
  olddata = ppr$origdata  # incoming data that was already given to prepareRetrafoInput as 'indata'
  dataformat = ppr$dataformat
  strict.factors = ppr$strict.factors
  subset.index = ppr$subset.index  # index into olddata columns: the columns actually selected by 'affect.*' parameters
  operating.type = ppr$operating.type
  targetnames = ppr$targetnames
  name = ppr$name

  if (operating.type == "target" && dataformat == "task") {
    assertTask(outdata, "retrafo", name)
  }

  # whether to ignore target columns of shapeinfo.output
  # (needed when dataformat differs between trafo & retrafo)
  drop.shapeinfo.target = dataformat %in% c("df.all", "task") && operating.type != "target"

  if (operating.type != "target") {
    # tempdata: incoming data that was given to prepareRetrafoInput as 'indata', after subsetting according to 'affect.*' parameters
    outdata = rebuildOutdata(outdata, ppr$tempdata, dataformat)
  }
  dataformat = getLLDataformat(dataformat)

  if (operating.type == "target") {
    # this won't get called at all if operating.type is target and there were not target
    # columns to rebuild. Therefore `olddata` will always be a Task here.
    recombined = recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, TRUE, convertto, name)
  } else {
    if (dataformat %in% c("df.all", "task")) {
      # target is always split off during retrafo
      dataformat = "df.features"
    }
    if (ppr$origdatatype == "data.frame") {
      if (any(targetnames %in% names(olddata))) {
        assert(all(targetnames %in% names(olddata)))  # we also check this in prepareRetrafoInput
      }
      recombined = recombinedf(olddata, outdata, dataformat, strict.factors, subset.index, targetnames, name)
    } else {
      recombined = recombinetask(olddata, outdata, dataformat, strict.factors, subset.index, FALSE, name = name)
    }
  }

  checkOutputProperties(outdata, recombined, targetnames, prepared.input$properties, properties.needed, properties.adding, operating.type, "retrafo", name)

  # check the shape of outdata is as expected
  if (dataformat == "split" && operating.type != "target") {
    assertSetEqual(names(outdata), setdiff(names(shapeinfo.output), "target"))
    for (n in names(outdata)) {
      assertShapeConform(outdata[[n]], shapeinfo.output[[n]], strict.factors, name)
    }
  } else {
    if (operating.type == "target" && dataformat == "task") {
      outdata = getTaskData(outdata, target.extra = TRUE)$data
    }
    assertShapeConform(outdata, shapeinfo.output, strict.factors, name, ignore.target = drop.shapeinfo.target)
  }
  assertTargetShapeConform(recombined, shapeinfo.output, operating.type, name)

  if (operating.type == "target" && ppr$origdatatype == "data.frame") {
    # input was a data.frame (with target columns), so we return da data.frame (with target columns)
    recombined = getTaskData(recombined)
  }
  recombined
}


##################################
### Shape & Properties         ###
##################################

# make sure that the factor levels of data.frame 'data' are as described by 'levels'.
# @param data [data.frame | Task] data / task to check / modify
# @param levels [list of character] levels of `data` columns, indexed by `data` column names
# @return [data.frame | Task] the modified `data`
fixFactors = function(data, levels) {
  UseMethod("fixFactors")
}

fixFactors.default = function(data, levels) {
  assertSubset(names(levels), names(data))
  data[names(levels)] = mapply(factor, data[names(levels)], levels, SIMPLIFY = FALSE)
  data
}

fixFactors.Task = function(data, levels) {
  changeData(data, fixFactors(getTaskData(data), levels))
}

# calculate the properties of the data (only feature types & missings)
# data can be a task or data.frame
# @param data [data.frame | Task] the data to check
# @param targetnames [character] only if `data` is a data.frame: the target columns, which will be ignored
# @return [character] a subset of c("numerics", "factors", "ordered", "missings")
getDataProperties = function(data, targetnames) {
  if (is.data.frame(data)) {
    td = makeTaskDescInternal(NULL, NULL, data, targetnames, NULL, NULL, FALSE)
  } else {
    assertClass(data, "Task")
    td = getTaskDesc(data)
  }
  nf = td$n.feat
  c(names(nf)[nf > 0], if (td$has.missings) "missings")
}

# calculate the properties of the data, as if it were a task.
# If data is a data.frame, we give it the property 'cluster'
# otherwise, we give it the propertye of the task type. If
# applicable, we also set oneclass, multiclass, etc (any from
# the variable 'cpo.targetproperties')
# @param data [data.frame | Task] the data to check
# @return [character] a subset of c("numerics", "factors", "ordered", "missings", "cluster", "classif", "multilabel", "regr", "surv", "oneclass", "twoclass", "multiclass")
getTaskProperties = function(data) {
  props = getDataProperties(data, character(0))
  if (is.data.frame(data)) {
    c(props, "cluster")
  } else {
    td = getTaskDesc(data)
    if (td$type == "classif") {
      others = switch(as.character(length(td$class.levels)),
        `1` = "oneclass", `2` = "twoclass", "multiclass")
    } else {
      others = NULL
    }
    c(props, td$type, others)
  }
}

# calculate properties of a general data object.
#
# This may be a Task, data.frame, or list of data.frames
# (as used with dataformat "split").
# @param data [list | data.frame | Task] The data to get properties of
# @param ignore.cols [character] names of columns to ignore, only for data.frame
# @return [character] same as of getTaskProperties
getGeneralDataProperties = function(data, ignore.cols = character(0)) {
  if ("Task" %nin% class(data) && !is.data.frame(data)) {
    unique(unlist(lapply(data, getTaskProperties)))
  } else if (is.data.frame(data)) {
    getTaskProperties(dropNamed(data, ignore.cols))
  } else {
    getTaskProperties(data)
  }
}

# give error when shape is different than dictated by shapeinfo.
#
# @param df [data.frame] the data to check
# @param shapeinfo [ShapeInfo] a the shape which `df` must conform to
# @param strict.factors [logical(1)] whether to check for 'ordered' as a type differing from 'factor'
# @param name [character(1)] name of the CPO currently being run, for error and debug printing
# @param retrafoless [logical(1)] whether this is the trafo result of a retrafoless CPO. Default FALSE.
# @param ignore.target [logical(1)] whether to ignore columns that have the same name as the target
#   column(s) declared in the $target slot. Default FALSE.
# @return [invisible(NULL)]
assertShapeConform = function(df, shapeinfo, strict.factors, name, retrafoless = FALSE,
                              ignore.target = FALSE) {
  if (ignore.target && !is.null(shapeinfo$target)) {
    shapeinfo$colnames = setdiff(shapeinfo$colnames, shapeinfo$target)
    shapeinfo$coltypes = dropNamed(shapeinfo$coltypes, shapeinfo$target)
  }
  if (!identical(names2(df), shapeinfo$colnames)) {
    errmsg = if (retrafoless) {
      "Error in CPO %s: columns may not be changed by cpo.trafo.\nInput was %s, output is %s."
    } else {
      "Error in CPO %s: column name mismatch between training and test data.\nWas %s during training, is %s now."
    }
    stopf(errmsg, name, collapse(shapeinfo$colnames, sep = ", "), collapse(names(df), sep = ", "))
  }
  indata = df[shapeinfo$colnames]

  if (strict.factors) {
    typesmatch = list(
        c("integer", "numeric"),
        "factor", "ordered")
  } else {
    typesmatch = list(
        c("integer", "numeric"),
        c("factor", "ordered"))
  }

  newcoltypes = vcapply(indata, function(x) class(x)[1])

  for (t in typesmatch) {
    typemismatch = (newcoltypes %in% t) != (shapeinfo$coltypes %in% t)
    if (any(typemismatch)) {
      plurs = ifelse(sum(typemismatch) > 1, "s", "")
      singes = ifelse(sum(typemismatch) > 1, "", "es")
      stopf("Error in CPO %s: Type%s of column%s %s mismatch%s between training and test data.", name,
        plurs, plurs, collapse(names(indata)[typemismatch], sep = ", "), singes)
    }
  }
}

# give error when shape recorded 'target' differs from task target
#
# only target column names are compared. This is needed for target
# operation CPOs changing target names. If data is not a Task
# or the operating.type is 'target', this does nothing.
#
# @param data [Task | data.frame] the Task.
# @param shapeinfo [ShapeInfo] a ShapeInfo
# @param operating.type [character(1)] operating type: "target", "feature", or "retrafoless"
# @return [invisible(NULL)]
assertTargetShapeConform = function(data, shapeinfo, operating.type, name) {
  if ("Task" %nin% class(data) || operating.type != "target") {
    return(invisible(NULL))
  }
  if (!identical({newtarget = getTaskTargetNames(data)}, shapeinfo$target)) {
    stopf("Error in CPO %s: Target name(s) after retrafo differ(s) from target name(s) after trafo. Was '%s', is now '%s'",
      name, collapse(newtarget, "', '"), collapse(shapeinfo$target, "', '"))
  }
}

# prepare some information about the data shape, so retrafo can check that
# it gets the kind of data it expects
# this needs to be checked both for input and for output
# @param data [data.frame] the data for which the shape is to be created
# @return [ShapeInfo] a simple datastructure that contains information about data column names and types
makeShapeInfo = function(data) {
  makeS3Obj("ShapeInfo",
    colnames = colnames(data) %??% character(0),
    coltypes = vcapply(data, function(x) class(x)[1]))
}

# like makeShapeInfo, but additionally get the target names and possibly factor levels
# @param indata [data.frame | Task] data for which the shape is to be created
# @param capture.factors [logical(1)] whether to capture factor levels
# @return [InputShapeInfo] a datastructure extending `ShapeInfo` containing information about the data shape
makeInputShapeInfo = function(indata, capture.factors) {
  if ("Task" %in% class(indata)) {
    data = getTaskData(indata, target.extra = TRUE)$data
    ret = makeShapeInfo(data)
    ret$target = getTaskTargetNames(indata)
    ret$type = getTaskDesc(indata)$type
    if (ret$type == "classif") {
      ret$positive = getTaskDesc(indata)$positive
    }
  } else {
    data = indata
    ret = makeShapeInfo(data)
    ret$target = character(0)
    ret$type = "cluster"
  }
  if (capture.factors) {
    ret$factor.levels = Filter(function(x) !is.null(x), lapply(data, levels))
  }
  addClasses(ret, "InputShapeInfo")
}

# creates shape info for data coming out of trafo, so retrafo can check that the data generated
# by it conforms to the data returned by trafo earlier.
# This does not do certain tests about the form of `outdata`, so it is recommended to call this
# after `recombinetask` was called (but with the *original* data, not the recombined data).
# @param outdata [data.frame | Task | list of data.frame] data returned by `trafo` function of which the shape is to be covered
# @return [OutputShapeInfo] This either extends `ShapeInfo` (if outdata is `data.frame` or `Task`) or is a list of `ShapeInfo` objects.
makeOutputShapeInfo = function(outdata) {
  if (is.data.frame(outdata)) {
    res = makeShapeInfo(outdata)
  } else if ("Task" %in% class(outdata)) {
    res = makeShapeInfo(getTaskData(outdata, target.extra = TRUE)$data)
    res$target = getTaskTargetNames(outdata)
    res$type = getTaskDesc(outdata)$type
  } else {
    # data is split by type, so we get the shape of each of the constituents
    res = lapply(outdata, makeShapeInfo)
  }
  addClasses(res, "OutputShapeInfo")
}

# check properties of data returned by trafo or retrafo function
# @param outdata [data.frame | Task | list] data returned by (re)trafo function (after rebuildOutdata)
# @param recombined [data.frame | Task] recombined data as will be returned to the user
# @param target.names [character] names of target columns
# @param input.properties [character] input properties as determined by prepare***Input
# @param properties.needed [character] which properties are 'needed' by the subsequent data handler.
#   Therefore, these properties may be present in `outdata` even though there were not in the input data.
# @param properties.adding [character] which properties are supposed to be 'added'
#   to the subsequent data handler. Therefore, these properties must be *absent* from `outdata`.
# @param operating.type [character(1)] operating type of cpo, one of 'target', 'feature', 'retrafoless'
# @param whichfun [character(1)] name of the CPO stage
# @param name [character(1)] name of the CPO
# @return [invisible(NULL)]
checkOutputProperties = function(outdata, recombined, target.names, input.properties, properties.needed, properties.adding, operating.type, whichfun, name) {
  # allowed.properties: allowed properties of `outdata`. That is the union of the CPO's 'properties.needed' field and the properties already present in 'indata'
  allowed.properties = union(input.properties, properties.needed)
  present.properties = if (operating.type == "feature") {
    getGeneralDataProperties(outdata, target.names)
  } else {
    getTaskProperties(recombined)
  }
  if (operating.type == "target") {
    # target operating CPOs can not change feature properties, but
    # there may be properties hidden from 'prepared.input$properties'
    # because of affect.*-subsetting which could be present in 'present.properties'
    # so we remove all feature properties here.
    present.properties = setdiff(present.properties, cpo.dataproperties)
  } else if (operating.type == "feature") {
    # remove properties of the target that are picked up by getGeneralDataProperties but
    # are not relevant.
    present.properties = setdiff(present.properties, cpo.all.target.properties)
  }
  assertPropertiesOk(present.properties, allowed.properties, whichfun, "out", name)
  assertPropertiesOk(present.properties, setdiff(allowed.properties, properties.adding), whichfun, "adding", name)
}

# give userfriendly error message when data does have the properties it is allowed to have.
# @param present.properties [character] properties that were found in a given data object
# @param allowed.properties [character] the properties that the data object is allowed to have
# @param whichfun [character(1)] name of the CPO stage
# @param direction [character(1)] either "in" (data is being sent into CPO), "out" (data was returned by CPO function, some
#   properties are present that were *not* present in the input data, but the given properties were not declared as
#   'properties.needed'), or "adding" (data was returned by CPO function, but the given properties *were* declared as
#   'properties.adding' and hence must not be present)
# @return [invisible(NULL)]
assertPropertiesOk = function(present.properties, allowed.properties, whichfun, direction, name) {
  if (!isPropertyStrict()) {
    return(invisible(NULL))
  }
  badprops = setdiff(present.properties, allowed.properties)
  if (length(badprops)) {
    if (direction == "in") {
      stopf("Data going into CPO %s has propert%s %s that %s can not handle.",
        whichfun, ifelse(length(badprops) > 1, "ies", "y"),
        collapse(badprops, sep = ", "), name)
    } else if (direction == "out") {
      stopf("Data returned by CPO %s has propert%s %s that %s did not declare in .properties.needed.",
        whichfun, ifelse(length(badprops) > 1, "ies", "y"),
        collapse(badprops, sep = ", "), name)
    } else {
      # 'adding' properties may not be present during output, but the error message
      # would be confusing if we used the 'out' message for this.
      assert(direction == "adding")
      stopf("Data returned by CPO %s has propert%s %s that %s declared in .properties.adding.\n%s",
        whichfun, ifelse(length(badprops) > 1, "ies", "y"),
        collapse(badprops, sep = ", "), name,
        paste("properties in .properties.adding may not be present in", whichfun, "output."))
    }
  }
}

# Check that the given task does not lie about itself.
#
# This is used on user-returned tasks. Check that task.desc$size equals the row number, that
# the target names occur in the task, etc.
# @param task [Task] the task.
# @param whichfun [character(1)] which function returned the task: trafo, retrafo
# @param name [character(1)] name of the cpo
# @return [invisible(NULL)]
assertTask = function(task, whichfun, name) {
  if (!is.list(task) || !is.list(task$task.desc) || "id" %nin% names(task$task.desc)) {
    stopf("Object returned by %s %s was not a task.", whichfun, name)
  }
  taskdesignator = function() sprintf("Task %s returned by %s %s", task$task.desc$id, whichfun, name)

  if (!is.environment(task$env)) {
    stopf("%s had no environment in its '$env' slot.", taskdesignator())
  }

  task.desc = task$task.desc
  target = task.desc$target

  required.classes = switch(task.desc$type,
    classif = c("ClassifTask", "SupervisedTask"),
    regr = c("RegrTask", "SupervisedTask"),
    cluster = c("ClusterTask", "UnsupervisedTask"),
    surv = c("SurvTask", "SupervisedTask"),
    multilabel = c("MultilabelTask", "SupervisedTask"),
    stopf("%s task type must be one of classif, regr, cluster, multilabel, surv", taskdesignator()))

  if (!identical(task$type, task.desc$type)) {
    stopf("%s task type and task.desc type must be the same", taskdesignator())
  }
  required.classes = c(required.classes, "Task")
  if (!identical(required.classes, class(task))) {
    stopf("%s must have classes %s", taskdesignator(), collapse(required.classes, ", "))
  }
  required.classes = paste0(required.classes, "Desc")
  if (!identical(required.classes, class(task.desc))) {
    stopf("%s task.desc must have classes %s", taskdesignator(), collapse(required.classes, ", "))
  }

  checks = c(
      `id must be a character(1)` = testString(task.desc$id),
      `data must be a data.frame with unique column names` = testDataFrame(task$env$data, col.names = "unique"),
      `target must be a character` = testCharacter(target),
      `task.desc must have numeric 'n.feat' slot` = testNumeric(task.desc$n.feat))

  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }

  identIntLikeNum = function(x, y) identical(as.numeric(x), as.numeric(y))

  cl = table(dropNamed(vcapply(task$env$data, function(x) class(x)[1]), target))
  checks = c(
      `target must be a subset of task columns` = testSubset(target, colnames(task$env$data)),
      `number of 'numerics' features listed in task.desc is wrong` =
        identIntLikeNum(sum(cl[c("integer", "numeric")], na.rm = TRUE), task.desc$n.feat["numerics"]),
      `number of 'factors' features listed in task.desc is wrong` =
        identIntLikeNum(sum(cl["factor"], na.rm = TRUE), task.desc$n.feat["factors"]),
      `number of 'ordered' features listed in task.desc is wrong` =
        identIntLikeNum(sum(cl["ordered"], na.rm = TRUE), task.desc$n.feat["ordered"]),
      `'has.missings' slot in task.desc is wrong` =
        identical(anyMissing(task$env$data), task.desc$has.missings),
      `'size' slot in task.desc is wrong` =
        identIntLikeNum(nrow(task$env$data), task.desc$size),
      `'has.weights' slot in task.desc is wrong` =
        identical(!is.null(task$weights), task.desc$has.weights),
      `''has.blocking' slot in task.desc is wrong` =
        identical(!is.null(task$blocking), task.desc$has.blocking))
  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }

  if (task.desc$type %in% c("classif", "regr") && length(target) != 1) {
    stopf("%s is of type %s but has %s targets.", taskdesignator(), task.desc$type,
      length(target))
  } else if (task.desc$type == "surv" && length(target) != 2) {
    stopf("%s is of type surv and must have exactly two targets.", taskdesignator())
  } else if (task.desc$type == "multilabel" && length(target) < 2) {
    stopf("%s is of type multilabel and must have more than one target.", taskdesignator())
  }

  checks = switch(task.desc$type,
    classif = c(
        `class levels in task.desc are not the factor levels of the target column` =
          testSetEqual(levels(task$env$data[[target]]), task.desc$class.levels),
        `task.desc 'positive' and 'negative' slots must be NA for multiclass tasks` =
          length(task.desc$class.levels) <= 2 || (is.na(task.desc$positive) && is.na(task.desc$negative)),
        `task.desc 'positive' and 'negative' slots must be both class levels of the target` =
          length(task.desc$class.levels) != 2 || (
            testString(task.desc$positive) &&
            testString(task.desc$negative) &&
            testSetEqual(c(task.desc$positive, task.desc$negative), task.desc$class.levels)),
        `task.desc 'positive' slot must be the class level, 'negative' slot must be not_<positive>` =
          length(task.desc$class.levels) != 1 || (
            testString(task.desc$positive) &&
            testString(task.desc$negative) &&
            identical(task.desc$positive, task.desc$class.levels) &&
            identical(task.desc$negative, paste0("not_", task.desc$class.levels)))),
    regr = TRUE,
    cluster = TRUE,
    surv = c(
        `time column must be numeric` = testNumeric(task$env$data[[target[1]]]),
        `event column must be logical` = testLogical(task$env$data[[target[2]]])),
    multilabel = c(
        `class.levels in task.desc must equal target names.` =
          testSetEqual(task.desc$class.levels, target)),
    stop("Unexpected error: task.desc$type was bad."))

  if (!all(checks)) {
    stopf("%s had problems: %s", taskdesignator(), collapse(names(checks)[!checks], "; "))
  }

}

##################################
### Task Splitting             ###
##################################

# Get the *indices* of columns of 'data' that are referenced by affect.* params.
# E.g. if 'affect.type == "numeric"', the indices of all numeric columns are returned.
#
# All of the parameters are just the relevantt 'affect.*' parameters as given to the CPO constructor, with the
# exception of 'data'. `getColIndices` can therefore be called using `do.call(getColIndices, insert(affect.param.list, list(data = DATA)))`
# @param data [data.frame] The data to get indices from
# @param type [character] subset of `c("numeric", "factor", "ordered", "other")`: all columns of the given type are included
# @param index [integer] index into data columns to include. Order is preserved, and they are ordered before all other matches
# @param names [character] names of data columns to include. Order is preserved, and they are ordered before other matches, except `index`
# @param pattern [character(1)] `grepl` pattern. Data columns that match the pattern are included
# @param invert [logical(1)] If TRUE, all matches are inverted, i.e. only columns that do not match any of the criteria are returned
# @param pattern.ignore.case [logical(1)] the `ignore.case` parameter of `grepl`: ignore case of `pattern`.
# @param pattern.perl [logical(1)] the `perl` parameter of `grepl`: use perl regex syntax
# @param pattern.fixed [logical(1)] the `fixed` parameter of `grepl`: don't interpret pattern as regex, but as fixed pattern.
# @return [integer]: index into `data` columns for selected columns.
getColIndices = function(data, type, index, names, pattern, invert, pattern.ignore.case, pattern.perl, pattern.fixed) {
  coltypes = vcapply(data, function(x) class(x)[1])
  coltypes[coltypes == "integer"] = "numeric"
  coltypes[!coltypes %in% c("numeric", "factor", "ordered")] = "other"
  matchcols = coltypes %in% type
  if (!is.null(pattern)) {
    matchcols = matchcols | grepl(pattern, colnames(data), pattern.ignore.case, pattern.perl, pattern.fixed)
  }
  badnames = names[!names %in% names(data)]
  if (length(badnames)) {
    stopf("Column%s not found: %s", ifelse(length(badnames) > 1, "s", ""), collapse(badnames, sep = ", "))
  }
  index %c=% setdiff(match(names, names(data)), index)

  index %c=% setdiff(which(matchcols), index)

  if (invert) {
    index = setdiff(seq_along(data), index)
  }
  index
}

# Translate the 'dataformat' option for internal use ("low level" dataformat) to a simplified version.
#
# most of CPOFormatCheck's split / recombine logic doesn't care about "factor", "onlyfactor", "ordered" or "numeric"
# and just treats it as "most" or "all" dataformat, subsetting the resulting data. This significantly
# simplifies the "splitting" and "recombining" of input / output data.
# E.g. if dataformat is "factor":
#  (1) the data is split according to "most" -- this is translated by 'getLLDataformat'
#  (2) the data that is handed to the cpo.trafo function is gotten by 'getIndata' which takes the '$factor' slot of the split data, in this case
#  (3) cpo.trafo returns its output. This output is put back together with the other data using 'rebuildOutdata'
#  (4) all checks are then done as if cpo.trafo had had used the "most" dataformat and not touched any but the '$factor' slots.
# @param dataformat [character(1)] the dataformat to translate
# @return [character(1)] a simplified dataformat option
getLLDataformat = function(dataformat) {
  if (dataformat %in% c("factor", "numeric", "ordered")) {
    "split"
  } else {
    dataformat
  }
}

# Get element of data according to dataformat.
#
# This is the complementary operation (category theoretically the quotient object) of `getLLDataformat`.
# With 'indata' being split according to dataformat "factor", "onlyfactor", "ordered", or "numeric", get the relevant subitem
# from the indata after it was split according to "most" or "all".
# If dataformat is none of these, this is a noop.
# @param indata [list of data.frame | data.frame | Task] the result of splitting incoming data according to `getLLDataformat(dataformat)`.
# @param dataformat [character(1)] one of the possible dataformat options
# @return [data.frame | Task] data formatted according to dataformat, to be fed into a trafo / retrafo function
getIndata = function(indata, dataformat) {
  if (dataformat %in% c("factor", "ordered", "numeric")) {
    indata[[dataformat]]
  } else {
    indata
  }
}

# Reassemble data that was split according to some of the `dataformat` options.
#
# If dataformat is one of "factor", "onlyfactor", "ordered", or "numeric", then
# the data returned by trafo / retrafo (only the modified factors / etc) needs
# to be integrated with the remaining unmodified columns. With 'outdata' being a
# data slice according to dataformat, this function puts the returned data back
# into the "tempdata" block from which the input was taken.
#
# If dataformat is none of these, this is a noop.
# @param outdata [data.frame | list of data.frame | Task] the data returned by a trafo / retrafo function
# @param tempdata [data.frame | list of data.frame | Task] the original data, split according to `getLLDataformat(dataformat)`.
# @param dataformat [character(1)] the dataformat option of the current CPO
# @return [data.frame | list of data.frame | Task] `outdata`, possibly embedded into `tempdata`.
rebuildOutdata = function(outdata, tempdata, dataformat) {
  if (dataformat %in% c("factor", "ordered", "numeric")) {
    tempdata[[dataformat]] = outdata
    outdata = tempdata
  }
  if (dataformat %in% c("numeric", "split") && is.matrix(outdata$numeric)) {
    outdata$numeric = as.data.frame(outdata$numeric)
  }
  outdata
}

# split 'outdata' into subsets given by 'which'. If 'which' does not contain "ordered", then
# 'ordered' columns are put together with 'factor' columns.
# @param which [character] subset of `c("numeric", "factor", "ordered", "other")`: by which types to split
# @param data [data.frame | any] data to split. This can also be any other list or vector if `types` is given
# @param types [character | NULL] types of columns / elements of `data`. If this is not provided, it is
#   determined from `data`. This is useful if `data` is not a data.frame but e.g. only a vector of column names.
# @return [list of data.frame | list of any] a list of subsets of `data`, named according to `which`.
splitColsByType = function(which = c("numeric", "factor", "ordered", "other"), data, types = NULL) {
  if (is.null(types)) {
    types = vcapply(data, function(x) class(x)[1])
  }
  # types: may be a character of type names, then data can be something besides a data.frame, like just a vector of names or indices
  match.arg(which, several.ok = TRUE)
  factorsubset = c("factor", if (!"ordered" %in% which) "ordered")

  sapply(which, function(x) {
    subset = if (x == "other") {
               !types %in% c("integer", "numeric", "factor", "ordered")
             } else {
               types %in% switch(x,
                 numeric = c("integer", "numeric"),
                 factor = factorsubset,
                 ordered = "ordered")
             }
    data[subset]
  }, simplify = FALSE, USE.NAMES = TRUE)
}


# Convenience function for 'dataformat' splitting.
#
# calls `splitdf` or `splittask`, depending on datatype of `data`.
#
# This performs no checks. possibly need to check that properties are adhered to
# in retrafo, must also check if the format is the same as during training
# 'possibly' here means: if not attached to a learner
#
# @param data [Task | data.frame] the data to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
#   a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splitX = function(data, dataformat = c("df.features", "split", "df.all", "task"), strict.factors) {
  dataformat = match.arg(dataformat)
  if (is.data.frame(data)) {
    splitdf(data, dataformat, strict.factors)
  } else {
    splittask(data, dataformat, strict.factors)
  }
}

# check whether the first level of a classif target is not the positive level
#
# @param task [Task] the task to check
# @return [logical(1)] TRUE when the first level of a classif target is not the positive level, FALSE otherwise, and
#   for non-classif task.
isLevelFlipped = function(task) {
  if (getTaskType(task) != "classif") {
    return(FALSE)
  }
  pos = getTaskDesc(task)$positive
  assert(!is.null(pos))
  if (is.na(pos)) {
    return(FALSE)
  }
  target = getTaskData(task, target.extra = TRUE)$target
  assert(length(levels(target)) <= 2)
  if (!identical(levels(target)[1], pos)) {
    assert(identical(levels(target)[2], pos))
    return(TRUE)
  }
  FALSE
}

# reorder the levels of a classif target to make the positive level the first one
#
# @param data [data.frame] the data frame containing the target
# @param target [character(1) | numeric(1)] the name or index of the target column. Is assumed to be a factor with two levels.
# @return [data.frame] the input data with the two levels of the target column flipped.
flipTaskTarget = function(data, target) {
  data[[target]] = factor(data[[target]], levels = rev(levels(data[[target]])))
  data
}

# reorder levels of classif target if the original task was level flipped
#
# @param data [data.frame] the data frame containing the target to be flipped
# @param task [Task] the original task. The target column of this task must be the one of the data.frame
# @return [data.frame] the input data, potentially with the two levels of the target column flipped.
unflipTarget = function(data, task) {
  if (isLevelFlipped(task)) {
    data = flipTaskTarget(data, getTaskTargetNames(task))
  }
  data
}


# This does the 'dataformat' splitting up of Tasks.
#
# This is the sister of `splitdf` which gets applied to `data.frame`.
# @param task [Task] the task to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
#   a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splittask = function(task, dataformat, strict.factors) {
  if (dataformat %in% c("split", "df.features")) {
    splt = getTaskData(task, target.extra = TRUE)$data
    colsplit = c("numeric", "factor", if (strict.factors) "ordered", "other")
    trg = getTaskData(task, features = character(0))
    if (isLevelFlipped(task)) {
      trg = flipTaskTarget(trg, 1)
    }
  }
  if (dataformat == "df.all") {
    data = getTaskData(task)
    target = getTaskTargetNames(task)
    data = unflipTarget(data, task)
    return(list(data = data, target = target))
  }

  switch(dataformat,
    task = list(data = task, target = getTaskTargetNames(task)),
    df.features = list(data = splt,
      target = trg),  # want the target to always be a data.frame
    split = list(data = splitColsByType(colsplit, splt),
      target = trg))  # want the target to always be a data.frame
}

# This does the 'dataformat' splitting up of data.frames.
#
# When creating a `Task` from a `data.frame` for `dataformat == "task"`, a `ClusterTask` is generated.
# @param df [data.frame] the data to split up
# @param dataformat [character(1)] subset of `c("df.features", "split", "df.all", "task")`. Should be
#   a result of `getLLDataformat` applied to the dataformat used for the CPO.
# @param strict.factors [logical(1)] whether to split ordered from factor columns
# @return [Task | data.frame | list of data.frame] the data split / formatted according to `dataformat`.
splitdf = function(df, dataformat, strict.factors) {
  colsplit = c("numeric", "factor", if (strict.factors) "ordered", "other")
  switch(dataformat,
    task = list(data = makeClusterTask("[CPO CONSTRUCTED]", data = df, fixup.data = "no", check.data = FALSE), target = character(0)),
    df.all = list(data = df, target = character(0)),
    df.features = list(data = df, target = df[, character(0), drop = FALSE]),
    split = list(data = splitColsByType(colsplit, df),
      target = df[, character(0), drop = FALSE]))
}

# Take subset of data according to 'affect.*' parameters
#
# @param indata [Task | data.frame]
# @param subset.selector [list] information about 'affect.*' parameters that determine which subset of 'indata' is affected
# @param allowed.properties [character] allowed properties of `indata`
# @param whichfun [character(1)] name of the CPO stage
# @param cpo.name [character(1)] name of the CPO
# @return [list] list(origdata, indata, subset.index, properties)
subsetIndata = function(indata, subset.selector, allowed.properties, whichfun, cpo.name) {
  origdata = indata
  if ("Task" %in% class(indata)) {
    subset.selector$data = getTaskData(indata, target.extra = TRUE)$data
    subset.index = do.call(getColIndices, subset.selector)
    # subsetTask, but keep everything in order
    new.subset.index = featIndexToTaskIndex(subset.index, indata)
    indata.data = getTaskData(indata)
    if (!identical(as.integer(new.subset.index), seq_along(indata.data))) {
      indata = changeData(indata, indata.data[new.subset.index])
    }
  } else {
    subset.selector$data = indata
    subset.index = do.call(getColIndices, subset.selector)
    indata = indata[subset.index]
  }

  present.properties = getTaskProperties(indata)
  assertPropertiesOk(present.properties, allowed.properties, whichfun, "in", cpo.name)

  list(origdata = origdata, indata = indata, subset.index = subset.index,
    properties = present.properties)
}


# Split cpo input data according to dataformat
#
# Creates also 'tempdata', the data after the split but before
# subsetting (useful for dataformat 'numeric', 'factors' etc)
# and possibly 'reduced.indata', which reduces df.all and task into df.features.
# @param data [data.frame | Task] the input data
# @param dataformat [character(1)] one of 'task', 'df.all', 'df.features', 'split', 'factor', 'numeric', 'ordered'
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param create.reduced [logical(1)] whether to create 'reduced' indata
# @return [list] list(indata, tempdata). indata is the proper input for the CPO function,
#   a list(data, target [, data.reduced, target.reduced]). tempdata the data after split before subsetting.
splitIndata = function(data, dataformat, strict.factors, create.reduced) {
  indata = splitX(data, getLLDataformat(dataformat), strict.factors)
  tempdata = indata$data
  indata$data = getIndata(indata$data, dataformat)

  if (create.reduced) {
    # create separate "reduced" data that, besides containing the full task / df, also
    # contains the data and target alone.
    reduced.indata = if (dataformat %in% c("task", "df.all")) {
      splitX(data, "df.features", strict.factors)
    } else {
      indata
    }
    names(reduced.indata) = paste0(names(reduced.indata), ".reduced")
    indata %c=% reduced.indata
  }
  list(indata = indata, tempdata = tempdata)
}


##################################
### Task Recombination         ###
##################################
# Task / Data recombination entails checking that data / target was only modified if allowed by the CPO type,
# checking that the number of rows didn't change, and relevant properties didn't change.

# Recombine the data previously split up by `splitdf` / `splittask` with `dataformat` being "most" or "all",
# and after the CPO trafo / retrafo function performed its operations on it.
#
# recombineLL is called by both recombinetask and recombinedf, and does the checking (e.g. number or rows did not change)
# that is common to both.
#
# 'LL' meaning 'low level'
# @param olddata [list of data.frame] data as fed to the CPO, for reference of correct row number etc.
# @param newdata [list of data.frame] data as returned by trafo / retrafo
# @param subset.index [integer] subset of 'task' features that were selected by 'affect.*' parameters
# @param name [character(1)] CPO name for pretty debug printing
# @return [data.frame] the data in `newdata` combined into a single data.frame.
recombineLL = function(olddata, newdata, targetnames, strict.factors, subset.index, name) {
  allnames = names(olddata)
  needednames = c("numeric", "factor", "other", if (strict.factors) "ordered")
  if (!isTRUE(checkSetEqual(names(newdata), needednames))) {
    stopf('CPO %s gave bad return. The returned value must be a list with names {"%s"}.',
      name, collapse(needednames, sep = '", "'))
  }

  targetdata = olddata[targetnames]
  olddata = dropNamed(olddata, targetnames)
  unsubsetdata = olddata[-subset.index]
  olddata = olddata[subset.index]

  dfs = vlapply(newdata, is.data.frame)
  if (any(!dfs)) {
    is.plur = sum(!dfs) > 1
    stopf("Return of %s element%s %s %s not a data.frame.", name, ifelse(is.plur, "s", ""),
      collapse(names(dfs)[!dfs], sep = ", "), ifelse(is.plur, "are", "is"))
  }

  # check no new names clash with other names
  # this kind of sucks when a CPO just happens to change the names to something thats already there
  # but we also don't want to surprise the user about us unilaterally changing names, so he needs to
  # take care of that.
  jointargetnames = c(targetnames, names(unsubsetdata), unlist(lapply(newdata, names)))
  if (any(duplicated(jointargetnames))) {
    stopf("CPO %s gave bad result\nduplicate column names %s", name, collapse(unique(jointargetnames[duplicated(jointargetnames)], sep = ", ")))
  }

  types = vcapply(olddata, function(x) class(x)[1])

  splitargetnames = splitColsByType(names(newdata), names(olddata), types) # list(numeric = [colnames], factor = [colnames]...

  numrows = nrow(olddata)
  namesorder = allnames
  for (splittype in names(splitargetnames)) {
    if (numrows != nrow(newdata[[splittype]])) {
      stopf("Number of rows of %s data returned by %s did not match input\nCPO must not change row number.",
        splittype, name)
    }
    if (!identical(splitargetnames[[splittype]], names(newdata[[splittype]]))) {
      namesorder = setdiff(namesorder, splitargetnames[[splittype]])
      namesorder %c=% names(newdata[[splittype]])
    }
  }

  newdata = cbind(unsubsetdata, do.call(cbind, unname(newdata)), targetdata)
  assertSetEqual(names(newdata), namesorder)
  newdata[namesorder]
}

# Recombine a task that was previously (potentially) split up according to `dataformat` and then changed by trafo / retrafo.
#
# This is used when the split up data was created from a task, and if (therefore) the result of the
# CPO is again expected to be a task.
#
# this checks that the result of trafo / retrafo has the proper type, that target and type didn't change,
# (if dataformat == "task"), and that the number of rows is the same. It then reconstructs the complete task that
# will be output by the CPO.
# @param task [Task] old task, used for input, for comparison
# @param newdata [Task | data.frame | list of data.frame] output of cpo.trafo / cpo.retrafo. This has the same format
#   as `splittask(task, dataformat)`
# @param dataformat [character(1)] the dataformat used, this is `getLLDataformat` applied to the CPO's dataformat parameter.
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param subset.index [integer] subset of 'task' features that were selected by 'affect.*' parameters
# @param targetbound [logical(1)] TRUE for target operating CPO, FALSE for feature operating CPO.
# @param newtasktype [character(1)] only if `targetbound`, type of new task. Give even if no task conversion happens.
# @param name [character(1)] CPO name for pretty debug printing
# @return [Task] the task incorporating the changes done by the CPO to `newdata`.
recombinetask = function(task, newdata, dataformat = c("df.all", "task", "df.features", "split"),
                         strict.factors, subset.index, targetbound, newtasktype, name) {
  dataformat = match.arg(dataformat)

  if (is.data.frame(task)) {
    # only if 'targetbound'
    task = makeClusterTask(id = "[CPO CONSTRUCTED]", data = task, fixup.data = "no", check.data = FALSE)
  }

  if (dataformat %in% c("df.features", "split")) {
    if (targetbound) {
      # return is just 'target' in a df.
      if (!is.data.frame(newdata)) {
        stopf("CPO %s gave bad result\nmust return a data.frame containing the target.",
          name)
      }
      olddata = getTaskData(task)
      oldtnames = getTaskTargetNames(task)
      newtnames = names(newdata)
      if (setequal(newtnames, oldtnames)) {
        olddata[newtnames] = newdata
        newdata = olddata
      } else if (length(oldtnames) == 1 && length(newdata) == 1) {
        assert(length(oldtnames) == 1)
        # note that this can NOT be combined with
        # the olddata[newtnames] block above!
        # also note the double brackets [[ ]].
        olddata[[oldtnames]] = newdata[[1]]
        names(olddata)[names(olddata) == oldtnames] = names(newdata)
        newdata = olddata
      } else {
        newdata = cbind(dropNamed(olddata, oldtnames), newdata)
      }
      if (anyDuplicated(colnames(newdata))) {
        stopf("CPO %s introduced duplicate column names", name)
      }
      if (newtasktype == "classif") {
        newdata = unflipTarget(newdata, task)
      }
      return(constructTask(newdata, newtnames, newtasktype, getTaskId(task), isLevelFlipped(task)))
    } else {
      return(changeData(task, recombinedf(getTaskData(task), newdata, dataformat, strict.factors, subset.index, getTaskTargetNames(task), name)))
    }
  }

  if (dataformat == "df.all") {
    checkDFBasics(task, newdata, targetbound, name)
    if (!targetbound) {
      newdata = unflipTarget(newdata, task)
      newdata = changeData(task, newdata)
    } else {
      if (newtasktype == "classif") {
        newdata = unflipTarget(newdata, task)
      }
      newdata = constructTask(newdata, getTaskTargetNames(task), newtasktype, getTaskId(task), isLevelFlipped(task))
    }
  }

  if (nrow(getTaskData(task)) != nrow(getTaskData(newdata))) {
    stopf("CPO %s must not change number of rows", name)
  }

  new.subset.index = featIndexToTaskIndex(subset.index, task)
  if (targetbound) {
    # everything may change except size, n.feat and missings
    fulldata = recombinedf(getTaskData(task), getTaskData(newdata), "df.all", strict.factors, new.subset.index, character(0), name)
    fulltask = constructTask(fulldata, getTaskTargetNames(newdata), newtasktype, getTaskId(newdata), isLevelFlipped(newdata))
    checkColumnsEqual(getTaskData(task, target.extra = TRUE)$data[subset.index],
      getTaskData(newdata, target.extra = TRUE)$data, "non-target column", name)
    checkTaskBasics(task, fulltask, setdiff(names(getTaskDesc(task)), c("n.feat", "has.missings", "has.blocking", "has.weights")), name)
    return(fulltask)
  }
  #check type didn't change
  assert(getTaskType(task) == getTaskType(newdata))
  assertSetEqual(names(getTaskDesc(task)), names(getTaskDesc(newdata)))

  # check target didn't change
  checkColumnsEqual(getTaskData(task, features = character(0)),
    getTaskData(newdata, features = character(0)), "target column", name)

  checkTaskBasics(subsetTask(task, features = subset.index), newdata, c("id", "n.feat", "has.missings"), name)


  changeData(task, recombinedf(getTaskData(task), getTaskData(newdata), "df.all", strict.factors, new.subset.index, character(0), name))
}


# convert an index of feature columns to an index w.r.t. the whole task
#
# A column index that references columns with respect the data
# columns only is converted to the column index with respect the
# whole task data.frame (including target columns).
#
# Target columns are included in this index. If feat.index
# is a sorted numeric, the target columns just get sorted into
# the feat.index; otherwise they are put at the beginning.
# @param feat.index [numeric] index of columns with respect to feature cols only
# @param task [Task] the task
# @return [numeric] index w.r.t. the whole task df. Includes target cols.
featIndexToTaskIndex = function(feat.index, task) {
  task.data = getTaskData(task)
  fullindex = seq_along(task.data)
  aretargets = names(task.data) %in% getTaskTargetNames(task)
  new.subset.index = fullindex[!aretargets][feat.index]
  if (all(new.subset.index == sort(new.subset.index))) {
    sort(c(which(aretargets), new.subset.index))
  } else {
    c(which(aretargets), new.subset.index)
  }
}

# Recombine a data.frame that was previously (potentially) split up according to `dataformat` and then changed by trafo / retrafo.
#
# recombine data.frame after checking for match of rows etc., see 'recombinetask'.
# @param df [data.frame] old data.frame, used for input, for comparison
# @param newdata [Task | data.frame | list of data.frame] output of cpo.trafo / cpo.retrafo. This has the same format
#   as `splitdf(df, dataformat)`
# @param dataformat [character(1)] the dataformat used, this is `getLLDataformat` applied to the CPO's dataformat parameter.
# @param strict.factors [logical(1)] whether to consider 'ordered' as separate from 'factor' types
# @param subset.index [integer] subset of 'df' features that were selected by 'affect.*' parameters
# @param targetcols [character] names of target columns; this is relevant for retrafo when cpo.trafo was trained with a Task that
#   contains target columns, and cpo.retrafo is fed with a data.frame that contains columns with the same name.
# @param name [character(1)] CPO name for pretty debug printing
# @return [data.frame] the data.frame incorporating the changes done by the CPO to `newdata`
recombinedf = function(df, newdata, dataformat = c("df.features", "split", "df.all", "task"), strict.factors, subset.index, targetcols, name) {
# otherwise it contains the columns removed from the DF because they were target columns.
  dataformat = match.arg(dataformat)
  if (dataformat == "split") {
    return(recombineLL(df, newdata, targetcols, strict.factors, subset.index, name))
  } else if (dataformat == "task") {
    assertClass(newdata, "Task")
    newdata = getTaskData(newdata)
  }
  if (!is.data.frame(newdata)) {
    stopf("CPO %s gave bad result\nmust return a data.frame.", name)
  }
  if (nrow(df) != nrow(newdata)) {
    stopf("CPO %s must not change number of rows.", name)
  }
  outsetcols = dropNamed(df, targetcols)
  if (length(subset.index)) {
    outsetcols = outsetcols[-subset.index]
  }
  fullnames = c(names(newdata), names(outsetcols), targetcols)
  dubs = duplicated(fullnames)
  if (any(dubs)) {
    stopf("CPO %s gave bad result\ncolumn names %s duplicated (possibly with target)", name, collapse(unique(fullnames[dubs], sep = ", ")))
  }

  datanames = names(newdata)
  newdata = cbind(outsetcols, newdata, df[targetcols])
  if (identical(datanames, setdiff(names(df), targetcols)[subset.index])) {
    # names didn't change, so we preserve column order
    newdata = newdata[names(df)]
    names(newdata) = names(df)
  }

  row.names(newdata) = attr(df, "row.names")
  newdata
}

# Check that columns in `old.relevants` and `new.relevants` are identical.
#
# This is mostly a helper function for pretty error messages. Depending on what
# a CPO operates on, it must not change target OR data columns. These are the "relevant"
# columns. If this rule is violated, an error message tells the user that a CPO must not
# change target / data columns.
# @param old.relevants [data.frame] subset of the old data that must stay constant
# @param new.relevants [data.frame] subset of modified data, which is checked for equality with old.relevants
# @param relevant.name [character(1)] should be something like 'targets' or 'non-target features'
# @param name [character(1)] name of the CPO for debug purposes
# @return [invisible(NULL)]
checkColumnsEqual = function(old.relevants, new.relevants, relevant.name, name) {
  if (!isTRUE(checkSetEqual(names(old.relevants), names(new.relevants)))) {
    stopf("CPO %s must not change %s names.", name, relevant.name)
  }
  for (n in names(old.relevants)) {
    if (!identical(old.relevants[[n]], new.relevants[[n]])) {
      stopf("CPO %s must not change %ss, but changed %s.", name, relevant.name, n)
    }
  }
}

# general function that builds a task of type 'type' and with id 'id', using
# the given data.
#
# @param data [data.frame] the data to be used in the new task
# @param target [character] name of target columns inside `data`
# @param type [character(1)] type of the task to be created
# @param id [character(1)] id of the newly created task
# @param flip [logical(1)] whether, for binary classif task, to put the 2nd level on 'positive'
# @return [Task] a new task of type `type`, with id `id`, data `data`, and other meta information from `oldtask`.
constructTask = function(data, target, type, id, flip = FALSE) {
  if (type == "cluster") {
    if (length(target)) {
      stop("Cluster task cannot have target columns")
    }
    return(makeClusterTask(id = id, data = data, fixup.data = "no", check.data = FALSE))
  }
  if (type == "classif") {
    assertString(target)
    targetcol = data[[target]]
    if (!is.factor(targetcol)) {
      stop("ClassifTask target must be a factor column!")
    }
    if (flip && length(target) == 1) {
      if (length(levels(targetcol)) == 2) {
        positive = levels(targetcol)[2]
        return(makeClassifTask(id = id, data = data, target = target,
          positive = positive, fixup.data = "no", check.data = FALSE))
      }
    }
  }
  constructor = switch(type,
    classif = makeClassifTask,
    multilabel = makeMultilabelTask,
    regr = makeRegrTask,
    surv = makeSurvTask)
  constructor(id = id, data = data, target = target, fixup.data = "no", check.data = FALSE)
}

# check that newdata is a task, and that it agrees with the
# old 'task' on everything except 'allowed.td.changes'
#
# @param task [Task] the task to compare newdata to
# @param newdata [Task] the task to check
# @param allowed.td.changes [character] slots of 'task.desc' that the tasks may disagree on
# @param name [character(1)] name of the CPO to use in the error message
# @return [invisible(NULL)]
checkTaskBasics = function(task, newdata, allowed.td.changes, name) {
  if (!"Task" %in% class(newdata)) {
    stopf("CPO %s must return a Task", name)
  }

  if ("size" %nin% allowed.td.changes && getTaskDesc(task)$size != getTaskDesc(newdata)$size) {
    stopf("CPO %s must not change number of rows", name)
  }

  old.td = getTaskDesc(task)
  new.td = getTaskDesc(newdata)

  # check most of task description didn't change
  for (n in setdiff(names(old.td), allowed.td.changes)) {
    if (!isTRUE({complaint = all.equal(old.td[[n]], new.td[[n]])})) {
      stopf("CPO %s changed task description item %s:\n%s", name, n, complaint)
    }
  }
}

# check that newdata is a data.frame that fits 'task's format (size, no overlap in target column names)
# @param task [Task] the task to compare newdata to
# @param newdata [data.frame] the data.frame to check
# @param targetbound [logical(1)] whether the CPO is allowed to operate on target columns
# @param name [character(1)] name of the CPO to use in the error message
# @return [invisible(NULL)]
checkDFBasics = function(task, newdata, targetbound, name) {
  if (!is.data.frame(newdata)) {
    stopf("CPO %s cpo.trafo gave bad result\ncpo.trafo must return a data.frame.", name)
  }
  assertClass(newdata, "data.frame")
  tnames = getTaskTargetNames(task)
  missingt = tnames[!tnames %in% names(newdata)]
  if (length(missingt)) {
    addendum = ""
    if (targetbound) {
      addendum = paste("\nIf you want to change names or number of target columns in targetbound CPOs",
        'you must use other dataformat values, e.g. "df.features".', sep = "\n")
    }
   stopf("CPO %s cpo.trafo gave bad result\ndata.frame did not contain target column%s %s.%s",
     name, ifelse(length(missingt) > 1, "s", ""), collapse(missingt, ", "), addendum)
  }
}

# Checks and recombines data returned by a 'retrafoless' CPO, which is allowed to operate on both
# data and target columns.
#
# perform basic checks that a retrafoless cpo returned the kind of task / data.frame that it should;
# then convert, if necessary.
# @param olddata [Task | data.frame] the original input data
# @param newdata [Task | data.frame] the data returned by the CPO trafo function
# @param shapeinfo [ShapeInfo] The input shape which `df` must conform to
# @param dataformat [character(1)] the result of `getLLDataformat` applied to the CPO's dataformat parameter
# @param strict.factors [logical(1)] whether to check for 'ordered' as a type differing from 'factor'
# @param subset.index [integer] index into olddata columns: the columns actually selected by 'affect.*' parameters
# @param name [character(1)] the CPO name used in error messages
# @return [Task | data.frame] the recombined data from newdata
recombineRetrafolessResult = function(olddata, newdata, shapeinfo.input, dataformat, strict.factors, subset.index, name) {
  assert(identical(subset.index, seq_along(subset.index)))
  assertSubset(dataformat, c("df.all", "task"))
  if (is.data.frame(olddata)) {
    if (dataformat == "df.all") {
      assertClass(newdata, "data.frame")
    } else {  # dataformat == "task"
      assertClass(newdata, "ClusterTask")
      newdata = getTaskData(newdata)
    }
    assertShapeConform(newdata, shapeinfo.input, strict.factors, name, TRUE)
  } else {
    if (dataformat == "df.all") {
      assertClass(newdata, "data.frame")
      if (!all(getTaskTargetNames(olddata) %in% names(newdata)) ||
          !all(names(newdata)[names(newdata) %in% getTaskTargetNames(olddata)] == getTaskTargetNames(olddata))) {
        stopf("retrafoless CPO %s must not change target names.", name)
      }
      if (getTaskType(olddata) == "classif") {
        tname = getTaskTargetNames(olddata)
        if (isLevelFlipped(olddata)) {
          newdata = flipTaskTarget(newdata, tname)
        }
        if (!identical(levels(getTaskData(olddata, target.extra = TRUE)$target),
          levels(newdata[[tname]]))) {
          stopf("retrafoless CPO %s must not change target class levels.", name)
        }
      }
      newdata = changeData(olddata, newdata)
    } else {  # dataformat == "task"
      if (!identical(class(newdata), class(olddata))) {
        stopf("retrafoless CPO %s must not change task type.", name)
      }
      if (!all(getTaskTargetNames(olddata) == getTaskTargetNames(newdata))) {
        stopf("retrafoless CPO %s must not change target names.", name)
      }
      if (getTaskType(olddata) == "classif") {
        if (isLevelFlipped(olddata) != isLevelFlipped(newdata)) {
          stopf("CPO %s changed task target feature order.", name)
        }
        if (!identical(levels(getTaskData(olddata, target.extra = TRUE)$target),
          levels(getTaskData(newdata, target.extra = TRUE)$target))) {
          stopf("retrafoless CPO %s must not change target class levels.", name)
        }
      }
      checkTaskBasics(olddata, newdata, c("has.missings", "size", "class.distribution"), name)
    }
    assertShapeConform(getTaskData(newdata, target.extra = TRUE)$data, shapeinfo.input, strict.factors, name, TRUE)
  }
  newdata
}
mlr-org/mlrCPO documentation built on Nov. 18, 2022, 11:25 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mlr-org/mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

R/FormatCheck.R
In mlr-org/mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

R Package Documentation

Browse R Packages

We want your feedback!

mlr-org/mlrCPO Composable Preprocessing Operators and Pipelines for Machine Learning

R/FormatCheck.R In mlr-org/mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

R Package Documentation

Browse R Packages

We want your feedback!

mlr-org/mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

R/FormatCheck.R
In mlr-org/mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning