z_4_custom_CPOs_terse.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

## ----results = "asis", echo = FALSE-------------------------------------------

# output format should be of the form
#> output
#> output
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

# initialize: load library, make everything deterministic
library("mlrCPO")
set.seed(123)

# get the path of the parent document
# path = names(knitr::opts_knit$get("encoding"))[1]
base = knitr::opts_knit$get("output.dir")
file = sys.frame(min(grep("^knitr::knit$|^knit$", sapply(sys.calls(), function(x) as.character(x)[1]))))$input
file = basename(file)
path = file.path(base, file)
rpath = gsub("\\.[^.]*$", ".R", path)

# strip whitespace from lines in tangle (R file) output for lintr
knitr::knit_hooks$set(document = function(x) {
  if (file_test("-f", rpath)) {
    lines = readLines(rpath)
    lines = gsub(" *(\n|$)", "\\1", lines)
    cat(lines, file = rpath, sep = "\n", append = FALSE)
  }
  x
})

#############################
# do the trans-vignette ToC #
#############################
fullfile = file

allfiles = list.files(path = base, pattern = ".*\\.Rmd$")
stopifnot(file %in% allfiles)

# collect information (title, url, main / compact) for each file in vignette dir
fileinfolist = list()
for (cf in allfiles) {
  ismain = TRUE
  if (grepl("^z_", cf)) {
    infoslot = gsub("^z_", "", cf)
    infoslot = gsub("_terse\\.Rmd$", "", infoslot)
    subslot = "compact"
  } else {
    infoslot = gsub("^a_", "", cf)
    infoslot = gsub("\\.Rmd$", "", infoslot)
    subslot = "main"
  }

  content = scan(paste(base, cf, sep = "/"), what = "character", quiet = TRUE)
  pos = min(c(which(content == "title:"), Inf))
  if (is.infinite(pos)) {
    stop(sprintf("parsing error: %s", cf))
  }
  infolist = list(title = content[pos + 1], url = cf, iscurrent = cf == file)

  applist = list(infolist)
  names(applist) = subslot
  fileinfolist[[infoslot]] = c(fileinfolist[[infoslot]], applist)
}

# helper function that creates a link for all files except the current one
linkify = function(info, title) {
  if (info$iscurrent) {
    title
  } else {
    sprintf("[%s](%s)", title, gsub("\\.Rmd$", ".html", info$url))
  }
}

# output ToC
for (idx in seq_along(fileinfolist)) {
  content = fileinfolist[[sort(names(fileinfolist))[idx]]]
  if (!is.null(content$compact)) {
    if (paste(sub("[0-9]\\. ", "", content$main$title), "(No Output)") != sub("^z ", "", content$compact$title)) {
      stop(sprintf("File %s and its compact version %s have incompatible titles\nThe compact version must be paste(main_title, \"(No Output)\"). Is: '%s', expected: '%s'",
        content$main$url, content$compact$url, content$compact$title, paste(content$main$title, "(No Output)")))
    }
    line = sprintf("%s (%s)", linkify(content$main, content$main$title), linkify(content$compact, "compact version"))
  } else {
    line = linkify(content$main, content$main$title)
  }
  cat(sprintf("%s. %s\n", idx, line))
  if (content$main$iscurrent || content$compact$iscurrent) {
    fullfile = content$main$url
  }
}

fullpath = file.path(base, fullfile)

#############################
# Optional Document TOC     #
#############################

# print everything up to level `print.level`.
# level is the number of '#' prefixes. The lowest level is usually 2.
printToc = function(print.level = 3) {
  owncontent = readLines(fullpath)
  tripletic = grepl("^```", owncontent)
  owncontent = owncontent[cumsum(tripletic) %% 2 == 0]  # exclude ```-delimited code
  headlines = grep("^#+ +", owncontent, value = TRUE)
  headlevels = nchar(gsub(" .*", "", headlines))
  headlines = gsub("^[#]+ +", "", headlines)

  links = gsub("[^-a-z. ]", "", tolower(headlines))
  links = gsub(" +", "-", links)
  links = gsub("-$", "", links)

  if (!sum(headlevels <= print.level)) {
    return(invisible(NULL))
  }

  cat("<h", headlevels[1], ">Table of Contents</h", headlevels[1], ">\n<div id=\"TOC\">\n", sep = "")

  lastlevel = headlevels[1] - 1
  for (idx in seq_along(headlines)) {
    line = headlines[idx]
    level = headlevels[idx]
    link = links[idx]
    if (level > print.level) {
      next
    }
    if (level < headlevels[1]) {
      stop("First headline level must be the lowest one used, but '", line, "' is lower.")
    }
    lvldiff = level - lastlevel
    if (lvldiff > 1) {
      stop("Cannot jump headline levels. Error on: ", line)
    }
    if (lvldiff > 0) {
      # higher level -> open a <ul>
      cat("<ul>")
    } else {
      cat("</li>\n")
    }
    if (lvldiff < 0) {
      # lower level -> close a few <ul>
      for (l in seq_len(-lvldiff)) {
        cat("</ul></li>")
      }
    }
    cat("<li><a href=\"#", link, "\">", line, "</a>", sep = "")
    lastlevel = level
  }
  # if the last level is greater than the first level, close a few <ul>
  lvldiff = lastlevel - headlevels[1]

  cat("</li></ul>\n</div>\n")
}



#############################
# Some output settings      #
#############################

options(width = 80)


replaceprint = function(ofunc) {
  force(ofunc)
  function(x, ...) {
    cu = capture.output({ret = ofunc(x, ...)})
    cu = grep("time: [-+e0-9.]{1,6}", cu, value = TRUE, invert = TRUE)
    cat(paste(cu, collapse = "\n"))
    if (!grepl("\n$", tail(cu, 1))) {
      cat("\n")
    }
    ret
  }
}

for (pfunc in grep("print\\.", ls(asNamespace("mlr")), value = TRUE)) {
  ofunc = get(pfunc, asNamespace("mlr"))
  assign(pfunc, replaceprint(ofunc))
}



## ----eval = TRUE, echo = FALSE, results = 'asis'------------------------------
printToc(4)

## -----------------------------------------------------------------------------
# !cpoPca()

## -----------------------------------------------------------------------------
# xmpSample = makeCPORetrafoless("exsample",  # nolint
#   pSS(fraction: numeric[0, 1]),
#   dataformat = "df.all",
#   cpo.trafo = function(data, target, fraction) {
#     newsize = round(nrow(data) * fraction)
#     row.indices = sample(nrow(data), newsize)
#     data[row.indices, ]
#   })
#
# cpo = xmpSample(0.01)

## -----------------------------------------------------------------------------
# iris %>>% cpo

## -----------------------------------------------------------------------------
# xmpSampleHeadless = makeCPORetrafoless("exsample",  # nolint
#   pSS(fraction: numeric[0, 1]),
#   dataformat = "df.all",
#   cpo.trafo = {
#     newsize = round(nrow(data) * fraction)
#     row.indices = sample(nrow(data), newsize)
#     data[row.indices, ]
#   })

## -----------------------------------------------------------------------------
# xmpFilterVar = makeCPO("exemplvar",  # nolint
#   pSS(n.col: integer[0, ]),
#   dataformat = "numeric",
#   cpo.train = function(data, target, n.col) {
#     cat("*** cpo.train ***\n")
#     sapply(data, var, na.rm = TRUE)
#   },
#   cpo.retrafo = function(data, control, n.col) {
#     cat("*** cpo.retrafo ***\n")
#     cat("Control:\n")
#     print(control)
#     cat("\n")
#     greatest = order(-control)  # columns, ordered greatest to smallest var
#     data[greatest[seq_len(n.col)]]
#   })
#
# cpo = xmpFilterVar(2)

## -----------------------------------------------------------------------------
# (trafd = head(iris) %>>% cpo)

## -----------------------------------------------------------------------------
# head(iris %>>% cpo)

## -----------------------------------------------------------------------------
# head(iris %>>% retrafo(trafd))

## -----------------------------------------------------------------------------
# getCPOTrainedState(retrafo(trafd))

## -----------------------------------------------------------------------------
# xmpFilterVarFunc = makeCPO("exemplvar.func",  # nolint
#   pSS(n.col: integer[0, ]),
#   dataformat = "numeric",
#   cpo.retrafo = NULL,
#   cpo.train = function(data, target, n.col) {
#     cat("*** cpo.train ***\n")
#     ctrl = sapply(data, var, na.rm = TRUE)
#     function(x) {  # the data is given to the only present parameter: 'x'
#       cat("*** cpo.retrafo ***\n")
#       cat("Control:\n")
#       print(ctrl)
#       cat("\ndata:\n")
#       print(data)  # 'data' is deleted: NULL
#       cat("target:\n")
#       print(target)  # 'target' is deleted: NULL
#       greatest = order(-ctrl)  # columns, ordered greatest to smallest var
#       x[greatest[seq_len(n.col)]]
#     }
#   })
#
# cpo = xmpFilterVarFunc(2)

## -----------------------------------------------------------------------------
# (trafd = head(iris) %>>% cpo)

## -----------------------------------------------------------------------------
# getCPOTrainedState(retrafo(trafd))

## -----------------------------------------------------------------------------
# xmpAsNum = makeCPO("asnum",  # nolint
#   cpo.train = NULL,
#   cpo.retrafo = function(data) {
#     data.frame(lapply(data, as.numeric))
#   })
#
# cpo = xmpAsNum()

## -----------------------------------------------------------------------------
# (trafd = head(iris) %>>% cpo)

## -----------------------------------------------------------------------------
# getCPOTrainedState(retrafo(trafd))

## -----------------------------------------------------------------------------
# xmpPca = makeCPOExtendedTrafo("simple.pca",  # nolint
#   pSS(n.col: integer[0, ]),
#   dataformat = "numeric",
#   cpo.trafo = function(data, target, n.col) {
#     cat("*** cpo.trafo ***\n")
#     pcr = prcomp(as.matrix(data), center = FALSE, scale. = FALSE, rank = n.col)
#     # save the rotation matrix as 'control' variable
#     control = pcr$rotation
#     pcr$x
#   },
#   cpo.retrafo = function(data, control, n.col) {
#     cat("*** cpo.retrafo ***\n")
#     # rotate the data by the rotation matrix
#     as.matrix(data) %*% control
#   })
#
# cpo = xmpPca(2)

## -----------------------------------------------------------------------------
# (trafd = head(iris) %>>% cpo)

## -----------------------------------------------------------------------------
# tail(iris) %>>% retrafo(trafd)

## -----------------------------------------------------------------------------
# getCPOTrainedState(retrafo(trafd))

## -----------------------------------------------------------------------------
# xmpPcaFunc = makeCPOExtendedTrafo("simple.pca.func",  # nolint
#   pSS(n.col: integer[0, ]),
#   dataformat = "numeric",
#   cpo.retrafo = NULL,
#   cpo.trafo = function(data, target, n.col) {
#     cat("*** cpo.trafo ***\n")
#     pcr = prcomp(as.matrix(data), center = FALSE, scale. = FALSE, rank = n.col)
#     # save the rotation matrix as 'control' variable
#     cpo.retrafo = function(data) {
#       cat("*** cpo.retrafo ***\n")
#       # rotate the data by the rotation matrix
#       as.matrix(data) %*% pcr$rotation
#     }
#     pcr$x
#   })
#
# cpo = xmpPcaFunc(2)

## -----------------------------------------------------------------------------
# (trafd = head(iris) %>>% cpo)

## -----------------------------------------------------------------------------
# getCPOTrainedState(retrafo(trafd))$pcr$x

## ----eval = FALSE-------------------------------------------------------------
# c(response = "response", se = "prob")

## -----------------------------------------------------------------------------
# xmpMetaLearn = makeCPOTargetOp("xmp.meta",  # nolint
#   pSS(lrn: untyped),
#   dataformat = "task",
#   properties.target = c("classif", "twoclass"),
#   predict.type.map = c(response = "response", prob = "prob"),
#   cpo.train = function(data, target, lrn) {
#     cat("*** cpo.train ***\n")
#     lrn = setPredictType(lrn, "prob")
#     train(lrn, data)
#   },
#   cpo.retrafo = function(data, target, control, lrn) {
#     cat("*** cpo.retrafo ***\n")
#     prediction = predict(control, target)
#     tname = getTaskTargetNames(target)
#     tdata = getTaskData(target)
#     tdata[[tname]] = factor(prediction$data$response == prediction$data$truth)
#     makeClassifTask(getTaskId(target), tdata, tname, positive = "TRUE",
#       fixup.data = "no", check.data = FALSE)
#   },
#   cpo.train.invert = function(data, control, lrn) {
#     cat("*** cpo.train.invert ***\n")
#     predict(control, newdata = data)$data
#   },
#   cpo.invert = function(target, control.invert, predict.type, lrn) {
#     cat("*** cpo.invert ***\n")
#     if (predict.type == "prob") {
#       outmat = as.matrix(control.invert[grep("^prob\\.", names(control.invert))])
#       revmat = outmat[, c(2, 1)]
#       outmat * target[, "prob.TRUE", drop = TRUE] +
#         revmat * target[, "prob.FALSE", drop = TRUE]
#     } else {
#       stopifnot(levels(target) == c("FALSE", "TRUE"))
#       numeric.prediction = as.numeric(control.invert$response)
#       numeric.res = ifelse(target == "TRUE",
#         numeric.prediction,
#         3 - numeric.prediction)
#       factor(levels(control.invert$response)[numeric.res],
#         levels(control.invert$response))
#     }
#   })
#
# cpo = xmpMetaLearn(makeLearner("classif.logreg"))

## -----------------------------------------------------------------------------
# set.seed(12)
# split = makeResampleInstance(hout, pid.task)
# train.task = subsetTask(pid.task, split$train.inds[[1]])
# test.task = subsetTask(pid.task, split$predict.inds[[1]])

## -----------------------------------------------------------------------------
# trafd = train.task %>>% cpo
# attributes(trafd)

## -----------------------------------------------------------------------------
# head(getTaskData(trafd))

## -----------------------------------------------------------------------------
# model = train(makeLearner("classif.logreg", predict.type = "prob"), train.task)
# head(predict(model, train.task)$data[c("truth", "response")])

## -----------------------------------------------------------------------------
# retr = test.task %>>% retrafo(trafd)
# attributes(retr)

## -----------------------------------------------------------------------------
# retr.df = getTaskData(test.task, target.extra = TRUE)$data %>>% retrafo(trafd)
# names(attributes(retr.df))

## -----------------------------------------------------------------------------
# ext.model = train("classif.svm", trafd)
# ext.pred = predict(ext.model, retr)
# newpred = invert(inverter(retr), ext.pred)
# performance(newpred)

## -----------------------------------------------------------------------------
# cpo.learner = cpo %>>% makeLearner("classif.svm")
# cpo.model = train(cpo.learner, train.task)

## -----------------------------------------------------------------------------
# lrnpred = predict(cpo.model, test.task)
# performance(lrnpred)

## -----------------------------------------------------------------------------
#
# xmpMetaLearn = makeCPOTargetOp("xmp.meta.fnc",  # nolint
#   pSS(lrn: untyped),
#   dataformat = "task",
#   properties.target = c("classif", "twoclass"),
#   predict.type.map = c(response = "response", prob = "prob"),
#   # set the cpo.* parameters not needed to NULL:
#   cpo.retrafo = NULL, cpo.train.invert = NULL, cpo.invert = NULL,
#   cpo.train = function(data, target, lrn) {
#     cat("*** cpo.train ***\n")
#     lrn = setPredictType(lrn, "prob")
#     model = train(lrn, data)
#     cpo.retrafo = function(data, target) {
#       cat("*** cpo.retrafo ***\n")
#       prediction = predict(model, target)
#       tname = getTaskTargetNames(target)
#       tdata = getTaskData(target)
#       tdata[[tname]] = factor(prediction$data$response == prediction$data$truth)
#       makeClassifTask(getTaskId(target), tdata, tname, positive = "TRUE",
#         fixup.data = "no", check.data = FALSE)
#     }
#     cpo.train.invert = function(data) {
#       cat("*** cpo.train.invert ***\n")
#       prediction = predict(model, newdata = data)$data
#       function(target, predict.type) {  # this is returned as cpo.invert
#         cat("*** cpo.invert ***\n")
#         if (predict.type == "prob") {
#           outmat = as.matrix(prediction[grep("^prob\\.", names(prediction))])
#           revmat = outmat[, c(2, 1)]
#           outmat * target[, "prob.TRUE", drop = TRUE] +
#             revmat * target[, "prob.FALSE", drop = TRUE]
#         } else {
#           stopifnot(levels(target) == c("FALSE", "TRUE"))
#           numeric.prediction = as.numeric(prediction$response)
#           numeric.res = ifelse(target == "TRUE",
#             numeric.prediction,
#             3 - numeric.prediction)
#           factor(levels(prediction$response)[numeric.res],
#             levels(prediction$response))
#         }
#       }
#     }
#   })

## -----------------------------------------------------------------------------
# xmpRegCenter = makeCPOTargetOp("xmp.center",  # nolint
#   constant.invert = TRUE,
#   cpo.train.invert = NULL,  # necessary for constant.invert = TRUE
#   dataformat = "df.feature",
#   properties.target = "regr",
#   cpo.train = function(data, target) {
#     # control value is just the mean of the target column
#     mean(target[[1]])
#   },
#   cpo.retrafo = function(data, target, control) {
#     # subtract mean from target column in retrafo
#     target[[1]] = target[[1]] - control
#     target
#   },
#   cpo.invert = function(target, predict.type, control.invert) {
#     target + control.invert
#   })
#
# cpo = xmpRegCenter()

## -----------------------------------------------------------------------------
# train.task = subsetTask(bh.task, 150:155)
# getTaskTargets(train.task)

## -----------------------------------------------------------------------------
# predict.task = subsetTask(bh.task, 156:160)
# getTaskTargets(predict.task)

## -----------------------------------------------------------------------------
# trafd = train.task %>>% cpo
# getTaskTargets(trafd)

## -----------------------------------------------------------------------------
# getTaskTargets(predict.task)

## -----------------------------------------------------------------------------
# retr = retrafo(trafd)
# predict.traf = predict.task %>>% retr
# getTaskTargets(predict.traf)

## ----warnings = FALSE---------------------------------------------------------
# model = train("regr.lm", trafd)
# pred = predict(model, predict.traf)
# pred

## -----------------------------------------------------------------------------
# invert(inverter(predict.traf), pred)

## ----warnings = FALSE---------------------------------------------------------
# model = train("regr.lm", train.task)
# predict(model, predict.task)

## -----------------------------------------------------------------------------
# getCPOTrainedCapability(retr)

## -----------------------------------------------------------------------------
# invert(retr, pred)

## -----------------------------------------------------------------------------
# xmpLogRegr = makeCPOTargetOp("log.regr",  # nolint
#   constant.invert = TRUE,
#   properties.target = "regr",
#   cpo.train = NULL, cpo.train.invert = NULL,
#   cpo.retrafo = function(data, target) {
#     target[[1]] = log(target[[1]])
#     target
#   },
#   cpo.invert = function(target, predict.type) {
#     exp(target)
#   })
#
# cpo = xmpLogRegr()

## -----------------------------------------------------------------------------
# trafd = train.task %>>% cpo
# getTaskTargets(trafd)

## -----------------------------------------------------------------------------
# retr = retrafo(trafd)
# predict.traf = predict.task %>>% retr
# getTaskTargets(predict.traf)

## ----warnings = FALSE---------------------------------------------------------
# model = train("regr.lm", trafd)
# pred = predict(model, predict.traf)
# pred

## -----------------------------------------------------------------------------
# invert(inverter(predict.traf), pred)

## -----------------------------------------------------------------------------
# invert(retr, pred)

## -----------------------------------------------------------------------------
# xmpSynCPO = makeCPOExtendedTargetOp("syn.cpo",  # nolint
#   properties.target = "regr",
#   cpo.trafo = function(data, target) {
#     cat("*** cpo.trafo ***\n")
#     target[[1]] = target[[1]] + 1
#     control = "control created in cpo.trafo"
#     control.invert = "control.invert created in cpo.trafo"
#     target
#   },
#   cpo.retrafo = function(data, target, control) {
#     cat("*** cpo.retrafo ***", "control is:", deparse(control), sep = "\n")
#     control.invert = "control.invert created in cpo.retrafo"
#     if (!is.null(target)) {
#       cat("target is non-NULL, performing transformation\n")
#       target[[1]] = target[[1]] - 1
#       return(target)
#     } else {
#       cat("target is NULL, no transformation (but control.invert was created)\n")
#       return(NULL)  # is ignored.
#     }
#   },
#   cpo.invert = function(target, control.invert, predict.type) {
#     cat("*** invert ***", "control.invert is:", deparse(control.invert),
#       sep = "\n")
#     target
#   })
#
# cpo = xmpSynCPO()

## -----------------------------------------------------------------------------
# trafd = train.task %>>% cpo
# getTaskTargets(trafd)

## -----------------------------------------------------------------------------
# retrafd = train.task %>>% retrafo(trafd)

## -----------------------------------------------------------------------------
# getTaskTargets(retrafd)

## -----------------------------------------------------------------------------
# retrafd = getTaskData(train.task, target.extra = TRUE)$data %>>% retrafo(trafd)

## -----------------------------------------------------------------------------
# inv = invert(inverter(trafd), 1:6)

## -----------------------------------------------------------------------------
# inv = invert(inverter(retrafd), 1:6)

## ----echo = FALSE-------------------------------------------------------------
# oscipen = options("scipen")
# options(scipen = 10)

## -----------------------------------------------------------------------------
# learners = list(
#     logreg = makeLearner("classif.logreg"),
#     svm = makeLearner("classif.svm"),
#     cpo = xmpMetaLearn(makeLearner("classif.logreg")) %>>%
#       makeLearner("classif.svm")
# )
#
# # suppress output of '*** cpo.train ***' etc.
# configureMlr(show.info = FALSE, show.learner.output = FALSE)
#
# perfs = sapply(learners, function(lrn) {
#   unname(replicate(20, resample(lrn, pid.task, cv10)$aggr))
# })
#
# # reset mlr settings
# configureMlr()
#
# boxplot(perfs)

## -----------------------------------------------------------------------------
# pvals = c(
#     logreg = t.test(perfs[, "logreg"], perfs[, "cpo"], "greater")$p.value,
#     svm = t.test(perfs[, "svm"], perfs[, "cpo"], "greater")$p.value
# )
#
# round(p.adjust(pvals), 3)

## ----echo = FALSE-------------------------------------------------------------
# options(scipen = oscipen$scipen)

## ----results = "asis", echo = FALSE-------------------------------------------
cat(knitr::knit_child("a_4_custom_CPOs.Rmd", options = list(eval = FALSE)), sep = "\n")
Any scripts or data that you put into this service are public.
mlrCPO documentation built on June 17, 2025, 1:07 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

inst/doc/z_4_custom_CPOs_terse.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

Try the mlrCPO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mlrCPO Composable Preprocessing Operators and Pipelines for Machine Learning

inst/doc/z_4_custom_CPOs_terse.R In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

Try the mlrCPO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

inst/doc/z_4_custom_CPOs_terse.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning