test_core_datasplit.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

context("cpo data split")

test_that("data actually has the form requested", {
  # split according to argument
  for (type in c("o", "f")) {

    nosplit = generateCPO(type, "no")
    tasksplit = generateCPO(type, "task")
    targetsplit = generateCPO(type, "target")
    mostsplit = generateCPO(type, "most")
    allsplit = generateCPO(type, "all")

    doallsplit = function(learner, tdata, rdata, numtarget, numnumeric, numfactor, numother, numordered) {
      nr = if (is.data.frame(tdata)) nrow(tdata) else nrow(getTaskData(tdata))

      testsingle = function(cpo) {
        res = tdata %>>% cpo
        rt = retrafo(res)
        retrafo(res) = NULL
        expect_equal(res, tdata)
        expect_equal(rdata %>>% rt, rdata)

        if ("Task" %in% class(tdata)) {
          m = train(cpo %>>% learner, tdata)
          if ("Task" %in% class(rdata)) {
            predict(m, rdata)
          } else {
            predict(m, newdata = rdata)
          }
        }
      }

      testsingle(nosplit(nr, numtarget, numnumeric, numfactor, numother, numordered, affect.pattern = "^OUT\\.", affect.invert = TRUE))
      testsingle(tasksplit(nr, numtarget, numnumeric, numfactor, numother, numordered, affect.pattern = "^OUT\\.", affect.invert = TRUE))
      testsingle(targetsplit(nr, numtarget, numnumeric, numfactor, numother, numordered, affect.pattern = "^OUT\\.", affect.invert = TRUE))
      testsingle(mostsplit(nr, numtarget, numnumeric, numfactor + numordered, numother, -1, affect.pattern = "^OUT\\.", affect.invert = TRUE))
      testsingle(allsplit(nr, numtarget, numnumeric, numfactor, numother, numordered, affect.pattern = "^OUT\\.", affect.invert = TRUE))
      invisible(NULL)
    }

    doallsplittask = function(l, constructor, target, tdata, rdata, numtarget, ...) {
      t1 = if (length(target)) constructor(data = tdata, target = target) else constructor(data = tdata)
      t2 = if (length(target)) constructor(data = rdata, target = target) else constructor(data = rdata)
      d1 = tdata[!names(tdata) %in% target]
      d2 = rdata[!names(rdata) %in% target]
      doallsplit(l, d1, d2, 0, ...)
      doallsplit(l, t1, d2, numtarget, ...)
      doallsplit(l, t1, rdata, numtarget, ...)
      doallsplit(l, t1, t2, numtarget, ...)
      doallsplit(l, d2, t2, 0, ...)
    }

    cl = testlearnercpo
    rl = testregrcpo
    cc = testclustercpo
    ml = testmlcpo


    doallsplittask(cl, makeClassifTask, "F1", cbind(cpo.df.numeric, OUT = cpo.df.ordered, cpo.df.factorial),
                   cbind(OUT = cpo.df.ordered, cpo.df.numeric2, cpo.df.factorial2), 1, 3, 1, 0, 0)
    doallsplittask(cl, makeClassifTask, "F1",
      cbind(cpo.df.numeric, cpo.df.factorial, cpo.df.ordered, OUT = cpo.df.ordered),
      cbind(cpo.df.numeric2, cpo.df.factorial2, OUT = cpo.df.ordered, cpo.df.ordered), 1, 3, 1, 0, 3)
    doallsplittask(cl, makeClassifTask, "O2",
      cbind(OUT = cpo.df.ordered, cpo.df.numeric, cpo.df.factorial, cpo.df.ordered),
      cbind(cpo.df.numeric2, OUT = cpo.df.ordered, cpo.df.factorial2, cpo.df.ordered), 1, 3, 2, 0, 2)
    doallsplittask(cl, makeClassifTask, "O2",
      cbind(cpo.df.numeric, cpo.df.ordered, OUT = cpo.df.ordered),
      cbind(OUT = cpo.df.numeric, cpo.df.numeric2, cpo.df.ordered, OUT = cpo.df.ordered), 1, 3, 0, 0, 2)

    doallsplittask(rl, makeRegrTask, "N2", cbind(cpo.df.numeric, cpo.df.factorial, OUT = cpo.df.ordered), cbind(cpo.df.numeric2, cpo.df.factorial2), 1, 2, 2, 0, 0)
    doallsplittask(rl, makeRegrTask, "N2",
      cbind(cpo.df.numeric, cpo.df.factorial, OUT = cpo.df.ordered, cpo.df.ordered),
      cbind(cpo.df.numeric2, cpo.df.factorial2, cpo.df.ordered), 1, 2, 2, 0, 3)
    doallsplittask(rl, makeRegrTask, "N2", cpo.df.numeric, cpo.df.numeric2, 1, 2, 0, 0, 0)

    doallsplittask(cc, makeClusterTask, NULL, cbind(cpo.df.numeric, cpo.df.factorial, OUT = cpo.df.ordered, OUT = cpo.df.numeric),
                   cbind(cpo.df.numeric2, cpo.df.factorial2), 0, 3, 2, 0, 0)
    doallsplittask(cc, makeClusterTask, NULL,
      cbind(cpo.df.numeric, cpo.df.factorial, cpo.df.ordered),
      cbind(cpo.df.numeric2, OUT = cpo.df.ordered, cpo.df.factorial2, cpo.df.ordered), 0, 3, 2, 0, 3)
    doallsplittask(cc, makeClusterTask, NULL, cpo.df.numeric, cpo.df.numeric2, 0, 3, 0, 0, 0)

    doallsplittask(ml, makeMultilabelTask, c("T1", "T2"), cbind(cpo.df.numeric, cpo.df.logical), cbind(cpo.df.numeric2, cpo.df.logical2), 2, 3, 0, 0, 0)
    doallsplittask(ml, makeMultilabelTask, c("T1", "T2"),
      cbind(cpo.df.numeric, cpo.df.logical, cpo.df.factorial, cpo.df.ordered),
      cbind(cpo.df.numeric2, cpo.df.logical2, cpo.df.factorial2, cpo.df.ordered), 2, 3, 2, 0, 3)
  }
})

test_that("changing some columns leaves the others in order", {
  # datasplit: most, all; also target in 'target'
  # introducing columns where there were none before

  for (type in c("o", "f")) {
    for (split in c("target", "most", "all")) {

      cpo = cpogen("colchangetest", type, pss(test: discrete[none, target, other, all]), function(data, target, test) {
        control = 0
        if (test == "target") {  # just change a column name
          names(data)[2] = "X"
        } else if (test == "other") {
          names(data$other)[2] = "X"
        } else if (test == "all") {
          names(data$numeric)[2] = "X"
          names(data$factor)[1] = "Y"
        }
        data
      }, function(data, control, test) {
        if (test == "target") {  # just change a column name
          names(data)[2] = "X"
        } else if (test == "other") {
          names(data$other)[2] = "X"
        } else if (test == "all") {
          names(data$numeric)[2] = "X"
          names(data$factor)[1] = "Y"
        }
        data
      }, split)

      if (split == "target") {

        tr = cpo.df1c %>>% cpo("none")
        expect_equal(getTaskData(tr), cpo.df1)
        expect_equal(cpo.df1[c(1, 2, 3, 5)] %>>% retrafo(tr), cpo.df1[c(1, 2, 3, 5)])
        expect_equal(cpo.df1 %>>% retrafo(tr), cpo.df1)
        expect_equal(getTaskData(cpo.df1c %>>% retrafo(tr)), cpo.df1)

        tr = cpo.df1c %>>% cpo("target")
        expected = cpo.df1[c(1, 2, 3, 5, 4)]
        names(expected)[2] = "X"
        expect_equal(getTaskData(tr), expected)
        expect_equal(cpo.df1[c(1, 2, 3, 5)] %>>% retrafo(tr), expected[1:4])
        expect_equal(cpo.df1 %>>% retrafo(tr), expected)
        expect_equal(getTaskData(cpo.df1c %>>% retrafo(tr)), expected)

        tr = cpo.df1cc %>>% cpo("none")
        expect_equal(getTaskData(tr), cpo.df1)
        expect_equal(cpo.df1 %>>% retrafo(tr), cpo.df1)
        expect_equal(getTaskData(cpo.df1cc %>>% retrafo(tr)), cpo.df1)

        tr = cpo.df1cc %>>% cpo("target")
        expected = cpo.df1
        names(expected)[2] = "X"
        expect_equal(getTaskData(tr), expected)

        tr = cpo.df3l %>>% cpo("none")
        expect_equal(getTaskData(tr), cpo.df3)
        expect_equal(cpo.df3[c(1, 3, 4, 5, 7)] %>>% retrafo(tr), cpo.df3[c(1, 3, 4, 5, 7)])
        expect_equal(cpo.df3 %>>% retrafo(tr), cpo.df3)
        expect_equal(getTaskData(cpo.df3l %>>% retrafo(tr)), cpo.df3)

        tr = cpo.df3l %>>% cpo("target")
        expected = cpo.df3[c(1, 3, 4, 5, 7, 2, 6)]
        names(expected)[2] = "X"
        expect_equal(getTaskData(tr), expected)
        expect_equal(cpo.df3[c(1, 3, 4, 5, 7)] %>>% retrafo(tr), expected[1:5])
        expect_equal(cpo.df3 %>>% retrafo(tr), expected)
        expect_equal(getTaskData(cpo.df3l %>>% retrafo(tr)), expected)

      } else {
        tr = cpo.df2 %>>% cpo("other")
        expected = cpo.df2[c(1, 3, 4, 5, 7, 8, 9, 12, 2, 6, 10, 11)]
        names(expected)[10] = "X"
        ret = retrafo(tr)
        retrafo(tr) = NULL
        expect_equal(tr, expected)

        tr = cpo.df2 %>>% ret
        expect_equal(tr, expected)

        cpo.df2c = makeClassifTask(data = cpo.df2, target = "F1", check.data = FALSE)
        tr = cpo.df2c %>>% cpo("other")
        expected = cpo.df2[c(1, 3, 4, 5, 7, 8, 9, 12, 2, 6, 10, 11)]
        names(expected)[10] = "X"
        expect_equal(getTaskData(tr), expected)
        expect_equal(getTaskData(cpo.df2c %>>% retrafo(tr)), expected)
        expect_equal(cpo.df2 %>>% retrafo(tr), expected)
        expect_equal(cpo.df2[c(1:2, 4:12)] %>>% retrafo(tr), expected[c(1, 3:12)])

        if (split == "most") {
          tr = cpo.df5c %>>% cpo("all")
          expected = cpo.df5[c(5, 1, 4, 7, 2, 3, 6, 8, 9)]
          names(expected)[3] = "X"
          names(expected)[5] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df5[c(1:4, 6:9)] %>>% retrafo(tr), expected[2:9])
          expect_equal(cpo.df5 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df5c %>>% retrafo(tr)), expected)

          tr = cpo.df5cc %>>% cpo("all")
          expected = cpo.df5[c(1, 4, 7, 2, 3, 5, 6, 8, 9)]
          names(expected)[2] = "X"
          names(expected)[4] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df5 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df5cc %>>% retrafo(tr)), expected)

          tr = cpo.df4l %>>% cpo("all")
          expected = cpo.df4[c(2, 5, 1, 4, 7, 3, 6, 8:10)]
          names(expected)[4] = "X"
          names(expected)[6] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df4 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df4l %>>% retrafo(tr)), expected)
          expect_equal(cpo.df4[c(1, 3, 4, 6:10)] %>>% retrafo(tr), expected[3:10])

        } else {
          tr = cpo.df5c %>>% cpo("all")
          expected = cpo.df5[c(3, 5, 6, 9, 1, 4, 7, 2, 8)]
          names(expected)[6] = "X"
          names(expected)[8] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df5[c(1:4, 6:9)] %>>% retrafo(tr), expected[c(1, 3:9)])
          expect_equal(cpo.df5 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df5c %>>% retrafo(tr)), expected)

          tr = cpo.df5cc %>>% cpo("all")
          expected = cpo.df5[c(3, 6, 9, 1, 4, 7, 2, 5, 8)]
          names(expected)[5] = "X"
          names(expected)[7] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df5 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df5cc %>>% retrafo(tr)), expected)

          tr = cpo.df4l %>>% cpo("all")
          expected = cpo.df4[c(2, 3, 5, 6, 10, 1, 4, 7, 8, 9)]
          names(expected)[7] = "X"
          names(expected)[9] = "Y"
          expect_equal(getTaskData(tr), expected)
          expect_equal(cpo.df4 %>>% retrafo(tr), expected)
          expect_equal(getTaskData(cpo.df4l %>>% retrafo(tr)), expected)
          expect_equal(cpo.df4[c(1, 3, 4, 6:10)] %>>% retrafo(tr), expected[c(2, 4:10)])
        }
      }
    }
  }
})

test_that("changing the target gives an error", {
  library("checkmate")

  for (type in c("o", "f")) {
    for (split in c("no", "task")) {

      changedata = function(data, target, test, chgname) {
        if (test == "df") {
          if (chgname == "yes") {
            targetidx = which(names(data) %in% target)[1]
            names(data)[targetidx] = "X"
          } else if (chgname == "no") {
            data[[target[1]]] = data[[target[1]]][1]
          } else {
            assert(length(target) > 1)
            data[target[1]] = NULL
          }
        } else if (test == "task") {
          constructor = switch(substr(class(data)[1], 1, 3),
            Cla = makeClassifTask,
            Mul = makeMultilabelTask)
          data = getTaskData(data)
          if (chgname == "yes") {
            targetidx = which(names(data) %in% target)[1]
            names(data)[targetidx] = "X"
            target[1] = "X"
          } else if (chgname == "no") {
            data[[target[1]]] = data[[target[1]]][1]
          } else {
            assert(length(target) > 1)
            data[target[1]] = NULL
            target = target[-1]
          }
          data = constructor(data = data, target = target, fixup.data = "no", check.data = FALSE)
        }
        data
      }

      cpo = cpogen("targetchangetest", type, pss(test: discrete[none, df, task],
        chgname: discrete [no, yes, remove] [[requires = quote(test != "none")]]), function(data, target, test, chgname) {
          control = 0
          changedata(data, target, test, chgname)
        }, function(data, control, test, chgname) {
          data
        }, split)

      expect_class(cpo.df1c %>>% cpo("none"), "Task")
      expect_class(cpo.df1 %>>% cpo("none"), "data.frame")

      failmode = switch(split, no = "df", task = "task")
      if (split == "no") {
        expect_class(cpo.df1 %>>% cpo(failmode, "yes"), "data.frame")
        expect_class(cpo.df1 %>>% cpo(failmode, "yes", affect.names = "N3", affect.invert = TRUE), "data.frame")
      }
      if (split == "no") {
        expect_error(cpo.df1c %>>% cpo(failmode, "yes"), "did not contain target column")
        expect_error(cpo.df3l %>>% cpo(failmode, "yes"), "did not contain target column")

        expect_error(cpo.df1c %>>% cpo(failmode, "yes", affect.names = "N3", affect.invert = TRUE), "did not contain target column")
        expect_error(cpo.df3l %>>% cpo(failmode, "yes", affect.names = "N3", affect.invert = TRUE), "did not contain target column")
      } else {
        expect_error(cpo.df1c %>>% cpo(failmode, "yes"), "must not change target column names")
        expect_error(cpo.df3l %>>% cpo(failmode, "yes"), "must not change target column names")

        expect_error(cpo.df1c %>>% cpo(failmode, "yes", affect.names = "N3", affect.invert = TRUE), "must not change target column names")
        expect_error(cpo.df3l %>>% cpo(failmode, "yes", affect.names = "N3", affect.invert = TRUE), "must not change target column names")

      }
      expect_error(cpo.df1c %>>% cpo(failmode, "no"), "must not change target columns, but changed F1")
      expect_error(cpo.df3l %>>% cpo(failmode, "no"), "must not change target columns, but changed T1")

      expect_error(cpo.df1c %>>% cpo(failmode, "no", affect.names = "N3", affect.invert = TRUE), "must not change target columns, but changed F1")
      expect_error(cpo.df3l %>>% cpo(failmode, "no", affect.names = "N3", affect.invert = TRUE), "must not change target columns, but changed T1")

      if (split == "no") {
        expect_error(cpo.df3l %>>% cpo(failmode, "remove"), "did not contain target column T1")

        expect_error(cpo.df3l %>>% cpo(failmode, "remove", affect.names = "N3", affect.invert = TRUE), "did not contain target column T1")
      } else {
        expect_error(cpo.df4l2 %>>% cpo(failmode, "remove"), "must not change target column")

        expect_error(cpo.df4l2 %>>% cpo(failmode, "remove", affect.names = "N3", affect.invert = TRUE), "must not change target column")
      }
    }
  }
})

test_that("introducing duplicate names gives an error", {
  # datasplit: target, most, all

  for (type in c("o", "f")) {
    for (split in c("target", "most", "all")) {

      cpo = cpogen("duplicatetest", type, pss(test: discrete [none, nosplit, split]), function(data, target, test) {
          control = 0
          if (test == "nosplit") {
            names(data)[1] = names(target)[1]
          } else if (test == "split") {
            names(data$numeric)[1] = "F2"
          }
          data
        }, function(data, control, test) {
          data
        }, split)

      expect_class(cpo.df1c %>>% cpo("none"), "Task")
      expect_class(cpo.df1 %>>% cpo("none"), "data.frame")

      failmode = switch(split, target = "nosplit", "split")
      expect_error(cpo.df1c %>>% cpo(failmode), switch(failmode, nosplit = "column names F1 duplicated", split = "duplicate column names F2"))
      expect_error(cpo.df3l %>>% cpo(failmode), switch(failmode, nosplit = "column names T1 duplicated", split = "duplicate column names F2"))

      expect_error(cpo.df1c %>>% cpo(failmode, affect.name = "F2", affect.invert = TRUE),
        switch(failmode, nosplit = "column names F1 duplicated", split = "duplicate column names F2"))
      expect_error(cpo.df3l %>>% cpo(failmode, affect.name = "F2", affect.invert = TRUE),
        switch(failmode, nosplit = "column names T1 duplicated", split = "duplicate column names F2"))

      if (failmode == "split") {
        expect_error(cpo.df1 %>>% cpo(failmode), "duplicate column names F2")
        expect_error(cpo.df3 %>>% cpo(failmode), "duplicate column names F2")

        expect_error(cpo.df1 %>>% cpo(failmode, affect.name = "F2", affect.invert = TRUE), "duplicate column names F2")
        expect_error(cpo.df3 %>>% cpo(failmode, affect.name = "F2", affect.invert = TRUE), "duplicate column names F2")

      }

    }
  }

})

test_that("new task is actually changed, has the expected data", {
  # all datasplits

  for (type in c("o", "f")) {
    for (split in c("no", "task", "target", "most", "all")) {

      chgfct = function(data, ...) {
        control = 0
        val = (("control" %in% names(list(...))) + 1) * 100
        if (val == 200 && split == "task") {
          split = "no"
        }
        switch(split,
          task = {
            tdata = getTaskData(data)
            numcol = which(sapply(tdata, is.numeric))[1]
            tdata[1, numcol] = val
            changeData(data, tdata)
          },
          no = {
            numcol = which(sapply(data, is.numeric))[1]
            data[1, numcol] = val
            data
          },
          target = {
            numcol = which(sapply(data, is.numeric))[1]
            data[1, numcol] = val
            data
          },
          {
            data$numeric[1, 1] = val
            data
          })
      }

      cpo = cpogen("changetest", type, pss(), chgfct, chgfct, split)

      exp.df1.t = cpo.df1
      exp.df1.r = cpo.df1
      exp.df1.t[1, 1] = 100
      exp.df1.r[1, 1] = 200

      val = cpo.df1 %>>% cpo()
      val2 = cpo.df1 %>>% retrafo(val)
      retrafo(val) = NULL
      expect_equal(val, exp.df1.t)
      expect_equal(val2, exp.df1.r)

      val = cpo.df1c %>>% cpo()
      val2 = cpo.df1c %>>% retrafo(val)
      expect_equal(getTaskData(val), exp.df1.t)
      expect_equal(getTaskData(val2), exp.df1.r)

      val = cpo.df1 %>>% cpo(affect.name = "F2", affect.invert = TRUE)
      val2 = cpo.df1 %>>% retrafo(val)
      retrafo(val) = NULL
      expect_equal(val, exp.df1.t)
      expect_equal(val2, exp.df1.r)

      val = cpo.df1c %>>% cpo(affect.name = "F2", affect.invert = TRUE)
      val2 = cpo.df1c %>>% retrafo(val)
      expect_equal(getTaskData(val), exp.df1.t)
      expect_equal(getTaskData(val2), exp.df1.r)

    }
  }

})

test_that("cpo framework detects bad data", {
  # in splits: one of the returned things is not a df
  # missing / too many names in split
  # row number mismatch
split = "task"
  for (type in c("o", "f")) {
    for (split in c("no", "task", "target")) {

      chgfct = function(data, pred, ...) {
        control = 0
        ispredicting = "control" %in% names(list(...))
        if (split == "task" && !ispredicting) {
          task = data
          data = getTaskData(task)
        }
        if (pred == ispredicting) {
          data = data[-1, ]
        }
        if (split == "task" && !ispredicting) {
          data = changeData(task, data)
        }
        data
      }
      cpo = cpogen("numrowtest", type, pss(pred = FALSE: logical), chgfct, chgfct, split)

      expect_error(cpo.df1 %>>% cpo(), "must not change number of rows")
      expect_error(cpo.df1c %>>% cpo(), "must not change number of rows")

      rr = retrafo(cpo.df1 %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "must not change number of rows")

      rr = retrafo(cpo.df1c %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "must not change number of rows")
      expect_error(cpo.df1c %>>% rr, "must not change number of rows")
    }

    for (split in c("most", "all")) {
      # 'one of the returned things is not a df'
      chgfct = function(data, pred, ...) {
        control = 0
        ispredicting = "control" %in% names(list(...))
        if (pred == ispredicting) {
          data$numeric = data$numeric[[1]]
        }
        data
      }
      cpo = cpogen("baddatatest", type, pss(pred = FALSE: logical), chgfct, chgfct, split)

      expect_error(cpo.df1 %>>% cpo(), "numeric is not a data.frame")
      expect_error(cpo.df1c %>>% cpo(), "numeric is not a data.frame")

      rr = retrafo(cpo.df1 %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "numeric is not a data.frame")

      rr = retrafo(cpo.df1c %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "numeric is not a data.frame")
      expect_error(cpo.df1c %>>% rr, "numeric is not a data.frame")

      chgfct = function(data, pred, ...) {
        control = 0
        ispredicting = "control" %in% names(list(...))
        if (pred == ispredicting) {
          data$numeric = data$numeric[-1, ]
        }
        data
      }
      cpo = cpogen("rowchangetest", type, pss(pred = FALSE: logical), chgfct, chgfct, split)

      expect_error(cpo.df1 %>>% cpo(), "rows of numeric data.*must not change row number")
      expect_error(cpo.df1c %>>% cpo(), "rows of numeric data.*must not change row number")

      rr = retrafo(cpo.df1 %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "rows of numeric data.*must not change row number")

      rr = retrafo(cpo.df1c %>>% cpo(TRUE))
      expect_error(cpo.df1 %>>% rr, "rows of numeric data.*must not change row number")
      expect_error(cpo.df1c %>>% rr, "rows of numeric data.*must not change row number")


      chgfct = function(data, pred, ...) {
        control = 0
        ispredicting = "control" %in% names(list(...))
        if (pred == ispredicting || ispredicting) {
          data$ordered = data$numeric
          names(data$ordered) = paste(names(data$ordered), "x", sep = ".")
        }
        data
      }
      cpo = cpogen("badsplittest", type, pss(pred = FALSE: logical), chgfct, chgfct, split)

      if (split == "most") {
        expect_error(cpo.df1 %>>% cpo(), '"numeric", "factor", "other"')
        expect_error(cpo.df1c %>>% cpo(), '"numeric", "factor", "other"')

        rr = retrafo(cpo.df1 %>>% cpo(TRUE))
        expect_error(cpo.df1 %>>% rr, '"numeric", "factor", "other"')

        rr = retrafo(cpo.df1c %>>% cpo(TRUE))
        expect_error(cpo.df1 %>>% rr, '"numeric", "factor", "other"')
        expect_error(cpo.df1c %>>% rr, '"numeric", "factor", "other"')
      } else {
        expected = cpo.df1[c(1, 2, 3, 4, 5, 1, 2, 3)]
        names(expected)[6:8] = paste(names(expected)[1:3], "x", sep = ".")
        rr = cpo.df1 %>>% cpo()
        rt = retrafo(rr)
        retrafo(rr) = NULL
        expect_equal(rr, expected)
        expect_equal(cpo.df1 %>>% rt, expected)

        rr = cpo.df1 %>>% cpo(TRUE)
        rt = retrafo(rr)
        retrafo(rr) = NULL
        expect_equal(rr, cpo.df1)
        expect_error(cpo.df1 %>>% rt, "column name mismatch")

        rr = cpo.df1c %>>% cpo()
        expect_equal(getTaskData(rr), expected)
        rt = retrafo(rr)
        expect_equal(cpo.df1 %>>% rt, expected)
        expect_equal(getTaskData(cpo.df1c %>>% rt), expected)
      }
    }
  }
})

test_that("format change between trafo and retrafo are detected", {

  for (type in c("o", "f")) {

    for (split in c("no", "task", "target", "most", "all")) {

      cpo = cpogen("formattest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) data, split)

      rt = retrafo(cpo.df1 %>>% cpo())
      rt.ignore1 = retrafo(cpo.df1 %>>% cpo(affect.pattern = "^N1$", affect.invert = TRUE))

      expect_equal(cpo.df1 %>>% rt, cpo.df1)
      expect_error(cpo.df1[-1] %>>% rt, "column name mismatch")

      expect_equal(cpo.df1 %>>% rt.ignore1, cpo.df1)
      expect_equal(cpo.df1[-1] %>>% rt.ignore1, cpo.df1[-1])

      xmp = cpo.df1
      xmp[[1]] = as.factor(xmp[[1]])
      expect_error(xmp %>>% rt, "Types? of column N1 mismatches")

      expect_equal(xmp %>>% rt.ignore1, xmp)


      expect_error(cpo.df1c %>>% rt, "column name mismatch")

      rt = retrafo(cpo.df1c %>>% cpo())

      expect_equal(cpo.df1 %>>% rt, cpo.df1)

      expect_error(cpo.df1[-1] %>>% rt, "column name mismatch")

      xmp = cpo.df1
      xmp[[1]] = as.factor(xmp[[1]])
      expect_error(xmp %>>% rt, "Type of column N1 mismatches")

      expect_equal(xmp %>>% rt.ignore1, xmp)

      expect_class(cpo.df1c %>>% rt, "Task")

      xmp = cpo.df1
      xmp[[5]] = as.ordered(xmp[[5]])
      if (split == "all") {
        expect_error(xmp %>>% rt, "Type of column F2 mismatches")
      } else {
        expect_class(xmp %>>% rt, "data.frame")
      }

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1c %>>% rt, "column name mismatch")
      expect_class(cpo.df1cc %>>% rt, "Task")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_class(cpo.df1 %>>% rt, "data.frame")
      expect_error(cpo.df1cc %>>% rt, "column name mismatch")

    }

    for (split in c("no", "task", "target")) {

      cpo = cpogen("formattest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data[-1] }, split)

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1cc %>>% rt, "column name mismatch between training and test data")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1c %>>% rt, "column name mismatch between training and test data")

      cpo = cpogen("formattest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { cbind(data[c(2, 1)], data[-c(1, 2)]) }, split)

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1cc %>>% rt, "column name mismatch between training and test data")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1c %>>% rt, "column name mismatch between training and test data")

      cpo = cpogen("formattest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data[[1]] = as.factor(data[[1]]) ; data },
        split, properties.needed = "factors")

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "Type of column N1 mismatches")
      expect_error(cpo.df1cc %>>% rt, "Type of column N1 mismatches")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "Type of column N1 mismatches")
      expect_error(cpo.df1c %>>% rt, "Type of column N1 mismatches")
    }

    for (split in c("most", "all")) {
      cpo = cpogen("numrowtest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data$numeric = data$numeric[-1] ; data }, split)

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1cc %>>% rt, "column name mismatch between training and test data")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1c %>>% rt, "column name mismatch between training and test data")

      cpo = cpogen("numrowtest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data$numeric = cbind(data$numeric[c(2, 1)], data$numeric[-c(1, 2)]) ; data }, split)

      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1cc %>>% rt, "column name mismatch between training and test data")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "column name mismatch between training and test data")
      expect_error(cpo.df1c %>>% rt, "column name mismatch between training and test data")

      cpo = cpogen("numrowtest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data$numeric[[1]] = as.factor(data$numeric[[1]]) ; data },
        split, properties.needed = "factors")
      rt = retrafo(cpo.df1 %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "Type of column N1 mismatches")
      expect_error(cpo.df1cc %>>% rt, "Type of column N1 mismatches")

      rt = retrafo(cpo.df1c %>>% cpo())
      expect_error(cpo.df1 %>>% rt, "Type of column N1 mismatches")
      expect_error(cpo.df1c %>>% rt, "Type of column N1 mismatches")


      cpo = cpogen("numrowtest", type, pss(),
        function(data, ...) { control = 0 ; data }, function(data, ...) { data$factor[[1]] = as.ordered(data$factor[[1]]) ; data },
        split, properties.needed = "ordered")

      if (split == "all") {
        rt = retrafo(cpo.df1 %>>% cpo())
        expect_error(cpo.df1 %>>% rt, "Type of column F1 mismatches")
        expect_error(cpo.df1cc %>>% rt, "Type of column F1 mismatches")

        rt = retrafo(cpo.df1c %>>% cpo())
        expect_error(cpo.df1 %>>% rt, "Type of column F2 mismatches")
        expect_error(cpo.df1c %>>% rt, "Type of column F2 mismatches")
      } else {
        rt = retrafo(cpo.df1 %>>% cpo())
        expect_class(cpo.df1 %>>% rt, "data.frame")
        expect_class(cpo.df1cc %>>% rt, "Task")

        rt = retrafo(cpo.df1c %>>% cpo())
        expect_class(cpo.df1 %>>% rt, "data.frame")
        expect_class(cpo.df1c %>>% rt, "Task")
      }


    }
  }
})

test_that("factor fixing works", {

  factormemcpo = makeCPOExtended("dummyencode", .dataformat = "df.features", .fix.factors = FALSE,
    cpo.trafo = {
      control = lapply(data, levels)
      data
    }, cpo.retrafo = {
      newlevels = lapply(data, levels)
      levelsfit = mapply(identical, control, newlevels)
      data[[1]] = sum(levelsfit)
      data
    })()

  hi = head(iris)
  hi2 = hi
  hi2$Species = factor(as.character(hi2$Species), levels = c("setosa", "versicolor"))
  hi3 = hi2
  hi3$Species = factor(as.character(hi3$Species), levels = c("versicolor", "setosa"))

  expect_equal((hi %>>% retrafo(hi %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi2 %>>% retrafo(hi %>>% factormemcpo))[1, 1], 4)
  expect_equal((hi3 %>>% retrafo(hi %>>% factormemcpo))[1, 1], 4)
  expect_equal((hi %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 4)
  expect_equal((hi2 %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi3 %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 4)
  expect_equal((iris %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 4)

  expect_set_equal(names(table((iris %>>% retrafo(hi2 %>>% factormemcpo))$Species)), c("setosa", "versicolor", "virginica"))

  factormemcpo$fix.factors = TRUE

  expect_equal((hi %>>% retrafo(hi %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi2 %>>% retrafo(hi %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi3 %>>% retrafo(hi %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi2 %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 5)
  expect_equal((hi3 %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 5)
  expect_equal((iris %>>% retrafo(hi2 %>>% factormemcpo))[1, 1], 5)

  expect_set_equal(names(table((iris %>>% retrafo(hi2 %>>% factormemcpo))$Species)), c("setosa", "versicolor"))
  expect_equal(length((iris %>>% retrafo(hi2 %>>% factormemcpo))$Species), nrow(iris))
  expect_equal(is.na((iris %>>% retrafo(hi2 %>>% factormemcpo))$Species), iris$Species == "virginica")

})


test_that("datasplit 'factor', 'ordered', 'onlyfactor', 'numeric'", {

  checkfn = function(data, test, ...) {
    control = 0
    subset = switch(test,
      numeric = c(1, 5, 9),
      factor = c(4, 7, 8, 12),
      onlyfactor = 7,
      ordered = c(4, 8, 12))
    expect_equal(cpo.df2[subset], data)
    data[c(3, 2, 1), , drop = FALSE]
  }

  cpo = function(test) {
    split = datasplitToDataformat(test)
    makeCPOObject("datasplit_extra", test: discrete[numeric, factor, onlyfactor, ordered],
    .dataformat = split$dataformat, .dataformat.factor.with.ordered = split$dataformat.factor.with.ordered,
    cpo.trafo = checkfn, cpo.retrafo = checkfn)(test)
  }

  tsk = makeClassifTask("cpo.df2", cpo.df2, target = "F1", check.data = FALSE)

  reverseSubset = function(df, subset) {
    if (is.logical(subset)) {
      subset = which(subset)
    }
    for (idx in subset) {
      df[[idx]] = rev(df[[idx]])
    }
    row.names(df) = as.integer(rev(row.names(df)))
    df
  }

  expected = reverseSubset(cpo.df2, c(1, 5, 9))
  expect_equal(getTaskData(tsk %>>% cpo("numeric")), expected)
  expect_equal(cpo.df2 %>>% retrafo(tsk %>>% cpo("numeric")), expected)

  expected = reverseSubset(cpo.df2, c(4, 7, 8, 12))
  expect_equal(getTaskData(tsk %>>% cpo("factor")), expected)
  expect_equal(cpo.df2 %>>% retrafo(tsk %>>% cpo("factor")), expected)

  expected = reverseSubset(cpo.df2, 7)
  expect_equal(getTaskData(tsk %>>% cpo("onlyfactor")), expected)
  expect_equal(cpo.df2 %>>% retrafo(tsk %>>% cpo("onlyfactor")), expected)

  expected = reverseSubset(cpo.df2, c(4, 8, 12))
  expect_equal(getTaskData(tsk %>>% cpo("ordered")), expected)
  expect_equal(cpo.df2 %>>% retrafo(tsk %>>% cpo("ordered")), expected)

})

test_that("datasplit with matrix in numeric split works", {

  checkfn = function(data, ...) {
    control = 0
    if ("numeric" %in% names(data)) {
      data$numeric = as.matrix(data$numeric)[c(3, 2, 1), ]
      data
    } else {
      as.matrix(data[c(3, 2, 1), , drop = FALSE])
    }
  }

  tsk = makeClassifTask("cpo.df2", cpo.df2, target = "F1", check.data = FALSE)
  reverseSubset = function(df, subset) {
    if (is.logical(subset)) {
      subset = which(subset)
    }
    for (idx in subset) {
      df[[idx]] = rev(df[[idx]])
    }
    df
  }

  expected = reverseSubset(cpo.df2, c(1, 5, 9))
  row.names(expected) = as.integer(rev(row.names(expected)))
  expected.invrn = expected
  row.names(expected.invrn) = as.integer(rev(row.names(expected)))

  expected.chr = reverseSubset(cpo.df2, c(1, 5, 9))
  row.names(expected.chr) = row.names(expected)

  cpo = function(split) {
    split = datasplitToDataformat(split)
    makeCPOObject("datasplit_numeric_matrix",
    .dataformat = split$dataformat, .dataformat.factor.with.ordered = split$dataformat.factor.with.ordered,
    cpo.trafo = checkfn, cpo.retrafo = checkfn)()
  }

  retd = tsk %>>% cpo("numeric")
  expect_equal(getTaskData(retd), expected.chr)
  expect_equal(cpo.df2 %>>% retrafo(retd), expected.chr)

  retd = tsk %>>% cpo("most")
  expect_equal(getTaskData(retd), expected.invrn)
  expect_equal(cpo.df2 %>>% retrafo(retd), expected.invrn)

  retd = tsk %>>% cpo("all")
  expect_equal(getTaskData(retd), expected.invrn)
  expect_equal(cpo.df2 %>>% retrafo(retd), expected.invrn)
})
Any scripts or data that you put into this service are public.
mlrCPO documentation built on Nov. 18, 2022, 1:05 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

tests/testthat/test_core_datasplit.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

Try the mlrCPO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mlrCPO Composable Preprocessing Operators and Pipelines for Machine Learning

tests/testthat/test_core_datasplit.R In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning

Try the mlrCPO package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mlrCPO
Composable Preprocessing Operators and Pipelines for Machine Learning

tests/testthat/test_core_datasplit.R
In mlrCPO: Composable Preprocessing Operators and Pipelines for Machine Learning