# attic/get_results_openml.R

# FIXME: This is the start of some functions that allow us to get
#        results from OpenML in order to directly create the tasks.
#        As this is not entirely straightforward, development is
#        paused here for now.
# Current problems:
#        listOMLSetup() does not allow us to subset by task.id;
#        passing setup.ids explodes the URL string, and without subsetting
#        we have to query far too much unnecessary data.
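# A minimal sketch of the workaround used in get_params() below: request the
# setups in chunks so the query URL stays short. (setup_ids is a placeholder
# for a vector of setup ids; the chunk size of 100 is an assumption.)
# chunks = split(setup_ids, ceiling(seq_along(setup_ids) / 100))
# setups = do.call(rbind, lapply(chunks, function(ids) listOMLSetup(setup.id = ids)))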


# Put the results into a table
# get performances

# my_runs = listOMLRunEvaluations(tag = "botV1",
#   evaluation.measure = "area_under_roc_curve",
#   limit = 1000)
# # subset the results to the columns and measure(s) of interest
# my_runs = my_runs[, c("run.id", "task.id", "area.under.roc.curve")]

# # get hyperparameters
# runs = listOMLRuns(tag = "botV1", task.id = 31, limit = 10000)
# paras = listOMLSetup(runs$setup.id)
# paras_names = c("alpha", "lambda")  # hyperparameters of interest (glmnet example)
# paras = paras[paras$parameter.name %in% paras_names, c("setup.id", "parameter.name", "value")]
# library(tidyr)
# library(dplyr)
# paras = spread(paras, key = parameter.name, value = value)
# paras = merge(paras, runs[, c("run.id", "setup.id")], by = "setup.id")
# paras = select(paras, -setup.id)

# # Put things together
# results = merge(my_runs, paras, by = "run.id")
# # Put it in a nice order
# results = results[, c(setdiff(names(results), "area.under.roc.curve"), "area.under.roc.curve")]
# # Now you can compare the performances of your different hyperparameters
# print(head(results))



library(OpenML)

get_flow_ids = function(lrn) {
  # Look up the flow id(s) of the mlr classif learner tagged "botV1" on OpenML.
  flow = listOMLFlows(tag = "botV1")
  flow = flow[flow$name == paste0("mlr.classif.", lrn), ]
  return(flow$flow.id)
}
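# Usage sketch (assumes a configured OpenML API key and flows tagged "botV1"
# for this learner on the server):
# get_flow_ids("glmnet")  # flow id(s) of mlr.classif.glmnet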

get_runs = function(base_learner, task.id) {

  flows = get_flow_ids(base_learner)

  # Query runs in pages of 10000 rows, advancing the offset until an empty
  # page comes back.
  i = 0
  results_left = TRUE
  runs = data.frame()

  while (results_left) {

    data = listOMLRuns(tag = "botV1",
      task.id = task.id,
      flow.id = flows,
      limit = 10000,
      offset = i * 10000
    )

    runs = rbind(runs, data)
    if (nrow(data) == 0) {
      results_left = FALSE
    }
    i = i + 1
  }
  return(runs)
}


get_params = function(base_learner, task.id, paras_names) {
  flows = get_flow_ids(base_learner)
  runs = get_runs(base_learner, task.id)
  # Query the setups in chunks of 100 ids so the request URL stays short.
  setups = split(runs$setup.id, ceiling(seq_len(nrow(runs)) / 100))
  params = data.frame()
  for (i in seq_along(setups)) {
    data = listOMLSetup(flow.id = flows, setup.id = setups[[i]])
    data = data[data$parameter.name %in% paras_names, c("setup.id", "parameter.name", "value")]
    # Reshape to one row per setup with one column per hyperparameter.
    data = tidyr::spread(data, key = parameter.name, value = value)
    params = dplyr::bind_rows(params, data)
  }

  return(params)
}

# base_learner = "glmnet"
# runz = get_runs(base_learner, 31)
# parz = get_params(base_learner, c("alpha", "lambda"))
# data = merge(data, runs[, c("run.id", "setup.id")], key = "setup.id")
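# # Sketch of the intended end product (an assumption based on the FIXME above):
# # join the hyperparameters with the run performances and build a surrogate
# # regression task, analogous to the makeRegrTask() example below. Column
# # names depend on the learner and measure queried above.
# results = merge(my_runs, parz, by = "run.id")
# surrogate_task = makeRegrTask(
#   data = results[, c("alpha", "lambda", "area.under.roc.curve")],
#   target = "area.under.roc.curve")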


# Quick benchmark of candidate surrogate learners on a synthetic regression task.
df = as.data.frame(matrix(rnorm(10^6), nrow = 5 * 10^4, ncol = 20))
df$y = rnorm(nrow(df))
library(mlr)
tsk = makeRegrTask(data = df, target = "y")

lrn = makeLearner("regr.RcppHNSW", M = 64)
mod = train(lrn, tsk)
prd = predict(mod, tsk)
performance(prd, list(rmse, timetrain, timepredict), model = mod)

lrn = makeLearner("regr.RcppHNSW", M = 64, distance = "cosine")
mod = train(lrn, tsk)
prd = predict(mod, tsk)
performance(prd, list(rmse, timetrain, timepredict), model = mod)


lrn = makeLearner("regr.cubist", committees = 20L)
mod = train(lrn, tsk)
prd = predict(mod, tsk)
performance(prd, list(rmse, timetrain, timepredict), model = mod)
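
# The three comparisons above repeat the same train/predict/score pattern; a
# small helper (a sketch using only the objects defined above) keeps the
# benchmark in one place.
eval_surrogate = function(lrn, tsk) {
  mod = train(lrn, tsk)
  prd = predict(mod, tsk)
  performance(prd, list(rmse, timetrain, timepredict), model = mod)
}
# eval_surrogate(makeLearner("regr.cubist", committees = 20L), tsk)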