R/misc_mle_study.R

Defines functions derive_stats derive_result_mle_cpe derive_result_mle_acc set_default select_models get_hp unique_max subset_obs subset_models select_setM

Documented in select_setM

#' Select a subset of models
#'
#' @param instance instance generated by \code{sample_mle}
#' @param methods character, allowed methods e.g. "glmnet|xgbTree", if NA all methods are allowed
#' (i.e. NA is a shortcut to "glmnet|xgbTree|rpartCost|svmLinearWeights2")
#' @param M integer, number of models
#' @param M.start integer, first model; if M.start=NA, models are choosen randomly according to M.probs
#' @param M.probs character, one of "uniform", "learn.theta", "learn.theta.squared"
#' @param M.seed integer, random seed for random draw of models
select_setM <- function(instance,
                        methods = NA,
                        M=200,
                        M.start = NA,
                        M.probs = c("uniform", "learn.theta", "learn.theta.squared"),
                        M.seed = 1){

  # return all models if required:
  if(M >= length(instance$models)){
    return(instance)
  }
  # derive number of model types
  if(is.na(methods)){
    methods <- "glmnet|xgbTree|rpartCost|svmLinearWeights2"
  }
  methods.num <- length(strsplit(methods, "\\|")[[1]])
  # setM0 = setM after reduction to relevant methods:
  setM0 <- instance$models[grepl(methods, instance$models)]
  M0 <- length(setM0); stopifnot(M <= M0)
  # if no start index is given: sample randomly according probs defined by M.probs
  if(is.na(M.start)){
    set.seed(M.seed)
    M.probs <- match.arg(M.probs)
    pr <- switch(M.probs,
                 uniform = rep(1,M0),
                 learn.theta = instance$learn.models$theta[setM0, "theta"],
                 learn.theta.squared = instance$learn.models$theta[setM0, "theta"]^2)
    subs <- sample(M0, M, replace=FALSE, prob=pr)
  }else{
    b <- M/methods.num; stopifnot(b %% 1 == 0)
    subs <- as.integer(sapply((0:(methods.num-1))*(M0/methods.num),function(x){x+M.start:(M.start+b-1)}))
  }
  sel <- instance$models %in% setM0[subs]
  # rebuilt instance based on selected models
  out <- instance[1:2]
  out$models <- instance$models[sel]
  out$hyperparams <- lapply(instance$hyperparams, function(h){h[rownames(h) %in% out$models, ,drop=FALSE]})
  out$train.models <- subset_models(instance$train.models, sel)
  out$learn.models <- subset_models(instance$learn.models, sel)
  return(out)
}

subset_models <- function(models, sel){
  out <- lapply(models[1:2], function(x){
    if(is.list(x)){
      x$pred <- x$pred[,sel]; x$thetahat <- x$thetahat[sel, ]; return(x)
    }})
  out$theta <- models$theta[sel, ]
  return(out)
}

subset_obs <- function(rdm.eval, first.eval, n.eval, n.max=NULL){
  if(n.eval > n.max){
    return(1:n.max)
  }
  if(rdm.eval){
    obs.eval <- sample(1:n.max, n.eval, replace=FALSE)
  }else{
    obs.eval <- first.eval:(first.eval+n.eval-1)
  }
  return(obs.eval)
}

unique_max <- function(x){
  out <- which(x == max(x))
  if(length(out)==1){
    return(out)
  }else{
    return(sample(out, 1))
  }
}

get_hp <- function(fmn, instance){
  i <- which(sapply(instance$hyperparams, function(x)fmn %in% rownames(x)))
  return(instance$hyperparams[[i]][fmn, , drop=FALSE])
}


# select_models -------------------------------------------------------------------------------
select_models <- function(instance,
                          comp,
                          n.eval,
                          analysis,
                          select.method = c("oracle", "optimal", "close", "best", "simplest.en"),
                          select.args = "",
                          select.limit = c("none", "sqrt", "one")){

  select.method <- match.arg(select.method)
  M <- length(instance$models)

  ## maximum number of models to select:
  select.max <- switch(match.arg(select.limit),
                       none = Inf,
                       one = 1,
                       sqrt = round(sqrt(n.eval)))

  # select.method = "oracle" --------------------------------------------------------------------
  if(select.method == "oracle"){
    a <- string2list(select.args)
    a <- set_default(a, "type", paste0("learn.", analysis))
    stopifnot(a$type %in% c("train.acc", "learn.acc", "train.cpe", "learn.cpe"))

    tt <- switch(a$type,
                 train.acc = instance[["train.models"]]$theta$theta,
                 learn.acc = instance[["learn.models"]]$theta$theta,
                 train.cpe = pmin(instance[["train.models"]]$theta$theta1,
                                  instance[["train.models"]]$theta$theta0),
                 learn.cpe = pmin(instance[["learn.models"]]$theta$theta1,
                                  instance[["learn.models"]]$theta$theta0)
    )

    setS <- SEPM::select(comp,
                         method = "user",
                         sel=which(tt==max(tt)))
  }

  # select.method = "optimal" -------------------------------------------------------------------
  if(select.method == "optimal"){
    a <- string2list(select.args) %>%
      set_default("fixed_prev", FALSE) %>%
      set_default("rdm_eval", TRUE) %>%
      set_default("method_select", "few") %>%
      set_default("method_order", "param") %>%
      set_default("target_order", "tstat") %>%
      set_default("combine_order", "min") %>%
      set_default("max_iter", 50) %>%
      set_default("target_tol", 1e-4) %>%
      set_default("method_ext", "basic") %>%
      set_default("method_pred", "mbeta_approx")

    prev_eval <- ifelse(as.logical(a$fixed_prev), instance$info['pop', 'prev'], NA)

    setS <- SEPM::select_optimal(comparison = comp,
                                 method_select = a$method_select,
                                 method_order = a$method_order,
                                 target_order = a$target_order,
                                 combine_order = a$combine_order,
                                 max_models = select.max,
                                 n_eval = n.eval,
                                 prev_eval = prev_eval,
                                 n_val = instance$info['val', 'n'],
                                 prev_val = instance$info['val', 'prev'],
                                 rdm = as.logical(a$rdm_eval),
                                 threshold = comp$hypothesis$threshold,
                                 batch_size = 10,
                                 max_iter = as.numeric(a$max_iter),
                                 target_tol = as.numeric(a$target_tol),
                                 method_ext = a$method_ext,
                                 method_pred = a$method_pred,
                                 return_simvals = FALSE,
                                 steady_plot = FALSE,
                                 save_plot = FALSE,
                                 info_plot = FALSE,
                                 ylim_plot = NULL)
  }

  # select.method = "close" ---------------------------------------------------------------------
  if(select.method == "close"){
    a <- string2list(select.args)
    a <- set_default(a, "k", 1)
    a <- set_default(a, "mode", "weighted")

    setS <- SEPM::select_close(comp,
                               k= as.numeric(a$k),
                               mode = a$mode,
                               threshold = comp$hypothesis$threshold,
                               max_models = select.max,
                               regu = 1,
                               break_ties = TRUE)
  }

  # select.method = "best" ----------------------------------------------------------------------
  if(select.method == "best"){
    a <- string2list(select.args)
    a <- set_default(a, "mode", "weighted")

    setS <- SEPM::select_best(comp,
                              mode = a$mode,
                              threshold = comp$hypothesis$threshold,
                              max_models = select.max,
                              regu = 1,
                              break_ties = TRUE)
  }

  # select.method = "simplest.en" ---------------------------------------------------------------
  if(select.method == "simplest.en"){
    hp.en <- instance$hyperparams$glmnet
    stopifnot(nrow(hp.en)>0)
    mm <- instance$models %in% rownames(hp.en)
    tt <- instance$train.models$val$thetahat$theta[mm]; theta.max <- max(tt)
    incl <- tt > theta.max - 1*sqrt(theta.max*(1-theta.max)/instance$info["val", "n"])
    pp <- hp.en$alpha * hp.en$lambda; pp[!incl] <- -Inf

    setS <- SEPM::select(comp,
                         method = "user",
                         sel=which(instance$models %in% rownames(hp.en)[pp==max(pp)]))
  }

  return(setS)
}

set_default <- function(x, field, value){
  stopifnot(is.list(x))
  stopifnot(is.character(field))
  if(is.null(x[[field]])){x[[field]] <- value}
  return(x)
}


# derive_result_mle --------------------------------------------------------------------------
derive_result_mle_acc <- function(instance,
                                  pred,
                                  inf,
                                  delta = 0,
                                  final.choice="ci"){

  result <- data.frame(load.id = instance$job$id,
                       opt.theta = max(instance$learn.models$theta$theta),
                       S = ncol(pred),
                       as.data.frame(inf$inference$info[3:5]))

  mstar <- unique_max(inf$inference$result$lower)

  final.mod.name <- as.character(inf$inference$result[mstar, "model.name"])
  final.mod.index <- which(final.mod.name== instance$models)
  final.mod.result <- derive_stats(final.mod.index, "final", instance)
  final.mod.result$final.mod.thetabar <- inf$estimation$result$theta.bar[final.mod.name]

  final.mod.result <- cbind(final.mod.result,
                            inf$inference$result[mstar, c("estimate", "corrected", "lower", "upper")])

  colnames(final.mod.result)[(ncol(final.mod.result)-3):ncol(final.mod.result)] <-
    paste0("final.acc.", c("thetahat", "thetatilde", "lower", "upper"))

  return(cbind(result, final.mod.result))
}

derive_result_mle_cpe <- function(instance,
                                  pred,
                                  inf,
                                  delta = 0){

  result <- data.frame(load.id = instance$job$id,
                       opt.acc = max(instance$learn.models$theta$theta),
                       opt.se = max(instance$learn.models$theta$theta1),
                       opt.sp = max(instance$learn.models$theta$theta0),
                       S=ncol(pred), as.data.frame(inf$inference$info[3:5]))

  mopt <- unique_max(pmin(instance$learn.models$theta$theta0,
                          instance$learn.models$theta$theta1 -delta))
  result$opt.mod.name <- as.character(instance$models[mopt])
  result$opt.sp.cpe <- instance$learn.models$theta$theta0[mopt]
  result$opt.se.cpe <- instance$learn.models$theta$theta1[mopt]

  mstar <- unique_max(pmin(inf$inference$result.0$lower,
                           inf$inference$result.1$lower - delta))

  final.mod.name <- as.character(inf$inference$result.1[mstar, "model.name"])
  final.mod.index <- which(final.mod.name== instance$models)
  final.mod.result <- derive_stats(final.mod.index, "final", instance, details=TRUE)
  final.mod.result$final.se.bar <- inf$estimation$result.1$theta.bar[final.mod.name]
  final.mod.result$final.sp.bar <- inf$estimation$result.0$theta.bar[final.mod.name]

  final.mod.result <- cbind(final.mod.result,
                            inf$inference$result.1[mstar, c("estimate", "corrected", "lower", "upper")])
  colnames(final.mod.result)[(ncol(final.mod.result)-3):ncol(final.mod.result)] <-
    paste0("final.se.", c("hat", "tilde", "lower", "upper"))

  final.mod.result <- cbind(final.mod.result,
                            inf$inference$result.0[mstar, c("estimate", "corrected", "lower", "upper")])
  colnames(final.mod.result)[(ncol(final.mod.result)-3):ncol(final.mod.result)] <-
    paste0("final.sp.", c("hat", "tilde", "lower", "upper"))

  return(cbind(result, final.mod.result))
}

derive_stats <- function(index,
                         name = "final.mod",
                         instance,
                         details = FALSE){

  stopifnot(length(index)==1)

  out <- data.frame(name = instance$models[index],
                    train.theta = instance$train.models$theta$theta[index],
                    learn.theta = instance$learn.models$theta$theta[index],
                    thetabar.val = instance$train.models$val$thetahat$theta[index])
  if(details){
    out$train.se = instance$train.models$theta$theta1[index]
    out$learn.se = instance$learn.models$theta$theta1[index]
    out$se.bar.val = instance$train.models$val$thetahat$theta1[index]
    out$train.sp = instance$train.models$theta$theta0[index]
    out$learn.sp = instance$learn.models$theta$theta0[index]
    out$sp.bar.val = instance$train.models$val$thetahat$theta0[index]
  }
  names(out) <- paste(name, names(out), sep=".")

  return(out)
}
maxwestphal/SEPM.SIM documentation built on April 11, 2024, 4:06 p.m.