R/split_CV.R

Defines functions obj_sum.resample print.resample `[[.DataStorageClass` nrowS3 nrowS3.data.frame nrowS3.DataStorageClass as.integer.resample as.integer.ResampleDataClass as.data.frame.resample extract.y extract.y.resample extract.y.DataStorageClass dim.resample id resample resample.data.frame resample.DataStorageClass crossv_kfold fit.splitCVStack predict_SL.splitCVfits

## ----------------------------------------------------------------
## TO DO TO FINISH external CV for using with stremr:
##
## 1) Need to handle subsets correctly:
##    Only the subset data will be used for fitting / extract.y
##    So the question is, do we subset it here, right away or do we pass it and delay the subsetting until the actual internal fitting is performed?
##
## 2) Need to handle DataStorage object as the input data.
##    For now we can directly obtain $dat.sVar and then call crossv_kfold(). However this is very inefficient.
##
## 3) Need to figure out how do this whole operation efficiently (by reference, so that DataStorage objects are not being copied everywhere.
##
## 4) For split-specific SL need to re-set the outcomes for each training fold to their respective fold-specific predictions from the previous SL.
##
## ----------------------------------------------------------------

#' @importFrom tibble obj_sum
#' @method obj_sum resample
# @export
obj_sum.resample <- function(x, ...) {
  paste0("resample [", big_mark(nrow(x)), " x ", big_mark(ncol(x)), "]")
}

# @export
print.resample <- function(x, ...) {
  n <- length(x$idx)
  if (n > 10) {
    id10 <- c(x$idx[1:10], "...")
  } else {
    id10 <- x$idx
  }

  cat("<", obj_sum.resample(x), "> ", paste(id10, collapse = ", "), "\n",
    sep = ""
  )
}

`[[.DataStorageClass` = function(x, i, exact = TRUE) {
  x$dat.sVar[[i]]
}

nrowS3 <- function(x, ...) { UseMethod("nrowS3") }

nrowS3.data.frame <- function(x, ...) {
  nrow(x)
}

nrowS3.DataStorageClass <- function(x, ...) {
  x$nobs
}

# @export
as.integer.resample <- function(x, ...) {
  x$idx
}

as.integer.ResampleDataClass <- function(x, ...) {
  x$as.integer
}

# @export
as.data.frame.resample <- function(x, ...) {
  x$data[x$idx, , drop = FALSE]
}

extract.y <- function(x, y, ...) { UseMethod("extract.y") }

extract.y.resample <- function(x, y, ...) {
  x$data[x$idx, y, drop = FALSE, with = FALSE][[1]]
}

extract.y.DataStorageClass <- function(x, y, ...) {
  x$get.outvar(var = y)
}

# @export
dim.resample <- function(x, ...) {
  c(length(x$idx), ncol(x$data))
}

id <- function(n) {
  width <- nchar(n)
  sprintf(paste0("%0", width, "d"), seq_len(n))
}

resample <- function(data, idx, ...) { UseMethod("resample") }

resample.data.frame <- function(data, idx, ...) {
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (!is.integer(idx)) {
    stop("`idx` must be an integer vector.", call. = FALSE)
  }

  structure(
    list(
      data = data,
      idx = idx
    ),
    class = "resample"
  )
}

resample.DataStorageClass <- function(data, idx, subset_idx) {
  if (!is.DataStorageClass(data)) {
    stop("`data` must be an object of class `DataStorageClass`.", call. = FALSE)
  }
  if (!is.integer(idx)) {
    stop("`idx` must be an integer vector.", call. = FALSE)
  }
  if (!is.null(subset_idx) && !is.integer(subset_idx)) {
    stop("`subset_idx` must be NULL or an integer vector.", call. = FALSE)
  }
  ResampleDataClass$new(data, idx, subset_idx)
}

crossv_kfold <- function(data, id = ".id", fold_column = "fold", subset_idx) {

  n <- nrowS3(data)
  folds <- data[[fold_column]]
  k <- length(unique(folds))

  # folds <- sample(rep(1:k, length.out = n))

  idx <- seq_len(n)
  fold_idx <- split(idx, folds)

  fold <- function(test) {
    list(
      train = resample(data, setdiff(idx, test), subset_idx),
      test = resample(data, test, subset_idx)
    )
  }

  cols <- purrr::transpose(purrr::map(fold_idx, fold))
  cols[[id]] <- id(k)

  tibble::as_data_frame(cols)
}

#' @export
fit.splitCVStack <- function(models,
                        method = c("splitCV"),
                           ID,
                           t_name,
                           x,
                           y,
                           data,
                           fold_column,
                           seed = NULL,
                           subset_exprs = NULL,
                           subset_idx = NULL,
                           verbose = getOption("gridisl.verbose"),
                           ...) {

  method <- method[1L]
  gvars$method <- method
  gvars$verbose <- verbose

  loss_fun_MSE <- function(yhat, y0) (yhat - y0)^2
  resid_fun <- function(yhat, y0) as.numeric(yhat - y0)

  fn_model <- function(train, test, models, .id, ...){
    cat("training and scoring for fold: ", .id, "\n")
    fit_model(ID = ID,
              t_name = t_name,
              x = x,
              y = y,
              train_data = train,
              valid_data = test,
              # train_data = as.data.frame(train),
              # valid_data = as.data.frame(test),
              models = models)
  }

  data_cv <- crossv_kfold(data, ".id", fold_column, subset_idx)

  data_cv <-
    data_cv %>%
    dplyr::mutate(models = purrr::map(.id, ~ models))

  cv_fit <-
    data_cv %>%
    dplyr::mutate(fit = purrr::pmap(data_cv, fn_model))
    #  %>%
    # print()

  cv_fit <-
    cv_fit %>%
    dplyr::mutate(test_preds = purrr::map(fit, ~ gridisl:::predict_holdout(modelfit = .x, best_only = FALSE))) %>%
    dplyr::mutate(resid = purrr::map2(test, test_preds, ~ .y[, lapply(.SD, resid_fun, extract.y(.x, y))])) %>%
    dplyr::mutate(squared_resid = purrr::map(resid, ~ as.data.table(.x^2)))

  # predicted <- cv_fit %>% tidyr::unnest(test_preds)
  # resid <- cv_fit %>% tidyr::unnest(resid)
  squared_resid <- cv_fit %>% tidyr::unnest(squared_resid) %>% dplyr::select(-.id) %>% as.data.table

  ## Re-assign internally new MSE values to each fold-specific model object.
  ## These new MSE values are determined based on ALL validation folds.
  x <- purrr::map(cv_fit[["fit"]], ~ .x$reassignMSEs(squared_resid))
  x <- purrr::map(cv_fit[["fit"]], ~ print(.x$getMSEtab))

  cv_fit <-
    cv_fit %>%
    dplyr::select(train, test, .id, fit)

  class(cv_fit) <- c(class(cv_fit), "splitCVfits")

  return(cv_fit)
}



#' @export
predict_SL.splitCVfits <- function(modelfit,
                       newdata,
                       add_subject_data = FALSE,
                       subset_idx = NULL,
                       holdout = FALSE,
                       verbose = getOption("gridisl.verbose")) {

  predict_new <- function(modelfit, newdata) {
    predict_SL(modelfit, newdata = newdata, add_subject_data = add_subject_data, subset_idx = subset_idx, verbose = verbose)
  }

  if (missing(newdata) && holdout) {
    ## For holdout predictions with holdoutSL the default is to use the previous validation data
   SL_preds <- modelfit %>%
      dplyr::mutate(idx = purrr::map(test, ~ gridisl:::as.integer.ResampleDataClass(.x))) %>%
      dplyr::mutate(preds = purrr::map2(fit, test, ~ predict_new(.x, newdata = .y))) %>%
      tidyr::unnest(idx, preds) %>%
      dplyr::select(-.id) %>%
      as.data.table

    setkeyv(SL_preds, cols = "idx")
    # print("SL holdout preds"); print(SL_preds)
    SL_preds[, ("idx") := NULL]

    # SL_preds <- modelfit %>%
    #   unnest(purrr::map(fit, ~ predict_SL(.x, holdout = TRUE))) %>%
    #   dplyr::select(-.id)
    # print("holdout SL_preds"); print(SL_preds)

  } else if (missing(newdata) && !holdout) {
    ## Use the training data:
    SL_preds <- modelfit %>%
      dplyr::mutate(idx = purrr::map(train, ~ gridisl:::as.integer.ResampleDataClass(.x))) %>%
      dplyr::mutate(preds = purrr::map2(fit, train, ~ predict_new(.x, newdata = .y))) %>%
      tidyr::unnest(idx, preds) %>%
      dplyr::select(-.id) %>%
      as.data.table

    setkeyv(SL_preds, cols = "idx")
    # print("SL train preds"); print(SL_preds)
    SL_preds <- SL_preds[, list("preds" = mean(preds)), by = idx][, ("idx") := NULL]

  } else {
    ## SL predictions for new data. Averages the predictions across all V models.
    SL_preds <- modelfit %>%
      dplyr::mutate(preds = purrr::map(fit, ~ predict_new(.x, newdata = newdata))) %>%
      dplyr::mutate(idx = purrr::map(preds, ~ seq.int(nrow(.x)))) %>%
      tidyr::unnest(idx, preds) %>%
      as.data.table

    setkeyv(SL_preds, cols = "idx")
    SL_preds <- SL_preds[, list("preds" = mean(preds)), by = idx][, ("idx") := NULL]

  }

  return(SL_preds)
}
osofr/gridisl documentation built on May 24, 2019, 4:55 p.m.