PEDISdata: Preprocessing PEDIS data files

#' Create Folds for (repeated) Cross Validation
#'
#' Create folds used in the pedis train function
#'
#' @param y The outcome vector (as produced by prepare_X_y)
#' @param k Optional integer, number of folds, default = 5
#' @param times Optional integer, number of repeats, default = 5
#' @param seed Optional
#'
#' @return List containing the folds
#' @export
#'
#' @importFrom caret createMultiFolds
#'
#' @examples
#' predis <- data(pedis)
#' X_y <- prepare_X_y(pedis, outcome = "minor_amputation", vars = c("p", "e"))
#' folds <- create_folds(X_y$y)
#'
create_folds <- function(y, k = 5, times = 5, seed = 93864) {
  # Create Folds to reproduce fitting process
  ## Repeated cross validation
  folds <- caret::createMultiFolds(y, k = k, times = times)
  return(folds)
}

#' Train CV Log. Regression Model
#'
#' @param X Matrix containing the predictor variables
#' @param y Vector containing the outcome variables
#'
#' @return A train object of class pedis_train
#' @export
#'
#' @importFrom caret trainControl
#' @importFrom caret train
#'
#' @examples
#' data(pedis)
#' X_y <- prepare_X_y(pedis, outcome = "minor_amputation", var = c("p", "d"))
#' folds <- create_folds(X_y$y)
#' fit <- train_pedis(X = X_y$X, y = X_y$y, folds = folds)
#' class(fit)
train_pedis <- function(X, y, folds) {

  # Edit training options
  ##  implicitly conducting repeated cross-validation
  tr_control <- caret::trainControl(classProbs = TRUE,
                                    savePredictions = "all",
                                    returnData = TRUE,
                                    returnResamp = "all",
                                    index = folds,
                                    summaryFunction = caret::twoClassSummary)

  # Fit the binomial model
  fit <- caret::train(x = X,
                      y = y,
                      method = "glm",
                      family = "binomial",
                      metric = "ROC",
                      preProcess = c("scale", "center"),
                      trControl = tr_control)

  return(fit)

}

#' Predict Test Data
#'
#' Returns the predicted probability value and the actual outcome
#'
#' @param fit A caret model
#'
#' @return A `tibble` with two variables
#' 1. The probability of the event and
#' 2. the actual, observed outcome
#'
#' @import dplyr
#' @import tibble
#'
#' @export
#'
#' @examples
#' data(pedis)
#' X_y <- prepare_X_y(pedis, outcome = "minor_amputation", var = c("p", "d"))
#' fit <- train_pedis(X = X_y$X, y = X_y$y)
#' test_predict(fit)
#'
test_predict <- function(fit) {

  data <- fit %>%
    purrr::pluck("pred") %>%
    dplyr::select(yes, obs) %>%
    tibble::as_tibble()

  return(data)

}

#' Get a dataframe with predictor (pred) and response variable (actual observations)
#'
#'
#' @param idx Folds
#' @param X A matrix with predictors
#' @param y A vecotr of outcomes
#'
#' @return A dataframe
#'
train_predict_helper <- function(idx, X, y) {

  fit <- glm(y[idx] ~ as.matrix(X[idx, ]), family = "binomial")
  y_pred <- predict(fit, type = "response")

  data <- tibble(predictor = y_pred, response = y[idx])

  data

}

#' Train predict
#'
#' Function takes a list with integer vectors representing fold indicies
#' or a single integer vector representing these indicies and predict
#' outcomes based on the training results
#'
#' @param folds An integer vectors or a list containing integer vectors
#' @param X_y A list containing the predictor matrix `X` and the outcome vector `y`
#'
#' @import purrr
#' @import dplyr
#'
#' @return A dataframe containing the actual outcome and the predicted value
#' @export
#'
#' @examples
#' data(pedis)
#' X_y <- prepare_X_y(data = pedis, outcome = "minor_amputation", vars = c("p", "d"))
#' fold_idx <- 1:10
#' train_predict(folds = 1:10, X_y = X_y)
#' folds <- list(1:10, 11:20)
#' train_predict(folds = folds, X_y = X_y)
train_predict <- function(folds, X_y) {

  X <- X_y$X
  y <- X_y$y

  if(is.list(folds)) {
    if(!purrr::is_integer(folds[[1]])) {
      stop("Fold-List must contain integers")
    }
    df <- purrr::map_df(.x = folds, .f = train_predict_helper, X = X, y = y, .id = "fold")
    df <- df %>% dplyr::rename("yes" = predictor, "obs" = response)
    return(df)
  } else if (is.numeric(folds)){
    train_predict_helper(idx = folds, X = X, y = y)
  } else {
    stop("Check Folds, I assume they do not contain integers")
  }
}

# #' Wrapper to predict over all folds (train)
# #'
# #' @param folds A list with k * times integer vectors  containing the fold index
# #' @param X_y A list with predictor matrix and output vector
# #'
# #' @return A list
# #' @export
# #'
# #' @examples
# #' data(pedis)
# #' X_y <- prepare_X_y(data = pedis, outcome = "minor_amputation", vars = c("p", "d"))
# #' folds <- create_folds(y = X_y$y)
# #' map_train_predict(folds = folds, X_y = X_y)
# map_train_predict <- function(folds, X_y, fun = train_predict) {
#   X <- X_y$X
#   y <- X_y$y
#
# }

# testPredict <- test_predict(fit)

#   testROC <- roc(response = testPredict$obs, predictor = testPredict$yes)
#   trainROC <- roc(response = trainPredict$response, trainPredict$predictor)
#
#   # Create Plot
#   plot_train_test <- function(trainROC, testROC) {
#     smooth(trainROC) %>% plot(col = "blue", legacy.axis = FALSE, las = 1, ylim = c(0, 1))
#     smooth(testROC) %>% plot(add = TRUE, col = "red", lty = 2)
#     legend(x = .4, y =  .4, legend = c("Training ROC", "Test ROC"), col = c("blue", "red"), lty = 1:2, lwd = 3)
#     abline(h = 1, lty = 3, col = "grey")
#     abline(h = 0, lty = 3, col = "grey")
#   }
#
#
#   # # LogReg on training data
#   # # Return numeric AUC result
#   trainAUC <- function(idx, X, y) {
#     fit <- glm(y[idx] ~ as.matrix(X[idx, ]), family = "binomial")
#     y_pred <- predict(fit, type = "response")
#     roc_obj <- pROC::roc(y[idx], y_pred)
#     auc_result <- pROC::auc(roc_obj)
#     return(auc_result)
#   }
#
#   # # Training AUC metric
#   # ## A vector with AUC for every Training fold
#   train_auc <- map_dbl(folds, trainAUC, X, y)
#   #
#   # # Extracting test metrics from train object
#   test_auc <- pluck(fit, "resample", "ROC")
#   names(test_auc) <- pluck(fit, "resample", "Resample")
#
#   # Summary of train and test metrics
#   test_train_auc <- tibble(train_auc, test_auc)
#
#   differences <- test_train_auc %>% gather() %>% diff_table()
#
#   if (bayes) {
#     # Compare Test and Training AUC-Metric using Bayesian Methods
#     b_t_test <- BayesianFirstAid::bayes.t.test(test_auc, train_auc, n.iter = 3000)
#   } else {
#     b_t_test <- NULL
#   }
#   # Logistic Regression
#   LReg_full_fit <- glm(y ~ as.matrix(X), family = "binomial")
#
#   # Create Summary Table
#   train_test_diff <- test_train_auc %>%  create_summary_table()
#
#   list(full_fit  = LReg_full_fit,
#        bayes_ttest = b_t_test,
#        test_train_auc = test_train_auc,
#        plot_rocs =  function() plot_train_test(trainROC, testROC),
#        differences = differences,
#        train_test_diff = train_test_diff
#   )
#
# }
#