knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cache = F, 
  fig.width = 7,
  fig.height = 7
)

Generate data

Dataset

nobs = 200
nvar = 20
nreal = 10

set.seed(42)

x <- matrix(rnorm(nobs * nvar), nobs, nvar)
beta <- c(2+rnorm(nreal, 0, 1), rep(0, nvar - nreal))
y_cont <- c(t(beta) %*% t(x)) + rnorm(nobs, sd = 3)
y_logit <- exp(y_cont)/(1+exp(y_cont))
y_class <- sapply(y_logit, function(x) rbinom(n = 1, size = 1, prob = x))
x <- dplyr::tibble(y = forcats::fct_rev(factor(y_class)), 
              study_id = 1:nobs,
              dplyr::as_tibble(x))

features <- colnames(x[, which(!colnames(x) %in% c("y", "study_id"))])

Preliminary diagnosis

prel_diag_metrics <- dplyr::tibble(sens = 0.7, spec = 0.7, ppv = 0.5)

Prepare work environment

library(tidymodels)
library(tidyverse)
library(nbs)
theme_set(theme_bw() + theme(plot.title = element_text(hjust = 0.5)))

# Import data
# load("data.rds")
# - x: tibble or data frame with the following columns
#   - ID: preferably, "study_id"fin
#   - y (factor, with the event as the first level)
#   - all predictors/features
# - features: character vector of all features of interest
# --> for now, we will use the multi_model_tibble dummy dataset

# Define DLCV parameters
group <- "study_id" # NA if no group
strata <- "y"

if(is.na(group)) {
  form <- as.formula(paste0("y ~ ", paste(features, collapse = " + ")))  
} else {
  form <- as.formula(paste0("y ~ ", group, " + ", paste(features, collapse = " + ")))  
}

k <- 3
form

Initialize DLCV

if(is.na(group)) {
  folds <- createFolds(x, k, group, strata) # TODO: edit group parameter 
} else{
  folds <- createFolds(x, k, group, strata) 
}

# library(doParallel)
# cl <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
# registerDoParallel(cl)
# --> without CPU ~ 14%
# --> with CPU ~ 40%

# Prepare recipe
rec <- recipes::recipe(form, 
                       data = x) %>% 
  update_role(!!group, new_role = "id")

rec
rec %>% recipes::prep() %>% recipes::bake(new_data = NULL) %>% head()

L1

Boruta in combination with:

SVM (polynomial)

K nearest neighbor

Random Forest

XGBoost

Select models and run DLCV

rf_hp_grid <- grid_regular(trees(range = c(100, 300)),
                               mtry(range = c(2, 4)),
                               min_n(range = c(5, 15)),
                               levels = 3)

models <- c("lr", "l1", "bor_lr") # "bor_rf", "rf_no_hp", "rf")

res <- tibble(model = models) %>% 
  mutate(dlcv = map(model, ~ {
    print(.x)
    if("lr" == .x)         x <- dlcvLr(folds, rec)
    if("l1" == .x)         x <- dlcvL1(folds, rec, features)
    if("bor_lr" == .x)     x <- dlcvBoruta(folds, rec, features)
    if("svm_no_hp" == .x)  x <- dlcvSvmNoHp(folds, rec)
    if("svm" == .x)        x <- dlcvSvm(folds, rec)
    if("knn_no_hp" == .x)  x <- dlcvKnnNoHp(folds, rec)
    if("knn" == .x)        x <- dlcvKnn(folds, rec)
    if("bor_rf" == .x)     x <- dlcvRf(folds, rec, rf_hp_grid, boruta = T)
    if("rf_no_hp" == .x)   x <- dlcvRfNoHp(folds, rec)
    if("rf" == .x)         x <- dlcvRf(folds, rec, rf_hp_grid)
    return(x)
  }))

Combine outputs

# Compute AUC
res <- res %>% 
  mutate(auc = map2(model, dlcv, ~ computeCvAUC(folds = .y, model = .x)))

# Compute ROC
res <- res %>% 
  mutate(trn_preds = map(dlcv, ~ .x %>% select(trn_preds) %>% unnest(trn_preds)),
         tst_preds = map(dlcv, ~ .x %>% select(tst_preds) %>% unnest(tst_preds)),
         trn_roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1)),
         tst_roc = map(tst_preds, ~ roc_curve(.x, y, .pred_1)))

Analyze data

DLCV performance

# Show AUC
res %>% 
  select(auc) %>% 
  unnest(auc) %>% 
  mutate(txt = paste0(round(mean_auc, 2), " (", 
                      round(ci_lower, 2), "-", 
                      round(ci_upper, 2), ")")) %>% 
  select(-mean_auc, -ci_lower, -ci_upper, -confidence) %>% 
  pivot_wider(names_from = "part", values_from = "txt")

# Show train/test ROCs
res %>% 
  select(model, trn_preds) %>% 
  mutate(roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1))) %>% 
  select(model, roc) %>% 
  unnest(roc) %>% 
  plotRoc(group = "model", title = NULL)

plotDisMet(x = res, bl = prel_diag_metrics, title = "10CV LR", group = "model", legend = T)

Individual model performance

fipLr(res$dlcv[[1]], plot = T)
fipL1(res$dlcv[[2]], min_n = 3, plot = T)
fipBoruta(res$dlcv[[3]], plot = T)

plotDisMet(x = res$dlcv[[3]], 
           bl = prel_diag_metrics, 
           title = "10CV bor random forest",
           group = "id", 
           legend = T)

Save output

# Save object
# save(res, file = "tmp")

Sanity checks

LR base R..

CV

lr_base_folds <- folds %>% 
  mutate(lr_base_model = map(splits, ~ {
    glm(formula = form, 
        data = .x %>% 
          analysis() %>% 
          mutate(y = if_else(y == "1", 1, 0)), 
        family = binomial)
    }))

summary(lr_base_folds$lr_base_model[[1]])$coefficients[1:5, 1]
tidy(lr_base_folds$lr_base_model[[1]])
# Importance
lr_base_folds <- lr_base_folds %>% 
  mutate(coef_estimates = map(lr_base_model, tidy),
         coef_estimates = map(coef_estimates, ~ {
           .x %>% 
             transmute(feature = term, 
                       estimate = estimate) %>% # tidymodels bug
             arrange(-estimate) %>%
             filter(estimate != 0.0 & feature != "(Intercept)")
         }))

lr_base_folds_feature_counts <- lr_base_folds %>% 
  select(coef_estimates) %>% 
  unnest(coef_estimates) %>% 
  with_groups(feature, nest) %>% 
  mutate(mean_coef = map_dbl(data, ~ mean(.x$estimate))) %>% 
  arrange(mean_coef)

lr_base_folds_feature_counts %>% 
  mutate(feature = factor(feature, levels = lr_base_folds_feature_counts$feature)) %>% 
  unnest(data) %>% 
  ggplot(aes(x = feature, y = estimate)) +
  geom_boxplot(aes(y = estimate)) +
  geom_point() +
  labs(x = "Feature",
       y = "Importance (coefficient)") +
  coord_flip()


mikeniemant/nbs documentation built on June 23, 2022, 4:52 a.m.