In mikeniemant/nbs: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cache = F, 
  fig.width = 7,
  fig.height = 7
)

Generate data

Dataset

nobs = 200
nvar = 20
nreal = 10

set.seed(42)

x <- matrix(rnorm(nobs * nvar), nobs, nvar)
beta <- c(2+rnorm(nreal, 0, 1), rep(0, nvar - nreal))
y_cont <- c(t(beta) %*% t(x)) + rnorm(nobs, sd = 3)
y_logit <- exp(y_cont)/(1+exp(y_cont))
y_class <- sapply(y_logit, function(x) rbinom(n = 1, size = 1, prob = x))
x <- dplyr::tibble(y = forcats::fct_rev(factor(y_class)), 
              study_id = 1:nobs,
              dplyr::as_tibble(x))

features <- colnames(x[, which(!colnames(x) %in% c("y", "study_id"))])

Preliminary diagnosis

prel_diag_metrics <- dplyr::tibble(sens = 0.7, spec = 0.7, ppv = 0.5)

Prepare work environment

library(tidymodels)
library(tidyverse)
library(nbs)
theme_set(theme_bw() + theme(plot.title = element_text(hjust = 0.5)))

# Import data
# load("data.rds")
# - x: tibble or data frame with the following columns
#   - ID: preferably, "study_id"fin
#   - y (factor, with the event as the first level)
#   - all predictors/features
# - features: character vector of all features of interest
# --> for now, we will use the multi_model_tibble dummy dataset

# Define DLCV parameters
group <- "study_id" # NA if no group
strata <- "y"

if(is.na(group)) {
  form <- as.formula(paste0("y ~ ", paste(features, collapse = " + ")))  
} else {
  form <- as.formula(paste0("y ~ ", group, " + ", paste(features, collapse = " + ")))  
}

k <- 3
form

Initialize DLCV

if(is.na(group)) {
  folds <- createFolds(x, k, group, strata) # TODO: edit group parameter 
} else{
  folds <- createFolds(x, k, group, strata) 
}

# library(doParallel)
# cl <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
# registerDoParallel(cl)
# --> without CPU ~ 14%
# --> with CPU ~ 40%

# Prepare recipe
rec <- recipes::recipe(form, 
                       data = x) %>% 
  update_role(!!group, new_role = "id")

rec
rec %>% recipes::prep() %>% recipes::bake(new_data = NULL) %>% head()

Min lambda
1se lambda
--> Either optimize with tidymodels or R glmnet package

Boruta in combination with:

logistic regression
L1 regularization lambda_1se
L1 regularization lambda_min

SVM (polynomial)

Transformation: standardization
Package: e1071
Hyperparameter:
Cost (C): the cost of predicting a sample within or on the wrong side of the margin. 1
degree: The polynomial degree. 1
scale_factor: A scaling factor for the kernel. 1
margin: The epsilon in the SVM insensitive loss function (regression only)

K nearest neighbor

Transformation: standardization
Package: kknn
Hyperparameter:
neighbors: 5
weight_func: optimal
dist_power: parameter used when calculating the Minkowski distance

Random Forest

Transformation: none
Package: randomForest, and others
Hyperparameter:
ntree/range: number of trees
mtry: number of features used during training one single tree
min_n: minimum number of objects in one leaf
boruta: yes/no, feature selection method performed before training the RF

XGBoost

Transformation: none
Package: xgboost
Hyperparameter:
neighbors: 5
weight_func: optimal
dist_power: parameter used when calculating the Minkowski distance

Select models and run DLCV

rf_hp_grid <- grid_regular(trees(range = c(100, 300)),
                               mtry(range = c(2, 4)),
                               min_n(range = c(5, 15)),
                               levels = 3)

models <- c("lr", "l1", "bor_lr") # "bor_rf", "rf_no_hp", "rf")

res <- tibble(model = models) %>% 
  mutate(dlcv = map(model, ~ {
    print(.x)
    if("lr" == .x)         x <- dlcvLr(folds, rec)
    if("l1" == .x)         x <- dlcvL1(folds, rec, features)
    if("bor_lr" == .x)     x <- dlcvBoruta(folds, rec, features)
    if("svm_no_hp" == .x)  x <- dlcvSvmNoHp(folds, rec)
    if("svm" == .x)        x <- dlcvSvm(folds, rec)
    if("knn_no_hp" == .x)  x <- dlcvKnnNoHp(folds, rec)
    if("knn" == .x)        x <- dlcvKnn(folds, rec)
    if("bor_rf" == .x)     x <- dlcvRf(folds, rec, rf_hp_grid, boruta = T)
    if("rf_no_hp" == .x)   x <- dlcvRfNoHp(folds, rec)
    if("rf" == .x)         x <- dlcvRf(folds, rec, rf_hp_grid)
    return(x)
  }))

Combine outputs

# Compute AUC
res <- res %>% 
  mutate(auc = map2(model, dlcv, ~ computeCvAUC(folds = .y, model = .x)))

# Compute ROC
res <- res %>% 
  mutate(trn_preds = map(dlcv, ~ .x %>% select(trn_preds) %>% unnest(trn_preds)),
         tst_preds = map(dlcv, ~ .x %>% select(tst_preds) %>% unnest(tst_preds)),
         trn_roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1)),
         tst_roc = map(tst_preds, ~ roc_curve(.x, y, .pred_1)))

Analyze data

DLCV performance

# Show AUC
res %>% 
  select(auc) %>% 
  unnest(auc) %>% 
  mutate(txt = paste0(round(mean_auc, 2), " (", 
                      round(ci_lower, 2), "-", 
                      round(ci_upper, 2), ")")) %>% 
  select(-mean_auc, -ci_lower, -ci_upper, -confidence) %>% 
  pivot_wider(names_from = "part", values_from = "txt")

# Show train/test ROCs
res %>% 
  select(model, trn_preds) %>% 
  mutate(roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1))) %>% 
  select(model, roc) %>% 
  unnest(roc) %>% 
  plotRoc(group = "model", title = NULL)

plotDisMet(x = res, bl = prel_diag_metrics, title = "10CV LR", group = "model", legend = T)

Individual model performance

fipLr(res$dlcv[[1]], plot = T)
fipL1(res$dlcv[[2]], min_n = 3, plot = T)
fipBoruta(res$dlcv[[3]], plot = T)

plotDisMet(x = res$dlcv[[3]], 
           bl = prel_diag_metrics, 
           title = "10CV bor random forest",
           group = "id", 
           legend = T)

Save output

# Save object
# save(res, file = "tmp")

Sanity checks

LR base R..

CV

lr_base_folds <- folds %>% 
  mutate(lr_base_model = map(splits, ~ {
    glm(formula = form, 
        data = .x %>% 
          analysis() %>% 
          mutate(y = if_else(y == "1", 1, 0)), 
        family = binomial)
    }))

summary(lr_base_folds$lr_base_model[[1]])$coefficients[1:5, 1]
tidy(lr_base_folds$lr_base_model[[1]])

# Importance
lr_base_folds <- lr_base_folds %>% 
  mutate(coef_estimates = map(lr_base_model, tidy),
         coef_estimates = map(coef_estimates, ~ {
           .x %>% 
             transmute(feature = term, 
                       estimate = estimate) %>% # tidymodels bug
             arrange(-estimate) %>%
             filter(estimate != 0.0 & feature != "(Intercept)")
         }))

lr_base_folds_feature_counts <- lr_base_folds %>% 
  select(coef_estimates) %>% 
  unnest(coef_estimates) %>% 
  with_groups(feature, nest) %>% 
  mutate(mean_coef = map_dbl(data, ~ mean(.x$estimate))) %>% 
  arrange(mean_coef)

lr_base_folds_feature_counts %>% 
  mutate(feature = factor(feature, levels = lr_base_folds_feature_counts$feature)) %>% 
  unnest(data) %>% 
  ggplot(aes(x = feature, y = estimate)) +
  geom_boxplot(aes(y = estimate)) +
  geom_point() +
  labs(x = "Feature",
       y = "Importance (coefficient)") +
  coord_flip()

mikeniemant/nbs documentation built on June 23, 2022, 4:52 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

mikeniemant/nbs
What the Package Does (One Line, Title Case)

In mikeniemant/nbs: What the Package Does (One Line, Title Case)

Generate data

Dataset

Preliminary diagnosis

Prepare work environment

Initialize DLCV

Select models and run DLCV

Combine outputs

Analyze data

DLCV performance

Individual model performance

Save output

Sanity checks

LR base R..

CV

R Package Documentation

Browse R Packages

We want your feedback!

mikeniemant/nbs What the Package Does (One Line, Title Case)

In mikeniemant/nbs: What the Package Does (One Line, Title Case)

Generate data

Dataset

Preliminary diagnosis

Prepare work environment

Initialize DLCV

Select models and run DLCV

Combine outputs

Analyze data

DLCV performance

Individual model performance

Save output

Sanity checks

LR base R..

CV

R Package Documentation

Browse R Packages

We want your feedback!

mikeniemant/nbs
What the Package Does (One Line, Title Case)