knitr::opts_chunk$set( collapse = TRUE, comment = "#>", cache = F, fig.width = 7, fig.height = 7 )
nobs = 200 nvar = 20 nreal = 10 set.seed(42) x <- matrix(rnorm(nobs * nvar), nobs, nvar) beta <- c(2+rnorm(nreal, 0, 1), rep(0, nvar - nreal)) y_cont <- c(t(beta) %*% t(x)) + rnorm(nobs, sd = 3) y_logit <- exp(y_cont)/(1+exp(y_cont)) y_class <- sapply(y_logit, function(x) rbinom(n = 1, size = 1, prob = x)) x <- dplyr::tibble(y = forcats::fct_rev(factor(y_class)), study_id = 1:nobs, dplyr::as_tibble(x)) features <- colnames(x[, which(!colnames(x) %in% c("y", "study_id"))])
prel_diag_metrics <- dplyr::tibble(sens = 0.7, spec = 0.7, ppv = 0.5)
library(tidymodels) library(tidyverse) library(nbs) theme_set(theme_bw() + theme(plot.title = element_text(hjust = 0.5))) # Import data # load("data.rds") # - x: tibble or data frame with the following columns # - ID: preferably, "study_id"fin # - y (factor, with the event as the first level) # - all predictors/features # - features: character vector of all features of interest # --> for now, we will use the multi_model_tibble dummy dataset # Define DLCV parameters group <- "study_id" # NA if no group strata <- "y" if(is.na(group)) { form <- as.formula(paste0("y ~ ", paste(features, collapse = " + "))) } else { form <- as.formula(paste0("y ~ ", group, " + ", paste(features, collapse = " + "))) } k <- 3 form
if(is.na(group)) { folds <- createFolds(x, k, group, strata) # TODO: edit group parameter } else{ folds <- createFolds(x, k, group, strata) } # library(doParallel) # cl <- makePSOCKcluster(parallel::detectCores(logical = FALSE)) # registerDoParallel(cl) # --> without CPU ~ 14% # --> with CPU ~ 40% # Prepare recipe rec <- recipes::recipe(form, data = x) %>% update_role(!!group, new_role = "id") rec rec %>% recipes::prep() %>% recipes::bake(new_data = NULL) %>% head()
L1
glmnet
packageBoruta in combination with:
SVM (polynomial)
e1071
K nearest neighbor
kknn
Random Forest
randomForest
, and othersXGBoost
xgboost
rf_hp_grid <- grid_regular(trees(range = c(100, 300)), mtry(range = c(2, 4)), min_n(range = c(5, 15)), levels = 3) models <- c("lr", "l1", "bor_lr") # "bor_rf", "rf_no_hp", "rf") res <- tibble(model = models) %>% mutate(dlcv = map(model, ~ { print(.x) if("lr" == .x) x <- dlcvLr(folds, rec) if("l1" == .x) x <- dlcvL1(folds, rec, features) if("bor_lr" == .x) x <- dlcvBoruta(folds, rec, features) if("svm_no_hp" == .x) x <- dlcvSvmNoHp(folds, rec) if("svm" == .x) x <- dlcvSvm(folds, rec) if("knn_no_hp" == .x) x <- dlcvKnnNoHp(folds, rec) if("knn" == .x) x <- dlcvKnn(folds, rec) if("bor_rf" == .x) x <- dlcvRf(folds, rec, rf_hp_grid, boruta = T) if("rf_no_hp" == .x) x <- dlcvRfNoHp(folds, rec) if("rf" == .x) x <- dlcvRf(folds, rec, rf_hp_grid) return(x) }))
# Compute AUC res <- res %>% mutate(auc = map2(model, dlcv, ~ computeCvAUC(folds = .y, model = .x))) # Compute ROC res <- res %>% mutate(trn_preds = map(dlcv, ~ .x %>% select(trn_preds) %>% unnest(trn_preds)), tst_preds = map(dlcv, ~ .x %>% select(tst_preds) %>% unnest(tst_preds)), trn_roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1)), tst_roc = map(tst_preds, ~ roc_curve(.x, y, .pred_1)))
# Show AUC res %>% select(auc) %>% unnest(auc) %>% mutate(txt = paste0(round(mean_auc, 2), " (", round(ci_lower, 2), "-", round(ci_upper, 2), ")")) %>% select(-mean_auc, -ci_lower, -ci_upper, -confidence) %>% pivot_wider(names_from = "part", values_from = "txt") # Show train/test ROCs res %>% select(model, trn_preds) %>% mutate(roc = map(trn_preds, ~ roc_curve(.x, y, .pred_1))) %>% select(model, roc) %>% unnest(roc) %>% plotRoc(group = "model", title = NULL) plotDisMet(x = res, bl = prel_diag_metrics, title = "10CV LR", group = "model", legend = T)
fipLr(res$dlcv[[1]], plot = T) fipL1(res$dlcv[[2]], min_n = 3, plot = T) fipBoruta(res$dlcv[[3]], plot = T) plotDisMet(x = res$dlcv[[3]], bl = prel_diag_metrics, title = "10CV bor random forest", group = "id", legend = T)
# Save object # save(res, file = "tmp")
lr_base_folds <- folds %>% mutate(lr_base_model = map(splits, ~ { glm(formula = form, data = .x %>% analysis() %>% mutate(y = if_else(y == "1", 1, 0)), family = binomial) })) summary(lr_base_folds$lr_base_model[[1]])$coefficients[1:5, 1] tidy(lr_base_folds$lr_base_model[[1]])
# Importance lr_base_folds <- lr_base_folds %>% mutate(coef_estimates = map(lr_base_model, tidy), coef_estimates = map(coef_estimates, ~ { .x %>% transmute(feature = term, estimate = estimate) %>% # tidymodels bug arrange(-estimate) %>% filter(estimate != 0.0 & feature != "(Intercept)") })) lr_base_folds_feature_counts <- lr_base_folds %>% select(coef_estimates) %>% unnest(coef_estimates) %>% with_groups(feature, nest) %>% mutate(mean_coef = map_dbl(data, ~ mean(.x$estimate))) %>% arrange(mean_coef) lr_base_folds_feature_counts %>% mutate(feature = factor(feature, levels = lr_base_folds_feature_counts$feature)) %>% unnest(data) %>% ggplot(aes(x = feature, y = estimate)) + geom_boxplot(aes(y = estimate)) + geom_point() + labs(x = "Feature", y = "Importance (coefficient)") + coord_flip()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.