knitr::opts_chunk$set(echo = TRUE) library(data.table) library(dplyr) library(devtools) load_all()
# data preprocessing set.seed(200) train <- as.data.table(readr::read_csv('data/data-springleaf.csv.bz2')) train$y <- ifelse(train$target == 1, 'Yes', 'No') train$y <- as.factor(train$y) train <- train[, -c('ID', 'target1', 'VAR_1427', 'VAR_0847', 'VAR_1428', 'VAR0924')] not_any_na <- function(x) all(!is.na(x)) train <- train %>% select(where(not_any_na)) #head(train)
# clean up train data # remove columns with missing data (NA, 99999999X, 999X, 99X) feat_names <- colnames(train[,-c('y')]) rm_names <- c() count <- 0 for (f in feat_names) { coldata <- train[[f]] if (any(coldata < 0)) { #print(paste0(f, '-', min(coldata))) count <- count + 1 rm_names <- c(rm_names, f) next } if (any(coldata > 999999990)) { #print(paste0(f, '-', max(coldata))) count <- count + 1 rm_names <- c(rm_names, f) next } if (sum(coldata > 9990 & coldata < 9999) > 20) { #print(paste0(f, '-', max(coldata))) count <- count + 1 rm_names <- c(rm_names, f) next } if (sum(coldata > 990 & coldata < 999) > 20) { #print(paste0(f, '-', max(coldata))) count <- count + 1 rm_names <- c(rm_names, f) next } if (sum(coldata > 90 & coldata < 99) > 20) { #print(paste0(f, '-', max(coldata))) count <- count + 1 rm_names <- c(rm_names, f) next } } train_new <- train[,-rm_names, with=F]
table(train_new$y)
rho <- 5221/(16779+5221) rho
set.seed(200) folds <- createFolds(train_new$y, k=22, list = TRUE) traininglist <- lapply(folds, function(x) train_new[x, ]) testing <- traininglist[[22]] testingY <- to_label(testing$y, class1='Yes')
model_list <- c('rmda', 'pls', 'rda', 'svmLinear', 'svmRadial', 'knn', 'earth', 'mlp', 'rf', 'gbm', 'ctree', 'C5.0', 'bayesglm', 'glm', 'glmnet', 'simpls', 'dwdRadial', 'xgbTree', 'xgbLinear', 'nnet') t1 <- mtrainer(model_list, dataInfo = 'SpringLeaf') # rpart is not working -> add gbm # avNNet, nb, rotationForest is not working #t1 <- mtrainer(c('knn', 'simpls', 'pls'), dataInfo = 'SpringLeaf')
t1 <- train.mtrainer(t1, y~., traininglist, update=F)
t1 <- predict.mtrainer(t1, newdata=testing) auclist <- apply(t1$predictions, 2, auc_rank, testingY)
fde1 <- fde(t1$predictions) fde1 <- predict_performance(fde1, auclist, attr(testingY, 'rho'))
fde2 <- calculate_performance(fde1, testingY)
plot_cor(fde1, legend_flag = T, filename='results/SLM_cor_fde1.pdf')
plot_cor(fde2, legend_flag=T, filename='results/SLM_cor_fde2.pdf')
g1 <- plot_performance(fde2, nmethod_list=c(3, 5, 7), nsample=200, filename='results/SLM_perf_fde2.pdf') g1
g2 <- plot_performance_nmethods(fde2, nmethod_list=3:10, nsample=200, filename='results/SLM_perf_nmethod_fde2_STD.pdf') g2
plot_single(fde2, 'roc')
plot_ensemble(fde2, filename='results/SLM_ensemble_fde2.pdf')
library(ggpubr) g <- ggarrange(g1, g2, labels=c('A', 'B'), ncol=2, nrow=1, widths = c(2.7,1)) ggsave("results/Figure4a.pdf", width=15, height=3.8) g
plot_FDstatistics(fde2)
#plot_single(fde1, 'score') store.mtrainer(t1, 'springleaf_pre.RData') saveRDS(testingY, 'springleaf_y.RData')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.