knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(dplyr)
library(devtools)
load_all()

Train data

# data preprocessing

set.seed(200)
train <- as.data.table(readr::read_csv('data/data-springleaf.csv.bz2'))
train$y <- ifelse(train$target == 1, 'Yes', 'No')
train$y <- as.factor(train$y)
train <- train[, -c('ID', 'target1', 'VAR_1427', 'VAR_0847', 'VAR_1428', 'VAR0924')]

not_any_na <- function(x) all(!is.na(x))
train <- train %>% 
  select(where(not_any_na)) 

#head(train)
# clean up train data
# remove columns with missing data (NA, 99999999X, 999X, 99X)

feat_names <- colnames(train[,-c('y')])
rm_names <- c()

count <- 0
for (f in feat_names) {
  coldata <- train[[f]]
  if (any(coldata < 0)) {
    #print(paste0(f, '-', min(coldata)))
    count <- count + 1
    rm_names <- c(rm_names, f)
    next
  }
  if (any(coldata > 999999990)) {
    #print(paste0(f, '-', max(coldata)))
    count <- count + 1
    rm_names <- c(rm_names, f)
    next
  }
  if (sum(coldata > 9990 & coldata < 9999) > 20) {
    #print(paste0(f, '-', max(coldata)))
    count <- count + 1
    rm_names <- c(rm_names, f)
    next
  }
  if (sum(coldata > 990 & coldata < 999) > 20) {
    #print(paste0(f, '-', max(coldata)))
    count <- count + 1
    rm_names <- c(rm_names, f)
    next
  }
  if (sum(coldata > 90 & coldata < 99) > 20) {
    #print(paste0(f, '-', max(coldata)))
    count <- count + 1
    rm_names <- c(rm_names, f)
    next
  }
}

train_new <- train[,-rm_names, with=F]
table(train_new$y)
rho <- 5221/(16779+5221)
rho
set.seed(200)

folds <- createFolds(train_new$y, k=22, list = TRUE)
traininglist <- lapply(folds, function(x) train_new[x, ])

testing  <- traininglist[[22]]
testingY <- to_label(testing$y, class1='Yes')
model_list <- c('rmda', 'pls', 'rda', 'svmLinear', 'svmRadial', 'knn', 'earth', 'mlp', 'rf', 'gbm', 'ctree', 'C5.0', 'bayesglm', 'glm', 'glmnet', 'simpls', 'dwdRadial', 'xgbTree', 'xgbLinear', 'nnet')
t1 <- mtrainer(model_list, dataInfo = 'SpringLeaf')

# rpart is not working -> add gbm
# avNNet, nb, rotationForest is not working
#t1 <- mtrainer(c('knn', 'simpls', 'pls'), dataInfo = 'SpringLeaf')
t1 <- train.mtrainer(t1, y~., traininglist, update=F)
t1 <- predict.mtrainer(t1, newdata=testing)
auclist <- apply(t1$predictions, 2, auc_rank, testingY)
fde1 <- fde(t1$predictions)
fde1 <- predict_performance(fde1, auclist, attr(testingY, 'rho'))
fde2 <- calculate_performance(fde1, testingY)
plot_cor(fde1, legend_flag = T, filename='results/SLM_cor_fde1.pdf')
plot_cor(fde2, legend_flag=T, filename='results/SLM_cor_fde2.pdf')
g1 <- plot_performance(fde2, nmethod_list=c(3, 5, 7), nsample=200, filename='results/SLM_perf_fde2.pdf')
g1
g2 <- plot_performance_nmethods(fde2, nmethod_list=3:10, nsample=200, filename='results/SLM_perf_nmethod_fde2_STD.pdf')
g2
plot_single(fde2, 'roc')
plot_ensemble(fde2, filename='results/SLM_ensemble_fde2.pdf')
library(ggpubr)
g <- ggarrange(g1, g2, labels=c('A', 'B'), ncol=2, nrow=1, widths = c(2.7,1))
ggsave("results/Figure4a.pdf", width=15, height=3.8)
g
plot_FDstatistics(fde2)
#plot_single(fde1, 'score')
store.mtrainer(t1, 'springleaf_pre.RData')
saveRDS(testingY, 'springleaf_y.RData')


sungcheolkim78/FiDEL documentation built on Nov. 13, 2024, 7:58 a.m.