knitr::opts_chunk$set(echo = TRUE)
library(data.table)

Categorical Feature Encoding Challenge II

# Directory holding the training table plus one prediction file per model.
# The prediction files are expected to be named after a number, e.g.
# "0.78.csv", because the file stem is converted with as.numeric() below.
dirname <- "data_cat_maxAUC_0.78"

# Anchor the pattern and escape the extension dot: the previous pattern
# '0.*.csv' also matched any file that merely contained a "0" somewhere
# before "csv" (e.g. "train_2020.csv"), which then produced NA scores.
flist <- list.files(path = dirname, pattern = "^0.*\\.csv$")
# Strip only a literal trailing ".csv"; the file stem is the model's score.
auclist <- as.numeric(sub("\\.csv$", "", flist))
stopifnot(length(flist) > 0)

# Training labels and the fraction of rows whose target equals 1.
train <- fread(file.path(dirname, "train.csv"))
y <- as.factor(train$target)
rho <- sum(y == 1) / length(y)

# Read the second column of every prediction file and bind the columns into
# one matrix (rows = samples, columns = files in `flist` order). The row count
# now comes from the files instead of a hard-coded 400000, which silently
# recycled any file whose length happened to divide 400000.
pred_cols <- lapply(flist, function(f) fread(file.path(dirname, f))[[2]])
stopifnot(length(unique(lengths(pred_cols))) == 1L)
predictions <- do.call(cbind, pred_cols)

# Quick sanity checks on the assembled matrix.
dim(predictions)
head(predictions)
hist(predictions[, 1])
library(summa)
# Run summa() with the "rank" option on every prediction column and look at
# the distribution of the resulting estimated ranks.
ksumma <- summa::summa(predictions, "rank")
hist(ksumma@estimated_rank)

# NOTE(review): fdensemble(), predict_performance() and plot_cor() are not
# defined or attached in the visible part of this file - presumably they come
# from the FiDEL package; confirm it is loaded before this chunk runs.
# Only the first 8 prediction columns (and their scores) are used, so at
# least 8 prediction files must have been read.
fde1 <- fdensemble(predictions[, 1:8])
fde1 <- predict_performance(fde1, auclist[1:8], rho, alpha = 1)
plot_cor(fde1, class_flag = 'positive')
hist(fde1@predictions[, 4])

# Write the submission file. The id range is hard-coded to 600000..999999
# (400000 ids), so fde1@estimated_rank must have that many entries.
submit <- data.table(id = seq(600000, 999999), target = fde1@estimated_rank)
fwrite(submit, file = 'submission.csv')

# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
plot_cor(fde1, legend_flag = TRUE)
cor(fde1@rank_matrix)

Check r_summa

library(summa)

# Build a prediction set with create_predictions() and print its structure.
data_binary <- create_predictions(3000, 30, 0.3, "rank")
str(data_binary)

# Construct the summa object from the predictions, evaluate it against the
# `actual_labels` element of the same data set, and plot the result.
summ <- calculate_performance(
  summa(data_binary$predictions, "rank"),
  data_binary$actual_labels
)
summa_plot(summ)

Train data

# Load the compressed competition data as a data.table.
train <- as.data.table(readr::read_csv('data-CATII.csv.bz2'))

# Recode the target: rows equal to 1 become "Yes", everything else "No".
train$target <- ifelse(train$target == 1, "Yes", "No")
# Keep a random subsample of 5000 rows. No seed is set here, so the sample
# (and everything derived from it) differs between runs.
train <- train[sample(seq_along(train$target), 5000), ]
table(train$target)

# 75/25 split of the subsample into training and testing rows.
inTraining0 <- createDataPartition(train$target, p = .75, list = FALSE)
training <- train[inTraining0, ]
testing <- train[-inTraining0, ]

# Labels for the held-out rows. This previously used train$target, i.e. the
# labels of all 5000 sampled rows, which cannot line up with predictions made
# on `testing`.
# NOTE(review): target was recoded to "Yes"/"No" above, yet class1 is still 1;
# confirm what as_label() expects for class1.
testingY <- as_label(testing$target, class1 = 1)

# Candidate model names; only 'rda' and 'knn' are passed to mtrainer() below.
model_list <- c(
  'nnet', 'rda', 'svmLinear', 'svmRadial', 'pls', 'knn', 'earth', 'avNNet',
  'mlp', 'nb', 'rf', 'rpart', 'ctree', 'C5.0', 'gbm', 'bayesglm', 'glm',
  'glmnet', 'simpls'
)
t1 <- mtrainer(c('rda', 'knn'), dataInfo = 'CATII')
# `train` is also the name of the data.table above; in call position R skips
# non-function objects, so this resolves to the train() function.
t1 <- train(t1, target ~ ., training, update = TRUE)


sungcheolkim78/FiDEL documentation built on Nov. 13, 2024, 7:58 a.m.