knitr::opts_chunk$set(echo = TRUE) library(data.table) library(FDclassifieR) library(dplyr)
# Load the training data and the per-model prediction files, then assemble
# the prediction matrix (one column per base model) used by the ensembles.

# Directory holding train.csv and one prediction file per base model.
dirname <- 'data_porto_maxGini_0.29'

# Prediction files are named "<number starting with 0>.csv". The pattern is
# anchored and the dot escaped so that only such files are picked up.
flist <- list.files(path = dirname, pattern = '^0.*\\.csv$')
stopifnot(length(flist) > 0)

# Map the number encoded in each file name to (value + 1) / 2.
# NOTE(review): the file names are presumably Gini scores, which would make
# this the usual Gini -> AUC conversion -- confirm.
auclist <- (as.numeric(sub('\\.csv$', '', flist)) + 1) / 2

# calculate prevalence from train data set
train <- fread(file.path(dirname, 'train.csv'), na.strings = c("-1", "-1.0"))
#train <- train %>%
#  mutate_at(vars(ends_with("cat")), as.factor) %>%
#  mutate_at(vars(ends_with("bin")), as.logical) %>%
#  mutate(target = as.factor(target))

# Fraction of training rows whose target equals 1. The original expression
# referenced an undefined `y`; the labels live in train$target.
rho <- mean(train$target == 1)

# create prediction matrix
# The matrix is sized from the first file instead of a hard-coded row count,
# and every later file must have the same number of rows.
predictions <- NULL
for (i in seq_along(flist)) {
  # `tmp` is deliberately left holding the last file read: the submission
  # steps further down take their ids from tmp$id.
  tmp <- fread(file.path(dirname, flist[i]))
  if (is.null(predictions)) {
    predictions <- matrix(NA_real_, nrow = nrow(tmp), ncol = length(flist))
  }
  stopifnot(nrow(tmp) == nrow(predictions))
  # Second column of each file is stored as that model's column.
  predictions[, i] <- tmp[[2]]
}

# Label the columns "A<value>" using the numbers computed in auclist.
colnames(predictions) <- paste0('A', auclist)
dim(predictions)
# Show the first rows of the prediction matrix.
head(predictions)
# Histogram of the values in the 7th prediction column
# (requires at least seven prediction files to have been loaded).
hist(predictions[, 7])
# Run summa::summa on the prediction matrix with the "rank" option.
# NOTE(review): presumably this selects rank-based aggregation -- confirm
# against the summa package documentation.
library(summa)
ksumma <- summa::summa(predictions, "rank")

# Print the largest and smallest value of the estimated_rank slot, then plot
# the distribution of those values.
print(max(ksumma@estimated_rank))
print(min(ksumma@estimated_rank))
hist(ksumma@estimated_rank)
# Min-max scale the summa ensemble ranks to [0, 1] and write them out as a
# submission file. Ids come from `tmp`, the last prediction file read above.
rank_limits <- range(ksumma@estimated_rank)
rmin <- rank_limits[1]
rmax <- rank_limits[2]

submit <- data.table(
  id = tmp$id,
  target = (ksumma@estimated_rank - rmin) / (rmax - rmin)
)
fwrite(submit, file = 'submission.csv')
# Create the FDclassifieR ensemble object from the prediction matrix and run
# predict_performance on it with the per-model values in auclist, the
# prevalence rho and alpha = 1.
fde1 <- fdensemble(predictions) %>%
  predict_performance(auclist, rho, alpha = 1)
# Print the largest and smallest value of the ensemble's estimated_rank slot,
# then plot the distribution of those values.
print(max(fde1@estimated_rank))
print(min(fde1@estimated_rank))
hist(fde1@estimated_rank)
# Histogram of the 4th column of the predictions slot stored in the ensemble
# object (requires at least four base models).
hist(fde1@predictions[ ,4])
# Write the ensemble's estimated_prob values as a submission file.
# Ids come from `tmp`, the last prediction file read above. This writes to
# the same 'submission.csv' path as the earlier summa submission.
submit <- data.table(
  id = tmp$id,
  target = fde1@estimated_prob
)
fwrite(x = submit, file = 'submission.csv')
# Correlation plot for the ensemble, called with class_flag = 'positive'.
# NOTE(review): presumably restricts the plot to the positive class -- confirm
# against the FDclassifieR documentation.
plot_cor(fde1, class_flag='positive')
# Pairwise correlation matrix of the columns of the rank_matrix slot.
cor(fde1@rank_matrix)
# Build a 30000-row numeric design matrix from the training data.
# Fixed seed so the random sample below is reproducible.
set.seed(1024)

# remove complete_rate < 0.9 columns
tmp <- train[, -c(23, 26, 28)]
# Keep only the rows that are complete in the remaining columns. The original
# subset `train` here, which silently brought the dropped columns (and their
# missing values) back, so model.matrix() later discarded the rows with NA.
tmp <- tmp[complete.cases(tmp), ]

# change categorical variables to one-hot vectors
# Columns whose name ends in "_cat" are converted to factors so that
# model.matrix() expands them into indicator columns.
cat_vars <- grep('_cat$', names(tmp), value = TRUE)
tmp <- tmp %>%
  sample_n(30000) %>%
  mutate_at(.vars = cat_vars, .funs = as.factor)

# `- 1` drops the intercept column from the design matrix.
tmp <- model.matrix(~ . - 1, data = tmp)
# Turn the design matrix into a data frame and recode the numeric target as a
# two-level factor: 'filed' where target == 1, 'not' otherwise.
tmp <- as.data.frame(tmp)
tmp$target <- as.factor(ifelse(tmp$target == 1, 'filed', 'not'))

# Split on the target with p = .75: the selected rows form the training set
# and the remaining rows the test set.
inTraining0 <- createDataPartition(tmp$target, p = .75, list = FALSE)
training <- tmp[inTraining0, ]
testing <- tmp[-inTraining0, ]

# Labels of the held-out rows. The original indexed an undefined `y`; the
# labels matching `inTraining0` are those of `tmp`, i.e. testing$target.
testingY <- as_label(testing$target)

# Class counts in the training set.
table(training$target)
# Candidate model names (presumably caret method names -- confirm). The vector
# is kept for reference; it is not referenced by the calls below.
model_list <- c(
  'nnet', 'rda', 'svmLinear', 'svmRadial', 'pls', 'knn', 'earth', 'avNNet',
  'mlp', 'nb', 'rf', 'rpart', 'ctree', 'C5.0', 'gbm', 'bayesglm', 'glm',
  'glmnet', 'simpls'
)

# Create a multi-model trainer for three of the models, tagged 'Porto'.
t1 <- mtrainer(c('rda', 'pls', 'glm'), dataInfo = 'Porto')

# Fit the models on the training set using all other columns as predictors.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
t1 <- train(t1, target ~ ., training, update = TRUE)
# Register the 'rda' and 'pls' models on the trainer, fit on the training set
# again, and plot the resulting trainer object.
t1 <- addmodel.mtrainer(t1, c('rda', 'pls'))
t1 <- train(t1, target ~ ., training)
plot(t1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.