library(tidyverse)
devtools::load_all()
# Load the train and test data ----
# The .RData file is expected to define `train_test`, which is indexed below
# as [[1]] (training data) and [[2]] (test data).
# The path is relative to the working directory (the same convention as the
# submission file paths used later in this script) instead of being tied to
# one user's home directory.
train_test_path <- file.path("data", "fraud", "train_test.RData")
if (!file.exists(train_test_path)) {
  stop(
    "Cannot find '", train_test_path,
    "' - run this script from the project root.",
    call. = FALSE
  )
}
load(train_test_path)
train <- prepare(train_test[[1]], target = "isFraud")
# The prepared training data is passed along when preparing the test data.
test <- prepare(train_test[[2]], train, target = "isFraud")
# train %>% select_if(is.character) %>% inspect_cat %>% show_plot
# TODO:
# - ensure recipe can be extended with custom functions
# - test_blind option?... if want to borrow feature information from test
# - comparison of difference in train and test
# - feature generation using grouped summaries
# - conversion of NAs to binary
# - save recipes for repeating on test set - add attribute
# Convert every character column to a factor in both data sets.
train <- train %>% mutate_if(is.character, as.factor)
test <- test %>% mutate_if(is.character, as.factor)
#
# train %>% inspect_types
#
# z <- train %>% select_if(is.logical)
# z$isFraud <- train$isFraud
# z <- z %>% mutate_if(is.logical, as.character)
# z[is.na(z)] <- "missing"
# dfs <- z %>% group_by(isFraud) %>% nest %>% .$data
# x <- inspect_cat(dfs[[1]], dfs[[2]])
# devtools::load_all("/Users/alastairrushworth/Documents/git_repositories/inspectdf")
# Pull the response out as a factor. `make.names()` converts the raw values
# into syntactically valid names, which become the factor levels
# (presumably needed because class probabilities are requested from caret
# further down - confirm).
label <- as.factor(make.names(train[["isFraud"]]))
# Predictors only: every column except the target.
train_d <- select(train, -isFraud)
# Hyper-parameters handed to `catboost.train()` for the final model fit.
fit_params <- list(
  iterations    = 200,
  loss_function = "Logloss",
  border_count  = 32,
  depth         = 5,
  learning_rate = 0.03,
  l2_leaf_reg   = 3.5
)
# Cross-validated tuning of CatBoost through caret ----
library(caret)
library(catboost)

# 2-fold cross-validation with class probabilities requested.
fit_control <- trainControl(
  method = "cv",
  number = 2,
  classProbs = TRUE
)

# Tuning grid: only `depth` varies (6 and 8); every other value is fixed.
grid <- expand.grid(
  depth = c(6, 8),
  learning_rate = 0.1,
  iterations = 100,
  l2_leaf_reg = 1e-3,
  rsm = 0.95,
  border_count = 64
)

# NOTE: `train` is also the name of the training data frame; R skips
# non-function objects when resolving a call, so this still reaches the
# `train()` function.
report <- train(
  train_d, label,
  method = catboost.caret,
  logging_level = "Verbose",
  preProc = NULL,
  tuneGrid = grid,
  trControl = fit_control
)
print(report)

# Unscaled variable importance from the tuned model.
importance <- varImp(report, scale = FALSE)
print(importance)
# Final model fit with the native CatBoost API ----
# `label` is a factor, so its integer codes start at 1. Assuming `isFraud` is
# binary (two levels), subtracting 1 gives the 0/1 encoding that a binary
# `Logloss` target expects, instead of 1/2.
train_pool <- catboost.load_pool(train_d, label = as.integer(label) - 1L)
# Train on `train_pool`; the name `pool` used here previously was never
# defined, so the call failed with "object 'pool' not found".
model <- catboost.train(train_pool, params = fit_params)
# NOTE(review): `test` is passed as-is; confirm its columns match `train_d`.
test_pool <- catboost.load_pool(test)
# Predicted probabilities for the test rows.
preds <- catboost.predict(model, test_pool, prediction_type = "Probability")
# Build the submission: take the sample submission file, overwrite its
# `isFraud` column with the predictions, and write the result out.
ss <- read_csv("data/fraud/sample_submission.csv")
ss[["isFraud"]] <- preds
write_csv(ss, "data/fraud/sub.csv")
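# NOTE: the two lines below are web-page residue from where this script was
# copied, not R code; they are commented out so the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.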