# prediction.R

# Machine Learning And Credit Default: An Interactive Analysis In R

#### Author: Patrick Rotter
#### Date: 30/11/2017
#### Description: prediction.R imports data_b.rds (see preparation.R) and creates all necessary models presented in the problem set. 
####              Furthermore, an abundance of alternative models/ implementations are presented.
####              Apart from comments, please see [Info] for further explanations.

##########################
#       packages         ####################################################################################################################
##########################

library(RTutor)

library(glmnet)
library(caret)
library(rpart)
library(randomForest)
library(xgboost)

# To measure the elapsed time
if(!require(tictoc)){
  install.packages("tictoc")
  library(tictoc)
}

# gbm algorithm is based on plyr functions
if(!require(plyr)){
  install.packages("plyr")
  library(plyr)
}
library(dplyr)

if(!require(kernlab)){
  install.packages("kernlab")
  library(kernlab)
}

if(!require(pROC)){
  install.packages("pROC")
  library(pROC)
}

if(!require(caTools)){
  install.packages("caTools")
  library(caTools)
}

# # Optional for multi-core support
# if(!require(doMC)){
#   install.packages("doMC")
#   library(doMC)
# }
# registerDoMC(cores = 8)

# Additional packages for bagging and boosting
# Only required for models which are not featured in the problem set
# (Second part of this script)

# if(!require(party)){
#   install.packages("party")
#   library(party)
# }
# if(!require(gbm)){
#   install.packages("gbm")
#   library(gbm)
# }
# if(!require(ada)){
#   install.packages("ada")
#   library(ada)
# }
# if(!require(adabag)){
#   install.packages("adabag")
#   library(adabag)
# }
# if(!require(e1071)){
#   install.packages("e1071")
#   library(e1071)
# }

# Deal with multiple select statements
select = dplyr::select

##########################
#         Import         ####################################################################################################################
##########################

# Import the prepared loan data set (binary response "default"),
# produced beforehand by preparation.R
data = readRDS("data_b.rds")
  
# [Info] A sparse matrix (data = sparse.model.matrix( ~ ., data)) decreases run time and minimum hardware requirements (RAM) to run the 
#        script. However not all implementations which are part of this script support sparse matrices (e.g. randomForest). Hence, the
#        data set has not been converted to a sparse matrix. The same applies for missing values, in general, one might achieve better results, 
#        utilizing data sets customised to a respective machine learning algorithm instead of a generalized one-for-all data set.

##########################
#         Logging        ####################################################################################################################
##########################

# Start a fresh log file on every run so entries do not accumulate across runs
if(file.exists("prediction_log.txt"))
  file.remove("prediction_log.txt")
# Fix: the header previously read "preparation.R log" (copy-paste leftover
# from preparation.R); this file is prediction.R
cat("prediction.R log \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat("Import [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")

##########################
#        Indexing        ####################################################################################################################
##########################

# Generate the train_index
# [Info] with.random.seed (RTutor) evaluates the call under the given seed,
#        so the partition is reproducible across runs
train_index <- with.random.seed(createDataPartition(data$default, 
                                                    p = 0.8, list = FALSE), seed = 5690)

# Divide our data set into test (20%) and remaining observations (80%)
test <- data[-train_index,]
left <- data[train_index,]

# Generate the tune_index
tune_index <- with.random.seed(createDataPartition(left$default, 
                                                   p = 0.125, list = FALSE), seed = 5690)

# Split our remaining observations into tune (10%) and training data (70%)
# of total observations (0.125 * 0.8 = 0.10 of the full data set)
tune <- left[tune_index,]
train <- left[-tune_index,]

# Clear RAM (the full data set is no longer needed; only the splits are used)
rm(data, tune_index, train_index)
gc()

# Logging
cat("Settings and Indexing [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")

##########################
#   Logistic Regression  ####################################################################################################################
##########################

# Start counting ############################
tic()

# Logistic regression on a hand-picked feature subset
# Inspired by Emekter, et al. (2015)
# NOTE(review): the name `log` shadows base::log() in this session; base::log
# stays callable via its namespace, but consider renaming (kept as-is because
# log.rdata below bundles the object under this name)
log <- glm(default ~ grade_B + grade_C + grade_D + grade_E + grade_F + grade_G + dti + mean_fico_range + revol_util, family = binomial(link=logit), data = left)

# Predict the outcome utilizing our test data
prediction_log <- predict(log, select(test, -default), type="response")

# Compute the ROC curve
roc_log <- roc(response = test$default, predictor = as.numeric(prediction_log))

############################## End counting #
elapsed_log <- toc()
# tictoc returns start/stop timestamps; elapsed seconds = toc - tic
elapsed_log <- elapsed_log$toc-elapsed_log$tic

# Alternatively, calculate the AUC manually as shown in the problem set:

# Order the probabilities decreasingly and maintain the index
prediction_log_ordered <- sort(prediction_log, 
                               decreasing = TRUE, 
                               index.return = TRUE)

# Extract the probabilities and their index separately
probabilities <- as.numeric(prediction_log_ordered$x)
indices <- prediction_log_ordered$ix

# Adjust the true values to the decreasing order
true_value <- test$default[indices]

# Number of test observations
n <- length(true_value)

# Calculate the true and false positive rate while sweeping the
# classification threshold down the ordered predictions
# (treating "Default" as the positive class)
tpr <- cumsum(true_value == "Default") / sum(true_value == "Default")
fpr <- cumsum(true_value == "Paid") / sum(true_value == "Paid")

# Calculate the AUC as a right Riemann sum over the ROC steps
# Width:  fpr[2:n] - fpr[1:(n - 1)]
# Height: tpr[2:n]
# Fix: this was previously written fpr[1:length(true_value)-1], which
# evaluates to fpr[0:(n - 1)] because `:` binds tighter than `-`; it only
# produced the intended fpr[1:(n - 1)] because R silently drops a 0 index.
# The parentheses make the intent explicit (same value as before).
auc <- sum((fpr[2:n] - fpr[1:(n - 1)]) 
           * tpr[2:n])

# [Info] We consider all steps which are required to arrive at
#        the final AUC value, as the times required to train the
#        model and to perform predictions differ significantly 
#        across models

####################################################################################################################

# Start counting ############################
tic()

# Full rank logistic regression model (all available regressors)
log_full <- glm(default ~ ., family = binomial(link=logit), data = left)

# Predict the outcome utilizing our test data
prediction_log_full <- predict(log_full, select(test, -default), type="response")

# Compute the ROC curve
roc_log_full <- roc(response = test$default, predictor = as.numeric(prediction_log_full))

############################## End counting #
elapsed_log_full <- toc()
elapsed_log_full <- elapsed_log_full$toc-elapsed_log_full$tic

# Print AUC 
roc_log$auc  # Area under the curve: 0.7008
roc_log_full$auc # Area under the curve: 0.7339

# Print summary
summary(log)
summary(log_full)

# Logging
cat("Logistic Regression [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_log \n AUC: ", roc_log$auc, "\n TIME: ", elapsed_log, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")
cat(c("auc_log_full \n AUC: ", roc_log_full$auc, "\n TIME: ", elapsed_log_full, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
# [Info] We use save instead of saveRDS, as we bundle multiple objects in a single file
#        You may load any .rdata file with load("yourfilename.rdata")
save(log, log_full, prediction_log, prediction_log_full, roc_log, roc_log_full, elapsed_log, elapsed_log_full, file = "log.rdata")
# [Info] log_ps.rdata omits log_full, log due to size (GitHub <100MB & RAM concerning the host system)
save(prediction_log, prediction_log_full, roc_log, roc_log_full, elapsed_log, elapsed_log_full, file = "log_ps.rdata") 
rm(log, log_full, prediction_log, prediction_log_full, roc_log, roc_log_full, elapsed_log, elapsed_log_full)
gc()

##########################
#   Logit Reg - glmnet   ####################################################################################################################
##########################

# Start counting ############################
tic()

# Utilize our tune data set to determine our tuning parameters alpha and lambda
# (elastic net: alpha mixes ridge/lasso, lambda is the penalty strength)
log_tuned_param <- with.random.seed(
  train(# Drop the default column from tune, as default is our
    # binary response
    x = select(tune, -default),
    y = tune$default,
    method = "glmnet",
    tuneGrid = expand.grid(alpha = seq(0, 1, .05),
                           lambda = seq(0, 1, .05)),
    metric = "ROC",
    trControl = trainControl(method = "cv",
                             summaryFunction = twoClassSummary,
                             classProbs = TRUE,
                             number = 10)), seed = 5690)

# Estimate our model utilizing our training data set
# NOTE(review): standardize = FALSE presumably because the features were
# already scaled in preparation.R - verify before reusing with raw data
log_tuned <- glmnet(x = as.matrix(select(train, -default)),
                    y = train$default,
                    # Pass the estimated alpha and lambda stored in
                    # log_tuned_param$bestTune
                    alpha = log_tuned_param$bestTune$alpha,
                    lambda = log_tuned_param$bestTune$lambda,
                    family = "binomial",
                    standardize = FALSE)

# Predict the outcome utilizing our test data set
prediction_log_tuned <- predict(log_tuned, newx = as.matrix(select(test, -default)),
                                type="response")

# Compute the ROC curve
roc_log_tuned <- roc(response = test$default, predictor = as.numeric(prediction_log_tuned))

############################## End counting #
elapsed_log_tuned <- toc()
elapsed_log_tuned <- elapsed_log_tuned$toc-elapsed_log_tuned$tic

# Print AUC
roc_log_tuned$auc # Area under the curve: 0.7339

# Logging
cat("LR - glmnet [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_log_tuned \n AUC: ", roc_log_tuned$auc, "\n TIME: ", elapsed_log_tuned, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(log_tuned_param, log_tuned, prediction_log_tuned, roc_log_tuned, elapsed_log_tuned, file = "log_tuned.rdata")
rm(log_tuned_param, log_tuned, prediction_log_tuned, roc_log_tuned, elapsed_log_tuned)
gc()

##########################
#  Classification Tree   ####################################################################################################################
##########################

# Start counting ############################
tic()

# Grow the tree on the same hand-picked feature subset as the logistic benchmark
# Fix: `method = "class"` was previously passed INSIDE rpart.control(), where
# it is silently ignored (rpart.control() swallows unknown arguments via `...`).
# rpart() still fitted a classification tree because it guesses
# method = "class" for a factor response, but the intent is now explicit and
# consistent with the tuned tree below.
tree <- with.random.seed(
        rpart(default ~ grade_B + grade_C + grade_D + grade_E + grade_F + grade_G + dti + mean_fico_range + revol_util,
              # Without tuning the tree, the data set left
              # corresponds to the training data set
              data = left, 
              control = rpart.control(minsplit = 20, 
                                      minbucket = 100, 
                                      cp = 0.0005),
              method = "class"), seed = 5690)

# Complexity levels of each split (used below to choose a pruning cp)
tree$cptable

# Predict class probabilities utilizing our test data set
prediction_tree <- predict(tree, type = "prob", newdata = select(test, -default))

# Compute the ROC curve
# (column 1 holds the probability of the first factor level)
roc_tree <- roc(test$default, prediction_tree[,1])

############################## End counting #
elapsed_tree <- toc()
elapsed_tree <- elapsed_tree$toc-elapsed_tree$tic

####################################################################################################################

# Start counting ############################
tic()

# Pruning: cut the tree back to the subtree with complexity parameter 0.00066
# (chosen from tree$cptable above)
tree_pruned <- prune(tree , cp = 0.00066)

# Predict the outcome utilizing our test data set
prediction_tree_pruned <- predict(tree_pruned, type = "prob", newdata = select(test, -default))

# Compute the ROC curve
roc_tree_pruned <- roc(test$default, prediction_tree_pruned[,1])

############################## End counting #
elapsed_tree_pruned <- toc()
elapsed_tree_pruned <- elapsed_tree_pruned$toc-elapsed_tree_pruned$tic

####################################################################################################################

# Start counting ############################
tic()

# Utilize our tune data set to determine complexity (cp) via 10-fold CV
tree_param <- with.random.seed(
  train(x = select(tune, -default),
        y = tune$default,
        method = "rpart",
        tuneGrid = expand.grid(cp = seq(0.0001, 0.025, 0.0001)),
        metric = "ROC",
        trControl = trainControl(method = "cv",
                                 summaryFunction = twoClassSummary,
                                 classProbs = TRUE,
                                 number = 10)), seed = 5690)

# Optimal Complexity Parameter
tree_param$bestTune

# See https://www.statmethods.net/advstats/cart.html
# Develop the tree utilizing library(rpart)
# [Info] rpart.control parameters are lower than default, as previously, no tree was grown
tree_tuned <- with.random.seed(
  rpart(default ~ .,
        data = train, 
        control = rpart.control(minsplit = 20, 
                                minbucket = 100, 
                                cp = tree_param$bestTune$cp),
        method = "class"), seed = 5690)

# Complexity Levels
tree_tuned$cptable

# Predict the outcome utilizing our test data set
prediction_tree_tuned <- predict(tree_tuned, type = "prob", newdata = select(test, -default))

# Compute the ROC curve
roc_tree_tuned <- roc(test$default, prediction_tree_tuned[,1])

############################## End counting #
elapsed_tree_tuned <- toc()
elapsed_tree_tuned <- elapsed_tree_tuned$toc-elapsed_tree_tuned$tic

# Print AUC of all three tree variants for comparison
roc_tree$auc
roc_tree_pruned$auc
roc_tree_tuned$auc

# Print summary/tree
summary(tree)
tree
tree$frame

# Bonus
# Plot
plot(tree, uniform=TRUE); text(tree, use.n = TRUE, all = TRUE, cex = .8)

# Plot the pruned tree
plot(tree_pruned, uniform=TRUE); text(tree_pruned, use.n = TRUE, all = TRUE, cex = .8)

# Plot the tuned tree
plot(tree_tuned, uniform=TRUE); text(tree_tuned, use.n = TRUE, all = TRUE, cex = .8)

# Logging
cat("Classification Tree [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_tree \n AUC: ", roc_tree$auc, "\n TIME: ", elapsed_tree, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")
cat(c("auc_tree_pruned \n AUC: ", roc_tree_pruned$auc, "\n TIME: ", elapsed_tree_pruned, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")
cat(c("auc_tree_tuned \n AUC: ", roc_tree_tuned$auc, "\n TIME: ", elapsed_tree_tuned, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")


# Save results and clear RAM
save(tree_param, tree, tree_pruned, tree_tuned, prediction_tree_pruned, prediction_tree, prediction_tree_tuned, roc_tree_pruned, roc_tree, roc_tree_tuned, elapsed_tree, elapsed_tree_pruned, elapsed_tree_tuned, file = "tree.rdata")
rm(tree_param, tree, tree_pruned, tree_tuned, prediction_tree_pruned, prediction_tree, prediction_tree_tuned, roc_tree_pruned, roc_tree, roc_tree_tuned, elapsed_tree, elapsed_tree_pruned, elapsed_tree_tuned)
gc()

##########################
#      randomForest      ####################################################################################################################
##########################

# Start counting ############################
tic()

# Utilize our tune data set to determine our tuning parameter mtry
# (number of variables sampled as split candidates at each node)
randomForest_param <- with.random.seed(
  train(x = select(tune, -default),
        y = tune$default,
        method = "rf",
        tuneGrid = expand.grid(mtry = seq(2, 20, 2)), 
        metric = "Accuracy",
        trControl = trainControl(method = "oob",
                                 classProbs = TRUE)), seed = 5690)

# Build Random Forest
# Fix: the tuned value was previously passed as `mtree =`, an argument that
# randomForest() does not have, so it was silently absorbed by `...` and the
# forest fell back to the default mtry = floor(sqrt(p)). It is now passed
# correctly as `mtry`. The redundant `data = train` argument (unused when
# x/y are supplied) has been dropped.
randomForest <- randomForest(y = train$default,
                             x = select(train, -default),
                             # Utilizing OOB requires a lot of available RAM,
                             # hence the small amount of trees
                             ntree = 750,
                             mtry = randomForest_param$bestTune$mtry,
                             importance = TRUE)

# Build Random Forest - including test error calculation
# randomForest <- randomForest(y = train$default,
#                              x = select(train, -default),
#                              ytest = test$default,
#                              xtest = select(test, -default),
#                              ntree = 2000, mtry = randomForest_param$bestTune$mtry)

# Predict the outcome utilizing our test data set
prediction_randomForest <- predict(randomForest, type = "prob", newdata = select(test, -default))

# Compute the ROC curve
roc_randomForest <- roc(test$default, prediction_randomForest[,1])

############################## End counting #
elapsed_randomForest <- toc()
elapsed_randomForest <- elapsed_randomForest$toc-elapsed_randomForest$tic

# Print AUC
roc_randomForest$auc # Area under the curve: 0.7346

# Confusion matrix on the OOB samples (stored on the fitted forest)
cm_randomForest <- randomForest$confusion

# Mean Decrease in Gini Index
# (namespace-qualified: the fitted object shadows the package name here)
mdg_randomForest <- randomForest::importance(randomForest)

# Variable Importance (top 10 predictors)
varImpPlot(randomForest,
           sort = T,
           n.var = 10)

# Logging
cat("randomForest [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_randomForest \n AUC: ", roc_randomForest$auc, "\n TIME: ", elapsed_randomForest, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(randomForest_param, randomForest, prediction_randomForest, roc_randomForest, mdg_randomForest, cm_randomForest, elapsed_randomForest, file = "randomForest.rdata")
# [Info] rf_ps.rdata omits the randomForest model due to size
save(randomForest_param, prediction_randomForest, roc_randomForest, mdg_randomForest, cm_randomForest, elapsed_randomForest, file = "rf_ps.rdata")

rm(randomForest_param, randomForest, prediction_randomForest, roc_randomForest, mdg_randomForest, cm_randomForest, elapsed_randomForest)
gc()

##########################
#       Boosting         ####################################################################################################################
##########################

# Start counting ############################
tic()

# Utilize our tune data set to determine our tuning parameters 
# (54 grid combinations x 10 CV folds - this step is expensive)
gbm_param <- with.random.seed(
  train(y = tune$default,
        x = as.matrix(select(tune, -default)),
        method = "xgbTree",
        trControl = trainControl(method = "cv", 
                                 number = 10, 
                                 # Save losses across models
                                 returnResamp = "all",
                                 classProbs = TRUE),
        tuneGrid = expand.grid(# Number of trees to grow
          nrounds = 1000,
          # Learning rate
          eta = c(0.01, 0.001, 0.0001),
          # Maximum tree depth
          max_depth = c(5, 10, 20),
          # Minimum loss reduction (the larger gamma,
          # the less restricted)
          gamma = c(0.33, 0.66, 1),
          # Sub sample ratio of columns in each tree
          colsample_bytree = c(0.5, 1),
          # Avoids very small nodes
          min_child_weight = 1,
          # Ratio of data utilized to train the model
          # A lower ratio may prevent overfitting, we 
          # utilize the whole tuning data set
          subsample = 1)
  ), seed = 5690)

# temporary save (tuning is the longest step; keep its result on disk)
save(gbm_param, file = "gbm_param.rdata")

# Estimate our model utilizing our training data set
# See https://cran.r-project.org/web/packages/xgboost/vignettes/xgboostPresentation.html
# xgboost expects a numeric 0/1 label; "Default" is coded as the positive class
gbm <- with.random.seed(
  xgboost(data = as.matrix(select(train, -default)),
          label = ifelse(train$default == "Default", 1, 0),
          params = list(# binary classification
            objective = "binary:logistic",
            eta = gbm_param$bestTune$eta,
            gamma = gbm_param$bestTune$gamma,
            max_depth = gbm_param$bestTune$max_depth,
            min_child_weight = gbm_param$bestTune$min_child_weight,
            subsample = gbm_param$bestTune$subsample,
            colsample_bytree = gbm_param$bestTune$colsample_bytree,
            # Metric
            eval_metric = "auc"),
          nrounds = gbm_param$bestTune$nrounds,
          # Print progress
          verbose = TRUE,
          # Truncation condition, if no further improvement after 250 trees
          early_stopping_rounds = 250), seed = 5690)

# Print test error
# err <- mean(as.numeric(pred > 0.5) != test$label)
# print(paste("test-error=", err))

# Predict the outcome utilizing our test data set
prediction_gbm <- predict(gbm, xgb.DMatrix(as.matrix(select(test, -default)), label = ifelse(test$default == "Default", 1, 0)))

# Compute the ROC curve
roc_gbm <- roc(test$default, prediction_gbm) 

############################## End counting #
elapsed_gbm <- toc()
elapsed_gbm <- elapsed_gbm$toc-elapsed_gbm$tic

# Print AUC
roc_gbm$auc # Area under the curve: 0.7482

# Print summary
summary(gbm)

# Compute an importance matrix
# Fix: feature names must match the training matrix, which excludes the
# response column; passing colnames(train) would mislabel the features by
# including "default"
importance_mat <- xgb.importance(colnames(select(train, -default)), model = gbm)

# Logging
cat("eXtreme Gradient Boosting [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_gbm \n AUC: ", roc_gbm$auc, "\n TIME: ", elapsed_gbm, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(gbm_param, gbm, prediction_gbm, roc_gbm, elapsed_gbm, importance_mat, file = "xgbm.rdata")
rm(gbm_param, gbm, prediction_gbm, roc_gbm, elapsed_gbm, importance_mat)
gc()

############################################################################################################################################################
#                                  The following models were not used in the problem set, provide however good alternatives                                #
############################################################################################################################################################

##########################
#        Bagging         ####################################################################################################################
##########################

# Bagged ranger RF
# NOTE(review): `.mtry` below is not defined anywhere in this script; this
# call errors unless `.mtry` already exists in the workspace. Presumably the
# tuned mtry from the randomForest/ranger sections was intended - verify
# before running this optional section.
rangerBag <- train(y = train$default,
                   x = select(train, -default),
                   method = "ranger",
                   trControl = trainControl(method = "cv",
                                            number = 10,
                                            classProbs = TRUE),
                   tuneGrid = expand.grid(mtry = c(5, .mtry)),
                   metric = "Accuracy")

# Predict the outcome utilizing our test data set
prediction_rangerBag <- predict(rangerBag, select(test, -default), type = "prob")

# Compute the ROC curve
roc_rangerBag <- roc(test$default, prediction_rangerBag$Default)

# Calculate the AUC
auc_rangerBag <- pROC::auc(roc_rangerBag)

# Print AUC
auc_rangerBag # Area under the curve: ?.????

# Print summary
summary(rangerBag)

# Logging
cat("Bagged ranger [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_rangerBag \t", roc_rangerBag$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(rangerBag, prediction_rangerBag, roc_rangerBag, auc_rangerBag, file = "rangerBag.rdata")
rm(rangerBag, prediction_rangerBag, roc_rangerBag, auc_rangerBag)
gc()

####################################################################################################################

# Bagged ADA Boost
# NOTE(review): method = "AdaBag" requires library(adabag), which is only
# loaded in the commented-out package section at the top of this file -
# uncomment it before running this optional section
adaBag <- train(y = train$default,
                x = select(train, -default),
                method = "AdaBag",
                trControl = trainControl(method = "cv",
                                         number = 10,
                                         classProbs = TRUE),
                tuneGrid = expand.grid(mfinal = 500,
                                       maxdepth = c(2,5)),
                metric = "Accuracy")

# Predict the outcome utilizing our test data set
prediction_adaBag <- predict(adaBag, type = "prob", newdata = select(test, -default))

# Compute the ROC curve
roc_adaBag <- roc(test$default, prediction_adaBag$Default)

# Calculate the AUC
auc_adaBag <- pROC::auc(roc_adaBag)

# Print AUC
auc_adaBag # Area under the curve: ?.????

# Print summary
summary(adaBag)

# Logging
cat("Bagged AdaBoost [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_adaBag \t", roc_adaBag$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(adaBag, prediction_adaBag, roc_adaBag, auc_adaBag, file = "adaBag.rdata")
rm(adaBag, prediction_adaBag, roc_adaBag, auc_adaBag)
gc()

####################################################################################################################

# Bagged randomForest RF
# NOTE(review): `.mtry` below is not defined anywhere in this script; this
# call errors unless `.mtry` already exists in the workspace - verify before
# running this optional section.
rfBag <- train(y = train$default,
               x = select(train, -default),
               method = "rf",
               trControl = trainControl(method = "cv",
                                        number = 10,
                                        classProbs = TRUE),
               tuneGrid = expand.grid(mtry = c(5, .mtry)),
               metric = "Accuracy")

# Predict the outcome utilizing our test data set
prediction_rfBag <- predict(rfBag, select(test, -default), type = "prob")

# Compute the ROC curve
roc_rfBag <- roc(test$default, prediction_rfBag$Default)

# Calculate the AUC
auc_rfBag <- pROC::auc(roc_rfBag)

# Print AUC
auc_rfBag # Area under the curve: 0.7311

# Print summary/rfBag
summary(rfBag)
rfBag

# Logging
cat("Bagged randomForest [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_rfBag \t", roc_rfBag$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(rfBag, prediction_rfBag, roc_rfBag, auc_rfBag, file = "rfBag.rdata")
rm(rfBag, prediction_rfBag, roc_rfBag, auc_rfBag)
gc()

##########################
#        Bagging         ####################################################################################################################
##########################

# # Tree Bagging
# # https://rdrr.io/cran/caret/man/bag.html
# 
# treebag <- bag(y = left$default,
#                x = select(left, -default),
#                # Number of bootstrap samples
#                B = 10, 
#                vars = 5,
#                # Downsampling (Balancing default and paid ratio)
#                downSample = FALSE,
#                bagControl = bagControl(fit = ctreeBag$fit, predict = ctreeBag$pred, aggregate = ctreeBag$aggregate,
#                                        oob = TRUE, allowParallel = TRUE))
# 
# # Predict the outcome utilizing our test data set
# prediction_treebag <- predict(treebag, select(test, -default), type = "prob")
# 
# # Compute the ROC curve
# roc_treebag <- roc(test$default, prediction_treebag$Default) 
# 
# # Calculate the AUC
# auc_treebag <- pROC::auc(roc_treebag)
# 
# # Print AUC
# auc_treebag
# 
# # Print summary
# summary(treebag)
# 
# # Logging
# cat("treebag [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
# cat(c("auc_treebag \t", roc_treebag$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")
# 
# # Save results and clear RAM
# save(treebag, prediction_treebag, roc_treebag, auc_treebag, file = "treebag.rdata")
# rm(treebag, prediction_treebag, roc_treebag, auc_treebag)
# gc()

##########################
#  ranger Random Forest  ####################################################################################################################
##########################

# Random Forest via ranger package

# Utilize our tune data set to determine our tuning parameters mtry and splitrule
# NOTE(review): newer caret versions also expect splitrule/min.node.size in the
# "ranger" tuneGrid - confirm against the installed caret version
ranger_param <- with.random.seed(
  train(x = select(tune, -default),
        y = tune$default,
        method = "ranger",
        tuneGrid = expand.grid(mtry = seq(2, 20, 2)), 
        metric = "ROC",
        trControl = trainControl(method = "cv",
                                 summaryFunction = twoClassSummary,
                                 classProbs = TRUE,
                                 number = 10)), seed = 5690)

# See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3250568/
# NOTE(review): the fitted object shadows the ranger() function from here on;
# subsequent package calls therefore use ranger:: explicitly
ranger <- with.random.seed(
  ranger(default ~ ., 
         data = train, 
         num.trees = 3000, 
         replace = TRUE, 
         mtry = ranger_param$bestTune$mtry, 
         splitrule = "gini", 
         importance = "impurity", 
         probability = TRUE), seed = 5690)

# Predict the outcome utilizing our test data set
prediction_ranger <- predict(ranger, data = select(test, -default))

# Compute the ROC curve
roc_ranger <- roc(test$default, prediction_ranger$predictions[,1]) 
# Alternatively utilizing library(precrec)
# roc_ranger <- evalmod(labels = test$default, scores = as.numeric(prediction_ranger$predictions[,2]), auc = TRUE, plot = TRUE)

# Calculate the AUC
auc_ranger <- pROC::auc(roc_ranger)

# Print AUC
auc_ranger

# Print summary
summary(ranger)

# Mean Decrease in Gini Index
mdg_ranger <- ranger::importance(ranger)

# Logging
cat("ranger [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_ranger \t", roc_ranger$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(ranger_param, ranger, prediction_ranger, roc_ranger, auc_ranger, mdg_ranger, file = "ranger.rdata")
rm(ranger_param, ranger, prediction_ranger, roc_ranger, auc_ranger, mdg_ranger)
gc()

##########################
#       Boosting         ####################################################################################################################
##########################

# gradient boosting machine alternative to xgboost
# NOTE(review): requires library(gbm) (and plyr), which are only loaded in the
# commented-out package section at the top of this file

# See http://www.jstatsoft.org/v28/i05/paper - p.10
# Utilize our tune data set to determine our tuning parameters
# (600 grid combinations x 10 CV folds - this step is very expensive)
gbm_param <- with.random.seed(
  train(y = tune$default,
        x = select(tune, -default),
        method = "gbm", 
        trControl = trainControl(method = "cv", 
                                 number = 10, 
                                 classProbs = TRUE), 
        tuneGrid = expand.grid(interaction.depth = c(2, 5, 10, 20),
                               n.trees = c(500, 1000, 2000), 
                               shrinkage = seq(.001, .05, .001),
                               n.minobsinnode = 10), 
        metric = "Accuracy"), seed = 5690)

# temporary save (tuning is the longest step; keep its result on disk)
save(gbm_param, file = "gbm_param.rdata")

# See https://www.r-bloggers.com/using-a-gbm-for-classification-in-r/
# Estimate our model utilizing our training data set
# gbm.fit expects a numeric 0/1 response for "bernoulli";
# nTrain holds out the last 20% of rows as a validation set
gbm <- with.random.seed(
  gbm.fit(x = as.matrix(select(train, -default)),
          y = ifelse(train$default == "Default", 1, 0),
          distribution = "bernoulli",
          interaction.depth = gbm_param$bestTune$interaction.depth,
          n.trees =  gbm_param$bestTune$n.trees,
          shrinkage = gbm_param$bestTune$shrinkage,
          n.minobsinnode = gbm_param$bestTune$n.minobsinnode,
          nTrain = round(nrow(train)*0.8)), seed = 5690)

# Predict the outcome utilizing our test data set
prediction_gbm <- predict(gbm, newdata = select(test, -default), type = "response")

# Compute the ROC curve
roc_gbm <- roc(test$default, prediction_gbm) 

# Calculate the AUC
auc_gbm <- pROC::auc(roc_gbm)

# Print AUC
auc_gbm # Area under the curve: 

# Print summary
summary(gbm)

# Performance Plot
gbm.perf(gbm)

# Logging
cat("Gradient Boosting Machine [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_gbm \t", roc_gbm$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

save(gbm_param, gbm, prediction_gbm, roc_gbm, auc_gbm, file = "gbm.rdata")
rm(gbm_param, gbm, prediction_gbm, roc_gbm, auc_gbm)
gc()

####################################################################################################################

# Logit Boost (boosted logistic regression, caTools::LogitBoost via caret)
# [Fix] Every other model in this script is wrapped in with.random.seed()
# with seed 5690 so that the cross-validation fold assignment is
# reproducible; this one was unseeded. Also use <- for top-level
# assignment, consistent with the rest of the file.
log_boost <- with.random.seed(
  train(y = train$default,
        x = select(train, -default),
        method = "LogitBoost",
        trControl = trainControl(method = "cv", 
                                 number = 10, 
                                 classProbs = TRUE),
        # Fixed number of boosting iterations (no grid search here)
        tuneGrid = expand.grid(nIter = 750),
        metric = "Accuracy"), seed = 5690)

# Predict class probabilities utilizing our test data set
prediction_log_boost <- predict(log_boost, select(test, -default), type = "prob")

# Compute the ROC curve from the predicted probability of class "Default"
roc_log_boost <- roc(test$default, prediction_log_boost$Default) 

# Calculate the AUC
auc_log_boost <- pROC::auc(roc_log_boost)

# Print AUC
auc_log_boost # Area under the curve: 0.668 (from an unseeded run; may differ now that CV is seeded)

# Print the fitted caret object (resampling results)
log_boost

# Logging
cat("Logit Boost [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_log_boost \t", roc_log_boost$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(log_boost, prediction_log_boost, roc_log_boost, auc_log_boost, file = "log_boost.rdata")
rm(log_boost, prediction_log_boost, roc_log_boost, auc_log_boost)
gc()

####################################################################################################################

# ADA Boost (adaboost via the ada package, fitted through caret)
# [Fix] Seeded via with.random.seed() (seed 5690) for reproducible
# cross-validation folds, in line with every other model in this script;
# <- instead of = for top-level assignment, consistent with the file.
ada <- with.random.seed(
  train(y = train$default,
        x = select(train, -default),
        method = "ada",
        trControl = trainControl(method = "cv", 
                                 number = 10, 
                                 classProbs = TRUE),
        # ADA performance: 750 iterations, depth-4 trees, and two
        # learning rates (nu) compared by CV accuracy
        tuneGrid = expand.grid(iter = 750,
                               maxdepth = 4,
                               nu = c(.1, .01)),
        metric = "Accuracy"), seed = 5690)

# Predict class probabilities utilizing our test data set
prediction_ada <- predict(ada, select(test, -default), type = "prob")

# Compute the ROC curve from the predicted probability of class "Default"
roc_ada <- roc(test$default, prediction_ada$Default) 

# Calculate the AUC
auc_ada <- pROC::auc(roc_ada)

# Print AUC
auc_ada # Area under the curve: 0.7358 (from an unseeded run; may differ now that CV is seeded)

# Print the fitted caret object (resampling results)
ada

# Logging
cat("Ada Boosting [x] \n==============================================", file="prediction_log.txt", append=TRUE, sep = "\n")
cat(c("auc_ada \t", roc_ada$auc, "\n==============================================\n"), file="prediction_log.txt", append=TRUE, sep = "")

# Save results and clear RAM
save(ada, prediction_ada, roc_ada, auc_ada, file = "ada.rdata")
rm(ada, prediction_ada, roc_ada, auc_ada)
gc()

##########################
#          FINIS         ####################################################################################################################
##########################
# rotterp/RTutorMachineLearningAndCreditDefault documentation built on May 19, 2019, 1:47 a.m.