Introduction to SuperML

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

The SuperML R package is designed to unify the model training process in R, much like scikit-learn does in Python. People often spend a lot of time searching for packages and figuring out the syntax for training machine learning models in R; this is especially apparent in users who frequently switch between R and Python. This package provides a scikit-learn-style interface (fit, predict) to train models faster.
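
For example, a minimal sketch of this fit/predict workflow, shown here on the built-in iris data with the package's RFTrainer purely for illustration, looks like this:

library(superml)

# illustrative only: every trainer exposes $new(), $fit() and $predict()
rf <- RFTrainer$new(n_estimators = 100, classification = 1)
rf$fit(X = iris, y = "Species")   # X: a data.frame/data.table, y: name of the target column
preds <- rf$predict(df = iris)    # here we simply predict back on the training data
head(preds)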

In addition to building machine learning models, the package provides handy functionality for feature engineering (for example, LabelEncoder and smoothMean, both used later in this tutorial).
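
For instance, a minimal sketch of one such helper, the LabelEncoder used later in this tutorial, looks like this (toy values, purely for illustration):

library(superml)

# illustrative only: map character values to integer labels
lbl <- LabelEncoder$new()
lbl$fit(c("low", "medium", "high", "medium"))
lbl$transform(c("low", "high"))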

This ambitious package is my ongoing effort to help the R community build ML models easily and quickly in R.

Install

You can install the latest CRAN version (recommended) using:

install.packages("superml")

You can install the development version directly from GitHub using:

devtools::install_github("saraswatmks/superml")

Caveats on superml installation

For machine learning, superml builds on existing R packages. Hence, installing the package does not install all of its dependencies. However, while training a model, superml will automatically install a required package if it is not found. Still, if you want to install all dependencies at once, you can simply do:

install.packages("superml", dependencies=TRUE)

Examples - Machine Learning Models

This package uses existing R packages to build machine learning models. In this tutorial, we'll use the data.table package for all data manipulation tasks.

Regression Data

We'll quickly prepare the data set so it's ready for model training.

load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")

library(data.table)
library(caret)
library(superml)

library(Metrics)

head(reg_train)

split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
# remove features with 90% or more missing values
# we will also remove the Id column because it doesn't contain
# any useful information
na_cols <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_cols[which(na_cols > 0.9)])

xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]

# encode categorical variables
cat_cols <- names(xtrain)[sapply(xtrain, is.character)]

for(c in cat_cols){
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[c]], xtest[[c]]))
    xtrain[[c]] <- lbl$transform(xtrain[[c]])
    xtest[[c]] <- lbl$transform(xtest[[c]])
}

# remove noisy columns
noise <- c('GrLivArea','TotalBsmtSF')

xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]

# fill missing values with -1
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1

KNN Regression

knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'reg')
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type='raw')
rmse(actual = xtest$SalePrice, predicted=labels)

SVM Regression

svm <- SVMTrainer$new()
svm$fit(xtrain, 'SalePrice')
pred <- svm$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)

Simple Regression

lf <- LMTrainer$new(family="gaussian")
lf$fit(X = xtrain, y = "SalePrice")
summary(lf$model)
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)

Lasso Regression

lf <- LMTrainer$new(family = "gaussian", alpha = 1, lambda = 1000)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)

Ridge Regression

lf <- LMTrainer$new(family = "gaussian", alpha=0)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)

Linear Regression with CV

lf <- LMTrainer$new(family = "gaussian")
lf$cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
predictions <- lf$cv_predict(df = xtest)
coefs <- lf$get_importance()
rmse(actual = xtest$SalePrice, predicted = predictions)

Random Forest

rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
rf$get_importance()
rmse(actual = xtest$SalePrice, predicted = pred)

Xgboost

xgb <- XGBTrainer$new(objective = "reg:linear",
                      n_estimators = 500,
                      eval_metric = "rmse",
                      maximize = FALSE,
                      learning_rate = 0.1,
                      max_depth = 6)
xgb$fit(X = xtrain, y = "SalePrice", valid = xtest)
pred <- xgb$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)

Grid Search

xgb <- XGBTrainer$new(objective = "reg:linear")

gst <- GridSearchCV$new(trainer = xgb,
                             parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
                             n_folds = 3,
                             scoring = c('accuracy','auc'))
gst$fit(xtrain, "SalePrice")
gst$best_iteration()

Random Search

rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                             parameters = list(n_estimators = c(5,10),
                             max_depth = c(5,2)),
                             n_folds = 3,
                             scoring = c('accuracy','auc'),
                             n_iter = 3)
rst$fit(xtrain, "SalePrice")
rst$best_iteration()

Binary Classification Data

Here, we will solve a simple binary classification problem: predicting which passengers survived the Titanic shipwreck. The idea is to demonstrate how to use this package to solve classification problems.

Data Preparation

# load the classification data
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")

head(cla_train)

# split the data
split <- createDataPartition(y = cla_train$Survived,p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]

# encode categorical variables - shorter way
for(c in c('Embarked','Sex','Cabin')) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[c]], xtest[[c]]))
    xtrain[[c]] <- lbl$transform(xtrain[[c]])
    xtest[[c]] <- lbl$transform(xtest[[c]])
}

# impute missing values
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]

# drop these features
to_drop <- c('PassengerId','Ticket','Name')

xtrain <- xtrain[,-c(to_drop), with=F]
xtest <- xtest[,-c(to_drop), with=F]

Now, our data is ready to be served for model training. Let's do it.

KNN Classification

knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
auc(actual = xtest$Survived, predicted = labels)

Naive Bayes Classification

nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)

SVM Classification

# predicts labels
svm <- SVMTrainer$new()
svm$fit(xtrain, 'Survived')
pred <- svm$predict(xtest)
auc(actual = xtest$Survived, predicted=pred)

Logistic Regression

lf <- LMTrainer$new(family = "binomial")
lf$fit(X = xtrain, y = "Survived")
summary(lf$model)
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)

Lasso Logistic Regression

lf <- LMTrainer$new(family="binomial", alpha=1)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)

Ridge Logistic Regression

lf <- LMTrainer$new(family="binomial", alpha=0)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)

Random Forest

rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 3)
rf$fit(X = xtrain, y = "Survived")

pred <- rf$predict(df = xtest)
rf$get_importance()

auc(actual = xtest$Survived, predicted = pred)

Xgboost

xgb <- XGBTrainer$new(objective = "binary:logistic",
                      n_estimators = 500,
                      eval_metric = "auc",
                      maximize = TRUE,
                      learning_rate = 0.1,
                      max_depth = 6)
xgb$fit(X = xtrain, y = "Survived", valid = xtest)

pred <- xgb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)

Grid Search

xgb <- XGBTrainer$new(objective="binary:logistic")
gst <-GridSearchCV$new(trainer = xgb,
                             parameters = list(n_estimators = c(10,50),
                             max_depth = c(5,2)),
                             n_folds = 3,
                             scoring = c('accuracy','auc'))
gst$fit(xtrain, "Survived")
gst$best_iteration()

Random Search

rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                             parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
                             n_folds = 3,
                             scoring = c('accuracy','auc'),
                             n_iter = 3)
rst$fit(xtrain, "Survived")
rst$best_iteration()

Let's create a new feature from the target variable using target encoding (a smoothed mean of the target per category, computed with smoothMean), and test a model.

# add target encoding features
xtrain[, feat_01 := smoothMean(train_df = xtrain,
                        test_df = xtest,
                        colname = "Embarked",
                        target = "Survived")$train[[2]]]
xtest[, feat_01 := smoothMean(train_df = xtrain,
                               test_df = xtest,
                               colname = "Embarked",
                               target = "Survived")$test[[2]]]

# train a random forest
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()

auc(actual = xtest$Survived, predicted = pred)

