inst/doc/introduction.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("superml")

## ----eval=FALSE---------------------------------------------------------------
#  devtools::install_github("saraswatmks/superml")

## ----eval=FALSE---------------------------------------------------------------
#  install.packages("superml", dependencies=TRUE)

## -----------------------------------------------------------------------------
load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")
# superml::check_package("caret")
library(data.table)
library(caret)
library(superml)

library(Metrics)

head(reg_train)

# create a 70/30 train/test split
split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
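
## ----eval=FALSE---------------------------------------------------------------
#  # optional sanity check (a sketch, not part of the modelling flow):
#  # the split should come out roughly 70/30
#  nrow(xtrain) / nrow(reg_train)
#  nrow(xtest) / nrow(reg_train)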

## -----------------------------------------------------------------------------
# remove features with more than 90% missing values
# we will also remove the Id column because it doesn't contain
# any useful information
na_cols <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_cols[which(na_cols > 0.9)])

xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]

# encode categorical variables
cat_cols <- names(xtrain)[sapply(xtrain, is.character)]

for(col in cat_cols){
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}
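
# an equivalent sketch (commented out) for a single column using fit_transform,
# assuming your superml version exposes it on LabelEncoder (see ?LabelEncoder);
# MSZoning is just an illustrative column choice
# lbl <- LabelEncoder$new()
# xtrain$MSZoning <- lbl$fit_transform(xtrain$MSZoning)
# xtest$MSZoning <- lbl$transform(xtest$MSZoning)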

# remove noise columns
noise <- c('GrLivArea','TotalBsmtSF')

xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]

# fill missing values with -1
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1
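
## ----eval=FALSE---------------------------------------------------------------
#  # optional check (a sketch): no missing values should remain
#  stopifnot(!anyNA(xtrain), !anyNA(xtest))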

## -----------------------------------------------------------------------------
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'reg')
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
rmse(actual = xtest$SalePrice, predicted = labels)

## ----eval=FALSE---------------------------------------------------------------
#  svm <- SVMTrainer$new()
#  svm$fit(xtrain, 'SalePrice')
#  pred <- svm$predict(xtest)
#  rmse(actual = xtest$SalePrice, predicted = pred)

## -----------------------------------------------------------------------------
# ordinary linear regression (gaussian family)
lf <- LMTrainer$new(family = "gaussian")
lf$fit(X = xtrain, y = "SalePrice")
summary(lf$model)
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)

## -----------------------------------------------------------------------------
# lasso regression: alpha = 1, with a large fixed penalty lambda
lf <- LMTrainer$new(family = "gaussian", alpha = 1, lambda = 1000)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
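
## ----eval=FALSE---------------------------------------------------------------
#  # a sketch: with lambda = 1000 most coefficients should be shrunk towards
#  # zero; this assumes lf$model exposes the underlying regularised fit, as it
#  # exposes the glm object in the unregularised example above
#  coef(lf$model)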

## -----------------------------------------------------------------------------
# ridge regression: alpha = 0
lf <- LMTrainer$new(family = "gaussian", alpha = 0)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)

## ----eval=FALSE---------------------------------------------------------------
#  lf <- LMTrainer$new(family = "gaussian")
#  lf$cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
#  predictions <- lf$cv_predict(df = xtest)
#  coefs <- lf$get_importance()
#  rmse(actual = xtest$SalePrice, predicted = predictions)

## -----------------------------------------------------------------------------
rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
rf$get_importance()
rmse(actual = xtest$SalePrice, predicted = pred)
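
## ----eval=FALSE---------------------------------------------------------------
#  # a sketch: inspect only the strongest predictors; this assumes
#  # get_importance() returns a table ordered by importance
#  head(rf$get_importance(), 10)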

## ----eval=FALSE---------------------------------------------------------------
#  # note: "reg:linear" is deprecated in recent xgboost releases in favour of
#  # "reg:squarederror"; use whichever your installed xgboost supports
#  xgb <- XGBTrainer$new(objective = "reg:linear",
#                        n_estimators = 500,
#                        eval_metric = "rmse",
#                        maximize = FALSE,
#                        learning_rate = 0.1,
#                        max_depth = 6)
#  xgb$fit(X = xtrain, y = "SalePrice", valid = xtest)
#  pred <- xgb$predict(xtest)
#  rmse(actual = xtest$SalePrice, predicted = pred)

## ----eval=FALSE---------------------------------------------------------------
#  xgb <- XGBTrainer$new(objective = "reg:linear")
#  
#  # note: 'accuracy' and 'auc' are classification metrics; for a regression
#  # target such as SalePrice, check your superml version for a supported
#  # error-based scoring option
#  gst <- GridSearchCV$new(trainer = xgb,
#                          parameters = list(n_estimators = c(10, 50),
#                                            max_depth = c(5, 2)),
#                          n_folds = 3,
#                          scoring = c('accuracy', 'auc'))
#  gst$fit(xtrain, "SalePrice")
#  gst$best_iteration()

## -----------------------------------------------------------------------------
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(5, 10),
                                            max_depth = c(5, 2)),
                          n_folds = 3,
                          scoring = c('accuracy', 'auc'),
                          n_iter = 3)
rst$fit(xtrain, "SalePrice")
rst$best_iteration()
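
## ----eval=FALSE---------------------------------------------------------------
#  # a sketch: capture the best parameter combination for reuse; the exact
#  # structure returned by best_iteration() depends on your superml version
#  best <- rst$best_iteration()
#  print(best)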

## -----------------------------------------------------------------------------
# load the classification dataset
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")

head(cla_train)

# split the data
split <- createDataPartition(y = cla_train$Survived, p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]

# encode categorical variables
for(col in c('Embarked','Sex','Cabin')) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}

# impute missing values
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]
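
# a leakage-free alternative (sketch, commented out): impute the test set
# with the median computed on the training set only
# age_med <- median(xtrain$Age, na.rm = TRUE)
# xtest[, Age := replace(Age, is.na(Age), age_med)]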

# drop these features
to_drop <- c('PassengerId','Ticket','Name')

xtrain <- xtrain[, -c(to_drop), with = FALSE]
xtest <- xtest[, -c(to_drop), with = FALSE]

## -----------------------------------------------------------------------------
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
auc(actual = xtest$Survived, predicted = labels)
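
## ----eval=FALSE---------------------------------------------------------------
#  # a sketch: AUC is normally computed on predicted probabilities rather
#  # than hard labels, so the probability output may be preferable here
#  auc(actual = xtest$Survived, predicted = probs)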

## -----------------------------------------------------------------------------
nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)

## ----eval=FALSE---------------------------------------------------------------
#  # predict labels
#  svm <- SVMTrainer$new()
#  svm$fit(xtrain, 'Survived')
#  pred <- svm$predict(xtest)
#  auc(actual = xtest$Survived, predicted = pred)

## -----------------------------------------------------------------------------
lf <- LMTrainer$new(family = "binomial")
lf$fit(X = xtrain, y = "Survived")
summary(lf$model)
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)


## ----eval=FALSE---------------------------------------------------------------
#  # lasso logistic regression with cross-validation (alpha = 1)
#  lf <- LMTrainer$new(family = "binomial", alpha = 1)
#  lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#  pred <- lf$cv_predict(df = xtest)
#  auc(actual = xtest$Survived, predicted = pred)

## ----eval=FALSE---------------------------------------------------------------
#  # ridge logistic regression with cross-validation (alpha = 0)
#  lf <- LMTrainer$new(family = "binomial", alpha = 0)
#  lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#  pred <- lf$cv_predict(df = xtest)
#  auc(actual = xtest$Survived, predicted = pred)

## -----------------------------------------------------------------------------
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 3)
rf$fit(X = xtrain, y = "Survived")

pred <- rf$predict(df = xtest)
rf$get_importance()

auc(actual = xtest$Survived, predicted = pred)

## ----eval=FALSE---------------------------------------------------------------
#  xgb <- XGBTrainer$new(objective = "binary:logistic",
#                        n_estimators = 500,
#                        eval_metric = "auc",
#                        maximize = TRUE,
#                        learning_rate = 0.1,
#                        max_depth = 6)
#  xgb$fit(X = xtrain, y = "Survived", valid = xtest)
#  
#  pred <- xgb$predict(xtest)
#  auc(actual = xtest$Survived, predicted = pred)

## ----eval=FALSE---------------------------------------------------------------
#  xgb <- XGBTrainer$new(objective="binary:logistic")
#  gst <- GridSearchCV$new(trainer = xgb,
#                          parameters = list(n_estimators = c(10, 50),
#                                            max_depth = c(5, 2)),
#                          n_folds = 3,
#                          scoring = c('accuracy', 'auc'))
#  gst$fit(xtrain, "Survived")
#  gst$best_iteration()

## -----------------------------------------------------------------------------
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
                          parameters = list(n_estimators = c(10, 50),
                                            max_depth = c(5, 2)),
                          n_folds = 3,
                          scoring = c('accuracy', 'auc'),
                          n_iter = 3)
rst$fit(xtrain, "Survived")
rst$best_iteration()

## -----------------------------------------------------------------------------
# add a target-encoding feature: smoothed mean of Survived per Embarked level
# compute the encoding once, then assign the train and test columns from it
enc <- smoothMean(train_df = xtrain,
                  test_df = xtest,
                  colname = "Embarked",
                  target = "Survived")
xtrain[, feat_01 := enc$train[[2]]]
xtest[, feat_01 := enc$test[[2]]]

# train a random forest on the augmented data
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()

auc(actual = xtest$Survived, predicted = pred)
