# Nothing  (stray extraction artifact; commented out so this script parses)
## ----setup, include = FALSE---------------------------------------------------
# Global knitr chunk options for the vignette: collapse source and output into
# one block and prefix printed output with "#>".
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----eval=FALSE---------------------------------------------------------------
# install.packages("superml")
## ----eval=FALSE---------------------------------------------------------------
# devtools::install_github("saraswatmks/superml")
## ----eval=FALSE---------------------------------------------------------------
# install.packages("superml", dependencies=TRUE)
## -----------------------------------------------------------------------------
# Load the regression training data (house-price data with a SalePrice column).
load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")
# superml::check_package("caret")
library(data.table)
library(caret)
library(superml)
library(Metrics)
# Quick look at the raw data.
head(reg_train)
# Hold out 30% of rows for validation. createDataPartition returns a list
# whose first element (Resample1) holds the training row indices; data.table's
# `[!i]` selects the complement of those rows.
split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
## -----------------------------------------------------------------------------
# Preprocessing for the regression task:
#   1. drop columns that are >= 90% missing (computed on the training split),
#      plus the uninformative Id column;
#   2. label-encode character columns with a shared train+test fit;
#   3. drop two noisy numeric columns;
#   4. impute remaining NAs with -1.

# Fraction of missing values per training column.
na_frac <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_frac[which(na_frac > 0.9)])
xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]

# Encode categorical variables. Each encoder is fit on the union of train and
# test values so both splits share one label mapping.
# (Loop variable renamed from `c`, which shadowed base::c() used in the body;
#  vapply replaces sapply for a type-stable result on empty input.)
cat_cols <- names(xtrain)[vapply(xtrain, is.character, logical(1))]
for (col in cat_cols) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}

# Remove noise columns.
noise <- c('GrLivArea','TotalBsmtSF')
xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]

# Fill remaining missing values with the sentinel -1.
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1
## -----------------------------------------------------------------------------
# k-nearest-neighbour regression (k = 2). prob = TRUE also stores per-point
# scores so predict(type = 'prob') works.
# (T replaced with TRUE — T is an ordinary, reassignable binding.)
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'reg')
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
# Evaluate on the held-out split.
rmse(actual = xtest$SalePrice, predicted = labels)
## ----eval=FALSE---------------------------------------------------------------
# svm <- SVMTrainer$new()
# svm$fit(xtrain, 'SalePrice')
# pred <- svm$predict(xtest)
# rmse(actual = xtest$SalePrice, predicted = pred)
## -----------------------------------------------------------------------------
# Plain linear regression (gaussian family, no regularisation), then RMSE on
# the held-out split.
lf <- LMTrainer$new(family = "gaussian")
lf$fit(X = xtrain, y = "SalePrice")
# Inspect coefficients and fit diagnostics.
summary(lf$model)
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
## -----------------------------------------------------------------------------
# Lasso regression: alpha = 1 with a fixed penalty lambda = 1000.
lf <- LMTrainer$new(
  family = "gaussian",
  alpha = 1,
  lambda = 1000
)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
## -----------------------------------------------------------------------------
# Ridge regression: alpha = 0 selects the L2 penalty.
lf <- LMTrainer$new(family = "gaussian", alpha = 0)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
## ----eval=FALSE---------------------------------------------------------------
# lf <- LMTrainer$new(family = "gaussian")
# lf$cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
# predictions <- lf$cv_predict(df = xtest)
# coefs <- lf$get_importance()
# rmse(actual = xtest$SalePrice, predicted = predictions)
## -----------------------------------------------------------------------------
# Random-forest regression with 500 trees (classification = 0 selects
# regression mode).
rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
# Variable-importance table for the fitted forest.
rf$get_importance()
rmse(actual = xtest$SalePrice, predicted = pred)
## ----eval=FALSE---------------------------------------------------------------
# xgb <- XGBTrainer$new(objective = "reg:linear"
# , n_estimators = 500
# , eval_metric = "rmse"
# , maximize = F
# , learning_rate = 0.1
# ,max_depth = 6)
# xgb$fit(X = xtrain, y = "SalePrice", valid = xtest)
# pred <- xgb$predict(xtest)
# rmse(actual = xtest$SalePrice, predicted = pred)
## ----eval=FALSE---------------------------------------------------------------
# xgb <- XGBTrainer$new(objective = "reg:linear")
#
# gst <- GridSearchCV$new(trainer = xgb,
# parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
# n_folds = 3,
# scoring = c('accuracy','auc'))
# gst$fit(xtrain, "SalePrice")
# gst$best_iteration()
## -----------------------------------------------------------------------------
# Random search over random-forest hyperparameters: 3 random draws from the
# grid, each evaluated with 3-fold cross-validation.
rf <- RFTrainer$new()
param_grid <- list(
  n_estimators = c(5, 10),
  max_depth = c(5, 2)
)
rst <- RandomSearchCV$new(
  trainer = rf,
  parameters = param_grid,
  n_folds = 3,
  scoring = c('accuracy', 'auc'),
  n_iter = 3
)
rst$fit(xtrain, "SalePrice")
# Best hyperparameter combination found.
rst$best_iteration()
## -----------------------------------------------------------------------------
# Load and preprocess the classification data (Titanic-style, target Survived).
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")
head(cla_train)

# 70/30 train/validation split on the target.
split <- createDataPartition(y = cla_train$Survived, p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]

# Encode categorical variables, fitting each encoder on the union of train
# and test values so both splits share one label mapping.
# (Loop variable renamed from `c`, which shadowed base::c() used in the body.)
for (col in c('Embarked', 'Sex', 'Cabin')) {
    lbl <- LabelEncoder$new()
    lbl$fit(c(xtrain[[col]], xtest[[col]]))
    xtrain[[col]] <- lbl$transform(xtrain[[col]])
    xtest[[col]] <- lbl$transform(xtest[[col]])
}

# Impute missing Age with the per-split median.
# (T replaced with TRUE — T is an ordinary, reassignable binding.)
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = TRUE))]

# Drop identifier-like features with no predictive value.
to_drop <- c('PassengerId', 'Ticket', 'Name')
xtrain <- xtrain[, -c(to_drop), with = FALSE]
xtest <- xtest[, -c(to_drop), with = FALSE]
## -----------------------------------------------------------------------------
# k-nearest-neighbour classification (k = 2). prob = TRUE also stores class
# probabilities so predict(type = 'prob') works.
# (T replaced with TRUE — T is an ordinary, reassignable binding.)
knn <- KNNTrainer$new(k = 2, prob = TRUE, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type = 'raw')
# AUC on the held-out split.
auc(actual = xtest$Survived, predicted = labels)
## -----------------------------------------------------------------------------
# Naive Bayes classifier on the Titanic-style data.
nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
## ----eval=FALSE---------------------------------------------------------------
# #predicts labels
# svm <- SVMTrainer$new()
# svm$fit(xtrain, 'Survived')
# pred <- svm$predict(xtest)
# auc(actual = xtest$Survived, predicted=pred)
## -----------------------------------------------------------------------------
# Logistic regression (binomial family), scored by AUC on the held-out split.
lf <- LMTrainer$new(family = "binomial")
lf$fit(X = xtrain, y = "Survived")
# Inspect coefficients and fit diagnostics.
summary(lf$model)
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)
## ----eval=FALSE---------------------------------------------------------------
# lf <- LMTrainer$new(family="binomial", alpha=1)
# lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
# pred <- lf$cv_predict(df = xtest)
# auc(actual = xtest$Survived, predicted = pred)
#
## ----eval=FALSE---------------------------------------------------------------
# lf <- LMTrainer$new(family="binomial", alpha=0)
# lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
# pred <- lf$cv_predict(df = xtest)
# auc(actual = xtest$Survived, predicted = pred)
## -----------------------------------------------------------------------------
# Random-forest classifier: 500 trees, 3 candidate features per split.
rf <- RFTrainer$new(
  n_estimators = 500,
  classification = 1,
  max_features = 3
)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
# Variable-importance table for the fitted forest.
rf$get_importance()
auc(actual = xtest$Survived, predicted = pred)
## ----eval=FALSE---------------------------------------------------------------
# xgb <- XGBTrainer$new(objective = "binary:logistic"
# , n_estimators = 500
# , eval_metric = "auc"
# , maximize = T
# , learning_rate = 0.1
# ,max_depth = 6)
# xgb$fit(X = xtrain, y = "Survived", valid = xtest)
#
# pred <- xgb$predict(xtest)
# auc(actual = xtest$Survived, predicted = pred)
## ----eval=FALSE---------------------------------------------------------------
# xgb <- XGBTrainer$new(objective="binary:logistic")
# gst <-GridSearchCV$new(trainer = xgb,
# parameters = list(n_estimators = c(10,50),
# max_depth = c(5,2)),
# n_folds = 3,
# scoring = c('accuracy','auc'))
# gst$fit(xtrain, "Survived")
# gst$best_iteration()
## -----------------------------------------------------------------------------
# Random search over random-forest hyperparameters for the classification
# task: 3 random draws, 3-fold cross-validation, accuracy and AUC scoring.
rf <- RFTrainer$new()
search_space <- list(
  n_estimators = c(10, 50),
  max_depth = c(5, 2)
)
rst <- RandomSearchCV$new(
  trainer = rf,
  parameters = search_space,
  n_folds = 3,
  scoring = c('accuracy', 'auc'),
  n_iter = 3
)
rst$fit(xtrain, "Survived")
# Best hyperparameter combination found.
rst$best_iteration()
## -----------------------------------------------------------------------------
# Add a smoothed target-encoding feature for Embarked, then refit the forest.
# Fix: the original called smoothMean twice with the same arguments — once for
# the train encoding and once for the test encoding — and the second call ran
# against an xtrain that had already been mutated with feat_01. Computing the
# encoding once removes the duplicate work and the ordering dependency.
enc <- smoothMean(train_df = xtrain,
                  test_df = xtest,
                  colname = "Embarked",
                  target = "Survived")
xtrain[, feat_01 := enc$train[[2]]]
xtest[, feat_01 := enc$test[[2]]]

# Random forest on the augmented feature set (max_features bumped to 4 to
# account for the extra column).
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
auc(actual = xtest$Survived, predicted = pred)
# (Website-footer text scraped along with the vignette; commented out so the
#  script parses.)
# Any scripts or data that you put into this service are public.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.