Introduction.R
In pipeliner: Machine Learning Pipelines for R

## ---- eval=FALSE, include=TRUE-------------------------------------------
#  devtools::install_github("alexioannides/pipeliner")

## ---- eval=FALSE, include=TRUE-------------------------------------------
#  transform_features(function(df) {
#    data.frame(x1 = log(df$var1))
#  })

## ---- eval=FALSE, include=TRUE-------------------------------------------
#  transform_response(function(df) {
#    data.frame(y = log(df$response))
#  })

## ---- eval=FALSE, include=TRUE-------------------------------------------
#  estimate_model(function(df) {
#    lm(y ~ 1 + x1, df)
#  })

## ---- eval=FALSE, include=TRUE-------------------------------------------
#  inv_transform_response(function(df) {
#    data.frame(pred_response = exp(df$pred_y))
#  })

## ------------------------------------------------------------------------
library(pipeliner)

data <- faithful

lm_pipeline <- pipeline(
  data,
  
  transform_features(function(df) { 
    data.frame(x1 = (df$waiting - mean(df$waiting)) / sd(df$waiting))
  }),
  
  transform_response(function(df) {
    data.frame(y = (df$eruptions - mean(df$eruptions)) / sd(df$eruptions))
  }),
  
  estimate_model(function(df) { 
    lm(y ~ 1 + x1, df)
  }),
  
  inv_transform_response(function(df) { 
    data.frame(pred_eruptions = df$pred_model * sd(df$eruptions) + mean(df$eruptions))
  })
)

in_sample_predictions <- predict(lm_pipeline, data, verbose = TRUE)  
head(in_sample_predictions)

## ------------------------------------------------------------------------
summary(lm_pipeline$inner_model)

## ------------------------------------------------------------------------
pred_function <- lm_pipeline$predict 
predictions <- pred_function(data, verbose = FALSE)

head(predictions)

## ---- warning=FALSE, message=FALSE---------------------------------------
library(tidyverse)

lm_pipeline <- data %>% 
  pipeline(
    transform_features(function(df) { 
      transmute(df, x1 = (waiting - mean(waiting)) / sd(waiting))
    }),
    
    transform_response(function(df) {
      transmute(df, y = (eruptions - mean(eruptions)) / sd(eruptions))
    }),
    
    estimate_model(function(df) { 
      lm(y ~ 1 + x1, df)
    }),
    
    inv_transform_response(function(df) { 
      transmute(df, pred_eruptions = pred_model * sd(eruptions) + mean(eruptions))
    })
  )

head(predict(lm_pipeline, data))

## ------------------------------------------------------------------------
library(modelr)

# define a function that estimates a machine learning pipeline on a single fold of the data
pipeline_func <- function(df) {
  pipeline(
    df,
    transform_features(function(df) {
      transmute(df, x1 = (waiting - mean(waiting)) / sd(waiting))
    }),

    transform_response(function(df) {
      transmute(df, y = (eruptions - mean(eruptions)) / sd(eruptions))
    }),

    estimate_model(function(df) {
      lm(y ~ 1 + x1, df)
    }),

    inv_transform_response(function(df) {
      transmute(df, pred_eruptions = pred_model * sd(eruptions) + mean(eruptions))
    })
  )
}

# 5-fold cross-validation using machine learning pipelines
cv_rmse <- crossv_kfold(data, 5) %>% 
  mutate(model = map(train, ~ pipeline_func(as.data.frame(.x))),
         predictions = map2(model, test, ~ predict(.x, as.data.frame(.y))),
         residuals = map2(predictions, test, ~ .x - as.data.frame(.y)$eruptions),
         rmse = map_dbl(residuals, ~ sqrt(mean(.x ^ 2)))) %>% 
  summarise(mean_rmse = mean(rmse), sd_rmse = sd(rmse))

cv_rmse