# Session options for this document.
# xaringan package:  https://github.com/yihui/xaringan
#   install with:    install.packages("xaringan")
# Further reading:
#   https://github.com/yihui/xaringan/wiki
#   https://github.com/gnab/remark/wiki/Markdown
options(
  htmltools.dir.version = FALSE,
  digits = 4
)

What is machine learning?


Algorithms autonomously learning from data.

Given data, an algorithm tunes its parameters to match the data, understand how it works, and make predictions for what will occur in the future.






Everyone uses machine learning



Machine learning drives our algorithms for demand forecasting, product search ranking, product and deals recommendations, merchandising placements, fraud detection, translations, and much more. ~ Jeff Bezos, Amazon founder





What is the basic machine learning process?


Why do we separate training from prediction?


Just because an algorithm can fit past (training) data well does not necessarily mean that it will predict new data well.




"Prediction is difficult, especially when it is about the future" ~ Niels Bohr


"An economist is an expert who will know tomorrow why the things he predicted yesterday didn't happen today." ~ Evan Esar

"A prediction about the direction of the stock market tells you nothing about where stocks are headed, but a whole lot about the person doing the predicting" ~ Warren Buffett


Training (fitting) vs. Testing (prediction)


Training (fitting) vs. Testing (prediction)


Training (fitting) vs. Testing (prediction)


Training (fitting) vs. Testing (prediction)


Why do we separate training from prediction?

# Simulate noisy observations of a known cubic function and draw three panels
# side by side: the raw data, the data with the true function ("Signal"), and
# the data with the vertical deviations from that function ("Noise").

# Fix the RNG seed so the figure shows the same data on every render
set.seed(100)

x <- seq(0, 5, length.out = 50)
noise <- rnorm(length(x), mean = 0, sd = 2.5)

# True data-generating function
model_fun <- function(x) {
  x^3 - 4 * x^2 + .5 * x + 5
}

y <- model_fun(x)   # Noise-free values
y_obs <- y + noise  # Observed values = true function + noise

# One row of three panels; keep the previous settings so they can be restored
# once the figure is complete
old_par <- par(mar = c(3, 4, 3, 1), mfrow = c(1, 3))

# Panel 1: observed data only
plot(x, y_obs, main = "Data", xlab = "", ylab = "", col = "black")

# Panel 2: observed data (greyed out) with the true function on top
plot(x, y_obs, main = "Signal", xlab = "", ylab = "", col = "lightgray")
curve(model_fun, from = 0, to = 5, add = TRUE, col = "green", lwd = 2)

# Panel 3: observed data with a segment from each point to the true function
plot(x, y_obs, main = "Noise", xlab = "", ylab = "")
curve(model_fun, from = 0, to = 5, add = TRUE, col = "darkgray", lwd = .5)
segments(x, y, x, y_obs)

# Unused alternative panels, kept for reference
# # Plot 3
# plot(x, y_obs, main = "A bad model tries to fit everything", xlab = "", ylab = "")
# curve(model_fun, from = 0, to = 5, add = TRUE, col = "darkgray", lwd = .5)
# text(.5, 20, "Hey I can draw a line through all points\nI don't have any error!", adj = 0)
# lines(x, y_obs)
# # Plot 4
# plot(x, y_obs, main = "A good model will try to focus on the signal", xlab = "", ylab = "")
# curve(model_fun, from = 0, to = 5, add = TRUE, col = "blue", lwd = 2)
# text(.5, 20, "I won't try to fit all points because\nI think there is random error", adj = 0)

# Put the graphics settings back the way they were
par(old_par)

Why do we separate training from prediction?

# Simulate noisy observations of a known cubic function and draw three panels
# side by side: the raw data, a "good" model (the true function) and a "bad"
# model (a line joining every observed point).

# Fix the RNG seed so the figure shows the same data on every render
set.seed(100)

x <- seq(0, 5, length.out = 50)
noise <- rnorm(length(x), mean = 0, sd = 2.5)

# True data-generating function
model_fun <- function(x) {
  x^3 - 4 * x^2 + .5 * x + 5
}

y <- model_fun(x)   # Noise-free values
y_obs <- y + noise  # Observed values = true function + noise

# One row of three panels; keep the previous settings so they can be restored
# once the figure is complete
old_par <- par(mar = c(3, 4, 3, 1), mfrow = c(1, 3))

# Panel 1: observed data only
plot(x, y_obs, main = "Data", xlab = "", ylab = "", col = "black")

# Panel 2: the true function drawn over the data
plot(x, y_obs, main = "Good Model", xlab = "", ylab = "", col = "darkgray")
lines(x, y, col = "blue")
text(.5, 20, "Fitting error = Medium", adj = 0)
text(.5, 15, "Prediction error = Low", adj = 0)

# Panel 3: a line that passes through every observed point
plot(x, y_obs, main = "Bad Model", xlab = "", ylab = "")
text(.5, 20, "Fitting error = None", adj = 0)
text(.5, 15, "Prediction error = High", adj = 0)
lines(x, y_obs, col = "red")

# Unused alternative panels, kept for reference
# # Plot 3
# plot(x, y_obs, main = "A bad model tries to fit everything", xlab = "", ylab = "")
# curve(model_fun, from = 0, to = 5, add = TRUE, col = "darkgray", lwd = .5)
# text(.5, 20, "Hey I can draw a line through all points\nI don't have any error!", adj = 0)
# lines(x, y_obs)
# # Plot 4
# plot(x, y_obs, main = "A good model will try to focus on the signal", xlab = "", ylab = "")
# curve(model_fun, from = 0, to = 5, add = TRUE, col = "blue", lwd = 2)
# text(.5, 20, "I won't try to fit all points because\nI think there is random error", adj = 0)

# Put the graphics settings back the way they were
par(old_par)

What machine learning algorithms are there?


| Algorithm | Complexity? |
|:------|:----|
| Regression | Low / Medium |
| Decision Trees | Low |
| Random Forests | High |
| Support Vector Machines | High |



Wikipedia lists 57 Categories of machine learning algorithms, each with dozens of examples



How do you fit and evaluate models in R?


Fitting a model

# General pattern for fitting a model; A_fun stands for whichever
# model-fitting function is used
A_model <- A_fun(formula = y ~ .,    # Variables to use
                 data = data_train,  # Dataset for model training
                 ...)                # Optional other arguments

| Argument | Description | Note |
|------:|:----|:---|
| formula | Formula indicating variables to use | y ~ . is often used as a catch-all |
| data | The dataset for model training | |
| ... | Optional other arguments | See the function help page for details |



Evaluating a model

# Typical things to do with a fitted model object

# Print generic information
A_model

# Show attributes
names(A_model)

# Print summary information
summary(A_model)

# Predict test data
predict(A_model, newdata = data_test)

# Visualize the model
plot(A_model)


Regression with glm()


In regression, the criterion is modeled as a weighted sum of the predictors, with weights $\beta_{1}$, $\beta_{2}$, ...

Example: Default on a loan

One could model the risk of defaulting on a loan as:

$$Risk = Age \times \beta_{age} + Income \times \beta_{income} + ...$$

Training a model means finding values of $\beta_{age}$ and $\beta_{income}$ that 'best' match the training data.




Create regressions using the glm() function (part of base-R)

# General form of a glm() call for regression
glm(
  formula = y ~ .,    # Formula
  data = data_train,  # Training data
  family, ...         # Optional arguments
)

# Fit a glm with risk as the criterion, using the training data
glm_mod <- glm(formula = risk ~ ., data = data_train)

# Apply the fitted glm to the test data to get predictions
glm_pred <- predict(glm_mod, newdata = data_test)


Fast-and-Frugal Trees with FFTrees()


In decision trees, the criterion is modeled as a sequence of logical Yes or No questions.

Example: Default on a loan




Create decision trees using the FFTrees package

# Load the FFTrees package
library(FFTrees)

# Main function: general form of the call
FFTrees(formula = y ~ .,
        data, ...)

# Train FFTrees model with default as the criterion
FFTrees_mod <- FFTrees(formula = default ~ .,
                       data = loan_data,
                       main = "Default Risk")

# Plot the tree
plot(FFTrees_mod)

# Predict new data with FFTrees model
# NOTE(review): the other predict() calls in this file pass `newdata =`;
# confirm which argument name the installed FFTrees version expects
FFTrees_loan_pred <- predict(FFTrees_mod,
                             data = data_test)


Advanced algorithms


Support Vector Machines with e1071::svm()

# Creating support vector machine model
# (call sketch: kernel, degree and gamma are listed by name only, no values)

svm_mod <- svm(formula = risk ~ .,
               data = loan_data,
               kernel, degree, gamma,
               ...)



Random Forests with randomForest::randomForest()

# Creating random forest model
# (call sketch: mtry, cutoff and sampsize are listed by name only, no values)

rf_mod <- randomForest(formula = risk ~ .,
                       data = loan_data,
                       mtry, cutoff, sampsize,
                       ...)


How do I do machine learning in R?


If you're really into machine learning, packages such as mlr and caret can automate much of the machine learning process.




In the practical, we will go through the basic steps "by hand" so you can see the process:

# Step 1: create the training data and the test data
data_train <- ...
data_test <- ...

# Step 2: train model A on the training data
model_A <- A_fun(formula = y ~ ., data = data_train)

# Step 3: get model A's predictions for the test data
pred_A <- predict(model_A, newdata = data_test)

# Step 4: model A's error = mean absolute difference between its
# predictions and the y column of the test data
pred_err_A <- mean(abs(pred_A - data_test$y))

# Step 5: compare to Models B, C, D...



What is the history of machine learning?

Sources: Wikipedia, Bernard Marr, "A Short History of Machine Learning", Forbes.

therbootcamp/BaselRBootcamp2017 documentation built on May 3, 2019, 10:45 p.m.