# output: rmarkdown::html_vignette
# vignette: >
#   %\VignetteIndexEntry{Vignette Title}
#   %\VignetteEncoding{UTF-8}
#   %\VignetteEngine{knitr::rmarkdown}
knitr::opts_chunk$set(include = FALSE, warning=FALSE, message = FALSE, echo=FALSE, fig.align='center')
library('dplyr')
library('ggplot2')
library('tidyr')
library('tibble')
library('stringr')
library('broom')
library('keanu')
#devtools::install_github("thomasp85/patchwork")
library('patchwork')

# colour palette and ggplot2 theme used for the plots in this vignette
color_scale_discrete_strong <- c("#00CBFE","#2B4141","#8AB9B5","#C8C2AE")
theme_strong <- function (rotate_xlab=FALSE)  {
  out <- ggplot2::theme_bw() +
    ggplot2::theme(title = ggplot2::element_text(size = 14, family = "SFNS Display"),
                   legend.text = ggplot2::element_text(size = 10, family = "SFNS Display"),
                   plot.title = ggplot2::element_text(size = 18, family = "SFNS Display"),
                   axis.text = ggplot2::element_text(size = 14),
                   axis.title = ggplot2::element_text(size = 16, family = "SFNS Display"),
                   text = ggplot2::element_text(family = "SFNS Display"),
                   legend.background = ggplot2::element_rect(fill = "white"),
                   panel.grid.major = ggplot2::element_blank(),
                   panel.grid.minor = ggplot2::element_blank(),
                   panel.border = ggplot2::element_rect(colour = "black", size = 2),
                   strip.background = ggplot2::element_rect(fill = "white", colour = NA),
                   strip.text.x = ggplot2::element_text(size = 14, face = "bold", colour = "black"),
                   strip.text.y = ggplot2::element_text(size = 14, face = "bold", colour = "black"))
  if (rotate_xlab)
    out <- out + theme(axis.text.x = element_text(angle = 45, hjust = 1))
  return(out)
}

# helper: append Fourier-series terms (sine/cosine pairs up to order K for each
# seasonal period) to the data, for modelling periodic predictors such as time-of-day
add_fourier <- function (.data, time_col, period, K)  {
  if (length(period) != length(K)) stop("Number of periods does not match number of orders")
  if (any(2 * K > period)) stop("K must not be greater than period/2")
  times <- .data[[time_col]]
  p <- numeric(0)
  labels <- character(0)
  for (j in seq_along(period)) {
    if (K[j] > 0) {
      p <- c(p, (1:K[j])/period[j])
      labels <- c(labels, paste(paste0(c("S", "C"), rep(1:K[j], rep(2, K[j]))), round(period[j]), sep = "-"))
    }
  }
  k <- duplicated(p)
  p <- p[!k]
  labels <- labels[!rep(k, rep(2, length(k)))]
  k <- abs(2 * p - round(2 * p)) > .Machine$double.eps
  X <- matrix(NA_real_, nrow = length(times), ncol = 2L * length(p))
  for (j in seq_along(p)) {
    if (k[j]) 
      X[, 2L * j - 1L] <- sin(2 * p[j] * times * pi)
    X[, 2L * j] <- cos(2 * p[j] * times * pi)
  }
  colnames(X) <- labels
  X <- X[, !is.na(colSums(X)), drop = FALSE]
  return( bind_cols(.data, as.data.frame(X)) )
}

One of R's strengths is its powerful formula-based interface to modelling. It saves you the headache of one-hot-encoding categorical variables, manually constructing higher-level interactions, and rewriting models just to add or drop terms. Instead, you just specify the model you want in high-level notation, and R will handle transforming it into the raw numerical model-matrix that's needed for model-fitting.
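As a minimal illustration of what that interface handles for us (using a toy data frame of my own, not this vignette's data): model.matrix() expands a high-level formula into the raw numeric design matrix, one-hot-encoding the factor and building the interaction column automatically.

# toy data frame, purely for illustration
toy <- data.frame(y = rnorm(6), x = runif(6), g = gl(2, 3, labels = c("a", "b")))
# the factor g becomes a dummy column, and x * g adds an interaction column
model.matrix(y ~ x * g, data = toy)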

Thanks to R's rich ecosystem of packages, for many situations you'll find a ready-to-use modelling solution that leverages this formula-interface. But there will inevitably be situations where out-of-the-box solutions don't fit your use-case. When that happens, you're stuck with something more time-consuming: hand-coding the underlying model-fitting, perhaps using numerical optimization (e.g., stats::optim), or maybe a language like Stan.
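For a flavour of what that hand-coded route involves, here is a hedged sketch (my own toy example, unrelated to keanu's internals): write out a negative log-likelihood by hand and minimize it with stats::optim.

# simulate some toy lognormal data (illustrative only)
toy_y <- rlnorm(100, meanlog = 1, sdlog = 0.5)
# hand-coded negative log-likelihood; the second parameter is log(sdlog) so it stays positive
neg_loglik <- function(par) -sum(dlnorm(toy_y, meanlog = par[1], sdlog = exp(par[2]), log = TRUE))
fit <- optim(par = c(0, 0), fn = neg_loglik)
c(meanlog = fit$par[1], sdlog = exp(fit$par[2]))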

We wrote keanu to make tackling situations like these painless and fruitful. It allows you to focus on the core aspects of your models, because it provides tools that handle the busywork of translating abstract model-specifications into the nuts-and-bolts of model-fitting.

Let's introduce the package with a quick example.

An Example

An online retailer is interested in better understanding and predicting the purchasing behavior of a particular segment of their customers. This segment consists of customers who visited the website once, didn't make a purchase, then later returned and made a purchase. The retailer would like to take metrics from that first visit and use them to predict the purchase-amount on the second visit.

They're particularly interested in the relationship between first-visit session-length and second-visit purchase-amount:

set.seed(22)
N <- 3500

df_return_segment <- tibble(customer_id = 1:N) %>%
  # main predictors:
  mutate(session_length = if_else(runif(n())<.5, runif(n(),0,4), 6*rbeta(n(),2,2)),
         sl_ca_slope = rnorm(n(), mean = -1.5, sd = 1.5),
         landing_page = sample(LETTERS[1:4], size = n(), replace = TRUE),
         region = sample(c("US","UK","CAN"), size=n(), replace = TRUE, prob = c(.5,.25,.25)),
         browser = sample(c("Chrome","Safari","Firefox","IE"), size=n(), replace = TRUE, prob = c(.50,.10,.20,.10)),
         client = if_else(browser %in% c("Chrome","Safari"),
                          true = sample(c("Mobile","Desktop"), size=n(), replace = TRUE),
                          false = "Desktop"),
         time_of_day = rbeta(n(),2,2)*24) %>%
  # fourier for time-of-day predictor
  add_fourier(time_col = 'time_of_day', period = 24, K=2) %>%
  rename(time_of_day_c1 = `C1-24`, time_of_day_c2 = `C2-24`, time_of_day_s1 = `S1-24`, time_of_day_s2 = `S2-24`) %>%
  mutate(checkout_amount_v2 = rlnorm(n(), meanlog = 2.0/sqrt(abs(18-time_of_day)+2), sdlog = 1) + 10*exp(sqrt(session_length) * sl_ca_slope),
         checkout_amount_v2 = if_else( checkout_amount_v2>150, runif(n(), 50,150), checkout_amount_v2),
         checkout_amount_v2 = if_else( checkout_amount_v2<1, 1+rlnorm(n()), checkout_amount_v2) )

predictors <- c('session_length', 'landing_page', 'region', 'client',
                'time_of_day_s1', 'time_of_day_s2', 'time_of_day_c1', 'time_of_day_c2')
p_session_checkout <-  ggplot(df_return_segment, aes(session_length, checkout_amount_v2)) +
  geom_point(alpha=.25) + 
  theme_strong() +
  scale_x_continuous("First-Visit Session Length (Hours)", breaks=seq(1,5,1))
p_session_checkout + 
  scale_y_continuous("Second-Visit Checkout Amount", labels = scales::dollar) +
  stat_smooth(method="lm", color=color_scale_discrete_strong[1]) +
  geom_hline(yintercept = 0)

We'd like to better understand these data. We have a handful of predictors, and we'd like to assess how they work together — rather than looking at them in isolation like in the plot above. One simple approach is multiple regression. We'll try two versions: first on the vanilla data, then after log-transforming the target-variable (since it's skewed and constrained to be positive).

# vanilla model: we're just dividing the DV by 10 so that the coefficients for the two models are 
# easier to plot on the same graph
vanilla_model <- lm(formula = checkout_amount_v2/10 ~ ., 
                    data = df_return_segment[c('checkout_amount_v2',predictors)])

# log-transformed model
log_trans_model <- lm(formula = log(checkout_amount_v2) ~ .,  
                      data = df_return_segment[c('checkout_amount_v2', predictors)])
pd <- position_dodge(.5)
bind_rows(.id = 'model', 
          `Vanilla` = broom::tidy(vanilla_model),
          `Log-Transformed` = broom::tidy(log_trans_model)) %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(x=term, y=estimate, group = model, color=model)) +
  geom_point(position = pd, size=2.5) +
  geom_linerange(aes(ymin = estimate - 1.96*std.error, ymax = estimate + 1.96*std.error), 
                 position = pd, size=1) +
  geom_hline(yintercept = 0, linetype='dashed') +
  theme_strong() +
  scale_color_manual(name="Model", values = color_scale_discrete_strong[-3]) +
  scale_x_discrete("Predictor") + scale_y_continuous("Estimate") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

bind_rows(.id = 'model', 
          `Vanilla` = broom::augment(vanilla_model),
          `Log-Transformed` = broom::augment(log_trans_model)) %>%
  ggplot(aes(x=.resid, fill = model)) +
  geom_histogram(bins=60) +
  scale_x_continuous("Residuals", labels = scales::comma) +
  scale_y_continuous("Count", labels = scales::comma) +
  theme_strong() +
  scale_fill_manual(name="Model", guide=FALSE, values = color_scale_discrete_strong[-3]) +
  facet_wrap(~model, ncol=1, scales='free') +
  coord_cartesian(expand = FALSE)

A first glance at the results suggests that the log-transformed model is probably more appropriate: its residuals look much closer to normally distributed, and normally-distributed residuals are an important assumption in regression models like these. No surprises there.

However, if we look closely, something very strange is going on across these two models. Our "session-length" predictor is flipping signs!

We already plotted session-length against checkout amount, where we could clearly see the checkout amount increasing with longer sessions. This matches the vanilla model. The log-transformed model says the opposite. Let's try re-doing that plot but with log-scaling on our y-axis:

p_session_checkout + 
  scale_y_continuous("Second-Visit Checkout Amount (Log-Scaling)", 
                     labels = scales::dollar, trans="log", breaks = c(1,5,25,100)) +
  stat_smooth(method="lm", color=color_scale_discrete_strong[1])

Once we've re-scaled the values like this, it looks like longer sessions mean smaller purchases (just like the log-transformed model said)!

What is happening here is a conflict between the mean and the median. As session-length goes up, the median checkout-amount goes down. But the variance increases, and checkout amounts are always positive, so this has an asymmetric effect: while the median goes down, the mean goes up.
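A quick numeric sketch of that asymmetry (with made-up numbers, not values fitted to these data): for a right-skewed positive distribution like the lognormal, the median is exp(mu) regardless of sigma, while the mean is exp(mu + sigma^2/2), so a growing variance drags the mean up even when the median stays put.

# median is unaffected by sigma; the mean climbs as sigma grows
mu_demo <- 2
sig_demo <- c(0.5, 1, 1.5)
rbind(median = rep(exp(mu_demo), length(sig_demo)), mean = exp(mu_demo + sig_demo^2 / 2))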

# bonus: the problem is that we're violating the homoskedasticity assumption
augment(log_trans_model) %>%
  ggplot(aes(session_length, .resid)) +
  geom_point(alpha=.10) + stat_summary_bin(fun.data=median_hilow, size=.75, breaks=seq(0,6,by=1)) +
  theme_strong() +
  scale_x_continuous("First-Visit Session Length (Hours)", breaks=seq(1,5,1)) +
  scale_y_continuous("Residuals from the Second Model")
p_session_checkout +
  stat_summary_bin(fun.y = median, geom='line', aes(color='Median'),size=2) +
  stat_summary_bin(fun.y = mean, geom='line', aes(color='Mean'),size=2) +
  coord_cartesian(ylim=c(0,75)) +
  scale_color_brewer("", palette = "Paired", direction = -1) +
  scale_y_continuous("Second-Visit Checkout Amount", labels = scales::dollar) +
  geom_hline(yintercept = 0)

This is a frustrating situation: two out-of-the-box approaches to modelling our data give conflicting, hard-to-interpret results because their assumptions are violated. Perhaps we should avoid modelling altogether and stick to descriptives and visualizations of each predictor separately? Or, in the opposite direction, we could spend time coding up a custom model. Neither option is ideal: we're looking for quick insights to drive the more time-consuming, central tasks of the project, and we don't want our kick-off analysis to get bogged down like this.

Keanu in Action

Luckily, keanu offers a better route, by letting us define a custom modelling function like lm within seconds.

Above, we pinned our issue on a conflict between the mean and median. This conflict was due to the fact that, as session-length increases, the median goes down, but the variance goes up. So we just need a model that not only captures the central-tendency of the data, but also its variance.

First, we write out the likelihood function that describes (the error in) our data. The lognormal distribution seems like a good choice, since our data was very close to normally distributed when we log-transformed it.

R provides the density for the lognormal distribution; we just wrap it in a function so that it returns the log-density by default:

lognormal_loglik_fun <- function(Y, mu, sigma) dlnorm(x = Y, meanlog = mu, sdlog = sigma, log = TRUE)

Next, we just pass this to keanu's modeller_from_loglik. This is a "function factory": it is a function that produces a function.
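If the term is unfamiliar, here is a generic toy illustration of a function factory (my own sketch, nothing to do with keanu's internals): power_maker() returns a brand-new function whose behaviour was fixed by the argument we gave the factory.

# calling the factory builds and returns a new function
power_maker <- function(exponent) function(x) x ^ exponent
square <- power_maker(2)
square(4)  # 16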

lognormal_model <- modeller_from_loglik(loglik_fun = lognormal_loglik_fun)

In this case, the function produced is lognormal_model. It acts a lot like other modelling functions in R, like lm. The biggest difference is that we pass it a list of formulas, instead of a single formula. And instead of just specifying the DV in our formulas, we also specify the parameter of the DV that's being modelled by that formula.

For example, to model the mu parameter of our checkout_amount_v2 using just session_length, we'd write:

mu(checkout_amount_v2) ~ session_length

For sigma, we'd write something similar. But since the sigma parameter must be greater than zero, we'll specify a log-link:

sigma(checkout_amount_v2, link = "log") ~ session_length
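As a hedged aside on what the log-link buys us (my own illustration, not a description of keanu's internals): the linear predictor is left unconstrained, and exponentiating it guarantees a strictly positive sigma.

# any real-valued linear predictor maps to a positive sigma
eta <- c(-2, 0, 3)
exp(eta)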

Now let's put it all together, using all predictors:

keanu_model <- lognormal_model(
  formulas =  list( mu(checkout_amount_v2) ~ .,
                    sigma(checkout_amount_v2, link='log') ~ . ),
  data = df_return_segment[c('checkout_amount_v2', predictors)]
)

And... that's it! Looking at the coefficients (I've removed all but the interesting ones to avoid clutter), we see that the model has exactly captured the paradox above. Session-length has a negative coefficient when predicting the mu parameter, but a very large positive coefficient when predicting the sigma parameter:

plot_coefficients(keanu_model, terms = vars(starts_with("time_of_day"), session_length), 
                  facet=FALSE,
                  size=1, color="steelblue3") +
  theme_strong(rotate_xlab=TRUE) +
  scale_y_continuous("Estimate") +
  scale_x_discrete("Predictor") +
  coord_flip() +
  facet_wrap(~parameter, scales="free_x") +
  scale_shape_discrete(guide=FALSE)

Since the mu parameter of the lognormal distribution corresponds to the median, this explains why the median decreased. Meanwhile, the mean (expected value) of this distribution is given by $\exp(\mu + \frac{\sigma^2}{2})$. When we plug our coefficients into this, we can see that the model correctly captures that the expected-value goes up with longer sessions:

univariate_predictions(keanu_model)$session_length %>%
  distinct() %>%
  mutate(expected_value = exp(.fitted_mu + .fitted_sigma^2/2)) %>%
  ggplot(aes(x=session_length, y=expected_value)) +
  geom_line(size=2, color=color_scale_discrete_strong[1]) +
  theme_strong() +
  scale_x_continuous("Session-Length", breaks=1:5) +
  scale_y_continuous("Expected Value of Checkout Amount", breaks = 10:15, labels = scales::dollar)

This example showcases keanu's virtues nicely. Not only does the new model fit our data better, it also provides more insight than our initial attempts: its coefficient-estimates capture the paradoxical relationship between session-length and checkout-amount.

So to recap: keanu gives us a fast and easy way of defining modelling functions; all we need to do is provide the likelihood function for our data. These functions let us do quick analyses of data that don't fit into the mold of R's out-of-the-box modelling tools.


