suppressPackageStartupMessages(library(rgdal)) suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(ggthemes)) suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(tidyr)) suppressPackageStartupMessages(library(lubridate)) suppressPackageStartupMessages(library(lme4)) suppressPackageStartupMessages(library(arm)) suppressPackageStartupMessages(library(caret)) suppressPackageStartupMessages(library(rstanarm)) suppressPackageStartupMessages(library(heatmapFit)) suppressPackageStartupMessages(library(stargazer)) suppressPackageStartupMessages(library(gridExtra))
As our first steps in modeling ride rating, we will start to model without route data. Instead we will focus on other question in the modeling as a start for our model:
We actually expect a fair amount of the variance in ride rating to be explained by these variables, based on tests of a smaller sample.
load('../data/rides.RData') load('../data/weather_daily.RData') load('../data/rain_hourly.RData')
total.obs <- nrow(rides) no.response <- rides %>% filter(is.na(stressful)) %>% nrow()
There are r total.obs
rides in the data set, with r no.response
(r 100 * no.response / total.obs
%) rides with no rating.
# Merging in weather data rides.final <- rides %>% mutate(date = floor_date(datetime, "day")) %>% left_join(weather, by = "date") %>% mutate(datetime_hour = floor_date(datetime, "hour")) %>% left_join(rain, by=c("datetime_hour" = "datetime"))
ggplot(rides.final, aes(x = log(length), fill = stressful)) + geom_density(alpha = 0.4) + labs(title = "Ride Rating and Log Length", xlab = "Log Length", ylab = "Density") + theme_tufte()
We also want to consider patterns with weather. We have data on daily weather, including wind speed, temperature highs and lows, and rain data. But we also have hourly rain data from a local fire station.
ggplot(rides.final, aes(x = log(rainfall.4h), fill = stressful)) + geom_density(alpha = 0.4) + labs(title="Ride Ratings and Recent Rainfall", xlab="4-Hour Cumulative Rainfall (in)", ylab="Density") + theme_tufte() ggplot(rides.final, aes(x =log(mean.wind.speed), fill = stressful)) + geom_density(alpha = 0.4, adjust = 2.5) + labs(title="Ride Ratings and Daily Windspeed", xlab="Mean Wind Speed of Day of Ride)", ylab="Density") + theme_tufte()
We would like to incorporate traffic, but to simplify our model, we may simple use time of day as a proxy.
ggplot(rides.final, aes(x = hour(datetime), fill = stressful)) + geom_density(alpha = 0.4) + labs(title = "Ride Rating and Time of Day", xlab = "Hour of Day", ylab = "Density") + theme_tufte()
# Data Prep rides.scaled <- rides.final %>% mutate(log.length = scale(log(length)), length = scale(length)) %>% filter(!is.na(stressful), !is.na(max.temp), !is.na(rain)) # Compute rider average log.length rider.avg.lengths <- rides.scaled %>% group_by(rider) %>% summarise(avg.log.length = mean(log.length, na.rm=TRUE)) rides.scaled <- rides.scaled %>% left_join(rider.avg.lengths, by="rider") # Create a scaled hour term rides.scaled <- rides.scaled %>% mutate(hour = scale(hour(datetime))) inTraining <- createDataPartition(rides.scaled$id, p = .75, list = FALSE) training <- rides.scaled[inTraining,] testing <- rides.scaled[-inTraining,]
First, we might simply try to model ride rating by modeling ride rating $Y$ as
[ Y \sim \text{Bernoulli}(p), ]
where $p$ is just the probability that a ride is rated "stressful". Essentially, what is $p$?
We also want to consider how a classical logistic regression model compares to a model with a random intercept for riders. So we will model:
[ Y = \text{logit}^{-1} \left( \alpha + \beta_1 \cdot \text{log.length} + \beta_2 \cdot \text{log.windspeed} + \beta_3 \cdot \text{log.rainfall.4h} \right). ]
Now we want to explore how we can capture variance with and between riders. So we will use the basic model
[ Y \sim \text{Bernoulli} (\text{logit}^{-1}(\alpha_{j[i]})), \quad \alpha_{j[i]} \sim \text{Normal}(\mu_\alpha, \sigma^2_\alpha). ]
Now we want to add effects based on time of day. We will try using polynomial regression to do this first, by adding to our regression the terms,
[ \beta_1 \cdot \text{hour} + \beta_2 \cdot \text{hour}^2 + \beta_3 \cdot \text{hour}^3 + \beta_4 \cdot \text{hour}^4. ]
Our last model will take the rider intercepts and day effects and add the terms we had in our first regression with variables.
# Fit each of the models model0 <- glm(stressful ~ 1, data = training, family=binomial(link="logit")) model1 <- glm(stressful ~ log.length + rainfall.4h + mean.wind.speed + 1, data = training, family=binomial(link="logit")) model2 <- glmer(stressful ~ 1 + (1|rider), data = training, family=binomial(link="logit")) model3 <- glmer(stressful ~ 1 + (1|rider) + hour + I(hour^2) + I(hour^3) + I(hour^4), data = training, family=binomial(link="logit")) model4 <- glmer(stressful ~ 1 + (1|rider) + hour + I(hour^2) + I(hour^3) + I(hour^4) + log.length + rainfall.4h + mean.wind.speed , data = training, family=binomial(link="logit"))
# Now let's make the table of coefficients stargazer(model0, model1, model2, model3, model4, title="results", align=TRUE)
separation.plot <- function(data, col.actual, col.probs) { results <- data %>% arrange_(col.probs) %>% select_(col.actual, col.probs) %>% rename_(Y = col.actual, Yhat = col.probs) expected.true = sum(results$Y) ggplot(results) + geom_rect(aes(xmin = 0, xmax = seq(length.out = length(Yhat)), ymin = 0, ymax = 1), fill = "white") + geom_linerange(aes(color = Y, ymin = 0, ymax = 1, x = seq(length.out = length(Yhat)))) + geom_line(aes(y = Yhat, x = seq(length.out = length(Yhat))), lwd = 0.8) + scale_y_continuous("Y-hat\n", breaks = c(0, 0.25, 0.5, 0.75, 1.0)) + scale_x_continuous("", breaks = NULL) + theme_linedraw() + scale_colour_grey(start=1, end=0) + geom_point(aes(y = 0, x = length(Yhat) - expected.true), shape=17) }
model.results <- data.frame(stressful = testing$stressful, pred0 = predict(model0, newdata=testing, type="response"), pred1 = predict(model1, newdata=testing, type="response"), pred2 = predict(model2, newdata=testing, type="response"), pred3 = predict(model3, newdata=testing, type="response"), pred4 = predict(model4, newdata=testing, type="response")) separation.plot(model.results, "stressful", "pred0") separation.plot(model.results, "stressful", "pred1") separation.plot(model.results, "stressful", "pred2") separation.plot(model.results, "stressful", "pred3") separation.plot(model.results, "stressful", "pred4")
heatmap.fit(model.results$stressful, model.results$pred1, reps=5) heatmap.fit(model.results$stressful, model.results$pred2, reps=5) heatmap.fit(model.results$stressful, model.results$pred3, reps=5) heatmap.fit(model.results$stressful, model.results$pred4, reps=5) #mean(testing$stressful != testing$prediction1)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.