packages <- c("dplyr", "ggplot2", "ggthemes", "caret", "lubridate", "gamm4", "gridExtra", "binom", "tidyr", "knitr", "stargazer", "extrafont", "AUC") sapply(packages, library, character.only = TRUE) source("prob-var-plot.R") source("separation-plot.R")
# Load data #load('./rides.RData') #rides_final <- rides %>% # mutate(length_meters = length) rides <- read.csv('sample.csv', col.names = c("trip_id", "rider", "length_meters", "datetime", "rating", "rating_text"), colClasses = c("character", "factor", "numeric", "character", "factor", "factor")) rides_final <- rides %>% mutate(datetime = ymd_hms(datetime, tz="America/Los_Angeles"), stressful = ifelse(rating_text == "none", NA, ifelse(rating_text == "good", FALSE, TRUE)), rider = as.factor(as.numeric(rider))) ################################### ## Comment this out later!!! # Shift the dates for the test data #day(rides_final$datetime) <- day(rides_final$datetime) - 200 #rides_final$rider <- as.factor(rep(1:3, length.out = nrow(rides_final))) load('./weather_daily.RData') load('./rain_hourly.RData')
# Join in various data sources rides_final <- rides_final %>% mutate(date = floor_date(datetime, "day")) %>% left_join(weather, by = "date") %>% mutate(datetime_hour = floor_date(datetime, "hour")) %>% left_join(rain, by=c("datetime_hour" = "datetime")) rides_final <- mutate(rides_final, time = datetime) day(rides_final$time) <- 1 month(rides_final$time) <- 1 year(rides_final$time) <- 2015 rides_final <- rides_final %>% mutate(time = as.numeric(time), time = time - min(time), time = time / 3600) # convert time from seconds to hours rides_final$weekend <- as.factor(wday(rides_final$datetime) %in% c(1, 7)) # Filter out the rides with zero length rides_scaled <- rides_final %>% filter(length_meters != 0) # Scale the variables scaled_log_length <- scale(log(rides_scaled$length_meters)) scaled_length <- scale(rides_scaled$length_meters) scaled_temp <- scale(rides_scaled$mean.temp) rides_scaled$length <- scaled_length[,1] rides_scaled$log_length <- scaled_log_length[,1] rides_scaled$mean.temp <- scaled_temp[,1] rides_scaled <- rides_scaled %>% filter(!is.na(mean.temp) & !is.na(rainfall)) # Find riders with at least 20 rated rides rider_counts <- rides_scaled %>% group_by(rider) %>% summarise(count = n()) %>% filter(count > 20) rides_scaled <- rides_scaled %>% filter(rider %in% rider_counts$rider) rides_scaled <- rides_scaled %>% mutate(gust.speed = ifelse(is.na(gust.speed), 0, gust.speed)) # Filter out riders that don't rate any rides riders <- rides_scaled$rider %>% unique() riders_with_ratings <- filter(rides_scaled, !is.na(stressful))$rider %>% unique() riders_without_ratings <- riders[!riders %in% riders_with_ratings] rides_scaled <- filter(rides_scaled, !rider %in% riders_without_ratings) # Reset the rider indexes rides_scaled$rider <- rides_scaled$rider %>% as.numeric() %>% as.factor() %>% plyr::mapvalues(from = levels(.), to = 1:length(levels(.)))
some_riders <- rides_scaled$rider %>% unique() %>% sample(9) #rides_scaled <- rides_scaled %>% filter(rider %in% some_riders)
ggplot(filter(rides_scaled, !is.na(stressful)), aes(x = time, y = length_meters)) + # geom_point(alpha = 0.2) + stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) + scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12am", "6am", "12pm", "6pm", "12am")) + scale_fill_gradientn(colors = rainbow(7)) + #scale_fill_gradient(low = "white", high = "black") + theme_bw(base_family="CMU Serif") + facet_wrap(~weekend) + scale_y_log10(limits = c(50, 30000), breaks = 10^seq(2, 4, length = 5), labels = c("0.1 km", "0.32 km", "1 km", "3.2 km", "10 km")) ggplot(filter(rides_scaled, rider %in% some_riders & weekend == TRUE), aes(x = time, y = log_length)) + stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) + scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12am", "6am", "12pm", "6pm", "12am")) + scale_fill_gradientn(colors = rainbow(7)) + theme_bw(base_family="CMU Serif") + facet_wrap(~rider, nrow = 3) time_ranges <- data.frame(name = c("morning", "afternoon", "evening"), start = c(7, 11.5, 16), end = c(10, 13.5, 19.5)) time_designations <- ggplot() + scale_x_continuous(breaks = c(0, 7, 10, 11.5, 13.5, 16, 19.5, 24), labels = c("12am", "7am", "10am", "11:30am", "1:30pm", "5pm", "7:30pm", "12am")) + geom_density(data = filter(rides_scaled, weekend == FALSE), aes(x = time), fill = "grey14", color = NA) + theme_bw(base_family = "CMU Serif") + labs(title = "Weekday Time of Day Designations") + geom_rect(aes(xmin = start, xmax = end), ymin = 0, ymax = 0.14, data = time_ranges, alpha = 0.6, fill = "white") + geom_label(data = time_ranges, aes(label = name, x = (start + end) / 2), y = .02, family = "CMU Serif") + theme(axis.text.x=element_text(size=10, angle=50, vjust = 1.05, hjust = 1)) time_designations ggsave("plots/time-designations.pdf", width = 4, height = 2.5)
Feature ideas:
We need to understand how riders differ. Our result from the previous chapter underscore this need. Clearly there are different types of riders, and this accounts for a huge amount of the patterns in how riders ride. We can get for each rider
The last two are a little vague. How can we get variables that describe patterns of rides? Summary statistics like mean and variance can help, but really don't give a great description of rider patterns. How would mean time of day help us? Instead, we can transform time of day and length patterns into high dimensional varianbles and then do principle component analysis (PCA) to create summaries that best describe those distributions. We keep these as two separate PCA a distinct from other variables to keep some interprebility in this model.
I need to find a way to handle the riders without any rides on weekends. We will just give them the median value.
# Compute rider averages riders <- rides_scaled %>% group_by(rider) %>% summarise(first_date = min(datetime), last_date = max(datetime), count = n(), count_weekday = sum(weekend == FALSE), prop_weekend = sum(weekend == TRUE) / count, #count_balance = log(sum(weekend == TRUE & between(time, 0, 12)) / count + 1), #length_balance = log(sum(exp(log_length) * as.numeric(weekend == FALSE & # between(time, 0, 12))) / # sum(exp(log_length) * as.numeric(weekend == FALSE)) + 1), median_length = median(ifelse(weekend == FALSE, log_length, NA), na.rm = TRUE), median_length_weekend = median(ifelse(weekend == TRUE, log_length, NA), na.rm = TRUE), var_length = sqrt(var(log_length * as.numeric(weekend == FALSE))), var_length_weekend = sqrt(var(log_length * as.numeric(weekend == TRUE))), morning = sum(weekend == FALSE & between(time, 7, 10)) / count_weekday, afternoon = log(sum(weekend == FALSE & between(time, 11.5, 13.5)) / count_weekday + 1), evening = sum(weekend == FALSE & between(time, 16, 19.5)) / count_weekday) %>% mutate(freq = count / as.numeric(last_date - first_date), median_length_weekend = ifelse(is.na(median_length_weekend), mean(median_length_weekend, na.rm = TRUE), median_length_weekend), var_length_weekend = ifelse(var_length_weekend == 0, mean(var_length_weekend), var_length_weekend))
Okay, so here's the game plan:
riders_final <- riders # Need to standardize these variables scaled_freq <- scale(log(riders_final$freq + 1)) scaled_weekend <- scale(log(riders_final$prop_weekend + 1)) riders_final$freq <- scaled_freq[,1] riders_final$prop_weekend <- scaled_weekend[,1] #riders_final$length_balance <- scale(riders_final$length_balance)[,1] #riders_final$count_balance <- scale(riders_final$count_balance)[,1] riders_final$median_length <- scale(riders_final$median_length)[,1] riders_final$median_length_weekend <- scale(riders_final$median_length_weekend)[,1] riders_final$var_length <- scale(riders_final$var_length)[,1] riders_final$var_length_weekend <- scale(riders_final$var_length_weekend)[,1] riders_final$morning <- scale(riders_final$morning)[,1] riders_final$afternoon <- scale(riders_final$afternoon)[,1] riders_final$evening <- scale(riders_final$evening)[,1] clustering_data <- select(riders_final, -first_date, -last_date, -rider, -count, -count_weekday) test_k <- function(k) kmeans(clustering_data, k, nstart = 100)$tot.withinss ss <- sapply(1:8, test_k) choice_k <- qplot(x = 1:8, y = ss) + geom_line() + labs(title = "Total Within SS by Choice of k", x = "k", y = "Total Within SS") + theme_bw(base_family="CMU Serif") choice_k ggsave("plots/choice_k.pdf", width = 3.5, height = 2.5) set.seed(42424278) clustering <- kmeans(clustering_data, 4, nstart = 50) riders_final$cluster <- as.factor(clustering$cluster) pc <- prcomp(clustering_data) qplot(x = 1:length(pc$sdev), y = cumsum(pc$sdev^2 / sum(pc$sdev^2))) + geom_line() + theme_bw(base_family = "CMU Serif") + geom_abline(slope = 1, intercept = 0, linetype = "dotted") riders_final$PC1 <- pc$x[,1] riders_final$PC2 <- pc$x[,2] scatter1 <- ggplot(riders_final, aes(x = PC1, y = PC2, color = cluster)) + geom_point(size = .6) + theme_bw(base_family="CMU Serif") + labs(x = "PC1", y = "PC2") + theme(legend.position="top") cluster_centroids <- cbind(riders, "cluster" = riders_final$cluster) %>% group_by(cluster) %>% summarise(freq = mean(freq), prop_weekend = mean(prop_weekend)) scatter3 <- ggplot(riders_final, aes(x = riders$freq, y = riders$prop_weekend, color = cluster)) + geom_point(size = .6) + theme_bw(base_family="CMU Serif") + labs(x = "rides per day", y = "prop. during weekend") + theme(legend.position="top") + geom_point(data = cluster_centroids, aes(x = freq, y = prop_weekend), shape = 17, size = 4) + scale_x_log10() + scale_y_log10() scatter1 scatter3 ggsave("plots/cluster_scatter1.pdf", scatter1, width = 3, height = 3.5) ggsave("plots/cluster_scatter3.pdf", scatter3, width = 3, height = 3.5)
rides_scaled_w_cluster <- select(riders_final, rider, cluster) %>% right_join(rides_scaled, by = "rider") %>% mutate(weekend_named = as.factor(ifelse(weekend == TRUE, "weekend", "weekday"))) #jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", # "yellow", "#FF7F00", "red", "#7F0000")) cluster_patterns <- rides_scaled_w_cluster %>% ggplot(aes(x = time, y = length_meters)) + stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) + scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12am", "6am", "12pm", "6pm", "12am")) + facet_grid(weekend_named ~ cluster) + scale_fill_gradientn(colors = rainbow(7)) + #scale_fill_gradient(low = "white", high = "black") + theme_bw(base_family="CMU Serif") + labs(x = "time of day", y = "ride length", title = "Cluster Time and Length Patterns") + theme(axis.text.x=element_text(size=10, angle=50, vjust = 1.05, hjust = 1)) + scale_y_log10(limits = c(50, 30000), breaks = 10^seq(2, 4, length = 5), labels = c("0.1 km", "0.32 km", "1 km", "3.2 km", "10 km")) cluster_patterns ggsave("plots/cluster_patterns.pdf", cluster_patterns, width = 6, height = 4)
model0 <- gam(stressful ~ 1 + log_length + mean.temp + gust.speed + rainfall + rainfall.4h + s(time, bs="cc", k=9, by = weekend), knots=list(time=seq(0, 24, 3)), data = rides_scaled_w_cluster, family = binomial) # Model 4 from last chapter model1 <- gamm4(stressful ~ 1 + log_length + mean.temp + gust.speed + rainfall + rainfall.4h + s(time, bs="cc", k=9, by = weekend), random =~(1|rider), knots=list(time=seq(0, 24, 3)), data = rides_scaled_w_cluster, family = binomial) # Model 4 but with cluster intercepts model2 <- gamm4(stressful ~ 1 + log_length + mean.temp + gust.speed + rainfall + rainfall.4h + s(time, bs="cc", k=9, by = weekend), random =~(1|cluster), knots=list(time=seq(0, 24, 3)), data = rides_scaled_w_cluster, family = binomial)
invlogit <- function (x) { 1 / (1 + exp(-x))} # Trying to get predictions that combine smoothers and random intercept. It's hard. predict_gamm <- function(model, newdata, group_name) { intercepts <- coef(model$mer)[[group_name]][["(Intercept)"]] intercept_vector <- intercepts[match(newdata[[group_name]], rownames(coef(model$mer)[[group_name]]))] invlogit(predict(model$gam, newdata = newdata) + intercept_vector) }
fitted_values <- data.frame("actual" = rides_scaled$stressful, "predict0" = predict(model0, rides_scaled_w_cluster, type = "response"), "predict1" = predict_gamm(model1, rides_scaled_w_cluster, "rider"), "predict2" = predict_gamm(model2, rides_scaled_w_cluster, "cluster")) fitted_values <- fitted_values[complete.cases(fitted_values),] for (i in 0:2) { p <- separation_plot(fitted_values, "actual", paste("predict", as.character(i), sep = "")) ggsave(p, file=paste("plots/rider-model", as.character(i), "-sep.pdf", sep=""), width = 4, height = 0.25) } # Loglikelihood logLik(model0) logLik(model1$mer) logLik(model2$mer) # AIC AIC(model0) AIC(model1$mer) AIC(model2$mer) # AUC calc_auc <- function(model) auc(sensitivity(fitted_values[[model]], as.factor(as.numeric(fitted_values$actual)))) calc_auc("predict0") calc_auc("predict1") calc_auc("predict2")
Let's see how models in STAN do to compare
rides_complete <- rides_scaled %>% filter(!is.na(stressful)) library(rstan) rstan_options(auto_write = TRUE) #package the data for stan data = list( num_rides = length(rides_complete$stressful), num_cyclists = rides_complete$rider %>% unique() %>% length(), # Ride-level variables rating = as.numeric(rides_complete$stressful), length = as.numeric(rides_complete$log_length), mean_temp = as.numeric(rides_complete$mean.temp), wind_speed = as.numeric(rides_complete$mean.wind.speed), gust_speed = rides_complete$gust.speed, rainfall = rides_complete$rainfall, rainfall_4h = rides_complete$rainfall.4h, cyclist = as.numeric(rides_complete$rider), # Cyclist-level variables cyclist_freq = riders_final$freq, cyclist_weekend = riders_final$prop_weekend, cyclist_morning = riders_final$morning, cyclist_afternoon = riders_final$afternoon, cyclist_evening = riders_final$evening, cyclist_median_length = riders_final$median_length, cyclist_median_length_weekend = riders_final$median_length_weekend, cyclist_var_length = riders_final$var_length, cyclist_var_length_weekend = riders_final$var_length_weekend ) #compile the model (takes a minute or so) model = rstan::stan_model(file="rider_model1.stan") options(mc.cores = 2) #evaluate the model sampling_iterations = 1e3 #best to use 1e3 or higher out = rstan::sampling( object = model , data = data , chains = 2 , iter = sampling_iterations , warmup = sampling_iterations/2 , refresh = sampling_iterations/50 , seed = 1 , init = list(list(beta_length = 0, sigma_a = 1, a_beta_0 = 0, a_beta_freq = 0), list(beta_length = 0, sigma_a = 1, a_beta_0 = 0, a_beta_freq = 0)) ) print(out) save(out, file = "stan_results.RData") plot(out) library(purrr) out_e <- extract(out) out_e$beta_length compute_percentiles <- function(vec) { quants <- quantile(vec, probs = c(.025, .1, 0.9, 0.975)) list(lwr_95 = quants[1], upr_95 = quants[4], lwr_80 = quants[2], upr_80 = quants[3]) } out_e %>% map(compute_percentiles)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.