source("prob-var-plot.R") source("separation-plot.R") packages <- c("ggplot2", "gamm4", "dplyr", "knitr", "AUC", "lubridate") sapply(packages, library, character.only = TRUE) filter <- dplyr::filter
# Load data #load('./rides.RData') #rides_final <- rides %>% # mutate(length_meters = length) rides <- read.csv('sample.csv', col.names = c("trip_id", "rider", "length_meters", "datetime", "rating", "rating_text"), colClasses = c("character", "factor", "numeric", "character", "factor", "factor")) rides_final <- rides %>% mutate(datetime = ymd_hms(datetime, tz="America/Los_Angeles"), stressful = ifelse(rating_text == "none", NA, ifelse(rating_text == "good", FALSE, TRUE)), rider = as.factor(as.numeric(rider))) # Small sample for testing #rides_final <- sample_n(rides_final, 5e3) ################################### ## Comment this out later!!! # Shift the dates for the test data #day(rides_final$datetime) <- day(rides_final$datetime) - 200 #rides_final$rider <- as.factor(rep(1:3, length.out = nrow(rides_final))) load('./weather_daily.RData') load('./rain_hourly.RData')
# Join in various data sources rides_final <- rides_final %>% mutate(date = floor_date(datetime, "day")) %>% left_join(weather, by = "date") %>% mutate(datetime_hour = floor_date(datetime, "hour")) %>% left_join(rain, by=c("datetime_hour" = "datetime")) rides_final <- mutate(rides_final, time = datetime) day(rides_final$time) <- 1 month(rides_final$time) <- 1 year(rides_final$time) <- 2015 rides_final <- rides_final %>% mutate(time = as.numeric(time), time = time - min(time), time = time / 3600) # convert time from seconds to hours rides_final$weekend <- as.factor(wday(rides_final$datetime) %in% c(1, 7)) # Filter out the rides with zero length rides_scaled <- rides_final %>% dplyr::filter(length_meters != 0) # Scale the variables scaled_log_length <- scale(log(rides_scaled$length_meters)) scaled_length <- scale(rides_scaled$length_meters) scaled_temp <- scale(rides_scaled$mean.temp) rides_scaled$length <- scaled_length[,1] rides_scaled$log_length <- scaled_log_length[,1] rides_scaled$mean.temp <- scaled_temp[,1] rides_scaled <- rides_scaled %>% filter(!is.na(mean.temp) & !is.na(rainfall)) rides_scaled <- rides_scaled %>% mutate(gust.speed = ifelse(is.na(gust.speed), 0, gust.speed)) # Reset the rider indexes rides_scaled$rider <- plyr::mapvalues(rides_scaled$rider, from = levels(rides_scaled$rider), to = 1:length(levels(rides_scaled$rider)))
rides_scaled <- sample_n(rides_scaled, 10000) # Find riders with at least 20 rated rides rider_counts <- rides_scaled %>% filter(!is.na(stressful)) %>% group_by(rider) %>% summarise(count = n()) %>% filter(count > 20) rides_scaled <- rides_scaled %>% filter(rider %in% rider_counts$rider) rides_scaled$r <- is.na(rides_scaled$stressful) qplot(x = log_length, y = r, data = rides_scaled) ggplot(aes(x = r, y = mean.temp), data = rides_scaled) + geom_boxplot() ggplot(aes(x = r, y = log_length), data = rides_scaled) + geom_boxplot() #ggplot(aes(x = r, y = ), data = rides_scaled) + geom_boxplot() ggplot(rides_scaled, aes(x = time, y = log_length)) + stat_density_2d(geom = "raster", aes(fill = ..density..), contour = FALSE) + scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12am", "6am", "12pm", "6pm", "12am")) + facet_wrap(~r) + scale_fill_gradientn(colors = rainbow(7)) + #scale_fill_gradient(low = "white", high = "black") + theme_bw(base_family="CMU Serif") + labs(x = "time of day", y = "log(length)", title = "Cluster Time and Length Patterns") + theme(axis.text.x=element_text(size=10, angle=50, vjust = 1.05, hjust = 1))
invlogit <- function (x) { 1 / (1 + exp(-x))} # Trying to get predictions that combine smoothers and random intercept. It's hard. predict_gamm <- function(model, newdata) { intercepts <- coef(model$mer)$rider[["(Intercept)"]] intercept_vector <- intercepts[match(newdata$rider, rownames(coef(model$mer)$rider))] predict(model$gam, newdata = newdata) %>% is.na() %>% sum() %>% print() intercept_vector %>% is.na() %>% sum() %>% print() invlogit(predict(model$gam, newdata = newdata) + intercept_vector) }
fit_y <- function(data, weights = NULL) { gamm4(stressful ~ 1 + log_length + mean.temp + gust.speed + mean.wind.speed + rainfall + rainfall.4h + s(time, bs="cc", k=9, by = weekend), random =~(1|rider), knots=list(time=seq(0, 24, 3)), data = data, family = binomial, weights = weights) } fit_r <- function(data, weights = NULL) { gam(r ~ 1 + log_length + mean.temp + gust.speed + rainfall + rainfall.4h + mean.wind.speed + pred_y + s(time, bs = "cc", k=9, by = weekend), knots=list(time=seq(0, 24, 3)), data = data, family = binomial, weights = weights) } # Get initial estimates of parameters model_y <- fit_y(rides_scaled) rides_scaled$pred_y <- predict_gamm(model_y, rides_scaled) model_r <- fit_r(rides_scaled) init_model_y <- model_y init_model_r <- model_r summary(model_r) summary(model_y$gam) # Setup data frames rides_complete <- rides_scaled %>% tbl_df() %>% filter(!r) %>% mutate(weight = 1) rides_missing <- rides_scaled %>% tbl_df() %>% filter(r) %>% mutate(weight = NA) last_aic <- AIC(model_y$mer) for (i in 1:1000) { # get prob of 1 pred_y <- predict_gamm(model_y, rides_missing) # get prob missing given y pred_r_y1 <- predict(model_r, newdata = mutate(rides_missing, pred_y = 1), type="response") pred_r_y0 <- predict(model_r, newdata = mutate(rides_missing, pred_y = 0), type="response") # Make weights denom <- (pred_y * pred_r_y1) + ((1-pred_y) * pred_r_y0) w_y1 <- pred_y * pred_r_y1 / denom w_y2 <- (1-pred_y) * pred_r_y0 / denom # print(pred) df_augmented <- bind_rows(rides_complete, mutate(rides_missing, weight = w_y1, stressful = TRUE), mutate(rides_missing, weight = w_y2, stressful = FALSE)) %>% data.frame() model_y <- fit_y(df_augmented, weights = df_augmented$weight) model_r <- fit_r(df_augmented, weights = df_augmented$weight) current_aic <- AIC(model_y$mer) print(last_aic - current_aic) if ((i > 1) && (last_aic - current_aic < 0.01)) { break } last_aic <- current_aic } new_aic <- AIC(model_y$mer) summary(init_model_y$gam) summary(model_y$gam) plot(init_model_y$gam) plot(model_y$gam) summary(init_model_r) summary(model_r) plot(init_model_r) plot(model_r) save(init_model_y, init_model_r, model_y, model_r, file = "missing_data_models.RData")
The new AIC is r new_aic
. Impressive!
rides_complete <- rides_scaled %>% filter(!is.na(stressful)) predictions <- data.frame("actual" = rides_complete$stressful, "pred" = predict_gamm(model_y, rides_complete)) sep_plot <- separation_plot(predictions, "actual", "pred") sep_plot ggsave(sep_plot, file = "plots/em-separation-plot.pdf", width = 4, height = 0.25) kable(confint(model_y$gam)) #kable(confint(model_r)) #kable(confint(init_model_r)) ## Compute AUC actual <- rides_complete$stressful %>% as.numeric() %>% as.factor() pred <- predict_gamm(model_y, rides_complete) auc(sensitivity(pred, actual))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.