This document reports on methods and preliminary findings associated with the modelling of cycling uptake associated with infrastructure.
The first stage is to load region-specific data. Eventually these will cover any region, e.g. as specified by the region variable and selected from an appropriate data source:
region_name = "avon" data_source = "https://github.com/npct/pct-data/raw/master/"
For the case study region of Bristol, the data is stored in the example-data
folder:
library(sf) library(tidyverse) region = st_read("areas/bristol-poly.geojson")
The input data comes from 3 main sources:
# Commented - no longer use this data source if(!file.exists("l.Rds")) { download.file(paste0(data_source, region_name, "/l.Rds"), "l.Rds", mode = "wb") download.file(paste0(data_source, region_name, "/rf.Rds"), "rf.Rds", mode = "wb") download.file(paste0(data_source, region_name, "/rq.Rds"), "rq.Rds", mode = "wb") }
if(!file.exists("l_nat.Rds")) { download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.6-beta/l_nat.Rds", destfile = "l_nat.Rds", mode = "wb") download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.7-beta/rf_nat.Rds", destfile = "rf_nat.Rds", mode = "wb") download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.7-beta/rq_nat.Rds", destfile = "rq_nat.Rds", mode = "wb") }
# This code takes the national lines and saves by region l_nat = st_as_sf(readRDS("l_nat.Rds")) rf_nat = st_as_sf(readRDS("rf_nat.Rds")) rq_nat = st_as_sf(readRDS("rq_nat.Rds")) l = l_nat[region,] l_sel = st_intersects(l_nat, region, sparse = FALSE)[,1] l = l_nat[l_sel,] rf = rf_nat[l_sel,] rq = rq_nat[l_sel,] plot(l[6]) plot(rf[1], add = T, col = "red") plot(rq[1], add = T, col = "green") plot(region[1], col = "white", lwd = 5, add = T) lfq = list(l, rf, rq) names(lfq) = c("l", "rf", "rq") saveRDS(lfq, "../example-data/bristol/lfq.Rds")
lfq = readRDS("../example-data/bristol/lfq.Rds") # plot(lfq$l[6]) # plot(lfq$rf[1], add = T, col = "red") # plot(lfq$rq[1], add = T, col = "green") # plot(region[1], col = "white", lwd = 5, add = T)
osm_data = readRDS("../example-data/bristol/osm-all-highways.Rds") ways = osm_data$osm_lines ways$osm_id = as.integer(as.character(ways$osm_id)) sel = sample(nrow(ways), 10000) plot(ways[sel, "highway"])
# Quietness quietness = readr::read_csv("input-data/scorings/bristol.csv") ways = left_join(ways, quietness, by = c("osm_id" = "id"))
# PCU pcu = readr::read_csv("trafficcounts/trafficcounts-osm.csv") ways = left_join(ways, pcu) summary(ways$aadt)
These can be visualised as follows, with a sample of 5 routes overlaid on a sample of 1000 OSM line elements:
plot(ways[sel, "quietness"]) plot(lfq$l[1:5, 6], add = T, col = "black") plot(lfq$rf[1:5, 6], add = T, col = "red", lwd = 3) plot(lfq$rq[1:5, 6], add = T, col = "green")
We can join the relevant variables from the rf
and rq
objects onto l
for modelling:
names(lfq$rf) rf = transmute(lfq$rf, dist_fast = length / 1000, time_fast = time, busyness_fast = busyness, av_incline_fast = av_incline) st_geometry(rf) <- NULL rq = transmute(lfq$rq, dist_quiet = length / 1000, time_quiet = time, busyness_quiet = busyness, av_incline_quiet = av_incline) st_geometry(rq) <- NULL l = bind_cols(lfq$l, rf, rq)
We can also join segment-level data to routes. This is done for one route as follows:
route_single = lfq$rf[1,] ways_touching = ways[route_single,] source("R/geobuffer.R") ways$length = geo_projected(ways, st_length) %>% as.numeric()
We can also join all lines that overlap (rather than just touch) the route, but this introduces a buffer whose width must be set and removes some routes that may be relevant to the junctions:
sel_overlap = st_relate(x = ways, route_single) # fails to find centrelines summary(sel_overlap) route_sp = as(object = route_single, Class = "Spatial") route_buff_sp = stplanr::buff_geo(route_sp, 100) route_buff = as(object = route_buff_sp, "sf") sel_overlap = st_within(ways_touching, route_buff, sparse = FALSE)[,1] ways_overlap = ways_touching[sel_overlap,] plot(route_single[1], lwd = 50) # plot(ways, add = T) plot(ways_overlap, add = T, lwd = 30, col = "white") plot(ways_touching, add = T, lwd = 5) plot(route_buff, add = T)
Summary statistics can be pulled from the the results as follows:
mean(ways_touching$quietness, na.rm = T) # depreciated version source("https://raw.githubusercontent.com/r-spatial/sf/752c12eae34efe734d70ebbc420b99d90cbcc5ef/R/join.R") stjoin_old = st_join rm(list = ls(pattern = "_join"))
We can run this for all routes as follows:
# for(i in 1:nrow(l)) # l$quietness[i] = mean(ways[lfq$rf[i,],]$quietness, na.rm = T) ; print(i) summary(ways$quietness) ways_no_quiet = filter(ways, !is.na(quietness)) l_joined = st_join(lfq$rf, ways_no_quiet["quietness"], FUN = mean) # old way l$quietness = l_joined$quietness # sel_intersects = st_intersects(ways_no_quiet, lfq$rf) # takes some time! # length(sel_intersects) # find mean busyness for each # head(sel_intersects) # head(unlist(sel_intersects)) ways_pcu = filter(ways, !is.na(aadt)) l_pcu = st_join(lfq$rf, ways_pcu["aadt"], FUN = mean) l$aadt = l_pcu$aadt cor(l$busyness_fast, l$quietness, use = "complete.obs") busyness_by_length = l$busyness_fast / l$dist /1000 cor(busyness_by_length, l$quietness, use = "complete.obs") cor(l$quietness, l$aadt, use = "complete.obs") # idea: use a buffer saveRDS(l, "../example-data/bristol/l_cyipt.Rds")
l = readRDS("../example-data/bristol/l_cyipt.Rds")
The results show the quietness score is related to the 'busynance' of the route provided by CycleStreets.net:
busyness_by_length = l$busyness_fast / l$dist /1000 plot(busyness_by_length, l$quietness) cor(busyness_by_length, l$quietness, use = "complete.obs")
In the propensity to cycle tool, we modelled cycling uptake in terms of pcycle
, the percentage cycling.
In this section we will estimate the number cycling directly and use inference about the impact of the route network to estimate uptake, using a wide range of variables.
names(l)
The simplest model of cycling update under this framework would be to estimate the number of cyclists as a linear function of total number travelling:
m1 = lm(formula = bicycle ~ all, data = l) summary(m1) (rmse1 = sqrt(c(crossprod(m1$residuals)) / nrow(l)))
Already, over half of the number of cyclists using roads can be modelled based on the total number of commuters alone, but we're not capturing any of the variability in the proportion cycling.
The impact of adding distance can be isolated in the linear term as follows:
m2 = lm(formula = bicycle ~ dist + all, data = l) summary(m2) (rmse2 = sqrt(c(crossprod(m2$residuals)) / nrow(l)))
Note that the fit has improved, but only very slightly. This is partly because only a linear function of distance is used and it does not interact with all
.
Let's simulate that interaction.
So we can fit an interaction term:
m3 = lm(formula = bicycle ~ all + dist:all, data = l) summary(m3) (rmse3 = sqrt(c(crossprod(m3$residuals)) / nrow(l)))
plot(l$all, l$bicycle) points(l$all, m1$fitted.values, col = "red") points(l$all, m3$fitted.values, col = "grey")
There are various issues here. We need to model cycling uptake but that is always a function of all
. We must add additional variables such as hilliness. Further, we must use non-linear function of some predictor variables such as distance.
The mission is to improve this fit to account for the impact of infrastructure so we can model cycling uptake when the road network changes.
This is where machine learning can come in.
Machine learning can provide a way to extract knowledge from the input data. An implementation is provided by the xgboost package:
st_geometry(l) = NULL # remove geometry cols library(xgboost) set.seed(2017) l_sub = select(l, bicycle, dist, all) xgboost(data = as.matrix(l_sub[-1]), label = l_sub$bicycle, nrounds = 5) train = l_sub %>% sample_frac(size = 0.5)
Note that the RMSE has been more than halved by the use of machine learning.
The method's compatibility with the predict()
function allows us to model uptake of cycling when conditions change.
As a hypothetical example, imagine that all people moved 50%.
Rather than fitting on the complete dataset, we'll build the model on a subset (train
):
m4 = xgboost(data = as.matrix(train[-1]), label = train$bicycle, nrounds = 5, max_depth = 3) m4_fitted = predict(m4, as.matrix(l_sub[-1])) plot(l$all, l$bicycle) points(l$all, m4_fitted, col = "red") (rmse4 = sqrt(c(crossprod(m4_fitted - l_sub$bicycle)) / nrow(l)))
We can query the model results to explore the feature importance:
importance_m4 = xgb.importance(model = m4, feature_names = names(l_sub)[-1]) xgb.plot.importance(importance_m4)
To specify a full model is relatively easy, building on the existing framework (and training against the full dataset):
l_full = mutate(l, bicycle, all, dist, cirquity = dist_fast / dist, qdf = dist_quiet / dist_fast, av_incline_fast, busyness_fast = busyness_fast / dist, busyness_quiet = busyness_quiet / dist, aadt) l_full = select(l_full, bicycle, all, dist, cirquity, qdf, av_incline_fast, busyness_fast, busyness_quiet, aadt) train = sample_frac(l_full, 1) m5 = xgboost(data = as.matrix(train[-1]), label = train$bicycle, nrounds = 10, max_depth = 5) importance_m5 = xgb.importance(model = m5, feature_names = names(l_full)[-1]) xgb.plot.importance(importance_m5) m5_fitted = predict(m5, as.matrix(l_full[-1])) (rmse5 = sqrt(c(crossprod(m5_fitted - l_full$bicycle)) / nrow(l))) cor(m5_fitted * l$all, l_full$bicycle)^2
We know that bicycle
is intimately related to all
. It cannot be higher and is a proportion of it. So instead, we can remove all
from the equation (the number of people travelling on a route should be independent of their propensity to cycle) and instead to fit to pcycle
:
train_df = sample_frac(l_full, 1) train = as.matrix(train_df[,-c(1, 2)]) m6 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all) importance_m6 = xgb.importance(model = m6, feature_names = names(l_full)[-c(1, 2)]) xgb.plot.importance(importance_m6) m6_fitted = predict(m6, as.matrix(l_full[-c(1, 2)])) (rmse6 = sqrt(c(crossprod(m6_fitted * l$all - l_full$bicycle)) / nrow(l))) cor(m6_fitted * l$all, l_full$bicycle)^2
This is an impressive result: more than 95% of the variability in the number of cyclists can be predicted by our model, which includes infrastructure variables such as traffic (AADT) and busynance.
On this foundation we can estimate effectiveness.
This model uses aggregated data from the OSM network which we can modify:
l_full_all = mutate(l, bicycle, all, dist, cirquity = dist_fast / dist, qdf = dist_quiet / dist_fast, av_incline_fast, quietness, aadt) l_full = select(l_full_all, bicycle, all, dist, cirquity, qdf, av_incline_fast, quietness, aadt) train_df = sample_frac(l_full, 1) train = as.matrix(train_df[,-c(1, 2)]) m7 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all) importance_m7 = xgb.importance(model = m7, feature_names = names(l_full)[-c(1, 2)]) xgb.plot.importance(importance_m7) m7_fitted = predict(m7, as.matrix(l_full[-c(1, 2)])) (rmse7 = sqrt(c(crossprod(m7_fitted * l$all - l_full$bicycle)) / nrow(l))) cor(m7_fitted * l$all, l_full$bicycle)^2
Let's take a look at the impact of the cycleway
variable:
ways_cycleway = filter(ways, !is.na(cycleway)) ways_cycleway$length_cycleway = ways_cycleway$length l_cycleway = stjoin_old(lfq$rf, ways_cycleway["length_cycleway"], FUN = sum) summary(l_cycleway$length_cycleway) l_full$length_cycleway = l_cycleway$length_cycleway train_df = sample_frac(l_full, 1) train = as.matrix(train_df[,-c(1, 2)]) m8 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all) importance_m8 = xgb.importance(model = m8, feature_names = names(l_full)[-c(1, 2)]) xgb.plot.importance(importance_m8) m8_fitted = predict(m8, as.matrix(l_full[-c(1, 2)])) (rmse8 = sqrt(c(crossprod(m8_fitted * l$all - l_full$bicycle)) / nrow(l))) cor(m8_fitted * l$all, l_full$bicycle)^2
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.