In cyipt/cyipt: Cycling Infrastructure Prioritisation Toolkit

This document reports on methods and preliminary findings associated with the modelling of cycling uptake associated with infrastructure.

Input data

The first stage is to load region-specific data. Eventually these will cover any region, e.g. as specified by the region variable and selected from an appropriate data source:

region_name = "avon"
data_source = "https://github.com/npct/pct-data/raw/master/"

For the case study region of Bristol, the data is stored in the example-data folder:

library(sf)
library(tidyverse)
region = st_read("areas/bristol-poly.geojson")

The input data comes from 3 main sources:

Outputs from the PCT, which reports current cycling levels and estimated 'fastest routes' for cyclists. After combining the straight lines, quietest routes and fastest routes into a single list object, they can be loaded as follows:

# Commented - no longer use this data source
if(!file.exists("l.Rds")) {
  download.file(paste0(data_source, region_name, "/l.Rds"), "l.Rds", mode = "wb")
  download.file(paste0(data_source, region_name, "/rf.Rds"), "rf.Rds", mode = "wb")
  download.file(paste0(data_source, region_name, "/rq.Rds"), "rq.Rds", mode = "wb")
}

if(!file.exists("l_nat.Rds")) {
  download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.6-beta/l_nat.Rds", destfile = "l_nat.Rds", mode = "wb")
  download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.7-beta/rf_nat.Rds", destfile = "rf_nat.Rds", mode = "wb")
  download.file(url = "https://github.com/npct/pct-bigdata/releases/download/1.7-beta/rq_nat.Rds", destfile = "rq_nat.Rds", mode = "wb")
}

# This code takes the national lines and saves by region
l_nat = st_as_sf(readRDS("l_nat.Rds"))
rf_nat = st_as_sf(readRDS("rf_nat.Rds"))
rq_nat = st_as_sf(readRDS("rq_nat.Rds"))
l = l_nat[region,]
l_sel = st_intersects(l_nat, region, sparse = FALSE)[,1]
l = l_nat[l_sel,]
rf = rf_nat[l_sel,]
rq = rq_nat[l_sel,]
plot(l[6])
plot(rf[1], add = T, col = "red")
plot(rq[1], add = T, col = "green")
plot(region[1], col = "white", lwd = 5, add = T)
lfq = list(l, rf, rq)
names(lfq) = c("l", "rf", "rq")
saveRDS(lfq, "../example-data/bristol/lfq.Rds")

lfq = readRDS("../example-data/bristol/lfq.Rds")
# plot(lfq$l[6])
# plot(lfq$rf[1], add = T, col = "red")
# plot(lfq$rq[1], add = T, col = "green")
# plot(region[1], col = "white", lwd = 5, add = T)

OSM data, which can be downloaded using the osmdata R package. We have saved the osm route network already:

osm_data = readRDS("../example-data/bristol/osm-all-highways.Rds")
ways = osm_data$osm_lines
ways$osm_id = as.integer(as.character(ways$osm_id))
sel = sample(nrow(ways), 10000)
plot(ways[sel, "highway"])

Data created by the CyIPT project, which provides data on the current road network from the perspective of cycling, including:

# Quietness
quietness = readr::read_csv("input-data/scorings/bristol.csv")
ways = left_join(ways, quietness, by = c("osm_id" = "id"))

# PCU
pcu = readr::read_csv("trafficcounts/trafficcounts-osm.csv")
ways = left_join(ways, pcu)
summary(ways$aadt)

These can be visualised as follows, with a sample of 5 routes overlaid on a sample of 1000 OSM line elements:

plot(ways[sel, "quietness"])
plot(lfq$l[1:5, 6], add = T, col = "black")
plot(lfq$rf[1:5, 6], add = T, col = "red", lwd = 3)
plot(lfq$rq[1:5, 6], add = T, col = "green")

We can join the relevant variables from the rf and rq objects onto l for modelling:

names(lfq$rf)
rf = transmute(lfq$rf, dist_fast = length / 1000, time_fast = time, busyness_fast = busyness, av_incline_fast = av_incline) 
st_geometry(rf) <- NULL

rq = transmute(lfq$rq, dist_quiet = length / 1000, time_quiet = time, busyness_quiet = busyness, av_incline_quiet = av_incline) 
st_geometry(rq) <- NULL
l = bind_cols(lfq$l, rf, rq)

We can also join segment-level data to routes. This is done for one route as follows:

route_single = lfq$rf[1,]
ways_touching = ways[route_single,]
source("R/geobuffer.R")
ways$length = geo_projected(ways, st_length) %>% as.numeric()

We can also join all lines that overlap (rather than just touch) the route, but this introduces a buffer whose width must be set and removes some routes that may be relevant to the junctions:

sel_overlap = st_relate(x = ways, route_single) # fails to find centrelines
summary(sel_overlap)

route_sp = as(object = route_single, Class = "Spatial")
route_buff_sp = stplanr::buff_geo(route_sp, 100)
route_buff = as(object = route_buff_sp, "sf")
sel_overlap = st_within(ways_touching, route_buff, sparse = FALSE)[,1]
ways_overlap = ways_touching[sel_overlap,]

plot(route_single[1], lwd = 50)
# plot(ways, add = T)
plot(ways_overlap, add = T, lwd = 30, col = "white")
plot(ways_touching, add = T, lwd = 5)
plot(route_buff, add = T)

Summary statistics can be pulled from the the results as follows:

mean(ways_touching$quietness, na.rm = T)
# depreciated version
source("https://raw.githubusercontent.com/r-spatial/sf/752c12eae34efe734d70ebbc420b99d90cbcc5ef/R/join.R")
stjoin_old = st_join
rm(list = ls(pattern = "_join"))

We can run this for all routes as follows:

# for(i in 1:nrow(l))
#   l$quietness[i] = mean(ways[lfq$rf[i,],]$quietness, na.rm = T) ; print(i)
summary(ways$quietness)
ways_no_quiet = filter(ways, !is.na(quietness))
l_joined = st_join(lfq$rf, ways_no_quiet["quietness"], FUN = mean) # old way
l$quietness = l_joined$quietness
# sel_intersects = st_intersects(ways_no_quiet, lfq$rf) # takes some time!
# length(sel_intersects)
# find mean busyness for each 
# head(sel_intersects)
# head(unlist(sel_intersects))
ways_pcu = filter(ways, !is.na(aadt))
l_pcu = st_join(lfq$rf, ways_pcu["aadt"], FUN = mean)
l$aadt = l_pcu$aadt

cor(l$busyness_fast, l$quietness, use = "complete.obs")
busyness_by_length = l$busyness_fast / l$dist /1000
cor(busyness_by_length, l$quietness, use = "complete.obs")
cor(l$quietness, l$aadt, use = "complete.obs")
# idea: use a buffer
saveRDS(l, "../example-data/bristol/l_cyipt.Rds")

l = readRDS("../example-data/bristol/l_cyipt.Rds")

The results show the quietness score is related to the 'busynance' of the route provided by CycleStreets.net:

busyness_by_length = l$busyness_fast / l$dist /1000
plot(busyness_by_length, l$quietness)
cor(busyness_by_length, l$quietness, use = "complete.obs")

Modelling raw cyclist counts

In the propensity to cycle tool, we modelled cycling uptake in terms of pcycle, the percentage cycling.

In this section we will estimate the number cycling directly and use inference about the impact of the route network to estimate uptake, using a wide range of variables.

names(l)

The simplest model of cycling update under this framework would be to estimate the number of cyclists as a linear function of total number travelling:

m1 = lm(formula = bicycle ~ all, data = l)
summary(m1)
(rmse1 = sqrt(c(crossprod(m1$residuals)) / nrow(l)))

Already, over half of the number of cyclists using roads can be modelled based on the total number of commuters alone, but we're not capturing any of the variability in the proportion cycling.

The impact of adding distance can be isolated in the linear term as follows:

m2 = lm(formula = bicycle ~ dist + all, data = l)
summary(m2)
(rmse2 = sqrt(c(crossprod(m2$residuals)) / nrow(l)))

Note that the fit has improved, but only very slightly. This is partly because only a linear function of distance is used and it does not interact with all. Let's simulate that interaction.

So we can fit an interaction term:

m3 = lm(formula = bicycle ~ all + dist:all, data = l)
summary(m3)
(rmse3 = sqrt(c(crossprod(m3$residuals)) / nrow(l)))

plot(l$all, l$bicycle)
points(l$all, m1$fitted.values, col = "red")
points(l$all, m3$fitted.values, col = "grey")

There are various issues here. We need to model cycling uptake but that is always a function of all. We must add additional variables such as hilliness. Further, we must use non-linear function of some predictor variables such as distance.

The mission is to improve this fit to account for the impact of infrastructure so we can model cycling uptake when the road network changes.

This is where machine learning can come in.

Boosted regression trees

Machine learning can provide a way to extract knowledge from the input data. An implementation is provided by the xgboost package:

st_geometry(l) = NULL # remove geometry cols
library(xgboost)
set.seed(2017)
l_sub = select(l, bicycle, dist, all)
xgboost(data = as.matrix(l_sub[-1]), label = l_sub$bicycle, nrounds = 5)
train = l_sub %>% 
    sample_frac(size = 0.5)

Note that the RMSE has been more than halved by the use of machine learning. The method's compatibility with the predict() function allows us to model uptake of cycling when conditions change. As a hypothetical example, imagine that all people moved 50%.

Rather than fitting on the complete dataset, we'll build the model on a subset (train):

m4 = xgboost(data = as.matrix(train[-1]), label = train$bicycle, nrounds = 5, max_depth = 3)
m4_fitted = predict(m4, as.matrix(l_sub[-1]))
plot(l$all, l$bicycle)
points(l$all, m4_fitted, col = "red")
(rmse4 = sqrt(c(crossprod(m4_fitted - l_sub$bicycle)) / nrow(l)))

We can query the model results to explore the feature importance:

importance_m4 = xgb.importance(model = m4, feature_names = names(l_sub)[-1])
xgb.plot.importance(importance_m4)

A full model

To specify a full model is relatively easy, building on the existing framework (and training against the full dataset):

l_full = mutate(l, bicycle, all, dist, cirquity = dist_fast / dist, qdf = dist_quiet / dist_fast, av_incline_fast, busyness_fast = busyness_fast / dist, busyness_quiet = busyness_quiet / dist, aadt)
l_full = select(l_full, bicycle, all, dist, cirquity, qdf, av_incline_fast, busyness_fast, busyness_quiet, aadt)
train = sample_frac(l_full, 1)
m5 = xgboost(data = as.matrix(train[-1]), label = train$bicycle, nrounds = 10, max_depth = 5)
importance_m5 = xgb.importance(model = m5, feature_names = names(l_full)[-1])
xgb.plot.importance(importance_m5)
m5_fitted = predict(m5, as.matrix(l_full[-1]))
(rmse5 = sqrt(c(crossprod(m5_fitted - l_full$bicycle)) / nrow(l)))
cor(m5_fitted * l$all, l_full$bicycle)^2

Fitting to the proportion cycling

We know that bicycle is intimately related to all. It cannot be higher and is a proportion of it. So instead, we can remove all from the equation (the number of people travelling on a route should be independent of their propensity to cycle) and instead to fit to pcycle:

train_df = sample_frac(l_full, 1)
train = as.matrix(train_df[,-c(1, 2)])
m6 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all)
importance_m6 = xgb.importance(model = m6, feature_names = names(l_full)[-c(1, 2)])
xgb.plot.importance(importance_m6)
m6_fitted = predict(m6, as.matrix(l_full[-c(1, 2)]))
(rmse6 = sqrt(c(crossprod(m6_fitted * l$all  - l_full$bicycle)) / nrow(l)))
cor(m6_fitted * l$all, l_full$bicycle)^2

This is an impressive result: more than 95% of the variability in the number of cyclists can be predicted by our model, which includes infrastructure variables such as traffic (AADT) and busynance.

On this foundation we can estimate effectiveness.

Geographically inferred quietness

This model uses aggregated data from the OSM network which we can modify:

l_full_all = mutate(l, bicycle, all, dist, cirquity = dist_fast / dist, qdf = dist_quiet / dist_fast, av_incline_fast, quietness, aadt)
l_full = select(l_full_all, bicycle, all, dist, cirquity, qdf, av_incline_fast, quietness, aadt)

train_df = sample_frac(l_full, 1) 
train = as.matrix(train_df[,-c(1, 2)])
m7 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all)
importance_m7 = xgb.importance(model = m7, feature_names = names(l_full)[-c(1, 2)])
xgb.plot.importance(importance_m7)
m7_fitted = predict(m7, as.matrix(l_full[-c(1, 2)]))
(rmse7 = sqrt(c(crossprod(m7_fitted * l$all  - l_full$bicycle)) / nrow(l)))
cor(m7_fitted * l$all, l_full$bicycle)^2

Let's take a look at the impact of the cycleway variable:

ways_cycleway = filter(ways, !is.na(cycleway))
ways_cycleway$length_cycleway = ways_cycleway$length
l_cycleway = stjoin_old(lfq$rf, ways_cycleway["length_cycleway"], FUN = sum)
summary(l_cycleway$length_cycleway)
l_full$length_cycleway = l_cycleway$length_cycleway

train_df = sample_frac(l_full, 1) 
train = as.matrix(train_df[,-c(1, 2)])
m8 = xgboost(data = train, label = train_df$bicycle / train_df$all, nrounds = 10, max_depth = 5, weight = train_df$all)
importance_m8 = xgb.importance(model = m8, feature_names = names(l_full)[-c(1, 2)])
xgb.plot.importance(importance_m8)
m8_fitted = predict(m8, as.matrix(l_full[-c(1, 2)]))
(rmse8 = sqrt(c(crossprod(m8_fitted * l$all  - l_full$bicycle)) / nrow(l)))
cor(m8_fitted * l$all, l_full$bicycle)^2