historic-uptake.md

# Aim: create predictive model of uptake of cycling following exposure to infrastructure
devtools::install_github("robinlovelace/ukboundaries")
library(tmap)
tmap_mode("view")
library(ukboundaries)
library(tidyverse)
library(stplanr)
region_name = "Bristol"

# read-in data ----
lads = readRDS("../cyipt-bigdata/boundaries/local_authority/local_authority.Rds") %>%
  st_transform(4326)
z_msoa = msoa2011_vsimple %>%
  select(geo_code = msoa11cd)
# u_flow_11 = "https://github.com/npct/pct-outputs-national/raw/master/commute/msoa/l_all.Rds"
# download.file(u_flow_11, "~/npct/l_all.Rds")
# flow_11 = readRDS("~/npct/l_all.Rds") %>%
#   st_as_sf()
flow_11 = readRDS("~/npct/pct-outputs-regional-R/commute/msoa/avon/l.Rds") %>%
  as(Class = "sf")
c_oa01 = st_read("../cyipt-inputs-official/Output_Areas_December_2001_Population_Weighted_Centroids.shp") %>%
  st_transform(4326)

## Reading layer `Output_Areas_December_2001_Population_Weighted_Centroids' from data source `/home/robin/cyipt/cyipt-inputs-official/Output_Areas_December_2001_Population_Weighted_Centroids.shp' using driver `ESRI Shapefile'
## Simple feature collection with 175434 features and 3 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: 87344.87 ymin: 10447.89 xmax: 655115.4 ymax: 654773.4
## epsg (SRID):    NA
## proj4string:    +proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +datum=OSGB36 +units=m +no_defs

aggzones = readRDS("../cyipt-bigdata/boundaries/TTWA/TTWA_England.Rds")
aggzone = filter(aggzones, ttwa11nm == region_name)
# aggzone = st_buffer(aggzones, dist = 0) # for all of UK
aggzone = flow_11 %>%
  st_transform(27700) %>%
  st_buffer(1000, 4) %>%
  st_union() %>%
  st_transform(4326)
plot(aggzone)

# subset areal data to region and aggregate msoa-cas flows ----
c_oa01 = c_oa01[aggzone, ] # get points
z = z_msoa[c_oa01, ]
cas = cas2003_simple[c_oa01, ]

# read-in and process infra data ----
sc2sd = readRDS("../cyinfdat/sc2sd") %>%
  filter(OpenDate < "2010-12-01") %>%
  mutate(OpenDate = as.character(OpenDate)) %>%
  select(date = OpenDate, on_road = OnRoad)
# all before 2011
sl2sc = readRDS("../cyinfdat/ri_04_11_dft") %>%
  select(date = BuildYear, on_road = OnRoad)
old_infra = rbind(sc2sd, sl2sc)
summary(as.factor(old_infra$on_road))

##    f    t 
## 1850  815

qtm(old_infra, lines.col = "green")

b = old_infra %>%
  st_transform(27700) %>%
  st_buffer(dist = 500, nQuadSegs = 4) %>%
  st_union() %>%
  st_transform(4326)
qtm(b)

# subset lines of interest and aggregate them to cas level
# flow_11 = flow_11[b, ]
# cas = cas[b, ]
# z = z[b, ]

summary(flow_11$geo_code1 %in% z$geo_code)

##    Mode    TRUE 
## logical    4551

f11 = select(flow_11, geo_code1, geo_code2, all, bicycle) %>%
  st_set_geometry(NULL) %>%
  filter(geo_code1 %in% z$geo_code, geo_code2 %in% z$geo_code) %>%
  filter(all > 20)
# time-consuming...
od_11 = od_aggregate(flow = f11, zones = z, aggzones = cas) %>%
  na.omit() %>%
  mutate(pcycle11 = bicycle / all) %>%
  select(o = flow_new_orig, d = flow_new_dest, all11 = all, pcycle11)

# process OD data ----
od_01 = read_csv("../cyoddata/wicid_output.csv", skip = 4, col_names = F)
names(od_01) = c("o", "d", "all", "mfh", "car", "bicycle", "foot")
sum(od_01$all, na.rm = T) # 48.5 million

## [1] 48452156

od_01$all = od_01$all - od_01$mfh
sum(od_01$all, na.rm = T) # 44 million

## [1] 43990536

od_01$mfh = NULL

cas_codes = select(cas2003_simple, ons_label, name) %>%
  st_set_geometry(NULL) %>%
  filter(!duplicated(name), name %in% od_01$o | name %in% od_01$d)
# join-on the ons labels
od_01 = inner_join(od_01, select(cas_codes, ons_label_o = ons_label, o = name))
sum(od_01$all, na.rm = T) # 44 million

## [1] 40857794

od_01 = inner_join(od_01, select(cas_codes, ons_label_d = ons_label, d = name)) # removes ~20m ppl
sum(od_01$all, na.rm = T) # 44 million

## [1] 23146182

od_01$o = od_01$ons_label_o
od_01$d = od_01$ons_label_d
od_01 = od_01 %>% mutate(pcycle01 = bicycle / all) %>%
  select(o, d, pcycle01, all01 = all)

od_01_region = od_01 %>%
  filter(o %in% cas$ons_label, d %in% cas$ons_label) # 6k results

summary(od_01_region$o %in% od_11$o) # test readiness to merge with 2011

##    Mode   FALSE    TRUE 
## logical    7747   11949

od_01_region = od_01_region %>%
  filter(all01 > 10, o %in% od_11$o, d %in% od_11$d) %>%
  na.omit()

od = inner_join(od_01_region, od_11)
od = mutate(od, p_uptake = (pcycle11 - pcycle01))

l = od2line(flow = od, cas)
plot(l$geometry) # works

l$dist = as.numeric(st_length(l))
l = filter(l, dist > 0, dist < 10000) %>%
  na.omit()
qtm(l) + qtm(b)

# testing
sum(l$all01) # 100k

## [1] 85000

sum(l$all11) # many more in 2011

## [1] 169105

sum(l$all01 * l$pcycle01) / sum(l$all01)

## [1] 0.05449412

sum(l$all11 * l$pcycle11) / sum(l$all11) # doubling in cycling in Avon in affected routes

## [1] 0.07816445

# crude measure of exposure: % of route near 1 cycle path
i = 1
l$exposure = NA
for(i in 1:nrow(l)) {
  intersection = st_intersection(l$geometry[i], b)
  if(length(intersection) > 0) {
    l$exposure[i] = st_length(intersection) /
      st_length(l$geometry[i])
  }
}
summary(l$exposure)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0008  0.0997  0.1652  0.2328  0.3010  1.0000     689

sel_na = is.na(l$exposure)
plot(l$dist, l$exposure)

l$exposure[sel_na] = 0
m = lm(p_uptake ~ dist + exposure, l, weights = all11)
summary(m)

## 
## Call:
## lm(formula = p_uptake ~ dist + exposure, data = l, weights = all11)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1158 -0.3579 -0.0184  0.2821  4.5564 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.372e-02  4.207e-03   5.639 2.28e-08 ***
## dist        -1.671e-07  8.424e-07  -0.198  0.84283    
## exposure     4.147e-02  1.523e-02   2.724  0.00658 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7972 on 906 degrees of freedom
## Multiple R-squared:  0.008318,   Adjusted R-squared:  0.006128 
## F-statistic: 3.799 on 2 and 906 DF,  p-value: 0.02274

p = (predict(m, l) + l$pcycle01) * l$all11
sum(p)

## [1] 13218

psimple = l$pcycle01 * l$all11
sum(psimple) # 4000+ more cyclists estimated

## [1] 8950.415

cor(l$all11, p)^2

## [1] 0.7145888

cor(l$all11, psimple)^2

## [1] 0.5759274

cor(l$all01 * l$pcycle01, p)

## [1] 0.5832665

cor(l$all01 * l$pcycle01, psimple)

## [1] 0.5649184

# Estimate uptake
schemes = readRDS("../cyipt-bigdata/osm-prep/Bristol/schemes.Rds")
plot(schemes$geometry[schemes$group_id == 1])

sum(st_length(schemes$geometry[schemes$group_id == 1]))

## 8943.85 m

b_scheme = schemes %>%
  st_buffer(nQuadSegs = 8, dist = 500) %>%
  group_by(group_id) %>%
  summarise(length = sum(length), costTotal = sum(costTotal)) %>%
  st_union(by_feature = T) %>%
  st_transform(4326)
summary(b_scheme)

##     group_id          length          costTotal                 geometry  
##  Min.   :  1.00   Min.   :  207.2   Min.   :    3350   POLYGON      :172  
##  1st Qu.: 43.75   1st Qu.:  479.8   1st Qu.:  189086   epsg:4326    :  0  
##  Median : 86.50   Median : 1647.0   Median :  754584   +proj=long...:  0  
##  Mean   : 88.31   Mean   : 2778.6   Mean   : 1644270                      
##  3rd Qu.:130.50   3rd Qu.: 4318.8   3rd Qu.: 2085344                      
##  Max.   :221.00   Max.   :11444.6   Max.   :12978118

# run model on modified interventions
l$scheme_number = NA
b_scheme$uptake = NA
j = 1
for(j in seq_along(b_scheme$group_id)) {
  intersection = st_intersection(l, b_scheme[j, ])
  intersection$exposure = as.numeric(st_length(intersection)) / intersection$dist
  b_scheme$uptake[j] = sum(predict(m, intersection) * intersection$all11)
}
plot(b_scheme$length, b_scheme$uptake)

sum(b_scheme$uptake) / sum(l$all11)

## [1] 0.2829218

b_scheme$bcr = b_scheme$uptake / (b_scheme$costTotal / 1000)
summary(b_scheme$bcr)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.08476  0.25673  3.44074  0.54986 92.14290

qtm(filter(b_scheme, bcr > 1)) # schemes with > 1 person cycling per £1k spend...

# communicate:
# file.copy("scripts/benefits/historic-uptake.R", "./", overwrite = T)
# knitr::spin("historic-uptake.R")
# file.remove("historic-uptake.R")
# m = lm(pcycle11 ~ pcycle01, data = od, weights = all11)
# plot(od$all11 * predict(m, od), od$all11 * od$pcycle11)
# cor(od$all11 * predict(m, od), od$all11 * od$pcycle11, use = "complete.obs")^2 # 2001 level explains 81% of cycling in 2011!
#
# # test data
# summary(as.factor(ways$cycleway.left))
# ways = remove_cycle_infra(ways)
# summary # 200 cycle paths removed
#
# # imagine all infrastructure is new...
# # lines most exposed to new infrastructure (within a 1km buffer around them)
# l_buf = geo_buffer(l, dist = 1000) # looks good
# st_crs(l_buf)
# cpaths = st_transform(cpaths, 4326)
# cpaths$length = as.numeric(st_length(cpaths))
# l$cpath_length_buff = aggregate(cpaths["length"], l_buf, sum)$length
# l$cpath_length_buff[is.na(l$cpath_length_buff)] = 0
# summary(l)
# plot(l$cpath_length_buff, l$uptake)
# cor(l$cpath_length_buff, l$uptake, use = "complete.obs")
# m1 = lm(formula = uptake ~ cpath_length_buff, data = l, weights = all11)
#
# cor(predict(m1, l) * l$all, l$cpath_length_buff)^2

# # subset those with high flows and high exposure
# l_sub = l %>% filter(all11 > median(all11)) %>%
#   filter(cpath_length_buff > median(.$cpath_length_buff))
# r = line2route(l_sub)
#
# # incongruence example
# st_crs(lads)
# st_intersects(z, lads[lads$lad16nm == "Bristol, ",])


cyipt/cyipt documentation built on Aug. 16, 2020, 10:24 p.m.