code/national-uptake-model.R

# Aim: model national data on cycling uptake, with newness of development as an explanatory variable

library(tidyverse)

property_age_lsoa = read_rds("property-age-lsoa.Rds")
dim(property_age_lsoa)


# Find LSOAs with high proportions of new homes
over_80pc = property_age_lsoa %>%
  filter(p2000_09 > 0.8)
dim(over_80pc)

over_60pc = property_age_lsoa %>%
  filter(p2000_09 > 0.6)
dim(over_60pc)

new_homes_2000_09 = property_age_lsoa %>%
  filter(p2000_09 > 0.9)
new_homes_2000_09$id = seq.int(nrow(new_homes_2000_09))
dim(new_homes_2000_09)
# View(new_homes_2000_09)


# Remove 20% of records for model validation
n_excluded = round(nrow(new_homes_2000_09)*0.2)
excluded = new_homes_2000_09[sample(nrow(new_homes_2000_09),n_excluded),]
included = new_homes_2000_09[! new_homes_2000_09$id %in% excluded$id,]
dim(included)

# Get OD data for all sites

# First, find out which PCT region sites are in
new_homes_2000_09$region = substr(new_homes_2000_09$Name, 1, nchar(new_homes_2000_09$Name)-5)
new_homes_2000_09 = inner_join(new_homes_2000_09, pct_regions_lookup, by = c("region" = "lad16nm"))
dim(new_homes_2000_09)
summary(factor(new_homes_2000_09$region))

# View(new_homes_2000_09)

# add pct data ------------------------------------------------------------

u = "https://github.com/npct/pct-outputs-national/raw/master/commute/lsoa/l_all.Rds"
desire_lines_lsoa_national_sp = readRDS(url(u))
l = sf::st_as_sf(desire_lines_lsoa_national_sp)

u2 = "https://github.com/npct/pct-outputs-national/raw/master/commute/lsoa/c_all.Rds"
centroids_lsoa_national_sp = readRDS(url(u2))
centroids_lsoa = sf::st_as_sf(centroids_lsoa_national_sp)
names(centroids_lsoa)
nrow(centroids_lsoa)
nrow(lsoas)

summary(centroids_lsoa$all) # this records intrazonal commutes only

u3 = "https://github.com/npct/pct-outputs-national/raw/master/commute/lsoa/z_all.Rds"
zones_lsoa_national_sp = readRDS(url(u3))
zones_lsoa = sf::st_as_sf(zones_lsoa_national_sp)
names(zones_lsoa)
nrow(zones_lsoa)
nrow(lsoas)

summary(zones_lsoa$all)

object.size(zones_lsoa) / object.size(lsoas)

# Why do many of these LSOAs have such low commuter totals??
pct_lsoa_selected_variables = centroids_lsoa %>%
  dplyr::select(geo_code, all:taxi_other, govtarget_slc, dutch_slc, lad_name)

lsoas = inner_join(pct_lsoa_selected_variables, property_age_lsoa, by = c("geo_code" = "lsoa"))
library(sf)
plot(lsoas %>% filter(lad_name == "Leeds"))
plot(lsoas %>% filter(lad_name == "Leeds") %>% select(Pre_1900:p2000_09))
plot(lsoas %>% filter(lad_name == "Leeds") %>% select(p2000_09))



# National LSOA OD data ---------------------------------------------------


#        commissioned dataset = WM12EW[CT0489]_lsoa ; save to 'pct-inputs\01_raw\02_travel_data\commute\lsoa\WM12EW[CT0489]_lsoa.zip'
# from PCT project

# unzip("~/cyipt/icicle/WM12EW[CT0489]_lsoa.zip", exdir = tempdir())
# list.files("/tmp/")
# # od_lsoa_wicid = read_csv(file = file.path(tempdir(), "WM12EW[CT0489]_lsoa.csv"))
# od_lsoa_wicid = vroom::vroom(file = file.path(tempdir(), "WM12EW[CT0489]_lsoa.csv"))
# summary(od_lsoa_wicid$AllMethods_AllSexes_Age16Plus)
# sum(od_lsoa_wicid$AllMethods_AllSexes_Age16Plus)
# od_lsoa_16_plus_10_plus = od_lsoa_wicid %>% filter(AllMethods_AllSexes_Age16Plus >= 10)
# nrow(od_lsoa_16_plus_10_plus) / nrow(od_lsoa_wicid)
# sum(od_lsoa_16_plus_10_plus$AllMethods_AllSexes_Age16Plus) / sum(od_lsoa_wicid$AllMethods_AllSexes_Age16Plus)
# saveRDS(od_lsoa_16_plus_10_plus, "od_lsoa_16_plus_10_plus.Rds")
# piggyback::pb_upload("od_lsoa_16_plus_10_plus.Rds", repo = "cyipt/icicle")
piggyback::pb_download("od_lsoa_16_plus_10_plus.Rds", repo = "cyipt/icicle")
od_lsoa_16_plus_10_plus = readRDS("od_lsoa_16_plus_10_plus.Rds") # routes used by at least 10 commuters, aged 16+
od_lsoa = od_lsoa_16_plus_10_plus %>%
  select(geocode1 = "Area of usual residence", geocode2 = "Area of Workplace", matches("AllSexes_Age16Plus"))
names(od_lsoa) = gsub(pattern = "AllSexes_Age16Plus|_", "", names(od_lsoa))
summary(rowSums(od_lsoa %>% select(WorkAtHome:OtherMethod)) == od_lsoa$AllMethods)
# od_lsoa$AllMethods = NULL

remotes::install_github("itsleeds/od")
library(od)

od_lsoa_sf = od_to_sf(od_lsoa, z = centroids_lsoa) # 433331 rows
sf::st_crs(centroids_lsoa)
sf::st_crs(od_lsoa_sf) = 4326

od_lsoas_sf_interzonal = od_lsoa_sf %>%
  filter(geocode1 != geocode2) # 410559 rows

od_lsoas_sf_interzonal = od_lsoas_sf_interzonal %>%
  filter(geocode1 %in% centroids_lsoa$geo_code) # still the same

od_lsoas_sf_interzonal = od_lsoas_sf_interzonal %>%
  filter(geocode2 %in% centroids_lsoa$geo_code) # 340595 rows (in the other 69964 NA geocode2 is a destination outside the country or other unusual place)

sum(od_lsoas_sf_interzonal$AllMethods) / sum(od_lsoa_sf$AllMethods) # 56% remain

od_lsoas_sf_interzonal$distance_euclidean = as.numeric(sf::st_length(od_lsoas_sf_interzonal))
summary(od_lsoas_sf_interzonal$distance)

# Divide into lines over and under 20km
od_lsoas_short = od_lsoas_sf_interzonal %>%
  filter(distance_euclidean < 20000) # 330567 rows

od_lsoas_long = od_lsoas_sf_interzonal %>%
  filter(distance_euclidean >= 20000) # 10028 rows

saveRDS(od_lsoas_short, "od_lsoas_short.Rds")
saveRDS(od_lsoas_long, "od_lsoas_long.Rds")

piggyback::pb_upload("od_lsoas_short.Rds")
piggyback::pb_upload("od_lsoas_long.Rds")


od_lsoas_short$rownum = as.numeric(rownames(od_lsoas_short))

# routing script ----------------------------------------------------------

od_lsoas_short = readRDS("od_lsoas_short.Rds")
dim(od_lsoas_short) # 330567 rows
remotes::install_github("ropensci/stplanr")
remotes::install_github("Robinlovelace/cyclestreets")

library(sf)
library(stplanr)
library(parallel)
library(cyclestreets)
cl = makeCluster(detectCores())
clusterExport(cl, c("journey"))
# test run:
l_test = od_lsoas_short[1:100,]
# routes_lsoa = stplanr::route(l_test, route_fun = cyclestreets::journey, cl = cl)
# names(routes_lsoa) # df variables have been lost
system.time({routes_lsoa = stplanr::route(l = l_test, route_fun = cyclestreets::journey, cl = cl, smooth_gradient = TRUE)})
names(routes_lsoa) # df variables have been lost

# N = 10
split_grouping_variable = rep(1:33100, each = 10)[1:nrow(od_lsoas_short)]
summary(split_grouping_variable)[1:10]
od_lsoas_short_list_10 = split(od_lsoas_short, split_grouping_variable)
nrow(od_lsoas_short_list_10[[1]])
head(od_lsoas_short_list_10[[1]])
# test it's the same
identical(od_lsoas_short[1:10,], od_lsoas_short_list_10[[1]])

system.time({routes_lsoa_1 = route(l = od_lsoas_short_list_10[[82]], route_fun = cyclestreets::journey, cl = cl, smooth_gradient = TRUE, output_list = TRUE, raw_output = TRUE)})

# N = 1000
# split_grouping_variable = cut(1:nrow(od_lsoas_short), breaks = N, labels = 1:N)
split_grouping_variable = rep(1:331, each = 1000)[1:nrow(od_lsoas_short)]
summary(split_grouping_variable)[1:10]
od_lsoas_short_list = split(od_lsoas_short, split_grouping_variable)
class(od_lsoas_short_list)
length(od_lsoas_short_list)
nrow(od_lsoas_short_list[[1]])
head(od_lsoas_short_list[[1]])
# test it's the same
identical(od_lsoas_short[1:1000,], od_lsoas_short_list[[1]])

system.time({routes_lsoa_1 = route(l = od_lsoas_short_list[[1]], route_fun = cyclestreets::journey, cl = cl, smooth_gradient = TRUE)})
# 70 seconds, implying ~ 20 hrs for all routes

data_dir = "od_lsoa_routes_20020-04-07-chunks-of-1000-rows"
dir.create(data_dir)
old_working_directory = setwd(data_dir)
n_chunks = length(od_lsoas_short_list)
# started at 23:05
for(i in 1:n_chunks) {
  message("Routing batch ", i, " of ", n_chunks)
  system.time({routes_lsoa_n = route(l = od_lsoas_short_list[[i]], route_fun = cyclestreets::journey, cl = cl, smooth_gradient = TRUE)})
  saveRDS(routes_lsoa_n, paste0("routes_lsoa_", i, ".Rds"))
}
route_chunks_list = lapply(1:n_chunks, {
  function(i) readRDS(paste0("routes_lsoa_", i, ".Rds"))
}
)
length(route_chunks_list)
length(route_chunks_list[1:33])
system.time({od_lsoas_short_routes = do.call(rbind, route_chunks_list[1:33])}) # time taken for 10%... ~2 minutes
system.time({od_lsoas_short_routes = data.table::rbindlist(route_chunks_list[1:33])}) # less than 1 second!
# Guess: it will take around 2*10 = 20 minutes
system.time({od_lsoas_short_routes = data.table::rbindlist(route_chunks_list)}) #

setwd(old_working_directory)

saveRDS(od_lsoas_short_routes, "od_lsoas_short_routes.Rds")
piggyback::pb_upload("od_lsoas_short_routes.Rds")



# Identify missing route data -----------------------------------------------


dim(routes_whole_nogeo) # 329464 rows - why is this less than od_lsoas_short?

od_geocodes = unique(od_lsoas_short$geocode2)
route_geocodes = unique(routes_whole_nogeo$geocode2)

od_geocodes[! od_geocodes %in% route_geocodes] # there are 120 geocode2 found in od_lsoas_short but not in routes_whole_nogeo. why is this?

missing = od_lsoas_short[! od_lsoas_short$geocode2 %in% od_lsoas_short_routes$geocode2,]
dim(missing) # there are 1066 rows in od_lsoas_short where the geocode2 didn't make it into od_lsoas_short_routes. These are normal geocodes, within the UK

unique(od_lsoas_short$geocode2[! od_lsoas_short$geocode2 %in% od_lsoas_short_routes$geocode2]) # 120
unique(od_lsoas_short$geocode1[! od_lsoas_short$geocode1 %in% od_lsoas_short_routes$geocode1]) # 0

length(unique(od_lsoas_short$geocode1))
length(unique(od_lsoas_short$geocode2))

missing2 = od_lsoas_short_routes[! od_lsoas_short_routes$geocode2 %in% routes_whole_nogeo$geocode2,]
dim(missing2) # everything from od_lsoas_short_routes made it into routes_whole_nogeo


mapview::mapview(missing) # these are all in northamptonshire or west sussex (Adur and Worthing)
summary(missing$rownum) # are they particular rows ie a section of od_lsoas_short?
View(missing$rownum)


# Add in missing route data -----------------------------------------------


routes_lsoa_219 = route(l = od_lsoas_short_list[[219]], route_fun = cyclestreets::journey, cl = cl)
length(unique(routes_lsoa_219$rownum))
saveRDS(routes_lsoa_219, "routes_lsoa_219.Rds")
routes_lsoa_219 = read_rds("routes_lsoa_219.Rds")

routes_lsoa_258 = route(l = od_lsoas_short_list[[258]], route_fun = cyclestreets::journey, cl = cl)
length(unique(routes_lsoa_258$rownum))
saveRDS(routes_lsoa_258, "routes_lsoa_258.Rds")
routes_lsoa_258 = read_rds("routes_lsoa_258.Rds")


routes_lsoa_219_tojoin = routes_lsoa_219 %>%
  filter(rownum %in% missing$rownum)
length(unique(routes_lsoa_219_tojoin$rownum))

routes_lsoa_258_tojoin = routes_lsoa_258 %>%
  filter(rownum %in% missing$rownum)
length(unique(routes_lsoa_258_tojoin$rownum))

names(od_lsoas_short_routes)
names(routes_lsoa_219_tojoin)
names(routes_lsoa_258_tojoin)

routes_lsoa_219_tojoin = routes_lsoa_219_tojoin %>%
  select(geocode1:distance_euclidean, route_number, name:finish_latitude)
routes_lsoa_258_tojoin = routes_lsoa_258_tojoin %>%
  select(geocode1:distance_euclidean, route_number, name:finish_latitude)

names(od_lsoas_short_routes)
names(routes_lsoa_219_tojoin)
names(routes_lsoa_258_tojoin)

route_list = list(od_lsoas_short_routes,routes_lsoa_219_tojoin,routes_lsoa_258_tojoin)
system.time({od_lsoas_short_routes_j = data.table::rbindlist(l = route_list)}) # less than 1 second!

still_missing = od_lsoas_short[! od_lsoas_short$geocode2 %in% od_lsoas_short_routes_j$geocode2,]
dim(still_missing) # 0

od_lsoas_short_routes_j = st_as_sf(od_lsoas_short_routes_j)

write_rds(od_lsoas_short_routes_j, "od_lsoas_short_routes_j.Rds")
piggyback::pb_upload("od_lsoas_short_routes_j.Rds")


# analysis with route data ------------------------------------------------

piggyback::pb_download("od_lsoas_short_routes_j.geojson")
od_lsoas_short_routes_j = read_rds("od_lsoas_short_routes_j.geojson")

od_short_nogeo = od_lsoas_short_routes_j %>%
  st_drop_geometry() %>%
  select(geocode1:elevations)

od_short_nogeo = od_short_nogeo %>%
  mutate(
    all = AllMethods,
    bicycle = Bicycle,
    foot = OnFoot,
    car_driver = CarOrVan,
    car_passenger = Passenger,
    motorbike = Motorcycle,
    train_tube = Underground + Train,
    bus = Bus,
    taxi_other = Taxi + OtherMethod
  )

write_rds(od_short_nogeo, "od-short-nogeo.Rds")
piggyback::pb_upload("od-short-nogeo.Rds")

# od_lsoas_short_routes$busyness = od_lsoas_short_routes$busynance / od_lsoas_short_routes$distances

# group route segments into routes
# summarise with average steepness, median, 75% percentile, 90% percentile, 95th percentile, 99th percentile, 100th percent...
system.time({routes_whole_nogeo = od_short_nogeo %>%
  group_by(geocode1, geocode2, all, bicycle, foot, car_driver, car_passenger, motorbike, train_tube, bus, taxi_other, distance_euclidean, route_number) %>%
  summarise(
    n = n(),
    average_incline = sum(abs(diff(elevations))) / sum(distances),
    median_incline = median(abs(diff(elevations))/distances),
    p75_incline = quantile(abs(diff(elevations))/distances,0.75),
    p85_incline = quantile(abs(diff(elevations))/distances,0.85),
    p90_incline = quantile(abs(diff(elevations))/distances,0.90),
    p95_incline = quantile(abs(diff(elevations))/distances,0.95),
    p99_incline = quantile(abs(diff(elevations))/distances,0.99),
    max_incline = max(abs(diff(elevations))/distances),
    distance_m = sum(distances),
    time_s = sum(time),
    mean_speed = sum(distances) / sum(time),
    min_speed = min(time / distances), # steps/road crossings/gates
    mean_busyness = sum(busynance)/sum(distances),
    p75_busyness = quantile(busynance/distances, 0.75),
    p85_busyness = quantile(busynance/distances, 0.85),
    p90_busyness = quantile(busynance/distances, 0.90),
    p95_busyness = quantile(busynance/distances, 0.95),
    p99_busyness = quantile(busynance/distances, 0.99),
    max_busyness = max(busynance/distances)
  ) %>%
  ungroup()
})

write_rds(routes_whole_nogeo, "routes-whole-nogeo.Rds")
piggyback::pb_upload("routes-whole-nogeo.Rds")

routes_whole_nogeo = read_rds(piggyback::pb_download_url("routes-whole-nogeo.Rds"))



###




# join-on housing data from origins
routes_full_data = routes_whole_nogeo %>%
  inner_join(property_age_lsoa, by = c("geocode1" = "lsoa"))


routes_full_data$pcycle = routes_full_data$bicycle / routes_full_data$all
routes_full_data$pwalk = routes_full_data$foot / routes_full_data$all
routes_full_data$pcycle[routes_full_data$pcycle < 0.001] = 0.001
routes_full_data$pwalk[routes_full_data$pwalk < 0.001] = 0.001



# The uptake model --------------------------------------------------------

# Could use AICcmodavg() for model selection
# Or use a Bayesian approach

m_national = glm(pcycle ~
                   distance_m + # d1
                   sqrt(distance_m)  +  # d2
                   average_incline +    # h1
                   distance_m * average_incline +  # i1
                   mean_busyness +
                   p2000_09,
                 data = routes_full_data,
                 family = quasibinomial(),
                 weights = all
)
summary(m_national)

m_national2 = glm(pcycle ~
                    distance_m + # d1
                    sqrt(distance_m)  +  # d2
                    average_incline +    # h1
                    distance_m * average_incline +  # i1
                    mean_busyness,
                  data = routes_full_data,
                  family = quasibinomial(),
                  weights = all
)
summary(m_national2)

m_national3 = glm(pcycle ~
                    distance_m + # d1
                    sqrt(distance_m)  +  # d2
                    distance_m^2 + # d3
                    average_incline +    # h1
                    distance_m * average_incline +  # i1
                    mean_busyness +
                    p2000_09,
                  data = routes_full_data,
                  family = quasibinomial(),
                  weights = all
)
summary(m_national3)

anova(m_national, test = "Chi")

m_national4 = glm(pcycle ~
                    distance_m + # d1
                    sqrt(distance_m)  +  # d2
                    distance_m^2 + # d3
                    average_incline +    # h1
                    distance_m * average_incline +  # i1
                    sqrt(distance_m) * average_incline + # i2
                    mean_busyness +
                    p2000_09,
                  data = routes_full_data,
                  family = quasibinomial(),
                  weights = all
)
summary(m_national4)

anova(m_national4, test = "F")


fitted.values = m_national$fitted.values

plot(routes_full_data$distance_m, fitted.values)
plot(routes_full_data$average_incline, fitted.values)
plot(routes_full_data$p2000_09, fitted.values)
cor(routes_full_data$pcycle, fitted.values)^2 # explains around 11% of variability






----------------------------


od_lsoas_short_routes = readRDS(url("https://github.com/cyipt/acton/releases/download/0.0.1/od_lsoas_short_routes.Rds"))
nrow(od_lsoas_short_routes)
nrow(od_lsoas_short_routes) / nrow(od_lsoas_short)


# summarise with average steepness, median, 75% percentile, 90% percentile, 95th percentile, 99th percentile, 100th percent...



# join-on data from origins
stopCluster(cl)
# 69964 NA where geocode2 is a destination outside the country or other unusual place
View(od_lsoas_sf_interzonal[is.na(od_lsoas_sf_interzonal$distance_euclidean),])

od_lsoas_sf_interzonal = od_lsoas_sf_interzonal %>%
  filter(is)

plot(od_lsoa_sf[1:100,])

# library(pct)
# lsoas_reg = inner_join(lsoas, pct_regions_lookup, by = c("lad_name" = "lad16nm"))
# dim(lsoas_reg)
# summary(factor(lsoas_reg$region_name))
#
# View(lsoas_reg[is.na(lsoas_reg$region_name),])


# Join with centroids from PCT to get geometry --------------------------------------------
# this joins with the geometry of the origin
od_national = inner_join(od_lsoa, lsoas, by = c("geocode1" = "geo_code"))

# destination geometry



# subset the files of interest
lsoa_line_aggregated = l %>%
  group_by()


# get high res data -------------------------------------------------------
#
# u = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/England_lsoa_2011_clipped.zip"
# u2 = "https://borders.ukdataservice.ac.uk/ukborders/easy_download/prebuilt/shape/Wales_lsoa_2011_clipped.zip"
#
# lsoas_eng = ukboundaries::duraz(u = u)
# lsoas_wales = ukboundaries::duraz(u = u2)
# lsoas = rbind(lsoas_eng,lsoas_wales)
#
# new_homes_2000_09 = new_homes_2000_09 %>%
#   inner_join(lsoas, ., by = c("code" = "lsoa"))
#
# class(new_homes_2000_09)
# summary(new_homes_2000_09)
# plot(new_homes_2000_09[1, ])
# mapview::mapview(new_homes_2000_09[1:1000, ])
cyipt/acton documentation built on Jan. 29, 2021, 7:30 a.m.