WARNING: this does not work on Windows.
This function generates the data object stats19_schema
in a reproducible way
using DfT's schema definition (see function [dl_schema()]).
The function also generates stats19_variables
(see the function's source code
for details).
read_schema = function( data_dir = tempdir(), filename = "Road-Accident-Safety-Data-Guide.xls", sheet = NULL ) { file_path = file.path(data_dir, filename) if(!file.exists(file_path)) { dl_schema() } if(is.null(sheet)) { export_variables = readxl::read_xls(path = file_path, sheet = 2, skip = 2) export_variables_collisions = tibble::tibble( table = collision, variable = export_variables$`Accident Circumstances` ) export_variables_vehicles = tibble::tibble( table = "vehicles", variable = export_variables$Vehicle ) export_variables_casualties = tibble::tibble( table = "casualties", variable = export_variables$Casualty ) export_variables_long = rbind( export_variables_collisions, export_variables_casualties, export_variables_vehicles ) stats19_variables = stats::na.omit(export_variables_long) stats19_variables$type = stats19_vtype(stats19_variables$variable) # add variable linking to names in formatted data names_acc = names(accidents_sample) names_veh = names(vehicles_sample) names_cas = names(casualties_sample) names_all = c(names_acc, names_veh, names_cas) variables_lower = schema_to_variable(stats19_variables$variable) # test result # variables_lower[!variables_lower %in% names_all] # names_all[!names_all %in% variables_lower] stats19_variables$column_name = variables_lower # head(stats19_variables) #' export result: usethis::use_data(stats19_variables, overwrite = TRUE) sel_character = stats19_variables$type == "character" character_vars = stats19_variables$variable[sel_character] character_vars = stats19_vname_switch(character_vars) schema_list = lapply( X = seq_along(character_vars), FUN = function(i) { x = readxl::read_xls(path = file_path, sheet = character_vars[i]) names(x) = c("code", "label") x } ) stats19_schema = do.call(what = rbind, args = schema_list) n_categories = vapply(schema_list, nrow, FUN.VALUE = integer(1)) stats19_schema$variable = rep(character_vars, n_categories) character_cols = stats19_variables$column_name[sel_character] stats19_schema$variable_formatted = rep(character_cols, n_categories) } else { stats19_schema = readxl::read_xls(path = file_path, sheet = sheet) } stats19_schema }
dl_schema = function(data_dir = tempdir()) { u = paste0( "https://data.dft.gov.uk/road-accidents-safety-data/", "Road-Accident-Safety-Data-Guide.xls" ) destfile = file.path(data_dir, "Road-Accident-Safety-Data-Guide.xls") utils::download.file(u, destfile = destfile) # download and unzip the data if it's not present }
schema_to_variable = function(x) { x = format_column_names(x) x = gsub(pattern = " ", replacement = "_", x = x) x = gsub(pattern = "_null_if_not_known", replacement = "", x) x = gsub(pattern = "_dd/mm/yyyy|_hh:mm", replacement = "", x) x = gsub(pattern = "highway_authority___ons_code", replacement = "highway", x) x = gsub(pattern = "lower_super_ouput_area", replacement = "lsoa", x) x = gsub(pattern = "_england_&_wales_only", replacement = "", x) x = gsub(pattern = "_cc", replacement = "", x) x = gsub(pattern = "vehicle_propulsion_code", replacement = "propulsion_code", x) x = gsub(pattern = "pedestrian_road_maintenance_worker_from_2011", replacement = "pedestrian_road_maintenance_worker", x) x = gsub(pattern = "engine_capacity", replacement = "engine_capacity_cc", x) x = gsub(pattern = "age_of_vehicle_manufacture", replacement = "age_of_vehicle", x) x }
stats19_vtype = function(x) { variable_types = rep("character", length(x)) sel_numeric = grepl(pattern = "Number|Speed|Age*.of|Capacity", x = x) variable_types[sel_numeric] = "numeric" sel_date = grepl(pattern = "^Date", x = x) variable_types[sel_date] = "date" sel_time = grepl(pattern = "^Time", x = x) variable_types[sel_time] = "time" sel_location = grepl(pattern = "^Location|Longi|Lati", x = x) variable_types[sel_location] = "location" #' remove other variables with no lookup: no weather ?! sel_other = grepl( pattern = paste0( "Did|Lower|Accident*.Ind|Reference|Restricted|", "Leaving|Hit|Age*.Band*.of*.D|Driver*.H" ), x = x ) variable_types[sel_other] = "other" variable_types } stats19_vname_switch = function(x) { x = gsub(pattern = " Authority - ONS code", "", x = x) x = gsub( pattern = "Pedestrian Crossing-Human Control", "Ped Cross - Human", x = x ) x = gsub( pattern = "Pedestrian Crossing-Physical Facilities", "Ped Cross - Physical", x = x ) x = gsub(pattern = "r Conditions", "r", x = x) x = gsub(pattern = "e Conditions", "e", x = x) x = gsub(pattern = " Area|or ", "", x = x) x = gsub(pattern = "Age Band of Casualty", "Age Band", x = x) x = gsub(pattern = "Pedestrian", "Ped", x = x) x = gsub(pattern = "Bus Coach Passenger", "Bus Passenger", x = x) x = gsub(pattern = " \\(From 2011\\)", "", x = x) x = gsub(pattern = "Casualty Home Type", "Home Area Type", x = x) x = gsub(pattern = "Casualty IMD Decile", "IMD Decile", x = x) x = gsub(pattern = "Driver IMD Decile", "IMD Decile", x = x) x = gsub(pattern = "Journey Purpose of Driver", "Journey Purpose", x = x) x }
Informal test:
variable_types = stats19_vtype(stats19_variables$variable) names(variable_types) = stats19_variables$variable variable_types x = names(get_stats19(year = 2022, type = collision, ask = FALSE)) n = stats19_vtype(x) names(n) = x n
The data bundled with the package can then be (re-)generated with
stats19_schema = read_schema() View(stats19_schema)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.