.dev/script/new-traffic-data_20210425.R

library(tidyverse)
library(readxl)
library(here)
library(hkdatasets)
library(dataCompareR)

# load data ---------------------------------------------------------------

hk_accidents_new <-
  read_xlsx(
    here("data-raw",
         "FurtherData_Accident20142019_joined_210201.xlsx"))

hk_casualties_new <-
  read_xlsx(
    here("data-raw",
         "FurtherData_Casualty20142019_Joined_201216.xlsx"))

hk_vehicles_new <-
  read_xlsx(
    here("data-raw",
         "FurtherData_Vehicle20142019_Joined_201216.xlsx"))

# labels for matching up to new data
# read each individual data frame to global environment
code_sheets_path <-
  here("data-raw",
       "reformatted_Code table_v1.xlsx")

code_sheets <- excel_sheets(code_sheets_path)

code_sheets %>%
  purrr::map(function(sheet){ # iterate through each sheet name
    assign(x = paste0("t_", sheet), # prefixed variable names
           value = readxl::read_xlsx(path = code_sheets_path, sheet = sheet),
           envir = .GlobalEnv)
  })

# accidents check ---------------------------------------------------------

hk_accidents_new %>% glimpse()
hkdatasets::hk_accidents %>% glimpse()

setdiff(names(hk_accidents_new), names(hkdatasets::hk_accidents))
setdiff(names(hkdatasets::hk_accidents), names(hk_accidents_new))


# casualties check --------------------------------------------------------

hk_casualties_new %>% glimpse()
hkdatasets::hk_casualties %>% glimpse()

setdiff(names(hk_casualties_new), names(hkdatasets::hk_casualties))
setdiff(names(hkdatasets::hk_casualties), names(hk_casualties_new))


# vehicles check ----------------------------------------------------------

hk_vehicles_new %>% glimpse()
hkdatasets::hk_vehicles %>% glimpse()

setdiff(names(hk_vehicles_new), names(hkdatasets::hk_vehicles))
setdiff(names(hkdatasets::hk_vehicles), names(hk_vehicles_new))

# dataCompareR ------------------------------------------------------------

# hk_accidents
compare_acc <-
  dataCompareR::rCompare(
    dfA = hk_accidents,
    dfB = hk_accidents_new
  )

compare_acc %>%
  saveReport(
    paste("Comparison of hk_accidents old and new", wpa::tstamp())
  )

# hk_casualties
compare_cas <-
  dataCompareR::rCompare(
    dfA = hk_casualties,
    dfB = hk_casualties_new
  )

compare_cas %>%
  saveReport(
    paste("Comparison of hk_casualties old and new", wpa::tstamp())
    )

# hk_vehicles
compare_veh <-
  dataCompareR::rCompare(
    dfA = hk_vehicles,
    dfB = hk_vehicles_new
  )

compare_veh %>%
  saveReport(
    paste("Comparison of hk_vehicles old and new", wpa::tstamp())
  )

# data clean --------------------------------------------------------------
# this chunk does two things:
# 1. join new variables up to the old datasets
# 2. "code" new variables as factor / categorical variables

look_up <- function(x,
                    dictionary,
                    index = "Code",
                    match = "Description"){
  dictionary[[match]][c(match(x, dictionary[[index]]))]
}

# look_up(iris$Species,
#         c("virginica", "versicolor", "setosa"),
#         replace = c("tum", "de", "da"))

hk_accidents_new_cleaned_unlabelled <-
  hk_accidents %>%
  left_join(
    # New columns
    select(
      hk_accidents_new,
      
      # variables for joining ---------------------------------------
      Date, # For joining
      Serial_No_, # For joining
      Year, # For joining
      
      # FIXME: New variables with code frames missing - what are these?
      Pedal_cycl, 
      Pedal_col, 
      TypeOfCo_P, 
      Struc_Type, 
      RD_Class_L, 
      
      # New variables with code frames identified -------------------
      
      Street_nam, # Street name, need to be joined with A1
      Within_70m, # Whether within 70m of junction
      Second_str, # Second street name, need to be joined with A1
      Road_type_, # E.g. One way, Two way, Dual carriageway
      Road_class, # Slightly different from `Road_Classification`
      Overtaking, # E.g. One vehicle, two or more, etc. 
      ),
    by = c("Date", "Serial_No_", "Year")
  )
  
# labelling ----------------------------------------------------------------

hk_accidents_new_cleaned_labelled <-
  hk_accidents_new_cleaned_unlabelled %>%
  left_join(
    rename(t_A1_street_name, Street_Name = "Description"),
    by = c("Street_nam" = "Code")) %>%
  mutate(Street_Name = look_up(Street_nam, dictionary = t_A1_street_name)) %>%
  mutate(Within_70m = look_up(Within_70m, dictionary =  t_Within_70m)) %>%
  
  # Remove old names
  select( 
    -Street_nam
  )

# interactive tests

table(hk_accidents$Road_Classification)
table(hk_accidents_new$RD_Class_L)
table(hk_accidents_new$Road_class) # Slightly different from Road Classification
table(hk_accidents_new$Road_type_) # Road type - completely new

table(hk_vehicles_new$Main_vehic) # Main vehicle manouvre
table(hk_vehicles_new$Vehicle_co) # Vehicle collision with
table(hk_vehicles_new$First_poin) # First point of impact

table(hk_casualties_new$Pedestri_1) # Pedestrian location
table(hk_casualties_new$Pedestri_2) # Pedestrian special circumstances
table(hk_casualties_new$Pedestrian) # Pedestrian action
table(hk_casualties_new$X_Pedestri) # ???
Hong-Kong-Districts-Info/hkdatasets documentation built on Sept. 8, 2021, 11:34 p.m.