.dev/script/new-traffic-data_20210508_accidents.R

# load wrangle packages ---------------------------------------------------
library(tidyverse)
library(readxl)
library(here)
library(hkdatasets)
library(dataCompareR)

# custom functions for hour minute treatment ------------------------------

# to_hm <- function(x){
#   
#   h_str <- as.numeric(substr(x, start = 1, stop = 2))
#   m_str <- as.numeric(substr(x, start = 3, stop = 4))
#   
#   hms::hms(hours = h_str,
#            minutes = m_str)
#   
# }


# load data ---------------------------------------------------------------

hk_accidents_new <-
  read_xlsx(
    here("data-raw",
         "FurtherData_Accident20142019_joined_210201.xlsx"))

# labels for matching up to new data
# read each individual data frame to global environment
code_sheets_path <-
  here("data-raw",
       "reformatted_Code table_v1.xlsx")

code_sheets <- excel_sheets(code_sheets_path)

code_sheets %>%
  purrr::map(function(sheet){ # iterate through each sheet name
    assign(x = paste0("t_", sheet), # prefixed variable names
           value = readxl::read_xlsx(path = code_sheets_path, sheet = sheet),
           envir = .GlobalEnv)
  })

# accidents check ---------------------------------------------------------

# hk_accidents_new %>% glimpse()
# hkdatasets::hk_accidents %>% glimpse()
# 
# setdiff(names(hk_accidents_new), names(hkdatasets::hk_accidents))
# setdiff(names(hkdatasets::hk_accidents), names(hk_accidents_new))

# dataCompareR ------------------------------------------------------------

# # hk_accidents
# compare_acc <-
#   dataCompareR::rCompare(
#     dfA = hk_accidents,
#     dfB = hk_accidents_new
#   )
# 
# compare_acc %>%
#   saveReport(
#     paste("Comparison of hk_accidents old and new", wpa::tstamp())
#   )
# 

# data clean --------------------------------------------------------------
# this chunk does two things:
# 1. join new variables up to the old datasets
# 2. "code" new variables as factor / categorical variables

look_up <- function(x,
                    dictionary,
                    index = "Code",
                    match = "Description"){
  dictionary[[match]][c(match(x, dictionary[[index]]))]
}

# look_up(iris$Species,
#         c("virginica", "versicolor", "setosa"),
#         replace = c("tum", "de", "da"))

# hk_accidents ------------------------------------------------------------

hk_accidents_new_cleaned_unlabelled <-
  hk_accidents %>%
  left_join(
    # New columns
    select(
      hk_accidents_new,
      
      # variables for joining ---------------------------------------
      Date, # For joining
      Serial_No_, # For joining
      Year, # For joining
      
      # New variables with code frames identified --------------------
      Street_nam, # Street name, need to be joined with A1_street_name
      Second_str, # Second street name, need to be joined with A1_street_name
      Overtaking, # E.g. One vehicle, two or more, etc.
      Within_70m, # Whether within 70m of junction
      Road_type_, # E.g. One way, Two way, Dual carriageway
      Pedal_cycl, # Others, Pedal Cycle, Motorcycle
      TypeOfCo_P, # E.g. Vehicle collision with Pedestrian, etc.
      Struc_Type, # E.g. At grade road, Flyover, Bridge, etc.
      RD_Class_L, # E.g. Expressway, Main Road, Secondary Road, etc.
      Road_class, # Slightly different from `Road_Classification`
      
      # New variables that do not require code frame -----------------
      Precise_lo, # Precise location. Strength.
      Accident_a, # No/ Yes
      Junction_t, # E.g. Cross roads, T-junction, Y-junction, Multiple, etc.
      Whether_at, # E.g. No crossing control, On a crossing control, etc.
      Type_of_Cr, # E.g. Footbridge/Subway, Traffic signal, Zebra, etc.
      
    ),
    by = c("Date", "Serial_No_", "Year")
  )
  
# labelling ----------------------------------------------------------------

hk_accidents_new_cleaned_labelled <-
  hk_accidents_new_cleaned_unlabelled %>%
  mutate(
    Street_Name = look_up(Street_nam, dictionary = t_A1_street_name),
    Overtaking = look_up(Overtaking, dictionary = t_Overtaking),
    Within_70m = look_up(Within_70m, dictionary =  t_Within_70m),
    Road_Type = look_up(Road_type_, dictionary = t_Road_type_),
    Cycle_Type = look_up(Pedal_cycl, dictionary = t_Ped_Cycle),
    Type_of_Collision_v2 = look_up(TypeOfCo_P, dictionary = t_Collision_Type),
    Structure_Type = look_up(Struc_Type, dictionary = t_Struc_Type),
    Road_Class_L = look_up(RD_Class_L, dictionary = t_Road_class_L),
    Road_Classification_v2 = look_up(Road_class, dictionary = t_Road_class)
  ) %>% 
  
  # Remove old names ------------------------------------------------------
  select( 
    -Street_nam,
    -Second_str,
    -Road_type_,
    -Pedal_cycl,
    -TypeOfCo_P,
    -Struc_Type,
    -RD_Class_L,
    -Road_class,
    -Overtaking # not used
  ) %>%
  
  # Rename new variables --------------------------------------------------
  rename(
    Precise_Location = "Precise_lo",
    Accident = "Accident_a",
    Junction_Type = "Junction_t",
    Crossing_Control = "Whether_at",
    Crossing_Type = "Type_of_Cr",
    Type_of_Collision_with_cycle = "Type_of_Collision_v2",
    Road_Hierarchy = "Road_Class_L"
    ) %>%

  # Rename rows with value "n.a." to "NA" ---------------------------------
  mutate(
    Junction_Type = ifelse(Junction_Type == "n.a.", NA, Junction_Type),
    Crossing_Type = ifelse(Crossing_Type == "n.a.", NA, Crossing_Type),
    Road_Hierarchy = ifelse(Road_Hierarchy == "N.A.", NA, Road_Hierarchy),
    Structure_Type = ifelse(Structure_Type == "N.A. (missing data or outliner)", NA, Structure_Type)
  ) %>%
  
  # Replace old Road Classification with new ------------------------------
  select(-Road_Classification) %>%
  rename(Road_Classification = "Road_Classification_v2") %>%
  rename(Road_Ownership = "Road_Classification") %>%
  
  # To logical where binary -----------------------------------------------
  mutate(
    Accident = ifelse(Accident == "Yes", TRUE, FALSE),
    Within_70m = ifelse(Within_70m == "Yes", TRUE, FALSE),
    Hit_and_Run = ifelse(Hit_and_Run == "Yes", TRUE, FALSE)
    ) %>%
  
  # turn time to date/time instead of string -----------------------------
  mutate(Date_Time = as.POSIXct(
    strptime(
      paste0(Date, " ", Time),
      format = "%Y-%m-%d %H%M",
      tz = "Asia/Hong_Kong"
    )
  )) %>%
  # Remove redundant columns
  select(
    -Date,
    -Time
    ) %>%
  # Reorder
  select(
    Date_Time,
    everything()
  ) %>%
  
  # Final variable name cleaning ------------------------------------------
  rename(No_of_Vehicles_Involved = "No__of_Vehicles_Involved",
         No_of_Casualties_Injured = "No__of_Casualties_Injured")
  
# interactive tests -------------------------------------------------------

table(hk_accidents$Road_Classification)
table(hk_accidents_new$RD_Class_L)
table(hk_accidents_new$Road_class) # Slightly different from Road Classification
table(hk_accidents_new$Road_type_) # Road type - completely new

table(hk_accidents_new_cleaned_labelled$Structure_Type)

table(hk_accidents_new_cleaned_labelled$Accident) # Boolean
table(hk_accidents_new_cleaned_labelled$Within_70m) # Boolean
table(hk_accidents_new_cleaned_labelled$Hit_and_Run) # Boolean

table(hk_accidents_new_cleaned_labelled$Overtaking) # All blanks
hk_accidents_new_cleaned_labelled$Date_Time



# TODO: Check for missing values for all string variables
explore_vars_accidents <-
  hk_accidents_new_cleaned_labelled %>%
  select(where(is_character)) %>%
  purrr::map(~unique(.))

explore_vars_accidents %>%
  purrr::map(~str_extract(., "n\\.a"))


# Additional cleaning 17 May 2021 ------------------------------------------

## Overwrite dataset for hk_accidents 
hk_accidents_new_cleaned_labelled %>% glimpse()
hk_accidents <- hk_accidents_new_cleaned_labelled # overwrite

usethis::use_data(
  hk_accidents,
  # internal = TRUE, # Lazy loading
  internal = FALSE,
  overwrite = TRUE
  )

hk_accidents %>%
  write_csv(
    here::here(
      "data-ready",
      "hk_accidents.csv"
    )
  )

hk_accidents %>%
  fst::write_fst(
    here::here(
      "data-ready",
      "hk_accidents.fst"
    )
  )
Hong-Kong-Districts-Info/hkdatasets documentation built on Sept. 8, 2021, 11:34 p.m.