This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(tidyverse)
library(data.table)
p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = wisdotcrashdatabase::get_list_of_years("17", "21"), columns = c("DRUGSUSP", "ALCSUSP", "DRVRFLAG","STATNM", "UNITPRSN", "DISTACT", "DRVRDS", "SFTYEQP"))
p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = wisdotcrashdatabase::get_list_of_years("16", "18"), columns = c("DRUGSUSP", "ALCSUSP", "DRVRFLAG","STATNM", "UNITPRSN", "DISTACT", "DRVRDS"))
c = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = wisdotcrashdatabase::get_list_of_years("16", "18"), columns = c("DRUGFLAG", "ALCFLAG"))
#%>% wisdotcrashdatabase::county_rename()
p |> get_alc_drug_impaired_person()
p %>% wisdotcrashdatabase::relabel_person_variables()
select_bike_ped_persons(p) |> dplyr::count(WISINJ, year)
p %>% class()
DT = data.table::data.table(y = c(1,2,3), WISINJ = c("Fatal Injury", "Fatal Injury", "Fatal Injury")) #%>% wisdotcrashdatabase::relabel_person_variables()

DT[ , list(wisdotcrashdatabase::relabel_person_variables(DT),wisdotcrashdatabase::relabel_person_variables(DT)) ]
devtools::load_all()
p = import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = "21", columns = c("DRUGSUSP", "ALCSUSP", "STATNM"), filter_by = "county:Dane")
p = get_driver_flags(p, c( "impaired"))

find_person_flag(p, "impaired_flag")
# for new db
alc_drg_statutes = "^346.63\\(" # Only the new db uses "(" after statute number
driver = p[DRVRFLAG == "Y"]
suspected_yes = c("^101$|^Yes$|^Y$")
# TODO match for old db - use all persons??
p_stat =
  p |>
  # for str_detect, add "Y" so we get flags under alcohol_flag
  dplyr::select(.data[["CRSHNMBR"]], .data[["ALCSUSP"]], .data[["DRUGSUSP"]], .data[["UNITNMBR"]], year,
                dplyr::starts_with("STATNM")) |>
  dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0(
    alc_drg_statutes,"|", suspected_yes
  )))) |>
  dplyr::mutate(impaired_flag = "Y") |>
  dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["impaired_flag"]], year)



driver |> get_impaired_driver() |> distinct(CRSHNMBR, .keep_all = T) |> count(year)
p_stat |> distinct(CRSHNMBR, .keep_all = T) |> count(year) # MATCHES CM!!

p[year == "2016" & (ALCSUSP == "Y" | DRUGSUSP == "Y")][!duplicated(CRSHNMBR)]#[,.N, by = ROLE]
c[yearget_impaired_driver == "2016" & (ALCSUSP == "Y" | DRUGSUSP == "Y")]
driver$STATNMB1 |> unique()

# 2016: 6240 in CM. 5672 for here
# 2017: 7573

# distracted -MATCHES CM!!
wisdotcrashdatabase::get_driver_flags(driver, flags = "distracted") |> filter(distracted_flag == "Y") |> distinct(CRSHNMBR, .keep_all = T) |> count(year)
p$SFTYEQP |> unique()

# MATCH CM
p[SFTYEQP %in% c("None Used - Vehicle Occupant")][!duplicated(CRSHNMBR), .N, by = year]

# OLD?? 
p[SFTYEQP %in% c("NONE USED-VEHICLE OCCUPANT")][!duplicated(CRSHNMBR), .N, by = year]

 p |>
  # for str_detect, add "Y" so we get flags under alcohol_flag
  dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["SFTYEQP"]], year) |>
  dplyr::filter(.data[["SFTYEQP"]] %in% c("None Used - Vehicle Occupant", "NONE USED-VEHICLE OCCUPANT")) |> 
  dplyr::mutate(seatbelt_flag = "Y") |>
  dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["seatbelt_flag"]], year) |>  distinct(CRSHNMBR, .keep_all = T) |> count(year)
library(tidyverse)
library(data.table)
p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = "18", columns = c("DRUGSUSP", "AGE", "ALCSUSP", "DRVRFLAG","DRVRPC", "STATNM", "DISTACT", "DRVRDS", "SFTYEQP"))
dr = get_driver_flags(p, c("distracted","speed","teen","older", "impaired"))
dr[distracted_flag == "Y"][!duplicated(CRSHNMBR), .N]
get_seatbelt_flag_by_unit(p) |> filter(seatbelt_flag == "Y") |> distinct(CRSHNMBR, .keep_all = T) |> count(year)

# CM in 2018:
SB = 11364
dist = 22916
speed = 20061
teen = 20076
older = 23465
imp = 7605
# TODO Just added .data[["UNITNMBR"]]
alc_statutes = "^346.63\\(|^346.63 |OPERATING WHILE INTOXICATED|OPERATING WHILE UNDER INFLUENCE" # except 1 am
drug_statutes = "346.63\\(1\\)\\(am\\)|346.63\\(1\\)\\(AM\\)|346.63 1  am|346.63 1 am|346.63 1  AM|346.63 1 AM|346.63 1AM"

# Make column to make an id for each unique person in a crash. For old db.
if(is.null(p[["UNITPRSN"]])){
 p = p[ , .data[["UNITPRSN"]] := NA]
}

# for old db, trying to join p_stat back to p, but seems like it's not joining right. Missing crashes with stat and no alcsusp. ??
p = p |> dplyr::mutate(UNITPRSN = ifelse(is.na(.data[["UNITPRSN"]]), paste0(.data[["UNITNMBR"]], .data[["SEATLOC"]]), .data[["UNITPRSN"]]))
# p[year == 2010, STATNMB1] |> unique() |> sort()
p_join[grepl(suspected_yes, ALCSUSP) & DRVRFLAG == "Y"][!duplicated(CRSHNMBR)][, .N, by = year]
driver = p[DRVRFLAG == "Y"]
suspected_yes = c("^101$|^Yes$|^Y$")
suspected_no = c("^102$|^No$|^N$")
# p =  p[DRVRFLAG == "Y" & year == 2017]
# p[CRSHNMBR == "100100004"]
driver |> dplyr::count(CRSHNMBR, UNITPRSN, UNITNMBR) |> dplyr::arrange(-n) # n should = 1
p[CRSHNMBR == "170112409"]

# Get all sus alcohol by ALCSUSP or STATNM
p_stat =
  p |>
  # for str_detect, add "Y" so we get flags under alcohol_flag
  dplyr::select(.data[["CRSHNMBR"]], .data[["UNITPRSN"]], .data[["ALCSUSP"]], .data[["UNITNMBR"]],
                dplyr::starts_with("STATNM")) |>
  dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse(
    stringr::str_detect(., drug_statutes), NA, .
  ))) |>
  dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0(
    alc_statutes, "|", suspected_yes
  )))) |>
  dplyr::mutate(alcohol_flag = "Y") |>
  dplyr::select(.data[["UNITPRSN"]], .data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["alcohol_flag"]])

p_join = dplyr::full_join(p, p_stat, c("CRSHNMBR", "UNITPRSN", "UNITNMBR")) #by = c(.data[["CRSHNMBR"]], .data[["UNITNMBR"]]))
p_join |>
  dplyr::mutate(alcohol_flag = dplyr::case_when(
    alcohol_flag == "Y" ~ "Y", # ??
    grepl(suspected_no, ALCSUSP) ~ "N",
    # !is.na(alcohol_flag) ~ alcohol_flag,
    TRUE ~ "U"
  )) |> #select(ALCSUSP, STATNMB1, STATNMB2, alcohol_flag)# |> filter(ALCSUSP != alcohol_flag)
  dplyr::filter(alcohol_flag == "Y", DRVRFLAG == "Y") |> dplyr::distinct(CRSHNMBR, .keep_all = TRUE) |> dplyr::count(year)
# 24514/ 28627

# Not ALC SUSP but has a OWI citation.
p_join[!grepl(suspected_yes, ALCSUSP) & DRVRFLAG == "Y" & alcohol_flag == "Y"][, .N, by = DRUGSUSP]

# 4,618

# Get all sus drug by DRUGSUSP or STATNM
d_stat =
  p |>
  # for str_detect, add "Y" so we get flags under alcohol_flag
  dplyr::select(.data[["CRSHNMBR"]], .data[["UNITPRSN"]], .data[["DRUGSUSP"]], .data[["UNITNMBR"]],
                dplyr::starts_with("STATNM")) |>
  dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0(
    drug_statutes,"|", suspected_yes
  )))) |>
  dplyr::mutate(drug_flag = "Y") |>
  dplyr::select(.data[["UNITPRSN"]], .data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["drug_flag"]])


d_join = dplyr::full_join(p, d_stat, c("CRSHNMBR", "UNITPRSN", "UNITNMBR")) #by = c(.data[["CRSHNMBR"]], .data[["UNITNMBR"]]))
d_join |>
  dplyr::mutate(drug_flag = dplyr::case_when(
    drug_flag == "Y" ~ "Y",
    grepl(suspected_no, DRUGSUSP) ~ "N",
    # !is.na(drug_flag) ~ drug_flag,
    TRUE ~ "U"
  )) |> dplyr::filter(drug_flag == "Y", DRVRFLAG == "Y") |> dplyr::distinct(CRSHNMBR, .keep_all = TRUE) |> dplyr::count(year)

7467 / 7476
p[ , .N, by = .(DRVRFLAG, ROLE)]
p2[ , .N, by = .(DRVRFLAG, ROLE)]
v = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "vehicle", years = wisdotcrashdatabase::get_list_of_years("17", "17"), columns = c("TKBSCFG"))

relabel_vehicle_variables <-
  function(vehicle_df, relabel_by = "vehtype") {
    if (relabel_by %in% "vehtype") {
      vehicle_df = vehicle_df |> dplyr::mutate(
        veh =
          dplyr::case_when(
            .data[["VEHTYPE"]] %in% c(
              "Passenger Car",
              "(Sport) Utility Vehicle",
              "Cargo Van (10,000 Lbs or Less)",
              "Passenger Van"
            ) ~ "Passenger Vehicle",
            .data[["VEHTYPE"]] == "Utility Truck/Pickup Truck" ~ "Light Trucks",
            .data[["VEHTYPE"]] %in% c(
              "Straight Truck",
              "Truck Tractor (Trailer Not Attached)",
              "Truck Tractor (Trailer Attached)",
              "Truck Tractor (More Than One Trailer)"
            ) ~ "Large Trucks",
            # TKBSCFG != "" ~ "CMV",
            .data[["VEHTYPE"]] %in% c(
              "ATV/UTV (Utility Terrain Vehicle)",
              "UTV (Utility Terrain Vehicle)",
              "ATV"
            ) ~ "ATV/UTV",
            .data[["VEHTYPE"]] == "Bicycle" ~ "Bicycle",
            .data[["VEHTYPE"]] %in% c("Moped", "Motorcycle") ~ "Moped/Motorcycle",
            grepl("Emergency|EMERGENCY", .data[["VEHTYPE"]]) ~ "Emergency Vehicle",
            grepl("Bus", .data[["VEHTYPE"]]) ~ "School/Passenger Bus",
            # .data[["VEHTYPE"]] %in% c("POLICE ON EMERGENCY") ~ "Emergency Vehicle"
            # TRUE ~ .data[["VEHTYPE"]]
            .data[["VEHTYPE"]] == .data[["VEHTYPE"]] ~ "Other"
          )
      )
    }
    if (relabel_by %in% "cmv") {
      vehicle_df = dplyr::mutate(vehicle_df, cmv = ifelse(.data[["TKBSCFG"]] != "", "Y", "N"))
    }
    vehicle_df
  }

relabel_vehicle_variables(v, "cmv")
years = "18"
p = import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", columns = c("OLICTYPE"), years = c("16", "17"))
v = import_db_data("C:/data/crashes_duck.duckdb", db_type = "vehicle", years = years)
m = get_motorcycle_persons(p,v)

m |> count(CRSHNMBR)

c = import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", columns = c("CYCLFLAG", "CRSHTYPE", "MNRCOLL"), years = years)

get_lanedeparture_crashes(c)
anti_join( m[!duplicated(CRSHNMBR)], c[CYCLFLAG == "Y"], by = "CRSHNMBR")
anti_join( c[CYCLFLAG == "Y"], m[!duplicated(CRSHNMBR)], by = "CRSHNMBR") # involve parked MC?
p$UNITTYPE |> unique()

p[UNITTYPE %in% c("Bicycle", "BICYCLE")][!duplicated(CRSHNMBR)][, .N, by = year]

p[UNITTYPE %in% c("Pedestrian", "PEDESTRIAN")][!duplicated(CRSHNMBR)][, .N, by = year]

select_bike_ped_persons <-
  function(person_df,
           ped = "Y",
           bike = "Y") {
    if (ped == "Y") {
      ped_select = c("Pedestrian", "PEDESTRIAN")
    } else {
      ped_select = c()
    }
    if (bike == "Y") {
      bike_select =  c("Bicycle", "BICYCLE")
    } else {
      bike_select = c()
    }
    selection = dplyr::filter(person_df, .data[["UNITTYPE"]] %in% c(ped_select, bike_select))
    # selection = person_df[.data[["UNITTYPE"]] %in% c(ped_select, bike_select)]
    selection |> dplyr::mutate(
      UNITTYPE = ifelse(.data[["UNITTYPE"]] == "BICYCLE", "Bicycle", UNITTYPE),
      UNITTYPE = ifelse(UNITTYPE == "PEDESTRIAN", "Pedestrian", .data[["UNITTYPE"]])
    )
  }

select_bike_ped_persons(p, ped = "Y")

p[.data[["UNITTYPE"]] %in% c("Pedestrian")]
x |> count(UNITTYPE)
x = import_db_data( "C:/CSV/csv_from_sas/fst/", db_type = "vehicle", columns = c("OWNRTYP", "OWNRORG"), years = "17")
x = import_db_data(filepath =  "C:/CSV/csv_from_sas/fst/", db_type = "crash", columns = c("ANMLTY", "OWNRORG", "TOTFATL", "TOTINJ"), years = "17")
aggregate_crashes_by_crshsvr_tot_inj(x)

file_name = "C:/CSV/csv_from_sas/fst/17crash.fst"
colsToKeep =  c("ANMLTY") #, "OWNRORG")

read_cols <- function(file_name, colsToKeep) {
  header <- fst::read_fst(file_name, to = 1)
  if (is.null(colsToKeep)) {
    return(colnames(header))
  } else {
    colsToKeep2 <-
      union(
        c(
          "CRSHDATE",
          "CNTYCODE",
          "CRSHSVR",
          "UNITNMBR",
          "ROLE",
          "VEHTYPE",
          "WISINJ"
        ),
        colsToKeep
      ) # Tack these on

    columns_not_in_db <-
      subset(colsToKeep, !colsToKeep %in% colnames(header))
    if (length(columns_not_in_db) != 0) warning(paste0(paste0(columns_not_in_db, collapse = ", "), ' does not exist'))
    # Returns only columns found in the df
    return(subset(colsToKeep2, colsToKeep2 %in% colnames(header)))
  }
}




 if( columns_not_in_db != "" ) warning(paste0(columns_not_in_db, ' does not exist'))

import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "DRVRPC"), db_type = "person", years = "17")

read_cols(file_name =  "C:/CSV/csv_from_sas/fst/17person.fst", colsToKeep = c("CRSHTIME", "DRVRPC", "dd", "ANMLTY"))
read_fst_for_new_db("C:/CSV/csv_from_sas/fst/17person.fst", c("CRSHTIME", "DRVRPC", "as"))

col_to_select =  c("CRSHTIME", "DRVRPC", "dd", "ANMLTY")
col_to_select = "all"
if (col_to_select[1] != "all") {
  # Get all names for data that may be in multiple column
  columns_with_multiples <-
    subset(col_to_select,
           grepl(# Columns that have multiples
             db_columns_with_multiples,
             # "WTCOND|RDCOND|ENVPC|RDWYPC|ADDTL|CLSRSN|ANMLTY|DMGAR|VEHPC|HAZPLAC|HAZNMBR|HAZCLSS|HAZNAME|HAZFLAG|DRVRDS|DRUGYT|DRVRRS|DRVRPC|DNMFTR|STATNM|NMTACT|NMTSFQ|PROTGR|CITISS|CITNM|STATDS|STATSV|RSTRCT|CITNM",
             col_to_select))

  # If columns with multiples were found, make a list of all values, tack them onto the end. i.e. DRVRPC01, DRVRPC02, etc.
  if (length(columns_with_multiples) != 0) {
    col_to_select2 <-          get_list_of_multiple_columns()
  }
  else {
    col_to_select2 <-            union(c("CRSHNMBR"), col_to_select)
  }
  # col_to_select2
}

# read_fst_for_new_db(file_to_read = "C:/CSV/csv_from_sas/fst/17person.fst", col_to_select = c("CRSHTIME", "DRVRPC"))

# # Returns list of XX01:xx20 for columns with multiple (i.e. DRVRPC)
# get_list_of_multiple_columns <-
#   function(mult_col = columns_with_multiples, col_sel = col_to_select) {
#     get_all_names <-
#       sapply(mult_col,
#              paste0,
#              formatC(seq(1, 20), width = 2, flag = "0")) %>% as.character()
#     Reduce(union,
#            list(c("CRSHNMBR"),
#                 col_sel,
#                 get_all_names))
#     
#   }
# 
# get_list_of_multiple_columns()

p = import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME"), db_type = "person", years = "21")
p %>% filter(WISINJ == "Fatal Injury") %>% arrange(CRSHDATE)
y =import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "dd"), db_type = "crash", years = "17")
a = import_db_data( "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = c("17", "18"))

a$year %>% unique()
z = y %>% mutate(year = factor(year, levels = sort(unique(y$year))))
y %>% mutate(year = factor(.data$year, levels = sort(unique(.data$year))))
y %>% get_crash_times(column_name = "tes")
z=z %>% wisdotcrashdatabase::get_crash_times(combine_with_old = TRUE)
y %>% mutate(crash_time = convert_time_to_hms(CRSHTIME))

y %>% convert_time_to_hms()
y = y %>% convert_to_datetime()
y$crsh_datetime
all_years <- wisdotcrashdatabase::get_list_of_years("18", "21") %>% as.integer()

years_old = all_years[which(all_years<17)]
years_new = all_years[which(all_years>=17)]

import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "DRVRPC"), db_type = "crash", years = c("16"))
library(tidyverse)
# library(wisdotcrashdatabase)

y =import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "DRVRPC"), db_type = "person", years = c("16", "17"))

y = y%>% get_crash_times()

y = y %>% convert_time_to_hms()
y %>% convert_to_datetime()
library(tidyverse)
library(data.table)

old = import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("16", "17"), filetype = "fst")

x = import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = c("MUNICODE"), db_type = "crash", years = "19")

narr = import_narrative(filepath = "C:/CSV/csv_from_sas/fst/", years = "19")
library(duckdb)
library(tidyverse)
library(wisdotcrashdatabase)
con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = "C:/data/crashes_duck.duckdb", read_only = TRUE)
duckdb::dbDisconnect(con, shutdown = TRUE)
duckdb::duckdb_shutdown(duckdb::duckdb())

test = import_db_data(db_type = "crash", years = get_list_of_years("17", "17"), filetype = "duckdb", db_loc = "C:/data/crashes_duck.duckdb")

import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years =  c("18", "19"))
import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = c("89", "94"))
import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = c("89", "17"))
import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years =  "00")

narr = import_narrative(filepath = "C:/CSV/csv_from_sas/fst/", years = "19", filetype = "duckdb")
test %>% count(year)
loc = "C:/data/crashes_duck.duckdb"

# RSQLite::dbConnect(RSQLite::SQLite(), dbname = "C:/data/crash_db.sqlite")
# microbenchmark::microbenchmark(times = 10L,
#   import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "person", years = "00", filetype = "fst"), # 9.6
#   import_db_data(db_loc = "C:/data/crash_db.db", db_type = "person", years = "00")) # 10.6

microbenchmark::microbenchmark(times = 10L, # 63% faster
  import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19"), filetype = "fst"), # 7.3 s
  import_db_data("C:/data/crashes_duck.duckdb", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19")) # 2.7
  # import_db_data("C:/data/crashes_db.sqlite", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19")) # sqlite
)

microbenchmark::microbenchmark(times =  10L, # 13% faster
  import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = "all", db_type = "person", years = c("18"), filetype = "fst"), # 8.5 s
  import_db_data("C:/data/crashes_duck.duckdb", columns = "all", db_type = "person", years = c("18"), filetype = "duckdb") # 7.3
)

microbenchmark::microbenchmark(times = 10L, # 37% faster
  import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "fst"), # 13.5
  import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "duckdb") # 8.5
)

microbenchmark::microbenchmark(times = 10L, # 72% faster
  import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("17", "21"), filetype = "fst"), # 6.4
  import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("17", "21"), filetype = "duckdb") # 1.8
)
microbenchmark::microbenchmark(
  times =  10L,
  # import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "duckdb") .7 sec with empty df
    # x %>%
  #       dplyr::filter(
  #         .data$CRSHSVR != 'NON-REPORTABLE',
  #         .data$CRSHLOC == 'INTERSECTION' |
  #           .data$CRSHLOC == 'NON-INTERSECTION'
  #       ), #.7
  # x %>% 
  # dplyr::mutate(
  #       CRSHDATE = lubridate::ymd(.data$CRSHDATE),
  #       # year = lubridate::year(.data$CRSHDATE),
  #       year = factor(.data$year, levels = sort(unique(.data$year)))
  #     ) %>% data.table::as.data.table() # 1.3 sec
)

x =   purrr::map_dfr(
    .x = c("12crash", "13crash", "14crash", "15crash", "16crash"),
    .f = read_duckdb_for_new_old_db,
    filepath = NULL,
    col_to_select = "all",
    con = con,
    filter_by = NULL
  )  # 4.2

4.7 + .7 + 1.3
# DBI::dbReadTable(con, "18person")DBI::dbReadTable(con, "18person")
filter_by = "county:Dane, Fond du Lac"
sub(pattern = "^\\:.", replacement = "\\1", filter_by)
c = strsplit(sub(pattern = "^county:", replacement = "\\1", filter_by), ", ") %>% unlist()

crash <- crash %>% wisdotcrashdatabase::county_rename(combine_with_old = TRUE)
crash %>% filter(countyname %in% "Fond du Lac")
data_years = paste(c("17", "18"), "crash", sep = "")
col_to_grab = c("ANMLTY", "CRSHDATE")
cols_to_keep = columns_to_find(col_to_find = col_to_grab)

find_columns_in_all_new_db(data_new = data_years, filepath = "C:/CSV/csv_from_sas/fst/", colsToKeep = cols_to_keep, filetype = "fst", con = con)

columns_to_find <-
  function(col_to_find) {
    # If specific columns were selected, find which match in the database
    if (col_to_find[1] == "all") {
      found_columns <- "all"
    } else {
      # Get all names for data that may be in multiple column
      columns_with_multiples <-
        subset(col_to_find,
               grepl(# Columns that have multiples
                 db_columns_with_multiples,
                 col_to_find))

      # If columns with multiples were found, make a list of all values, tack them onto the end. i.e. DRVRPC01, DRVRPC02, etc.
      if (length(columns_with_multiples) != 0) {
        # This adds the '01' to '20' to the end of each matching column
        get_all_names <-
          sapply(columns_with_multiples,
                 paste0,
                 formatC(seq(1, 20), width = 2, flag = "0")) %>% as.character()

        col_to_find <- Reduce(union,
                                list(c("CRSHNMBR"),
                                     col_to_find,
                                     get_all_names))
      } else {
        col_to_find <-
          union(c("CRSHNMBR"), col_to_find)
      }
    }
    return(col_to_find)
  }

get_colnames_fst <- function(file_to_read){
  header <- fst::read_fst(file_to_read, to = 1)
  colnames(header)
}

get_colnames_duckdb <- function(file_to_read, con){
  header <- dplyr::tbl(con, file_to_read) %>% utils::head(1)
  colnames(header)
}

find_columns_in_all_new_db <-
  function(data_new,
           filepath,
           colsToKeep,
           filetype,
           con = con) {
    if (filetype == "fst") {
      full_file_name = paste(filepath, data_new, ".fst", sep = "")
      all_col_names = lapply(full_file_name, get_colnames_fst) %>% unique(unlist(.)) %>% unlist()
    }

    if (filetype == "duckdb") {
     all_col_names = lapply(data_new, get_colnames_duckdb, con = con) %>% unique(unlist(.)) %>% unlist()
    }

    if (is.null(colsToKeep)) { # WHY THIS??
      return(all_col_names)
    } else {
      colsToKeep2 <-
        union(
          c(
            "CRSHDATE",
            "CNTYCODE",
            "CRSHSVR",
            "UNITNMBR",
            "ROLE",
            "VEHTYPE",
            "WISINJ"
          ),
          colsToKeep
        ) # Tack these on

      columns_not_in_db <-
        subset(colsToKeep, !colsToKeep %in% all_col_names)
      columns_not_in_db = subset(columns_not_in_db,
                                 !grepl('[[:digit:]]+', columns_not_in_db))
      # Returns warning if column not found
      if (length(columns_not_in_db) != 0)
        warning(
          paste0(
            " ",
            paste0(columns_not_in_db, collapse = ", "),
            ' not found in new db (ignore if variable is in multiple cols)'
          ),
          call. = FALSE
        )
      # Returns only columns found in the df
      return(subset(colsToKeep2, colsToKeep2 %in% all_col_names))
    }
  }


jacciz/wisdotcrashdatabase documentation built on June 3, 2023, 2:26 a.m.