This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(tidyverse) library(data.table) p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = wisdotcrashdatabase::get_list_of_years("17", "21"), columns = c("DRUGSUSP", "ALCSUSP", "DRVRFLAG","STATNM", "UNITPRSN", "DISTACT", "DRVRDS", "SFTYEQP")) p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = wisdotcrashdatabase::get_list_of_years("16", "18"), columns = c("DRUGSUSP", "ALCSUSP", "DRVRFLAG","STATNM", "UNITPRSN", "DISTACT", "DRVRDS")) c = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = wisdotcrashdatabase::get_list_of_years("16", "18"), columns = c("DRUGFLAG", "ALCFLAG")) #%>% wisdotcrashdatabase::county_rename() p |> get_alc_drug_impaired_person() p %>% wisdotcrashdatabase::relabel_person_variables() select_bike_ped_persons(p) |> dplyr::count(WISINJ, year) p %>% class() DT = data.table::data.table(y = c(1,2,3), WISINJ = c("Fatal Injury", "Fatal Injury", "Fatal Injury")) #%>% wisdotcrashdatabase::relabel_person_variables() DT[ , list(wisdotcrashdatabase::relabel_person_variables(DT),wisdotcrashdatabase::relabel_person_variables(DT)) ]
devtools::load_all() p = import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = "21", columns = c("DRUGSUSP", "ALCSUSP", "STATNM"), filter_by = "county:Dane") p = get_driver_flags(p, c( "impaired")) find_person_flag(p, "impaired_flag")
# for new db alc_drg_statutes = "^346.63\\(" # Only the new db uses "(" after statute number driver = p[DRVRFLAG == "Y"] suspected_yes = c("^101$|^Yes$|^Y$") # TODO match for old db - use all persons?? p_stat = p |> # for str_detect, add "Y" so we get flags under alcohol_flag dplyr::select(.data[["CRSHNMBR"]], .data[["ALCSUSP"]], .data[["DRUGSUSP"]], .data[["UNITNMBR"]], year, dplyr::starts_with("STATNM")) |> dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0( alc_drg_statutes,"|", suspected_yes )))) |> dplyr::mutate(impaired_flag = "Y") |> dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["impaired_flag"]], year) driver |> get_impaired_driver() |> distinct(CRSHNMBR, .keep_all = T) |> count(year) p_stat |> distinct(CRSHNMBR, .keep_all = T) |> count(year) # MATCHES CM!! p[year == "2016" & (ALCSUSP == "Y" | DRUGSUSP == "Y")][!duplicated(CRSHNMBR)]#[,.N, by = ROLE] c[yearget_impaired_driver == "2016" & (ALCSUSP == "Y" | DRUGSUSP == "Y")] driver$STATNMB1 |> unique() # 2016: 6240 in CM. 5672 for here # 2017: 7573 # distracted -MATCHES CM!! wisdotcrashdatabase::get_driver_flags(driver, flags = "distracted") |> filter(distracted_flag == "Y") |> distinct(CRSHNMBR, .keep_all = T) |> count(year)
p$SFTYEQP |> unique() # MATCH CM p[SFTYEQP %in% c("None Used - Vehicle Occupant")][!duplicated(CRSHNMBR), .N, by = year] # OLD?? p[SFTYEQP %in% c("NONE USED-VEHICLE OCCUPANT")][!duplicated(CRSHNMBR), .N, by = year] p |> # for str_detect, add "Y" so we get flags under alcohol_flag dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["SFTYEQP"]], year) |> dplyr::filter(.data[["SFTYEQP"]] %in% c("None Used - Vehicle Occupant", "NONE USED-VEHICLE OCCUPANT")) |> dplyr::mutate(seatbelt_flag = "Y") |> dplyr::select(.data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["seatbelt_flag"]], year) |> distinct(CRSHNMBR, .keep_all = T) |> count(year)
library(tidyverse) library(data.table) p = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", years = "18", columns = c("DRUGSUSP", "AGE", "ALCSUSP", "DRVRFLAG","DRVRPC", "STATNM", "DISTACT", "DRVRDS", "SFTYEQP")) dr = get_driver_flags(p, c("distracted","speed","teen","older", "impaired")) dr[distracted_flag == "Y"][!duplicated(CRSHNMBR), .N] get_seatbelt_flag_by_unit(p) |> filter(seatbelt_flag == "Y") |> distinct(CRSHNMBR, .keep_all = T) |> count(year) # CM in 2018: SB = 11364 dist = 22916 speed = 20061 teen = 20076 older = 23465 imp = 7605
# TODO Just added .data[["UNITNMBR"]] alc_statutes = "^346.63\\(|^346.63 |OPERATING WHILE INTOXICATED|OPERATING WHILE UNDER INFLUENCE" # except 1 am drug_statutes = "346.63\\(1\\)\\(am\\)|346.63\\(1\\)\\(AM\\)|346.63 1 am|346.63 1 am|346.63 1 AM|346.63 1 AM|346.63 1AM" # Make column to make an id for each unique person in a crash. For old db. if(is.null(p[["UNITPRSN"]])){ p = p[ , .data[["UNITPRSN"]] := NA] } # for old db, trying to join p_stat back to p, but seems like it's not joining right. Missing crashes with stat and no alcsusp. ?? p = p |> dplyr::mutate(UNITPRSN = ifelse(is.na(.data[["UNITPRSN"]]), paste0(.data[["UNITNMBR"]], .data[["SEATLOC"]]), .data[["UNITPRSN"]])) # p[year == 2010, STATNMB1] |> unique() |> sort() p_join[grepl(suspected_yes, ALCSUSP) & DRVRFLAG == "Y"][!duplicated(CRSHNMBR)][, .N, by = year] driver = p[DRVRFLAG == "Y"] suspected_yes = c("^101$|^Yes$|^Y$") suspected_no = c("^102$|^No$|^N$") # p = p[DRVRFLAG == "Y" & year == 2017] # p[CRSHNMBR == "100100004"] driver |> dplyr::count(CRSHNMBR, UNITPRSN, UNITNMBR) |> dplyr::arrange(-n) # n should = 1 p[CRSHNMBR == "170112409"] # Get all sus alcohol by ALCSUSP or STATNM p_stat = p |> # for str_detect, add "Y" so we get flags under alcohol_flag dplyr::select(.data[["CRSHNMBR"]], .data[["UNITPRSN"]], .data[["ALCSUSP"]], .data[["UNITNMBR"]], dplyr::starts_with("STATNM")) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse( stringr::str_detect(., drug_statutes), NA, . ))) |> dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0( alc_statutes, "|", suspected_yes )))) |> dplyr::mutate(alcohol_flag = "Y") |> dplyr::select(.data[["UNITPRSN"]], .data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["alcohol_flag"]]) p_join = dplyr::full_join(p, p_stat, c("CRSHNMBR", "UNITPRSN", "UNITNMBR")) #by = c(.data[["CRSHNMBR"]], .data[["UNITNMBR"]])) p_join |> dplyr::mutate(alcohol_flag = dplyr::case_when( alcohol_flag == "Y" ~ "Y", # ?? grepl(suspected_no, ALCSUSP) ~ "N", # !is.na(alcohol_flag) ~ alcohol_flag, TRUE ~ "U" )) |> #select(ALCSUSP, STATNMB1, STATNMB2, alcohol_flag)# |> filter(ALCSUSP != alcohol_flag) dplyr::filter(alcohol_flag == "Y", DRVRFLAG == "Y") |> dplyr::distinct(CRSHNMBR, .keep_all = TRUE) |> dplyr::count(year) # 24514/ 28627 # Not ALC SUSP but has a OWI citation. p_join[!grepl(suspected_yes, ALCSUSP) & DRVRFLAG == "Y" & alcohol_flag == "Y"][, .N, by = DRUGSUSP] # 4,618 # Get all sus drug by DRUGSUSP or STATNM d_stat = p |> # for str_detect, add "Y" so we get flags under alcohol_flag dplyr::select(.data[["CRSHNMBR"]], .data[["UNITPRSN"]], .data[["DRUGSUSP"]], .data[["UNITNMBR"]], dplyr::starts_with("STATNM")) |> dplyr::filter_all(dplyr::any_vars(stringr::str_detect(., paste0( drug_statutes,"|", suspected_yes )))) |> dplyr::mutate(drug_flag = "Y") |> dplyr::select(.data[["UNITPRSN"]], .data[["CRSHNMBR"]], .data[["UNITNMBR"]], .data[["drug_flag"]]) d_join = dplyr::full_join(p, d_stat, c("CRSHNMBR", "UNITPRSN", "UNITNMBR")) #by = c(.data[["CRSHNMBR"]], .data[["UNITNMBR"]])) d_join |> dplyr::mutate(drug_flag = dplyr::case_when( drug_flag == "Y" ~ "Y", grepl(suspected_no, DRUGSUSP) ~ "N", # !is.na(drug_flag) ~ drug_flag, TRUE ~ "U" )) |> dplyr::filter(drug_flag == "Y", DRVRFLAG == "Y") |> dplyr::distinct(CRSHNMBR, .keep_all = TRUE) |> dplyr::count(year) 7467 / 7476
p[ , .N, by = .(DRVRFLAG, ROLE)] p2[ , .N, by = .(DRVRFLAG, ROLE)]
v = wisdotcrashdatabase::import_db_data("C:/data/crashes_duck.duckdb", db_type = "vehicle", years = wisdotcrashdatabase::get_list_of_years("17", "17"), columns = c("TKBSCFG")) relabel_vehicle_variables <- function(vehicle_df, relabel_by = "vehtype") { if (relabel_by %in% "vehtype") { vehicle_df = vehicle_df |> dplyr::mutate( veh = dplyr::case_when( .data[["VEHTYPE"]] %in% c( "Passenger Car", "(Sport) Utility Vehicle", "Cargo Van (10,000 Lbs or Less)", "Passenger Van" ) ~ "Passenger Vehicle", .data[["VEHTYPE"]] == "Utility Truck/Pickup Truck" ~ "Light Trucks", .data[["VEHTYPE"]] %in% c( "Straight Truck", "Truck Tractor (Trailer Not Attached)", "Truck Tractor (Trailer Attached)", "Truck Tractor (More Than One Trailer)" ) ~ "Large Trucks", # TKBSCFG != "" ~ "CMV", .data[["VEHTYPE"]] %in% c( "ATV/UTV (Utility Terrain Vehicle)", "UTV (Utility Terrain Vehicle)", "ATV" ) ~ "ATV/UTV", .data[["VEHTYPE"]] == "Bicycle" ~ "Bicycle", .data[["VEHTYPE"]] %in% c("Moped", "Motorcycle") ~ "Moped/Motorcycle", grepl("Emergency|EMERGENCY", .data[["VEHTYPE"]]) ~ "Emergency Vehicle", grepl("Bus", .data[["VEHTYPE"]]) ~ "School/Passenger Bus", # .data[["VEHTYPE"]] %in% c("POLICE ON EMERGENCY") ~ "Emergency Vehicle" # TRUE ~ .data[["VEHTYPE"]] .data[["VEHTYPE"]] == .data[["VEHTYPE"]] ~ "Other" ) ) } if (relabel_by %in% "cmv") { vehicle_df = dplyr::mutate(vehicle_df, cmv = ifelse(.data[["TKBSCFG"]] != "", "Y", "N")) } vehicle_df } relabel_vehicle_variables(v, "cmv")
years = "18" p = import_db_data("C:/data/crashes_duck.duckdb", db_type = "person", columns = c("OLICTYPE"), years = c("16", "17")) v = import_db_data("C:/data/crashes_duck.duckdb", db_type = "vehicle", years = years) m = get_motorcycle_persons(p,v) m |> count(CRSHNMBR) c = import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", columns = c("CYCLFLAG", "CRSHTYPE", "MNRCOLL"), years = years) get_lanedeparture_crashes(c) anti_join( m[!duplicated(CRSHNMBR)], c[CYCLFLAG == "Y"], by = "CRSHNMBR") anti_join( c[CYCLFLAG == "Y"], m[!duplicated(CRSHNMBR)], by = "CRSHNMBR") # involve parked MC?
p$UNITTYPE |> unique() p[UNITTYPE %in% c("Bicycle", "BICYCLE")][!duplicated(CRSHNMBR)][, .N, by = year] p[UNITTYPE %in% c("Pedestrian", "PEDESTRIAN")][!duplicated(CRSHNMBR)][, .N, by = year] select_bike_ped_persons <- function(person_df, ped = "Y", bike = "Y") { if (ped == "Y") { ped_select = c("Pedestrian", "PEDESTRIAN") } else { ped_select = c() } if (bike == "Y") { bike_select = c("Bicycle", "BICYCLE") } else { bike_select = c() } selection = dplyr::filter(person_df, .data[["UNITTYPE"]] %in% c(ped_select, bike_select)) # selection = person_df[.data[["UNITTYPE"]] %in% c(ped_select, bike_select)] selection |> dplyr::mutate( UNITTYPE = ifelse(.data[["UNITTYPE"]] == "BICYCLE", "Bicycle", UNITTYPE), UNITTYPE = ifelse(UNITTYPE == "PEDESTRIAN", "Pedestrian", .data[["UNITTYPE"]]) ) } select_bike_ped_persons(p, ped = "Y") p[.data[["UNITTYPE"]] %in% c("Pedestrian")] x |> count(UNITTYPE)
x = import_db_data( "C:/CSV/csv_from_sas/fst/", db_type = "vehicle", columns = c("OWNRTYP", "OWNRORG"), years = "17") x = import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", columns = c("ANMLTY", "OWNRORG", "TOTFATL", "TOTINJ"), years = "17") aggregate_crashes_by_crshsvr_tot_inj(x) file_name = "C:/CSV/csv_from_sas/fst/17crash.fst" colsToKeep = c("ANMLTY") #, "OWNRORG") read_cols <- function(file_name, colsToKeep) { header <- fst::read_fst(file_name, to = 1) if (is.null(colsToKeep)) { return(colnames(header)) } else { colsToKeep2 <- union( c( "CRSHDATE", "CNTYCODE", "CRSHSVR", "UNITNMBR", "ROLE", "VEHTYPE", "WISINJ" ), colsToKeep ) # Tack these on columns_not_in_db <- subset(colsToKeep, !colsToKeep %in% colnames(header)) if (length(columns_not_in_db) != 0) warning(paste0(paste0(columns_not_in_db, collapse = ", "), ' does not exist')) # Returns only columns found in the df return(subset(colsToKeep2, colsToKeep2 %in% colnames(header))) } } if( columns_not_in_db != "" ) warning(paste0(columns_not_in_db, ' does not exist')) import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "DRVRPC"), db_type = "person", years = "17") read_cols(file_name = "C:/CSV/csv_from_sas/fst/17person.fst", colsToKeep = c("CRSHTIME", "DRVRPC", "dd", "ANMLTY")) read_fst_for_new_db("C:/CSV/csv_from_sas/fst/17person.fst", c("CRSHTIME", "DRVRPC", "as")) col_to_select = c("CRSHTIME", "DRVRPC", "dd", "ANMLTY") col_to_select = "all" if (col_to_select[1] != "all") { # Get all names for data that may be in multiple column columns_with_multiples <- subset(col_to_select, grepl(# Columns that have multiples db_columns_with_multiples, # "WTCOND|RDCOND|ENVPC|RDWYPC|ADDTL|CLSRSN|ANMLTY|DMGAR|VEHPC|HAZPLAC|HAZNMBR|HAZCLSS|HAZNAME|HAZFLAG|DRVRDS|DRUGYT|DRVRRS|DRVRPC|DNMFTR|STATNM|NMTACT|NMTSFQ|PROTGR|CITISS|CITNM|STATDS|STATSV|RSTRCT|CITNM", col_to_select)) # If columns with multiples were found, make a list of all values, tack them onto the end. i.e. DRVRPC01, DRVRPC02, etc. if (length(columns_with_multiples) != 0) { col_to_select2 <- get_list_of_multiple_columns() } else { col_to_select2 <- union(c("CRSHNMBR"), col_to_select) } # col_to_select2 } # read_fst_for_new_db(file_to_read = "C:/CSV/csv_from_sas/fst/17person.fst", col_to_select = c("CRSHTIME", "DRVRPC")) # # Returns list of XX01:xx20 for columns with multiple (i.e. DRVRPC) # get_list_of_multiple_columns <- # function(mult_col = columns_with_multiples, col_sel = col_to_select) { # get_all_names <- # sapply(mult_col, # paste0, # formatC(seq(1, 20), width = 2, flag = "0")) %>% as.character() # Reduce(union, # list(c("CRSHNMBR"), # col_sel, # get_all_names)) # # } # # get_list_of_multiple_columns() p = import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME"), db_type = "person", years = "21") p %>% filter(WISINJ == "Fatal Injury") %>% arrange(CRSHDATE)
y =import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "dd"), db_type = "crash", years = "17") a = import_db_data( "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = c("17", "18")) a$year %>% unique() z = y %>% mutate(year = factor(year, levels = sort(unique(y$year)))) y %>% mutate(year = factor(.data$year, levels = sort(unique(.data$year))))
y %>% get_crash_times(column_name = "tes") z=z %>% wisdotcrashdatabase::get_crash_times(combine_with_old = TRUE) y %>% mutate(crash_time = convert_time_to_hms(CRSHTIME)) y %>% convert_time_to_hms() y = y %>% convert_to_datetime() y$crsh_datetime
all_years <- wisdotcrashdatabase::get_list_of_years("18", "21") %>% as.integer() years_old = all_years[which(all_years<17)] years_new = all_years[which(all_years>=17)] import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "DRVRPC"), db_type = "crash", years = c("16"))
library(tidyverse) # library(wisdotcrashdatabase) y =import_db_data( "C:/CSV/csv_from_sas/fst/", columns = c("CRSHTIME", "CRSHDATE", "DRVRPC"), db_type = "person", years = c("16", "17")) y = y%>% get_crash_times() y = y %>% convert_time_to_hms() y %>% convert_to_datetime()
library(tidyverse) library(data.table) old = import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("16", "17"), filetype = "fst") x = import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = c("MUNICODE"), db_type = "crash", years = "19") narr = import_narrative(filepath = "C:/CSV/csv_from_sas/fst/", years = "19")
library(duckdb) library(tidyverse) library(wisdotcrashdatabase) con <- duckdb::dbConnect(duckdb::duckdb(), dbdir = "C:/data/crashes_duck.duckdb", read_only = TRUE) duckdb::dbDisconnect(con, shutdown = TRUE) duckdb::duckdb_shutdown(duckdb::duckdb()) test = import_db_data(db_type = "crash", years = get_list_of_years("17", "17"), filetype = "duckdb", db_loc = "C:/data/crashes_duck.duckdb") import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = c("18", "19")) import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = c("89", "94")) import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = c("89", "17")) import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = "00") narr = import_narrative(filepath = "C:/CSV/csv_from_sas/fst/", years = "19", filetype = "duckdb") test %>% count(year) loc = "C:/data/crashes_duck.duckdb" # RSQLite::dbConnect(RSQLite::SQLite(), dbname = "C:/data/crash_db.sqlite") # microbenchmark::microbenchmark(times = 10L, # import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "person", years = "00", filetype = "fst"), # 9.6 # import_db_data(db_loc = "C:/data/crash_db.db", db_type = "person", years = "00")) # 10.6 microbenchmark::microbenchmark(times = 10L, # 63% faster import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19"), filetype = "fst"), # 7.3 s import_db_data("C:/data/crashes_duck.duckdb", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19")) # 2.7 # import_db_data("C:/data/crashes_db.sqlite", columns = c("DRVRPC", "AGE"), db_type = "person", years = c("18", "19")) # sqlite ) microbenchmark::microbenchmark(times = 10L, # 13% faster import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", columns = "all", db_type = "person", years = c("18"), filetype = "fst"), # 8.5 s import_db_data("C:/data/crashes_duck.duckdb", columns = "all", db_type = "person", years = c("18"), filetype = "duckdb") # 7.3 ) microbenchmark::microbenchmark(times = 10L, # 37% faster import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "fst"), # 13.5 import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "duckdb") # 8.5 ) microbenchmark::microbenchmark(times = 10L, # 72% faster import_db_data(filepath = "C:/CSV/csv_from_sas/fst/", db_type = "crash", years = get_list_of_years("17", "21"), filetype = "fst"), # 6.4 import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("17", "21"), filetype = "duckdb") # 1.8 )
microbenchmark::microbenchmark( times = 10L, # import_db_data("C:/data/crashes_duck.duckdb", db_type = "crash", years = get_list_of_years("12", "16"), filetype = "duckdb") .7 sec with empty df # x %>% # dplyr::filter( # .data$CRSHSVR != 'NON-REPORTABLE', # .data$CRSHLOC == 'INTERSECTION' | # .data$CRSHLOC == 'NON-INTERSECTION' # ), #.7 # x %>% # dplyr::mutate( # CRSHDATE = lubridate::ymd(.data$CRSHDATE), # # year = lubridate::year(.data$CRSHDATE), # year = factor(.data$year, levels = sort(unique(.data$year))) # ) %>% data.table::as.data.table() # 1.3 sec ) x = purrr::map_dfr( .x = c("12crash", "13crash", "14crash", "15crash", "16crash"), .f = read_duckdb_for_new_old_db, filepath = NULL, col_to_select = "all", con = con, filter_by = NULL ) # 4.2 4.7 + .7 + 1.3
# DBI::dbReadTable(con, "18person")DBI::dbReadTable(con, "18person") filter_by = "county:Dane, Fond du Lac" sub(pattern = "^\\:.", replacement = "\\1", filter_by) c = strsplit(sub(pattern = "^county:", replacement = "\\1", filter_by), ", ") %>% unlist() crash <- crash %>% wisdotcrashdatabase::county_rename(combine_with_old = TRUE) crash %>% filter(countyname %in% "Fond du Lac")
data_years = paste(c("17", "18"), "crash", sep = "") col_to_grab = c("ANMLTY", "CRSHDATE") cols_to_keep = columns_to_find(col_to_find = col_to_grab) find_columns_in_all_new_db(data_new = data_years, filepath = "C:/CSV/csv_from_sas/fst/", colsToKeep = cols_to_keep, filetype = "fst", con = con) columns_to_find <- function(col_to_find) { # If specific columns were selected, find which match in the database if (col_to_find[1] == "all") { found_columns <- "all" } else { # Get all names for data that may be in multiple column columns_with_multiples <- subset(col_to_find, grepl(# Columns that have multiples db_columns_with_multiples, col_to_find)) # If columns with multiples were found, make a list of all values, tack them onto the end. i.e. DRVRPC01, DRVRPC02, etc. if (length(columns_with_multiples) != 0) { # This adds the '01' to '20' to the end of each matching column get_all_names <- sapply(columns_with_multiples, paste0, formatC(seq(1, 20), width = 2, flag = "0")) %>% as.character() col_to_find <- Reduce(union, list(c("CRSHNMBR"), col_to_find, get_all_names)) } else { col_to_find <- union(c("CRSHNMBR"), col_to_find) } } return(col_to_find) } get_colnames_fst <- function(file_to_read){ header <- fst::read_fst(file_to_read, to = 1) colnames(header) } get_colnames_duckdb <- function(file_to_read, con){ header <- dplyr::tbl(con, file_to_read) %>% utils::head(1) colnames(header) } find_columns_in_all_new_db <- function(data_new, filepath, colsToKeep, filetype, con = con) { if (filetype == "fst") { full_file_name = paste(filepath, data_new, ".fst", sep = "") all_col_names = lapply(full_file_name, get_colnames_fst) %>% unique(unlist(.)) %>% unlist() } if (filetype == "duckdb") { all_col_names = lapply(data_new, get_colnames_duckdb, con = con) %>% unique(unlist(.)) %>% unlist() } if (is.null(colsToKeep)) { # WHY THIS?? return(all_col_names) } else { colsToKeep2 <- union( c( "CRSHDATE", "CNTYCODE", "CRSHSVR", "UNITNMBR", "ROLE", "VEHTYPE", "WISINJ" ), colsToKeep ) # Tack these on columns_not_in_db <- subset(colsToKeep, !colsToKeep %in% all_col_names) columns_not_in_db = subset(columns_not_in_db, !grepl('[[:digit:]]+', columns_not_in_db)) # Returns warning if column not found if (length(columns_not_in_db) != 0) warning( paste0( " ", paste0(columns_not_in_db, collapse = ", "), ' not found in new db (ignore if variable is in multiple cols)' ), call. = FALSE ) # Returns only columns found in the df return(subset(colsToKeep2, colsToKeep2 %in% all_col_names)) } }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.