Include some notes here about your assessment of the initial validation.
knitr::opts_chunk$set(comment = NA) library(tidyverse) library(lubridate) library(DBI) library(salic) library(salicprep) source("../params.R")
lic <- read_csv("../../data/lic-clean.csv") %>% filter(type %in% c("fish", "hunt", "combo", "trap")) con <- dbConnect(RSQLite::SQLite(), db_standard) sale <- tbl(con, "sale") %>% select(lic_id, cust_id, year, dot, start_date, end_date) %>% collect() %>% semi_join(lic, by = "lic_id") cust <- tbl(con, "cust") %>% select(cust_id, sex, dob, cust_res) %>% collect() %>% semi_join(sale, by = "cust_id") dbDisconnect(con) cust <- df_factor_sex(cust, "sex") %>% df_factor_res("cust_res") %>% mutate(dob = ymd(dob), birth_year = year(dob))
Initial summary of customers per year:
summary_initial(sale)
filter(cust, birth_year >= 1900) %>% plot_demo(c("sex", "birth_year", "cust_res"))
hunt <- sale %>% semi_join(filter(lic, type %in% c("hunt", "combo")), by = "lic_id") summary_initial(hunt)
filter(cust, birth_year >= 1900) %>% semi_join(hunt, by = "cust_id") %>% plot_demo(c("sex", "birth_year", "cust_res"))
hunt <- left_join(hunt, cust, by = "cust_id") %>% recode_agecat() %>% df_factor_age() plot_demo(hunt, c("sex", "age", "cust_res"), trend = TRUE)
fish <- sale %>% semi_join(filter(lic, type %in% c("fish", "combo")), by = "lic_id") summary_initial(fish)
filter(cust, birth_year >= 1900) %>% semi_join(fish, by = "cust_id") %>% plot_demo(c("sex", "birth_year", "cust_res"))
fish <- left_join(fish, cust, by = "cust_id") %>% recode_agecat() %>% df_factor_age() plot_demo(fish, c("sex", "age", "cust_res"), trend = TRUE)
The year variable provided by the state doesn't always line up with transaction dates. This typically has to do with a "license year" not directly corresponding to calendar year (at least for certain license types).
sale <- mutate_at(sale, vars(dot, start_date, end_date), "ymd") summary_year_dot(sale, lastyr) %>% knitr::kable()
Getting a sense of what the transaction date looks like.
summary_date(sale, "dot", yrs)
Looking at the difference between start and end date to get a sense of what that looks like.
summary_duration(sale, yrs)
Gaps in license types for certain years can sometimes (though not always) indicate missing data. This is typically alot of data to scan, included mainly in case it's needed for future reference.
lic <- mutate(lic, description = paste(type, lic_id, description, sep = "-")) sale <- left_join(sale, lic, by = "lic_id") x <- summary_lic_types(sale) spread(x, year, n, fill = "") %>% knitr::kable(caption = "Customer counts by year for license types")
plot_lic_types(x)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.