#install.packages("extrafont")
#library(extrafont)
library(AMGutils)
#font_import(paths = "/mnt/fonts/" ,prompt = FALSE)
#' Create the recoded and ordered experian table for grouping
#' @param aggregation_level (string) "person" or "household"
#' if "person" aggregation will use person_id605
#' if "household" aggregation will use id605
#' @param experian_table_name (string) non PII experian table name
#' @param experian_pii_table_name (string) PII experian table name
#' @param experian_id (string) "id605" or "person_id605"
#' @param additional_demographics_tbl (remote table) distinct rows of id605 or person_id605 + additional, already coded and ordered additional demographics
#' @export
create_recoded_experian_tbl <- function(aggregation_level,
experian_table_name,
experian_pii_table_name,
experian_id,
additional_demographics_tbl) { # Remote table: id605/person_id605 | additional demogrpahic columns
# select + rename demographic column names =========================
if(aggregation_level == "household") {
# Experian table & selected demographics
experian_tbl <- tbl(db, experian_table_name) %>%
select(experian_id, recipient_reliability_code,
estimated_household_income_amount_v6, # HH income
person_1_ethnic_group, # Person Race
person_1_education_model, # Person Education
person_1_marital_status, # Person Marital Status
number_of_children_in_living_unit) %>% # Number of Children
filter(recipient_reliability_code != 6) %>%
select(-recipient_reliability_code)
experian_pii_tbl <- tbl(db, experian_pii_table_name) %>%
select(experian_id,
person_combined_age, # Person Age
person_gender) # Person Gender
experian_combo <- experian_tbl %>%
inner_join(experian_pii_tbl, by = experian_id) %>%
rename(income = estimated_household_income_amount_v6,
race = person_1_ethnic_group,
education = person_1_education_model,
marital_status = person_1_marital_status,
n_children = number_of_children_in_living_unit,
age = person_combined_age,
gender = person_gender)
} else if(aggregation_level == "person") {
experian_pii_tbl <- tbl(db, experian_pii_table_name) %>%
select(experian_id,
id605,
household_income, # Person HH Income
person_1_race, # Person 1 Race
person_1_education, # Person 1 Education
number_of_children, # Number of Children
person_combined_age, # Person Age
person_gender) # Person Gender
marriage_tbl <- tbl(db, experian_table_name) %>%
select(id605, person_1_marital_status) %>% distinct()
experian_combo <- experian_pii_tbl %>%
inner_join(marriage_tbl, by = "id605") %>%
rename(income = household_income,
race = person_1_race,
education = person_1_education,
marital_status = person_1_marital_status,
n_children = number_of_children,
age = person_combined_age,
gender = person_gender) %>%
select(-id605)
}
# Set up SQL for binning continuous variables
pretty_cut_age <- pretty_cut.sql(col_name = "age",
cuts = c(24, 34, 44, 54, 64, 74),
right = TRUE,
labs = c("18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+"))
pretty_cut_income <- pretty_cut.sql(col_name = "income",
cuts = c(24, 49, 74, 99, 149),
right = TRUE,
labs = c("Less than 25K", "25-49K", "50-74K", "75-99K", "100-149K", "150K+"))
# Bin/catageorize demographic columns =========================
if(aggregation_level == "household") {
experian_combo <- experian_combo %>%
mutate(age_bin = pretty_cut_age,
income_bin = pretty_cut_income,
gender_bin = ifelse(gender == "M", "Male",
ifelse(gender == "F", "Female", NA)),
ethnicity_bin = ifelse(race %in% c("E", "G", "K", "L", "I"), "White",
ifelse(race %in% c("A", "M"), "Black",
ifelse(race == "O", "Hispanic",
ifelse(race %in% c('B', 'C', 'D', 'H', 'N'), "Asian", NA)))),
education_bin = ifelse(education %in% c(15, 55), "Less than High School",
ifelse(education %in% c(11, 51), "High School Diploma",
ifelse(education %in% c(12, 52), "Some College",
ifelse(education %in% c(13, 53), "Bachelor Degree",
ifelse(education %in% c(14, 54), "Graduate Degree", NA))))),
marital_bin = ifelse(marital_status %in% c("1M", "5M"), "Married",
ifelse(marital_status == "5S", "Single", NA)),
children_bin = ifelse(n_children > 0, "Children in\nHousehold", "No Children in\nHousehold")) %>%
select(experian_id, ends_with("bin"))
} else if (aggregation_level == "person") {
experian_combo <- experian_combo %>%
mutate(age_bin = pretty_cut_age,
income_bin = pretty_cut_income,
gender_bin = ifelse(gender == "M", "Male",
ifelse(gender == "F", "Female", NA)),
ethnicity_bin = ifelse(race == "Caucasian", "White",
ifelse(race == "African American", "Black",
ifelse(race == "Hispanic", "Hispanic",
ifelse(race == "Asian", "Asian", NA)))),
education_bin = ifelse(education == "Less Than HS Diploma", "Less than High School",
ifelse(education == "HS Diploma", "High School Diploma",
ifelse(education == "Some College", "Some College",
ifelse(education == "Bach Degree", "Bachelor Degree",
ifelse(education == "Grad Degree", "Graduate Degree", NA))))),
marital_bin = ifelse(marital_status %in% c("1M", "5M"), "Married",
ifelse(marital_status == "5S", "Single", NA)),
children_bin = ifelse(n_children > 0, "Children in\nHousehold", "No Children in\nHousehold")) %>%
select(experian_id, ends_with("bin"))
}
# NOTE: maybe join farther up?
if(!is.na(additional_demographics_tbl) %>% all()) {
experian_combo <- experian_combo %>%
inner_join(additional_demographics_tbl, by = experian_id)
}
# are sortkey/distkey needed/recommended?
experian_combo_computed <- compute_redshift(x = experian_combo,
sortKey = experian_id,
distKey = experian_id,
temporary = TRUE)
return(experian_combo_computed)
}
#' Create and return the proportion calculations for each demographic x universe
#' @param recoded_experian_tbl (remote table) created by create_recoded_experian_tbl()
#' @param base_universe_tbl (remote table) DEFAULT = NA | if not, a remote table with a column of id605/person_id605 of self defined baseline universe
#' @param base_universe_name (string) Label name for the base universe in the plots
#' @param user_universe_list (list of remote tables) each remote table should be just a table of 1 column = id605 or person_id605
#' @param user_universe_names (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param demographic_list (list of strings with names) list of demographics to plot with names
#' @param experian_id (string) id605 or person_id605
#' @export
create_proportion_tbl <- function(recoded_experian_tbl,
base_universe_tbl,
base_universe_name,
user_universe_list,
user_universe_names,
demographic_list,
experian_id) {
# BASELINE UNIVERSE
if(is.na(base_universe_tbl) %>% all()) {
base_universe <- recoded_experian_tbl %>% select(experian_id) %>% distinct()
} else {
base_universe <- base_universe_tbl
}
# User defined target universes
universe_list <- user_universe_list
names(universe_list) <- user_universe_names
# Append baseline universe to universe_list for looping
universe_list[[base_universe_name]] <- base_universe
# Loop through universes & demographic lists to calculate proportions
proportion_tbl <- lapply(seq(universe_list), function(x) {
# select out individual universe
universe_ids <- universe_list[[x]]
universe_name <- names(universe_list)[[x]]
# inner join to filter to the demographics of the current universe
universe_demogs <- recoded_experian_tbl %>% inner_join(universe_ids, by = experian_id) %>% compute_redshift(sortKey = experian_id)
# within the selected universe, loop through all demographics
all_demog_proportions <- lapply(seq(demographic_list), function(y) {
# select out current demographic
demographic <- demographic_list[[y]]
plot_title <- names(demographic_list)[[y]]
# Calculate demographic proportion
current_universe_demogs <- universe_demogs %>% rename("current_demog" = demographic)
prop <- current_universe_demogs %>%
filter(!is.na(current_demog)) %>% # filter out NAs
group_by(current_demog) %>%
summarise(bin_count = n()) %>% collect()
# all_current_demogs <- current_universe_demogs %>% select(current_demog) %>% filter(!is.na(current_demog)) %>% distinct() %>% collect()
all_current_demogs <- recoded_experian_tbl %>% rename("current_demog" = demographic) %>%
select(current_demog) %>% filter(!is.na(current_demog)) %>% distinct() %>% collect()
prop <- all_current_demogs %>%
left_join(prop, by = "current_demog") %>%
mutate(bin_count = as.numeric(bin_count)) %>%
mutate(bin_count = ifelse(is.na(bin_count), 0, bin_count)) %>%
mutate(segment_total = sum(bin_count, na.rm = TRUE),
demog_prop = bin_count/segment_total,
demog_prop_round = round(bin_count/segment_total, 3),
demog_prop_print = paste0(format((demog_prop_round*100), nsmall = 1), "%"),
# demog_prop_print = paste0(format(round(demog_prop*100, 1), nsmall = 1), "%"),
demographic = demographic)
return(prop)
}) %>% bind_rows()
return(all_demog_proportions %>% mutate(segment = universe_name)) # append on universe name
}) %>% bind_rows()
return(proportion_tbl)
}
#' Creates and saves demographic profile plots
#' @param proportion_tbl (local table) proportion table created by create_proportion_tbl()
#' @param demographic_list (list of strings with names) list of demographics to plot with names
#' @param plot_save_location (string) directory path to the location the tables should be saved in
#' @param additional_demographics_levels_and_x_axis_title (nested list) levels and x axist titles of additional demographics
#' @param base_universe_name (string) Label name for the base universe in the plots
#' @param user_universe_names (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param custom_palette (list of strings) color codes for custom color palette - MUST BE LENGTH 8
#' @export
create_plots <- function(proportion_tbl,
demographic_list,
plot_save_location,
additional_demographics_levels_and_x_axis_title = NA,
base_universe_name,
user_universe_names,
custom_palette = NA) {
pretty_names_base <- list(age_bin = list(x_axis_title = "Age Group",
levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+")),
income_bin = list(x_axis_title = "Household Income",
levels = c("Less than 25K", "25-49K", "50-74K", "75-99K", "100-149K", "150K+")),
gender_bin = list(x_axis_title = "Gender",
levels = c("Female", "Male")),
education_bin = list(x_axis_title = "Head of Household Education",
levels = c("Less than High School", "High School Diploma", "Some College", "Bachelor Degree", "Graduate Degree")),
marital_bin = list(x_axis_title = "Head of Household Marital Status",
levels = c("Single", "Married")),
children_bin = list(x_axis_title = "Presence of Children",
levels = c("Children in\nHousehold", "No Children in\nHousehold")),
ethnicity_bin = list(x_axis_title = "Head of Household Race",
levels = c("White", "Hispanic", "Black", "Asian")))
if(!is.na(additional_demographics_levels_and_x_axis_title) %>% all()) {
pretty_names <- c(pretty_names_base, additional_demographics_levels_and_x_axis_title)
} else {
pretty_names <- pretty_names_base
}
lapply(seq(demographic_list), function(x) {
tbl_to_plot <- proportion_tbl %>%
filter(demographic == demographic_list[x])
prop_levels <- pretty_names[[demographic_list[x]]]$levels
x_axis_title <- pretty_names[[demographic_list[x]]]$x_axis_title
tbl_to_plot$current_demog <- factor(tbl_to_plot$current_demog,
levels = prop_levels,
ordered = TRUE)
# cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# cbPalette <- c("#363B35", "#6D59E0", "#5EAEFF", "#80388C", "#06E5D2", "#0072B2", "#D55E00", "#CC79A7")
if(!is.na(custom_palette) %>% all()) {
cbPalette <- custom_palette
} else {
cbPalette <- c("#d9d9d9","#0b5394","#3d85c6","6fa8dc", "9fc5e8", "#3d85c6","6fa8dc", "9fc5e8")
}
tbl_to_plot$segment <- factor(tbl_to_plot$segment, levels = c(base_universe_name, user_universe_names), ordered = TRUE)
if(length(user_universe_names >= 3)) {
text_size <- 2.5
} else {
text_size <- 3.5
}
plot_x <- tbl_to_plot %>%
ggplot(aes(x = current_demog, y = demog_prop_round, fill = segment)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_text(aes(label = demog_prop_print), vjust = -0.5, color = "black", # print percentage on top of the bar
position = position_dodge(0.9), size = text_size) +
scale_fill_manual(values = cbPalette) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(x = x_axis_title,
y = "% of Segment Total") +
theme_minimal() +
theme(text = element_text(family = "ArialMT"), # set all plot text fonts to Arial
panel.grid.major.x = element_blank(), # remove all grid lines
panel.grid.major.y = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
legend.position = "bottom", # set legend position to bottom
legend.title = element_blank(), # remove title
legend.text = element_text(margin = margin(r = 10, unit = "pt"))) +
ggsave(path = plot_save_location,
filename = paste0(demographic_list[x], ".png"),
width = 11, height = 5.5)
})
}
#' Wrapper function to create demographic profile plots
#' @param aggregation_level (string) "person" or "household"
#' if "person" aggregation will use person_id605
#' if "household" aggregation will use id605
#' @param experian_table_name (string) non PII experian table name
#' @param experian_pii_table_name (string) PII experian table name
#' @param additional_demographics_tbl (remote table) distinct rows of id605 or person_id605 + additional
#' should already coded and ordered additional demographics
#' @param additional_demographic_list (list of strings with names) list of additional demographics to plot with names
#' @param additional_demographics_levels_and_x_axis_title (nested list) levels and x axis titles of additional demographics
#' @param custom_palette (list of strings) color codes for custom color palette - MUST BE LENGTH 8
#' @param user_universe_list (list of remote tables) each remote table should be just a table of 1 column = id605 or person_id605
#' @param user_universe_names (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param base_universe_tbl (remote table) function will use the id605s from the inner join of the 2 experian tables
#' if self defined, should be a remote table with a column of id605/person_id605
#' @param base_universe_name (string) Label name for the base universe in the plots
#' @param plot_save_location (string) directory path to the location the tables should be saved in
#' @export
demographic_profiles <- function(aggregation_level,
experian_table_name,
experian_pii_table_name,
additional_demographics_tbl = NA,
additional_demographic_list = NA,
additional_demographics_levels_and_x_axis_title = NA,
custom_palette = NA,
user_universe_list,
user_universe_names,
base_universe_tbl = NA,
base_universe_name,
plot_save_location) {
# depending on aggregation level, this experian_id assigned will be selected and joined on
if(aggregation_level == "household") {
experian_id <- "id605"
} else if(aggregation_level == "person") {
experian_id <- "person_id605"
}
# the default demographics that will always be plotted
default_demographics <- c("age_bin", "income_bin", "gender_bin", "ethnicity_bin", "education_bin", "children_bin", "marital_bin")
names(default_demographics) <- c("Age", "Income", "Gender", "Race", "Education", "Presence of Children", "Marital Status")
# if the user inputs additional demographics, add them to the demographics list for later use in looping and plotting
if(!is.na(additional_demographic_list) %>% all()) {
demographic_list <- c(default_demographics, additional_demographic_list)
} else {
demographic_list <- default_demographics
}
# paste in an extra " " for specific spacing in final plot legend
user_universe_names <- paste0(" ", user_universe_names)
base_universe_name <- paste0(" ", base_universe_name)
# STEP 1 : create and recode the experian demographics tables
# joins in any additonal demographics if the user specified
step_1_demog_table <- create_recoded_experian_tbl(aggregation_level = aggregation_level,
experian_id = experian_id,
experian_table_name = experian_table_name,
experian_pii_table_name = experian_pii_table_name,
additional_demographics_tbl = additional_demographics_tbl)
# STEP 2 : create and save the proportion table
step_2_prop_table <- create_proportion_tbl(recoded_experian_tbl = step_1_demog_table,
user_universe_list = user_universe_list,
user_universe_names = user_universe_names,
demographic_list = demographic_list,
base_universe_tbl = base_universe_tbl,
base_universe_name = base_universe_name,
experian_id = experian_id)
# write to CSV for future reference if needed
write.csv(x = step_2_prop_table, file = paste0(plot_save_location, "proportion_tbl.csv"), row.names = FALSE)
# STEP 3 : Create plots
step_3_create_plots <- create_plots(proportion_tbl = step_2_prop_table,
demographic_list = demographic_list,
additional_demographics_levels_and_x_axis_title = additional_demographics_levels_and_x_axis_title,
base_universe_name = base_universe_name,
user_universe_names = user_universe_names,
plot_save_location = plot_save_location,
custom_palette = custom_palette)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.