R/demo_profiles_redshift.R

Defines functions demographic_profiles create_plots create_proportion_tbl create_recoded_experian_tbl

Documented in create_plots create_proportion_tbl create_recoded_experian_tbl demographic_profiles

#install.packages("extrafont")
#library(extrafont)
library(AMGutils)

#font_import(paths = "/mnt/fonts/" ,prompt = FALSE)


#' Create the recoded and ordered experian table for grouping
#' @param aggregation_level (string) "person" or "household"
#'                          if "person" aggregation will use person_id605
#'                          if "household" aggregation will use id605
#' @param experian_table_name (string) non PII experian table name
#' @param experian_pii_table_name (string) PII experian table name
#' @param experian_id (string) "id605" or "person_id605"
#' @param additional_demographics_tbl (remote table) distinct rows of id605 or person_id605 + additional, already coded and ordered additional demographics
#' @export
create_recoded_experian_tbl <- function(aggregation_level,
                                        experian_table_name,
                                        experian_pii_table_name,
                                        experian_id,
                                        additional_demographics_tbl) {    # Remote table: id605/person_id605 | additional demogrpahic columns

  # select + rename demographic column names  =========================
  if(aggregation_level == "household") {

    # Experian table & selected demographics
    experian_tbl <- tbl(db, experian_table_name) %>%
      select(experian_id, recipient_reliability_code,
             estimated_household_income_amount_v6,                # HH income
             person_1_ethnic_group,                  # Person Race
             person_1_education_model,               # Person Education
             person_1_marital_status,                # Person Marital Status
             number_of_children_in_living_unit) %>%  # Number of Children
      filter(recipient_reliability_code != 6) %>%
      select(-recipient_reliability_code)

    experian_pii_tbl <- tbl(db, experian_pii_table_name) %>%
      select(experian_id,
             person_combined_age,    # Person Age
             person_gender)          # Person Gender

    experian_combo <- experian_tbl %>%
      inner_join(experian_pii_tbl, by = experian_id) %>%
      rename(income         = estimated_household_income_amount_v6,
             race           = person_1_ethnic_group,
             education      = person_1_education_model,
             marital_status = person_1_marital_status,
             n_children     = number_of_children_in_living_unit,
             age            = person_combined_age,
             gender         = person_gender)

  } else if(aggregation_level == "person") {

    experian_pii_tbl <- tbl(db, experian_pii_table_name) %>%
      select(experian_id,
             id605,
             household_income,       # Person HH Income
             person_1_race,          # Person 1 Race
             person_1_education,     # Person 1 Education
             number_of_children,     # Number of Children
             person_combined_age,    # Person Age
             person_gender)          # Person Gender

    marriage_tbl <- tbl(db, experian_table_name) %>%
      select(id605, person_1_marital_status) %>% distinct()

    experian_combo <- experian_pii_tbl %>%
      inner_join(marriage_tbl, by = "id605") %>%
      rename(income         = household_income,
             race           = person_1_race,
             education      = person_1_education,
             marital_status = person_1_marital_status,
             n_children     = number_of_children,
             age            = person_combined_age,
             gender         = person_gender) %>%
      select(-id605)
  }

  # Set up SQL for binning continuous variables
  pretty_cut_age <- pretty_cut.sql(col_name = "age",
                                   cuts = c(24, 34, 44, 54, 64, 74),
                                   right = TRUE,
                                   labs = c("18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+"))

  pretty_cut_income <- pretty_cut.sql(col_name = "income",
                                      cuts = c(24, 49, 74, 99, 149),
                                      right = TRUE,
                                      labs = c("Less than 25K", "25-49K", "50-74K", "75-99K", "100-149K", "150K+"))

  # Bin/catageorize demographic columns =========================
  if(aggregation_level == "household") {
    experian_combo <- experian_combo %>%
      mutate(age_bin       = pretty_cut_age,
             income_bin    = pretty_cut_income,
             gender_bin    = ifelse(gender == "M", "Male",
                                    ifelse(gender == "F", "Female", NA)),
             ethnicity_bin = ifelse(race %in% c("E", "G", "K", "L", "I"), "White",
                                    ifelse(race %in% c("A", "M"), "Black",
                                           ifelse(race == "O", "Hispanic",
                                                  ifelse(race %in% c('B', 'C', 'D', 'H', 'N'), "Asian", NA)))),
             education_bin = ifelse(education %in% c(15, 55), "Less than High School",
                                    ifelse(education %in% c(11, 51), "High School Diploma",
                                           ifelse(education %in% c(12, 52), "Some College",
                                                  ifelse(education %in% c(13, 53), "Bachelor Degree",
                                                         ifelse(education %in% c(14, 54), "Graduate Degree", NA))))),
             marital_bin   = ifelse(marital_status %in% c("1M", "5M"), "Married",
                                    ifelse(marital_status == "5S", "Single", NA)),
             children_bin  = ifelse(n_children > 0, "Children in\nHousehold", "No Children in\nHousehold")) %>%
      select(experian_id, ends_with("bin"))

  } else if (aggregation_level == "person") {

    experian_combo <- experian_combo %>%
      mutate(age_bin       = pretty_cut_age,
             income_bin    = pretty_cut_income,
             gender_bin    = ifelse(gender == "M", "Male",
                                    ifelse(gender == "F", "Female", NA)),
             ethnicity_bin = ifelse(race == "Caucasian", "White",
                                    ifelse(race == "African American", "Black",
                                           ifelse(race == "Hispanic", "Hispanic",
                                                  ifelse(race == "Asian", "Asian", NA)))),
             education_bin = ifelse(education == "Less Than HS Diploma", "Less than High School",
                                    ifelse(education == "HS Diploma", "High School Diploma",
                                           ifelse(education == "Some College", "Some College",
                                                  ifelse(education == "Bach Degree", "Bachelor Degree",
                                                         ifelse(education == "Grad Degree", "Graduate Degree", NA))))),
             marital_bin   = ifelse(marital_status %in% c("1M", "5M"), "Married",
                                    ifelse(marital_status == "5S", "Single", NA)),
             children_bin  = ifelse(n_children > 0, "Children in\nHousehold", "No Children in\nHousehold"))  %>%
      select(experian_id, ends_with("bin"))
  }

  # NOTE: maybe join farther up?
  if(!is.na(additional_demographics_tbl) %>% all()) {
    experian_combo <- experian_combo %>%
      inner_join(additional_demographics_tbl, by = experian_id)
  }
  # are sortkey/distkey needed/recommended?
  experian_combo_computed <- compute_redshift(x = experian_combo,
                                              sortKey = experian_id,
                                              distKey = experian_id,
                                              temporary = TRUE)
  return(experian_combo_computed)
}

#' Create and return the proportion calculations for each demographic x universe
#' @param recoded_experian_tbl (remote table) created by create_recoded_experian_tbl()
#' @param base_universe_tbl (remote table) DEFAULT = NA | if not, a remote table with a column of id605/person_id605 of self defined baseline universe
#' @param base_universe_name (string) Label name for the base universe in the plots
#' @param user_universe_list (list of remote tables) each remote table should be just a table of 1 column = id605 or person_id605
#' @param user_universe_names (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param demographic_list (list of strings with names) list of demographics to plot with names
#' @param experian_id (string) id605 or person_id605
#' @export
create_proportion_tbl <- function(recoded_experian_tbl,
                                  base_universe_tbl,
                                  base_universe_name,
                                  user_universe_list,
                                  user_universe_names,
                                  demographic_list,
                                  experian_id) {

  # BASELINE UNIVERSE
  if(is.na(base_universe_tbl) %>% all()) {
    base_universe <- recoded_experian_tbl %>% select(experian_id) %>% distinct()
  } else {
    base_universe <- base_universe_tbl
  }

  # User defined target universes
  universe_list <- user_universe_list
  names(universe_list) <- user_universe_names

  # Append baseline universe to universe_list for looping
  universe_list[[base_universe_name]] <- base_universe

  # Loop through universes & demographic lists to calculate proportions
  proportion_tbl <- lapply(seq(universe_list), function(x) {

    # select out individual universe
    universe_ids <- universe_list[[x]]
    universe_name <- names(universe_list)[[x]]

    # inner join to filter to the demographics of the current universe
    universe_demogs <- recoded_experian_tbl %>% inner_join(universe_ids, by = experian_id) %>% compute_redshift(sortKey = experian_id)

    # within the selected universe, loop through all demographics
    all_demog_proportions <- lapply(seq(demographic_list), function(y) {
      # select out current demographic
      demographic <- demographic_list[[y]]
      plot_title <- names(demographic_list)[[y]]

      # Calculate demographic proportion
      current_universe_demogs <- universe_demogs %>% rename("current_demog" = demographic)

      prop <- current_universe_demogs %>%
        filter(!is.na(current_demog)) %>%                     # filter out NAs
        group_by(current_demog) %>%
        summarise(bin_count = n()) %>% collect()

      # all_current_demogs <- current_universe_demogs %>% select(current_demog) %>% filter(!is.na(current_demog)) %>% distinct() %>% collect()
      all_current_demogs <- recoded_experian_tbl %>% rename("current_demog" = demographic) %>%
        select(current_demog) %>% filter(!is.na(current_demog)) %>% distinct() %>% collect()

      prop <- all_current_demogs %>%
        left_join(prop, by = "current_demog") %>%
        mutate(bin_count = as.numeric(bin_count)) %>%
        mutate(bin_count = ifelse(is.na(bin_count), 0, bin_count)) %>%
        mutate(segment_total = sum(bin_count, na.rm = TRUE),
               demog_prop = bin_count/segment_total,
               demog_prop_round = round(bin_count/segment_total, 3),
               demog_prop_print = paste0(format((demog_prop_round*100), nsmall = 1), "%"),
               # demog_prop_print = paste0(format(round(demog_prop*100, 1), nsmall = 1), "%"),
               demographic = demographic)
      return(prop)

    }) %>% bind_rows()

    return(all_demog_proportions %>% mutate(segment = universe_name))  # append on universe name

  }) %>% bind_rows()

  return(proportion_tbl)
}

#' Creates and saves demographic profile plots
#' @param proportion_tbl (local table) proportion table created by create_proportion_tbl()
#' @param demographic_list (list of strings with names) list of demographics to plot with names
#' @param plot_save_location (string) directory path to the location the tables should be saved in
#' @param additional_demographics_levels_and_x_axis_title (nested list) levels and x axist titles of additional demographics
#' @param base_universe_name (string) Label name for the base universe in the plots
#' @param user_universe_names (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param custom_palette (list of strings) color codes for custom color palette - MUST BE LENGTH 8
#' @export
create_plots <- function(proportion_tbl,
                         demographic_list,
                         plot_save_location,
                         additional_demographics_levels_and_x_axis_title = NA,
                         base_universe_name,
                         user_universe_names,
                         custom_palette = NA) {

  pretty_names_base <- list(age_bin       = list(x_axis_title = "Age Group",
                                                 levels       = c("18-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+")),
                            income_bin    = list(x_axis_title = "Household Income",
                                                 levels       = c("Less than 25K", "25-49K", "50-74K", "75-99K", "100-149K", "150K+")),
                            gender_bin    = list(x_axis_title = "Gender",
                                                 levels       = c("Female", "Male")),
                            education_bin = list(x_axis_title = "Head of Household Education",
                                                 levels       = c("Less than High School", "High School Diploma", "Some College", "Bachelor Degree", "Graduate Degree")),
                            marital_bin   = list(x_axis_title = "Head of Household Marital Status",
                                                 levels       = c("Single", "Married")),
                            children_bin  = list(x_axis_title = "Presence of Children",
                                                 levels       = c("Children in\nHousehold", "No Children in\nHousehold")),
                            ethnicity_bin = list(x_axis_title = "Head of Household Race",
                                                 levels       = c("White", "Hispanic", "Black", "Asian")))

  if(!is.na(additional_demographics_levels_and_x_axis_title) %>% all()) {
    pretty_names <- c(pretty_names_base, additional_demographics_levels_and_x_axis_title)
  } else {
    pretty_names <- pretty_names_base
  }

  lapply(seq(demographic_list), function(x) {

    tbl_to_plot <- proportion_tbl %>%
      filter(demographic == demographic_list[x])

    prop_levels <- pretty_names[[demographic_list[x]]]$levels
    x_axis_title <- pretty_names[[demographic_list[x]]]$x_axis_title

    tbl_to_plot$current_demog <- factor(tbl_to_plot$current_demog,
                                        levels = prop_levels,
                                        ordered = TRUE)

    # cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
    # cbPalette <- c("#363B35", "#6D59E0", "#5EAEFF", "#80388C", "#06E5D2", "#0072B2", "#D55E00", "#CC79A7")
    if(!is.na(custom_palette) %>% all()) {
      cbPalette <- custom_palette
    } else {
      cbPalette <- c("#d9d9d9","#0b5394","#3d85c6","6fa8dc", "9fc5e8", "#3d85c6","6fa8dc", "9fc5e8")
    }

    tbl_to_plot$segment <- factor(tbl_to_plot$segment, levels = c(base_universe_name, user_universe_names), ordered = TRUE)

    if(length(user_universe_names >= 3)) {
      text_size <- 2.5
    } else {
      text_size <- 3.5
    }

    plot_x <- tbl_to_plot %>%
      ggplot(aes(x = current_demog, y = demog_prop_round, fill = segment)) +
      geom_bar(stat = "identity", position = position_dodge()) +
      geom_text(aes(label = demog_prop_print), vjust = -0.5, color = "black",    # print percentage on top of the bar
                position = position_dodge(0.9), size = text_size) +
      scale_fill_manual(values = cbPalette) +
      scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
      labs(x = x_axis_title,
           y = "% of Segment Total") +
      theme_minimal() +
      theme(text = element_text(family = "ArialMT"),                  # set all plot text fonts to Arial
            panel.grid.major.x = element_blank(),                     # remove all grid lines
            panel.grid.major.y = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.minor.y = element_blank(),
            legend.position = "bottom",                               # set legend position to bottom
            legend.title = element_blank(),                           # remove title
            legend.text = element_text(margin = margin(r = 10, unit = "pt"))) +
      ggsave(path = plot_save_location,
             filename = paste0(demographic_list[x], ".png"),
             width = 11, height = 5.5)
  })
}

#' Wrapper function to create demographic profile plots
#' @param aggregation_level           (string) "person" or "household"
#'                                             if "person"    aggregation will use person_id605
#'                                             if "household" aggregation will use id605
#' @param experian_table_name         (string) non PII experian table name
#' @param experian_pii_table_name     (string) PII experian table name
#' @param additional_demographics_tbl (remote table) distinct rows of id605 or person_id605 + additional
#'                                                   should already coded and ordered additional demographics
#' @param additional_demographic_list (list of strings with names) list of additional demographics to plot with names
#' @param additional_demographics_levels_and_x_axis_title (nested list) levels and x axis titles of additional demographics
#' @param custom_palette              (list of strings) color codes for custom color palette - MUST BE LENGTH 8
#' @param user_universe_list          (list of remote tables) each remote table should be just a table of 1 column = id605 or person_id605
#' @param user_universe_names         (list of strings) names of the universes from user_universe_list - must be in the same order
#' @param base_universe_tbl           (remote table) function will use the id605s from the inner join of the 2 experian tables
#'                                                   if self defined, should be a remote table with a column of id605/person_id605
#' @param base_universe_name          (string) Label name for the base universe in the plots
#' @param plot_save_location          (string) directory path to the location the tables should be saved in
#' @export
demographic_profiles <- function(aggregation_level,
                                 experian_table_name,
                                 experian_pii_table_name,
                                 additional_demographics_tbl = NA,
                                 additional_demographic_list = NA,
                                 additional_demographics_levels_and_x_axis_title = NA,
                                 custom_palette = NA,
                                 user_universe_list,
                                 user_universe_names,
                                 base_universe_tbl = NA,
                                 base_universe_name,
                                 plot_save_location) {

  # depending on aggregation level, this experian_id assigned will be selected and joined on
  if(aggregation_level == "household") {
    experian_id <- "id605"
  } else if(aggregation_level == "person") {
    experian_id <- "person_id605"
  }

  # the default demographics that will always be plotted
  default_demographics <- c("age_bin", "income_bin", "gender_bin", "ethnicity_bin", "education_bin", "children_bin", "marital_bin")
  names(default_demographics) <- c("Age", "Income", "Gender", "Race", "Education", "Presence of Children", "Marital Status")

  # if the user inputs additional demographics, add them to the demographics list for later use in looping and plotting
  if(!is.na(additional_demographic_list) %>% all()) {
    demographic_list <- c(default_demographics, additional_demographic_list)
  } else {
    demographic_list <- default_demographics
  }

  # paste in an extra " " for specific spacing in final plot legend
  user_universe_names <- paste0(" ", user_universe_names)
  base_universe_name <- paste0(" ", base_universe_name)

  # STEP 1 : create and recode the experian demographics tables
  # joins in any additonal demographics if the user specified
  step_1_demog_table <- create_recoded_experian_tbl(aggregation_level           = aggregation_level,
                                                    experian_id                 = experian_id,
                                                    experian_table_name         = experian_table_name,
                                                    experian_pii_table_name     = experian_pii_table_name,
                                                    additional_demographics_tbl = additional_demographics_tbl)

  # STEP 2 : create and save the proportion table
  step_2_prop_table <- create_proportion_tbl(recoded_experian_tbl = step_1_demog_table,
                                             user_universe_list   = user_universe_list,
                                             user_universe_names  = user_universe_names,
                                             demographic_list     = demographic_list,
                                             base_universe_tbl    = base_universe_tbl,
                                             base_universe_name   = base_universe_name,
                                             experian_id          = experian_id)

  # write to CSV for future reference if needed
  write.csv(x = step_2_prop_table, file = paste0(plot_save_location, "proportion_tbl.csv"), row.names = FALSE)

  # STEP 3 : Create plots
  step_3_create_plots <- create_plots(proportion_tbl      = step_2_prop_table,
                                      demographic_list    = demographic_list,
                                      additional_demographics_levels_and_x_axis_title = additional_demographics_levels_and_x_axis_title,
                                      base_universe_name  = base_universe_name,
                                      user_universe_names = user_universe_names,
                                      plot_save_location  = plot_save_location,
                                      custom_palette      = custom_palette)
}
605data/DemoProfiles documentation built on June 12, 2020, 12:01 a.m.