#' Pretty data function
#'
#' Prepare data for plotting.
#'
#' This function performs the following operations (if the variables exist):
#' \itemize{
#' \item \code{itemYear} is converted to factor format
#' \item \code{ageGroup} is converted to factor format with levels \code{c("0-17", "18-24",
#' "25-34", "35-44", "45-54", "55-64", "65+", "Invalid")}
#' \item \code{issueDate} is converted to date format
#' \item \code{monthAbb} factor variable is created with three-letter abbreviations for
#' the English month names
#' \item \code{yearAbb} factor variable is created with two-digit year abbreviations
#' \item \code{monthYear} factor variable is created with three-letter month
#' abbreviations and two-digit year abbreviations
#' \item \code{residency} is converted to a factor with levels \code{c("Resident",
#' "Non-Resident")}
#' \item \code{itemResidency} is converted to a factor with levels \code{c("Resident Item",
#' "Non-Resident Item")}
#' \item \code{gender} is converted to a factor
#' \item \code{itemType} is converted to a factor
#' \item Any factor variables with missing values are given an explicit factor
#' level to ensure that they appear in summaries and on plots
#' }
#'
#' @param df A dataframe
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
prettyData <- function(df) {
df %>%
# Convert item year to factor
conditional_mutate(
exists("itemYear", where = .),
LHS = itemYear, RHS = as.factor(itemYear)
) %>%
# Convert age group to factor
conditional_mutate(
exists("ageGroup", where = .),
LHS = ageGroup, RHS = factor(ageGroup,
levels = c(
"0-17", "18-24",
"25-34", "35-44",
"45-54", "55-64",
"65+", "Invalid"
)
)
) %>%
# Convert issue date to date
conditional_mutate(
exists("issueDate", where = .),
LHS = issueDate, RHS = as.Date(issueDate)
) %>%
# Create month abbreviation variable
conditional_mutate(
exists("month", where = .),
LHS = monthAbb,
RHS = factor(month.abb[month], levels = month.abb)
) %>%
# Create year abbreviation variable
conditional_mutate(
exists("year", where = .),
LHS = yearAbb,
RHS = forcats::fct_reorder(substr(year, 3, 4), year)
) %>%
# Create month year variable
conditional_mutate(
exists("month", where = .) & exists("year", where = .),
LHS = monthYear,
RHS = paste(monthAbb, yearAbb, sep = "-")
) %>%
# Order levels of month year variable
conditional_mutate(
exists("month", where = .) & exists("year", where = .),
LHS = monthYear,
RHS = factor(monthYear, levels = unique(monthYear[order(yearAbb, monthAbb)]))
) %>%
# Convert residency to descriptive values
conditional_mutate(
exists("residency", where = .),
LHS = residency,
RHS = forcats::fct_recode(residency,
`Resident` = "T",
`Non-resident` = "F"
)
) %>%
# Convert residency of item to descriptive values
conditional_mutate(
exists("itemResidency", where = .),
LHS = itemResidency,
RHS = forcats::fct_recode(
itemResidency,
`Resident Item` = "T",
`Non-resident Item` = "F"
)
) %>%
# Convert gender to factor
conditional_mutate(
exists("gender", where = .),
LHS = gender,
RHS = as.factor(gender)
) %>%
# Convert itemType to factor
conditional_mutate(
exists("itemType", where = .),
LHS = itemType,
RHS = as.factor(itemType)
) %>%
# Replace missing values in factors
mutate_if(is.factor, forcats::fct_explicit_na)
}
#' Round axis labels
#'
#' Scale axis labels
#'
#' @param df A data frame
#' @param y Name of y-variable to scale
#' @param groupVars Variables to group by
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
scaleVariable <- function(df, y, groupVars = NULL) {
df %>%
group_by(!!!syms(groupVars)) %>%
mutate(!!sym(paste0(y, "Sum")) := sum(!!sym(y))) %>%
ungroup() %>%
mutate(divisor = case_when(
(max(!!sym(paste0(y, "Sum"))) %/% 1e6) > 0 ~ 1e6,
(max(!!sym(paste0(y, "Sum"))) %/% 1e3) > 0 ~ 1e3,
TRUE ~ 1
)) %>%
select(-!!sym(paste0(y, "Sum"))) %>%
mutate(!!sym(paste0(y, "Scaled")) := !!sym(y) / divisor)
}
#' Age cut
#'
#' Define cuts in age vector
#'
#' @param x A vector of ages
#' @param lower Lower bound for age bins
#' @param upper Upper bound for ang bins
#' @param by Size of age bins
#' @param sep Separator character for bin labels
#' @param above.char Character indicating top bin
#'
#'
#' @export
ageCut <- function(x, lower = 0, upper, by = 10,
sep = "-", above.char = "+") {
# Force upper bound to match bin size
if ((upper %% by) != 0) {
upper <- upper + (upper %% by)
}
# Special case of size one bins
if (by == 1) {
lvls <- c(
seq(lower, upper - by, by = by),
paste(upper, above.char, sep = "")
)
myCuts <-
factor(if_else(x >= upper, paste(upper, above.char, sep = ""), as.character(x)),
levels = lvls
)
} else {
lvls <- c(
paste(
seq(lower, upper - by, by = by),
seq(lower + by - 1, upper - 1, by = by),
sep = sep
),
paste(upper, above.char, sep = "")
)
myCuts <-
cut(
floor(x),
breaks = c(seq(lower, upper, by = by), Inf),
right = FALSE,
labels = lvls
)
}
return(myCuts)
}
#' Count customers by group
#'
#' Count number of customers by group. This function will collect data from
#' the database if using SQL backend.
#'
#' @param df A data frame with a column named customerUID
#' @param groupVars A character vector of variable names to group by
#'
#' @return A data frame with columns for grouping variables and a column named
#' \code{customers} for number of customers Data frame is passed through
#' \code{\link{prettyData}} function.
#'
#' @examples
#' # Demo data: Count number of customers each year purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' countCustomers(c("itemYear", "itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Count number of customers each year purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' countCustomers(c("itemYear", "itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
countCustomers <- function(df, groupVars = NULL) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for required columns
reqVars <- c("customerUID")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
df %>%
group_by(!!!(syms(groupVars))) %>%
summarize(customers = n_distinct(customerUID)) %>%
ungroup() %>%
collect(n = Inf) %>%
arrange(!!!syms(groupVars)) %>%
prettyData()
}
#' Average participants by group
#'
#' Calculate average value by group
#'
#' @param df A data frame
#' @param startYear An integer
#' @param endYear An integer
#' @param groupVars A vector
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
calcAvgValue <- function(df, var, startYear, endYear, groupVars = NULL) {
df %>%
group_by(!!!(syms(c("itemYear", groupVars)))) %>%
summarize(!!sym(var) := sum(!!sym(var))) %>%
ungroup() %>%
mutate(itemYear = as.integer(as.character(itemYear))) %>%
filter(itemYear >= startYear, itemYear <= endYear) %>%
group_by(!!!(syms(groupVars))) %>%
summarize(!!sym(paste0(var, "Avg")) := mean(!!sym(var))) %>%
ungroup() %>%
mutate(avgLab = paste0("average (", startYear, "-", endYear, ")"))
}
#' Calculate gender proportion
#'
#' Calculate gender proportion by group. This function collects data from
#' the database.
#'
#' @param df A data frame with a column named customerUID
#' @param groupVars A character vector of variable names to group by
#'
#' @return A dataframe with columns for grouping variables and a column named
#' \code{genderProportion} for gender proportion. Data frame is passed through
#' \code{\link{prettyData}} function.
#'
#' @examples
#' # Demo data: Gender proportions for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcGenderProportion(c("itemYear", "itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Gender proportions for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcGenderProportion(c("itemYear", "itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
calcGenderProportion <- function(df, groupVars = NULL) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for required columns
reqVars <- c("customerUID", "gender")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
# Check invalid grouping variables
if ("gender" %in% groupVars) {
stop("Gender is not a valid grouping variable")
}
countCustomers(df, c("gender", groupVars)) %>%
group_by(!!!(syms(groupVars))) %>%
mutate(genderProportion = customers / sum(customers)) %>%
ungroup() %>%
arrange(!!!syms(c("gender", groupVars))) %>%
prettyData()
}
#' Sum revenue by group
#'
#' Calculate total revenue by group. Total revenue is the sum price variable.
#'
#' @param df A data frame
#' @param groupVars A character vector of variable names to group by
#'
#' @return A dataframe with columns for grouping variables and a column named
#' revenue for total revenue
#'
#' @examples
#' # Demo data: Calculate revenue each year from fishing licenses between 2010
#' # and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' sumRevenue(c("itemYear", "itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Calculate revenue each year from fishing licenses between 2010
#' # and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' sumRevenue(c("itemYear", "itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
sumRevenue <- function(df, groupVars = NULL) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for required columns
reqVars <- c("price")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
df %>%
select(!!!(syms(c("price", groupVars)))) %>%
group_by(!!!(syms(groupVars))) %>%
summarize(revenue = sum(price, na.rm = TRUE)) %>%
ungroup() %>%
collect(n = Inf) %>%
arrange(!!!syms(groupVars)) %>%
prettyData()
}
#' Count items by group
#'
#' Count number of items (permits, licenses, stamps, etc.) by group. This
#' function will collect data from the database if using SQL backend.
#'
#' @param df A data frame
#' @param groupVars A character vector of variable names to group by
#'
#' @return A data frame with columns for grouping variables and a column named
#' \code{items} for number of items. Data frame is passed through
#' \code{\link{prettyData}} function.
#'
#' @examples
#' # Demo data: Count number of deer licenses each year between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Deer", itemYear = c(2010, 2017))
#' ) %>%
#' countItems(c("itemYear", "itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Count number of deer licenses each year between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Deer", itemYear = c(2010, 2017))
#' ) %>%
#' countItems(c("itemYear", "itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
countItems <- function(df, groupVars = NULL) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
df %>%
group_by(!!!(syms(groupVars))) %>%
summarize(items = n()) %>%
ungroup() %>%
collect(n = Inf) %>%
arrange(!!!syms(groupVars)) %>%
prettyData()
}
#' Calculate churn
#'
#' Calculate churn using variable window size
#'
#' @param df A data frame
#' @param groupVars A character vector of variable names to group by
#' @param deathThreshold A numeric value indicating how many years
#' must elapse before an individual is considered churned
#'
#' @return A dataframe with columns for grouping variables and columns
#' churnedCustomers, retainedCustomers, totalCustomers, churnRate, and
#' retentionRate
#'
#' @examples
#' # Demo data: Churn for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcChurn(c("itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Churn for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcChurn(c("itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#' @import tidyr
#'
#' @export
calcChurn <- function(df, groupVars = NULL, deathThreshold = 5) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for required columns
reqVars <- c("customerUID", "itemYear")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
# Check invalid grouping variables
if ("itemYear" %in% groupVars) {
stop("Item year is not a valid grouping variable")
}
# Check range of data
years <- df %>%
distinct(itemYear) %>%
pull(itemYear)
if ((range(years)[2] - range(years)[1] + 1) <=
deathThreshold) {
stop(paste0(
"Window size is ", deathThreshold, ", but range of data",
" is only ", range(years)[2] - range(years)[1] + 1, " years"
))
}
# SPECIAL CASE: age ang ageGroup are not used in calculating customer status
groupVarsBefore <- groupVars[!groupVars %in% c("age", "ageGroup")]
groupVarsAfter <- groupVars[groupVars %in% c("age", "ageGroup")]
# Calculate years until next purchase
nextPurchase <-
df %>%
distinct(!!!(syms(c("customerUID", "itemYear", groupVarsBefore)))) %>%
group_by(!!!(syms(c("customerUID", groupVarsBefore)))) %>%
mutate(
nextPurchase = lead(itemYear, order_by = itemYear),
gap = nextPurchase - itemYear - 1
) %>%
ungroup()
# Define customer status
customerStatus <-
nextPurchase %>%
mutate(churnStatus = if_else(is.na(gap) | gap >= deathThreshold,
"churnedCustomers",
"retainedCustomers"
))
# Customer covariates (join age and ageGroup if needed)
customerCovar <- df %>%
distinct(!!!syms(c("customerUID", "itemYear", groupVarsAfter)))
# Calculate churn
churn <-
customerStatus %>%
left_join(customerCovar,
by = c("customerUID", "itemYear")
) %>%
group_by(!!!(syms(c("itemYear", groupVars, "churnStatus")))) %>%
summarize(N = n_distinct(customerUID)) %>%
collect(n = Inf) %>%
ungroup() %>%
spread(churnStatus, N, fill = 0) %>%
mutate(
totalCustomers = churnedCustomers + retainedCustomers,
churnRate = churnedCustomers / totalCustomers,
retentionRate = 1 - churnRate
) %>%
filter(itemYear <= (max(itemYear) - deathThreshold)) %>%
prettyData()
return(churn)
}
#' Calculate recruitment
#'
#' Calculate recruitment using a variable window size
#'
#' @param df A data frame
#' @param groupVars A character vector of variable names to group by
#' @param birthThreshold A numeric value indicating how many years
#' without purchase before an individual is considered a recruit
#'
#' @return A dataframe with columns for grouping variables and columns
#' for totalCustomers, retainedCustomers, recruitedCustomers, and recruitementRate
#'
#' @examples
#' # Demo data: Recruitment for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcRecruitment(c("itemType"))
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Recruitment for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017))
#' ) %>%
#' calcRecruitment(c("itemType"))
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
calcRecruitment <- function(df, groupVars = NULL, birthThreshold = 5) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check groupVars class
if (!(is.null(groupVars) | class(groupVars) == "character")) {
stop("Input 'groupVars' should be a character vector")
}
# Check for required columns
reqVars <- c("customerUID", "itemYear")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Check for grouping variable columns
if (any(!groupVars %in% colnames(df))) {
stop(paste(
"Grouping variables not present in data frame:",
paste(groupVars[!groupVars %in% colnames(df)], collapse = ", ")
))
}
# Check invalid grouping variables
if ("itemYear" %in% groupVars) {
stop("Permit year is not a valid grouping variable")
}
# Check range of data
years <- df %>%
distinct(itemYear) %>%
pull(itemYear)
if ((range(years)[2] - range(years)[1] + 1) <=
birthThreshold) {
stop(paste0(
"Window size is ", birthThreshold, ", but range of data",
" is only ", range(years)[2] - range(years)[1] + 1, " years"
))
}
# SPECIAL CASE: age ang ageGroup are not used in calculating customer status
groupVarsBefore <- groupVars[!groupVars %in% c("age", "ageGroup")]
groupVarsAfter <- groupVars[groupVars %in% c("age", "ageGroup")]
# Calculate years since last purchase
lastPurchase <-
df %>%
distinct(!!!(syms(c("customerUID", "itemYear", groupVarsBefore)))) %>%
group_by(!!!(syms(c("customerUID", groupVarsBefore)))) %>%
mutate(
lastPurchase = lag(itemYear, order_by = itemYear),
gap = itemYear - lastPurchase - 1
) %>%
ungroup()
# Define customer status
customerStatus <-
lastPurchase %>%
mutate(recruitStatus = if_else(is.na(gap) | gap >= birthThreshold,
"recruitedCustomers",
"retainedCustomers"
))
# Customer covariates (join age and ageGroup if needed)
customerCovar <- df %>%
distinct(!!!syms(c("customerUID", "itemYear", groupVarsAfter)))
# Calculate recruitment
recruitment <-
customerStatus %>%
left_join(customerCovar,
by = c("customerUID", "itemYear")
) %>%
group_by(!!!(syms(c("itemYear", groupVars, "recruitStatus")))) %>%
summarize(N = n_distinct(customerUID)) %>%
collect(n = Inf) %>%
ungroup() %>%
spread(recruitStatus, N, fill = 0) %>%
arrange(!!!(syms(c(groupVars, "itemYear")))) %>%
group_by(!!!(syms(groupVars))) %>%
mutate(
totalCustomers = recruitedCustomers + retainedCustomers,
recruitmentRate = recruitedCustomers / lag(totalCustomers)
) %>%
ungroup() %>%
filter(itemYear >= (min(itemYear) + birthThreshold)) %>%
prettyData()
return(recruitment)
}
#' Calculate participation rate
#'
#' Calculate participation rate by group. Participation rate is the number of
#' customers divided by the population size.
#'
#' @param df A data frame
#' @param pop A dataframe of population by county, year, gender, and age group.
#' @param groupVars A character vector of variable names to group by
#'
#' @examples
#' # Get list of valid counties (used to filter out data entry errors)
#' validCounties <- countyMaps %>%
#' dplyr::filter(region == "nebraska") %>%
#' dplyr::distinct(county) %>%
#' dplyr::pull(county)
#'
#' # Demo data: Participation rate for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017), state = "NE")
#' ) %>%
#' calcParticipation(c("itemType", "county"),
#' pop = statePop %>% filter(state == "NE")
#' ) %>%
#' dplyr::filter(county %in% validCounties)
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # Get list of valid counties (used to filter out data entry errors)
#' validCounties <- countyMaps %>%
#' dplyr::filter(region == "nebraska") %>%
#' dplyr::distinct(county) %>%
#' dplyr::pull(county)
#'
#' # SQL Backend: Participation rate for customers purchasing a fishing
#' # license between 2010 and 2017
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(itemType = "Fish", itemYear = c(2010, 2017), state = "NE")
#' ) %>%
#' calcParticipation(c("itemType", "county"),
#' pop = statePop %>% filter(state == "NE")
#' ) %>%
#' dplyr::filter(county %in% validCounties)
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
calcParticipation <- function(df, groupVars, pop) {
# Count participants
totalCount <- countCustomers(df, groupVars)
# Define population grouping variables
popGroupVars <- NULL
if ("gender" %in% groupVars) {
popGroupVars <- c(popGroupVars, "gender")
}
if ("county" %in% groupVars) {
popGroupVars <- c(popGroupVars, "county")
}
if ("state" %in% groupVars) {
popGroupVars <- c(popGroupVars, "state")
}
if ("ageGroup" %in% groupVars) {
popGroupVars <- c(popGroupVars, "ageGroup")
}
# Summarize population data
popCount <- pop %>%
mutate(gender = as.factor(gender)) %>%
group_by(!!!(syms(popGroupVars))) %>%
summarize(pop = sum(population)) %>%
ungroup()
# Calculate participation rate
participation <-
totalCount %>%
mutate(key = 1) %>%
conditional_mutate(
exists("county", where = .),
LHS = county,
RHS = as.character(county)
) %>%
conditional_mutate(
exists("state", where = .),
LHS = state,
RHS = as.character(state)
) %>%
left_join(
popCount %>%
mutate(key = 1) %>%
conditional_mutate(
exists("county", where = .),
LHS = county,
RHS = as.character(county)
)%>%
conditional_mutate(
exists("state", where = .),
LHS = state,
RHS = as.character(state)
),
by = c("key", popGroupVars)
) %>%
select(-key) %>%
group_by(!!!(syms(groupVars))) %>%
mutate(participationRate = customers / pop) %>%
ungroup() %>%
prettyData()
# Return summarized data
return(participation)
}
#' Count customers by permit groups
#'
#' Count customers who bought each combination of permits and stamps
#'
#' @param df A data frame
#'
#' @examples
#' # Demo data: Count number of customers by combination of items purchased
#' filterData(
#' dataSource = "csv",
#' activeFilters = list(
#' itemType = c("Fish", "Deer", "Spring Turkey"),
#' itemYear = c(2018, 2018)
#' )
#' ) %>%
#' itemGroupCount()
#' \dontrun{
#' # Database connection. Suggest using keyring package to avoid hardcoding
#' # passwords
#' myConn <- DBI::dbConnect(odbc::odbc(),
#' dsn = "HuntFishApp", # Your datasource name
#' uid = keyring::key_get("HuntFishAppUID"), # Your username
#' pwd = keyring::key_get("HuntFishAppPWD")
#' ) # Your password
#'
#' # SQL Backend: Count number of customers by combination of items purchased
#' filterData(
#' dataSource = "sql",
#' conn = myConn,
#' activeFilters = list(
#' itemType = c("Fish", "Deer", "Spring Turkey"),
#' itemYear = c(2018, 2018)
#' )
#' ) %>%
#' itemGroupCount()
#' }
#'
#' @family analysis functions
#'
#' @import dplyr
#' @importFrom magrittr %>%
#'
#' @export
itemGroupCount <- function(df) {
# Check df class
if (!("tbl" %in% class(df) | "data.frame" %in% class(df))) {
stop(paste("Input 'df' should be a data frame"))
}
# Check for required columns
reqVars <- c("customerUID", "itemType")
if (any(!reqVars %in% colnames(df))) {
stop(paste(
"Required variables not present in data frame:",
paste(reqVars[!reqVars %in% colnames(df)], collapse = ", ")
))
}
# Get list of permit types
permitTypeLabels <- df %>%
distinct(itemType) %>%
pull(itemType)
# Drop duplicate rows and group data
wideDf <- df %>%
distinct(!!!(syms(c("customerUID", "itemType")))) %>%
group_by(!!!(syms(c("customerUID"))))
# Create indicator variables
for (i in 1:length(permitTypeLabels)) {
wideDf <- wideDf %>%
mutate(!!sym(permitTypeLabels[i]) :=
max(if_else(itemType == !!permitTypeLabels[i], 1, 0), na.rm = T))
}
# Summarize data
wideData <-
wideDf %>%
ungroup() %>%
select(-itemType) %>%
distinct() %>%
countCustomers(permitTypeLabels) %>%
arrange(desc(customers)) %>%
mutate(groupID = row_number()) %>%
select(groupID, everything())
# Convert data to long form
longData <- wideData %>%
gather(itemType, purchase, !!!syms(permitTypeLabels)) %>%
arrange(groupID) %>%
group_by(groupID) %>%
mutate(degree = sum(purchase)) %>%
ungroup()
return(longData)
}
#' Count shared customers
#'
#' @export
countSharedCustomers <- function(comboCount) {
# Spread count to wide form
comboCountWide <-
comboCount %>%
spread(itemType, purchase)
# Create matrix of shared customers between groups
itemTypes <- unique(comboCount[["itemType"]])
nTypes <- length(itemTypes)
edges <- matrix(
data = 0,
nrow = nTypes,
ncol = nTypes
)
rownames(edges) <- itemTypes
colnames(edges) <- itemTypes
for (i in c(1:nTypes)) {
for (j in c(1:nTypes)) {
edges[i, j] <-
comboCountWide %>%
filter(
UQ(sym(itemTypes[i])) == 1,
UQ(sym(itemTypes[j])) == 1
) %>%
summarize(customers = sum(customers)) %>%
pull(customers)
}
}
# Convert matrix to tibble and calculate proportions
sharedCustomers <- as_tibble(edges) %>%
mutate(type1 = row.names(edges)) %>%
gather(type2, customers, -type1) %>%
arrange(type1) %>%
group_by(type1) %>%
mutate(total1 = customers[type1 == type2]) %>%
group_by(type2) %>%
mutate(total2 = customers[type1 == type2]) %>%
ungroup() %>%
mutate(
prop1 = customers / total1,
prop2 = customers / total2,
prop = if_else(type1 != type2, customers / (total1 + total2), 1),
type1 = factor(type1),
type2 = factor(type2, levels = levels(type1))
)
}
#' Count customers by number of other item types
#'
#' @export
countItemTypes <- function(comboCount) {
# Spread count to wide form
comboCountWide <-
comboCount %>%
spread(itemType, purchase)
# Create matrix of customer count by number of other item types purchased
itemTypes <- unique(comboCount[["itemType"]])
nTypes <- length(itemTypes)
maxDegree <- 4
countByItemTypes <- matrix(
data = 0,
nrow = nTypes,
ncol = maxDegree
)
rownames(countByItemTypes) <- itemTypes
colnames(countByItemTypes) <- 1:maxDegree
for (i in c(1:nTypes)) {
for (j in c(1:maxDegree)) {
countByItemTypes[i, j] <-
comboCountWide %>%
filter(
UQ(sym(itemTypes[i])) == 1,
degree == j |
((j == maxDegree) & degree >= j)
) %>%
summarize(customers = sum(customers)) %>%
pull(customers)
}
}
# Convert matrix to tibble and calculate proportions
degreeCount <-
as_tibble(countByItemTypes) %>%
mutate(itemType = row.names(countByItemTypes)) %>%
gather(degree, customers, -itemType) %>%
arrange(itemType) %>%
group_by(itemType) %>%
mutate(total = sum(customers)) %>%
ungroup() %>%
mutate(prop = customers / total) %>%
mutate(
degreeLabel = if_else(degree == "4", "4+", degree),
degreeLabel = factor(degreeLabel),
degree = as.numeric(degree),
itemType = factor(itemType)
)
}
#' Calculate edge matrix for Radial Sets plot
#'
#' @export
radialSetsData <- function(sharedCustomers,
degreeCount,
linkThickness = "percent",
focusGroup = "none",
countScale = 1 / 1e3) {
# Types of items
itemTypes <- levels(degreeCount$itemType)
nTypes <- length(itemTypes)
# Maximum degree
maxDegree <- max(degreeCount[["degree"]])
# Convert degree count to matrix
degreeCountMat <-
degreeCount %>%
arrange(itemType) %>%
select(itemType, degree, customers) %>%
mutate(customers = customers * countScale) %>%
spread(degree, customers) %>%
select(-itemType) %>%
as.matrix()
rownames(degreeCountMat) <- itemTypes
colnames(degreeCountMat) <- 1:maxDegree
# Number of customers with each type
totalVec <- degreeCount %>%
arrange(itemType) %>%
distinct(itemType, total) %>%
mutate(total = total * countScale) %>%
pull(total)
names(totalVec) <- itemTypes
# Edges matrix
if (linkThickness != "percent") {
# Calculate edges
edges <-
sharedCustomers %>%
arrange(type1) %>%
select(type1, type2, customers) %>%
mutate(customers = customers) %>%
spread(type2, customers) %>%
select(-type1) %>%
as.matrix()
rownames(edges) <- itemTypes
if (focusGroup != "none") {
# Remove all links besides focus group
edges[-which(itemTypes == focusGroup), ] <- 0
}
# Scale edges by thousands
edges <- edges * countScale
} else if (linkThickness == "percent") {
if (focusGroup == "none") {
edges <-
sharedCustomers %>%
arrange(type1) %>%
select(type1, type2, prop) %>%
spread(type2, prop) %>%
select(-type1) %>%
as.matrix()
rownames(edges) <- itemTypes
} else {
edges <-
sharedCustomers %>%
arrange(type1) %>%
select(type1, type2, prop1) %>%
spread(type2, prop1) %>%
select(-type1) %>%
as.matrix()
rownames(edges) <- itemTypes
# Remove all links besides focus group
edges[-which(itemTypes == focusGroup), ] <- 0
}
# Scale edges to percent
edges <- edges * 100
}
# Remove self-links
diag(edges) <- 0
# Maximum edge width
maxWidth <- signif(max(edges), 1)
return(list(
edges = edges,
itemTypes = itemTypes,
nTypes = nTypes,
maxDegree = maxDegree,
degreeCountMat = degreeCountMat,
totalVec = totalVec,
maxWidth = maxWidth
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.