# Functions for processing the popchips dataframe in popchips.R
#' Function for getting useable names.
#'
#' Function for getting useable names.
#' @param x dataframe, to be given better names
#' @return x but with column names standardized
better_names <- function(x) {
# names(x) <- gsub('[[:punct:]]', '', names(x))
names(x) <- gsub("\\s+", " ", names(x))
return (x)
}
# Function to paste ignoring NAs
paste_noNA <- function(x, sep=", ") {
gsub(", ", sep, toString(x[!is.na(x) & x!="" & x!="NA"]))
}
# Function to combine two columns into a third, drop now redundant columns
column_combiner <- function(df, col1, col2, combo) {
df[[combo]] <- apply(df[ , c(col1, col2)], 1, paste_noNA, sep='')
df <- df[!str_detect(names(df), paste0('', col1, '|', col2))]
return(df)
}
# Improvement for x columns
# column_combiner <- function(df, col_vec, combo) {
# df[[combo]] <- apply(df[ , col_vec], 1, paste_noNA, sep='')
# df <- select(df, -col_vec)
# return(df)
# }
# gatherer <- function(x, new_col = "dsf", gathered_cols) {
# x %>%
# gather(key = "brand", new_col, gathered_cols, na.rm = TRUE) %>%
# select(-brand)
# }
# Function to convert to numeric
column_to_numeric <- function(df, columns) {
df[ , columns] <- as.numeric(as.character(unlist(df[ , columns])))
return(df)
}
# Function to replace NAs in a given column with a string
factor_na_drop <- function(df, column, replacement) {
df[[column]][is.na(df[[column]])] <- replacement
return(df)
}
# Function to replace variant string with definitive string
factor_cleaner <- function(df, column, bad_factor, replacement) {
df[[column]] <-str_replace_all(df[[column]], bad_factor, replacement)
return(df)
}
#' Function to reorder factor presentation.
#'
#' Function to reorder factor presentation.
#' @export
#' @param df dataframe, dataframe with factors being reordered.
#' @param column numeric (or string), identifies column containing factors to reorder.
#' @param column_order vector, identifies factors to 'push' to beginning of factor levels.
#' @return A dataframe like \code{df}, but with \code{column}'s factor levels reordered to put \code{column_order} at the beginning.
factor_orderer <- function(df, column, column_order) {
for (i in length(column_order):1) {
df[[column]] <- stats::relevel(df[[column]], column_order[i])
}
return(df)
}
#' Factor ordering for already-ordered factors; must specify all factors
#'
#' Factor ordering for already-ordered factors; must specify all factors
#' @export
#' @param df dataframe, dataframe with factors being reordered.
#' @param columns numeric (or string), identifies columns containing factors to reorder.
#' @param column_order vector, identifies factors in new order.
#' @return A dataframe like \code{df}, but with \code{column}'s factor levels reordered according to \code{column_order}.
fo <- function(df, columns, column_order) {
for (i in columns) {
df[[i]] <- factor(df[[i]], levels = column_order)
}
return(df)
}
# Function to reduce factor levels
factor_conflator <- function(df, column, factors) {
df[[column]] <- factor(df[[column]])
df[[column]] <- addNA(df[[column]])
levels(df[[column]]) <- factors
return(df)
}
# Function to reorder columns
column_orderer <- function(df, column_order) {
return(df[ , column_order])
}
# Functions for processing pepsi data
# Function for getting column names into a format that pivot_longer() can deal with
col_regularizer <- function(x, conditions, replacements,
products = NULL, reverse = FALSE) {
# Check whether replacements and conditions and of the same length
len <- length(conditions)
if (len != length(replacements)) {
stop("Conditions and replacements are of unequal length")
}
# If products are empty, then replacement strings replace columns names
# directly and without adding a suffix
if (missing(products)) {
for (i in 1:len) {
x[grep(conditions[i], x)] <- replacements[i]
}
# Otherwise, go through every product, and each condition for each product
# and replace column names accordingly.
# `reversed` parameter controls whether the condition is expected before
# the product name -- reversed == TRUE when product comes first.
} else {
for (p in products) {
for (i in 1:len) {
condition_string <- ifelse(reverse == FALSE,
paste0(conditions[i], ".*", p),
paste0(p, ".*", conditions[i]))
x[grep(tolower(condition_string), tolower(x))] <- paste0(replacements[i], "_", p)
}
}
}
return(x)
}
#' For swapping out one factor for another.
#'
#' For swapping out one factor for another (e.g. numerical code to actual sentences in response).
#' @export
#' @param df dataframe, dataframe with factors being swapped.
#' @param columns vector, identifies columns containing factors to swap.
#' @param original_factors vector, gives factors to swap.
#' @param new_factors vector, gives replacements for original_factors.
#' @param addNA logical or string, if not NULL, replaces NAs with string.
#' @return A dataframe like \code{df}, but with \code{original_factors} in \code{columns} replaced by \code{new_factors} (and NAs replaced with \code{addNA} if \code{addNA} used).
factor_swap <- function(df, columns, original_factors = NA, new_factors = NA,
addNA = NULL) {
for (c in columns) {
df[[c]] <- factor(df[[c]])
df[[c]] <- plyr::mapvalues(df[[c]], from = original_factors, to = new_factors,
warn_missing = FALSE)
if (!missing(addNA)) {
df[[c]] <- forcats::fct_explicit_na(df[[c]], addNA)
}
}
return(df)
}
# factor_swap <- function(df, columns, original_factors = NA, new_factors = NA,
# addNA = FALSE) {
# for (c in columns) {
# df[[c]] <- factor(df[[c]])
# df[[c]] <- plyr::mapvalues(df[[c]], from = original_factors, to = new_factors)
# if (!missing(addNA)) {
# df[[c]] <- addNA(df[[c]])
# df[[c]] <- plyr::mapvalues(df[[c]], from = NA, to = addNA)
# }
# }
# return(df)
# }
# Helper functions within the app (server.R)
#' For making bar plots with multiple traces.
#'
#' For making bar plots with multiple traces.
#' @export
#' @param t dataframe containing data for plot.
#' @param title_string string, title for plot.
#' @param col_vec vector containing colours for plot.
#' @param orient string, "h" or "v", whether the bars are horizontal or vertical.
#' @return A plotly bar plot.
bp <- function(t, title_string, col_vec, orient = 'h') {
# col_vec --> vector of colours for graph
# Determining how to access temporary dtaframe
t_cols <- 2:(length(t)-1)
plt <- t %>%
dplyr::mutate(primary_col = factor(primary_col, levels = factor(primary_col))) %>%
plotly::plot_ly(type = "bar") %>%
plotly::layout(
barmode = "stack", font = list(family = font_fam, color = font_color),
showlegend = FALSE, title = title_string, orientation = orient,
xaxis = list(title = ""), yaxis = list(title =""),
paper_bgcolor = box_color, plot_bgcolor = box_color)
for (i in t_cols) {
df_trace <- data.frame(x = t[[i]], y = t$primary_col, nm = names(t)[i])
plt <- plt %>%
plotly::add_trace(plt, data = df_trace, x = ~x, y = ~y, name = ~nm,
hoverlabel = list(namelength = -1),
hoverinfo = "text", text = ~paste0(nm, ": ", x),
marker = list(color = col_vec[i-1],
line = list(color = font_color, width = 1)))
}
return(plt)
}
#' For duplicating ggwordcloud plots.
#'
#' For duplicating ggwordcloud plots.
#' @seealso \code{\link{wc_plotter}}
#' @param x dataframe, contains data for word cloud.
#' @param font_size, font size for word cloud.
#' @return A ggplot2 plot.
wc <- function(x, font_size) {
ggplot2::ggplot(x, ggplot2::aes(size = n, family = font_fam, fontface = "bold")) +
ggwordcloud::geom_text_wordcloud_area(rm_outside = TRUE) +
ggplot2::scale_size_area(max_size = font_size) +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = box_color),
plot.background = ggplot2::element_rect(fill = box_color, color = box_color))
}
# For standard grouping by n
grouper <- function(x, group_col) {
quocol <- rlang::enquo(group_col)
x %>%
dplyr::group_by(!!quocol) %>%
dplyr::summarize(n = n()) %>%
dplyr::arrange(desc(n), !!quocol)
}
# Format pos/ neg setiment format for data tables
dt_sentiment_formatter <- function(x, sent = "sentiment") {
x %>%
DT::datatable() %>%
DT::formatRound(sent, 2) %>%
DT::formatStyle(
sent, color = DT::styleInterval(c(-0.001, 0.001),
c(neg_color, box_color, pos_color)))
}
#' Ensure demographics filters haven't filtered out all relevant data.
#'
#' Ensure demographics filters haven't filtered out all relevant data.
#' @export
#' @param x dataframe, previously filtered; hopefully not filtered to 0 rows.
val_dat <- function(x) {
shiny::validate(shiny::need(nrow(x) > 0,
'All reviews filtered out; please include elements from each filter.'))
}
val_demos <- function(x) {
shiny::validate(shiny::need(length(x) == 2,
"Please select another set of demographic variables (even if you don't intent to filter by them)."))
}
# Generic reversable gauge
reversable_gauge <- function(gauge_value, threshold,
label_text = '', symbol = '',
min_val = 0, max_val = 100, reversed = FALSE,
colours = c("success", "warning", "danger")) {
print(threshold)
if (length(threshold) == 1) {
if (reversed == FALSE) {
hi <- c(threshold+0.05, max_val)
lo <- c(min_val, threshold-0.05)
} else {
lo <- c(threshold+0.05, max_val)
hi <- c(min_val, threshold-0.05)
}
sec <- flexdashboard::gaugeSectors(success = hi,
warning = c(threshold-0.05, threshold+0.05),
danger = lo,
colors = colours)
} else {
if (reversed == FALSE) {
hi <- c(threshold[2], max_val)
lo <- c(min_val, threshold[1])
} else {
lo <- c(threshold[2], max_val)
hi <- c(min_val, threshold[1])
}
sec <- flexdashboard::gaugeSectors(success = hi,
warning = c(threshold[1], threshold[2]),
danger = lo,
colors = colours)
}
gauge <- flexdashboard::gauge(
gauge_value, min = min_val, max = max_val, symbol = symbol,
label = label_text, sectors = sec
)
return(gauge)
}
#' For getting average by user.
#'
#' Some users give more than one rating.
#' This gloms all ratings by a given user (determined by Email)
#' together and then gives the average fraction of users who have
#' rated positively in at least one rating.
#' @export
#' @param df dataframe, data of interest
#' @param pcol numeric, the number of df's column containing the question of interest.
#' @param per_options vector, all the strings in df[[pcol]] that are counted when the percentage is calculated.
#' @param ignore_options string (or vector of strings), the factors to ignore during calculations.
# #' @param neg_options vector, all the strings in df[[pcol]] that are considered 'bad' responses.
#' @param dropNA logical, if TRUE, applies tidyr::drop_na(df, pcol).
#' @return A numeric value, the fraction of users who have given positive responses.
average_weighted_by_user <- function(df, pcol,
per_options = "Yes",
ignore_options = "Decline to state",
dropNA = TRUE) {
if (dropNA == TRUE) {
value <- tidyr::drop_na(df, pcol)
} else {value <- df}
value <- value %>% dplyr::rename(primary_col = pcol)
for (i in ignore_options) {
value <- value %>% dplyr::filter(primary_col != i)
}
if (class(df[[pcol]]) == "numeric") {
value <- value %>%
dplyr::group_by(Email) %>% # At one point, grouped by Product too; cut
dplyr::summarize(avg = mean(primary_col, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::summarize(avg = mean(avg, na.rm = TRUE)) %>%
round(2)
value <- value[[1]]
} else {
value <- value %>%
cc() %>%
dplyr::filter(primary_col %in% per_options) %>%
dplyr::select(`per`) %>%
round(2)
value <- value[[1]]
}
return(value)
}
#' For getting the difference between rating and benchmark.
#'
#' For getting the difference between rating and benchmark.
#' @param x dataframe
#' @param target_column numeric (or string), identifying column of interest.
#' @param warning_range numeric, benchmark to compare rating to; if larger than rating, negative result
#' @return A numeric value, positive if rating is greater than benchmark.
rating_diff_value <- function(x, target_column, warning_range) {
val <- average_weighted_by_user(x, target_column) - warning_range
return(val)
}
#' For creating the valueBox() that compares rating to benchmark.
#'
#' For creating the valueBox() that compares rating to benchmark.
#' @export
#' @param x numeric, value to show off in the box.
#' @param reversed logical, determines whether positive or negative numbers are considered 'good' / green.
#' @return A shinydashboard::box() object.
rating_diff_box <- function(x, label = "Rating vs. benchmark", reversed = FALSE,
symbol = '') {
y <- ifelse(reversed == TRUE, -x, x)
if (is.na(x)) {
val <- "None of this flavor."
} else { val <- paste0(round(x, 2), symbol)}
shinydashboard::valueBox(
val, label,
icon = icon_input(x), width = NULL,
color = color_input(y))
}
#' For swapping between up-arrows and down-arrows.
#'
#' For swapping between up-arrows and down-arrows.
#' @export
#' @param x numeric, tested against 0
#' @return An appropriate icon() arrow.
icon_input <- function(x) {
if (length(x) == 1) {
greater <- ifelse(x > 0, TRUE, FALSE)
if (isTRUE(greater)) {
return(shiny::icon("arrow-circle-up"))
} else {
return(shiny::icon("arrow-circle-down"))
}
} else if (length(x) == 2) {
lesser1 <- ifelse(x[1] < 0, TRUE, FALSE)
greater2 <- ifelse(x[2] > 0, TRUE, FALSE)
if (isTRUE(greater2)) {
return(shiny::icon("arrow-circle-up"))
} else if (isTRUE(lesser1)) {
return(shiny::icon("arrow-circle-down"))
} else { # In between or error:
return(shiny::icon("asterisk"))
}
}
}
#' For swapping between green and red colors.
#'
#' For swapping between green and red colors.
#' @export
#' @param x numeric, tested against 0
#' @return An appropriate colour string
color_input <- function(x) {
# greatereq <- ifelse(x >= 0, TRUE, FALSE)
# if (is.na(greatereq)) {return("aqua")}
# if (isTRUE(greatereq == TRUE)) {
# return("green")
# } else {
# return("red")
# }
if (length(x) == 1) {
greater <- ifelse(x > 0, TRUE, FALSE)
if (isTRUE(greater)) {
return("green")
} else {
return("red")
}
} else if (length(x) == 2) {
lesser1 <- ifelse(x[1] < 0, TRUE, FALSE)
greater2 <- ifelse(x[2] > 0, TRUE, FALSE)
if (isTRUE(greater2)) {
return("green")
} else if (isTRUE(lesser1 == TRUE)) {
return("red")
} else { # In between or error:
return("yellow")
}
}
}
#' Helper for overview plotters. Generates breaks.
#'
#' Helper for overview plotters. Generates breaks.
#' @inheritParams factor_overview_plotter
break_level_maker <- function(break_values) {
break_levels <- c()
for (i in 1:(length(break_values)-1)) {
break_levels <- c(break_levels,
paste0("(", break_values[i], ",", break_values[i+1], "]"))
}
return(break_levels)
}
#' For making faceted plots with numeric data (eg. 1 through 5).
#'
#' For making faceted plots with numeric data (eg. 1 through 5).
#' @export
#' @seealso \code{\link{factor_overview_plotter}}
#' @inheritParams factor_overview_plotter
num_overview_plotter <- function(x, title_string, warning,
demo_one, demo_two,
break_values = c(0, 10, 30, 1000),
reversed_values = FALSE) { # reversed values not implemented
break_levels <- break_level_maker(break_values)
y <- x %>%
dplyr::group_by(!!sym(demo_one), !!sym(demo_two), Product) %>%
dplyr::summarize(Reviews = cut(n(), breaks = break_values, labels = break_levels),
Benchmark = ifelse(
(mean(target_column, na.rm = TRUE) - warning >= 0),
TRUE, FALSE)) %>%
dplyr::mutate(Reviews = factor(Reviews, levels = break_levels)) %>%
dplyr::right_join(x %>% dplyr::select(!!rlang::sym(demo_one), !!rlang::sym(demo_two), Product, target_column)) %>%
fo("Benchmark", c(TRUE, FALSE))
p <- y %>%
ggplot2::ggplot(ggplot2::aes(Product, target_column, fill = Benchmark,
color = Benchmark, alpha = Reviews)) +
ggplot2::geom_bar(stat = "summary", fun.y = "mean")
return(add_overview_plot_default_options(p, title_string, demo_one, demo_two))
}
#' For making faceted plots with factor data (eg. "Yes" / "No").
#'
#' For making faceted plots with factor data (eg. "Yes" / "No").
#' @seealso overview
#' @param x dataframe, column of interest named \'target_column\'.
#' @param title_string string, title of plot.
#' @param warning numeric, cutoff between \'positive\' and \'negative\' values.
#' @param demo_one string, (numeric probably works too?) identifies 1st demographic column in x to filter by.
#' @param demo_two string, (numeric probably works too?) identifies 2nd demographic column in x to filter by.
#' @param foptions vector, strings identifying which factors/strings in target_column count as \'positive\'.
#' @param break_values vectors, numeric values sample sizes into appropriate categories. Ideally statistically relevant.
#' @param reversed_values logical, indicates whether to flip values (e.g. y \%>\% mutate(Percent = 100 - Percent) ).
#' @return A ggplo2 plot.
factor_overview_plotter <- function(x, title_string, warning,
demo_one, demo_two,
foptions = aware_options,
break_values = c(0, 10, 30, 1000),
reversed_values = FALSE) {
# Email used to identify unique reviewers
break_levels <- break_level_maker(break_values)
y <- x %>%
dplyr::mutate(terms = ifelse(target_column %in% foptions, 1, 0)) %>% # Count as aware if any answer was yes
dplyr::group_by(Product, Email) %>%
dplyr::summarize(tally = sum(terms)) %>%
dplyr::mutate(target_column = ifelse(tally > 0, "Yes", "No")) %>% # Count as aware if any answer was yes
dplyr::left_join(x %>% select(!!rlang::sym(demo_one), !!rlang::sym(demo_two), Email),
by = "Email") %>%
dplyr::group_by(!!rlang::sym(demo_one), !!rlang::sym(demo_two), Product) %>%
dplyr::summarize(Reviews = cut(n(), breaks = break_values, labels = break_levels),
Benchmark = ifelse(
(sum(target_column == "Yes")/n() * 100 - warning >= 0),
TRUE, FALSE),
n = n(), Percent = sum(target_column == "Yes")/n()*100)
if (isTRUE(reversed_values)) {
y <- y %>% mutate(Percent = 100 - Percent) %>% mutate(Benchmark = !Benchmark)
}
y <- y %>%
dplyr::mutate(Reviews = factor(Reviews, levels = break_levels)) %>%
fo("Benchmark", c(TRUE, FALSE))
p <- y %>%
ggplot2::ggplot(ggplot2::aes(stringr::str_wrap(Product, 8), Percent, fill = Benchmark,
color = Benchmark, alpha = Reviews)) +
ggplot2::geom_bar(stat = "identity")
return(add_overview_plot_default_options(p, title_string, demo_one, demo_two))
}
#' factor_overview_plotter with default foptions set to "Yes" rather than aware_options.
#'
#' factor_overview_plotter with default foptions set to "Yes" rather than aware_options.
#' @export
#' @inheritParams factor_overview_plotter
#' @return A ggplo2 plot.
yes_factor_overview_plotter <- function(x, title_string, warning,
demo_one, demo_two,
foptions = c("Yes"), # Only change from f_o_p
break_values = c(0, 10, 30, 1000),
reversed_values = FALSE) {
return(factor_overview_plotter(x, title_string, warning, demo_one, demo_two,
foptions, break_values, reversed_values))
}
# Setting default options for various overview plots
add_overview_plot_default_options <- function(plot, title_string, d1, d2,
title_size = 22) {
r <- plot +
ggplot2::scale_x_discrete(labels=product_labels) +
ggplot2::facet_grid(dplyr::vars(!!rlang::sym(d1)), dplyr::vars(!!rlang::sym(d2))) +
ggplot2::scale_fill_manual(values=c(pos_color, neg_color)) +
ggplot2::scale_color_manual(values=c(pos_color, neg_color), guide = FALSE) +
ggplot2::scale_alpha_discrete(range = c(0.2, 1), drop = FALSE) +
ggtitle(title_string) +
guides(fill=guide_legend(title="Over Benchmark")) +
theme(plot.title = element_text(size=title_size),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.text.x = element_text(angle = 30),
legend.position="right")
return(r)
}
# add_overview_plot_default_options <- function(plot, title_string, d1, d2) {
# quod1 <- rlang::enquo(d1)
# quod2 <- rlang::enquo(d2)
# r <- plot +
# facet_grid(Age ~ Gender
# # !!quod1 ~ !!quod2
# # , labeller=labeller(`Ethnic Origin` = eo_labels)
# ) +
# scale_fill_manual(values=c(pos_color, neg_color)) +
# scale_color_manual(values=c(pos_color, neg_color), guide = FALSE) +
# scale_alpha_discrete(range = c(0.2, 1)) +
# ggtitle(title_string) +
# guides(fill=guide_legend(title="Over Benchmark")) +
# theme(axis.title.x=element_blank(),
# axis.title.y=element_blank(),
# axis.text.x = element_text(angle = 75),
# legend.position="right")
# return(r)
# }
# Helpers for generalizing stacked bar graphs
#' Getting percentages of total values for each row, column.
#'
#' Getting percentages of total values for each row, column. Also renaming original columns for easy access.
#' @param df dataframe
#' @param cols vector labelling columns of interest
#' @param total numeric, total number of responses.
percents <- function(df, cols, total) {
t <- df
for (i in 1:length(cols)) {
per_ <- paste0("per_", i)
s_ <- paste0("s", i)
exp <- rlang::sym(cols[i])
t <- t %>%
dplyr::mutate(!!per_ := (!!exp) / total * 100) %>%
dplyr::rename(!!s_:=(!!exp))
}
return(t)
}
#' Plotly bar chart for intents by brand awareness.
#'
#' Plotly bar chart for intents by brand awareness. Uses \code{\link{percents}}.
#' Future improvement: aggregate intents by user; get user intents rather than response intents.
#' @param dt dataframe, its primary column of interest must already be renamed primary_col and its secondary column of interest -- the column by which the bars will be stacked -- sec_col.
#' @param title_string string, for plot title.
#' @param legend_string string, for plot legend title.
#' @param col_vec vector of strings labelling colours.
#' @param dropSecNA logical, whether to drop NA/ "Decline to state" factors from sec_col.
#' @param legend_position list with two named numerics (x and y), position of legend on plot.
#' @return A plotly bar plot.
intents_by_awareness_bar <- function(dt, title_string, legend_string, col_vec,
dropSecNA = TRUE,
legend_position = list(x = 0.05, y = 0.95)) {
# Drop rows with NA values, "Decline to state" in sec_col
if (isTRUE(dropSecNA)) {
dt <- dt %>%
tidyr::drop_na(sec_col) %>%
dplyr::filter(sec_col != "Decline to state")
}
# Getting total number of responses
total <- dt %>% dplyr::tally() ; total <- total[[1]]
# Getting temporary dataframe with responses shown by primary_col and sec_col
t <- dt %>%
group_by_primary_sec() %>%
select(-total)
# # Reordering
# %>% select(primary_col, contains("Yes,"), everything())
# Getting the actual response phrases (factors of sec_col)
sec_opts <- t %>%
dplyr::select(-primary_col) %>%
names()
# Adding percentages
t <- percents(t, sec_opts, total)
# Plotting graph
plot <- t %>%
# Starting with empty bar graph
plotly::plot_ly(type = 'bar') %>%
# Adding layout
plotly::layout(
title = title_string,
legend = legend_position,
annotations = list(yref='paper', xref="paper",
y=legend_position[["y"]]+0.04,
x=legend_position[["x"]]-0.02,
font = list(size = 13),
text = legend_string, showarrow = FALSE),
xaxis = list(title=""), yaxis = list(title="Count"),
barmode = "stack",
hovermode = "compare",
font = list(family = font_fam, color = font_color),
paper_bgcolor = box_color,
plot_bgcolor = box_color)
# Determining how to access temporary dtaframe
t_cols <- 2:length(t); or_cols <- t_cols[1:(length(t_cols)/2)]
# Looping through temporary dataframe to add 'traces' to the empty bar graph
for (i in or_cols) {
df_trace <- data.frame(y=t[[i]], p=t[[i+length(or_cols)]],
x=t$primary_col)
plot <- plotly::add_trace(
plot, data = df_trace, x = ~x, hoverinfo = "name+text", y = ~y, #type = 'bar',
name = sec_opts[i-1], marker = list(color= col_vec[i-1]),
text = ~paste0('Count: ', y, '\nPercent: ', round(p, 2), '%'))
}
return(plot)
}
#' Plotly pie chart, simple, but nicely formatted.
#'
#' Plotly pie chart, simple, but nicely formatted.
#' @param x dataframe
#' @param title_string string, title of plot
#' @param color_vec vector, colours for plot
#' @param show_legend logical, whether to show legend in plot
#' @param dropNA logical, whether to drop null values before making graph.
#' @return Plotly pie chart.
spc <- function(x, title_string, color_vec, show_legend = FALSE, dropNA = TRUE) {
### Column of interest to be renamed primary_col beforehand
# scnt <- sc(x)
scnt <- cc(x) # Use this to be more accurate.
if (isTRUE(dropNA)) {scnt <- tidyr::drop_na(scnt)}
scnt %>%
plotly::plot_ly(
labels = ~primary_col, values = ~n, type = 'pie', rotation = 0,
direction = "clockwise", sort = FALSE, textposition = 'inside',
textinfo = 'label+percent+value',
insidetextfont = list(family = font_fam, color = font_color),
hoverinfo = 'label+text',
text = ~paste('Count:', n),
marker = list(colors = color_vec,
line = list(color = font_color, width = 1)),
showlegend = show_legend) %>%
plotly::layout(
title = title_string,
font = list(family = font_fam, color = font_color),
hovermode = "compare", #position = "top centre",
paper_bgcolor = box_color, plot_bgcolor = box_color)
}
# Simple count-retrieval
sc <- function(x) {
### column of interest must be named primary_col beforehand
x %>%
dplyr::group_by(primary_col) %>%
dplyr::count() %>%
dplyr::ungroup() %>%
dplyr::mutate(per = n/sum(n) * 100)
}
#' Complicated count-retrieval.
#'
#' Complicated count-retrieval. Counts primary_col responses; if a user has give more than one response, counts them as "Mixed".
#' @param x dataframe, column of interest named primary_col.
#' @return A dataframe with all distinct variants of primary_col (and maybe "Mixed" too) and the number and % of responses of that variant.
cc <- function(x) {
### column of interest must be named primary_col beforehand
p_levels <- c(x[["primary_col"]] %>% factor() %>% levels(), "Mixed")
dist <- x %>%
dplyr::group_by(Email) %>%
dplyr::summarize(tally = dplyr::n_distinct(primary_col)) %>% # Get uniques per person
ungroup()
x <- x %>%
dplyr::left_join(dist, by = c("Email" = "Email")) %>% # Join them only main df
dplyr::mutate(primary_col = ifelse(tally > 1, "Mixed", as.character(primary_col))) %>% # Replace varying responses with "Mixed".
dplyr::group_by(primary_col, Email) %>% # Eliminate duplicate counts from individual users
dplyr::count(primary_col) %>%
dplyr::ungroup() %>%
dplyr::group_by(primary_col) %>%
dplyr::count() %>% # Count how many are left afterwards
dplyr::ungroup() %>%
dplyr::mutate(per = n/sum(n) * 100) %>%
dplyr::mutate(primary_col = factor(primary_col, levels = p_levels))
}
# For summarizing conflicting evaluations
# Helper function to evaluate conflicting term hierarchies
conflicting_term_hierarchies <- function(conflicting_terms, ordered_hierarchy) {
# ordered_hierarchy is the order if which the conflicting_terms overrule each other
# Ie. If 1 shows up, that's what the user is classified as;
# 2 overrules anything but but 1, 3 anything but 1 and 2, etc.
if (length(conflicting_terms) != length(ordered_hierarchy)) {stop("Error; (length(conflicting_terms) != length(ordered_hierarchy)")}
ordered_terms <- conflicting_terms[ordered_hierarchy]
}
#' For getting sums by row.
#'
#' For getting sums by row.
#' @export
#' @param df dataframe
#' @param cols vector of strings, column names.
#' @param sify logical, whether to rename col into s_xs.
#' @return A dataframe...
row_percents <- function(df, cols, sify = TRUE) {
for (i in 1:length(cols)) {
per_ <- paste0("per_", i)
s_ <- paste0("s", i)
exp <- rlang::sym(cols[i])
df <- df %>%
dplyr::mutate(!!per_ := (!!exp) / sec_tally * 100)
if (isTRUE(sify)) {
df <- df %>%
dplyr::rename(!!s_:=(!!exp))
}
}
return(df)
}
#' Pick an evaluation when users have >1 conflicting answers.
#'
#' Pick an evaluation when users have >1 conflicting answers (eg. users evaluating multiple products who have said yes and no when asked whether they've used it before).
#' @param x dataframe
#' @param primary_col numeric, column of interest.
#' @param conflicting_terms vector, strings possible answers from column of interest.
#' @param ordered_hierarchy vector, numeric order in which to accept conflicting_terms.
#' @param sec_col string, (or numeric) denoting column to divide into separate rows/bars by.
#' @param dropNA logical, whether to drop NA values from output.
#' @return A dataframe tallying the frequency of responses by product.
conflicting_evaluations <- function(x, primary_col, conflicting_terms,
ordered_hierarchy, sec_col = "Product",
dropNA = TRUE) {
### terms: column with conflicting evaluations
### Email: column used to identify individuals with conflicting evaluations for a given Flavor
ordered_terms <- conflicting_term_hierarchies(conflicting_terms, ordered_hierarchy)
x <- x %>%
dplyr::rename(terms = primary_col) %>%
dplyr::rename(sec = sec_col) %>%
dplyr::group_by(terms, sec, Email) %>%
dplyr::count()
if (isTRUE(dropNA)) {x <- tidyr::drop_na(x)}
x <- x %>%
dplyr::ungroup() %>%
tidyr::spread(terms, n, fill = 0) %>%
dplyr::mutate(terms = NA)
for (i in (length(x)-1):3) {
x <- x %>% dplyr::mutate(terms = ifelse(x[[i]] > 0, names(x)[i], terms))
}
x <- x %>%
dplyr::group_by(terms, sec) %>%
dplyr::count() %>%
dplyr::ungroup() %>%
tidyr::spread(terms, n, fill = 0) %>%
dplyr::mutate(sec_tally = rowSums(.[2:length(.)])) %>%
dplyr::select(sec, c(ordered_terms), everything()) %>%
row_percents(ordered_terms) %>%
dplyr::select(sec, matches("s[0-9]"), contains("per"), everything()) %>%
# dplyr::mutate(sec = factor(sec, levels = rev(factor(sec))))
dplyr::arrange(rev(sec))
return(x)
}
#' For plotting awareness by product.
#'
#' For plotting awareness by product.
#' @export
#' @seealso \code{\link{conflicting_evaluations}}
#' @param adf dataframe, ideally the output of conflcting_evaluations().
#' @param title_string string, title of plot.
#' @param ordered_terms vector, ordered factors for column of interest.
#' @param color_vec vector, strings denoting colours for all bar colour options (eg. "Yes", "No", "Brand but not product", etc.).
#' @param pc_end logical, if TRUE, tacks on total, sum(per_x) to end of bars; otherwise sec_tally.
#' @return A plotly bar plot with parallel horizontal bars.
horizontal_bars_plot <- function(adf, title_string, ordered_terms, color_vec,
pc_end = FALSE) {
plot <- plotly::plot_ly(
adf, type = 'bar', orientation = 'h', hoverinfo = "text",
marker = list(line = list(color = font_color, width = 1))) %>%
plotly::layout(
title = title_string, barmode = "stack", hovermode = "y",
yaxis = list(title = ""),
xaxis = list(title = "", showgrid = FALSE, showline = FALSE,
showticklabels = FALSE, zeroline = FALSE),
font = list(family = font_fam, color = font_color),
# position = "top centre",
showlegend = FALSE,
paper_bgcolor = box_color, plot_bgcolor = box_color)
adf_cols <- 1:length(adf); or_cols <- adf_cols[2:(length(adf_cols)/2)]
rows <- adf %>% tally() %>% `[[`(1)
df_cum <- data.frame(x = rep(0, rows), p = rep(0, rows), y = rep(0, rows))
for (i in or_cols) {
df_trace <- data.frame(x=adf[[i]], p=adf[[i+length(or_cols)]], y=adf[[1]],
l = ordered_terms[i-1])
plot <- plotly::add_trace(
plot, data = df_trace,
x = ~p, hoverinfo = "text", #name+
y = ~y, #type = 'bar',
name = ordered_terms[i-1],
marker = list(color= color_vec[i-1]),
text = ~paste0(l, '\nCount: ', x, '\nPercent: ', round(p, 2), '%'),
hoverlabel = list(namelength = -1)) %>%
plotly::add_annotations(
xref = ordered_terms[i-1], yref = 'sec',
x = df_cum$p + (df_trace$p / 2), y = df_trace$y,
text = paste0(round(df_trace$p, 2), "%"),
font = list(family = font_fam, color = font_color), showarrow = FALSE)
df_cum$p = df_cum$p + df_trace$p
}
if (isTRUE(pc_end)) {
plot <- plot %>%
plotly::add_annotations(
x = df_cum$p + 5, y = df_trace$y,
text = paste0(adf$total, "\n/ ", round(df_cum$p), "%"),
font = list(family = font_fam, color = font_color), showarrow = FALSE)
} else {
plot <- plot %>%
plotly::add_annotations(
xref = ordered_terms[length(ordered_terms)], yref = 'sec',
x = 105, y = adf$sec, text = adf$sec_tally,
font = list(family = font_fam, color = font_color), showarrow = FALSE)
}
return(plot)
}
#' Finds the mode of a vector.
#'
#' Finds the mode of a vector.
#' @param x vector
#' @return The first-occurring modal value.
md <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
#' Groups data by a primary_col and a sec_col.
#'
#' Counts primary_col, sec_col responses; if a user has give more than one response, counts them as "Mixed".
#' @export
#' @seealso \code{\link{understanding_by_awareness}} uses group_by_primary_sec()'s output as its primary input.
#' @param x dataframe containing primary_col and sec_col.
#' @param arrange_by_total logical, whether to arrange rows by total number in each.
#' @param mode_by_user logical, whether to consolidate multiple responses from the same user -- if responses vary, they are replaced with "Mixed".
#' @return A dataframe with all distinct variants of primary_col and sec_col (and maybe "Mixed" for either or both too) and the number of responses of that combo.
group_by_primary_sec <- function(x, arrange_by_total = FALSE,
mode_by_user = TRUE) {
### primary column of interest must be named primary_col beforehand
### secondary column of interest must be named sec_col beforehand
if (isTRUE(mode_by_user)) { # Eliminate multiple answers per user.
pcol_levels <- c(x[["primary_col"]] %>% factor() %>% levels(), "Mixed")
scol_levels <- c(x[["sec_col"]] %>% factor() %>% levels(), "Mixed")
dist_p <- x %>%
dplyr::group_by(Email) %>%
dplyr::summarize(tally = dplyr::n_distinct(primary_col)) %>% # Get uniques per person
ungroup()
p <- x %>%
dplyr::left_join(dist_p, by = c("Email" = "Email")) %>% # Join them on main df
dplyr::mutate(p = ifelse(tally > 1, "Mixed", as.character(primary_col))) %>% # Replace varying responses with "Mixed".
dplyr::group_by(p, Email) %>% # Eliminate duplicate counts from individual users
dplyr::count(p) %>%
dplyr::ungroup()
dist_s <- x %>%
dplyr::group_by(Email) %>%
dplyr::summarize(tally = dplyr::n_distinct(sec_col)) %>% # Get uniques per person
ungroup()
s <- x %>%
dplyr::left_join(dist_s, by = c("Email" = "Email")) %>% # Join them on main df
dplyr::mutate(s = ifelse(tally > 1, "Mixed", as.character(sec_col))) %>% # Replace varying responses with "Mixed".
dplyr::group_by(s, Email) %>% # Eliminate duplicate counts from individual users
dplyr::count(s) %>%
dplyr::ungroup()
x <- x %>%
dplyr::left_join(p, by = "Email") %>%
dplyr::left_join(s, by = "Email") %>%
dplyr::mutate(primary_col = ifelse(is.na(primary_col), NA, as.character(p))) %>%
dplyr::mutate(sec_col = ifelse(is.na(sec_col), NA, as.character(s))) %>%
dplyr::mutate(primary_col = as.factor(p)) %>%
dplyr::mutate(sec_col = as.factor(s)) %>%
dplyr::mutate(primary_col = factor(primary_col, levels = pcol_levels)) %>%
dplyr::mutate(sec_col = factor(sec_col, levels = scol_levels)) %>%
dplyr::mutate(primary_col = forcats::fct_explicit_na(primary_col, "Decline to state")) %>%
dplyr::mutate(sec_col = forcats::fct_explicit_na(sec_col, "Decline to state")) %>%
dplyr::mutate(primary_col = forcats::fct_drop(primary_col)) %>%
dplyr::mutate(sec_col = forcats::fct_drop(sec_col)) %>%
dplyr::group_by(primary_col, sec_col, Email) %>%
dplyr::count() %>%
dplyr::ungroup()
}
x <- x %>%
dplyr::group_by(primary_col, sec_col) %>%
dplyr::count() %>%
dplyr::ungroup() %>%
tidyr::spread(sec_col, n, fill = 0, drop = FALSE) %>%
dplyr::mutate(total = rowSums(dplyr::select(., -primary_col)))
if (isTRUE(arrange_by_total)) {x <- x %>% dplyr::arrange(total)}
return(x)
}
#' This function plots the frequency of primary_col stacked by secondary_col.
#'
#' Titled according to its first use, this function plots the frequency of primary_col stacked by secondary_col.
#' @export
#' @seealso \code{\link{group_by_primary_sec}}
#' @param df dataframe, must include columns named primary_col and seconary_col; intended to be the output of \code{\link{group_by_primary_sec}}.
#' @param title_string string, titles plot.
#' @param legend_string string, titles legend.
#' @param color_vec vector, strings labelling each factor of secondary_col.
#' @param legend_position vector, two numeric elements, where the legend goes.
#' @param drop_na logical, whether to drop missing values.
#' @return A plotly vertical bar plot.
understanding_by_awareness <- function(df, title_string, legend_string,
color_vec,
legend_position = c(0.06, 0.94),
drop_na = TRUE) {
if (drop_na == TRUE) {
x <-df[stats::complete.cases(df), ]
x <- x[, colSums(x != 0) > 0]
} else {
x <- df
}
sec_opts <- x %>%
select(-c(primary_col, total)) %>%
names()
# Adding percentages
t <- percents(x, sec_opts, total) # temporary dataframe
# Determining how to access temporary dtaframe
t_cols <- 2:length(t); or_cols <- t_cols[1:(length(t_cols)/2)]
# Plotting graph
plot <- t %>%
# Starting with empty bar graph
plotly::plot_ly(type = 'bar') %>%
# Adding layout
plotly::layout(
title = title_string,
legend = list(x = legend_position[1], y = legend_position[2]),
annotations = list(yref='paper', xref="paper",
x=legend_position[1] - 0.05,
y=legend_position[2] + 0.05,
font = list(size = 13),
text = legend_string, showarrow = FALSE),
xaxis = list(title=""), yaxis = list(title="Count"),
barmode = "stack",
hovermode = "compare",
font = list(family = font_fam, color = font_color),
paper_bgcolor = box_color,
plot_bgcolor = box_color)
# Looping through temporary dataframe to add 'traces' to the empty bar graph
for (i in or_cols) {
df_trace <- data.frame(y=t[[i]], p=t[[i+length(or_cols)]],
x=t$primary_col)
plot <- plotly::add_trace(
plot, data = df_trace, x = ~x, hoverinfo = "name+text", y = ~y, #type = 'bar',
name = sec_opts[i-1], marker = list(color= color_vec[i-1]),
text = ~paste0(x, '\nCount: ', y, '\nPercent: ', round(p, 2), '%'))
}
return(plot)
}
# Processing for free-form quesitons
# # For picking out which terms were used in a free-form response by Flavor.
# # Must follow on with additional data-cleaning
# # CANNOT usefully group reviewers by product since 1 reviewer can review multiple products
# separate_terms_by_product <- function(x, identity_col, target_col) {
# # terms --> primary_col
# x %>%
# rename(primary_col = target_col) %>%
# mutate(primary_col = tolower(primary_col)) %>%
# filter(!is.na(primary_col)) %>%
# select(primary_col, Product) %>%
# separate_rows(primary_col, # Separate terms out by various markers
# sep = "(or )|(,)|( and)|( /)|(/ )|(\\.)|(!)|(:)|(;)", convert = FALSE) %>%
# mutate(primary_col = trimws(primary_col)) %>% # Omit trailing spaces
# filter(primary_col != '') # Omit blank terms
# }
#' Separates terms by gender.
#'
#' Separates terms by gender. Drops NAs, selects gender & target_col, then divides target_col up by various string markers (eg. \'or \', \',\', \' and\', etc.). Filters out non-sense pieces from broken up terms.
#' @export
#' @param x dataframe, contains columns Gender and a column identified by target_col.
#' @param target_col numeric (or string), identifies column of interest.
#' @return A dataframe with columns primary_col and Gender.
separate_terms_by_gender <- function(x, target_col) {
x %>%
rename(primary_col = target_col) %>%
mutate(primary_col = tolower(primary_col)) %>%
filter(!is.na(primary_col)) %>%
pre_cleaner(primary_col) %>%
select(primary_col, Gender) %>%
separate_rows(primary_col, # Separate terms out by various markers
sep = "(or )|(,)|( and)|( /)|(/ )|(\\.)|(!)|(:)|(;)", convert = FALSE) %>%
mutate(primary_col = trimws(primary_col)) %>% # Omit trailing spaces
filter(primary_col != '') %>% # Omit blank terms
filter(primary_col != 'n/a') %>%
filter(primary_col != '#n/a') %>%
filter(primary_col != '?')
}
single_plural_chooser <- function(x) {
# Choose single or plural depending on which is more common; ties --> no s
x %>%
dplyr::mutate(word_no_s = stringr::str_replace(primary_col, pattern = "(s$)", '')) %>% # Get s-less version # No idea why used to have |(') in pattern
dplyr::mutate(word_s = stringr::str_replace(primary_col, pattern = "(?<!s)$", "s")) %>% # Get version ending in s
dplyr::mutate(sless = ifelse(primary_col == word_no_s, TRUE, FALSE)) %>% # Get marker for true version
dplyr::add_count(word_no_s, name = 'n') %>% # Tally count ignoring ending-s status
dplyr::add_count(word_no_s, sless, name = 'which') %>% # Get count using ending-s status
dplyr::mutate(choose_sless = ifelse(n-which > n/2, !sless, sless)) %>% # See which ending is more common
dplyr::mutate(primary_col = ifelse(choose_sless, word_no_s, word_s)) #%>% # Select more common version
# select(-word_no_s, -word_s, -sless, -n, -which, -choose_sless)
}
#' Functions to clean word variants for particular kinds of questions.
#'
#' Functions to clean word variants for particular kinds of questions.
#' @name cleaner_function
#' @param x dataframe
#' @param target_col numeric, identifies which column of dataframe contains question of interest.
#' @return Dataframe with column cleaned.
NULL
#' @rdname cleaner_function
#' @export
pre_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_to_lower(!!quocol)) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'smart.*final', 'smart & final'))
}
#' @rdname cleaner_function
#' @export
snack_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_to_lower(!!quocol)) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = ' bar[s]?', ' bars')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'cliff bars', 'clif bars')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*protein bars.*', 'protein bars')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*granola bars.*', 'granola bars')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*almonds.*', 'almonds')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'trailmix', 'trail mix')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*trail mix.*', 'trail mix')) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' @rdname cleaner_function
#' @export
grocery_store_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'stater brothers', 'stater bros')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '7/11', '7-eleven')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '7-11', '7-eleven')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '711', '7-eleven')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '7/11', '7-eleven')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '7 eleven', '7-eleven')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '\u2019', "'")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'pavillions', 'pavilions')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'von\'?s', "vons")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'whole?.?food\'?s.*', "whole foods")) %>% # ANYTHING after whole foods gets eaten!
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.alph\'?s?.*', "ralph's")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*joe\'?s?.*', "trader joe's")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'food4less', "food 4 less")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'food for less', "food 4 less")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*superking.*', "super king")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'super king$', "super king markets")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'super king market$', "super king markets")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'smart.*final', "smart & final")) %>%
# dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'smart&', "smart &")) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' @rdname cleaner_function
#' @export
energy_drink_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'redbull', 'red bull')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*coffee.*', 'coffee')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*starbuck.*', 'starbucks')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*tea.*', 'tea')) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' @rdname cleaner_function
#' @export
workout_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*weight.*', 'weightlifting')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*sports.*', 'sports')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*running.*', 'running')) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' @rdname cleaner_function
#' @export
subscription_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'imperfect.*', 'imperfect produce')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'post(\\s)*mate.*', 'postmates')) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' @rdname cleaner_function
#' @export
coffee_cleaner <- function(x, target_col) {
quocol <- rlang::enquo(target_col)
x %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'red(\\s)*bull.*', 'red bull')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = '.*starbuck.*', 'starbucks')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'ground(\\s)*work.*', 'groundworks')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'blue bottled', 'blue bottle')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'ice coffee', 'iced coffee')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = ".*don't have one", "none")) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'rockstar.*', 'rockstar')) %>%
dplyr::mutate(!!quocol := stringr::str_replace(!!quocol, pattern = 'monster energy.*', 'monster')) %>%
dplyr::mutate(!!quocol := stringr::str_to_title(!!quocol))
}
#' Filter out stopwords, assign polarity to terms from target_column.
#'
#' Filter out stopwords, assign polarity to terms from target_column. Makes use of \code{\link{wc_plotter}}.
#' @seealso \code{\link{wc_plotter}}
#' @param x dataframe with column of interest.
#' @param target_column number of the column in x containing data being assigned polarity.
#' @param stopwords_dict dictionary of stopwords e.g. \code{tibble::tibble(stopwords::stopwords(source="stopwords-iso"))}
#' @return A dataframe containing 5 columns including the polarity of target_column's words.
assign_polarity <- function(x, target_column, stopwords_dict = stop_d) {
targ <- x %>% select(target_column) # targ for temporary tibble; must access with [[]]
targ <- targ[[1]] %>%
as.character() %>%
sentimentr::get_sentences() %>%
sentimentr::extract_sentiment_terms() %>%
attributes()
targ$counts %>%
dplyr:: mutate(valence = ifelse(polarity > 0, "positive",
ifelse(polarity < 0, "negative", 'neutral'))) %>%
dplyr::anti_join(stopwords_dict) %>%
dplyr::filter(!is.na(words)) %>%
dplyr::arrange(desc(n)) %>%
dplyr::mutate(terms = str_to_title(words))
}
# Get average sentiment and product for each response
sentiment_prod_response <- function(x, target_column) {
x %>%
rename("Q" = target_column) %>%
mutate("response" = Q) %>%
select(Q, response, Product) %>%
sentimentr::get_sentences() %$% # Like a with()
sentimentr::sentiment_by(., by=c('Product', 'response')) %>%
filter(word_count > 0) %>%
filter(!is.na(response)) %>%
filter(response != "n/a") %>%
rename(sentiment = ave_sentiment) %>%
select(response, Product, sentiment) %>%
arrange(desc(sentiment))
}
#' Select appropriate colors depending on 'valence' values in selection.
#'
#' Select appropriate colors depending on 'valence' values in selection. Takes a named vector containing colours and assigns them in the correct order to factor levels from a dataframe.
#' @param valence_df dataframe containing valence data; (truncated, arranged) output from \code{\link{assign_polarity}}.
#' @param valence_col string, selects column with strings / factors labelling valences to derive colors from.
#' @param avc vector, named with valences, contains strings defining colours to use for each.
#' @return A named vector with avc's elements in the order matching their levels in valence_df.
valence_colorer <- function(valence_df, valence_col = "valence", avc = all_valence_colors) {
# all_valence_colors must be a named vector with colors for each valence.
valences <- valence_df[[valence_col]] %>% factor() %>% levels()
v_colors <- c()
for (i in valences) {
if (i %in% names(avc)) {
v_colors <- c(v_colors, i)
}
}
return(avc[v_colors])
}
#' Function that generates wordclouds.
#'
#' Function that generates wordclouds. Makes use of \code{\link{valence_colorer}}, \code{\link{wc}}.
#' @seealso \code{\link{assign_polarity}}, \code{\link{valence_colorer}}
#' @param df_reactive dataframe, output from \code{\link{assign_polarity}}.
#' @param cutoff numeric, number of words to include in wordcloud.
#' @param size numeric, size of text for wordcloud.
#' @return A plot.
wc_plotter <- function(df_reactive, cutoff, size) {
# df_reactive is presumed the output of assign_polarity()
trms <- df_reactive %>%
dplyr::top_n(cutoff, n) %>%
dplyr::arrange(valence)
v_col <- valence_colorer(trms)
plt <- trms %>%
wc(size) +
aes(label = terms, color = valence) +
scale_color_manual(values=v_col)
return(plt)
}
# Broken with non-pepsi data; needs fixing
# # Get counts for specific terms, broken down by Flavor
# specific_term_counts_by_product <- function(x, specific_terms) {
# # specific_terms originally an output of grouper() %>% top_n()
# x %>%
# filter(terms %in% specific_terms$terms) %>%
# group_by(terms, Product) %>%
# summarize(n = n()) %>%
# ungroup() %>%
# spread(Product, n, fill = 0, drop = FALSE) %>%
# mutate(total = rowSums(
# select(., c(`Rockstar XDurance (Both)`, `Rockstar XDurance (Cotton Candy)`,
# `Rockstar XDurance (Peach & Iced Tea)`, `Game Fuel`,
# `Starbucks Tripleshot Caffe Mocha`)))) %>%
# arrange(total)
# }
# vertical 3-layer stacked single-bar bar chart
vertical_three_layer_bar <- function(x, color_vec) {
# Primary column of interest must be named primary_col beforehand
targ <- sc(x)
x <- targ %>% dplyr::select(primary_col)
x <- x[[1]]
y <- targ %>% dplyr::select(n)
y <- y[[1]]
plotly::plot_ly(
y = y[1], type = "bar",
marker = list(color = color_vec[1], line = list(color = font_color, width = 1)),
hoverinfo = "skip") %>%
plotly::add_trace(y=y[2],marker=list(color=color_vec[2],
line=list(color=font_color,width=1))) %>%
plotly::add_trace(y=y[3],marker=list(color=color_vec[3],
line=list(color=font_color,width=1))) %>%
plotly::layout(
barmode = "stack", title = "Preferred Flavor",
hovermode = "compare", font = list(family = font_fam, color = font_color),
xaxis = list(title = "", showticklabels = FALSE),
yaxis = list(title =""), showlegend = FALSE,
paper_bgcolor = box_color, plot_bgcolor = box_color) %>%
plotly::add_annotations(
x = 0, y = y[1]/2,
text = paste(x[1],paste0(y[1]," / ",round(y[1]/sum(y)*100,2),"%"),sep='\n'),
font = list(family = font_fam, color = font_color), showarrow = FALSE) %>%
plotly::add_annotations(
x = 0, y = y[1] + y[2]/2,
text = paste(x[2],paste0(y[2]," / ",round(y[2]/sum(y)*100,2),"%"),sep='\n'),
font = list(family = font_fam, color = font_color), showarrow = FALSE) %>%
plotly::add_annotations(
x = 0, y = y[1] + y[2] + y[3]/2,
text = paste(x[3],paste0(y[3]," / ",round(y[3]/sum(y)*100,2),"%"),sep='\n'),
font = list(family = font_fam, color = font_color), showarrow = FALSE)
}
# Colour datatable text rows by Product
product_format <- function(x, product_col, color_vec) {
DT::formatStyle(x, "Product", target = 'row',
color = DT::styleEqual(unique(product_col), color_vec))
}
# For checking whether all variants of a product are displayed
check_var <- function(x, product, variants) {
counts <- str_detect(x, product)
count <- sum(counts)
if (count == 0) {
ret <- NULL
} else if (count == variants) {
ret <- product
} else if (count > variants) {
ret <- paste0(product, " variants exceed expected: ", variants)
} else {
ret <- paste0(product, " (some)")
}
return(ret)
}
#' For displaying which products are selected; avoid overcluttering.
#'
#' For displaying which products are selected; avoid overcluttering.
#' @param x dataframe
#' @param products_short vector of strings, a truncated list of productds avoiding within-brand repetition.
#' @param p_var vector of numerics, the number of different variants each products_short element represents.
#' @param color_cutoff numeric, threshold for pos/neg colors.
#' @return HTML code to display the product names succinctly.
display_prod <- function(x, products_short, p_var, color_cutoff) {
# products_short is a truncated list of products avoiding within-brand repetition
# p_var is the number of different flavours/ variants of a products are in the data
txt <- ''
for (i in 1:length(products_short)) {
txt <- paste(txt, check_var(x, products_short[i], p_var[i]), sep = "<br/>")
}
return(shiny::HTML(txt))
}
# Tallies factors for multiple columns, gives counts for each in one dataframe.
tallyer <- function(df, orderby, ...) {
# orderby the name of the column to order by in a quosure --
# q*, * being the column # or totals
# columns listed in ...
quos <- rlang::enquos(...)
cnt <- 1
for (i in quos) {
# Naming each tally q*
nm <- paste0("top", quo_name(cnt))
b <- df %>%
group_by(!!i) %>%
count() %>%
rename(!!nm := n) %>%
rename(primary_col := !!i) %>%
ungroup()
# Joining each tally together
if (cnt > 1) {a <- full_join(a, b, by = "primary_col")} else {a <- b}
cnt <- cnt + 1
}
a <- a %>%
# Replacing NAs with 0.
replace(., is.na(.), 0) %>%
# Add totals
dplyr::mutate(totals = rowSums(select(., contains("top"))))
# Order
for (i in rev(orderby)) {
a <- a %>% arrange(!!i)
}
return(a)
}
#' For filtering using quosures.
#'
#' For filtering using quosures.
#' @export
#' @param x dataframe, to be filtered
#' @param target_col numeric, the column number of the dataframe to be filtered
#' @param values vector, the values that target_col should contain
#' @return dataframe, like x but with target_col filtered for values.
filter_by <- function(x, target_col, values) {
quo_col <- dplyr::quo(target_col)
x %>% dplyr::filter_at(vars(!!quo_col), any_vars(. %in% values))
}
#' shinydashboard::box() but with different default settings.
#'
#' shinydashboard::box() but with different default settings
#' @export
#' @inheritParams shinydashboard::box
#' @return A shinydashboard::box().
box_d <- function(..., title = NULL, footer = NULL, status = "primary",
solidHeader = TRUE, background = NULL, width = 12, height = NULL,
collapsible = TRUE, collapsed = FALSE) {
shinydashboard::box(..., title = title, footer = footer, status = "primary",
solidHeader = solidHeader, background = background, width = width,
height = height, collapsible = collapsible, collapsed = collapsed)
}
#' Takes dataframe, prepares it for horizontal_bars_plot().
#'
#' Takes dataframe, prepares a column formed as a concatenated checklist for graphing with horizontal_bars_plot().
#' @export
#' @seealso \code{\link{horizontal_bars_plot}}
#' @param x dataframe
#' @param pcol string or numeric, column of primary interest; ends up being named sec at end of function.
#' @param scol string or numeric, column of seconday interest; how the bar graph's colors will be stacked.
#' @param pcol_strings vector of 5 strings (in future, generalize this; not sure how yet), pcol's 'pure' values.
#' @return A list containing a dataframe ready for horizontal_bars_plot() and a vector containing strings of the factors sec_col (including "Mixed" if relevant).
checkbox_to_hbp <- function(x, pcol = lookfor_col, scol = "Product",
pcol_strings = lookfor_strings) {
qs <- c()
for (i in 1:length(pcol_strings)) {
qs <- c(qs, rlang::sym(pcol_strings[i]))
}
x <- x %>%
dplyr::rename(primary_col = pcol) %>%
dplyr::rename(sec_col = scol) %>%
concat_by_user() %>%
spread_by_strings(pcol_strings) %>%
col_sep()
rowcount <- nrow(x)
ordered_products <- x %>%
dplyr::select(sec_col) %>%
`[[`(1) %>%
factor() %>%
levels() %>%
forcats::fct_relevel("Mixed", after = Inf) %>%
levels()
x <- x %>%
dplyr::add_count() %>%
dplyr::group_by(sec_col) %>%
dplyr::summarize(!!qs[[1]] := sum(!!qs[[1]]),
!!qs[[2]] := sum(!!qs[[2]]),
!!qs[[3]] := sum(!!qs[[3]]),
!!qs[[4]] := sum(!!qs[[4]]),
!!qs[[5]] := sum(!!qs[[5]])) %>%
tidyr::gather(sec, count, !!qs[[1]]:!!qs[[length(qs)]]) %>%
tidyr::spread(sec_col, count) %>%
select(sec, ordered_products) %>%
mutate(total = rowSums(.[2:length(.)])) %>%
mutate(sec_tally = rowcount) %>%
dplyr::select(sec, ordered_products, everything()) %>%
row_percents(ordered_products, sify = FALSE) %>%
dplyr::select(sec, ordered_products, contains("per"), everything()) %>%
dplyr::arrange(desc(sec))
return(list(dtfrm = x, ordered_products = ordered_products))
}
# Helpers for checkbox_to_hbp().
#' Get new column for each strings value.
#'
#' Get new logical/ boolean columns for checking whether each value of strings is found in primary_col.
#' @param x dataframe with column of interest named primary_col.
#' @param strings vector of strings to check against primary_col.
#' @return A dataframe with a new column for each element of strings.
spread_by_strings <- function(x, strings) {
for (i in strings) {
quocol <- rlang::enquo(i)
col_name <- rlang::quo_name(quocol)
x <- x %>%
dplyr::mutate(!!col_name := stringr::str_detect(primary_col, i))
}
return(x)
}
#' Concatenates primary_col for each individual user (as defined by Email).
#'
#' Concatenates primary_col for each individual user (as defined by Email).
#' @param x dataframe with column of interest primary_col and users identified by Email.
#' @param cllps string, divided concatenated primary_col entries. Vary if your primary_col might actually have a " | ".
#' @return A dataframe with grouped by Email with primary_col concatenated.
concat_by_user <- function(x, cllps = " | ") {
x %>%
dplyr::group_by(Email) %>%
dplyr::summarize(primary_col = paste(primary_col, collapse = cllps),
sec_col = paste(sec_col, collapse = cllps))
}
#' Takes a vector (ideally of strings), leaves it be if it only has one element; otherwise replaces it with "Mixed".
#'
#' Takes a vector (ideally of strings), leaves it be if it only has one element; otherwise replaces it with "Mixed".
#' @param x vector (ideally of strings).
#' @return An element of the vector (or "Mixed").
more_than_one_unique <- function(x) {
ifelse(length(unique(x)) > 1, "Mixed", unique(x))
}
#' Splits a string column into a vectors, checks whether those vectors are unique.
#'
#' Splits a string column into a vectors, checks whether those vectors are unique. If so, leaves be; if not, replaces with "Mixed".
#' @param x dataframe
#' @param clmn string naming the column that will be split into vectors and checked for uniqueness.
#' @param sep string, used to separate into clmn into vectors.
#' @return A dataframe with clmn squashed down into one string (as opposed to multiple concatenatd by sep).
col_sep <- function(x, clmn = "sec_col", sep = " | ") {
quocol <- rlang::enquo(clmn)
col_name <- rlang::quo_name(quocol)
split_by_sep <- x %>%
select(!!col_name) %>%
`[[`(1) %>%
strsplit(sep, fixed = TRUE)
x <- x %>% dplyr::mutate(!!col_name := unlist(lapply(split_by_sep, more_than_one_unique)))
return(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.