#' transforms .campaign_data into structure that is based on events, meaning the conversion would be
#' a specific event (e.g. clicking a submit button on a website)
#' this mimics what we would typically see from a clickstream
#' randomly assigns "Button Submit Event" & "Button Submit Event 2"
#' randomly gives the conversion timestamp either the same timestamp of the corresponding step or 1 second after
#' @param .campaign_data dataframe with id|timestamp|step|step_type|num_conversions|conversino_value columns
#' This dataframe has a value num_conversions on the step association with the conversion.
#' So for example, if a user signs up on the pricing page, there is no specific signup step/event,
#' but rather a step for the pricing page indicating number of conversions (probably always 1 for a signup event)
#' and total value of conversions
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr filter bind_rows mutate if_else arrange
#' @importFrom stringr str_ends
#' @importFrom lubridate seconds
#'
#' @export
rt__mock__attribution_to_clickstream <- function(.campaign_data) {
conversion_clickstream <- .campaign_data %>%
filter(num_conversions > 0)
# need to make the current step worth 0 (because it isn't the conversion event, just where the conversion happened)
# then create a conversion event with a timestamp that is the same
conversion_clickstream <- bind_rows(conversion_clickstream %>%
mutate(num_conversions = 0, conversion_value = 0),
conversion_clickstream %>%
mutate(step = if_else(str_ends(id, 'f'), 'Button Submit Event 2', 'Button Submit Event'),
step_type = 'Conversion',
# actually, i should make some of the time-stamps the same
# and some 1 second after to mimic what might
# happen in the click-stream data
# the dataset i'm working with, this id has another event 1
# second after, so ensure the conversion event has the same timestamp for this particular id
timestamp = if_else(str_ends(id, 'f') | id == 'fbd2f972542f5b6a9dfa602f4cac4d5c', timestamp, timestamp + seconds(1)))
)
click_stream_data <- bind_rows(.campaign_data %>% filter(num_conversions == 0),
conversion_clickstream) %>%
arrange(id, timestamp)
return (click_stream_data)
}
#' Transforms .clickstream_data into the expected format for attribution calculations.
#'
#' Essentially it removes the conversion event, and gives the conversion and conversion value to the last
#' touch-point before the conversion.
#'
#' Different types of conversion events will be ignored, so the user is expected to filter out any
#' conversion events they are not interested in.
#'
#' This function does not handle the case when the there are multiple types of conversions that are
#' triggered from a single step
#' For example: Someone lands on the homepage and from that page does 2 conversions `Lead Form-Fill` & `Lead Sign-up`)
#' The user is expected to filter the events of interest accordingly.
#'
#' @param .clickstream_data dataframe with id|timestamp|step|step_type|num_conversions|conversion_value columns
#' This dataframe has "clickstream" data, which means that it has a list of steps/events that might
#' correspond to, for example, page visits on a website.
#' num_conversions should indicate which steps are conversion events.
#' A conversion event should be its own step, that has a timestamp equal to or after the step that
#' should get the conversion event.
#' For example, if someone visits the pricing page, and then signs up for the product (which is the conversion),
#' there should be a single row (i.e. step) for the visit to the pricing page, and a single row for the conversion step.
#' The timestamp of the conversion event, in this case, would be immdediately after (seconds or minutes) the pricing step.
#' The step that is before the conversion event (regardless of how much before) gets credit (last-touch) for the conversion.
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr arrange group_by ungroup mutate lead row_number n select filter contains
#'
#' @export
rt_clickstream_to_attribution <- function(.clickstream_data) {
temp <- .clickstream_data %>%
arrange(id, timestamp, step) %>%
# need to arrange by num_conversions so that if the conversion step and corresponding step that
# should get credit have the same timestamp, arranging by num_conversions should ensure
# that the conversion step comes after
arrange(id, timestamp, num_conversions) %>%
group_by(id) %>%
# need to create a unique index per unique id per each path up until a conversion event
# people can have multiple conversions
# this will allow us to identify the event immediately before (or with the same timestamp as) the
# conversion event (arranging by num_conversions ensures that if the corresponding step and
# conversion event have the same timestamp, the conversion event will be ordered after)
mutate(temp___lag_cumulative_conv=dplyr::lag(cumsum(num_conversions)),
temp___path_no = ifelse(is.na(temp___lag_cumulative_conv), 0, temp___lag_cumulative_conv) + 1) %>%
ungroup() %>%
group_by(id, temp___path_no) %>%
mutate(temp___path_index = row_number(timestamp),
temp___path_num_steps = n(),
temp___path_has_conversions = any(num_conversions > 0),
temp___check = ifelse(any(num_conversions > 0),
# temp___path_index of the conversion event should be the max path index
max(temp___path_index[num_conversions > 0]) == max(temp___path_index),
# if there aren't any conversions this check doesn't apply so return TRUE
TRUE),
num_conversions = ifelse(temp___path_has_conversions, lead(num_conversions), num_conversions),
conversion_value = ifelse(temp___path_has_conversions, lead(conversion_value), conversion_value)
) %>%
ungroup() %>%
select(-temp___lag_cumulative_conv)
# if the path has conversions, there should be more than 1 steps
# so either the path doesn't have conversinos, or it has 2 or more steps
stopifnot(all(!temp$temp___path_has_conversions | temp$temp___path_num_steps >= 2))
# this makes sure that the conversion event always has the max indexstep number
stopifnot(all(temp$temp___check))
return(temp %>%
filter(!is.na(num_conversions)) %>%
select(-contains('temp___')))
}
#' adds `.path_id` column to dataframe. Each conversion step triggers a new path-id for subsequent steps
#'
#' @param .campaign_data dataframe with columns: `id | timestamp | step | num_conversions | conversion_value`
#' @param .id string identifying the id column
#' @param .timestamp string identifying the timestamp column
#' @param .num_conversions string identifying the num_conversions column
#' @param .conversion_value string identifying the conversion_values column
#' @param .use_first_conversion if true, only use the first conversion - the path id will be identical to the .id (by definition there will never be more than 1 path)
#' @param .sort if true, the df is sorted by id, timestamp, conversion_values, & num_conversions; the dataframe needs to be sorted in this way to work, but this option let's the user avoid this action if it has already been done
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr arrange group_by ungroup mutate filter select
#'
#' @export
rt_campaign_add_path_id <- function(.campaign_data,
.id='id',
.timestamp='timestamp',
.num_conversions='num_conversions',
.conversion_value='conversion_value',
.use_first_conversion=TRUE,
.sort=TRUE) {
if(.sort) {
.campaign_data <- .campaign_data %>%
arrange(!!sym(.id), !!sym(.timestamp), !!sym(.conversion_value), !!sym(.num_conversions))
}
if(.use_first_conversion) {
.campaign_data <- suppressWarnings(.campaign_data %>%
group_by(!!sym(.id)) %>%
# arrange(time) %>%
mutate(.___datetimes___ = !!sym(.timestamp)) %>%
mutate(.___first_conversion___=min(.___datetimes___[!!sym(.num_conversions) > 0], na.rm = TRUE)) %>%
ungroup() %>%
filter(!!sym(.timestamp) <= .___first_conversion___) %>%
select(-.___datetimes___, -.___first_conversion___))
# if we only use the first conversion, by definition, there will never be multiple path ids
# so we don't have to do the complex logic
# treat single person as 1 path regardless how many conversions
.campaign_data <- .campaign_data %>% mutate(.path_id = !!sym(.id))
} else {
# treat events after conversion as new path
.campaign_data <- .campaign_data %>%
group_by(!!sym(.id)) %>%
# arrange(time) %>%
mutate(.___path_no___ = ifelse(is.na(dplyr::lag(cumsum(!!sym(.num_conversions)))), 0, dplyr::lag(cumsum(!!sym(.num_conversions)))) + 1) %>%
ungroup() %>%
mutate(.path_id = paste0(!!sym(.id), '-', .___path_no___)) %>%
select(-.___path_no___)
}
return (.campaign_data)
}
#' adds converts campaign data to path data in the format expected by ChannelAttribution::markov_model & ChannelAttribution::heuristic_models
#'
#' @param .campaign_data dataframe with columns `id | timestamp | step | num_conversions | conversion_value`
#' @param .id string identifying the id column
#' @param .path_id string identifying the path_id column
#' @param .step string identifying the step column
#' @param .timestamp string identifying the timestamp column
#' @param .num_conversions string identifying the num_conversions column
#' @param .conversion_value string identifying the conversion_values column
#' @param .separate_paths_ids if TRUE, each .path_id gets its own row & path_sequence. Each .id will be represented >= 1 times
#' if FALSE, each person will only be counted once, with their entire path, and cumulative conversions/conversion-value/null-conversions
#' @param .sort if true, the df is sorted by id, timestamp, conversion_values, & num_conversions; the dataframe needs to be sorted in this way to work, but this option let's the user avoid this action if it has already been done
#' @param .symbol the symbol to separate the steps
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr arrange group_by ungroup mutate filter select
#'
#' @export
rt_campaign_to_markov_paths <- function(.campaign_data,
.id='id',
.path_id='.path_id',
.step='step',
.timestamp='timestamp',
.num_conversions='num_conversions',
.conversion_value='conversion_value',
.separate_paths_ids=TRUE,
.sort=TRUE,
.symbol=" > ") {
if(.sort) {
.campaign_data <- .campaign_data %>%
arrange(!!sym(.path_id), !!sym(.timestamp), !!sym(.conversion_value), !!sym(.num_conversions))
}
if(.separate_paths_ids) {
.campaign_data %>%
group_by(!!sym(.path_id)) %>%
#arrange(time) %>%
summarise(path_sequence = paste(!!sym(.step), collapse = .symbol),
num_conversions = sum(!!sym(.num_conversions)),
conversion_value = sum(!!sym(.conversion_value))) %>%
ungroup() %>%
rename(path_id = !!sym(.path_id)) %>%
mutate(null_conversions = ifelse(num_conversions > 0 | conversion_value > 0, 0, 1)) # adding information about path that have not led to conversion
} else {
inner_join(
.campaign_data %>%
group_by(!!sym(.id)) %>%
#arrange(time) %>%
summarise(path_sequence = paste(!!sym(.step), collapse = .symbol),
num_conversions = sum(!!sym(.num_conversions)),
conversion_value = sum(!!sym(.conversion_value))) %>%
ungroup() %>%
rename(path_id = !!sym(.id)),
.campaign_data %>%
group_by(!!sym(.id), !!sym(.path_id)) %>%
summarise(path_converted = any(!!sym(.num_conversions) > 0) | any(!!sym(.conversion_value) > 0)) %>%
ungroup() %>%
group_by(!!sym(.id)) %>%
summarise(null_conversions = sum(!path_converted)) %>%
rename(path_id = !!sym(.id)),
by = "path_id")
}
}
#' wrapper around ChannelAttribution::markov_model
#'
#' @param .path_data dataframe expected by ChannelAttribution::markov_model
#' @param .path_sequence var_path
#' @param .num_conversions var_conv
#' @param .conversion_value var_value
#' @param .null_conversions var_null
#' @param .order order
#' @param .symbol sep
#' @param .seed seed
#'
#' @importFrom ChannelAttribution markov_model
#' @importFrom magrittr "%>%"
#' @importFrom dplyr rename
#'
#' @export
rt_markov_model <- function(.path_data,
.path_sequence='path_sequence',
.num_conversions='num_conversions',
.conversion_value='conversion_value',
.null_conversions='null_conversions',
.order=1,
.symbol='>',
.seed=42) {
set.seed(.seed)
markov_attribution <- markov_model(Data = .path_data,
var_path = .path_sequence,
var_conv = .num_conversions,
var_value = .conversion_value,
var_null = .null_conversions,
order = .order, # higher order markov chain
out_more = TRUE,
verbose = FALSE,
sep=.symbol,
seed = .seed)
# inconsistent naming if var_value is NULL
if('removal_effects' %in% names(markov_attribution$removal_effects)) {
markov_attribution$removal_effects <- markov_attribution$removal_effects %>%
rename(removal_effects_conversion = removal_effects)
}
return (markov_attribution)
}
#' graphs removal effects returned by rt_markov_model (i.e. ChannelAttribution::markov_model)
#'
#' @param .markov_attribution results from rt_markov_model
#' @param .channel_categories if provided, colors removal effects by channel categories; named vector with categories as value and channel names as vector names
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr mutate left_join
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_replace
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot aes geom_col position_dodge geom_text coord_flip scale_fill_manual scale_y_continuous theme_light guides guide_legend labs theme facet_wrap
#' @importFrom scales pretty_breaks
#'
#' @export
rt_plot_markov_removal_effects <- function(.markov_attribution, .channel_categories=NULL) {
markov_long <- .markov_attribution$removal_effects %>%
pivot_longer(names(.markov_attribution$removal_effects) %>% rt_remove_val('channel_name'),
names_to = 'removal_type',
values_to = 'removal_value') %>%
mutate(channel_name = as.character(channel_name),
removal_type = rt_pretty_text(removal_type),
removal_type = str_replace(removal_type, "Removal Effects ", ""))
if(is.null(.channel_categories)) {
if(length(unique(markov_long$removal_type)) == 1) {
fill_colors <- rt_colors()[1]
} else {
fill_colors <- rev(rt_colors()[1:2])
}
markov_plot <- markov_long %>%
mutate(removal_type = factor(removal_type, levels=c("Conversion Value", "Conversion"))) %>%
mutate(channel_name = fct_reorder(channel_name, removal_value)) %>%
ggplot(aes(x=channel_name, y=removal_value, fill=removal_type)) +
geom_col(position = position_dodge(0.9),
alpha=0.75) +
geom_text(aes(label=rt_pretty_axes_percent(removal_value, increase_precision_delta = 0)),
position = position_dodge(0.9),
size=3.3) +
coord_flip() +
scale_fill_manual(values=fill_colors) +
scale_y_continuous(breaks=pretty_breaks(10),
labels=rt_pretty_axes_percent) +
theme_light() +
#expand_limits(y=1) +
guides(fill = guide_legend(reverse = TRUE)) +
labs(title="Markov Removal Effects",
subtitle = "Shows the estimated percent decrease in conversions from removing specific channels.",
x='Channel',
y='Estimated Removal Effect')
if(length(unique(markov_long$removal_type)) == 1) {
markov_plot <- markov_plot + theme(legend.position = 'none')
}
} else {
.channel_categories <- data.frame(channel_name = names(.channel_categories),
category = as.character(.channel_categories),
stringsAsFactors = FALSE)
markov_plot <- markov_long %>%
left_join(.channel_categories, by = "channel_name") %>%
mutate(category = ifelse(is.na(category), 'Uncategorized', category)) %>%
mutate(channel_name = fct_reorder(channel_name, removal_value)) %>%
ggplot(aes(x=channel_name, y=removal_value, fill=category)) +
geom_col(position = position_dodge(0.9),
alpha=0.75) +
geom_text(aes(label=rt_pretty_axes_percent(removal_value, increase_precision_delta = 0)),
position = position_dodge(0.9),
size=3.3) +
coord_flip() +
scale_fill_manual(values=rt_colors()) +
scale_y_continuous(breaks=pretty_breaks(10),
labels=rt_pretty_axes_percent) +
theme_light() +
labs(title="Markov Removal Effects",
subtitle = "Shows the estimated percent decrease in conversions from removing specific channels (i.e. the removal effect).",
x='Channel',
y='Estimated Removal Effect')
if(length(unique(markov_long$removal_type)) > 1) {
markov_plot <- markov_plot + facet_wrap(~removal_type)
}
}
return (markov_plot)
}
#' wrapper around ChannelAttribution::markov_model & ChannelAttribution::heuristic_models
#'
#' returns a dataframe with the combined results
#'
#' @param .path_data dataframe expected by ChannelAttribution::markov_model & & ChannelAttribution::heuristic_models
#' @param .path_sequence var_path
#' @param .num_conversions var_conv
#' @param .conversion_value var_value
#' @param .null_conversions var_null
#' @param .order order
#' @param .symbol sep
#' @param .seed seed
#'
#' @importFrom ChannelAttribution heuristic_models
#' @importFrom magrittr "%>%"
#' @importFrom dplyr rename inner_join mutate select
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_replace str_ends str_remove
#'
#' @export
rt_get_channel_attribution <- function(.path_data,
.path_sequence='path_sequence',
.num_conversions='num_conversions',
.conversion_value='conversion_value',
.null_conversions='null_conversions',
.order=1,
.symbol='>',
.seed=42) {
markov_results <- rt_markov_model(.path_data=.path_data,
.path_sequence=.path_sequence,
.num_conversions=.num_conversions,
.conversion_value=.conversion_value,
.null_conversions=.null_conversions,
.order=.order,
.symbol=.symbol,
.seed=.seed)
markov_attribution <- markov_results$result
heuristic_attribution <- ChannelAttribution::heuristic_models(Data = .path_data,
var_path = .path_sequence,
var_conv = .num_conversions,
var_value = .conversion_value,
#out_more = TRUE,
sep=.symbol
)
# if var_value is null, heuristic_models returns column names like "first_touch"
# if var_value is not null, heuristic_models returns column names like "first_touch_conversions"
# let's make consistent regardless
if(is.null(.conversion_value)) {
colnames(heuristic_attribution) <- str_replace(colnames(heuristic_attribution), 'touch', 'touch_conversions')
markov_attribution <- markov_attribution %>%
rename(markov_conversions=total_conversions)
} else {
markov_attribution <- markov_attribution %>%
rename(markov_conversions=total_conversions,
markov_value=total_conversion_value)
}
all_models <- inner_join(heuristic_attribution, markov_attribution, by = 'channel_name')
all_models <- rt_attribution_pivot_longer(all_models)
return (all_models)
}
#' wrapper around ChannelAttribution::markov_model & ChannelAttribution::heuristic_models
#'
#' returns a dataframe with the combined results
#'
#' @param .channel_attribution dataframe returned by rt_get_channel_attribution()
#' @param .channel_categories if provided, colors removal effects by channel categories; named vector with categories as value and channel names as vector names
#' @param .show_values show the attribution values
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr mutate left_join
#' @importFrom purrr map_chr
#' @importFrom forcats fct_reorder
#' @importFrom ggplot2 ggplot geom_col position_dodge scale_fill_manual theme_light theme labs geom_text aes facet_wrap
#'
#' @export
rt_plot_channel_attribution <- function(.channel_attribution, .channel_categories=NULL, .show_values=TRUE) {
base_attribution_plot <- function(channel_plot, .show_values) {
custom_colors <- rt_colors()
if(channel_plot$labels$fill == 'channel_name') {
fill_label <- "Channel Name"
} else {
#print(channel_plot$labels)
stopifnot(channel_plot$labels$fill == 'category')
fill_label <- "Channel Category"
}
channel_plot <- channel_plot +
geom_col(position = position_dodge(width = 0.9),
alpha=0.75) +
scale_fill_manual(values=custom_colors) +
scale_y_continuous(breaks=pretty_breaks(10), labels = rt_pretty_axes) +
theme_light() +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(y='Conversions',
x="Attribution Model",
fill=fill_label)
if(.show_values) {
round_values_by <- 0
channel_plot <- channel_plot +
geom_text(aes(label=ifelse(attribution_value < 1,
map_chr(attribution_value, ~ rt_pretty_axes_percent(., increase_precision_delta = 0)),
map_chr(attribution_value, ~ rt_pretty_numbers_short(.)))),
position = position_dodge(width = 0.9),
angle=90,
hjust=1)
}
return (channel_plot)
}
known_models <- c("First Touch", "Last Touch", "Linear Touch", "Markov", "Any Touch")
.channel_attribution <- .channel_attribution %>%
mutate(attribution_name = factor(attribution_name, levels=known_models))
if(length(unique(.channel_attribution$attribution_type)) == 1) {
if(is.null(.channel_categories)) {
channel_plot <- .channel_attribution %>%
mutate(channel_name = fct_reorder(channel_name, attribution_value, .fun = max, .desc = TRUE)) %>%
ggplot(aes(x=attribution_name, y=attribution_value, fill=channel_name)) %>%
base_attribution_plot(.show_values)
} else {
.channel_categories <- data.frame(channel_name = names(.channel_categories),
category = as.character(.channel_categories),
stringsAsFactors = FALSE)
channel_plot <- .channel_attribution %>%
mutate(channel_name = as.character(channel_name)) %>%
left_join(.channel_categories, by = "channel_name") %>%
mutate(category = ifelse(is.na(category), 'Uncategorized', category)) %>%
mutate(channel_name = fct_reorder(channel_name, attribution_value, .fun = max, .desc = TRUE)) %>%
ggplot(aes(x=channel_name, y=attribution_value, fill=category)) %>%
base_attribution_plot(.show_values) +
facet_wrap( ~ attribution_name) +
labs(x='Channel Name')
}
} else {
channel_plot <- .channel_attribution %>%
mutate(channel_name = fct_reorder(channel_name, attribution_value, .fun = max, .desc = TRUE)) %>%
ggplot(aes(x=attribution_name, y=attribution_value, fill=channel_name)) %>%
base_attribution_plot(.show_values) +
facet_wrap(~ attribution_type, scales = 'free_y')
}
return (channel_plot)
}
#' gives each step credit for the number of conversions that resulted from the corresponding path conversions
#'
#' if the path is Facebook -> Facebook -> Facebook -> Instagram -> 2 Conversions, then both Facebook & Instagram get credit for two conversions, regardless how many times they are in the path
#'
#' returns a dataframe with the combined results
#'
#' @param .campaign_data dataframe with columns `id | timestamp | step | num_conversions | conversion_value`
#' @param .conversion_column e.g. num_conversions or conversion_value
#' @param .path_id string identifying the path_id column
#' @param .step string identifying the step column
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr select group_by ungroup mutate filter distinct
#' @importFrom tidyr pivot_wider
#'
#' @export
rt_get_any_touch_attribution <- function(.campaign_data,
.conversion_column,
.path_id='.path_id',
.step='step') {
path_conversions <- .campaign_data %>%
select(!!sym(.path_id), !!sym(.step), !!sym(.conversion_column)) %>%
group_by(!!sym(.path_id)) %>%
mutate(temp___path_conversion = sum(!!sym(.conversion_column))) %>%
ungroup() %>%
filter(temp___path_conversion > 0)
path_conversion_matrix <- path_conversions %>%
select(-!!sym(.conversion_column)) %>%
distinct() %>%
pivot_wider(names_from = !!sym(.step),
values_from = temp___path_conversion,
values_fill = list(temp___path_conversion = 0)) %>%
select(-!!sym(.path_id))
path_conversions <- path_conversions %>%
select(!!sym(.path_id), temp___path_conversion) %>%
distinct()
rt_stopif(any(duplicated(path_conversions[[.path_id]])))
stopifnot(all(rowSums(path_conversion_matrix) > 0))
stopifnot(all.equal(apply(path_conversion_matrix, 1, max), path_conversions$temp___path_conversion))
# they should equal 2, unless there is a path that had all channels, which isn't the case
stopifnot(all(apply(path_conversion_matrix, 1, function(x) length(unique(x))) == 2))
any_touch <- colSums(path_conversion_matrix)
#any_touch <- any_touch / sum(any_touch)
#stopifnot(sum(any_touch) == 1)
any_touch_df <- data.frame(channel_name=names(any_touch), any_touch=as.numeric(any_touch))
return (any_touch_df)
}
#' transforms attribution dataframe to long format
#'
#' @param attribution_models dataframe with columns `channel_name | xxx_conversions | xxx_value`
#'
#' @importFrom magrittr "%>%"
#' @importFrom dplyr mutate select case_when
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_ends str_remove
#'
#' @export
rt_attribution_pivot_longer <- function(attribution_models) {
attribution_models <- attribution_models %>%
pivot_longer(colnames(attribution_models) %>% rt_remove_val('channel_name'),
names_to = 'attribution_column_name',
values_to = 'attribution_value') %>%
mutate(attribution_type = case_when(
str_ends(attribution_column_name, '_conversions') ~ 'Conversion',
str_ends(attribution_column_name, '_value') ~ 'Conversion Value',
TRUE ~ 'unknown'
),
attribution_name = str_remove(attribution_column_name, "_conversions"),
attribution_name = str_remove(attribution_name, "_value"),
attribution_name = rt_pretty_text(attribution_name)) %>%
select(-attribution_column_name) %>%
select(channel_name, attribution_name, attribution_type, attribution_value)
return (attribution_models)
}
#' creates a sankey diagram based on a data-frame containing touch-points in the form of
#'
#' entity_id | touch_category | touch_index | weight (optional)
#' ============================================================
#' personA | Home Page | 1 0
#' personA | Pricing Page | 2 0
#' personA | Conversion | 3 0
#' personB | Home Page | 1 2
#' personB | Bounced | 2 2
#' personC | Pricing | 1 1
#'
#' where the column names are specified as parameters.
#'
#' @param .path_data data-frame of touch-points
#' @param .id entity id column
#' @param .path_column column of the path or touch-points
#' @param .visit_index numeric index touch-points
#' @param .weight column that gives a weight value, which should be the same for each touch-point for a given entity_id (basically the value of the entity)
#'
#' @param .valid_final_touch_points specify what is considered the ending events.
#'
#' @param .ensure_complete_funnel If TRUE, this makes sure that the number of people/entities in the first level
#' of the sankey plot equals the number of people/entities in the final level of the plot
#' Any path where the last touch-point is not one of the values in
#' `.valid_final_touch_points` is considered a "bounce" and an additional touch-point will be added.
#' The value of the new touch-point will be based on `.bounced_fill_value`
#' Any path that only 1 touch-point and the value is in `.valid_final_touch_points` will have an additional
#' touch-point added of "<No Prior Touch-Point>"
#' @param .bounced_fill_value the value to be inserted as a touch-point when an entity bounces (does not have a conversion touch-point)
#' @param .no_prior_data the value to be inserted as a touch-point when an entity only has a conversion touch-point
#'
#' @param .global_path_values a list of all possible touch-categories, which the colors will be based on. This
#' ensures that the colors of graphs are consistent across multiple graphs even if the same categories
#' don't show up in each graph
#'
#' @param .depth_threshold the number of leves at the beginning and end of the journey/path
#' @param .show_percentages if TRUE, then show percentage in parentheses for each node which corresponds to percentage of all ids or percentage of all weights depending on if .weight field is populated
#' @param .order_by determines the arrangement of the diagram; `both` returns a list of 2 items, each containing a plot
#' @param .connection_threshold used to limit the number of connections: too many and the function call to generate the diagram will just hang
#'
#' @importFrom magrittr "%>%"
#' @importFrom forcats fct_lump
#' @importFrom networkD3 sankeyNetwork
#' @importFrom tidyr unite
#' @importFrom dplyr select group_by filter ungroup distinct bind_rows arrange mutate summarise desc count rename pull
#' @importFrom stringr str_remove
#'
#' @export
rt_plot_sankey <- function(.path_data,
.id='entity_id',
.path_column='touch_category',
.visit_index='touch_index',
.weight=NULL,
.valid_final_touch_points=NULL,
.ensure_complete_funnel=TRUE,
.bounced_fill_value='Bounced',
.no_prior_data='<No Prior Touch-Point>',
.global_path_values=NULL,
.depth_threshold=NULL,
.show_percentages=FALSE,
.order_by=c('size', 'optimize', 'both'),
.connection_threshold=200) {
rt_stopif(is.null(.valid_final_touch_points))
.path_data <- .path_data %>% arrange(!!sym(.id), !!sym(.visit_index))
number_of_unique_ids <- length(unique(.path_data[[.id]]))
if(!is.null(.weight)) {
# if weight is given, let's make sure the same weight value is used for each touch-point per id
stopifnot(all(.path_data %>%
group_by(!!sym(.id)) %>%
summarise(num_unique_weights = n_distinct(!!sym(.weight))) %>%
pull(num_unique_weights) == 1))
weight_per_id <- .path_data %>%
group_by(!!sym(.id)) %>%
summarise(weight = min(!!sym(.weight))) %>%
ungroup()
total_weight <- sum(weight_per_id$weight)
}
# apparently there is a bug in networkD3 where colors seem to get messed up if touch-points have spaces in
# the names :(
# https://stackoverflow.com/questions/39647938/r-specify-colors-in-sankeynetwork
# but even when I fixed it, I kept getting the same issue. I ended up putting an underscore between
# Natural and Gas (Natural Gas -> Natural_Gas) and that fixed it.
.path_data[[.path_column]] <- str_replace_all(.path_data[[.path_column]], pattern = ' ', replacement = '-')
.valid_final_touch_points <- str_replace_all(.valid_final_touch_points, pattern = ' ', replacement = '-')
if(!is.null(.global_path_values)) {
.global_path_values <- str_replace_all(.global_path_values, pattern = ' ', replacement = '-')
}
if(.ensure_complete_funnel) {
# if we are going to be adding in "Bounce" touch-points, we have to know what is considered a non-bounce
# otherwise, we can just get all of the final touch-points to merge at the end
# if this isn't null, then
# 1) anyone who doesn't have a success event has "bounced"
# 2) anyone who doesn't have anything other than a success event needs an initial filler event
# 3) if we don't have this value, then the final event will show up multiple
# these 2 things will make sure everyone is represent from beginning to end.
# 1) bounced
bounced_path_data <- .path_data %>%
group_by(!!sym(.id)) %>%
filter(!any(!!sym(.path_column) %in% .valid_final_touch_points)) %>%
ungroup() %>%
rt_select_all_of(.id, .weight) %>%
distinct()
bounced_path_data[[.path_column]] <- .bounced_fill_value
bounced_path_data[[.visit_index]] <- Inf
# nothing other than success event
only_success_data <- .path_data %>%
group_by(!!sym(.id)) %>%
filter(n() == 1 & all(!!sym(.path_column) %in% .valid_final_touch_points)) %>%
ungroup() %>%
rt_select_all_of(.id, .weight) %>%
distinct()
only_success_data[[.path_column]] <- .no_prior_data
only_success_data[[.visit_index]] <- -Inf
.path_data <- .path_data %>%
bind_rows(bounced_path_data) %>%
bind_rows(only_success_data) %>%
arrange(!!sym(.id), !!sym(.visit_index))
stop_if_any_duplicated(.path_data)
}
if(!is.null(.depth_threshold)) {
.path_data <- .path_data %>%
group_by(!!sym(.id)) %>%
mutate(custom__touch_index = row_number(!!sym(.visit_index)),
custom__touch_index_rev = rev(custom__touch_index)) %>%
ungroup()
# -1 because we want to exclude the last event from this filter e.g. if .depth_threshold is 1 we still want to include the event
# before the last event (we want to see the final touch before the "conversion")
.path_data[[.path_column]] <- ifelse(.path_data$custom__touch_index >.depth_threshold & .path_data$custom__touch_index_rev - 1 > .depth_threshold,
'<Not Shown>',
.path_data[[.path_column]])
.path_data$custom__touch_index <- NULL
.path_data$custom__touch_index_rev <- NULL
# now we have to remove multiple not showns
# but we can't remove "other", which also might appear multiple times
.path_data <- .path_data %>%
group_by(!!sym(.id), !!sym(.path_column)) %>%
# either the column is not <Not Shown> or (if it is) it has to be the first occurance
filter(!!sym(.path_column) != '<Not Shown>' | row_number(!!sym(.visit_index)) == 1) %>%
ungroup()# %>%
#filter(!!sym(.path_column) == '<Not Shown>') %>%
#View()
stopifnot(all(.path_data %>% filter(!!sym(.path_column) == '<Not Shown>') %>% pull(!!sym(.visit_index)) == .depth_threshold + 1))
}
# convert dataset so that it has `source->target` pairs (e.g. visit1 -> visit2; visit2 -> visit3)
source_target_data <- .path_data %>%
unite(channel_source, c(!!sym(.path_column), !!sym(.visit_index)), sep = "~~") %>%
group_by(!!sym(.id)) %>%
mutate(channel_target = lead(channel_source)) %>%
ungroup() %>%
filter(!is.na(channel_target))
if(!is.null(.weight)) {
stopifnot(all(source_target_data %>%
group_by(!!sym(.id)) %>%
summarise(num_unique_weights = n_distinct(!!sym(.weight))) %>%
pull(num_unique_weights) == 1))
}
rt_stopif(is.null(.valid_final_touch_points))
rt_stopif(length(.valid_final_touch_points) == 0)
original_event_name <- str_remove(source_target_data$channel_target, pattern = "~~.*")
source_target_data$channel_target <- ifelse(original_event_name %in% .valid_final_touch_points,
original_event_name,
source_target_data$channel_target)
# we should only check counts if the user provides us .valid_final_touch_points value(s), otherwise, all bets are off
# e.g. might happen if the person only has 1 touch-point (e.g. bounced or converted without any prior touch-points)
if(.ensure_complete_funnel) {
stopifnot(setequal(source_target_data[[.id]], .path_data[[.id]]))
# create temp___weight so we ensure weight column exists so that we can use it in logic regardless
if(is.null(.weight)) {
total_weighting <- 0
.path_data$temp___weight <- 0
source_target_data$temp___weight <- 0
} else {
total_weighting <- sum(weight_per_id$weight)
.path_data$temp___weight <- .path_data[[.weight]]
source_target_data$temp___weight <- source_target_data[[.weight]]
}
first_touch_expected <- .path_data %>%
# get the first visit for each id
group_by(!!sym(.id)) %>%
filter(!!sym(.visit_index) == min(!!sym(.visit_index))) %>%
ungroup() %>%
# get count and weight by path
group_by(!!sym(.path_column)) %>%
summarise(n = n(),
weight = sum(temp___weight)) %>%
ungroup() %>%
arrange(desc(n)) %>%
rename(channel_source = !!sym(.path_column))
first_touch_calculated <- source_target_data %>%
group_by(!!sym(.id)) %>%
filter(row_number() == 1) %>%
ungroup() %>%
#filter(str_detect(channel_source, pattern = '~~1$')) %>%
mutate(channel_source = str_remove(channel_source, pattern = '~~.*')) %>%
group_by(channel_source) %>%
summarise(n = n(),
weight = sum(temp___weight)) %>%
ungroup() %>%
arrange(desc(n))
stopifnot(rt_are_dataframes_equal(first_touch_expected, first_touch_calculated))
last_touch_expected <- .path_data %>%
group_by(!!sym(.id)) %>%
filter(!!sym(.visit_index) == max(!!sym(.visit_index))) %>%
ungroup() %>%
# get count and weight by path
group_by(!!sym(.path_column)) %>%
summarise(n = n(),
weight = sum(temp___weight)) %>%
ungroup() %>%
arrange(desc(n)) %>%
rename(channel_target = !!sym(.path_column))
last_touch_calculated <- source_target_data %>%
group_by(!!sym(.id)) %>%
filter(row_number() == max(row_number())) %>%
ungroup() %>%
#filter(str_detect(channel_target, pattern = '~~1$')) %>%
mutate(channel_target = str_remove(channel_target, pattern = '~~.*')) %>%
group_by(channel_target) %>%
summarise(n = n(),
weight = sum(temp___weight)) %>%
ungroup() %>%
arrange(desc(n))
stopifnot(rt_are_dataframes_equal(last_touch_expected, last_touch_calculated))
stopifnot(all(last_touch_calculated$channel_target %in% c(.bounced_fill_value, .valid_final_touch_points)))
.path_data$temp___weight <- NULL
source_target_data$temp___weight <- NULL
}
total_rows <- nrow(source_target_data)
# create temp___weight so we ensure weight column exists so that we can use it in logic regardless
if(is.null(.weight)) {
source_target_data$temp___weight <- 0
} else {
source_target_data$temp___weight <- source_target_data[[.weight]]
}
source_target_data <- source_target_data %>%
unite(step, c(channel_source, channel_target), remove = FALSE, sep = " -> ") %>%
group_by(step, channel_source, channel_target) %>%
summarise(num_touch_points=n(),
num_touch_points_distinct=n_distinct(!!sym(.id)),
weight = sum(temp___weight))%>%
ungroup() %>%
arrange(desc(num_touch_points))
stop_if_not_identical(source_target_data$num_touch_points, source_target_data$num_touch_points_distinct)
stop_if_any_duplicated(source_target_data$step)
stop_if_any_duplicated(source_target_data %>% select(channel_source, channel_target))
stop_if_not_identical(total_rows, sum(source_target_data$num_touch_points))
source_target_data <- source_target_data %>% select(-num_touch_points_distinct)
if(is.null(.weight)) {
source_target_data <- source_target_data %>% select(-weight)
total_value <- number_of_unique_ids
} else {
source_target_data <- source_target_data %>% select(-num_touch_points) %>% rename(num_touch_points = weight)
total_value <- total_weight
}
rt_stopif(nrow(source_target_data) > .connection_threshold)
# the intersecting channels will double-count the numbers because they will be counted once for the source
# and once for the target each time
# so if the channel_name is in this list, then we need to divide these groups by 2...
# This should work even for weights, because again, we are double-counting,
# i.e. multiplying each value by 2, so still divide by 2 to get the actual
intersecting_channels <- intersect(source_target_data$channel_source, source_target_data$channel_target)
unique_nodes <- bind_rows(source_target_data %>%
count(channel_source, wt=num_touch_points, name = 'num_touch_points') %>%
arrange(num_touch_points) %>%
select(channel_source, num_touch_points) %>%
rename(channel_name=channel_source),
source_target_data %>%
count(channel_target, wt=num_touch_points, name = 'num_touch_points') %>%
arrange(num_touch_points) %>%
select(channel_target, num_touch_points) %>%
rename(channel_name=channel_target)) %>%
count(channel_name, wt=num_touch_points, name = 'num_touch_points') %>%
arrange(desc(num_touch_points)) %>%
mutate(num_touch_points = ifelse(channel_name %in% intersecting_channels,
num_touch_points / 2,
num_touch_points)) %>%
mutate(perc_touch_points = rt_pretty_percent(num_touch_points / total_value)) %>%
mutate(channel_name_perc = paste0(channel_name, " (", perc_touch_points, ")")) %>%
select(channel_name, channel_name_perc)
source_indexes <- match(source_target_data$channel_source, unique_nodes$channel_name) - 1
target_indexes <- match(source_target_data$channel_target, unique_nodes$channel_name) - 1
source_target_data$source <- source_indexes
source_target_data$target <- target_indexes
unique_nodes_perc <- str_remove(string=unique_nodes$channel_name_perc, pattern = "(~~)([^\\s]+)")
unique_nodes_names <- str_remove(string=unique_nodes$channel_name, pattern = "~~.*")
rm(unique_nodes)
if(.show_percentages) {
sankey_nodes_df <- data.frame(name=unique_nodes_perc)
} else {
sankey_nodes_df <- data.frame(name=unique_nodes_names)
}
#stopifnot(all(unique_nodes %in% names(color_dictionary)))
# color_dictionary <- c(color_dictionary,
# c("Joined Experiment"=rt_colors_good_bad()[1], "No Further Visits"=rt_colors_good_bad()[2],
# "Other"=rt_colors(color_names = 'dove_gray')))
if(is.null(.global_path_values)) {
.global_path_values <- sort(unique(.path_data[[.path_column]]))
} else {
.global_path_values <- sort(unique(c(.global_path_values, .bounced_fill_value, .no_prior_data)))
}
rt_stopif(is.null(.global_path_values))
color_dictionary <- rep(rt_colors(), 20)[1:length(.global_path_values)]
names(color_dictionary) <- .global_path_values
rt_stopif(any(duplicated(names(color_dictionary))))
#rt_stopif(any(duplicated(as.character(color_dictionary))))
selected_colors <- as.character(color_dictionary[unique_nodes_names])
color_string <- rt_str_collapse(unique(selected_colors),.surround = '"', .separate = ", ")
ColourScal <- paste0('d3.scaleOrdinal().range([', color_string,'])')
rt_stopif(nrow(source_target_data) > .connection_threshold)
stopifnot(all(.order_by %in% c('size', 'optimize', 'both')))
.order_by <- .order_by[1]
sankey_plots <- list()
if(.order_by %in% c('size', 'both')) {
sankey_plot <- sankeyNetwork(Links = source_target_data %>% as.data.frame(),
Nodes = sankey_nodes_df %>% as.data.frame(),
Source = 'source',
Target = 'target',
Value = 'num_touch_points',
NodeID = 'name',
iterations=0, # forces the cells in the diagram to appear in order of size
colourScale = ColourScal,
#units = 'TWh',
fontSize = 12, nodeWidth = 30)
sankey_plots <- append_list(sankey_plots, sankey_plot)
}
if(.order_by %in% c('optimize', 'both')) {
# save an alternative image where we do not force the order of the items
sankey_plot <- sankeyNetwork(Links = source_target_data %>% as.data.frame(),
Nodes = sankey_nodes_df %>% as.data.frame(),
Source = 'source',
Target = 'target',
Value = 'num_touch_points',
NodeID = 'name',
#iterations=0,
colourScale = ColourScal,
#units = 'TWh',
fontSize = 12, nodeWidth = 30)
sankey_plots <- append_list(sankey_plots, sankey_plot)
}
sankey_plots <- sankey_plots[!is.na(sankey_plots)]
if(length(sankey_plots) == 1) {
sankey_plots <- sankey_plots[[1]]
}
return (sankey_plots)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.