# #####################################################################
# Title: Counterfactual functions
# Author: Abel Camacho Guardian
# Date: 25.10.2019
# Version: 1
# Comments: None
#
# #####################################################################
# #####################################################################
# Function: estim_counterfactual
#' @name estim_counterfactual
#' @title Calculate counterfactual
#' @description Simulate counterfactual scenarios.
#' @usage estim_counterfactual(data,
#' model,
#' period,
#' alpha)
#' @export tidyverse
#'
#' @examples
#' ## Calculate counterfactual scenarios under different scenarios
#' data("sales_data")
#' sales_model1 <- lm(total ~
#' wday +
#' month +
#' adj_med_spend,
#' data = sales_data %>%
#' filter(date < as.Date("2018-07-01"))
#' )
#'
#' start_campaign <- as.Date("2018-07-01")
#' end_campaign <- as.Date("2018-07-25")
#' chosen_period <- start_campaign + seq(0, end_campaign - start_campaign)
#'
#' ## Scenario with no Media Spend
#' sales_data %>%
#' mutate(adj_med_spend = min(adj_med_spend)) %>%
#' estim_counterfactual(sales_model1, chosen_period) %>%
#' sum()
#'
#' ## Sales under the scenario that we haven't done any marketing campaign
#' sales_data %>%
#' estim_counterfactual(sales_model1, chosen_period) %>%
#' sum()
#' @param data_input The data containing the variables in the model,
#' and a named column date with date time data.
#' If no dates column is provided, then the first row is assumed to be a date.
#' @param chosen_model A model for which scenarios are simulated
#' @param chosen_period The period when the counterfactual scenario is simulated.
#' The period covered by data_input should cover the chosen_period.
#' @param alpha Optionally, the lag effect between (0,1). If it is not specified, it is set to zero.
# Columns with lag information should be named as lag_i with i being a number greater than zero.
#' @details Most scenarios are constructed with historical actuals
# and counterfactuls (what if). The what if is specified in the period defined by chosen_period.
estim_counterfactual <- function(data_input,
chosen_model,
chosen_period,
alpha = 0,
model='ols') {
if (is.character(alpha)) {
stop("alpha most be a numeric vector")
}
if (max(alpha) > 1 | alpha < 0) {
warning("alpha must be a vector between with entries in [0,1]")
}
# If no dates column is provided, then the first row is assumed to have dates.
if (!"dates" %in% names(data_input)) {
names(data_input)[1] <- "date"
}
data_input <- data_input %>%
arrange(date)
# get all the lags variables
lag_variables <- as.numeric(substring(names(data_input)[substr(names(data_input), 1, 4) == "lag_"], 5, 7))
if (length(alpha) == 1) {
alpha <- rep(alpha, length(lag_variables))
}
counterfactual_values <- rep(0, length(chosen_period))
s <- 1
for (t in chosen_period) {
if (model %in% c("ols", "poisson")) {
pred.y <- predict(chosen_model,
type = "response",
newdata = data_input %>%
filter(date == t)
)
}
if (model %in% c("bayesian")) {
pred.y <- predict(selecte_model,
type = "response",
newdata = data_input %>%
filter(date == t))$fit
}
counterfactual_values[s] <- pred.y
k2 <- 1
for (k in lag_variables) {
data_input[data_input$date == t + k, paste0("lag_", k)] <-
alpha[k2] * pred.y + (1 - alpha[k2]) * data_input[data_input$date == t + k, paste0("lag_", k)]
k2 <- k2 + 1
}
s <- s + 1
}
#' @return Return counterfactual values
return(counterfactual_values)
}
# #####################################################################
# Simulate Mix
#' @name counterfactual_mix
#' @title Counterfactual Mix
#' @description For a given variable and scenario,it simulates a counterfactual mix,
#' and it estimates the differeence between the actual mix and the counterfactual mix.
#' @export tidyverse
#'
#' @examples
#' ## Calculate counterfactual scenarios under different scenarios
#'
#' data("sales_data")
#'
#' sales_model1 <- lm(total ~
#' wday +
#' month,
#' data = sales_data %>%
#' filter(date < as.Date("2018-07-01"))
#' )
#'
#' start_campaign <- as.Date("2018-07-01")
#' end_campaign <- as.Date("2018-07-25")
#' campaign_period <- start_campaign + seq(0, end_campaign - start_campaign)
#' baseline_period <- min(chosen_period) + seq(-14, 0, 1)
#'
#'
#' cf_scenario1 <- sales_data %>%
#' mutate(adj_med_spend = min(adj_med_spend)) %>%
#' estim_counterfactual(sales_model1, chosen_period)
#' cf <- data.frame("date" = chosen_period, "counterfactual" = cf_scenario1)
#'
#' #
#' sales_data <- sales_data %>%
#' left_join(cf, on = "date")
#'
#'
#' delta_shops <- sales_data %>%
#' mutate(predictions = counterfactual) %>%
#' counterfactual_mix(
#' col_names = c("x", "y"),
#' group_names = c("Shop x", "Shop y"),
#' baseline_period = baseline_period,
#' counterfactual_period = campaign_period
#' )
#'
#' delta_shops %>%
#' filter(date %in% campaign_period) %>%
#' group_by(group) %>%
#' summarise(
#' actual = sum(actual),
#' counterfactual = sum(counterfactual),
#' delta = sum(delta),
#' delta_days = sum(delta) / length(campaign_period),
#' delta_perc = 100 * sum(delta) / sum(counterfactual)
#' )
#' @param data_input baseline and counterfactuals are provided in the data
#' @param col_name Model used to construct senarios
#' @param group_names The period when the counterfactual scenario is calculated.
#' @param baseline_period The period where the baseline mix is assumed
#' @param counterfactual_period the period where the counterfactuals are simulated
# Columns with lag information should be named as lag_i with i being a number greater than zero.
counterfactual_mix <- function(data_input,
col_names,
group_names,
baseline_period,
counterfactual_period) {
actual_mix <- data_input %>%
pivot(
columns = col_names,
groups = group_names
) %>%
rename(actual = total)
# counterfactual SCENARIO
cf.sales <- data_input %>%
mutate(predictions = counterfactual) %>%
select(date, predictions)
# Estimate baseline Mix
baseline_mix <- actual_mix %>%
filter(date %in% baseline_period) %>%
group_by(group) %>%
summarise(actual = sum(actual)) %>%
ungroup()
TOTAL <- sum(baseline_mix$actual)
baseline_mix <- baseline_mix %>%
mutate(
perc_mix = actual / TOTAL
)
# Estimate Counterfactual mix
sales_info <- merge(cf.sales, baseline_mix)
sales_info <- sales_info %>%
mutate(counterfactual = predictions * perc_mix) %>%
select(date, group, counterfactual)
# Combine Actual and Counterfactual scenario
output <- actual_mix %>%
left_join(sales_info,
on = c("date", "group")
)
# Counterfactual equal to actuals in dates not in the campaign period
output <- output %>%
mutate(
counterfactual = ifelse(date %in% counterfactual_period,
ifelse(counterfactual > 0, counterfactual, 0),
actual
),
delta = actual - counterfactual
)
# Final format
output <- output %>%
select(date, group, actual, counterfactual, delta)
#' @return Return Counterfactual mix
return(output)
}
# #####################################################################
# data transformation
#' @name pivot
#' @title Pivot Table
#' @description Data transformation
#' @export tidyverse
#'
#' @examples
#' ## Calculate counterfactual scenarios under different scenarios
#'
#' data("sales_data")
#' sales_data %>%
#' pivot(c("cs1", "cs2"), c("cs1", "cs2")) %>%
#' filter(date %in% c(baseline_period, campaign_period)) %>%
#' ggplot() +
#' geom_line(aes(x = date, y = total, col = group)) +
#' geom_vline(xintercept = min(campaign_period))
#' @param data_input Data
#' @param columns Columns representing different values
#' @param groups Name of the groups
#'
pivot <- function(data_input, columns, groups) {
data_ouput <- cbind(data_input[, c("date", columns[1])], "group" = groups[1])
names(data_ouput)[2] <- "total"
for (i in 2:length(columns)) {
data_temp <- cbind(data_input[, c("date", columns[i])], "group" = groups[i])
names(data_temp)[2] <- "total"
data_ouput <- rbind(data_ouput, data_temp)
}
data_ouput[, "group"] <- factor(data_ouput[, "group"], levels = groups)
return(data_ouput)
}
# ####################################################
# Data transformation
# ####################################################
data_model <- function(data_input,
column,
chosen_groups = NULL,
drop_na = T,
prefix = "col_") {
data_input$chosen_col <- data_input[, column]
if (length(chosen_groups) == 0) {
if (drop_na) {
chosen_groups <- unique(data_input$chosen_col)[!is.na(unique(data_input$chosen_col))]
} else {
chosen_groups <- unique(data_input$chosen_col)
}
}
for (l in 1:length(chosen_groups)) {
data_temp <- data_input %>%
filter(chosen_col == chosen_groups[l]) %>%
group_by(date) %>%
summarise(total = n()) %>%
mutate(total = replace_na(total, 0))
if (l == 1) {
data_output <- data_temp
} else {
data_output <- data_output %>%
full_join(data_temp,
by = "date"
) %>%
mutate(total = replace_na(total, 0))
}
for (i in 2:length(names(data_output))) {
data_output[is.na(data_output[, i]), names(data_output)[i]] <- 0
}
names(data_output)[length(data_output)] <- paste0(prefix, tolower(chosen_groups[l]))
}
return(data_output)
}
##########################################################################################################
# Detect source of deviations
source_deviation <- function(
data_input,
model_baseline,
baseline_mix,
chosen_period,
attributes,
thresholds = -10) {
if (length(thresholds) == 1) {
thresholds <- c(-abs(thresholds), abs(thresholds))
}
cf <- data_input %>% estim_counterfactual(
model_baseline,
chosen_period,
0
)
cf <- data.frame(
"date" = chosen_period,
"counterfactual" = cf
)
data_input <- data_input %>%
left_join(cf,
by = "date"
)
data_input <- data_input %>%
mutate(
counterfactual = ifelse(!is.na(counterfactual),
counterfactual,
total
),
counterfactual = ifelse(counterfactual < 0, 0, counterfactual),
delta = total - counterfactual
)
data_temp <- data_input %>%
mutate(predictions = counterfactual) %>%
counterfactual_mix(
col_names = attributes[[1]],
group_names = attributes[[1]],
baseline_period = baseline_mix,
counterfactual_period = chosen_period
) %>%
filter(date %in% chosen_period)
for (l in 2:length(attributes)) {
data_temp <- rbind(
data_temp,
data_input %>%
mutate(predictions = counterfactual) %>%
counterfactual_mix(
col_names = attributes[[l]],
group_names = attributes[[l]],
baseline_period = baseline_mix,
counterfactual_period = chosen_period
) %>%
filter(date %in% chosen_period)
)
}
data_output <- data_temp %>%
filter(delta > thresholds[2] | delta < thresholds[1]) %>%
arrange(desc(abs(delta)))
return(data_output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.