if(!require(tidyverse, quietly = TRUE)) install.packages("tidyverse") if(!require(glue, quietly = TRUE)) install.packages("glue") if(!require(here, quietly = TRUE)) install.packages("here") if(!require(lubridate, quietly = TRUE)) install.packages("lubridate") if(!require(cowplot, quietly = TRUE)) install.packages("cowplot") if(!require(ggthemes, quietly = TRUE)) install.packages("ggthemes") if(!require(DT, quietly = TRUE)) install.packages("DT") if(!require(devtools, quietly = TRUE)) install.packages("devtools") library(tidyverse) # Data Processing library(glue) # Text formatting library(here) # File Location library(lubridate) # Date processing library(cowplot) # Data visualization library(ggthemes) # Data Visualization library(DT) # Printing table library(devtools) # Maintain package
devtools::load_all()
I created six variables:
overspend
: campaign_spend - campaign_budgetoverspent
: logical flag for whether campaign_spend > campaign_budgetoverspent_1
: logical flag for whether campaign_spend > campaign_budget * 1.01log_campaign_budget
: log transformation of campaign budgetlog_campaign_spend
: log transformation of campaign spendcapped_overspend
: max(0, overspend)log_capped_overspend
: log transformation of capped_overspenddf <- get_data()
There is no missing values.
if (nrow(df[rowSums(is.na(df))==0,]) == nrow(df)) { print("There is no NA in dataframe") }
Boxplots provide visual summaries of the data to quickly identify median, interquartile intervals and outliers. I plot box plots for log transformations of data below:
p <- plot_boxplot_by_vars(df, "log_budget", "size") ggsave(file.path(here::here(), "fig/boxplot_log_campaign_budget.png"), p, width = 12) print(p)
p <- plot_boxplot_by_vars(df, "log_spend", "size") ggsave(file.path(here::here(), "fig/boxplot_log_campaign_spend.png"), p, width = 12) print(p)
p <- plot_boxplot_by_vars(df, "log_capped_overspend", "size") ggsave(file.path(here::here(), "fig/boxplot_log_capped_overspend.png"), p, width = 12) print(p)
I split overspend into bins and visualized:
bins <- seq(from = -5000000, to = 500000, by = 50000) df$overspend_bins <- cut(df$overspend, breaks = bins) overspend_bins <- tibble(overspend_bins = sort(unique(df$overspend_bins))) p1 <- df %>% filter(treat == TRUE) %>% group_by(overspend_bins) %>% tally() %>% ungroup() %>% right_join(overspend_bins, by = "overspend_bins") %>% mutate(n = replace_na(n, 0)) %>% ggplot(aes(x = n, y = overspend_bins, label = n)) + geom_bar(stat = "identity", fill = "#69b3a2", color = "#e9ecef") + geom_text(hjust = -1) + xlim(0, 7500) + ggtitle("Overspend, in dollar value, of the treatment group") + xlab("Number of campaigns") + ylab("Value range of overspend") p2 <- df %>% filter(treat == FALSE) %>% group_by(overspend_bins) %>% tally() %>% ungroup() %>% right_join(overspend_bins, by = "overspend_bins") %>% mutate(n = replace_na(n, 0)) %>% ggplot(aes(x = n, y = overspend_bins, label = n)) + geom_bar(stat = "identity", fill = "#69b3a2", color = "#e9ecef") + geom_text(hjust = -1) + xlim(0, 7500) + ggtitle("Overspend, in dollar value, of the control group") + xlab("Number of campaigns") + ylab("Value range of overspend") p <- plot_grid(p1, p2, labels = c('A', 'B'), align = 'v') ggsave(file.path(here::here(), "fig/overspend_dollar.png"), p, width = 12) print(p)
I plot the distributions of original and log-transformed data side-by-side.
p <- plot_hist_by_vars(df, "budget") ggsave(file.path(here::here(), "fig/budget_hist.png"), p, width = 12) print(p)
p <- plot_hist_by_vars(df, "spend") ggsave(file.path(here::here(), "fig/spend_hist.png"), p, width = 12) print(p)
p <- plot_hist_by_vars(df, "capped_overspend") ggsave(file.path(here::here(), "fig/capped_overspend_hist.png"), p, width = 12) print(p)
df %>% group_by(treat) %>% summarize(prct_overspent = round(mean(overspent) * 100, 2), num_overspend = sum(overspent), count_group_size = n()) %>% DT::datatable()
There are two ways to answer this question:
One-sided two-sample Z-test:
Hypotheses:
With p-value < 2.2e-16, we can reject the null hypothesis that p_treatment >= p_control. There is convincing evidence that p_treatment < p_control.
perform_prop_test(df, var = 'overspent')
To assess the effect of size, I performed logistic regression and ANOVA.
model_prop_overspent <- glm(overspent ~ treat + size + treat:size, data = df, family = "binomial") summary(model_prop_overspent)
anova(model_prop_overspent, test = "Chisq")
As the interaction term is not statistically significant, I reran the logistic regression without the interaction terms. The dummy variables for medium and large company size are statistically significant. The treatment effect is also statistically significant.
model_prop_overspent <- glm(overspent ~ treat + size, data = df, family = "binomial") summary(model_prop_overspent)
Raw overspending is spend - budget. Raw overspend might not reflect the full picture of wasted opportunity due to overspending, because a significant portion of campaigns underspent.
This is to make sure that the sample mean indeed follow approximately a normal distribution.
# Check the distribution of sample means of overspend in the control group # Sample means of overspend indeed followed (approximately) normal distribution in the control group control <- df %>% filter(treat == FALSE) %>% pull(overspend) set.seed(2021) to_test <- get_sample_means(control, n_inter = 10000, sample_size = 5000) p <- tibble(to_test = to_test) %>% ggplot(aes(x = to_test)) + geom_histogram(bins = 100, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Mean Overspend for Sample Size = 5,000 (Control Group)")) ggsave(file.path(here::here(), "fig/raw_overspend_sampling_statistics.png"), p, width = 12) print(p)
# Check the distribution of sample means of overspend in the treatment group # Sample means of overspend do not followed (approximately) normal distribution in the treatment group. This may mean the underlying population is too different from the normal distribution, which will not allow me to do a t-test. treatment <- df %>% filter(treat == TRUE) %>% pull(overspend) set.seed(2021) to_test <- get_sample_means(treatment, n_inter = 10000, sample_size = 5000) p <- tibble(to_test = to_test) %>% ggplot(aes(x = to_test)) + geom_histogram(bins = 100, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Mean Overspend for Sample Size = 5,000 (Treatment Group)")) ggsave(file.path(here::here(), "raw_overspend_sampling_statistics_treatment.png"), p, width = 12) print(p)
overspend
Hypothesis: Mean overspend in Control > Mean overspend in Treatment, so mean_control - mean_treatment should be larger than 0.
# Get permutated means in 100,000 iterations # This can take some time. perm_diffs <- get_n_perm_diffs(df, "overspend", iter = 100000) saveRDS(perm_diffs, file.path(here::here(), "data/perm_diffs.rds"))
mean_control <- df %>% filter(treat == FALSE) %>% pull(overspend) %>% mean() mean_treatment <- df %>% filter(treat == TRUE) %>% pull(overspend) %>% mean() perm_diff_df <- tibble(perm_diffs = perm_diffs) p <- perm_diff_df %>% ggplot(aes(x = perm_diffs)) + geom_histogram(bins = 100, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + geom_vline(xintercept = mean_control - mean_treatment) + xlab("Mean overspend difference (in dollars)") + ylab("Frequency") + ggtitle("Frequency distribution for mean dollar overspend differences between control and treatment; \nThe vertical line shows the true observed difference") ggsave(file.path(here::here(), "fig/permutation_test.png"), p, width = 12) print(p)
The permuted differences exceed the observed differences 43.5% of the time. The observed difference is likely not statistically significant.
mean(perm_diffs > mean_control - mean_treatment)
Capped overspending is max(spend - budget, 0).
# Check the distribution of sample means of capped overspend in the control group # Sample means of capped overspend indeed followed (approximately) normal distribution in the control group control <- df %>% filter(treat == FALSE) %>% pull(capped_overspend) set.seed(2021) to_test <- get_sample_means(control, n_inter = 10000, sample_size = 5000) p <- tibble(to_test = to_test) %>% ggplot(aes(x = to_test)) + geom_histogram(bins = 100, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Mean Capped Overspend for Sample Size = 5,000 (Control Group)")) ggsave(file.path(here::here(), "fig/capped_overspend_sampling_statistics_control.png"), p, width = 12) print(p)
# Check the distribution of sample means of capped overspend in the treatment group # Sample means of capped overspend indeed followed (approximately) normal distribution in the treatment group treatment <- df %>% filter(treat == TRUE) %>% pull(capped_overspend) set.seed(2021) to_test <- get_sample_means(treatment, n_inter = 10000, sample_size = 5000) p <- tibble(to_test = to_test) %>% ggplot(aes(x = to_test)) + geom_histogram(bins = 100, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Mean Capped Overspend for Sample Size = 5,000 (Treatment Group)")) ggsave(file.path(here::here(), 'fig/capped_overspend_sampling_statistics_treatment.png'), p, width = 12) print(p)
Null hypothesis: mean_capped_overspend_treatment >= mean_capped_overspend_control Alternative hypothesis: mean_capped_overspend_treatment < mean_capped_overspend_control
Failed to reject the null hypothesis. In fact, the observed mean (capped) overspend in the treatment group is larger than the observed mean (capped) overspend in the control group.
t.test(capped_overspend ~ treat, data = df, var.equal = FALSE, alternative = 'greater')
Both p-values are < 0.05, the distribution of overspend are statistically significantly different between the control and treatment group, in both raw value and capped values.
Null: The population distributions are the same, P(control > treatment) <= 0.5 Alternative: Observations from control tend to be larger than observations from treatment, P(control > treatment) > 0.5
wilcox.test(overspend ~ treat, data = df, alternative = 'greater')
wilcox.test(capped_overspend ~ treat, data = df, alternative = 'greater')
p1 <- df %>% filter(treat == TRUE) %>% ggplot(aes(x = log_budget)) + geom_histogram(bins = 50, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Distribution of log(campaign_budget) in the Treatment Group")) p2 <- df %>% filter(treat == FALSE) %>% ggplot(aes(x = log_budget)) + geom_histogram(bins = 50, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) + ggtitle(glue("Distribution of log(campaign_budget) in the Control Group")) p <- plot_grid(p1, p2, nrow = 2, ncol = 1) ggsave(file.path(here::here(), 'fig/dist_plot_budget.png'), p) print(p)
One-sided two-sampled Welch t-test.
We can reject the null hypothesis. We estimate the treatment effect of the mean campaign budget in the control group is exp(4.633362 - 4.341255) = 1.34 times larger than the mean campaign budget in the treatment group, with statistical significant. Therefore, the treatment effect reduces campaign budget by approximately 26%.
t.test(log_budget ~ treat, data = df, var.equal = FALSE, alternative = 'greater')
model <- lm(log_budget ~ treat + size + treat:size, data = df) summary(model)
We can observe that the interaction term is statistically significant.
anova(model)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.