Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
warning = FALSE,
message = FALSE,
dpi = 100,
out.width = "60%",
out.height = "60%",
fig.height = 4,
fig.width = 4,
fig.align = "center"
)
## ----setup--------------------------------------------------------------------
library(correlationfunnel)
library(dplyr)
library(tibble)
library(ggplot2)
library(stringr)
library(forcats)
library(knitr)
## ---- echo=F------------------------------------------------------------------
tribble(
~"", ~`y=0`, ~`y=1`,
"x=0", "f00", "f01",
"x=1", "f10", "f11"
) %>% knitr::kable()
## -----------------------------------------------------------------------------
# Generate Data
set.seed(1)
linear_data <- tibble(
sales = rnorm(100, mean = 10, sd = 5) + seq(1, 200,length.out = 100) * 1e6,
macro_indicator = rnorm(100, mean = 1, sd = 2) + seq(5, 20, length.out = 100)
) %>%
mutate_all(~round(., 2))
# Plot Relationship
linear_data %>%
ggplot(aes(macro_indicator, sales)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
labs(title = "Data with Linear Relationship")
## -----------------------------------------------------------------------------
linear_data %>%
correlate(target = sales)
## -----------------------------------------------------------------------------
linear_data_corr_tbl <- linear_data %>%
binarize() %>%
correlate(sales__150250005.6625_Inf)
linear_data_corr_tbl
## -----------------------------------------------------------------------------
linear_data_corr_tbl %>%
plot_correlation_funnel()
## -----------------------------------------------------------------------------
# Generate synthetic data
set.seed(1)
nonlinear_data <- tibble(
sales = c(
rnorm(100, mean = 10, sd = 25),
rnorm(100, mean = 50, sd = 100),
rnorm(100, mean = 2, sd = 40)),
age = c(
runif(100, min = 20, max = 29),
runif(100, min = 30, max = 39),
runif(100, min = 39, max = 59)
)
) %>%
mutate(
sales = ifelse(sales < 0, 0, sales) %>% round(2),
age = floor(age)
)
# Visualize the nonlinear relationship
nonlinear_data %>%
ggplot(aes(age, sales)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = scales::dollar_format()) +
expand_limits(y = 300) +
labs(title = "Data with Non-Linear Relationship")
## -----------------------------------------------------------------------------
nonlinear_data %>%
correlate(sales)
## -----------------------------------------------------------------------------
nonlinear_data_corr_tbl <- nonlinear_data %>%
binarize(n_bins = 5) %>%
correlate(sales__46.552_Inf)
nonlinear_data_corr_tbl
## -----------------------------------------------------------------------------
nonlinear_data_corr_tbl %>%
plot_correlation_funnel(limits = c(-0.2, 1))
## -----------------------------------------------------------------------------
data("marketing_campaign_tbl")
marketing_campaign_tbl
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
pull(CAMPAIGN) %>%
quantile()
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
ggplot(aes(CAMPAIGN)) +
geom_histogram() +
labs(title = "Skewed Data")
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(CAMPAIGN) %>%
binarize()
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
pull(PDAYS) %>%
quantile()
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
ggplot(aes(PDAYS)) +
geom_histogram() +
labs(title = "Highly Skewed Data")
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(PDAYS) %>%
binarize()
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(EDUCATION)
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(EDUCATION) %>%
binarize(one_hot = TRUE)
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(EDUCATION) %>%
binarize(one_hot = FALSE)
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(JOB) %>%
binarize(thresh_infreq = 0) %>%
glimpse()
## ---- fig.height=8, fig.width=5-----------------------------------------------
marketing_campaign_tbl %>%
# Count Frequency
count(JOB) %>%
mutate(prop = n / sum(n)) %>%
# Format columns for plot
mutate(JOB = as.factor(JOB) %>% fct_reorder(n)) %>%
mutate(label_text = str_glue("JOB: {JOB}
Count: {n}
Prop: {scales::percent(prop)}")) %>%
# Creat viz
ggplot(aes(JOB, n)) +
geom_point(aes(size = n)) +
geom_segment(aes(yend = 0, xend = JOB)) +
geom_label(aes(label = label_text), size = 3, hjust = "inward") +
coord_flip() +
labs(title = "Frequency of Each Level in JOB Feature")
## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
select(JOB) %>%
binarize(thresh_infreq = 0.06, name_infreq = "-OTHER") %>%
glimpse()
## ---- fig.height=8------------------------------------------------------------
marketing_campaign_tbl %>%
binarize(n_bins = 5, thresh_infreq = 0.06, name_infreq = "-OTHER") %>%
correlate(TERM_DEPOSIT__yes) %>%
plot_correlation_funnel(alpha = 0.7)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.