inst/doc/key_considerations.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  dpi = 100,
  out.width = "60%",
  out.height = "60%",
  fig.height = 4,
  fig.width = 4,
  fig.align = "center"
)

## ----setup--------------------------------------------------------------------
library(correlationfunnel)
library(dplyr)
library(tibble)
library(ggplot2)
library(stringr)
library(forcats)
library(knitr)

## ---- echo=F------------------------------------------------------------------
tribble(
  ~"", ~`y=0`, ~`y=1`,
  "x=0", "f00", "f01",
  "x=1", "f10", "f11"
) %>% knitr::kable()

## -----------------------------------------------------------------------------
# Generate Data
set.seed(1)
linear_data <- tibble(
  sales = rnorm(100, mean = 10, sd = 5) + seq(1, 200,length.out = 100) * 1e6,
  macro_indicator = rnorm(100, mean = 1, sd = 2) + seq(5, 20, length.out = 100)
  ) %>%
  mutate_all(~round(., 2))

# Plot Relationship
linear_data %>%
  ggplot(aes(macro_indicator, sales)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::dollar_format(scale = 1e-6, suffix = "M")) +
  labs(title = "Data with Linear Relationship")
  

## -----------------------------------------------------------------------------
linear_data %>%
  correlate(target = sales) 

## -----------------------------------------------------------------------------
linear_data_corr_tbl <- linear_data %>%
  binarize() %>%
  correlate(sales__150250005.6625_Inf) 

linear_data_corr_tbl

## -----------------------------------------------------------------------------
linear_data_corr_tbl %>%
  plot_correlation_funnel()

## -----------------------------------------------------------------------------
# Generate synthetic data
set.seed(1)
nonlinear_data <- tibble(
  sales = c(
    rnorm(100, mean = 10, sd = 25),
    rnorm(100, mean = 50, sd = 100),
    rnorm(100, mean = 2,  sd = 40)),
  age = c(
    runif(100, min = 20, max = 29),
    runif(100, min = 30, max = 39),
    runif(100, min = 39, max = 59)
  )
) %>%
  mutate(
    sales = ifelse(sales < 0, 0, sales) %>% round(2),
    age   = floor(age)
    )

# Visualize the nonlinear relationship
nonlinear_data %>%
  ggplot(aes(age, sales)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = scales::dollar_format()) +
  expand_limits(y = 300) +
  labs(title = "Data with Non-Linear Relationship")

## -----------------------------------------------------------------------------
nonlinear_data %>%
  correlate(sales)

## -----------------------------------------------------------------------------
nonlinear_data_corr_tbl <- nonlinear_data %>%
  binarize(n_bins = 5) %>%
  correlate(sales__46.552_Inf) 

nonlinear_data_corr_tbl

## -----------------------------------------------------------------------------
nonlinear_data_corr_tbl %>%
  plot_correlation_funnel(limits = c(-0.2, 1))

## -----------------------------------------------------------------------------
data("marketing_campaign_tbl")

marketing_campaign_tbl 

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  pull(CAMPAIGN) %>%
  quantile()

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  ggplot(aes(CAMPAIGN)) +
  geom_histogram() +
  labs(title = "Skewed Data")

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(CAMPAIGN) %>%
  binarize()

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  pull(PDAYS) %>%
  quantile()

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  ggplot(aes(PDAYS)) +
  geom_histogram() +
  labs(title = "Highly Skewed Data")

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(PDAYS) %>%
  binarize()

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(EDUCATION)

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(EDUCATION) %>%
  binarize(one_hot = TRUE)

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(EDUCATION) %>%
  binarize(one_hot = FALSE)

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(JOB) %>%
  binarize(thresh_infreq = 0) %>% 
  glimpse()

## ---- fig.height=8, fig.width=5-----------------------------------------------
marketing_campaign_tbl %>%
  # Count Frequency
  count(JOB) %>%
  mutate(prop = n / sum(n)) %>%
  
  # Format columns for plot
  mutate(JOB = as.factor(JOB) %>% fct_reorder(n)) %>%
  mutate(label_text = str_glue("JOB: {JOB}
                               Count: {n}
                               Prop: {scales::percent(prop)}")) %>%
  
  # Creat viz
  ggplot(aes(JOB, n)) +
  geom_point(aes(size = n)) +
  geom_segment(aes(yend = 0, xend = JOB)) +
  geom_label(aes(label = label_text), size = 3, hjust = "inward") +
  coord_flip() +
  labs(title = "Frequency of Each Level in JOB Feature")

## -----------------------------------------------------------------------------
marketing_campaign_tbl %>%
  select(JOB) %>%
  binarize(thresh_infreq = 0.06, name_infreq = "-OTHER") %>% 
  glimpse()

## ---- fig.height=8------------------------------------------------------------
marketing_campaign_tbl %>%
  binarize(n_bins = 5, thresh_infreq = 0.06, name_infreq = "-OTHER") %>%
  correlate(TERM_DEPOSIT__yes) %>%
  plot_correlation_funnel(alpha = 0.7)

Try the correlationfunnel package in your browser

Any scripts or data that you put into this service are public.

correlationfunnel documentation built on July 1, 2020, 10:28 p.m.