# inst/examples/ex-mk_cdfplot.R

library(ezplot)
library(dplyr)
library(ggplot2)

# --- example 1 --- #

# Simulate 200 normal draws with mean 0: the first 100 with sd = 3 and the
# next 100 with sd = 10; `gp` is a two-level factor labelling the two halves.
# Fix the seed so that the plots and the printed range are reproducible.
set.seed(1)
df = data.frame(x = c(rnorm(100, 0, 3), rnorm(100, 0, 10)), gp = gl(2, 100))
f = mk_cdfplot(df)
f('x') # default geom is 'step'
f('x', geom = 'point')

range(df$x) # print the observed range of x
f('x', pad = FALSE) # do not extend the curve to -Inf and +Inf horizontally
f('x', colorby = 'gp', add_vline_median = TRUE, font_size = 10)

# --- example 2 --- #

# number of events that occurred per unit time (per minute); the divisor is the
# number of minutes in 24 hours
rate = nrow(births) / (24 * 60)
avg = 1 / rate # mean time between events
model_str = paste0('Model: exponential(rate = ', round(rate, 3), '), ',
                   'mean time between events = ', round(avg, 2))

# Stack the observed data (rows with a non-missing `diffs`, labelled 'Data')
# on top of 160 draws from an exponential model whose rate is determined by
# the observed sample data. The seed is set right before the random draw.
set.seed(9203)
dat = rbind(
        births %>%
                filter(!is.na(diffs)) %>%
                mutate(type = 'Data') %>%
                select(type, diffs),
        data.frame(type = model_str, diffs = rexp(160, rate))
)

# compare the two CDFs, colored by `type`
f = mk_cdfplot(dat)
f("diffs", colorby = 'type', pad = FALSE, legend_title = NULL,
  legend_pos = 'top', add_vline_median = TRUE, show_label_median = FALSE) %>%
        scale_axis(axis = 'x', scale = 'comma') %>%
        add_labs(xlab = 'Minutes between births')

# --- example 3 --- #

# drop the rows with a missing `diffs` before building the plotting function
f = mk_cdfplot(births %>% filter(!is.na(diffs)))

# plot CDF
f('diffs', complement = FALSE) %>% scale_axis('x')

# plot Complement CDF (CCDF)
p = f('diffs', complement = TRUE) %>% scale_axis('x')
print(p)

# use log-y scale for CCDF: if the curve is linear, the data is exponentially
# distributed.
p %>% scale_axis(scale = 'log', nticks = 6) %>%
        scale_axis('x') %>%
        square_fig() %>%
        # the absolute value of the slope is the rate of the exponential distribution
        add_lm_line(linew = 0.7)

# plot CDFs of each gender
f('diffs', colorby = 'sex', complement = FALSE)

# plot CCDFs of each gender
f('diffs', colorby = 'sex', complement = TRUE) %>%
        # can also use log10-y scale; the slope is then -rate * log10(exp(1)),
        # so rate = abs(slope) / log10(exp(1)), i.e. abs(slope) * log(10)
        scale_axis(scale = 'log10') %>%
        square_fig()


# --- example 4 --- #

# make fake data: exponential scores whose rate grows with the row index, a
# random training/validation label, the score's rank rescaled to (0, 1], and a
# binary outcome that is more likely to be 1 when the rank is low
set.seed(123)
n = 200
df = data.frame(model_score = rexp(n = n, rate = 1:n),
                obs_set = sample(c("training", "validation"), n, replace = TRUE))
df$model_rank = rank(df$model_score) / n
df$target_outcome = rbinom(n, 1, 1 - df$model_rank)

# plot the CDF of the rank among rows where the target outcome occurred
f = mk_cdfplot(subset(df, target_outcome == 1))
f('model_rank', colorby = 'obs_set')
f('model_rank', colorby = 'obs_set', pad = FALSE)

p = f('model_rank', colorby = 'obs_set', add_vline_median = TRUE,
      legend_title = NULL, font_size = 12)
print(p)

# show both axes as percentages on an equal-ratio coordinate system
p %>% scale_axis(axis = 'y', scale = 'pct') %>%
        scale_axis(axis = 'x', scale = 'pct') +
        coord_equal()

# same plot with labels, a title and a dashed reference line
p %>% scale_axis(axis = 'y', scale = 'pct', digits = 1) %>%
        scale_axis(axis = 'x', scale = 'pct', digits = 1) %>%
        add_labs(xlab = "Model Percentile", ylab = "Percent of Target Outcome",
                 title = "Gain Chart") +
        coord_equal() +
        # annotate() draws the segment exactly once; geom_segment() with
        # constant aesthetics would redraw it for every row of the plot data.
        # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines.
        annotate("segment", x = 0, y = 0, xend = 0.9, yend = 1,
                 color = "gray", linetype = "longdash", size = 1)
# gmlang/ezplot documentation built on Sept. 18, 2022, 6:33 a.m.