WVPlots_examples.R
In WVPlots: Common Plots for Analysis

## ----echo=FALSE---------------------------------------------------------------
knitr::opts_chunk$set(fig.width=6, fig.height=6)
if (requireNamespace('data.table', quietly = TRUE)) {
	# don't multi-thread during CRAN checks
	data.table::setDTthreads(1)
}

## -----------------------------------------------------------------------------
set.seed(34903490)
x = rnorm(50)
y = 0.5*x^2 + 2*x + rnorm(length(x))
frm = data.frame(
  x = x,
  y = y,
  yC = y>=as.numeric(quantile(y,probs=0.8)),
  stringsAsFactors = FALSE)
frm$absY <- abs(frm$y)
frm$posY = frm$y > 0

## -----------------------------------------------------------------------------
WVPlots::ScatterHist(frm, "x", "y", title="Example Fit")

## -----------------------------------------------------------------------------
WVPlots::ScatterHist(frm, "x", "y", smoothmethod="lm", 
                     title="Example Linear Fit", estimate_sig = TRUE)

## -----------------------------------------------------------------------------
WVPlots::ScatterHist(frm, "x", "y", smoothmethod="identity", 
                     title="Example Relation Plot", estimate_sig = TRUE)

## -----------------------------------------------------------------------------
set.seed(34903490)
fmScatterHistC = data.frame(
  x=rnorm(50),
  y=rnorm(50),
  stringsAsFactors = FALSE)
fmScatterHistC$cat <- fmScatterHistC$x+fmScatterHistC$y>0
WVPlots::ScatterHistC(fmScatterHistC, "x", "y", "cat", title="Example Conditional Distribution")

## -----------------------------------------------------------------------------
set.seed(34903490)
frmScatterHistN = data.frame(
  x=rnorm(50),
  y=rnorm(50),
  stringsAsFactors = FALSE)
frmScatterHistN$z <- frmScatterHistN$x+frmScatterHistN$y
WVPlots::ScatterHistN(frmScatterHistN, "x", "y", "z", title="Example Joint Distribution")

## -----------------------------------------------------------------------------
WVPlots::BinaryYScatterPlot(frm, "x", "posY", use_glm=FALSE,
                            title="Example 'Probability of Y' Plot (ggplot2 smoothing)")

WVPlots::BinaryYScatterPlot(frm, "x", "posY", use_glm=TRUE, 
                            title="Example 'Probability of Y' Plot (GLM smoothing)")

## -----------------------------------------------------------------------------
if(requireNamespace("hexbin", quietly = TRUE)) {
  set.seed(5353636)
  
  df = rbind(data.frame(x=rnorm(1000, mean = 1), 
                        y=rnorm(1000, mean = 1, sd = 0.5 ),
                        stringsAsFactors = FALSE),
             data.frame(x = rnorm(1000, mean = -1, sd = 0.5), 
                        y = rnorm(1000, mean = -1, sd = 0.5),
                        stringsAsFactors = FALSE),
             stringsAsFactors = FALSE)
  
  print(WVPlots::HexBinPlot(df, "x", "y", "Two gaussians"))
}


## -----------------------------------------------------------------------------
set.seed(34903490)
y = abs(rnorm(20)) + 0.1
x = abs(y + 0.5*rnorm(20))

frm = data.frame(
  model=x, 
  value=y,
  stringsAsFactors = FALSE)

frm$costs=1
frm$costs[1]=5
frm$rate = with(frm, value/costs)

frm$isValuable = (frm$value >= as.numeric(quantile(frm$value, probs=0.8)))


## -----------------------------------------------------------------------------
WVPlots::GainCurvePlot(frm, "model", "value", title="Example Continuous Gain Curve")

## -----------------------------------------------------------------------------

gainx = 0.10  # get the top 10% most valuable points as sorted by the model

# make a function to calculate the label for the annotated point
labelfun = function(gx, gy) {
  pctx = gx*100
  pcty = gy*100
  
  paste("The top ", pctx, "% most valuable points by the model\n",
        "are ", pcty, "% of total actual value", sep='')
}

WVPlots::GainCurvePlotWithNotation(frm, "model", "value", 
                                   title="Example Gain Curve with annotation", 
                          gainx=gainx,labelfun=labelfun) 



## -----------------------------------------------------------------------------
WVPlots::GainCurvePlotC(frm, "model", "costs", "value", title="Example Continuous Gain CurveC")

## -----------------------------------------------------------------------------
set.seed(34903490)

# data with two different regimes of behavior
frm <- rbind(
  data.frame(
    model = rnorm(1000),
    isValuable = sample(c(TRUE, FALSE), prob = c(0.02, 0.98), size = 1000, replace = TRUE)),
  data.frame(
    model = rnorm(200) + 5,
    isValuable = sample(c(TRUE, FALSE), size = 200, replace = TRUE))
)

WVPlots::ROCPlot(frm, "model", "isValuable", TRUE, title="Example ROC plot")

## -----------------------------------------------------------------------------
set.seed(34903490)

x1 = rnorm(50)
x2 = rnorm(length(x1))
y = 0.2*x2^2 + 0.5*x2 + x1 + rnorm(length(x1))
frmP = data.frame(
  x1=x1,
  x2=x2,
  yC = y>=as.numeric(quantile(y,probs=0.8)),
  stringsAsFactors = FALSE)
# WVPlots::ROCPlot(frmP, "x1", "yC", TRUE, title="Example ROC plot")
# WVPlots::ROCPlot(frmP, "x2", "yC", TRUE, title="Example ROC plot")
WVPlots::ROCPlotPair(frmP, "x1", "x2", "yC", TRUE, title="Example ROC pair plot")

## -----------------------------------------------------------------------------
set.seed(2342458)

make_data <- function(nrows) {
    d <- data.frame(x = rnorm(nrows))
    d['y'] = sin(d['x']) + 0.25*rnorm(n = nrows)
    d['x2'] = rnorm(n = nrows)
    d['yc'] = d[['y']]>0.5
    return(d)
}


training <- make_data(500)
test <- make_data(200)

model <- glm(yc ~ x + x2, data=training, family=binomial)

training$pred <- predict(model, newdata=training, type="response")
test$pred <- predict(model, newdata=test, type="response")

WVPlots::ROCPlotPair2(nm1 = "Training", # model 1
                      frame1 = training,
                      xvar1 = "pred",  truthVar1 = "yc", truthTarget1 = TRUE,
                      nm2 ="Test", # model 2
                      frame2 = test,
                      xvar2 = "pred", truthVar2 = "yc", truthTarget2 = TRUE,
                      title = "Model performance, training vs test",
                      estimate_sig = FALSE)

## -----------------------------------------------------------------------------
set.seed(34903490)
x1 = rnorm(50)
x2 = rnorm(length(x1))
x3 = rnorm(length(x1))
y = 0.2*x2^2 + 0.5*x2 + x1 + rnorm(length(x1))
frm_m = data.frame(
   x1 = x1,
   x2 = x2,
   x3 = x3,
   yC = y >= as.numeric(quantile(y,probs=0.8)))
WVPlots::ROCPlotPairList(
   frame = frm_m,
   xvar_names = c("x1", "x2", "x3"),
   truthVar = "yC", truthTarget = TRUE,
   title = "Example ROC list plot")

## -----------------------------------------------------------------------------
WVPlots::PRTPlot(frm, "model", "isValuable", TRUE, title="Example Precision-Recall plot")

## -----------------------------------------------------------------------------
# replicate PRTPlot. Looks a little different because ThresholdPlot does different smoothing
WVPlots::ThresholdPlot(frm, "model", "isValuable", title="Reproduce PRTPlot",
              truth_target=TRUE, # default
              metrics = c("precision", "recall"))

# default: sensitivity/specificity
WVPlots::ThresholdPlot(frm, "model", "isValuable", 
                       title="Sensitivity and Specificity as a Function of Threshold")


## -----------------------------------------------------------------------------
WVPlots::ThresholdPlot(frm, "model", "isValuable", title="ROC 'unrolled'",
              metrics = c("true_positive_rate", "false_positive_rate"))

## -----------------------------------------------------------------------------
WVPlots::DoubleDensityPlot(frm, "model", "isValuable", title="Example double density plot")

## -----------------------------------------------------------------------------
WVPlots::DoubleHistogramPlot(frm, "model", "isValuable", title="Example double histogram plot")

## -----------------------------------------------------------------------------
set.seed(34903490)

# discrete variable: letters of the alphabet
# frequencies of letters in English
# source: http://en.algoritmy.net/article/40379/Letter-frequency-English
letterFreqs = c(8.167, 1.492, 2.782, 4.253, 12.702, 2.228,
                2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929,
                0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.360, 0.150, 1.974, 0.074)
letterFreqs = letterFreqs/100
letterFrame = data.frame(
  letter = letters, 
  freq=letterFreqs, 
  stringsAsFactors = FALSE)

# now let's generate letters according to their letter frequencies
N = 1000
randomDraws = data.frame(
  draw=1:N, 
  letter=sample(letterFrame$letter, size=N, replace=TRUE, prob=letterFrame$freq), 
  stringsAsFactors = FALSE)

WVPlots::ClevelandDotPlot(randomDraws, "letter", title = "Example Cleveland-style dot plot")
WVPlots::ClevelandDotPlot(randomDraws, "letter", limit_n = 10,  title = "Top 10 most frequent letters")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=0, title="Example Cleveland-style dot plot, unsorted")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=1, stem=FALSE, title="Example with increasing sort order + coord_flip, no stem") + ggplot2::coord_flip()


## -----------------------------------------------------------------------------
set.seed(34903490)
N = 1000
ncar_vec = 0:5
prob = c(1.5, 3, 3.5, 2, 1, 0.75); prob = prob/sum(prob)

df = data.frame(
  num_cars = sample(ncar_vec, size = N, replace = TRUE, prob=prob),
  stringsAsFactors = FALSE)
WVPlots::ClevelandDotPlot(df, "num_cars", sort = 0, title = "Distribution of household vehicle ownership")
              


## -----------------------------------------------------------------------------
set.seed(354534)
N = 100

# rough proportions of eye colors
eprobs = c(0.37, 0.36, 0.16, 0.11)

eye_color  = sample(c("Brown", "Blue", "Hazel", "Green"), size = N, replace = TRUE, prob = eprobs)
sex = sample(c("Male", "Female"), size = N, replace = TRUE)

# A data frame of eye color by sex
dframe = data.frame(
  eye_color = eye_color, 
  sex = sex,
  stringsAsFactors = FALSE)

WVPlots::ShadowPlot(dframe, "eye_color", "sex", title = "Shadow plot of eye colors by sex")


## -----------------------------------------------------------------------------
set.seed(354534)
N = 100

dframe = data.frame(
  x = rnorm(N), 
  gp = "region 2", 
  stringsAsFactors = FALSE)
dframe$gp = with(dframe, ifelse(x < -0.5, "region 1", 
                                ifelse(x > 0.5, "region 3", gp)))

WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region")


## -----------------------------------------------------------------------------
WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region", monochrome=TRUE) 

## -----------------------------------------------------------------------------
colormap = c("#1F968BFF", "#29AF7FFF", "#55C667FF")

WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region", palette=NULL) +
  ggplot2::scale_fill_manual(values=colormap)


## -----------------------------------------------------------------------------
classes = c("a", "b", "c")
means = c(2, 4, 3)
names(means) = classes
label = sample(classes, size=1000, replace=TRUE)
meas = means[label] + rnorm(1000)
frm2 = data.frame(label=label,
                  meas = meas,
                  stringsAsFactors = FALSE)

WVPlots::ScatterBoxPlot(frm2, "label", "meas", pt_alpha=0.2, title="Example Scatter/Box plot")
WVPlots::ScatterBoxPlotH(frm2, "meas", "label",  pt_alpha=0.2, title="Example Scatter/Box plot")

## -----------------------------------------------------------------------------
frmx = data.frame(x = rbinom(1000, 20, 0.5),
                  stringsAsFactors = FALSE)
WVPlots::DiscreteDistribution(frmx, "x","Discrete example")

## -----------------------------------------------------------------------------
set.seed(52523)
d <- data.frame(wt=100*rnorm(100),
                stringsAsFactors = FALSE)
WVPlots::PlotDistCountNormal(d,'wt','example')
WVPlots::PlotDistDensityNormal(d,'wt','example')

## -----------------------------------------------------------------------------
set.seed(13951)
trial_size = 20  # one trial is 20 flips
ntrial = 100     # run 100 trials
true_frate = 0.4  # true heads probability
fdata = data.frame(n_heads = rbinom(ntrial, trial_size, true_frate),
                   stringsAsFactors = FALSE)

title = paste("Distribution of head counts, trial size =", trial_size)
# compare to empirical p
WVPlots::PlotDistCountBinomial(fdata, "n_heads", trial_size, title)

## -----------------------------------------------------------------------------
# compare to theoretical p of 0.5
WVPlots::PlotDistCountBinomial(fdata, "n_heads", trial_size, title,
                      p = 0.5)

## -----------------------------------------------------------------------------
set.seed(349521)
N = 100  # number of cohorts
psucc = 0.15  # true success rate in population
group_size = round(runif(N, min=25, 50))  # sizes of observed sample groups
nsucc = rbinom(N, group_size, psucc)   # successes in each group
hdata = data.frame(n_success=nsucc, 
                   group_size=group_size,
                   stringsAsFactors = FALSE)

# observed rate of successes in each group
hdata$rate_success = with(hdata, n_success/group_size)

title = "Observed prevalence of success in population"

WVPlots::PlotDistHistBeta(hdata, "rate_success", title) 
WVPlots::PlotDistDensityBeta(hdata, "rate_success", title)


## -----------------------------------------------------------------------------
y = c(1,2,3,4,5,10,15,18,20,25)
x = seq_len(length(y))
df = data.frame(x=x,
                y=y,
                stringsAsFactors = FALSE)

WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="centered smooth, one group")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="left smooth, one group", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="right smooth, one group", align="right")

n = length(x)
df = rbind(data.frame(x=x, y=y+rnorm(n), gp="times 1", stringsAsFactors = FALSE),
           data.frame(x=x, y=0.5*y + rnorm(n), gp="times 1/2", stringsAsFactors = FALSE),
           data.frame(x=x, y=2*y + rnorm(n), gp="times 2", stringsAsFactors = FALSE),
           stringsAsFactors = FALSE)

WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="centered smooth, multigroup")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="left smooth, multigroup", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="right smooth, multigroup", align="right")

## -----------------------------------------------------------------------------
set.seed(52523)
d = data.frame(meas=rnorm(100), stringsAsFactors = FALSE)
threshold = -1.5
WVPlots::ShadedDensity(d, "meas", threshold, 
                       title="Example shaded density plot, left tail")
WVPlots::ShadedDensity(d, "meas", -threshold, tail="right", 
                       title="Example shaded density plot, right tail")


## -----------------------------------------------------------------------------
set.seed(52523)
d = data.frame(meas=rnorm(100), stringsAsFactors = FALSE)
# first and third quartiles of the data (central 50%)
boundaries = quantile(d$meas, c(0.25, 0.75))
WVPlots::ShadedDensityCenter(d, "meas", boundaries, 
                       title="Example center-shaded density plot")