# define some helper and reporting functions
# calculate area under the curve of numeric vectors x,y
# length(x)==length(y)
# y>=0, 0<=x<=1 x non-empty, and x strictly increasing
areaCalc <- function(x, y) {
# append extra points to get rid of degenerate cases
if (!all(diff(x) > 0)) {
stop("areaCalc x wasn't strinctly increasing")
}
if (x[1] < 0) {
x <- c(0, x)
y <- c(0, y)
}
if (x[length(x)] < 1) {
x <- c(x, 1)
y <- c(y, 1)
}
n <- length(x)
sum(0.5 * (y[-1] + y[-n]) * (x[-1] - x[-n]))
}
relativeGiniScore <- function(modelValues, yValues) {
d = data.frame(predcol = modelValues, truthcol = yValues)
n <- nrow(d)
predord = order(d[['predcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
wizard = order(d[['truthcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results = data.frame(
pctpop = (1:npop) / npop,
model = cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']]),
wizard = cumsum(d[wizard, 'truthcol']) / sum(d[['truthcol']])
)
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(results$pctpop, results$wizard) - 0.5
modelArea = areaCalc(results$pctpop, results$model) - 0.5
modelArea / idealArea # actually, normalized gini score
}
# sample with respect to multiple orders to get smooth sampling
thin_frame_by_orders <- function(d, cols, groupcol, large_count) {
n <- nrow(d)
if(n<=(length(cols)+1)*large_count) {
return(d)
}
takes <- c()
for(ci in cols) {
ordi <- order(d[[groupcol]],
d[[ci]],
sample.int(n, n, replace = FALSE))
takesi <- seq(1, n, length.out = large_count)
sg <- d[[groupcol]][ordi]
deltas <- which(sg[-1]!=sg[-n])
boundsi <- NULL
if(length(deltas>1)) {
boundsi <- pmin(n, c(deltas, 1+deltas))
}
invperm <- wrapr::invert_perm(ordi)
takes <- sort(unique(c(takes,
invperm[takesi],
invperm[boundsi])))
}
if(2*length(takes)>=n) {
return(d)
}
d[takes, , drop = FALSE]
}
#' Plot the cumulative gain curve of a sort-order.
#'
#' Plot the cumulative gain curve of a sort-order.
#'
#' The use case for this visualization is to compare a predictive model
#' score to an actual outcome (either binary (0/1) or continuous). In this case the
#' gain curve plot measures how well the model score sorts the data compared
#' to the true outcome value.
#'
#' The x-axis represents the fraction of items seen when sorted by score, and the
#' y-axis represents the cumulative summed true outcome represented by the items seen so far.
#' See, for example,
#' \url{https://www.ibm.com/docs/SSLVMB_24.0.0/spss/tutorials/mlp_bankloan_outputtype_02.html}.
#'
#' For comparison, \code{GainCurvePlot} also plots the "wizard curve": the gain curve when the
#' data is sorted according to its true outcome.
#'
#' To improve presentation quality, the plot is limited to approximately \code{large_count} points (default: 1000).
#' For larger data sets, the data is appropriately randomly sampled down before plotting.
#'
#'
#' @param frame data frame to get values from
#' @param xvar name of the independent (input or model score) column in frame
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param estimate_sig logical, if TRUE compute significance.
#' @param large_count numeric, upper bound target for number of plotting points.
#' @param truth_target if not NULL compare to this scalar value.
#' @param model_color color for the model curve
#' @param wizard_color color for the "wizard" (best possible) curve
#' @param shadow_color color for the shaded area under the curve
#'
#' @examples
#'
#' if (requireNamespace('data.table', quietly = TRUE)) {
#' # don't multi-thread during CRAN checks
#' data.table::setDTthreads(1)
#' }
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' WVPlots::GainCurvePlot(frm, "model", "value",
#' title="Example Continuous Gain Curve")
#'
#' @export
GainCurvePlot = function(frame, xvar, truthVar, title,
...,
estimate_sig = FALSE,
large_count = 1000,
truth_target = NULL,
model_color='darkblue',
wizard_color='darkgreen',
shadow_color='darkgray') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = list(xvar = xvar, truthVar = truthVar),
title = title,
funname = "WVPlots::GainCurvePlot")
pctpop <- NULL # used as a symbol, declare not an unbound variable
pct_outcome <-
NULL # used as a symbol, declare not an unbound variable
sort_criterion <-
NULL # used as a symbol, declare not an unbound variable
if(!is.null(truth_target)) {
truthcol <- as.numeric(frame[[truthVar]]==truth_target)
} else {
truthcol <- as.numeric(frame[[truthVar]])
}
predcol <- as.numeric(frame[[xvar]])
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol, truthcol = truthcol)
n <- nrow(d)
predord = order(d[['predcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
wizard = order(d[['truthcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results = data.frame(
pctpop = (1:npop) / npop,
model = cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']]),
wizard = cumsum(d[wizard, 'truthcol']) / sum(d[['truthcol']])
)
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(results$pctpop, results$wizard) - 0.5
modelArea = areaCalc(results$pctpop, results$model) - 0.5
relGiniScore = modelArea / idealArea # actually, normalized gini score
# transform the frame into the tall form, for plotting
r1 <- data.frame(pctpop = results$pctpop,
pct_outcome = results$model,
sort_criterion = "model",
stringsAsFactors = FALSE)
r2 <- data.frame(pctpop = results$pctpop,
pct_outcome = results$wizard,
sort_criterion = "wizard",
stringsAsFactors = FALSE)
results <- rbind(r1, r2, stringsAsFactors = FALSE)
# rename sort_criterion
sortKeyM <- c('model' = paste('model: sort by', xvar),
'wizard' = paste('wizard: sort by', truthVar))
results$sort_criterion <- sortKeyM[results$sort_criterion]
# rename levels of sort criterion
colorKey = as.character(sortKeyM) %:=% c(model_color, wizard_color)
names(colorKey) = c(paste('model: sort by', xvar),
paste('wizard: sort by', truthVar))
modelKey = names(colorKey)[[1]]
pString <- ''
if(estimate_sig && requireNamespace('sigr', quietly = TRUE)) {
sp <-
sigr::permutationScoreModel(predcol, truthcol, relativeGiniScore)
pString <-
sigr::render(sigr::wrapSignificance(sp$pValue), format = 'ascii')
pString <-
paste0('\nalt. hyp.: relGini(',
xvar,
')>permuted relGini, ',
pString)
}
# cut down the number of points
results <- thin_frame_by_orders(results,
c("pctpop", "pct_outcome"),
"sort_criterion",
large_count)
# plot
gplot = ggplot2::ggplot(data = results) +
ggplot2::geom_point(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
shape = sort_criterion
),
alpha = 0.5
) +
ggplot2::geom_line(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
linetype = sort_criterion
)
) +
ggplot2::geom_abline(
color = 'gray',
slope = 1,
intercept = 0
) +
ggplot2::geom_ribbon(
data = results[results$sort_criterion == modelKey, , drop = FALSE],
mapping = ggplot2::aes(
x = pctpop,
ymin = pctpop,
ymax = pct_outcome,
color = sort_criterion
),
alpha = 0.3,
color = NA,
fill = shadow_color
) +
ggplot2::ggtitle(
paste0(
title,
'\n',
truthVar,
'~',
xvar),
subtitle=paste0(
'Gini score: ',
format(modelArea, digits = 2),
', relative Gini score: ',
format(relGiniScore, digits = 2),
pString
)
) +
ggplot2::xlab("fraction items in sort order") +
ggplot2::ylab(paste("fraction total sum", truthVar)) +
ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1)) +
ggplot2::scale_y_continuous(breaks = seq(0, 1, 0.1)) +
ggplot2::scale_color_manual(values = colorKey) +
ggplot2::coord_fixed() +
ggplot2::theme(legend.position = "bottom")
gplot
}
makeRelativeGiniCostScorer <- function(costcol) {
force(costcol)
function(modelValues, yValues) {
truthcol <- yValues
predcol <- modelValues
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol,
truthcol = truthcol,
costcol = costcol)
n <- nrow(d)
predord = order(d[['predcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
wizard = order(d[['truthcol']] / d[['costcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
mName = paste("model: sort by model")
resultsM = data.frame(
pctpop = cumsum(d[predord, 'costcol']) / sum(d[['costcol']]),
pct_outcome = cumsum(d[predord, 'truthcol']) /
sum(d[['truthcol']]),
sort_criterion = mName
)
wName = paste("wizard: sort by varlue/cost")
resultsW = data.frame(
pctpop = cumsum(d[wizard, 'costcol']) / sum(d[['costcol']]),
pct_outcome = cumsum(d[wizard, 'truthcol']) /
sum(d[['truthcol']]),
sort_criterion = wName
)
results = rbind(resultsM, resultsW, stringsAsFactors = FALSE)
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(resultsW$pctpop, resultsW$pct_outcome) - 0.5
modelArea = areaCalc(resultsM$pctpop, resultsM$pct_outcome) - 0.5
modelArea / idealArea # actually, normalized gini score
}
}
#' Plot the cumulative gain curve of a sort-order with costs.
#'
#' Plot the cumulative gain curve of a sort-order with costs.
#'
#' \code{GainCurvePlotC} plots a cumulative gain curve for the case where
#' items have an additional cost, in addition to an outcome value.
#'
#' The x-axis represents the fraction of total cost experienced when items are sorted by score, and the
#' y-axis represents the cumulative summed true outcome represented by the items seen so far.
#'
#' For comparison, \code{GainCurvePlotC} also plots the "wizard curve": the gain curve when the
#' data is sorted according to its true outcome/cost (the optimal sort order).
#'
#' To improve presentation quality, the plot is limited to approximately \code{large_count} points (default: 1000).
#' For larger data sets, the data is appropriately randomly sampled down before plotting.
#'
#'
#' @param frame data frame to get values from
#' @param xvar name of the independent (input or model score) column in frame
#' @param costVar cost of each item (drives x-axis sum)
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param estimate_sig logical, if TRUE compute significance
#' @param large_count numeric, upper bound target for number of plotting points
#' @param model_color color for the model curve
#' @param wizard_color color for the "wizard" (best possible) curve
#' @param shadow_color color for the shaded area under the curve
#'
#' @seealso \code{\link{GainCurvePlot}}
#'
#' @examples
#'
#' if (requireNamespace('data.table', quietly = TRUE)) {
#' # don't multi-thread during CRAN checks
#' data.table::setDTthreads(1)
#' }
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' frm$costs=1
#' frm$costs[1]=5
#' WVPlots::GainCurvePlotC(frm, "model", "costs", "value",
#' title="Example Continuous Gain CurveC")
#'
#' @export
GainCurvePlotC = function(frame, xvar, costVar, truthVar, title,
...,
estimate_sig = FALSE,
large_count = 1000,
model_color='darkblue',
wizard_color='darkgreen',
shadow_color='darkgray') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = list(xvar = xvar, costVar= costVar, truthVar = truthVar),
title = title,
funname = "WVPlots::GainCurvePlotC")
pctpop <- NULL # used as a symbol, declare not an unbound variable
pct_outcome <-
NULL # used as a symbol, declare not an unbound variable
sort_criterion <-
NULL # used as a symbol, declare not an unbound variable
truthcol <- as.numeric(frame[[truthVar]])
predcol <- as.numeric(frame[[xvar]])
costcol <- as.numeric(frame[[costVar]])
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol,
truthcol = truthcol,
costcol = costcol)
n <- nrow(d)
predord = order(d[['predcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
wizard = order(d[['truthcol']] / d[['costcol']],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
npop = dim(d)[1]
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
mName = paste("model: sort by", xvar)
resultsM = data.frame(
pctpop = cumsum(d[predord, 'costcol']) / sum(d[['costcol']]),
pct_outcome = cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']]),
sort_criterion = mName
)
wName = paste("wizard: sort by ", truthVar, '/', costVar)
resultsW = data.frame(
pctpop = cumsum(d[wizard, 'costcol']) / sum(d[['costcol']]),
pct_outcome = cumsum(d[wizard, 'truthcol']) / sum(d[['truthcol']]),
sort_criterion = wName
)
results = rbind(resultsM, resultsW, stringsAsFactors = FALSE)
# calculate the areas under each curve
# gini score is 2* (area - 0.5)
idealArea = areaCalc(resultsW$pctpop, resultsW$pct_outcome) - 0.5
modelArea = areaCalc(resultsM$pctpop, resultsM$pct_outcome) - 0.5
relGiniScore = modelArea / idealArea # actually, normalized gini score
# map names to colors
colorKey = c('model' = model_color, 'wizard' = wizard_color)
names(colorKey) = c(mName, wName)
modelKey = mName
pString <- ''
if (estimate_sig && requireNamespace('sigr', quietly = TRUE)) {
relativeGiniCostScorer <- makeRelativeGiniCostScorer(costcol)
sp <-
sigr::permutationScoreModel(predcol, truthcol, relativeGiniCostScorer)
pString <-
sigr::render(sigr::wrapSignificance(sp$pValue), format = 'ascii')
pString <-
paste0('\nalt. hyp.: relGini(',
xvar,
')>permuted relGini, ',
pString)
}
# cut down the number of points
results <- thin_frame_by_orders(results,
c("pctpop", "pct_outcome"),
"sort_criterion",
large_count)
# plot
gplot = ggplot2::ggplot(data = results) +
ggplot2::geom_point(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
shape = sort_criterion
),
alpha = 0.5
) +
ggplot2::geom_line(
mapping = ggplot2::aes(
x = pctpop,
y = pct_outcome,
color = sort_criterion,
linetype = sort_criterion
)
) +
ggplot2::geom_abline(
color = "gray",
slope = 1,
intercept = 0
) +
ggplot2::geom_ribbon(
data = results[results$sort_criterion == modelKey, , drop = FALSE],
mapping = ggplot2::aes(
x = pctpop,
ymin = pctpop,
ymax = pct_outcome,
color = sort_criterion
),
alpha = 0.2,
color = NA,
fill = shadow_color
) +
ggplot2::ggtitle(
paste0(
title,
'\n',
truthVar,
'~',
xvar),
subtitle=paste0(
'Gini score: ',
format(modelArea, digits = 2),
', relative Gini score: ',
format(relGiniScore, digits = 2),
pString
)
) +
ggplot2::xlab(paste("fraction of sum", costVar, " in sort order")) +
ggplot2::ylab(paste("fraction total sum", truthVar)) +
ggplot2::scale_x_continuous(breaks = seq(0, 1, 0.1)) +
ggplot2::scale_y_continuous(breaks = seq(0, 1, 0.1)) +
ggplot2::scale_color_manual(values = colorKey) +
ggplot2::coord_fixed() +
ggplot2::theme(legend.position = "bottom")
gplot
}
# --------------------------------------------------------------
# find the y value that approximately corresponds to an x value on the gain curve
get_gainy = function(frame, xvar, truthVar, gainx) {
# The sort order for predicted salary, decreasing
n <- nrow(frame)
ord = order(frame[[xvar]],
sample.int(n, n, replace = FALSE),
decreasing = TRUE)
# top 25 predicted salaries
n = round(nrow(frame) * gainx)
topN = ord[1:n]
truth_topN = sum(frame[topN, truthVar])
totalY = sum(frame[[truthVar]])
truth_topN / totalY
}
#' Plot the cumulative gain curve of a sort-order with extra notation
#'
#' Plot the cumulative gain curve of a sort-order with extra notation.
#'
#' This is the standard gain curve plot (see \code{\link{GainCurvePlot}}) with
#' a label attached to a particular value of x. The label is created by
#' a function \code{labelfun}, which takes as inputs the x and y coordinates
#' of a label and returns a string (the label).
#'
#' By default, uses the model to calculate the y value of the calculated point;
#' to use the wizard curve, set \code{sort_by_model = FALSE}
#'
#' @param frame data frame to get values from
#' @param xvar name of the independent (input or model score) column in frame
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param gainx the point on the x axis corresponding to the desired label
#' @param labelfun a function to return a label for the marked point
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param sort_by_model logical, if TRUE use the model to calculate gainy, else use wizard.
#' @param estimate_sig logical, if TRUE compute significance
#' @param large_count numeric, upper bound target for number of plotting points
#' @param model_color color for the model curve
#' @param wizard_color color for the "wizard" (best possible) curve
#' @param shadow_color color for the shaded area under the curve
#' @param crosshair_color color for the annotation location lines
#' @param text_color color for the annotation text
#' @seealso \code{\link{GainCurvePlot}}
#'
#' @examples
#'
#' if (requireNamespace('data.table', quietly = TRUE)) {
#' # don't multi-thread during CRAN checks
#' data.table::setDTthreads(1)
#' }
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' gainx = 0.25 # get the predicted top 25% most valuable points as sorted by the model
#' # make a function to calculate the label for the annotated point
#' labelfun = function(gx, gy) {
#' pctx = gx*100
#' pcty = gy*100
#'
#' paste("The predicted top ", pctx, "% most valuable points by the model\n",
#' "are ", pcty, "% of total actual value", sep='')
#' }
#' WVPlots::GainCurvePlotWithNotation(frm, "model", "value",
#' title="Example Gain Curve with annotation",
#' gainx=gainx,labelfun=labelfun)
#'
#' # now get the top 25% actual most valuable points
#'
#'labelfun = function(gx, gy) {
#' pctx = gx*100
#' pcty = gy*100
#'
#' paste("The actual top ", pctx, "% most valuable points\n",
#' "are ", pcty, "% of total actual value", sep='')
#' }
#'
#' WVPlots::GainCurvePlotWithNotation(frm, "model", "value",
#' title="Example Gain Curve with annotation",
#' gainx=gainx,labelfun=labelfun, sort_by_model=FALSE)
#'
#' @export
GainCurvePlotWithNotation = function(frame,
xvar,
truthVar,
title,
gainx,
labelfun,
...,
sort_by_model = TRUE,
estimate_sig = FALSE,
large_count = 1000,
model_color='darkblue',
wizard_color='darkgreen',
shadow_color='darkgray',
crosshair_color = 'red',
text_color='black') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = list(xvar = xvar, truthVar = truthVar),
title = title,
funname = "WVPlots::GainCurvePlotWithNotation")
if(sort_by_model) {
gainy = get_gainy(frame, xvar, truthVar, gainx)
}
else {
gainy = get_gainy(frame, truthVar, truthVar, gainx)
}
gainy_p = round(100 * gainy) / 100 # two sig figs
label = labelfun(gainx, gainy_p)
gp = GainCurvePlot(frame, xvar, truthVar, title,
estimate_sig = estimate_sig,
large_count = large_count,
model_color = model_color,
wizard_color = wizard_color,
shadow_color = shadow_color) +
ggplot2::geom_vline(xintercept = gainx,
color = crosshair_color,
alpha = 0.5) +
ggplot2::geom_hline(yintercept = gainy,
color = crosshair_color,
alpha = 0.5) +
ggplot2::scale_shape_discrete(guide = "none") +
ggplot2::annotate(
geom = "text",
x = gainx + 0.01,
y = gainy - 0.01,
color = text_color,
label = label,
vjust = "top",
hjust = "left"
)
gp
}
#' Plot the cumulative gain curves of a sort-order.
#'
#' Plot the cumulative gain curves of a sort-order.
#'
#' The use case for this visualization is to compare a predictive model
#' score to an actual outcome (either binary (0/1) or continuous). In this case the
#' gain curve plot measures how well the model score sorts the data compared
#' to the true outcome value.
#'
#' The x-axis represents the fraction of items seen when sorted by score, and the
#' y-axis represents the gain seen so far (cumulative value of model over cummulative value of random selection)..
#'
#'
#'
#' @param frame data frame to get values from
#' @param xvars name of the independent (input or model score) columns in frame
#' @param truthVar name of the dependent (output or result to be modeled) column in frame
#' @param title title to place on plot
#' @param ... no unnamed argument, added to force named binding of later arguments.
#' @param truth_target if not NULL compare to this scalar value.
#' @param palette color palette for the model curves
#'
#' @examples
#'
#' if (requireNamespace('data.table', quietly = TRUE)) {
#' # don't multi-thread during CRAN checks
#' data.table::setDTthreads(1)
#' }
#'
#' set.seed(34903490)
#' y = abs(rnorm(20)) + 0.1
#' x = abs(y + 0.5*rnorm(20))
#' frm = data.frame(model=x, value=y)
#' WVPlots::GainCurvePlotList(frm, c("model", "value"), "value",
#' title="Example Continuous gain Curves")
#'
#' @export
GainCurvePlotList = function(frame, xvars, truthVar, title,
...,
truth_target = NULL,
palette = 'Dark2') {
frame <- check_frame_args_list(...,
frame = frame,
name_var_list = c(xvars = xvars, truthVar = truthVar),
title = title,
funname = "WVPlots::GainCurvePlot")
curve <- percent_total <- NULL # mark as not unbound
pct_outcome <- pctpop <- sort_criterion <- NULL # mark as not unbound variables
if(!is.null(truth_target)) {
truthcol <- as.numeric(frame[[truthVar]]==truth_target)
} else {
truthcol <- as.numeric(frame[[truthVar]])
}
n <- nrow(frame)
# data frame the cumulative prediction/truth as a function
# of the fraction of the population we're considering, highest first
results <- data.frame(
pctpop = (1:n) / n
)
for(xvar in xvars) {
predcol <- as.numeric(frame[[xvar]])
# data frame of pred and truth, sorted in order of the predictions
d = data.frame(predcol = predcol, truthcol = truthcol)
predord <- order(d$predcol,
sample.int(n, n, replace = FALSE),
decreasing = TRUE) # reorder, with highest first
gain <- cumsum(d[predord, 'truthcol']) / sum(d[['truthcol']])
results[[xvar]] <- gain
}
# transform the frame into the tall form, for plotting
results <- cdata::pivot_to_blocks(results,
nameForNewKeyColumn = 'curve',
nameForNewValueColumn = 'percent_total',
columnsToTakeFrom = setdiff(colnames(results), 'pctpop'))
# plot
gplot = ggplot2::ggplot(
data = results,
mapping = ggplot2::aes(
x = pctpop,
y = percent_total,
color = curve)) +
ggplot2::geom_point(alpha = 0.5) +
ggplot2::geom_line() +
ggplot2::scale_color_brewer(palette = palette) +
ggplot2::xlab("fraction items in sort order") +
ggplot2::ylab("percent of total value") +
ggplot2::coord_fixed() +
ggplot2::theme(legend.position = "bottom")
gplot
}
#' @export
#' @rdname GainCurvePlotList
GainCurveListPlot <- GainCurvePlotList
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.