Nothing
#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Mon Sep 30 10:09:06 2019
# ~: user-visible visualization functions
#----------------------------------------------#
#' Sets the defaults of plot_distr
#'
#' The default values of most arguments of \code{\link[fplot]{plot_distr}} can be
#' set with \code{setFplot_distr}.
#'
#' @inheritParams plot_distr
#'
#' @param reset Logical scalar, default is \code{FALSE}. Whether the defaults should be reset.
#'
#' @seealso
#' \code{\link[fplot]{plot_distr}}, \code{\link[fplot]{pdf_fit}}, \code{\link[fplot]{fit.off}}.
#'
#' @return
#' The function `setFplot_distr()` does not return anything, it only sets the default
#' parameters for the function [plot_distr()].
#'
#' The function `getFplot_distr()` returns a named list containing the arguments
#' that have been set with the function `setFplot_distr()`.
#'
#' @examples
#'
#' # Changing the default color set for plot_distr only
#' my_col = c("#36688D", "#F3CD05", "#F49F05", "#F18904", "#BDA589")
#'
#' setFplot_distr(col = my_col, mod.method = "split", border = NA)
#'
#' plot_distr(~ Petal.Length | Species, iris)
#'
#' # Back to normal
#' setFplot_distr(reset = TRUE)
#'
#' plot_distr(~ Petal.Length | Species, iris)
#'
#'
#'
setFplot_distr = function(sorted, log, top, yaxis.num, col, border = "black",
mod.method, at_5, labels.tilted, other,
cumul = FALSE, centered = TRUE, weight.fun, int.categorical,
dict = NULL, mod.title = TRUE, labels.angle, cex.axis,
trunc = 20, trunc.method = "auto", reset = FALSE){
# Function to set the defaults of plot_distr
fm_distr = formals(plot_distr)
arg_list = names(fm_distr)
# arg_no_default = c("fml", "data", "moderator", "weight", "nbins", "bin.size", "legend_options", "yaxis.show", "mod.select", "plot", "sep", "...")
# m = fm_distr[!names(fm_distr) %in% arg_no_default]
# cat(gsub(" = ,", ",", paste0(names(m), " = ", sapply(m, deparse), collapse = ", ")))
check_arg("logical scalar", sorted, log, yaxis.num, labels.tilted, other, cumul)
check_arg("logical scalar", centered, int.categorical, reset)
check_arg_plus(top, "match(frac, nb, none, FALSE)")
if(!missing(top) && top == "FALSE") top = "none"
check_arg(col, "vector")
check_arg(border, "NULL NA | vector(character, integer) NA OK")
check_arg_plus(mod.method, "match(side, split, stack)")
check_arg_plus(at_5, "match(roman, line, FALSE)")
check_arg(weight.fun, "function arg(1,)")
check_arg(dict, "null named character vector | logical scalar")
check_arg(labels.angle, "numeric scalar GE{10} LE{90}")
check_arg(cex.axis, "numeric scalar GT{0}")
check_arg(trunc, "integer scalar GT{5}")
check_arg_plus(trunc.method, "match(auto, mid, right)")
# Getting the existing defaults
opts = getOption("fplot_distr")
if(is.null(opts)){
opts = list()
} else if(!is.list(opts)){
warning("Wrong formatting of option 'fplot_distr', all options are reset.")
opts = list()
} else if(reset){
opts = list()
}
# Saving the default values
mc = match.call()
args_default = intersect(names(mc), arg_list)
# NOTA: we don't allow delayed evaluation => all arguments must have hard values
for(v in args_default){
opts[[v]] = eval(as.name(v))
}
options(fplot_distr = opts)
}
#' @rdname setFplot_distr
getFplot_distr = function(){
opts = getOption("fplot_distr")
if(!is.list(opts)){
warning("Wrong formatting of option 'fplot_distr', all options are reset.")
opts = list()
options(fplot_distr = opts)
}
opts
}
#' Plot distributions, possibly conditional
#'
#' This function plots distributions of items (a bit like an histogram) which can
#' be easily conditioned over.
#'
#' @param fml A formula or a vector. If a formula, it must be of the type:
#' \code{weights ~ var | moderator}. If there are no moderator nor weights, you
#' can use directly a vector, or use a one-sided formula \code{fml = ~var}. You
#' can use multiple variables as weights, if so, you cannot use moderators at the
#' same time. See examples.
#' @param data A data.frame: data set containing the variables in the formula.
#' @param moderator Optional, only if argument \code{fml} is a vector. A vector
#' of moderators.
#' @param weight Optional, only if argument \code{fml} is a vector. A vector of
#' (positive) weights.
#' @param sorted Logical: should the first elements displayed be the most frequent?
#' By default this is the case except for numeric values put to log or to integers.
#' @param log Logical, only used when the data is numeric. If \code{TRUE}, then
#' the data is put to logarithm beforehand. By default numeric values are put to
#' log if the log variation exceeds 3.
#' @param col A vector of colors, default is close to paired. You can also use \dQuote{set1}
#' or \dQuote{paired}.
#' @param border Outer color of the bars. Defaults is \code{"black"}. Use \code{NA}
#' to remove the borders.
#' @param nbins Maximum number of items displayed. The default depends on the number
#' of moderator cases. When there is no moderator, the default is 15, augmented
#' to 20 if there are less than 20 cases.
#' @param bin.size Only used for numeric values. If provided, it creates bins of
#' observations of size \code{bin.size}. It creates bins by default for numeric non-integer data.
#' @param legend_options A list. Other options to be passed to \code{legend} which
#' concerns the legend for the moderator.
#' @param yaxis.show Whether the y-axis should be displayed, default is \code{TRUE}.
#' @param yaxis.num Whether the y-axis should display regular numbers instead of
#' frequencies in percentage points. By default it shows numbers only when the data
#' is weighted with a different function than the sum. For conditionnal distributions,
#' a numeric y-axis can be displayed only when \code{mod.method = "sideTotal"},
#' \code{mod.method = "splitTotal"} or \code{mod.method = "stack"}, since for the
#' within distributions it does not make sense (because the data is rescaled for each moderator).
#' @param yaxis.scale Either `NULL` (default) or equal to `"moderator"`, `"variable"`, or `"total"`.
#' Defines how to scale the y-axis. If `"total"`, each value is scaled to reflect the
#' share in the total population. This is the default for `mod.method = "stack"` or when
#' there is no moderator. The values `"moderator"` and `"variable"` only apply in
#' the presence of moderators. If `"moderator"` the data is scaled to report the share
#' within each moderator value. If `"variable"`, it is the share within each value taken by
#' the main variable (does not work with `mod.method = "split"`).
#' @param mod.select Which moderators to select. By default the top 3 moderators
#' in terms of frequency (or in terms of weight value if there's a weight) are displayed.
#' If provided, it must be a vector of moderator values whose length cannot be greater
#' than 5. Alternatively, you can put an integer between 1 and 5. This argument
#' also accepts regular expressions.
#' @param mod.NA Logical, default is \code{FALSE}. If \code{TRUE}, and if the moderator
#' contains \code{NA} values, all \code{NA} values from the moderator will be treated
#' as a regular case: allows to display the distribution for missing values.
#' @param mod.method A character scalar: either i) \dQuote{split}, the default for
#' categorical data, ii) \dQuote{side}, the default for data in logarithmic form
#' or numeric data, or iii) \dQuote{stack}. This is only used when there is more
#'รน than one moderator. If \code{"split"}: there is one separate histogram for each
#' moderator case. If \code{"side"}: moderators are represented side by side for
#' each value of the variable. If \code{"stack"}: the bars of the moderators are
#' stacked onto each other, the bar heights representing the distribution in the
#' total population. You can use the other arguments \code{within} and \code{total}
#' to say whether the distributions should be within each moderator or over the
#' total distribution.
#' @param labels.tilted Whether there should be tilted labels. Default is \code{FALSE}
#' except when the data is split by moderators (see \code{mod.method}).
#' @param labels.angle Only if the labels of the x-axis are tilted. The angle of the tilt.
#' @param other Logical. Should there be a last column counting for the observations
#' not displayed? Default is \code{TRUE} except when the data is split.
#' @param plot Logical, default is \code{TRUE}. If \code{FALSE} nothing is plotted,
#' only the data is returned.
#' @param sep Positive number. The separation space between the bars. The scale
#' depends on the type of graph.
#' @param centered Logical, default is \code{TRUE}. For numeric data only and when
#' \code{sorted=FALSE}, whether the histogram should be centered on the mode.
#' @param weight.fun A function, by default it is \code{sum}. Aggregate function
#' to be applied to the weight with respect to variable and the moderator. See examples.
#' @param at_5 Equal to \code{FALSE}, \code{"roman"} or \code{"line"}. When plotting
#' categorical variables, adds a small Roman number under every 5 bars
#' (\code{at_5 = "roman"}), or draws a thick axis line every 5 bars (\code{at_5 = "line"}).
#' Helps to get the rank of the bars. The default depends on the type of data --
#' Not implemented when there is a moderator.
#' @param dict A dictionnary to rename the variables names in the axes and legend.
#' Should be a named vector. By default it s the value of \code{getFplot_dict()},
#' which you can set with the function \code{\link[fplot]{setFplot_dict}}.
#' @param mod.title Character scalar. The title of the legend in case there is a
#' moderator. You can set it to \code{TRUE} (the default) to display the moderator
#' name. To display no title, set it to \code{NULL} or \code{FALSE}.
#' @param cex.axis Cex value to be passed to biased labels. By defaults, it finds
#' automatically the right value.
#' @param int.categorical Logical. Whether integers should be treated as categorical
#' variables. By default they are treated as categorical only when their range is
#' small (i.e. smaller than 1000).
#' @param trunc If the main variable is a character, its values are truncaded to
#' \code{trunc} characters. Default is 20. You can set the truncation method with
#' the argument \code{trunc.method}.
#' @param trunc.method If the elements of the x-axis need to be truncated, this
#' is the truncation method. It can be "auto", "right" or "mid".
#' @param cumul Logical, default is \code{FALSE}. If \code{TRUE}, then the cumulative
#' distribution is plotted.
#' @param top What to display on the top of the bars. Can be equal to "frac" (for
#' shares), "nb" or "none". The default depends on the type of the plot. To disable
#' it you can also set it to \code{FALSE} or the empty string.
#' @param ... Other elements to be passed to plot.
#'
#' @details
#' Most default values can be modified with the function \code{\link[fplot]{setFplot_distr}}.
#'
#' @author Laurent Berge
#'
#' @seealso
#' To plot temporal evolutions: \code{\link[fplot]{plot_lines}}. For boxplot: \code{\link[fplot]{plot_box}}.
#' To export graphs: \code{\link[fplot]{pdf_fit}}, \code{\link[fplot]{png_fit}},
#' \code{\link[fplot]{fit.off}}.
#'
#' @return
#' This function returns *invisibly* the output data.table containing the processed data
#' used for plotting. With the argument `plot = FALSE`, only the data is returned.
#'
#' @examples
#'
#' # Data on publications from U.S. institutions
#' data(us_pub_econ)
#'
#' # 0) Let's set a dictionary for a better display of variables
#' setFplot_dict(c(institution = "U.S. Institution", jnl_top_25p = "Top 25% Pub.",
#' jnl_top_5p = "Top 5% Pub.", Frequency = "Publications"))
#'
#' # 1) Let's plot the distribution of publications by institutions:
#' plot_distr(~institution, us_pub_econ)
#'
#' # When there is only the variable, you can use a vector instead:
#' plot_distr(us_pub_econ$institution)
#'
#' # 2) Now the production of institution weighted by journal quality
#' plot_distr(jnl_top_5p ~ institution, us_pub_econ)
#'
#' # You can plot several variables:
#' plot_distr(1 + jnl_top_25p + jnl_top_5p ~ institution, us_pub_econ)
#'
#' # 3) Let's plot the journal distribution for the top 3 institutions
#'
#' # We can get the data from the previous graph
#' graph_data = plot_distr(jnl_top_5p ~ institution, us_pub_econ, plot = FALSE)
#' # And then select the top universities
#' top3_instit = graph_data$x[1:3]
#' top5_instit = graph_data$x[1:5] # we'll use it later
#'
#' # Now the distribution of journals
#' plot_distr(~ journal | institution, us_pub_econ[institution %in% top3_instit])
#' # Alternatively, you can use the argument mod.select:
#' plot_distr(~ journal | institution, us_pub_econ, mod.select = top3_instit)
#'
#' # 3') Same graph as before with "other" column, 5 institutions
#' plot_distr(~ journal | institution, us_pub_econ,
#' mod.select = top5_instit, other = TRUE)
#'
#' #
#' # Example with continuous data
#' #
#'
#' # regular histogram
#' plot_distr(iris$Sepal.Length)
#'
#' # now splitting by species:
#' plot_distr(~ Sepal.Length | Species, iris)
#'
#' # idem but the three distr. are separated:
#' plot_distr(~ Sepal.Length | Species, iris, mod.method = "split")
#'
#' # Now the three are stacked
#' plot_distr(~ Sepal.Length | Species, iris, mod.method = "stack")
#'
#'
#'
plot_distr = function(fml, data, moderator, weight, sorted, log, nbins, bin.size,
legend_options = list(), top, yaxis.show = TRUE, yaxis.num,
yaxis.scale = NULL,
col, border = "black", mod.method, mod.select,
mod.NA = FALSE, at_5, labels.tilted, other, cumul = FALSE,
plot = TRUE, sep, centered = TRUE, weight.fun, int.categorical,
dict = NULL, mod.title = TRUE, labels.angle, cex.axis,
trunc = 20, trunc.method = "auto", ...){
# This function plots frequencies
# DT VARS
total_variable = total_moderator = x_nb = isOther = otherValue = NULL
nb_new = share = share_top = moderator_nb = xleft = xright = NULL
ybottom = xleft_real = xright_real = x_num = mid_point = NULL
ytop_new = ybottom_new = share_top_cum = value_cum = ytop_cum = value = NULL
check_arg(fml, "formula | vector mbt")
# save full formula
fml_in = fml
# New Structure 2020/ARPIL:
# model.method is now split in two:
# - mod.method, equal to "side", "split", "stack"
# - within and total: TRUE/FALSE
# - all values are missing
check_arg_plus(mod.method, "null match(side, split, stack)")
check_arg_plus(top, "null match(frac, nb, none, FALSE)")
if(!missnull(top) && top == "FALSE") top = "none"
check_arg_plus(at_5, "NULL match(roman, lines, FALSE)")
check_arg(moderator, "null vector na ok")
check_arg(weight, "null numeric vector na ok")
check_arg(sep, "null numeric scalar GE{0}")
check_arg(nbins, "null integer scalar GT{0}")
check_arg(col, "vector(integer, character) NA OK")
check_arg(bin.size, "null numeric scalar GT{0}")
check_arg(dict, "null named character vector | logical scalar")
check_arg(labels.angle, "numeric scalar")
check_arg("null logical scalar", sorted, log, labels.tilted, centered, other,
within, total, int.categorical)
check_set_arg(yaxis.scale, "NULL match(moderator, variable, total)")
check_arg("logical scalar", yaxis.show, yaxis.num, cumul, mod.NA, plot)
mc = match.call()
# Argument whose names have changed
dots = list(...)
args_deprec = c("maxBins", "addOther", "onTop", "maxFirst", "toLog", "within", "total")
if(any(args_deprec %in% names(dots))){
if("maxBins" %in% names(dots)){
if(is.null(getOption("fplot_warn_maxBins"))){
warning("Argument 'maxBins' has been renamed. Please use 'nbins' instead.")
options(fplot_warn_maxBins = TRUE)
}
nbins = dots$maxBins
}
if("addOther" %in% names(dots)){
if(is.null(getOption("fplot_warn_addOther"))){
warning("Argument 'addOther' has been renamed. Please use 'other' instead.")
options(fplot_warn_addOther = TRUE)
}
other = dots$addOther
}
if("onTop" %in% names(dots)){
if(is.null(getOption("fplot_warn_onTop"))){
warning("Argument 'onTop' has been renamed. Please use 'top' instead.")
options(fplot_warn_onTop = TRUE)
}
top = dots$onTop
}
if("maxFirst" %in% names(dots)){
if(is.null(getOption("fplot_warn_maxFirst"))){
warning("Argument 'maxFirst' has been renamed. Please use 'sorted' instead.")
options(fplot_warn_maxFirst = TRUE)
}
sorted = dots$maxFirst
}
if("toLog" %in% names(dots)){
if(is.null(getOption("fplot_warn_toLog"))){
warning("Argument 'toLog' has been renamed. Please use 'log' instead.")
options(fplot_warn_toLog = TRUE)
}
log = dots$toLog
}
if("within" %in% names(dots)){
if(is.null(getOption("fplot_warn_within"))){
warning("Argument 'within' is deprecated. Use the argument `yaxis.scale = \"moderator\"` instead.")
options(fplot_warn_within = TRUE)
}
yaxis.scale = "moderator"
}
if("total" %in% names(dots)){
if(is.null(getOption("fplot_warn_total"))){
warning("Argument 'total' is deprecated. Use the argument `yaxis.scale = \"total\"` instead.")
options(fplot_warn_total = TRUE)
}
yaxis.scale = "total"
}
}
set_defaults("fplot_distr")
# validate_dots(valid_args = c(args_deprec, names(par()), formalArgs(plot.default)), stop = TRUE)
#
# Extracting x and the moderator ####
#
USE_WEIGHT = MOD_IS_WEIGHT = FALSE
x_name = ylab = weight_name = moderator_name = ""
if(inherits(fml_in, "formula")){
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
# we check the variables data are there
check_arg(fml, "formula left(,1) right(,2) var(data)", .data = data, .message = "Argument 'fml' must be a formula of the type weights ~ value | moderator ('weights' and 'moderator' are optional).")
# Creation of x and the condition
fml = fml_in
# 2020/APRIL
# New Structure:
# weight_1 + weight_2 + etc ~ value | moderator
# You can use one sided formulas: ~value
info = extract_pipe(fml_in)
fml = info$fml
lhs_fml = info$lhs_fml
pipe_fml = info$pipe_fml
# OLD
# x = eval(fml[[2]], data)
# moderator = eval(fml[[3]], data)
# weight = extract_df(pipe_fml, data)
# NEW
x = eval(fml[[3]], data)
moderator = eval(info$pipe, data)
weight = extract_df(lhs_fml, data)
if(length(x) == 1 || length(all.vars(fml[[3]])) == 0){
stop("The formula must be of the type 'weight(s) ~ variable | moderator', and there must always be the 'variable' element, which is currently missing.")
}
if(info$lhs_fml == ~1){
weight = rep(1, length(x))
} else if(length(weight) > 1) {
MOD_IS_WEIGHT = TRUE
USE_WEIGHT = TRUE
if(length(moderator) > 1){
stop("You cannot use a moderator and multiple weights at the same time.")
# Later: implement a par mf_row (or sorts) to allow the plotting of multi weights + multi moderators
}
if(missing(mod.title)) mod.title = FALSE
# check
for(i in 1:length(weight)){
if(is.logical(weight[[i]])){
weight[[i]] = weight[[i]] * 1
} else if(!is.numeric(weight[[i]])){
stop("All weights must be numeric, but variable '", names(weight)[i], "' isn't.")
}
}
# We need to stack the data
n = length(weight[[1]])
all_mod_names = dict_apply(names(weight), dict)
# we create a factor to keep the user's order
moderator = factor(rep(all_mod_names, each = n), levels = all_mod_names)
x = rep(x, length(weight))
weight = unlist(weight)
if(missing(mod.title)){
mod.title = "Variables"
}
moderator_name = ""
weight_name = paste(enumerate_items(all_mod_names))
} else {
USE_WEIGHT = TRUE
weight = unlist(weight)
# weight_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
weight_name = deparse(lhs_fml[[2]])
check_value_plus(weight, "numeric conv vector na ok", .message = "The 'weight' must be a numeric vector.")
}
if(length(moderator) <= 1){
moderator = rep(1, length(x))
}
# other info
# x_name = deparse(fml[[2]])
# if(!MOD_IS_WEIGHT) moderator_name = deparse(fml[[3]])
x_name = deparse(fml[[3]])
if(!MOD_IS_WEIGHT) moderator_name = deparse(info$pipe_fml[[2]])
} else {
x = fml_in
# default no name when no formula => otherwise ugly when exporting
x_name = clean_name(deparse(substitute(fml)))
# We check the vector
if(!checkVector(x)){
if(inherits(x, "table")){
x = as.vector(x)
} else {
if(length(x) == 0){
reason = "Currently, the length of fml is 0."
} else {
reason = paste0("Currently, fml is of class ", enumerate_items(class(x)), ".")
}
stop("Wrong argument in fml. It must be either a formula, either a vector. ", reason)
}
}
# moderator
if(!missnull(moderator)){
if(length(x) != length(moderator)){
stop("The arguments 'x' and 'moderator' must be of the same length.")
}
moderator_name = clean_name(deparse(substitute(moderator)))
} else {
moderator = rep(1, length(x))
}
# weight
if(!missnull(weight)){
check_arg(weight, "numeric vector len(data)", .data = x)
USE_WEIGHT = TRUE
weight_name = clean_name(deparse(substitute(weight)))
} else {
weight = rep(1, length(x))
}
}
#
# Parameters setting ####
#
if(is.logical(x)){
x = as.factor(x)
}
# Renaming: dict
x_name = dict_apply(x_name, dict)
moderator_name = dict_apply(moderator_name, dict)
weight_name = dict_apply(weight_name, dict)
# Dropping NAs
quiNA_x = is.na(x)
quiNA_mod = is.na(moderator)
quiNA_weight = is.na(weight)
if(mod.NA){
quiNA = quiNA_x | quiNA_weight
total_NA = sum(quiNA | quiNA_mod)
if(is.factor(moderator)){
moderator = addNA(moderator)
} else {
# The following will coerce moderator into character
moderator[is.na(moderator)] = "NA"
}
} else {
quiNA = quiNA_x | quiNA_mod | quiNA_weight
total_NA = sum(quiNA)
}
if(total_NA > 0){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_weight))
msg_na = paste0(c("x: ", "moderator: ", "weight: "), nb_na)
message("NOTE: ", total_NA, " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ")")
x = x[!quiNA]
moderator = moderator[!quiNA]
weight = weight[!quiNA]
}
# case of empty strings
if(is.character(x)){
qui_empty = x == ""
if(any(qui_empty)){
x[qui_empty] = "[empty]"
}
}
# Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
if(anyNA(moderator_unik)){
# This is such a pain in the neck...
moderator_unik[is.na(moderator_unik)] = "NA"
moderator = factor(unclass(moderator), labels = moderator_unik)
}
} else {
moderator_unik = sunique(moderator)
}
if(cumul){
if(!missnull(other) && other == TRUE) message("Argument 'other = TRUE' is ignored since the cumulative distribution is asked for.")
other = FALSE
}
moderator_cases = length(moderator_unik)
isLegend = FALSE
USE_MOD = FALSE
if(moderator_cases > 1){
USE_MOD = TRUE
isLegend = TRUE
n_select = NA
do_recreate = FALSE
if(missing(mod.select)){
if(moderator_cases <= 5){
# FINE!
} else {
# We select the top 3
n_select = 3
}
} else if(length(mod.select) > 5){
stop("The argument 'mod.select' (currently of length ", length(mod.select), ") cannot be of a length greater than 5!")
} else if(length(mod.select) == 1 && is.numeric(mod.select) && mod.select %% 1 == 0){
# Possibly a number
if(mod.select %in% 1:5){
n_select = mod.select
} else {
stop("Argument 'mod.select' does not have a valid value: if it is an integer, its value must lie between 1 and 5.")
}
} else {
# mod.select must contain moderator values
# 1) checking the problems
mod_pblm = setdiff(mod.select, moderator_unik)
if(length(mod_pblm) > 0){
# We recreate mod.select
if(length(mod_pblm) != length(mod.select) && any(quiNA)){
# means some are OK
suggestion = " Maybe because it has only NA values?"
} else {
suggestion = " It must be either a moderator value, or a regular expression to select moderator values."
}
# Maybe they're regular expressions => we keep the order desired by the user
mod.ok = c()
for(r in mod.select){
if(r %in% moderator_unik){
mod.ok = c(mod.ok, r)
} else {
qui = grepl(r, moderator_unik)
n_qui = sum(qui)
if(n_qui == 0){
stop("In the argument 'mod.select', the value ", r, " could not be found in the moderator variable.", suggestion)
} else if(n_qui > 5){
stop("In the argument 'mod.select', the value ", r, " leads to more than 5 moderator values to be selected (e.g. ", enumerate_items(moderator_unik[qui], nmax = 10), ".")
}
mod.ok = c(mod.ok, moderator_unik[qui])
}
}
mod.ok = unique(mod.ok)
if(length(mod.ok) > 5){
stop("Argument mod.select leads to ", length(mod.ok), " moderator values to be selected. But the maximum is 5.")
}
mod.select = mod.ok
}
# 2) selection
do_recreate = TRUE
}
# AUTOMATIC selection
if(!is.na(n_select) && moderator_cases > n_select){
# We proceed with the "automatic" selection
if(USE_WEIGHT){
info_mod = data.table(moderator = moderator, w = weight)
info_mod = info_mod[, list(value = sum(w)), by = moderator]
info_method = "the value of the weight."
} else {
info_mod = data.table(moderator = moderator)
info_mod = info_mod[, list(value = .N), by = moderator]
info_method = "frequency."
}
# We inform on the method for the choice
message(ifelse(n_select == 1, "The moderator was", paste0("The ", n_select, " moderators were")), " chosen based on ", info_method)
info_mod = info_mod[order(-value)]
mod.select = info_mod[1:n_select, moderator]
do_recreate = TRUE
if(n_select == 1){
USE_MOD = isLegend = FALSE
}
}
if(mod.NA){
if(missing(mod.select)){
do_recreate = FALSE
} else {
mod.select = unique(c(mod.select, "NA"))
do_recreate = TRUE
}
}
if(do_recreate){
# We recreate the variables
qui_select = which(moderator %in% mod.select)
# we recreate all the values
x = x[qui_select]
moderator = moderator[qui_select]
weight = weight[qui_select]
# Re-Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
} else {
moderator_unik = sunique(moderator)
}
moderator_cases = length(moderator_unik)
}
}
# if(moderator_cases > 5) {
# stop("The number of cases of the moderator cannot be greater than 5!")
# } else if(moderator_cases > 1){
# USE_MOD = TRUE
# # we put the legend only if there is a condition
# isLegend = TRUE
# }
# mod.title
if(isLegend){
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
}
# separation of the x cases:
if(missnull(sep)){
sep = 0.20 + 0.15 * (moderator_cases-1)
}
# Setting up the data
# Finding out whether log or not
isNum = is.numeric(x) && !USE_WEIGHT # no sense to weight "real" numeric data
# Default of log and sorted
delayLogChecking = FALSE
if(missnull(log)){
if(!isNum || any(x < 0)){
log = FALSE
} else {
log = FALSE
if(diff(range(base::log(x[x>0]))) >= 3 && max(x) > 100){
delayLogChecking = TRUE
}
}
} else if(log && !isNum){
warning("Since 'x' is not numeric, argument 'log' is ignored.")
log = FALSE
}
isInteger = NA
if(missnull(sorted)){
if(!isNum){
sorted = TRUE
} else if(log){
sorted = FALSE
} else {
isInteger = all(x %% 1 == 0)
if(isInteger){
sorted = FALSE
} else {
sorted = FALSE
}
}
}
IS_MAXBIN_USER = TRUE
if(missnull(nbins)){
IS_MAXBIN_USER = FALSE
bin_fit_all = c(20, 11, 8, 6, 5)
bin_max_all = c(15, 9, 7, 5, 4)
nb = ifelse(!missnull(mod.method) && mod.method == "stack", 1, moderator_cases)
nbins = bin_fit_all[nb]
if(length(unique(x)) > nbins){
nbins = bin_max_all[nb]
}
}
if(log){
# Regle: la valeur d'une colonne est: superieur ou egal a la valeur gauche et strictement inferieurs a la valeur droite
x = pmax(floor(base::log(x + 1e-6)), -1)
}
#
# Binning ####
#
numAxis = FALSE
binned_data = FALSE
maxBins_old = nbins # used with delayLogChecking
if(isNum && !log){
if(!missnull(bin.size)){
if(!missnull(int.categorical) && int.categorical && all(x %% 1 == 0)){
isInteger = TRUE
} else {
x = (x %/% bin.size) * bin.size
binned_data = TRUE
if(!sorted){
numAxis = TRUE
if(IS_MAXBIN_USER == FALSE){
# Since it's a numeric axis, we put a LOT of bins
# It is a maximum anyway
nbins = 50
}
}
}
} else {
if(is.na(isInteger)){
isInteger = all(x %% 1 == 0)
}
goNum = TRUE
if(!missnull(int.categorical)){
# we force integers as categorical
if(int.categorical == TRUE) goNum = FALSE
# if int.categorical == FALSE, then we force it as numeric
} else if(isInteger && diff(range(x)) < 1000){
# for integers with small range, we don't consider them
# as numeric
goNum = FALSE
}
if(goNum){
if(!sorted){
numAxis = TRUE
}
if(IS_MAXBIN_USER == FALSE){
# we reset the number of bins: specific to the numeric case
nbins = bin_max_all[1]
}
binned_data = TRUE
# we try to find a "good" bin size
x_min = min(x)
x_max = max(x)
bin.size = signif((x_max - x_min) / min(max(nbins - 1, 10), 15), 1)
x_tmp = (x %/% bin.size) * bin.size
tx = table(x_tmp)
if(sum(tx/sum(tx) > 0.03) < 6){
# Condition: nber of bins with more than 3% is lower than 6
if(delayLogChecking){
# Would it be better looking in logarithmic form?
x_ln = pmax(floor(base::log(x + 1e-6)), -1)
tx_ln = table(x_ln)
if(sum(tx_ln/sum(tx_ln) > 0.05) >= 5){
# nber of signif bins greater than 5
log = TRUE
x = x_ln
numAxis = FALSE
binned_data = FALSE
nbins = maxBins_old
}
}
}
if(!log){
if(sum(tx/sum(tx) > 0.01) < 6){
# Condition: nber of bins with more than 1% is lower than 6
# More complex binning:
# We try to find the "optimal" number of bins
n_bins = nbins
q001 = quantile(x, seq(0, 1, by = 1 / 40))
n_q = length(q001)
s_vect = c()
score_total_vect = c()
score_range_vect = c()
score_fullness_vect = c()
for(i in 1:20){
if(i == 1){
s = signif(diff(range(x)) / n_bins, 1)
} else {
s = signif(s / 1.5, 1)
}
value_range = integer(n_bins)
for(k in 1:n_q){
s_start = q001[k]
value_range[k] = sum(q001 >= s_start & q001 <= s_start + n_bins*s)
if(value_range[k] == length(q001)) break
}
s_start = q001[which.max(value_range)]
missing_range = 1 - sum(q001 >= s_start & q001 <= s_start + n_bins*s)/n_q
score_fullness = length(unique(na.omit(cut(q001, seq(s_start, by = s, length.out = n_bins + 1), include.lowest = TRUE))))/ n_bins
# Save
s_vect[i] = s
score_total_vect[i] = score_fullness/2 - 2*missing_range
if(missing_range > 0.4) break
}
bin.size = s_vect[which.max(score_total_vect)]
x = (x %/% bin.size) * bin.size
} else {
x = x_tmp
}
}
}
}
}
#
# Default values for mod.method
#
# Splitting decision
if(missnull(mod.method)){
if(log || numAxis){
mod.method = "side"
} else {
mod.method = "split"
}
}
is_yaxis_scale_default = is.null(yaxis.scale)
if(is.null(yaxis.scale)){
yaxis.scale = "moderator"
} else if(yaxis.scale == "variable" && mod.method == "split"){
stop("You cannot use `yaxis.scale = \"variable\"` when using `mod.method == \"split\"`.")
}
DO_STACK = FALSE
if(mod.method == "stack"){
if(cumul == TRUE){
stop("You cannot use the argument cumul=TRUE with mod.method='stack'.")
# I should implement it later, it might be useful in some situations (non overlapping cases)
}
DO_STACK = TRUE
mod.method = "side"
yaxis.scale = "total"
}
DO_SPLIT = FALSE
checkNotTilted = FALSE
delayLabelsTilted = missnull(labels.tilted)
if(moderator_cases > 1 && mod.method == "split"){
DO_SPLIT = TRUE
if(missnull(labels.tilted)){
labels.tilted = TRUE
checkNotTilted = TRUE
}
# We don't add the other column only when the data is split
# and it is not a numeric axis or log axis
if(missnull(other) && !log && !numAxis){
other = FALSE
}
}
if(missnull(other)){
other = TRUE
}
# yaxis.num
if(missnull(yaxis.num)){
if(USE_WEIGHT == FALSE || missing(weight.fun) || (moderator_cases > 1 && yaxis.scale != "total")){
yaxis.num = FALSE
} else {
yaxis.num = TRUE
}
} else if(yaxis.num == TRUE){
if(is_yaxis_scale_default){
yaxis.scale = "total"
} else if(moderator_cases > 1 && (!is_yaxis_scale_default && yaxis.scale != "total") && !MOD_IS_WEIGHT){
stop("Argument yaxis.num: You cannot have a numeric y-axis when `yaxis.scale != \"", yaxis.scale, "\"`.",
"(Because the data is rescaled as frequencies first.) Use `yaxis.scale = \"total\"` to display a numeric y-axis.")
}
}
checkForTilting = FALSE
if(missnull(labels.tilted)){
labels.tilted = FALSE
# We check if it's better for tilting only in the regular case
checkForTilting = TRUE
# NOTE that we later update the value of checkForTilting
# because we don't want to do it all the time (but we first need more information)
# just search the next occurrence of checkForTilting
}
# The color
if(missnull(col)){
if(moderator_cases == 1){
col = "#1F78B4"
} else if(DO_SPLIT){
col = "set1"
} else {
col = c("#1F78B4", "#A6CEE3", "#33A02C", "#B2DF8A", "#E31A1C", "#FB9A99", "#FF7F00", "#FDBF6F")
}
}
if(length(col) == 1 && is.character(col)){
# col_match = try(match.arg(tolower(col), c("set1", "paired")), silent = TRUE)
# if(col_match == "set1"){
# col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999")
# } else if(col_match == "paired"){
# col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
# }
check_value_plus(col, "match(set1, paired) | character scalar")
if(col == "set1"){
col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999")
} else if(col == "paired"){
col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
}
}
numLabel = FALSE
if(moderator_cases >= 2 && numAxis){
if(DO_SPLIT){
numLabel = TRUE
}
sep = 0
}
# for table elements
if(is.table(x)){
x = as.vector(x)
}
X_FACTOR = FALSE
if(is.factor(x)){
X_FACTOR = TRUE
# we keep the order of x when factors
x = x[drop = TRUE]
x_fact_names = levels(x)
x = unclass(x)
# => we rename x later
}
#
# Computing data_freq ####
#
ADD_OTHER = ADD_OTHER_LEFT = CUT_BAR = FALSE
if(DO_SPLIT == FALSE){
if(USE_WEIGHT){
raw_data = data.table(x = x, weight = weight, moderator = moderator)
if(missnull(weight.fun)){
data_freq = raw_data[, list(value = sum(weight)), keyby = list(x, moderator)]
} else {
if(!is.function(weight.fun)){
stop("Argument 'weight.fun' must be a function.")
} else if(length(weight.fun(head(weight))) != 1){
stop("Function 'weight.fun' must be return a value of length one.")
}
data_freq = raw_data[, list(value = weight.fun(weight)), keyby = list(x, moderator)]
}
} else {
raw_data = data.table(x = x, moderator = moderator)
data_freq = raw_data[, list(value = .N), keyby = list(x, moderator)]
}
if(USE_MOD && mod.method == "side" && yaxis.scale %in% c("moderator", "variable")){
if(yaxis.scale == "variable"){
data_freq[, "total_variable" := list(sum(value)), by = x]
data_freq[, "share" := list(value/total_variable*100)]
} else {
data_freq[, "total_moderator" := list(sum(value)), by = moderator]
data_freq[, "share" := list(value/total_moderator*100)]
}
} else {
total = sum(data_freq$value)
data_freq[, "share" := list(value/total*100)]
}
# We balance the panel for log data
if(log && sorted == FALSE){
x_all = min(data_freq$x):max(data_freq$x)
data_all = data.table(expand.grid(x = x_all, moderator = moderator_unik))
tmp = merge(data_all, data_freq, all.x = TRUE, by = c("x", "moderator"))
tmp[is.na(tmp)] = 0
data_freq = tmp
}
# The sorting
if(sorted){
if(USE_MOD){
data_freq[, "total_x" := list(sum(value)), by = x]
# data_freq = data_freq[order(-total_x, moderator)]
# setorderv(data_freq, c("total_x", "moderator", "x"), c(-1, 1, 1))
setorderv(data_freq, c("total_x", "x"), c(-1, 1))
} else {
# data_freq = data_freq[order(-value)]
setorderv(data_freq, "value", -1)
}
data_freq[, "x_nb" := list(rleid(x))]
} else {
data_freq = data_freq[order(x, moderator)]
data_freq[, x_nb := quickUnclassFactor(x)]
}
# the "other" column
data_freq[, isOther := FALSE]
data_freq[, otherValue := ""]
# if we center on the mode
if(centered && any(data_freq$x_nb > nbins) && isNum){
# we center the distribution
data_freq[, nb_new := x_nb - which.max(value)]
data_freq[, nb_new := nb_new + max(min(ceiling(nbins/2), abs(min(nb_new)) + 1), nbins - max(nb_new))]
data_freq[, x_nb := nb_new]
data_freq[, nb_new := NULL]
if(other && sorted == FALSE && any(data_freq$x_nb <= 0)){
if(!any(data_freq$x_nb < 0)){
# we just add that bin
data_freq[, x_nb := x_nb + 1]
nbins = nbins + 1
} else {
ADD_OTHER_LEFT = TRUE
# We create a bin on the left
data_other_left = data_freq[x_nb <= 0]
data_other_left = data_other_left[, list(value = sum(value), share_top = sum(share)), by = moderator]
tmp = as.data.table(expand.grid(moderator = data_other_left$moderator))
other_col = merge(data_other_left, tmp)
other_col[is.na(other_col)] = 0
other_col[, x_nb := 0]
# we limit the size of this last column
share_max = max(data_freq[x_nb %in% 1:nbins, share])
# r = 1 + rank(other_col$value, ties.method = "max") / moderator_cases / 10
# other_col[, share := pmin(share_top, r * share_max)]
other_col[, share := pmin(share_top, 1.1 * share_max)]
if(any(other_col$share < other_col$share_top)){
CUT_BAR = TRUE
}
# we merge the information
data_freq = rbindDS(other_col, data_freq[x_nb > 0])
qui_NA = is.na(data_freq$share_top)
data_freq$share_top[qui_NA] = data_freq$share[qui_NA]
# Other text => depends on the type of values
data_freq[x_nb == 0, "isOther" := TRUE]
if(log){
x_first = ceiling(exp(data_freq[x_nb == 1, x][1]))
otherTextLeft = substitute(phantom()<x_val, list(x_val = formatAxisValue(x_first)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_first))]
} else {
if(binned_data == FALSE){
x_other = data_freq[x_nb == 1, x][1]
otherTextLeft = substitute(phantom()<x_val, list(x_val = formatAxisValue(x_other)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_other))]
} else {
# we update the value
x_other = data_freq[x_nb == 1, x][1]
data_freq[x_nb == 0, x := x_other - bin.size]
otherTextLeft = substitute(phantom()<x_other, list(x_other = formatAxisValue(x_other)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_other))]
}
}
}
} else {
# We clean negative numbers
data_freq = data_freq[x_nb > 0]
}
}
# The "other" column
if(other && max(data_freq$x_nb) == nbins + 1){
# we just add the bin
nbins = nbins + 1
data_freq = data_freq[x_nb <= nbins]
} else if(other && any(data_freq$x_nb > nbins)){
# we create the other colum as the sum of the rest
ADD_OTHER = TRUE
data_other = data_freq[x_nb > nbins]
data_other = data_other[, list(value = sum(value), share_top = sum(share)), by = moderator]
tmp = as.data.table(expand.grid(moderator = data_other$moderator))
other_col = merge(data_other, tmp)
other_col[is.na(other_col)] = 0
other_col[, x_nb := nbins + 1]
# we limit the size of this last column
share_max = max(data_freq[x_nb %in% 1:nbins, share])
# r = 1 + rank(other_col$value, ties.method = "max") / moderator_cases / 10
# other_col[, share := pmin(share_top, r * share_max)]
other_col[, share := pmin(share_top, 1.1 * share_max)]
if(any(other_col$share < other_col$share_top)){
CUT_BAR = TRUE
}
# we merge the information
data_freq = rbindDS(data_freq[x_nb <= nbins], other_col)
qui_NA = is.na(data_freq$share_top)
data_freq$share_top[qui_NA] = data_freq$share[qui_NA]
# Other text => depends on the type of values
data_freq[x_nb == nbins + 1, "isOther" := TRUE]
if(!isNum || sorted){
otherText = "Other"
data_freq[x_nb == nbins + 1, "otherValue" := "Other"]
} else {
if(log){
x_last = round(exp(data_freq[x_nb == nbins, x] + 1))
otherText = substitute(phantom()>x_val, list(x_val = formatAxisValue(x_last)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_last))]
} else {
if(binned_data == FALSE){
x_value = data_freq[x_nb == nbins, x]
otherText = substitute(phantom()>x_val, list(x_val = formatAxisValue(x_value)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_value))]
} else {
x_other = data_freq[x_nb == nbins, x][1] + bin.size
data_freq[x_nb == nbins + 1, x := x_other]
otherText = substitute(phantom()>=x_other, list(x_other = formatAxisValue(x_other)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_other))]
}
}
}
} else {
data_freq = data_freq[x_nb <= nbins]
}
if(!"share_top" %in% names(data_freq)){
data_freq[, share_top := share]
}
x_nb_all = sort(unique(data_freq$x_nb))
x_cases = length(x_nb_all)
# We add the 0s
if(nrow(data_freq) != x_cases*moderator_cases){
# we need to add the 0s
data_all = data.table(expand.grid(x_nb = x_nb_all, moderator = moderator_unik))
setkey(data_all, x_nb, moderator)
setkey(data_freq, x_nb, moderator)
tmp = merge(data_all, data_freq, all.x = TRUE)
tmp[, c("x", "isOther", "otherValue") := NULL]
# Merging additional info
base2merge = unique(data_freq[, list(x, x_nb, isOther, otherValue)])
quoi = merge(tmp, base2merge, by = "x_nb")
tmp = quoi[order(x_nb, moderator)]
# setting to 0
tmp[is.na(tmp)] = 0
data_freq = tmp
}
# moderator_nb
if(is.factor(moderator)){
data_freq[, moderator_nb := as.numeric(unclass(moderator[drop = TRUE]))]
} else {
data_freq[, moderator_nb := quickUnclassFactor(moderator)]
}
# The coordinates
if(numAxis){
data_freq[, xleft := x + bin.size/moderator_cases * (moderator_nb - 1)]
data_freq[, xright := x + bin.size/moderator_cases * moderator_nb]
} else {
data_freq[, xleft := (x_nb-1)*(moderator_cases+sep) + moderator_nb - 1]
data_freq[, xright := (x_nb-1)*(moderator_cases+sep) + moderator_nb]
}
xlim = range_plus(c(data_freq$xleft, data_freq$xright), 5)
data_freq[, ybottom := 0]
data_freq[, ytop := share]
if(X_FACTOR){
# we reintroduce the values of x
# Beware: "Other" can be either 0 (when side/split) // or NA (without moderator)
x_value = rep(NA, nrow(data_freq))
if(anyNA(data_freq$x)){
qui_ok = !is.na(data_freq$x)
} else {
qui_ok = !data_freq$x == 0
}
x_value[qui_ok] = x_fact_names[data_freq$x[qui_ok]]
data_freq[, x := x_value]
}
} else if(DO_SPLIT == TRUE){
#
# With SPLIT ####
#
# We apply a split to the data
info = list()
x_add = 0
for(i in seq_along(moderator_unik)){
# we apply plot_distr to subsets
qui = moderator == moderator_unik[i]
if(USE_WEIGHT){
data_freq = plot_distr(x[qui], moderator = moderator[qui], weight = weight[qui],
sorted = sorted, nbins = nbins, log = FALSE,
other = other, plot = FALSE, bin.size = bin.size,
int.categorical = !numAxis)
} else {
data_freq = plot_distr(x[qui], moderator = moderator[qui],
sorted = sorted, nbins = nbins, log = FALSE,
other = other, plot = FALSE, bin.size = bin.size,
int.categorical = !numAxis)
}
# updating the information
if(numAxis){
# We add the value of a bin, BUT start when the last stops
# we save the "real" values
data_freq[, xleft_real := xleft]
data_freq[, xright_real := xright]
# Now we update xleft/xright
min_left = min(data_freq$xleft)
x_add_next = x_add + max(data_freq$xright) - min_left
data_freq[, xleft := xleft - min_left + x_add]
data_freq[, xright := xright - min_left + x_add]
x_add = x_add_next + bin.size
} else {
min_left = min(data_freq$xleft)
x_add_next = x_add + max(data_freq$xright) - min_left
data_freq[, xleft := xleft - min_left + x_add]
data_freq[, xright := xright - min_left + x_add]
x_add = x_add_next + 0.7
}
data_freq[, moderator_nb := i]
info[[i]] = data_freq
}
info_all = rbindlist(info, use.names=TRUE)
if(mod.method == "split" && yaxis.scale == "total"){
# SPLIT TOTAL
total = sum(weight)
info_all[, share := value/total*100]
info_all[, share_top := share]
info_all[, ytop := share]
}
if(binned_data && other){
# we format the data properly
info_all$x = formatAxisValue(info_all$x)
}
if(X_FACTOR){
# we reintroduce the values of x
info_all[, x := x_fact_names[x]]
}
# managing "other"
# info_all[, isOther := is.na(x)]
if(any(info_all$isOther)){
# bounding bar size
share_max = max(info_all[isOther == FALSE, share])
info_all[isOther == TRUE, share := pmin(share_top, 1.1 * share_max)]
info_all[, ytop := share]
if(any(info_all$share < info_all$share_top)){
CUT_BAR = TRUE
}
info_all$x = as.character(info_all$x)
info_all$x[is.na(info_all$x)] = "Other"
}
data_freq = info_all
# Some information
xlim = range_plus(c(data_freq$xleft, data_freq$xright), 5)
}
#
# Graph setting ####
#
if(DO_STACK == TRUE){
# we reformat the data
# return(data_freq)
new_coords = data_freq[isOther == FALSE, list(x_nb, xleft, xright, ybottom, ytop)]
new_coords[, c("xleft_new", "xright_new") := list(min(xleft), max(xright)), by = x_nb]
new_coords[, ytop_new := cumsum(ytop), by = x_nb]
new_coords[, ybottom_new := shift(ytop_new, n = 1, fill = 0), by = x_nb]
data_freq[isOther == FALSE, c("xleft", "xright", "ybottom", "ytop") := list(new_coords$xleft_new, new_coords$xright_new, new_coords$ybottom_new, new_coords$ytop_new)]
if(any(data_freq$isOther)){
# new values top
max_y = max(data_freq$ytop)
data_freq[isOther == TRUE, ytop := pmin(1.1 * max_y, share_top)]
}
# browser()
}
# Cumulative
if(cumul == TRUE){
data_freq[, share_top_cum := cumsum(share_top), by = moderator]
data_freq[, value_cum := cumsum(share), by = moderator]
data_freq[, ytop_cum := cumsum(ytop), by = moderator]
data_freq[, share_top := share_top_cum]
data_freq[, value := value_cum]
data_freq[, ytop := ytop_cum]
}
if(plot == FALSE){
return(data_freq)
}
# Plot preparation
# default for top
if(missnull(top)){
if(cumul){
top = "frac"
} else if(numAxis){
if(any(data_freq$isOther)){
top = "frac"
} else {
top = "none"
}
} else if(moderator_cases > 1 && mod.method == "split"){
# when neck to neck => much easier to compare shares
# otherwise it is confusing
top = "frac"
} else {
# if coef of variation of x is low => better information is the number
values = data_freq[share_top <= share, ytop]
cv = sd(values) / mean(values)
if(cv > 0.7){
top = "frac"
} else {
top = "nb"
}
}
}
if(is.null(dots$ylim)){
ylim = c(0, max(data_freq$ytop))
ymax_grid = ylim[2]
# hauteur_caractere_legend = 0
total_height = 0
if(isLegend){
info_legend = legendFit(legend = moderator_unik, plot = FALSE, title = mod.title)
total_height = info_legend$total_height
}
hauteur_caractere_top = 0
if(top != "none"){
# We create the information to display top
if(DO_STACK == FALSE){
# Normal case
info_top = data_freq[, list(x_mid = (xleft+xright)/2, ytop, value, share_top)]
} else if(DO_STACK == TRUE){
info_top = data_freq[isOther == FALSE, list(x_mid = mean((xleft+xright)/2), ytop = max(ytop), value = sum(value), share_top = sum(share_top)), by = x_nb]
info_top_other = data_freq[isOther == TRUE, list(x_mid = (xleft+xright)/2, ytop, value, share_top)]
}
# We find the optimal cex for the top numbers
if(top == "nb"){
top.value2display = formatAxisValue(info_top$value)
} else {
top.value2display = paste0(mysignif(info_top$share_top, d = 2, r = 0), "%")
# We truncate => otherwise ugly
# top.value2display = gsub("0.0.+", "0.0%", top.value2display)
# Only one digit after "0."
top.value2display = gsub("(?<=\\.[[:digit:]]).+", "%", top.value2display, perl = TRUE)
}
#
# NOTE on R behavior with new windows (x11) that are resized
#
# The following code computes the optimal width of the number/fractions to be displayed on the top
# of the bars.
# However, when we use x11() and then resize the window, R is unable to find out the actual window size
# Instead we are only able to access the original size.
# Unfortunately, the drawbacks of the following solutions are too important:
# - plotting a "clean" plot, so that R knows the actual size of the window.
# * Drawback: if the user calls for several plots in the same window => we're screwed
# - making a "smart guess" by assuming that the new window will be larger
# * drawback: if it is not the case, it will be ugly (better too small than too big)
# Getting the width of the bars in inches
unitary_width = par("pin")[1] / usr_width(xlim)
if(numAxis){
tdx = table(data_freq$xright - data_freq$xleft)
bar_w = as.numeric(names(tdx)[which.max(tdx)])
unitary_width = unitary_width * bar_w
}
# When DO_STACK==TRUE and numAxis==FALSE:
# the width is equal to the nber of moderators
unitary_width_top = unitary_width * (1 + (moderator_cases - 1) * DO_STACK * (!numAxis))
top.cex = 1
minCex = 0.5
v_max = top.value2display[which.max(strwidth(top.value2display, units = "in"))]
while(top.cex > minCex && strwidth(v_max, units = "in", cex = top.cex) > unitary_width_top*0.9){
top.cex = top.cex * 0.95
}
# the height we need to add to the plot
hauteur_caractere_top = strheight("W", "in", cex = top.cex) * 1.35 + strwidth("W", "in", cex = 1) * top.cex
# If DO_STACK==TRUE: the "other" columns have another width
if(DO_STACK == TRUE && nrow(info_top_other) > 0){
if(top == "nb"){
top.value2display_other = formatAxisValue(info_top_other$value)
} else {
top.value2display_other = paste0(mysignif(info_top_other$share_top, d = 2, r = 0), "%")
}
top.cex_other = 1
minCex = 0.5
v_max = top.value2display_other[which.max(strwidth(top.value2display_other, units = "in"))]
while(top.cex_other > minCex && strwidth(v_max, units = "in", cex = top.cex_other) > unitary_width*0.9){
top.cex_other = top.cex_other * 0.95
}
hauteur_caractere_top_other = strheight("W", "in", cex = top.cex_other) * 1.35 + strwidth("W", "in", cex = 1) * top.cex_other
# We are careful to add only the necessary height to the plot
hauteur_caractere_top_usr = hauteur_caractere_top / par("pin")[2] * diff(ylim)
hauteur_caractere_top_other_usr = hauteur_caractere_top_other / par("pin")[2] * diff(ylim)
height_normal = max(info_top$ytop) + hauteur_caractere_top_usr
height_other = max(info_top_other$ytop) + hauteur_caractere_top_other_usr
hauteur_caractere_top = (max(height_normal, height_other) - ylim[2]) * par("pin")[2] / diff(ylim)
}
}
# We transform the character heights into user format (produit en croix)
add_usr = (hauteur_caractere_top + total_height) / par("pin")[2] * diff(ylim)
ylim[2] = ylim[2] + add_usr
dots$ylim = ylim
}
dots$xlim = xlim
dots$axes = FALSE
dots$x = dots$y = dots$col = 0
# ylab
prefix = ""
if(cumul) prefix = "Cumulative "
if(moderator_cases > 1){
if(nchar(moderator_name) == 0){
text = if(yaxis.scale == "total") "% Total" else "% Within"
} else {
text = switch(yaxis.scale,
total = "% Total",
moderator = paste0("% Within ", moderator_name, ""),
variable = paste0("% Within ", x_name, ""))
}
if(USE_WEIGHT){
if(yaxis.num == TRUE){
ylab = paste0(prefix, weight_name)
} else {
if(MOD_IS_WEIGHT){
ylab = paste0("Distribution of ", weight_name)
} else {
ylab = paste0("Distribution of ", weight_name, " (", prefix, text, ")")
}
}
} else {
if(yaxis.num == TRUE){
ylab = paste0(prefix, "Frequency")
} else {
ylab = paste0(prefix, text)
}
}
} else if(USE_WEIGHT){
ylab = paste0(prefix, "Distribution of ", weight_name)
} else if(cumul){
ylab = "Cumulative %"
}
# xlab
# user providing xlab?
noXlab = is.null(dots$xlab)
noSub = is.null(dots$sub)
# if(!noXlab || isNum){
# # if data is numeric, we will want the x-label, even if tilted
# line.max = 1 + 1
# } else if(!noSub){
# line.max = 2 + 1
# } else {
# line.max = 3 + 1
# }
line.max = 3 + 1
# Update of checkForTilting
if(checkForTilting){
# Only for the standard case
checkForTilting = !numAxis & !log
}
# if(checkForTilting){
# # "at risk" of tilting => we don't show the name
# listDefault(dots, "xlab", "")
# } else if(labels.tilted){
# # we don't show the name of the variable if lab is tilted
# # except if the data is numeric
#
# if(isNum){
# listDefault(dots, "xlab", x_name)
# } else {
# listDefault(dots, "xlab", "")
# }
#
# # listDefault(dots, "sub", x_name)
# } else {
# listDefault(dots, "xlab", x_name)
# }
if("xlab" %in% names(dots)){
xlab = dots$xlab
} else {
xlab = x_name
}
dots$xlab = ""
if("sub" %in% names(dots)){
sub = dots$sub
} else {
sub = ""
}
dots$sub = ""
#
# Margins setting ####
#
if(yaxis.show){
# we get the "nice" points display
# y_points = axis(2, lwd=0, col.axis = 0)
y_points = pretty(c(data_freq$ytop, 0))
# We don't go above 100%
y_points = y_points[y_points <= 100]
if(yaxis.num){
# x1 = data_freq[x_nb == 1][1, ]
x1 = data_freq[ytop > 0][1, ]
y_labels = formatAxisValue(y_points / x1$ytop * x1$value)
} else {
y_labels = paste0(y_points, "%")
}
} else {
y_labels = ""
}
if("ylab" %in% names(dots)){
ylab.resize = FALSE # we don't resize user-provided ylab
ylab = dots$ylab
} else {
ylab.resize = TRUE
}
dots$ylab = ""
# listDefault(dots, "ylab", ylab)
listDefault(dots, "yaxs", "i")
mleft = find_margins_left(ylab, y_labels, ylab.resize = ylab.resize)
# finding the bottom margin is a pain in the neck!!!
mbot = find_margins_bottom(xlab = xlab, sub = sub, data_freq = data_freq, log = log, isNum = isNum, numLabel = numLabel, numAxis = numAxis, nbins = nbins, DO_SPLIT = DO_SPLIT, ADD_OTHER = ADD_OTHER, ADD_OTHER_LEFT = ADD_OTHER_LEFT, sorted = sorted, labels.tilted = labels.tilted, delayLabelsTilted = delayLabelsTilted, checkForTilting = checkForTilting, checkNotTilted = checkNotTilted, noSub = noSub, binned_data = binned_data, line.max = line.max, trunc = trunc, trunc.method = trunc.method, cex.axis = cex.axis, labels.angle = labels.angle, at_5 = at_5, xlim = xlim)
if(mleft$total_width > par("mai")[2] || mbot$total_height > par("mai")[1]){
new_mai = par("mai")
if(mleft$total_width > par("mai")[2]){
new_mai[2] = mleft$total_width + 0.05
}
if(mbot$total_height > par("mai")[1]){
new_mai[1] = mbot$total_height + 0.05
}
op = par(mai = new_mai)
on.exit(par(op))
}
do.call("plot", dots)
# ylab
title(ylab = mleft$ylab, line = mleft$ylab.line)
hgrid(ymax = ymax_grid)
rect(xleft = data_freq$xleft, ybottom = data_freq$ybottom, xright = data_freq$xright, ytop = data_freq$ytop, col = col[data_freq$moderator_nb], border = border)
#
# "cutting" the bar (if other is very big)
#
if(CUT_BAR){
data_other = data_freq[share_top > share]
ytop = data_other$ytop[1]
x_span = (data_other$xright - data_other$xleft)[1] / diff(get_x_lim()) * diff(get_y_lim())
y_span = ytop * .03
y_all = ytop - 2:1*x_span
for(i in 1:nrow(data_other)){
shade_area(y_all, y_all - y_span, x = c(data_other$xleft[i], data_other$xright[i]), col = "white", border = "white")
}
}
#
# x-labels ####
#
# Information on plot
at_info = data_freq[, list(mid_point = (max(xright) + min(xleft)) / 2), by = list(x_nb, x)]
myat = at_info$mid_point
if(log){
if(DO_SPLIT){
data_freq_valid = data_freq[isOther == FALSE, ]
data_freq_valid[, x_num := as.numeric(x)]
x_all = data_freq_valid$x_num
exp_value = ceiling(exp(x_all))
exp_value[x_all == -1] = 0
if(sorted){
if(delayLabelsTilted){
# better display
labels.tilted = TRUE
}
myat = data_freq_valid[, (xleft+xright)/2]
exp_value_right = ceiling(exp(x_all + 1))
# Formatting
exp_value_format = formatAxisValue(exp_value)
exp_value_right_format = formatAxisValue(exp_value_right)
label_displayed = paste0("[", exp_value_format, "; ", exp_value_right_format, ")")
# finding the location
if(labels.tilted){
info_tilt = xaxis_biased(at = myat, line.max = line.max, labels = label_displayed)
} else {
xaxis_labels(at = myat, labels = label_displayed)
}
} else {
# we draw the axes with nice display
moreLine = any(data_freq$isOther) * .25
# Displaying the ticks "all at once" (including the last one)
mysep = (data_freq$xleft[2] - data_freq$xright[1]) / 2
exp_value_right = ceiling(exp(x_all + 1))
# on the right
myat = data_freq_valid$xright + mysep
# We add the first tick on the left
data_first = data_freq_valid[x_nb == 1]
myat = c(data_first$xleft - mysep, myat)
# exp_value = c(ceiling(exp(data_first$x_num - 1)), exp_value)
first_val = ceiling(exp(data_first$x_num - 1))
first_val[data_first$x_num == -1] = 0
exp_value = c(first_val, exp_value_right)
exp_value_format = formatAxisValue(exp_value)
# tick location
# 1) the ticks
axis(1, at = myat, labels = NA, lwd.ticks = 1, lwd = 0, line = moreLine)
# 2) The labels
# Tilted labels not implemented for this axis
if(delayLabelsTilted){
if(strwidth(paste0(exp_value_format, collapse = " ")) / diff(get_x_lim()) > 0.9){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
}
}
if(labels.tilted){
xaxis_biased(at = myat, labels = exp_value_format, yadj = 2, angle = 25)
} else {
axis(1, at = myat, labels = exp_value_format, line = moreLine, lwd = 0)
}
# for extra ticking when info is ambiguous
tck_location = par("usr")[3] - strheight("W")*81/100
qui = which(exp_value < 10)
if(length(qui) > 0){
# tick location
axis(2, at = tck_location - moreLine, pos = myat[qui], labels = NA, tcl = 0.15, xpd = TRUE)
}
}
# Now the "other" ticks
if(any(data_freq$isOther)){
data_freq_other = data_freq[isOther == TRUE]
if(sorted){
text2show = rep("Other", nrow(data_freq_other))
my_font = 1
} else {
text2show = rep(">", nrow(data_freq_other))
text2show[grepl("<", data_freq_other$otherValue)] = "<"
my_font = 2
}
myat = data_freq_other[, (xleft + xright)/2]
if(sorted && labels.tilted){
xaxis_biased(at = myat, line.max = line.max, labels = text2show, angle = info_tilt$angle, cex = info_tilt$cex)
} else {
axis(1, at = myat, labels = text2show, lwd = 0, line = -0.8, font = my_font)
}
}
} else {
#
# formatting of the values
#
x_unik = at_info[x_nb %in% 1:nbins, x]
x_cases = length(x_unik)
myat = at_info[x_nb %in% 1:nbins, mid_point]
exp_value = ceiling(exp(x_unik))
exp_value[x_unik == -1] = 0
if(is.unsorted(x_unik) || x_cases == 1 || any(diff(x_unik) != 1)){
exp_value_right = ceiling(exp(x_unik + 1))
# Formatting
exp_value_format = substr(exp_value, 1, 7)
exp_value_right_format = substr(exp_value_right, 1, 7)
# finding the location
location = xaxis_labels(at = myat, labels = paste0("[", exp_value_format, "; ", exp_value_right_format, "["), only.params = TRUE)
# drawing
for(i in 1:length(x_unik)){
value = substitute(group("[",list(x1, x2),")"), list(x1 = formatAxisValue(exp_value[i]), x2 = formatAxisValue(exp_value_right[i])))
axis(1, at = myat[i], lwd = 0, labels = value, cex = location$cex, line = 1 + location$line[i])
}
} else {
# we draw the axes with nice display
moreLine = (ADD_OTHER || ADD_OTHER_LEFT) * .25
tck_location = par("usr")[3] - strheight("W")*81/100
# Displaying the ticks "all at once" (including the last one)
val = c(exp_value, ceiling(exp(tail(x_unik, 1) + 1)))
exp_value_format = formatAxisValue(val)
# tick location
loc = (1:length(val)-1)*(moderator_cases+sep) - sep/2
axis(1, at = loc, labels = exp_value_format, line = moreLine, lwd.ticks = 1, lwd = 0)
for(i in 1:x_cases){
# tick location
loc = (i-1)*(moderator_cases+sep) - sep/2
# Ticks showing "strictly" inferior
if(x_unik[1] < 2){
axis(2, at = tck_location - moreLine, pos = loc, labels = NA, tcl = 0.15, xpd = TRUE)
}
}
}
# We find the right cex to fit the "other" group
unitary_width = par("pin")[1] / usr_width(xlim)
too_large = function(text, the_cex, the_shift) strwidth(text, units = "in", cex = the_cex)/2 - the_shift*unitary_width > unitary_width*(1/2+sep/2*1.5)
if(ADD_OTHER){
axis(1, at = at_info[x_nb == nbins + 1, mid_point], line = -0.8, labels = ">", lwd = 0, font = 2)
}
if(ADD_OTHER_LEFT){
axis(1, at = at_info[x_nb == 0, mid_point], line = -0.8, labels = "<", lwd = 0, font = 2)
}
}
} else if(numLabel){
# moderator > 1 + split + numeric axis
current_lim = get_x_lim()
for(i in 1:moderator_cases){
x_sub = data_freq[moderator == moderator_unik[i]]
real_range = c(min(x_sub$xleft_real), max(x_sub$xright_real))
x_show = pretty(real_range, 3)
x_show = x_show[x_show >= real_range[1] & x_show <= real_range[2]]
current_range = c(min(x_sub$xleft), max(x_sub$xright))
x_show_current = to01(x_show, real_range) * diff(current_range) + min(current_range)
axis(1, x_show_current, formatAxisValue(x_show), lwd = 0, lwd.ticks = 1)
axis(1, current_range, NA, lwd = 1, lwd.ticks = 0)
}
# We add the bin information
if(noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
# now the Other
if(any(data_freq$isOther)){
data_freq_other = data_freq[isOther == TRUE]
# we get the right cex
w = data_freq_other[1, xright - xleft]
max_lab = data_freq_other$otherValue[which.max(strwidth(data_freq_other$otherValue))]
minCex = 0.7
myCex = 1
while(myCex > minCex && strwidth(max_lab, cex = myCex) > w){
myCex = myCex * 0.95
}
axis(1, at = data_freq_other[, (xleft + xright)/2], labels = data_freq_other$otherValue, lwd = 0, line = -0.8, cex.axis = myCex)
}
} else if(numAxis){
# We add the bin information
if(noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
myBox(1)
axis_at = axis(1, lwd = 0, labels = NA)
if(ADD_OTHER){
# the axis without last tick
at = axis_at[1:(length(axis_at) - 1)]
if(abs(bin.size) > 1e4){
axis(1, at, formatAxisValue(at))
} else {
axis(1, at)
}
# the "other" text
axis(1, at = at_info[x_nb == nbins + 1, mid_point], line = -1, labels = otherText, lwd = 0)
} else {
if(abs(bin.size) > 1e4){
axis(1, axis_at, formatAxisValue(axis_at))
} else {
axis(1, axis_at)
}
}
if(ADD_OTHER_LEFT){
# the "other" text
axis(1, at = at_info[x_nb == 0, mid_point], line = -1, labels = otherTextLeft, lwd = 0)
}
# ticks at the right place
if(moderator_cases > 1){
# we add the rectangles to replicate a histogram
# axis(1, at = data_freq[moderator_nb == 1, xleft], labels = NA, lwd = 0, lwd.ticks = 1, tcl = -0.45)
rect_info = data_freq[, list(xleft = min(xleft), xright = max(xright), ytop = max(ytop)), by = x_nb]
rect(xleft = rect_info$xleft, ybottom = 0, xright = rect_info$xright, ytop = rect_info$ytop, density = 0, lty = 2)
}
} else if(DO_SPLIT){
# we need to display all xs
# We add the bin information => specific case: max first + numeric data
if(binned_data && noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
data_freq[, mid_point := (xleft + xright) / 2]
myLabels = data_freq$x
myAt = data_freq$mid_point
if(checkNotTilted){
# If very short labels => we don't tilt them // allows to reintroduce xlab
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(length(unique(axis_info$line)) == 1){
labels.tilted = FALSE
} else {
labels.tilted = TRUE
}
}
if(labels.tilted){
xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
} else {
xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method)
}
} else if(isNum){
# we can have the "other" column both left and right
x_unik = at_info[x_nb %in% 1:nbins, x]
myLabels = x_unik
myAt = at_info[x_nb %in% 1:nbins, mid_point]
info_axis = NULL
if(labels.tilted == FALSE && mean(diff(x_unik)) == 1){
# This is a "normal" axis
# everything number follows, this is fine
# # we still add the xlab we removed before (we could not anticipate the result)
# if(checkForTilting){
# if(noXlab) title(xlab = x_name)
# }
axis(1, myAt, labels = myLabels)
} else {
if(checkForTilting){
# If normal axis does not fit => tilt
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(axis_info$failed){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
# # we add the label only if the user didn't provide it before
# if(noXlab) title(xlab = x_name)
}
}
if(labels.tilted){
info_axis = xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
} else {
info_axis = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
}
}
if(ADD_OTHER){
# the "other" text
if(is.null(info_axis)){
axis(1, at = at_info[x_nb == nbins + 1, mid_point] + strwidth("> "), line = -0.85, labels = otherText, lwd = 0, cex.axis = 1)
} else if(labels.tilted){
xaxis_biased(at = at_info[x_nb == nbins + 1, mid_point], labels = otherText, angle = info_axis$angle, cex = info_axis$cex, trunc = trunc, trunc.method = trunc.method)
} else {
all_lines = sort(unique(info_axis$line))
my_line = min(info_axis$line[info_axis$line != tail(info_axis$line, 1)])
axis(1, at = at_info[x_nb == nbins + 1, mid_point] + strwidth("> ", cex = info_axis$cex), line = my_line, labels = otherText, lwd = 0, cex.axis = info_axis$cex)
}
}
if(ADD_OTHER_LEFT){
# the "other" text on the left
if(is.null(info_axis)){
axis(1, at = at_info[x_nb == 0, mid_point] - strwidth("< "), line = -0.85, labels = otherTextLeft, lwd = 0)
} else if(labels.tilted){
xaxis_biased(at = at_info[x_nb == 0, mid_point], labels = otherTextLeft, angle = info_axis$angle, cex = info_axis$cex, trunc = trunc, trunc.method = trunc.method)
} else {
my_line = min(info_axis$line[info_axis$line != info_axis$line[1]])
axis(1, at = at_info[x_nb == 0, mid_point] - strwidth("< ", cex = info_axis$cex), line = my_line, labels = otherTextLeft, lwd = 0, cex.axis = info_axis$cex)
}
}
} else {
if(ADD_OTHER){
nbins = nbins + 1
at_info$x[nbins] = otherText
}
x_unik = at_info[x_nb %in% 1:nbins, x]
myLabels = x_unik
myAt = at_info[x_nb %in% 1:nbins, mid_point]
if(checkForTilting){
# If normal axis does not fit => tilt
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(axis_info$failed){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
# # we add the label only if the user didn't provide it before
# if(noXlab) title(xlab = x_name)
}
}
# We also add ticks every 5/10 bins to help counting
if(missnull(at_5)){
at_5 = ifelse(max(at_info$x_nb) > 10, TRUE, FALSE)
if(at_5) {
at_5 = ifelse(labels.tilted, "line", "roman")
}
} else {
at_5 = at_5[1]
}
if(at_5 == "roman"){
roman_dict = toupper(c("v", "x", "xv", "xx", "xxv", "xxx", "xxxv", "xL", "xLv", "L", "Lv", "Lx", "lxv", "lxx", "lxxv", "lxxx", "lxxxv", "xc", "xcv", "c"))
names(roman_dict) = (1:20) * 5
}
if(at_5 != "FALSE"){
qui = which(at_info$x_nb %% 5 == 0)
# qui = qui[qui != nrow(at_info)]
if(length(at_info)){
# quoi data_freq[qui]
for(i in qui){
if(at_5 == "roman"){
axis(1, at = at_info[i, mid_point], lwd = 0, labels = roman_dict[as.character(i)], line = -1.3, cex.axis = 0.6)
} else {
axis(1, at = data_freq[i, list(xleft, xright)], lwd = 3, lwd.ticks = 0, labels = NA)
}
}
}
}
if(labels.tilted){
info_axis = xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max, line.min = 0.45 * (at_5 == "roman"))
} else {
info_axis = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, line.min = 0.25 * (at_5 == "roman"))
}
}
#
# yaxis and topping ####
#
if(xlab != ""){
title(xlab = xlab, line = mbot$xlab.line)
}
if(sub != ""){
title(sub = sub, line = mbot$sub.line)
}
if(yaxis.show){
# we get the "nice" points display
y_points = axis(2, lwd=0, col.axis = 0)
# We don't go above 100%
y_points = y_points[y_points <= 100]
if(yaxis.num){
# x1 = data_freq[x_nb == 1][1, ]
x1 = data_freq[ytop > 0][1, ]
y_labels = formatAxisValue(y_points / x1$ytop * x1$value)
} else {
y_labels = paste0(y_points, "%")
}
# we find the proper width
if("ylab" %in% names(dots)){
ylab = dots$ylab
}
if(nchar(ylab) == 0){
myWidth = strwidth("7775%")
} else {
myWidth = strwidth("75%")
}
# We find the right cex
minCex = 0.7
myCex = 1
y_labels_max = y_labels[which.max(strwidth(y_labels))]
while(strwidth(y_labels_max, cex = myCex) > myWidth && myCex >= minCex){
myCex = 0.95 * myCex
}
axis(2, at=y_points, labels = y_labels, las = 2, cex.axis = myCex)
}
# Legend if needed
if(isLegend){
#
# finding the cex of the legend => depends on the longuest character
#
# legendFit("top", moderator_unik, bg="white", bty="o", box.col="white", fill=col[1:moderator_cases])
legendFit("top", moderator_unik, fill=col[1:moderator_cases], title = mod.title, title_out = TRUE, trunc = info_legend$trunc)
}
# On top of the bars:
if(top != "none"){
qui = strwidth(top.value2display, units = "in", cex = top.cex) <= unitary_width_top*1.5 | nchar(top.value2display) <= 3
# we don't display the 0s only
qui = qui & info_top$value > 0
# if(top.cex < 0.95 && top == "frac"){
# # we don't display the small numbers
# max_height = max(info_top$ytop)
# qui = qui & (info_top$ytop > max_height*0.05 | info_top$share_top >= 1)
# }
if(any(qui)){
# text(data_freq[, (xleft+xright)/2][qui], data_freq$ytop[qui], labels = top.value2display[qui], pos = 3, cex = top.cex, offset = 0.5 * top.cex * 0.9)
text(info_top$x_mid[qui], info_top$ytop[qui], labels = top.value2display[qui], pos = 3, cex = top.cex, offset = 0.5 * top.cex * 0.9)
}
if(DO_STACK && nrow(info_top_other) > 0){
text(info_top_other$x_mid[qui], info_top_other$ytop[qui], labels = top.value2display_other[qui], pos = 3, cex = top.cex_other, offset = 0.5 * top.cex_other * 0.9)
}
}
return(invisible(data_freq))
}
#' Display means conditionnally on some other values
#'
#' The typical use of this function is to represents trends of average along some
#' categorical variable.
#'
#' @inheritParams plot_distr
#'
#' @param fml A formula of the type \code{variable ~ time | moderator}. Note that
#' the moderator is optional. Can also be a vector representing the elements of
#' the variable If a formula is provided, then you must add the argument \sQuote{data}.
#' You can use multiple variables. If so, you cannot use a moderator at the same time.
#' @param data Data frame containing the variables of the formula. Used only if the
#' argument \sQuote{fml} is a formula.
#' @param time Only if argument \sQuote{fml} is a vector. It should be the vector
#' of \sQuote{time} identifiers to average over.
#' @param moderator Only if argument \sQuote{fml} is a vector. It should be a vector
#' of conditional values to average over. This is an optional parameter.
#' @param fun Function to apply when aggregating the values on the time variable.
#' Default is \code{mean}.
#' @param mod.select Which moderators to select. By default the top 5 moderators
#' in terms of frequency (or in terms of the value of fun in case of identical
#' frequencies) are displayed. If provided, it must be a vector of moderator values
#' whose length cannot be greater than 10. Alternatively, you can put an integer between 1 and 10.
#' @param smoothing_window Default is 0. The number of time periods to average over.
#' Note that if it is provided the new value for each period is the average of
#' the current period and the \code{smoothing_window} time periods before and after.
#' @param col The colors. Either a vector or a keyword (\dQuote{Set1} or \dQuote{paired}).
#' By default those are the \dQuote{Set1} colors colorBrewer. This argument is
#' used only if there is a moderator.
#' @param lty The line types, in the case there are more than one moderator.
#' By default it is equal to 1 (ie no difference between moderators).
#' @param pch The form types of the points, in the case there are more than one
#' moderator. By default it is equal to \code{c(19, 17, 15, 8, 5, 4, 3, 1)}.
#' @param pt.cex Default to 2. The \code{cex} of the points.
#' @param lwd Default to 2. The width of the lines.
#' @param legend_options A list containing additional parameters for the function
#' \code{\link[graphics]{legend}} -- only concerns the moderator. Note that you can
#' set the additionnal arguments \code{trunc} and \code{trunc.method} which relates
#' to the number of characters to show and the truncation method. By default the
#' algorithm truncates automatically when needed.
#' @param ... Other arguments to be passed to the function \code{plot}.
#'
#' @author Laurent Berge
#'
#' @return
#' This function returns *invisibly* the output data.table containing the processed data
#' used for plotting.
#'
#' @examples
#'
#' data(airquality)
#'
#' plot_lines(Ozone ~ Day, airquality)
#'
#' plot_lines(Ozone ~ Day | Month, airquality)
#'
#' plot_lines(Ozone ~ Month | cut(Day, 8), airquality)
#'
#'
plot_lines = function(fml, data, time, moderator, mod.select, mod.NA = TRUE,
smoothing_window = 0, fun, col = "set1", lty = 1,
pch = c(19, 17, 15, 8, 5, 4, 3, 1), legend_options = list(),
pt.cex = 2, lwd = 2, dict = NULL, mod.title = TRUE, ...){
# This functions plots the means of x wrt the id
# we can also add a moderator
# DT VARS
moderator_cases = xleft = xright = ybottom = ytop = value = NULL
check_arg(fml, "formula right(,2) | vector mbt")
# old params
style = "line"
outCol = "black"
# style = match.arg(style)
mc = match.call()
fml_in = fml
check_arg(smoothing_window, "integer scalar GE{0}")
check_arg(mod.NA, "logical scalar")
check_arg(dict, "null named character vector | logical scalar")
# The color
if(length(col) == 1){
check_value_plus(col, "match(set1, paired) | scalar")
if(col == "set1"){
col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00",
"#FFFF33", "#A65628", "#F781BF", "#999999")
} else if(col == "paired"){
col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C",
"#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
}
}
#
# Extracting the variables ####
#
moderator_name = ""
if(inherits(fml_in, "formula")){
# Control of the formula
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
vars = all.vars(fml_in)
if(any(!vars %in% names(data))){
stop("The variable", enumerate_items(setdiff(vars, names(data)), "s.is")," not in the data set (", deparse(mc$data), ").")
}
# Creation of x and the condition
if(!length(fml_in) == 3){
stop("The formula must be of the type 'var ~ time' or 'var ~ time | moderator'.")
}
info = extract_pipe(fml_in)
lhs_fml = info$lhs_fml
fml = info$fml
pipe = info$pipe
x = extract_df(lhs_fml, data)
time = eval(fml[[3]], data)
moderator = eval(pipe, data)
# other info
MULTI_VAR = FALSE
if(length(x) > 1){
# multiple variables to plot
MULTI_VAR = TRUE
if(length(moderator) > 1){
stop("You cannot use a moderator and multiple variables at the same time. If you intended to use one variable only, wrap it into I(): I(", deparse(lhs_fml[[2]]), ").")
}
# check
for(i in 1:length(x)){
if(is.logical(x[[i]])){
x[[i]] = x[[i]] * 1
} else if(!is.numeric(x[[i]])){
stop("All variables must be numeric, but variable '", names(x)[i], "' isn't.")
}
}
if("Frequency" %in% names(x) && missing(fun)){
fun = sum
fun_name = "sum"
} else if(missing(fun)){
fun_name = "mean"
} else {
fun_name = deparse(substitute(fun))
}
# We need to stack the data
n = length(x[[1]])
all_x_names = dict_apply(names(x), dict)
# x_name = paste0(fun_name, "(Variables)")
x_name = substitute(paste(f, phantom(1), group("(", italic(Variables), ")")), list(f = fun_name))
moderator = factor(rep(all_x_names, each = n), levels = all_x_names)
time = rep(time, length(x))
x = unlist(x)
if(missing(mod.title)){
mod.title = "Variables"
}
moderator_name = ""
} else {
x = unlist(x)
if(is.logical(x)) x = x * 1
x_name = deparse(fml[[2]])
if(!is.numeric(x)) stop("The variable to plot must be numeric, currently '", x_name, "' isn't.")
if(x_name == "1"){
x_name = "Frequency"
fun = sum
}
}
# if(length(x) == 1){
# # this is a shortcut to say that we want to display the frequency
# fun = sum
# x_name = "Frequency"
# x = rep(1, length(time))
# } else {
# x_name = deparse(fml[[2]])
# }
if(is.null(moderator)){
moderator = rep(1, length(x))
} else if(MULTI_VAR == FALSE){
moderator_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
}
time_name = deparse(fml[[3]])
} else {
# Wow, the next two lines replace 10 lines of code, and error-handling is much better done
check_arg(time, "vector len(data) mbt", .data = fml)
check_arg(moderator, "safe NULL vector len(data)", .data = fml)
x = fml_in
if(!missnull(moderator)){
moderator_name = deparse(substitute(moderator))
} else {
moderator = rep(1, length(x))
}
x_name = deparse(substitute(fml_in))
time_name = deparse(substitute(time))
# 14 lignes
# if(missing(time)){
# stop("You must provide the argument 'time'.")
# } else {
# x = fml_in
#
# if(length(x) != length(time)){
# stop("The arguments 'x' and 'time' must be of the same length.")
# }
#
# if(!missnull(moderator)){
# if(length(x) != length(moderator)){
# stop("If provided, the argument 'moderator' must be of the same length of 'x'.")
# }
# moderator_name = deparse(substitute(moderator))
# } else {
# moderator = rep(1, length(x))
# }
#
# # other info
# x_name = deparse(substitute(fml_in))
# time_name = deparse(substitute(time))
# }
# 22 lignes
}
# Naming
if(!is.call(x_name)) x_name = dict_apply(x_name, dict)
time_name = dict_apply(time_name, dict)
moderator_name = dict_apply(moderator_name, dict)
# Dropping NAs
quiNA_x = is.na(x)
quiNA_time = is.na(time)
quiNA_mod = is.na(moderator)
if(mod.NA){
quiNA = quiNA_x | quiNA_time
total_NA = sum(quiNA | quiNA_mod)
if(is.factor(moderator)){
moderator = addNA(moderator)
} else {
# The following will coerce moderator into character
moderator[is.na(moderator)] = "NA"
}
} else {
quiNA = quiNA_x | quiNA_time | quiNA_mod
total_NA = sum(quiNA)
}
if(total_NA > 0){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_time))
msg_na = paste0(c("x: ", "moderator: ", "time: "), nb_na)
message("NOTE: ", total_NA, " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ")")
x = x[!quiNA]
moderator = moderator[!quiNA]
time = time[!quiNA]
}
# We first check the function is valid
if(missing(fun)){
fun = mean
} else {
if(!is.function(fun)){
stop("Argument 'fun' must be a function. A the moment its class is ", class(fun)[1], ".")
}
test = try(fun(head(x, 5)), silent = TRUE)
if(inherits(test, "try-error")){
stop("Evaluation of the function '", deparse_long(mc$fun), "' leads to an error\n", test)
} else if(length(test) > 1){
stop("The function in argument 'fun' MUST be an aggregating function: that is returning something of length 1. This is not the case currently.")
}
}
n_moderator = length(unique(moderator))
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
if(anyNA(moderator_unik)){
# This is such a pain in the neck...
moderator_unik[is.na(moderator_unik)] = "NA"
moderator = factor(unclass(moderator), labels = moderator_unik)
}
} else {
moderator_unik = sunique(as.character(moderator))
}
if(n_moderator == 1){
isLegend = FALSE
} else {
isLegend = TRUE
}
# mod.select
if(n_moderator > 1){
n_select = NA
do_recreate = FALSE
if(missing(mod.select)){
if(n_moderator <= 8){
# FINE!
} else {
# We select the top 5
n_select = 5
}
} else if(length(mod.select) > 10){
stop("The argument 'mod.select' (currently of length ", length(mod.select), ") cannot be of a length greater than 10.")
} else if(length(mod.select) == 1 && is.numeric(mod.select) && mod.select %% 1 == 0){
# Possibly a number
if(mod.select %in% 1:10){
n_select = mod.select
} else {
stop("Argument 'mod.select' does not have a valid value: if it is an integer, its value must lie between 1 and 10.")
}
} else {
# mod.select must contain moderator values
# 1) checking the problems
mod_pblm = setdiff(mod.select, moderator_unik)
if(length(mod_pblm) > 0){
# We recreate mod.select
if(length(mod_pblm) != length(mod.select) && any(quiNA)){
# means some are OK
suggestion = " Maybe because it has only NA values?"
} else {
suggestion = " It must be either a moderator value, or a regular expression to select moderator values."
}
# Maybe they're regular expressions => we keep the order desired by the user
mod.ok = c()
for(r in mod.select){
if(r %in% moderator_unik){
mod.ok = c(mod.ok, r)
} else {
qui = grepl(r, moderator_unik)
n_qui = sum(qui)
if(n_qui == 0){
stop("In the argument 'mod.select', the value ", r, " could not be found in the moderator variable.", suggestion)
} else if(n_qui > 10){
stop("In the argument 'mod.select', the value ", r, " leads to more than 10 moderator values to be selected (e.g. ", enumerate_items(moderator_unik[qui], nmax = 15), ".")
}
mod.ok = c(mod.ok, moderator_unik[qui])
}
}
mod.ok = unique(mod.ok)
if(length(mod.ok) > 10){
stop("Argument mod.select leads to ", length(mod.ok), " moderator values to be selected. But the maximum is 10.")
}
mod.select = mod.ok
}
# 2) selection
do_recreate = TRUE
}
# AUTOMATIC selection
if(!is.na(n_select) && n_moderator > n_select){
# We proceed with the "automatic" selection
info_mod = data.table(moderator = moderator)
info_mod = info_mod[, list(value = .N), by = moderator]
if(var(info_mod$value) == 0){
# same freq for all
info_mod = data.table(x = x, moderator = moderator)
info_mod = info_mod[, list(value = fun(x)), by = moderator]
info_method = paste0("the max of ", deparse(mc$fun)[1], ".")
} else {
info_method = "frequency."
}
# We inform on the method for the choice
message(ifelse(n_select == 1, "The moderator was", paste0("The ", n_select, " moderators were")), " chosen based on ", info_method)
info_mod = info_mod[order(-value)]
mod.select = info_mod[1:n_select, moderator]
do_recreate = TRUE
if(n_select == 1){
isLegend = FALSE
}
}
if(mod.NA){
if(missing(mod.select)){
do_recreate = FALSE
} else {
mod.select = unique(c(mod.select, "NA"))
do_recreate = TRUE
}
}
if(do_recreate){
# We recreate the variables
qui_select = which(moderator %in% mod.select)
# we recreate all the values
x = x[qui_select]
moderator = moderator[qui_select]
time = time[qui_select]
# Re-Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
} else {
moderator_unik = sunique(moderator)
}
n_moderator = length(moderator_unik)
}
}
#
# Aggregation
#
quoi = data.table(x=x, time=time, moderator=moderator)
base_agg = quoi[!is.na(x), list(x = fun(x)), by = list(time, moderator)]
#
# Preparation
#
dots = list(...)
# ylim
if(!is.null(dots$ylim)){
ylim = dots$ylim
} else if(style == "line"){
ylim = range(base_agg[!is.na(moderator), x])
} else if(style == "bar"){
rx = range(base_agg[!is.na(moderator), x])
ylim = c(min(0, rx[1]), max(0, rx[2]))
}
ymax = Inf
if(isLegend){
ymax = ylim[2]
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
info_legend = legendFit(legend = moderator_unik, title = mod.title, plot = FALSE)
legend_height = info_legend$total_height
ylim[2] = ylim[2] + (legend_height + 0.25) / par("pin")[2] * diff(ylim)
}
listDefault(dots, "ylim", ylim)
is_ylab = "ylab" %in% names(dots)
my_ylab = ifelse(is_ylab, dots$ylab, "")
dots$ylab = ""
# listDefault(dots, "ylab", x_name)
listDefault(dots, "xlab", time_name)
dots$type = "n"
dots$y = rep(1, length(unique(base_agg$time)))
makeAxes = FALSE
if(is.numeric(base_agg$time)){
dots$x = sunique(base_agg$time)
} else {
# for character vectors, we need to recreate the xaxis
if(is.null(dots$axes) || dots$axes){
makeAxes = TRUE
}
dots$axes = FALSE
if(is.factor(base_agg$time)){
time_unik = levels(unique(base_agg$time)[, drop=TRUE])
dict_time = 1:length(time_unik)
names(dict_time) = time_unik
dots$x = 1:length(time_unik)
base_agg[, time := dict_time[as.character(time)]]
} else {
time_unik = sunique(base_agg$time)
dots$x = quickUnclassFactor(time_unik)
base_agg[, time := quickUnclassFactor(time)]
}
}
if(style == "bar"){
total_width = min(diff(sort(unique(base_agg$time))))
bar_width = total_width / (n_moderator + 1)
if(is.null(dots$xlim)){
total_width = min(diff(sort(unique(base_agg$time))))
dots$xlim = c(min(base_agg$time) - total_width * 0.5 + bar_width/2, max(base_agg$time) + total_width * 0.5 - bar_width/2)
}
} else {
if(smoothing_window > 0){
quoi = sort(dots$x)
if(length(quoi) < 2*smoothing_window + 2){
stop("To smooth with smoothing_window=", smoothing_window, " you need at least ", 2*smoothing_window + 2, " periods. Yet at the moment there is only ", length(quoi), " periods.")
}
dots$xlim = quoi[c(smoothing_window + 1, length(quoi) - smoothing_window)]
}
}
do.call("plot", dots)
dots$axes = NULL # not a graphical parameter afterwards
# ylab: because of problems with calls
if(is_ylab){
title(ylab = my_ylab)
} else {
title(ylab = x_name)
}
if(style == "bar"){
hgrid(ymax = ymax)
} else {
hgrid(ymax = ymax)
vgrid(ymax = ymax)
# grid(col = "darkgray")
}
if(makeAxes){
box()
axis(2)
axis(1, dots$x, time_unik)
}
#
# The plot_line
#
all_col = col[1 + (0:(n_moderator-1))%%length(col)]
all_lty = lty[1 + (0:(n_moderator-1))%%length(lty)]
all_pch = pch[1 + (0:(n_moderator-1))%%length(pch)]
if(style == "line"){
for(i in 1:n_moderator){
dots$col = all_col[i]
dots$lty = all_lty[i]
dots$pch = all_pch[i]
dots$cex = pt.cex
dots$add = TRUE
dots$smoothing_window = smoothing_window
dots$x = base_agg[moderator == moderator_unik[i], time]
dots$y = base_agg[moderator == moderator_unik[i], x]
do.call("plot_line", dots)
}
} else if(style == "bar"){
# Style is barplot
base_agg[, moderator_cases := quickUnclassFactor(moderator)]
base_agg[, xleft := time - total_width/2 + bar_width/2 + bar_width*(moderator_cases - 1)]
base_agg[, xright := xleft + bar_width]
base_agg[, ybottom := pmin(0, x)]
base_agg[, ytop := pmax(0, x)]
# the call to rect
all_outcol = outCol[1 + (0:(n_moderator-1))%%length(outCol)]
dots$xleft = base_agg$xleft
dots$ybottom = base_agg$ybottom
dots$xright = base_agg$xright
dots$ytop = base_agg$ytop
if(n_moderator > 1){
dots$col = all_col[base_agg$moderator_cases]
dots$border = all_outcol[base_agg$moderator_cases]
} else {
dots$col = col
dots$border = outCol
}
dots = dots[names(dots) %in% names(formals(rect))]
do.call("rect", dots)
}
#
# Legend
#
if(n_moderator > 1 && isLegend){
if(!is.list(legend_options)){
stop("Argument'legend_options' must be a list.")
}
# mandatory
legend_options$legend = moderator_unik
if(style == "line"){
legend_options$col = all_col
legend_options$lty = all_lty
legend_options$pch = all_pch
legend_options$pt.cex = pmin(pt.cex, 1.5)
legend_options$lwd = lwd
} else if(style == "bar"){
legend_options$fill = all_col
}
legend_options$title = mod.title
legend_options$trunc = info_legend$trunc
# options
listDefault(legend_options, "x", "top")
# the call
do.call("legendFit", legend_options)
}
invisible(base_agg)
}
#' Boxplots with possibly moderators
#'
#' This function allows to draw a boxplot, with possibly separating different moderators.
#'
#' @inheritParams plot_distr
#'
#' @param fml A numeric vector or a formula of the type:
#' \code{vars ~ moderator_1 | moderator_2}. Note that if a formula is provided then
#' the argument \sQuote{data} must be provided. You can plot several variables,
#' if you don't want a moderator, use 1 instead: e.g.
#' \code{plot_box(Petal.Width +Petal.Length ~ 1, iris)}. You can plot all numeric
#' variables from a data set using \code{"."}: \code{plot_box(. ~ 1, iris)}.
#' @param data A data.frame/data.table containing the relevant information.
#' @param case When argument fml is a vector, this argument can receive a vector of cases.
#' @param moderator When argument fml is a vector, this argument can receive a vector of moderators.
#' @param inCol A vector of colors that will be used for within the boxes.
#' @param outCol The color of the outer box. Default is black.
#' @param pch The patch of the outliers. Default is 18.
#' @param addLegend Default is \code{TRUE}. Should a legend be added at the top
#' of the graph is there is more than one moderator?
#' @param lwd The width of the lines making the boxes. Default is 2.
#' @param outlier Default is \code{TRUE}. Should the outliers be displayed?
#' @param dict_case A named character vector. If provided, it changes the values
#' of the variable \sQuote{case} to the ones contained in the vector \code{dict_case}.
#' Example: to change the variable named "a" to "Australia" and "b" to "Brazil",
#' use \code{dict=c(a="Australia",b="Brazil")}.
#' @param dict_moderator A named character vector. If provided, it changes
#' the values of the variable \sQuote{moderator} to the ones contained in
#' the vector \code{dict_moderator}. Example: to change the variable
#' named "a" to "Australia" and "b" to "Brazil", use \code{dict=c(a="Australia",b="Brazil")}.
#' @param order_case Character vector. This element is used if the user wants the
#' \sQuote{case} values to be ordered in a certain way.
#' This should be a regular expression (see \code{\link[base]{regex}} help for more info).
#' There can be more than one regular expression. The variables satisfying
#' the first regular expression will be placed first, then the order follows
#' the sequence of regular expressions.
#' @param order_moderator Character vector. This element is used if the user wants
#' the \sQuote{moderator} values to be ordered in a certain way. This should be
#' a regular expression (see \code{\link[base]{regex}} help for more info).
#' There can be more than one regular expression. The variables satisfying the
#' first regular expression will be placed first, then the order follows the
#' sequence of regular expressions.
#' @param addMean Whether to add the average for each boxplot. Default is true.
#' @param mean.col The color of the mean. Default is darkred.
#' @param mean.cex The cex of the mean, default is 2.
#' @param mean.pch The patch of the mean, default is 18.
#' @param line.max Option for the x-axis, how far should the labels go.
#' Default is 1 for normal labels, 2 for tilted labels.
#' @param density The density of lines within the boxes. By default it is equal to -1,
#' which means the boxes are filled with color.
#' @param lty The type of lines for the border of the boxes. Default is 1 (solid line).
#' @param ... Other parameters to be passed to \code{plot}.
#'
#' @author Laurent Berge
#'
#' @return
#' Invisibly returns the coordinates of the x-axis.
#'
#' @examples
#'
#' # Simple iris boxplot
#' plot(1:10)
#'
#' # All numeric variables
#' plot_box(. ~ 1, iris)
#'
#' # All numeric variable / splitting by species
#' plot_box(. ~ Species, iris)
#'
#' # idem but with renaming
#' plot_box(. ~ Species, iris, dict = c(Species="Iris species",
#' setosa="SETOSA", Petal.Width="Width (Petal)"))
#'
#' # Now using two moderators
#' base = iris
#' base$period = sample(1:4, 150, TRUE)
#'
#' plot_box(Petal.Length ~ period | Species, base)
#'
#'
#'
#'
#'
plot_box = function(fml, data, case, moderator, inCol, outCol = "black", density = -1,
lty = 1, pch = 18, addLegend = TRUE, legend_options = list(),
lwd = 2, outlier, dict = NULL, dict_case, dict_moderator, order_case,
order_moderator, addMean, mean.col = "darkred", mean.pch = 18,
mean.cex = 2, mod.title = TRUE, labels.tilted, trunc = 20,
trunc.method = "auto", line.max, ...){
# DT VARS USED
case_nb = moderator_nb = span = q3 = q1 = min_whisker = y_min = max_whisker = y_max = NULL
fml_in = fml
mc = match.call()
# Controls
check_arg("logical scalar", addLegend, outlier, addMean, labels.tilted)
check_arg(order_moderator, order_case, "character vector", .message = "Argument '__ARG__' must be a vector of regular expressions.")
check_arg(trunc, "integer scalar GE{5}")
check_arg(dict, "null named character vector | logical scalar")
#
# Extracting the information
#
moderator_name = case_name = ""
if(inherits(fml_in, "formula")){
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
# we check the variables data are there
check_arg(fml, "ts formula left(1) right(,2)", .message = "Argument 'fml' must be a formula of the type variables ~ moderator_1 | moderator_2 ('moderator_1' [you can replace it by 1] and 'moderator_2' are optional).")
# "." is a special variable
vars = setdiff(all.vars(fml_in), ".")
if(any(!vars %in% names(data))){
stop("The variable", enumerate_items(setdiff(vars, names(data)), "s.is")," not in the data set (", deparse_long(mc$data), ").")
}
info = extract_pipe(fml_in)
fml = info$fml
pipe = info$pipe
case = eval(fml[[3]], data)
moderator = eval(pipe, data)
if(deparse(fml[[2]]) == "."){
x_all = as.data.frame(data)
qui_num = sapply(x_all, function(z) is.numeric(z) || is.logical(z))
if(length(case) > 1){
# There is a case variable => we drop it from the variables
case_name = deparse(fml[[3]])
qui_num = qui_num & !names(x_all) == case_name
}
if(!any(qui_num)){
stop("Only numeric variables can be displayed, and none were found in the data set.")
}
x_all = x_all[, qui_num]
} else {
x_all = extract_df(info$lhs_fml, data)
qui_num = sapply(x_all, function(z) is.numeric(z) || is.logical(z))
if(any(!qui_num)){
stop("Only numeric variables can be displayed, the variable", enumerate_items(names(x_all), "s.quote.is"), " not numeric.")
}
}
mod_is_set = FALSE
if(length(x_all) == 1){
x = x_all[[1]]
x_name = names(x_all)
if(length(case) > 1){
case_name = deparse(fml[[3]])
} else {
case = rep(x_name, length(x))
x_name = ""
}
} else {
if(is.data.frame(x_all)){
n = nrow(x_all)
K = ncol(x_all)
} else {
n = length(x_all[[1]])
K = length(x_all)
}
if(length(case) == 1){
# No case => the variables become the case
case = rep(names(x_all), each = n)
} else if(length(moderator) > 1){
stop("When plotting multiple variables, you cannot use a second moderator. This is possible only when plotting kust one variable.")
} else {
# case: the variables // moderator: the cases
mod_is_set = TRUE
moderator_name = deparse(fml[[3]])
moderator = rep(case, K)
case = rep(names(x_all), each = n)
}
x = unlist(x_all)
x_name = ""
case_name = ""
}
if(is.null(moderator)){
moderator = rep(1, length(x))
} else if(mod_is_set == FALSE){
moderator_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
}
# other info
# x_name = deparse(fml[[2]])
# case_name = deparse(fml[[3]])
} else {
if(missing(case)){
stop("You must provide the argument 'case'.")
} else {
x = fml_in
if(length(x) != length(case)){
stop("The arguments 'x' and 'case' must be of the same length.")
}
if(!missing(moderator) && !is.null(moderator)){
if(length(x) != length(moderator)){
stop("If provided, the argument 'moderator' must be of the same length of 'x'.")
}
moderator_name = clean_name(deparse(substitute(moderator)))
} else {
moderator = rep(1, length(x))
}
# other info
x_name = clean_name(deparse(substitute(fml)))
case_name = clean_name(deparse(substitute(case)))
}
}
# Dropping NAs
quiNA_x = is.na(x)
quiNA_mod = is.na(moderator)
quiNA_case = is.na(case)
quiNA = quiNA_x | quiNA_mod | quiNA_case
if(any(quiNA)){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_case))
msg_na = paste0(c("x: ", "moderator: ", "case: "), nb_na)
message("NOTE: ", sum(quiNA), " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ").")
x = x[!quiNA]
moderator = moderator[!quiNA]
case = case[!quiNA]
}
# Renaming: dict
x_name = dict_apply(x_name, dict)
case_name = dict_apply(case_name, dict)
moderator_name = dict_apply(moderator_name, dict)
#
# Aggregation
#
mod_num_logical = is.logical(moderator) || is.numeric(moderator) # used later
moderator = as.character(moderator)
CASE_FACTOR = FALSE
if(is.factor(case)){
CASE_FACTOR = TRUE
case_unik = levels(case[, drop = TRUE])
case = unclass(case[, drop = TRUE])
}
quoi = data.table(x=as.numeric(x), case=case, moderator=moderator)
delayAddMean = FALSE
if(missnull(addMean)){
addMean = TRUE
delayAddMean = TRUE
}
if(addMean){
base_agg = quoi[, list(y_min = min(x), q1 = quantile(x, 0.25), m = median(x), q3 = quantile(x, 0.75), y_max = max(x), avg = mean(x)), keyby = list(case, moderator)]
} else {
base_agg = quoi[, list(y_min = min(x), q1 = quantile(x, 0.25), m = median(x), q3 = quantile(x, 0.75), y_max = max(x)), keyby = list(case, moderator)]
}
#
# the coordinates
#
# putting case to number and formatting it
if(!missnull(order_case)){
case_unik = sunique(case)
for(var2order in rev(order_case)){
who = grepl(var2order, case_unik)
case_unik = c(case_unik[who], case_unik[!who])
}
base_agg[, case_nb := dict2number(case_unik, case)]
} else {
base_agg[, case_nb := quickUnclassFactor(case)]
if(CASE_FACTOR == FALSE){
what = unique(base_agg[, list(case, case_nb)])
what = what[order(case_nb)]
case_unik = what$case
}
}
aliasCase = dict_apply(case_unik, dict)
if(!missing(dict_case) && !is.null(dict_case)){
if(!is.character(dict_case) || is.null(names(dict_case))) stop("the arg. 'dict_case' must be a named character vector.")
qui = which(case_unik %in% names(dict_case))
aliasCase[qui] = dict_case[case_unik[qui]]
}
# putting moderator to number and formatting it
if(!missnull(order_moderator)){
moderator_unik = sunique(moderator)
for(var2order in rev(order_moderator)){
who = grepl(var2order, moderator_unik)
moderator_unik = c(moderator_unik[who], moderator_unik[!who])
}
base_agg[, moderator_nb := dict2number(moderator_unik, moderator)]
} else {
base_agg[, moderator_nb := quickUnclassFactor(moderator)]
moderator_unik = sunique(moderator)
}
aliasModerator = dict_apply(moderator_unik, dict)
if(!missing(dict_moderator) && !is.null(dict_moderator)){
if(!is.character(dict_moderator)|| is.null(names(dict_moderator))) stop("The argument 'dict_moderator' must be a named character vector.")
qui = which(moderator_unik %in% names(dict_moderator))
aliasModerator[qui] = dict_moderator[moderator_unik[qui]]
}
n_case = length(case_unik)
n_moderator = length(moderator_unik)
# The colors
if(missnull(inCol)){
if(n_moderator == 1){
inCol = "#1F78B4"
} else {
inCol = c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
}
}
# separation between the different cases:
sep = 0.30 + 0.15 * (n_moderator-1)
base_agg[, x := (case_nb-1)*(n_moderator+sep) + moderator_nb]
# Information on the length of the whiskers
base_agg[, span := 1.5 * (q3 - q1)]
base_agg[, min_whisker := pmax(y_min, q1 - span)]
base_agg[, max_whisker := pmin(y_max, q3 + span)]
# the legend
isLegend = FALSE
if(addLegend && n_moderator > 1){
isLegend = TRUE
}
# mod.title
# if(isLegend){
# if(missing(mod.title)){
# if(mod_num_logical){
# mod.title = moderator_name
# } else {
# mod.title = NULL
# }
# } else if(isTRUE(mod.title)){
# mod.title = moderator_name
# } else if(isFALSE(mod.title)){
# mod.title = NULL
# }
# }
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
#
# Preparation
#
dots = list(...)
# default for outlier
if(missnull(outlier)){
outlier = FALSE
if(diff(range(x)) < 2 * diff(range(c(base_agg$min_whisker, base_agg$max_whisker)))){
outlier = TRUE
}
}
# default for addMean
if(delayAddMean){
addMean = !outlier
}
# ylim
if(outlier){
ylim = range(quoi$x)
} else {
ylim = c(min(base_agg$min_whisker), max(base_agg$max_whisker))
}
ymax = Inf
if(isLegend){
ymax = ylim[2]
info_legend = legendFit(legend = aliasModerator, plot = FALSE, title = mod.title)
# hauteur_caractere = strheight("W", "in", cex = info_legend$cex)
# ylim[2] = ylim[2] + 2.5*hauteur_caractere / par("pin")[2] * diff(ylim)
total_height = info_legend$total_height
ylim[2] = ylim[2] + total_height / par("pin")[2] * diff(ylim)
}
if(missnull(labels.tilted)){
if(sum(nchar(aliasCase)) * strwidth("w", "in") > 2 * par("pin")[1]){
labels.tilted = TRUE
case_name = ""
} else {
labels.tilted = FALSE
}
} else if(labels.tilted){
case_name = ""
}
listDefault(dots, "ylim", ylim)
listDefault(dots, "ylab", x_name)
listDefault(dots, "xlab", case_name)
dots$type = "n"
isAxes = TRUE
if(!is.null(dots$axes)){
isAxes = dots$axes
}
dots$axes = FALSE
dots$x = dots$y = 1
# xlim
dots$xlim = range(base_agg$x) + c(-1, 1) * ifelse(nrow(base_agg) == 1, 1, 0.5)
do.call("plot", dots)
if(isAxes){
box()
las = dots$las
axis(2, las = las)
at_labels = 1 + (n_moderator-1)/2 + (0:(n_case-1))*(n_moderator+sep)
# axis(1, at = at_labels, labels = aliasCase, lwd = 0)
if(labels.tilted){
if(missnull(line.max)) line.max = 2 + 1
xaxis_biased(at = at_labels, labels = aliasCase, trunc = trunc, trunc.method = trunc.method, line.max=line.max)
} else {
if(missnull(line.max)) line.max = 1 + 1
xaxis_labels(at = at_labels, labels = aliasCase, trunc = trunc, trunc.method = trunc.method, line.max=line.max)
}
}
hgrid(ymax = ymax)
#
# The plot_line
#
if(n_moderator == 1){
all_inCol = inCol[1 + (0:(n_case-1))%%length(inCol)]
all_outCol = outCol[1 + (0:(n_case-1))%%length(outCol)]
all_densities = density[1 + (0:(n_case-1))%%length(density)]
} else {
all_inCol = inCol[1 + (0:(n_moderator-1))%%length(inCol)]
all_outCol = outCol[1 + (0:(n_moderator-1))%%length(outCol)]
all_densities = density[1 + (0:(n_moderator-1))%%length(density)]
}
for(i in 1:n_moderator){
quoi = base_agg[moderator == moderator_unik[i]]
if(n_moderator == 1){
box_single(x = quoi$x, quoi$y_min, quoi$q1, quoi$m, quoi$q3, quoi$y_max, width = 1, inCol = all_inCol, lwd = lwd, outCol = all_outCol, density = all_densities)
} else {
box_single(x = quoi$x, quoi$y_min, quoi$q1, quoi$m, quoi$q3, quoi$y_max, width = 1, inCol = all_inCol[i], lwd = lwd, outCol = all_outCol[i], density = all_densities[i])
}
# now the outliers
if(outlier){
for(j in 1:n_case){
if(CASE_FACTOR){
# values have been unclassed
quoi_case = quoi[case == j]
y = x[moderator == moderator_unik[i] & case == j]
} else {
quoi_case = quoi[case == case_unik[j]]
y = x[moderator == moderator_unik[i] & case == case_unik[j]]
}
qui_outlier = y > quoi_case$max_whisker | y < quoi_case$min_whisker
if(any(qui_outlier)){
points(rep(quoi_case$x, sum(qui_outlier)), y[qui_outlier], pch=pch)
}
}
}
if(addMean){
quoi = base_agg[moderator == moderator_unik[i]]
points(quoi$x, quoi$avg, col = mean.col, pch = mean.pch, cex = mean.cex)
}
}
#
# Legend
#
if(isLegend){
if(!is.list(legend_options)){
stop("Argument'legend_options' must be a list.")
}
# mandatory
legend_options$legend = aliasModerator
legend_options$fill = all_inCol
legend_options$title = mod.title
# options
listDefault(legend_options, "x", "top")
# the call
do.call("legendFit", legend_options)
}
invisible(1 + (n_moderator-1)/2 + (0:(n_case-1))*(n_moderator+sep))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.