Nothing
#----------------------------------------------#
# Author: Laurent Berge
# Date creation: Mon Sep 30 10:09:06 2019
# ~: user-visible visualization functions
#----------------------------------------------#
# load("data/us_pub_econ.RData")
# roxygen2::roxygenise(roclets = "rd")
#' Sets the defaults of plot_distr
#'
#' The default values of most arguments of \code{\link[fplot]{plot_distr}} can be
#' set with \code{setFplot_distr}.
#'
#' @inheritParams plot_distr
#'
#' @param reset Logical scalar, default is \code{FALSE}. Whether the defaults should be reset.
#'
#' @seealso
#' \code{\link[fplot]{plot_distr}}, \code{\link[fplot]{pdf_fit}}, \code{\link[fplot]{fit.off}}.
#'
#' @return
#' The function `setFplot_distr()` does not return anything, it only sets the default
#' parameters for the function [plot_distr()].
#'
#' The function `getFplot_distr()` returns a named list containing the arguments
#' that have been set with the function `setFplot_distr()`.
#'
#' @examples
#'
#' # Changing the default color set for plot_distr only
#' my_col = c("#36688D", "#F3CD05", "#F49F05", "#F18904", "#BDA589")
#'
#' setFplot_distr(col = my_col, mod.method = "split", border = NA)
#'
#' plot_distr(~ Petal.Length | Species, iris)
#'
#' # Back to normal
#' setFplot_distr(reset = TRUE)
#'
#' plot_distr(~ Petal.Length | Species, iris)
#'
#'
#'
setFplot_distr = function(sorted, log, top, yaxis.num, col, border = "black", mod.method, within, total, at_5, labels.tilted, other, cumul = FALSE, centered = TRUE, weight.fun, int.categorical, dict = NULL, mod.title = TRUE, labels.angle, cex.axis, trunc = 20, trunc.method = "auto", reset = FALSE){
# Function to set the defaults of plot_distr
fm_distr = formals(plot_distr)
arg_list = names(fm_distr)
# arg_no_default = c("fml", "data", "moderator", "weight", "nbins", "bin.size", "legend_options", "yaxis.show", "mod.select", "plot", "sep", "...")
# m = fm_distr[!names(fm_distr) %in% arg_no_default]
# cat(gsub(" = ,", ",", paste0(names(m), " = ", sapply(m, deparse), collapse = ", ")))
check_arg("logical scalar", sorted, log, yaxis.num, within, total, labels.tilted, other, cumul)
check_arg("logical scalar", centered, int.categorical, reset)
check_arg_plus(top, "match(frac, nb, none, FALSE)")
if(!missing(top) && top == "FALSE") top = "none"
check_arg(col, "vector")
check_arg(border, "NULL NA | vector(character, integer) NA OK")
check_arg_plus(mod.method, "match(side, split, stack)")
check_arg_plus(at_5, "match(roman, line, FALSE)")
check_arg(weight.fun, "function arg(1,)")
check_arg(dict, "null named character vector | logical scalar")
check_arg(labels.angle, "numeric scalar GE{10} LE{90}")
check_arg(cex.axis, "numeric scalar GT{0}")
check_arg(trunc, "integer scalar GT{5}")
check_arg_plus(trunc.method, "match(auto, mid, right)")
# Getting the existing defaults
opts = getOption("fplot_distr")
if(is.null(opts)){
opts = list()
} else if(!is.list(opts)){
warning("Wrong formatting of option 'fplot_distr', all options are reset.")
opts = list()
} else if(reset){
opts = list()
}
# Saving the default values
mc = match.call()
args_default = intersect(names(mc), arg_list)
# NOTA: we don't allow delayed evaluation => all arguments must have hard values
for(v in args_default){
opts[[v]] = eval(as.name(v))
}
options(fplot_distr = opts)
}
#' @rdname setFplot_distr
getFplot_distr = function(){
opts = getOption("fplot_distr")
if(!is.list(opts)){
warning("Wrong formatting of option 'fplot_distr', all options are reset.")
opts = list()
options(fplot_distr = opts)
}
opts
}
#' Plot distributions, possibly conditional
#'
#' This function plots distributions of items (a bit like an histogram) which can
#' be easily conditioned over.
#'
#' @param fml A formula or a vector. If a formula, it must be of the type:
#' \code{weights ~ var | moderator}. If there are no moderator nor weights, you
#' can use directly a vector, or use a one-sided formula \code{fml = ~var}. You
#' can use multiple variables as weights, if so, you cannot use moderators at the
#' same time. See examples.
#' @param data A data.frame: data set containing the variables in the formula.
#' @param moderator Optional, only if argument \code{fml} is a vector. A vector
#' of moderators.
#' @param weight Optional, only if argument \code{fml} is a vector. A vector of
#' (positive) weights.
#' @param sorted Logical: should the first elements displayed be the most frequent?
#' By default this is the case except for numeric values put to log or to integers.
#' @param log Logical, only used when the data is numeric. If \code{TRUE}, then
#' the data is put to logarithm beforehand. By default numeric values are put to
#' log if the log variation exceeds 3.
#' @param col A vector of colors, default is close to paired. You can also use \dQuote{set1}
#' or \dQuote{paired}.
#' @param border Outer color of the bars. Defaults is \code{"black"}. Use \code{NA}
#' to remove the borders.
#' @param nbins Maximum number of items displayed. The default depends on the number
#' of moderator cases. When there is no moderator, the default is 15, augmented
#' to 20 if there are less than 20 cases.
#' @param bin.size Only used for numeric values. If provided, it creates bins of
#' observations of size \code{bin.size}. It creates bins by default for numeric non-integer data.
#' @param legend_options A list. Other options to be passed to \code{legend} which
#' concerns the legend for the moderator.
#' @param yaxis.show Whether the y-axis should be displayed, default is \code{TRUE}.
#' @param yaxis.num Whether the y-axis should display regular numbers instead of
#' frequencies in percentage points. By default it shows numbers only when the data
#' is weighted with a different function than the sum. For conditionnal distributions,
#' a numeric y-axis can be displayed only when \code{mod.method = "sideTotal"},
#' \code{mod.method = "splitTotal"} or \code{mod.method = "stack"}, since for the
#' within distributions it does not make sense (because the data is rescaled for each moderator).
#' @param mod.select Which moderators to select. By default the top 3 moderators
#' in terms of frequency (or in terms of weight value if there's a weight) are displayed.
#' If provided, it must be a vector of moderator values whose length cannot be greater
#' than 5. Alternatively, you can put an integer between 1 and 5. This argument
#' also accepts regular expressions.
#' @param mod.NA Logical, default is \code{FALSE}. If \code{TRUE}, and if the moderator
#' contains \code{NA} values, all \code{NA} values from the moderator will be treated
#' as a regular case: allows to display the distribution for missing values.
#' @param mod.method A character scalar: either i) \dQuote{split}, the default for
#' categorical data, ii) \dQuote{side}, the default for data in logarithmic form
#' or numeric data, or iii) \dQuote{stack}. This is only used when there is more
#'รน than one moderator. If \code{"split"}: there is one separate histogram for each
#' moderator case. If \code{"side"}: moderators are represented side by side for
#' each value of the variable. If \code{"stack"}: the bars of the moderators are
#' stacked onto each other, the bar heights representing the distribution in the
#' total population. You can use the other arguments \code{within} and \code{total}
#' to say whether the distributions should be within each moderator or over the
#' total distribution.
#' @param within Logical, default is missing. Whether the distributions should be
#' scaled to reflect the distribution within each moderator value. By default it
#' is \code{TRUE} if \code{mod.method} is different from \code{"stack"}.
#' @param total Logical, default is missing. Whether the distributions should be
#' scaled to reflect the total distribution (and not the distribution within each
#' moderator value). By default it is \code{TRUE} only if \code{mod.method="stack"}.
#' @param labels.tilted Whether there should be tilted labels. Default is \code{FALSE}
#' except when the data is split by moderators (see \code{mod.method}).
#' @param labels.angle Only if the labels of the x-axis are tilted. The angle of the tilt.
#' @param other Logical. Should there be a last column counting for the observations
#' not displayed? Default is \code{TRUE} except when the data is split.
#' @param plot Logical, default is \code{TRUE}. If \code{FALSE} nothing is plotted,
#' only the data is returned.
#' @param sep Positive number. The separation space between the bars. The scale
#' depends on the type of graph.
#' @param centered Logical, default is \code{TRUE}. For numeric data only and when
#' \code{sorted=FALSE}, whether the histogram should be centered on the mode.
#' @param weight.fun A function, by default it is \code{sum}. Aggregate function
#' to be applied to the weight with respect to variable and the moderator. See examples.
#' @param at_5 Equal to \code{FALSE}, \code{"roman"} or \code{"line"}. When plotting
#' categorical variables, adds a small Roman number under every 5 bars
#' (\code{at_5 = "roman"}), or draws a thick axis line every 5 bars (\code{at_5 = "line"}).
#' Helps to get the rank of the bars. The default depends on the type of data --
#' Not implemented when there is a moderator.
#' @param dict A dictionnary to rename the variables names in the axes and legend.
#' Should be a named vector. By default it s the value of \code{getFplot_dict()},
#' which you can set with the function \code{\link[fplot]{setFplot_dict}}.
#' @param mod.title Character scalar. The title of the legend in case there is a
#' moderator. You can set it to \code{TRUE} (the default) to display the moderator
#' name. To display no title, set it to \code{NULL} or \code{FALSE}.
#' @param cex.axis Cex value to be passed to biased labels. By defaults, it finds
#' automatically the right value.
#' @param int.categorical Logical. Whether integers should be treated as categorical
#' variables. By default they are treated as categorical only when their range is
#' small (i.e. smaller than 1000).
#' @param trunc If the main variable is a character, its values are truncaded to
#' \code{trunc} characters. Default is 20. You can set the truncation method with
#' the argument \code{trunc.method}.
#' @param trunc.method If the elements of the x-axis need to be truncated, this
#' is the truncation method. It can be "auto", "right" or "mid".
#' @param cumul Logical, default is \code{FALSE}. If \code{TRUE}, then the cumulative
#' distribution is plotted.
#' @param top What to display on the top of the bars. Can be equal to "frac" (for
#' shares), "nb" or "none". The default depends on the type of the plot. To disable
#' it you can also set it to \code{FALSE} or the empty string.
#' @param ... Other elements to be passed to plot.
#'
#' @details
#' Most default values can be modified with the function \code{\link[fplot]{setFplot_distr}}.
#'
#' @author Laurent Berge
#'
#' @seealso
#' To plot temporal evolutions: \code{\link[fplot]{plot_lines}}. For boxplot: \code{\link[fplot]{plot_box}}.
#' To export graphs: \code{\link[fplot]{pdf_fit}}, \code{\link[fplot]{png_fit}},
#' \code{\link[fplot]{fit.off}}.
#'
#' @return
#' This function returns *invisibly* the output data.table containing the processed data
#' used for plotting. With the argument `plot = FALSE`, only the data is returned.
#'
#' @examples
#'
#' # Data on publications from U.S. institutions
#' data(us_pub_econ)
#'
#' # 0) Let's set a dictionary for a better display of variables
#' setFplot_dict(c(institution = "U.S. Institution", jnl_top_25p = "Top 25% Pub.",
#' jnl_top_5p = "Top 5% Pub.", Frequency = "Publications"))
#'
#' # 1) Let's plot the distribution of publications by institutions:
#' plot_distr(~institution, us_pub_econ)
#'
#' # When there is only the variable, you can use a vector instead:
#' plot_distr(us_pub_econ$institution)
#'
#' # 2) Now the production of institution weighted by journal quality
#' plot_distr(jnl_top_5p ~ institution, us_pub_econ)
#'
#' # You can plot several variables:
#' plot_distr(1 + jnl_top_25p + jnl_top_5p ~ institution, us_pub_econ)
#'
#' # 3) Let's plot the journal distribution for the top 3 institutions
#'
#' # We can get the data from the previous graph
#' graph_data = plot_distr(jnl_top_5p ~ institution, us_pub_econ, plot = FALSE)
#' # And then select the top universities
#' top3_instit = graph_data$x[1:3]
#' top5_instit = graph_data$x[1:5] # we'll use it later
#'
#' # Now the distribution of journals
#' plot_distr(~ journal | institution, us_pub_econ[institution %in% top3_instit])
#' # Alternatively, you can use the argument mod.select:
#' plot_distr(~ journal | institution, us_pub_econ, mod.select = top3_instit)
#'
#' # 3') Same graph as before with "other" column, 5 institutions
#' plot_distr(~ journal | institution, us_pub_econ,
#' mod.select = top5_instit, other = TRUE)
#'
#' #
#' # Example with continuous data
#' #
#'
#' # regular histogram
#' plot_distr(iris$Sepal.Length)
#'
#' # now splitting by species:
#' plot_distr(~ Sepal.Length | Species, iris)
#'
#' # idem but the three distr. are separated:
#' plot_distr(~ Sepal.Length | Species, iris, mod.method = "split")
#'
#' # Now the three are stacked
#' plot_distr(~ Sepal.Length | Species, iris, mod.method = "stack")
#'
#'
#'
plot_distr = function(fml, data, moderator, weight, sorted, log, nbins, bin.size,
legend_options=list(), top, yaxis.show = TRUE, yaxis.num,
col, border = "black", mod.method, within, total, mod.select,
mod.NA = FALSE, at_5, labels.tilted, other, cumul = FALSE,
plot = TRUE, sep, centered = TRUE, weight.fun, int.categorical,
dict = NULL, mod.title = TRUE, labels.angle, cex.axis,
trunc = 20, trunc.method = "auto", ...){
# This function plots frequencies
# DT VARS
total_moderator = x_nb = isOther = otherValue = nb_new = share = share_top = moderator_nb = xleft = xright = ybottom = xleft_real = xright_real = x_num = mid_point = NULL
ytop_new = ybottom_new = share_top_cum = value_cum = ytop_cum = value = NULL
check_arg(fml, "formula | vector mbt")
# save full formula
fml_in = fml
# New Structure 2020/ARPIL:
# model.method is now split in two:
# - mod.method, equal to "side", "split", "stack"
# - within and total: TRUE/FALSE
# - all values are missing
check_arg_plus(mod.method, "null match(side, split, stack)")
check_arg_plus(top, "null match(frac, nb, none, FALSE)")
if(!missnull(top) && top == "FALSE") top = "none"
check_arg_plus(at_5, "NULL match(roman, lines, FALSE)")
check_arg(moderator, "null vector na ok")
check_arg(weight, "null numeric vector na ok")
check_arg(sep, "null numeric scalar GE{0}")
check_arg(nbins, "null integer scalar GT{0}")
check_arg(col, "vector(integer, character) NA OK")
check_arg(bin.size, "null numeric scalar GT{0}")
check_arg(dict, "null named character vector | logical scalar")
check_arg(labels.angle, "numeric scalar")
check_arg("null logical scalar", sorted, log, labels.tilted, centered, other, within, total, int.categorical)
check_arg("logical scalar", yaxis.show, yaxis.num, cumul, mod.NA, plot)
mc = match.call()
# Argument whose names have changed
dots = list(...)
args_deprec = c("maxBins", "addOther", "onTop", "maxFirst", "toLog")
if(any(args_deprec %in% names(dots))){
if("maxBins" %in% names(dots)){
if(is.null(getOption("fplot_warn_maxBins"))){
warning("Argument 'maxBins' has been renamed. Please use 'nbins' instead.")
options(fplot_warn_maxBins = TRUE)
}
nbins = dots$maxBins
}
if("addOther" %in% names(dots)){
if(is.null(getOption("fplot_warn_addOther"))){
warning("Argument 'addOther' has been renamed. Please use 'other' instead.")
options(fplot_warn_addOther = TRUE)
}
other = dots$addOther
}
if("onTop" %in% names(dots)){
if(is.null(getOption("fplot_warn_onTop"))){
warning("Argument 'onTop' has been renamed. Please use 'top' instead.")
options(fplot_warn_onTop = TRUE)
}
top = dots$onTop
}
if("maxFirst" %in% names(dots)){
if(is.null(getOption("fplot_warn_maxFirst"))){
warning("Argument 'maxFirst' has been renamed. Please use 'sorted' instead.")
options(fplot_warn_maxFirst = TRUE)
}
sorted = dots$maxFirst
}
if("toLog" %in% names(dots)){
if(is.null(getOption("fplot_warn_toLog"))){
warning("Argument 'toLog' has been renamed. Please use 'log' instead.")
options(fplot_warn_toLog = TRUE)
}
log = dots$toLog
}
}
set_defaults("fplot_distr")
# validate_dots(valid_args = c(args_deprec, names(par()), formalArgs(plot.default)), stop = TRUE)
#
# Extracting x and the moderator ####
#
USE_WEIGHT = MOD_IS_WEIGHT = FALSE
x_name = ylab = weight_name = moderator_name = ""
if(inherits(fml_in, "formula")){
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
# we check the variables data are there
check_arg(fml, "formula left(,1) right(,2) var(data)", .data = data, .message = "Argument 'fml' must be a formula of the type weights ~ value | moderator ('weights' and 'moderator' are optional).")
# Creation of x and the condition
fml = fml_in
# 2020/APRIL
# New Structure:
# weight_1 + weight_2 + etc ~ value | moderator
# You can use one sided formulas: ~value
info = extract_pipe(fml_in)
fml = info$fml
lhs_fml = info$lhs_fml
pipe_fml = info$pipe_fml
# OLD
# x = eval(fml[[2]], data)
# moderator = eval(fml[[3]], data)
# weight = extract_df(pipe_fml, data)
# NEW
x = eval(fml[[3]], data)
moderator = eval(info$pipe, data)
weight = extract_df(lhs_fml, data)
if(length(x) == 1 || length(all.vars(fml[[3]])) == 0){
stop("The formula must be of the type 'weight(s) ~ variable | moderator', and there must always be the 'variable' element, which is currently missing.")
}
if(info$lhs_fml == ~1){
weight = rep(1, length(x))
} else if(length(weight) > 1) {
MOD_IS_WEIGHT = TRUE
USE_WEIGHT = TRUE
if(length(moderator) > 1){
stop("You cannot use a moderator and multiple weights at the same time.")
# Later: implement a par mf_row (or sorts) to allow the plotting of multi weights + multi moderators
}
if(missing(mod.title)) mod.title = FALSE
# check
for(i in 1:length(weight)){
if(is.logical(weight[[i]])){
weight[[i]] = weight[[i]] * 1
} else if(!is.numeric(weight[[i]])){
stop("All weights must be numeric, but variable '", names(weight)[i], "' isn't.")
}
}
# We need to stack the data
n = length(weight[[1]])
all_mod_names = dict_apply(names(weight), dict)
# we create a factor to keep the user's order
moderator = factor(rep(all_mod_names, each = n), levels = all_mod_names)
x = rep(x, length(weight))
weight = unlist(weight)
if(missing(mod.title)){
mod.title = "Variables"
}
moderator_name = ""
weight_name = paste(enumerate_items(all_mod_names))
} else {
USE_WEIGHT = TRUE
weight = unlist(weight)
# weight_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
weight_name = deparse(lhs_fml[[2]])
check_value_plus(weight, "numeric conv vector na ok", .message = "The 'weight' must be a numeric vector.")
}
if(length(moderator) <= 1){
moderator = rep(1, length(x))
}
# other info
# x_name = deparse(fml[[2]])
# if(!MOD_IS_WEIGHT) moderator_name = deparse(fml[[3]])
x_name = deparse(fml[[3]])
if(!MOD_IS_WEIGHT) moderator_name = deparse(info$pipe_fml[[2]])
} else {
x = fml_in
# default no name when no formula => otherwise ugly when exporting
x_name = clean_name(deparse(substitute(fml)))
# We check the vector
if(!checkVector(x)){
if(inherits(x, "table")){
x = as.vector(x)
} else {
if(length(x) == 0){
reason = "Currently, the length of fml is 0."
} else {
reason = paste0("Currently, fml is of class ", enumerate_items(class(x)), ".")
}
stop("Wrong argument in fml. It must be either a formula, either a vector. ", reason)
}
}
# moderator
if(!missnull(moderator)){
if(length(x) != length(moderator)){
stop("The arguments 'x' and 'moderator' must be of the same length.")
}
moderator_name = clean_name(deparse(substitute(moderator)))
} else {
moderator = rep(1, length(x))
}
# weight
if(!missnull(weight)){
check_arg(weight, "numeric vector len(data)", .data = x)
USE_WEIGHT = TRUE
weight_name = clean_name(deparse(substitute(weight)))
} else {
weight = rep(1, length(x))
}
}
#
# Parameters setting ####
#
if(is.logical(x)){
x = as.factor(x)
}
# Renaming: dict
x_name = dict_apply(x_name, dict)
moderator_name = dict_apply(moderator_name, dict)
weight_name = dict_apply(weight_name, dict)
# Dropping NAs
quiNA_x = is.na(x)
quiNA_mod = is.na(moderator)
quiNA_weight = is.na(weight)
if(mod.NA){
quiNA = quiNA_x | quiNA_weight
total_NA = sum(quiNA | quiNA_mod)
if(is.factor(moderator)){
moderator = addNA(moderator)
} else {
# The following will coerce moderator into character
moderator[is.na(moderator)] = "NA"
}
} else {
quiNA = quiNA_x | quiNA_mod | quiNA_weight
total_NA = sum(quiNA)
}
if(total_NA > 0){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_weight))
msg_na = paste0(c("x: ", "moderator: ", "weight: "), nb_na)
message("NOTE: ", total_NA, " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ")")
x = x[!quiNA]
moderator = moderator[!quiNA]
weight = weight[!quiNA]
}
# case of empty strings
if(is.character(x)){
qui_empty = x == ""
if(any(qui_empty)){
x[qui_empty] = "[empty]"
}
}
# Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
if(anyNA(moderator_unik)){
# This is such a pain in the neck...
moderator_unik[is.na(moderator_unik)] = "NA"
moderator = factor(unclass(moderator), labels = moderator_unik)
}
} else {
moderator_unik = sunique(moderator)
}
if(cumul){
if(!missnull(other) && other == TRUE) message("Argument 'other = TRUE' is ignored since the cumulative distribution is asked for.")
other = FALSE
}
moderator_cases = length(moderator_unik)
isLegend = FALSE
USE_MOD = FALSE
if(moderator_cases > 1){
USE_MOD = TRUE
isLegend = TRUE
n_select = NA
do_recreate = FALSE
if(missing(mod.select)){
if(moderator_cases <= 5){
# FINE!
} else {
# We select the top 3
n_select = 3
}
} else if(length(mod.select) > 5){
stop("The argument 'mod.select' (currently of length ", length(mod.select), ") cannot be of a length greater than 5!")
} else if(length(mod.select) == 1 && is.numeric(mod.select) && mod.select %% 1 == 0){
# Possibly a number
if(mod.select %in% 1:5){
n_select = mod.select
} else {
stop("Argument 'mod.select' does not have a valid value: if it is an integer, its value must lie between 1 and 5.")
}
} else {
# mod.select must contain moderator values
# 1) checking the problems
mod_pblm = setdiff(mod.select, moderator_unik)
if(length(mod_pblm) > 0){
# We recreate mod.select
if(length(mod_pblm) != length(mod.select) && any(quiNA)){
# means some are OK
suggestion = " Maybe because it has only NA values?"
} else {
suggestion = " It must be either a moderator value, or a regular expression to select moderator values."
}
# Maybe they're regular expressions => we keep the order desired by the user
mod.ok = c()
for(r in mod.select){
if(r %in% moderator_unik){
mod.ok = c(mod.ok, r)
} else {
qui = grepl(r, moderator_unik)
n_qui = sum(qui)
if(n_qui == 0){
stop("In the argument 'mod.select', the value ", r, " could not be found in the moderator variable.", suggestion)
} else if(n_qui > 5){
stop("In the argument 'mod.select', the value ", r, " leads to more than 5 moderator values to be selected (e.g. ", enumerate_items(moderator_unik[qui], nmax = 10), ".")
}
mod.ok = c(mod.ok, moderator_unik[qui])
}
}
mod.ok = unique(mod.ok)
if(length(mod.ok) > 5){
stop("Argument mod.select leads to ", length(mod.ok), " moderator values to be selected. But the maximum is 5.")
}
mod.select = mod.ok
}
# 2) selection
do_recreate = TRUE
}
# AUTOMATIC selection
if(!is.na(n_select) && moderator_cases > n_select){
# We proceed with the "automatic" selection
if(USE_WEIGHT){
info_mod = data.table(moderator = moderator, w = weight)
info_mod = info_mod[, list(value = sum(w)), by = moderator]
info_method = "the value of the weight."
} else {
info_mod = data.table(moderator = moderator)
info_mod = info_mod[, list(value = .N), by = moderator]
info_method = "frequency."
}
# We inform on the method for the choice
message(ifelse(n_select == 1, "The moderator was", paste0("The ", n_select, " moderators were")), " chosen based on ", info_method)
info_mod = info_mod[order(-value)]
mod.select = info_mod[1:n_select, moderator]
do_recreate = TRUE
if(n_select == 1){
USE_MOD = isLegend = FALSE
}
}
if(mod.NA){
if(missing(mod.select)){
do_recreate = FALSE
} else {
mod.select = unique(c(mod.select, "NA"))
do_recreate = TRUE
}
}
if(do_recreate){
# We recreate the variables
qui_select = which(moderator %in% mod.select)
# we recreate all the values
x = x[qui_select]
moderator = moderator[qui_select]
weight = weight[qui_select]
# Re-Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
} else {
moderator_unik = sunique(moderator)
}
moderator_cases = length(moderator_unik)
}
}
# if(moderator_cases > 5) {
# stop("The number of cases of the moderator cannot be greater than 5!")
# } else if(moderator_cases > 1){
# USE_MOD = TRUE
# # we put the legend only if there is a condition
# isLegend = TRUE
# }
# mod.title
if(isLegend){
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
}
# separation of the x cases:
if(missnull(sep)){
sep = 0.20 + 0.15 * (moderator_cases-1)
}
# Setting up the data
# Finding out whether log or not
isNum = is.numeric(x) && !USE_WEIGHT # no sense to weight "real" numeric data
# Default of log and sorted
delayLogChecking = FALSE
if(missnull(log)){
if(!isNum || any(x < 0)){
log = FALSE
} else {
log = FALSE
if(diff(range(base::log(x[x>0]))) >= 3 && max(x) > 100){
delayLogChecking = TRUE
}
}
} else if(log && !isNum){
warning("Since 'x' is not numeric, argument 'log' is ignored.")
log = FALSE
}
isInteger = NA
if(missnull(sorted)){
if(!isNum){
sorted = TRUE
} else if(log){
sorted = FALSE
} else {
isInteger = all(x %% 1 == 0)
if(isInteger){
sorted = FALSE
} else {
sorted = FALSE
}
}
}
IS_MAXBIN_USER = TRUE
if(missnull(nbins)){
IS_MAXBIN_USER = FALSE
bin_fit_all = c(20, 11, 8, 6, 5)
bin_max_all = c(15, 9, 7, 5, 4)
nb = ifelse(!missnull(mod.method) && mod.method == "stack", 1, moderator_cases)
nbins = bin_fit_all[nb]
if(length(unique(x)) > nbins){
nbins = bin_max_all[nb]
}
}
if(log){
# Regle: la valeur d'une colonne est: superieur ou egal a la valeur gauche et strictement inferieurs a la valeur droite
x = pmax(floor(base::log(x + 1e-6)), -1)
}
#
# Binning ####
#
numAxis = FALSE
binned_data = FALSE
maxBins_old = nbins # used with delayLogChecking
if(isNum && !log){
if(!missnull(bin.size)){
if(!missnull(int.categorical) && int.categorical && all(x %% 1 == 0)){
isInteger = TRUE
} else {
x = (x %/% bin.size) * bin.size
binned_data = TRUE
if(!sorted){
numAxis = TRUE
if(IS_MAXBIN_USER == FALSE){
# Since it's a numeric axis, we put a LOT of bins
# It is a maximum anyway
nbins = 50
}
}
}
} else {
if(is.na(isInteger)){
isInteger = all(x %% 1 == 0)
}
goNum = TRUE
if(!missnull(int.categorical)){
# we force integers as categorical
if(int.categorical == TRUE) goNum = FALSE
# if int.categorical == FALSE, then we force it as numeric
} else if(isInteger && diff(range(x)) < 1000){
# for integers with small range, we don't consider them
# as numeric
goNum = FALSE
}
if(goNum){
if(!sorted){
numAxis = TRUE
}
if(IS_MAXBIN_USER == FALSE){
# we reset the number of bins: specific to the numeric case
nbins = bin_max_all[1]
}
binned_data = TRUE
# we try to find a "good" bin size
x_min = min(x)
x_max = max(x)
bin.size = signif((x_max - x_min) / min(max(nbins - 1, 10), 15), 1)
x_tmp = (x %/% bin.size) * bin.size
tx = ttable(x_tmp)
if(sum(tx/sum(tx) > 0.03) < 6){
# Condition: nber of bins with more than 3% is lower than 6
if(delayLogChecking){
# Would it be better looking in logarithmic form?
x_ln = pmax(floor(base::log(x + 1e-6)), -1)
tx_ln = ttable(x_ln)
if(sum(tx_ln/sum(tx_ln) > 0.05) >= 5){
# nber of signif bins greater than 5
log = TRUE
x = x_ln
numAxis = FALSE
binned_data = FALSE
nbins = maxBins_old
}
}
}
if(!log){
if(sum(tx/sum(tx) > 0.01) < 6){
# Condition: nber of bins with more than 1% is lower than 6
# More complex binning:
# We try to find the "optimal" number of bins
n_bins = nbins
q001 = quantile(x, seq(0, 1, by = 1 / 40))
n_q = length(q001)
s_vect = c()
score_total_vect = c()
score_range_vect = c()
score_fullness_vect = c()
for(i in 1:20){
if(i == 1){
s = signif(diff(range(x)) / n_bins, 1)
} else {
s = signif(s / 1.5, 1)
}
value_range = integer(n_bins)
for(k in 1:n_q){
s_start = q001[k]
value_range[k] = sum(q001 >= s_start & q001 <= s_start + n_bins*s)
if(value_range[k] == length(q001)) break
}
s_start = q001[which.max(value_range)]
missing_range = 1 - sum(q001 >= s_start & q001 <= s_start + n_bins*s)/n_q
score_fullness = length(unique(na.omit(cut(q001, seq(s_start, by = s, length.out = n_bins + 1), include.lowest = TRUE))))/ n_bins
# Save
s_vect[i] = s
score_total_vect[i] = score_fullness/2 - 2*missing_range
if(missing_range > 0.4) break
}
bin.size = s_vect[which.max(score_total_vect)]
x = (x %/% bin.size) * bin.size
} else {
x = x_tmp
}
}
}
}
}
#
# Default values for mod.method
#
# Splitting decision
if(missnull(mod.method)){
if(log || numAxis){
mod.method = "side"
} else {
mod.method = "split"
}
}
if(!missnull(total)){
if(!missnull(within)){
stop("You cannot provide the values of both arguments 'within' and 'total' at the same time.")
}
# we only use the variable 'within' in the code
within = !total
} else if(!missnull(within) && within && mod.method == "stack"){
stop("You cannot use within=TRUE when mod.method='stack'.")
} else if(missnull(within)){
within = TRUE
}
DO_STACK = FALSE
if(mod.method == "stack"){
if(cumul == TRUE){
stop("You cannot use the argument cumul=TRUE with mod.method='stack'.")
# I should implement it later, it might be useful in some situations (non overlapping cases)
}
DO_STACK = TRUE
mod.method = "side"
within = FALSE
}
DO_SPLIT = FALSE
checkNotTilted = FALSE
delayLabelsTilted = missnull(labels.tilted)
if(moderator_cases > 1 && mod.method == "split"){
DO_SPLIT = TRUE
if(missnull(labels.tilted)){
labels.tilted = TRUE
checkNotTilted = TRUE
}
# We don't add the other column only when the data is split
# and it is not a numeric axis or log axis
if(missnull(other) && !log && !numAxis){
other = FALSE
}
}
if(missnull(other)){
other = TRUE
}
# yaxis.num
if(missnull(yaxis.num)){
if(USE_WEIGHT == FALSE || missing(weight.fun) || (moderator_cases > 1 && within)){
yaxis.num = FALSE
} else {
yaxis.num = TRUE
}
} else if(yaxis.num == TRUE && moderator_cases > 1 && within && !MOD_IS_WEIGHT){
stop("Argument yaxis.num: You cannot have a numeric y-axis when the 'within' distributions among moderators should be reported (because the data is rescaled as frequencies first). Use total=TRUE (or within=FALSE) to display a numeric y-axis.")
}
checkForTilting = FALSE
if(missnull(labels.tilted)){
labels.tilted = FALSE
# We check if it's better for tilting only in the regular case
checkForTilting = TRUE
# NOTE that we later update the value of checkForTilting
# because we don't want to do it all the time (but we first need more information)
# just search the next occurrence of checkForTilting
}
# The color
if(missnull(col)){
if(moderator_cases == 1){
col = "#1F78B4"
} else if(DO_SPLIT){
col = "set1"
} else {
col = c("#1F78B4", "#A6CEE3", "#33A02C", "#B2DF8A", "#E31A1C", "#FB9A99", "#FF7F00", "#FDBF6F")
}
}
if(length(col) == 1 && is.character(col)){
# col_match = try(match.arg(tolower(col), c("set1", "paired")), silent = TRUE)
# if(col_match == "set1"){
# col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999")
# } else if(col_match == "paired"){
# col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
# }
check_value_plus(col, "match(set1, paired) | character scalar")
if(col == "set1"){
col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999")
} else if(col == "paired"){
col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
}
}
numLabel = FALSE
if(moderator_cases >= 2 && numAxis){
if(DO_SPLIT){
numLabel = TRUE
}
sep = 0
}
# for table elements
if(is.table(x)){
x = as.vector(x)
}
X_FACTOR = FALSE
if(is.factor(x)){
X_FACTOR = TRUE
# we keep the order of x when factors
x = x[drop = TRUE]
x_fact_names = levels(x)
x = unclass(x)
# => we rename x later
}
#
# Computing data_freq ####
#
ADD_OTHER = ADD_OTHER_LEFT = CUT_BAR = FALSE
if(DO_SPLIT == FALSE){
if(USE_WEIGHT){
raw_data = data.table(x = x, weight = weight, moderator = moderator)
if(missnull(weight.fun)){
data_freq = raw_data[, list(value = sum(weight)), keyby = list(x, moderator)]
} else {
if(!is.function(weight.fun)){
stop("Argument 'weight.fun' must be a function.")
} else if(length(weight.fun(head(weight))) != 1){
stop("Function 'weight.fun' must be return a value of length one.")
}
data_freq = raw_data[, list(value = weight.fun(weight)), keyby = list(x, moderator)]
}
} else {
raw_data = data.table(x = x, moderator = moderator)
data_freq = raw_data[, list(value = .N), keyby = list(x, moderator)]
}
if(USE_MOD && mod.method == "side" && within){
data_freq[, "total_moderator" := list(sum(value)), by = moderator]
data_freq[, "share" := list(value/total_moderator*100)]
} else {
total = sum(data_freq$value)
data_freq[, "share" := list(value/total*100)]
}
# We balance the panel for log data
if(log && sorted == FALSE){
x_all = min(data_freq$x):max(data_freq$x)
data_all = data.table(expand.grid(x = x_all, moderator = moderator_unik))
tmp = merge(data_all, data_freq, all.x = TRUE, by = c("x", "moderator"))
tmp[is.na(tmp)] = 0
data_freq = tmp
}
# The sorting
if(sorted){
if(USE_MOD){
data_freq[, "total_x" := list(sum(value)), by = x]
# data_freq = data_freq[order(-total_x, moderator)]
# setorderv(data_freq, c("total_x", "moderator", "x"), c(-1, 1, 1))
setorderv(data_freq, c("total_x", "x"), c(-1, 1))
} else {
# data_freq = data_freq[order(-value)]
setorderv(data_freq, "value", -1)
}
data_freq[, "x_nb" := list(rleid(x))]
} else {
data_freq = data_freq[order(x, moderator)]
data_freq[, x_nb := quickUnclassFactor(x)]
}
# the "other" column
data_freq[, isOther := FALSE]
data_freq[, otherValue := ""]
# if we center on the mode
if(centered && any(data_freq$x_nb > nbins) && isNum){
# we center the distribution
data_freq[, nb_new := x_nb - which.max(value)]
data_freq[, nb_new := nb_new + max(min(ceiling(nbins/2), abs(min(nb_new)) + 1), nbins - max(nb_new))]
data_freq[, x_nb := nb_new]
data_freq[, nb_new := NULL]
if(other && sorted == FALSE && any(data_freq$x_nb <= 0)){
if(!any(data_freq$x_nb < 0)){
# we just add that bin
data_freq[, x_nb := x_nb + 1]
nbins = nbins + 1
} else {
ADD_OTHER_LEFT = TRUE
# We create a bin on the left
data_other_left = data_freq[x_nb <= 0]
data_other_left = data_other_left[, list(value = sum(value), share_top = sum(share)), by = moderator]
tmp = as.data.table(expand.grid(moderator = data_other_left$moderator))
other_col = merge(data_other_left, tmp)
other_col[is.na(other_col)] = 0
other_col[, x_nb := 0]
# we limit the size of this last column
share_max = max(data_freq[x_nb %in% 1:nbins, share])
# r = 1 + rank(other_col$value, ties.method = "max") / moderator_cases / 10
# other_col[, share := pmin(share_top, r * share_max)]
other_col[, share := pmin(share_top, 1.1 * share_max)]
if(any(other_col$share < other_col$share_top)){
CUT_BAR = TRUE
}
# we merge the information
data_freq = rbindDS(other_col, data_freq[x_nb > 0])
qui_NA = is.na(data_freq$share_top)
data_freq$share_top[qui_NA] = data_freq$share[qui_NA]
# Other text => depends on the type of values
data_freq[x_nb == 0, "isOther" := TRUE]
if(log){
x_first = ceiling(exp(data_freq[x_nb == 1, x][1]))
otherTextLeft = substitute(phantom()<x_val, list(x_val = formatAxisValue(x_first)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_first))]
} else {
if(binned_data == FALSE){
x_other = data_freq[x_nb == 1, x][1]
otherTextLeft = substitute(phantom()<x_val, list(x_val = formatAxisValue(x_other)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_other))]
} else {
# we update the value
x_other = data_freq[x_nb == 1, x][1]
data_freq[x_nb == 0, x := x_other - bin.size]
otherTextLeft = substitute(phantom()<x_other, list(x_other = formatAxisValue(x_other)))
data_freq[x_nb == 0, "otherValue" := paste("<", formatAxisValue(x_other))]
}
}
}
} else {
# We clean negative numbers
data_freq = data_freq[x_nb > 0]
}
}
# The "other" column
if(other && max(data_freq$x_nb) == nbins + 1){
# we just add the bin
nbins = nbins + 1
data_freq = data_freq[x_nb <= nbins]
} else if(other && any(data_freq$x_nb > nbins)){
# we create the other colum as the sum of the rest
ADD_OTHER = TRUE
data_other = data_freq[x_nb > nbins]
data_other = data_other[, list(value = sum(value), share_top = sum(share)), by = moderator]
tmp = as.data.table(expand.grid(moderator = data_other$moderator))
other_col = merge(data_other, tmp)
other_col[is.na(other_col)] = 0
other_col[, x_nb := nbins + 1]
# we limit the size of this last column
share_max = max(data_freq[x_nb %in% 1:nbins, share])
# r = 1 + rank(other_col$value, ties.method = "max") / moderator_cases / 10
# other_col[, share := pmin(share_top, r * share_max)]
other_col[, share := pmin(share_top, 1.1 * share_max)]
if(any(other_col$share < other_col$share_top)){
CUT_BAR = TRUE
}
# we merge the information
data_freq = rbindDS(data_freq[x_nb <= nbins], other_col)
qui_NA = is.na(data_freq$share_top)
data_freq$share_top[qui_NA] = data_freq$share[qui_NA]
# Other text => depends on the type of values
data_freq[x_nb == nbins + 1, "isOther" := TRUE]
if(!isNum || sorted){
otherText = "Other"
data_freq[x_nb == nbins + 1, "otherValue" := "Other"]
} else {
if(log){
x_last = round(exp(data_freq[x_nb == nbins, x] + 1))
otherText = substitute(phantom()>x_val, list(x_val = formatAxisValue(x_last)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_last))]
} else {
if(binned_data == FALSE){
x_value = data_freq[x_nb == nbins, x]
otherText = substitute(phantom()>x_val, list(x_val = formatAxisValue(x_value)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_value))]
} else {
x_other = data_freq[x_nb == nbins, x][1] + bin.size
data_freq[x_nb == nbins + 1, x := x_other]
otherText = substitute(phantom()>=x_other, list(x_other = formatAxisValue(x_other)))
data_freq[x_nb == nbins + 1, "otherValue" := paste(">", formatAxisValue(x_other))]
}
}
}
} else {
data_freq = data_freq[x_nb <= nbins]
}
if(!"share_top" %in% names(data_freq)){
data_freq[, share_top := share]
}
x_nb_all = sort(unique(data_freq$x_nb))
x_cases = length(x_nb_all)
# We add the 0s
if(nrow(data_freq) != x_cases*moderator_cases){
# we need to add the 0s
data_all = data.table(expand.grid(x_nb = x_nb_all, moderator = moderator_unik))
setkey(data_all, x_nb, moderator)
setkey(data_freq, x_nb, moderator)
tmp = merge(data_all, data_freq, all.x = TRUE)
tmp[, c("x", "isOther", "otherValue") := NULL]
# Merging additional info
base2merge = unique(data_freq[, list(x, x_nb, isOther, otherValue)])
quoi = merge(tmp, base2merge, by = "x_nb")
tmp = quoi[order(x_nb, moderator)]
# setting to 0
tmp[is.na(tmp)] = 0
data_freq = tmp
}
# moderator_nb
if(is.factor(moderator)){
data_freq[, moderator_nb := as.numeric(unclass(moderator[drop = TRUE]))]
} else {
data_freq[, moderator_nb := quickUnclassFactor(moderator)]
}
# The coordinates
if(numAxis){
data_freq[, xleft := x + bin.size/moderator_cases * (moderator_nb - 1)]
data_freq[, xright := x + bin.size/moderator_cases * moderator_nb]
} else {
data_freq[, xleft := (x_nb-1)*(moderator_cases+sep) + moderator_nb - 1]
data_freq[, xright := (x_nb-1)*(moderator_cases+sep) + moderator_nb]
}
xlim = range_plus(c(data_freq$xleft, data_freq$xright), 5)
data_freq[, ybottom := 0]
data_freq[, ytop := share]
if(X_FACTOR){
# we reintroduce the values of x
# Beware: "Other" can be either 0 (when side/split) // or NA (without moderator)
x_value = rep(NA, nrow(data_freq))
if(anyNA(data_freq$x)){
qui_ok = !is.na(data_freq$x)
} else {
qui_ok = !data_freq$x == 0
}
x_value[qui_ok] = x_fact_names[data_freq$x[qui_ok]]
data_freq[, x := x_value]
}
} else if(DO_SPLIT == TRUE){
#
# With SPLIT ####
#
# We apply a split to the data
info = list()
x_add = 0
for(i in seq_along(moderator_unik)){
# we apply plot_distr to subsets
qui = moderator == moderator_unik[i]
if(USE_WEIGHT){
data_freq = plot_distr(x[qui], moderator = moderator[qui], weight = weight[qui], sorted = sorted, nbins = nbins, log = FALSE, other = other, plot = FALSE, bin.size = bin.size, int.categorical = !numAxis)
} else {
data_freq = plot_distr(x[qui], moderator = moderator[qui], sorted = sorted, nbins = nbins, log = FALSE, other = other, plot = FALSE, bin.size = bin.size, int.categorical = !numAxis)
}
# updating the information
if(numAxis){
# We add the value of a bin, BUT start when the last stops
# we save the "real" values
data_freq[, xleft_real := xleft]
data_freq[, xright_real := xright]
# Now we update xleft/xright
min_left = min(data_freq$xleft)
x_add_next = x_add + max(data_freq$xright) - min_left
data_freq[, xleft := xleft - min_left + x_add]
data_freq[, xright := xright - min_left + x_add]
x_add = x_add_next + bin.size
} else {
min_left = min(data_freq$xleft)
x_add_next = x_add + max(data_freq$xright) - min_left
data_freq[, xleft := xleft - min_left + x_add]
data_freq[, xright := xright - min_left + x_add]
x_add = x_add_next + 0.7
}
data_freq[, moderator_nb := i]
info[[i]] = data_freq
}
info_all = rbindlist(info, use.names=TRUE)
if(mod.method == "split" && within == FALSE){
# SPLIT TOTAL
total = sum(weight)
info_all[, share := value/total*100]
info_all[, share_top := share]
info_all[, ytop := share]
}
if(binned_data && other){
# we format the data properly
info_all$x = formatAxisValue(info_all$x)
}
if(X_FACTOR){
# we reintroduce the values of x
info_all[, x := x_fact_names[x]]
}
# managing "other"
# info_all[, isOther := is.na(x)]
if(any(info_all$isOther)){
# bounding bar size
share_max = max(info_all[isOther == FALSE, share])
info_all[isOther == TRUE, share := pmin(share_top, 1.1 * share_max)]
info_all[, ytop := share]
if(any(info_all$share < info_all$share_top)){
CUT_BAR = TRUE
}
info_all$x = as.character(info_all$x)
info_all$x[is.na(info_all$x)] = "Other"
}
data_freq = info_all
# Some information
xlim = range_plus(c(data_freq$xleft, data_freq$xright), 5)
}
#
# Graph setting ####
#
if(DO_STACK == TRUE){
# we reformat the data
# return(data_freq)
new_coords = data_freq[isOther == FALSE, list(x_nb, xleft, xright, ybottom, ytop)]
new_coords[, c("xleft_new", "xright_new") := list(min(xleft), max(xright)), by = x_nb]
new_coords[, ytop_new := cumsum(ytop), by = x_nb]
new_coords[, ybottom_new := shift(ytop_new, n = 1, fill = 0), by = x_nb]
data_freq[isOther == FALSE, c("xleft", "xright", "ybottom", "ytop") := list(new_coords$xleft_new, new_coords$xright_new, new_coords$ybottom_new, new_coords$ytop_new)]
if(any(data_freq$isOther)){
# new values top
max_y = max(data_freq$ytop)
data_freq[isOther == TRUE, ytop := pmin(1.1 * max_y, share_top)]
}
# browser()
}
# Cumulative
if(cumul == TRUE){
data_freq[, share_top_cum := cumsum(share_top), by = moderator]
data_freq[, value_cum := cumsum(share), by = moderator]
data_freq[, ytop_cum := cumsum(ytop), by = moderator]
data_freq[, share_top := share_top_cum]
data_freq[, value := value_cum]
data_freq[, ytop := ytop_cum]
}
if(plot == FALSE){
return(data_freq)
}
# Plot preparation
# default for top
if(missnull(top)){
if(cumul){
top = "frac"
} else if(numAxis){
if(any(data_freq$isOther)){
top = "frac"
} else {
top = "none"
}
} else if(moderator_cases > 1 && mod.method == "split"){
# when neck to neck => much easier to compare shares
# otherwise it is confusing
top = "frac"
} else {
# if coef of variation of x is low => better information is the number
values = data_freq[share_top <= share, ytop]
cv = sd(values) / mean(values)
if(cv > 0.7){
top = "frac"
} else {
top = "nb"
}
}
}
if(is.null(dots$ylim)){
ylim = c(0, max(data_freq$ytop))
ymax_grid = ylim[2]
# hauteur_caractere_legend = 0
total_height = 0
if(isLegend){
info_legend = legendFit(legend = moderator_unik, plot = FALSE, title = mod.title)
total_height = info_legend$total_height
}
hauteur_caractere_top = 0
if(top != "none"){
# We create the information to display top
if(DO_STACK == FALSE){
# Normal case
info_top = data_freq[, list(x_mid = (xleft+xright)/2, ytop, value, share_top)]
} else if(DO_STACK == TRUE){
info_top = data_freq[isOther == FALSE, list(x_mid = mean((xleft+xright)/2), ytop = max(ytop), value = sum(value), share_top = sum(share_top)), by = x_nb]
info_top_other = data_freq[isOther == TRUE, list(x_mid = (xleft+xright)/2, ytop, value, share_top)]
}
# We find the optimal cex for the top numbers
if(top == "nb"){
top.value2display = formatAxisValue(info_top$value)
} else {
top.value2display = paste0(mysignif(info_top$share_top, d = 2, r = 0), "%")
# We truncate => otherwise ugly
# top.value2display = gsub("0.0.+", "0.0%", top.value2display)
# Only one digit after "0."
top.value2display = gsub("(?<=\\.[[:digit:]]).+", "%", top.value2display, perl = TRUE)
}
#
# NOTE on R behavior with new windows (x11) that are resized
#
# The following code computes the optimal width of the number/fractions to be displayed on the top
# of the bars.
# However, when we use x11() and then resize the window, R is unable to find out the actual window size
# Instead we are only able to access the original size.
# Unfortunately, the drawbacks of the following solutions are too important:
# - plotting a "clean" plot, so that R knows the actual size of the window.
# * Drawback: if the user calls for several plots in the same window => we're screwed
# - making a "smart guess" by assuming that the new window will be larger
# * drawback: if it is not the case, it will be ugly (better too small than too big)
# Getting the width of the bars in inches
unitary_width = par("pin")[1] / usr_width(xlim)
if(numAxis){
tdx = table(data_freq$xright - data_freq$xleft)
bar_w = as.numeric(names(tdx)[which.max(tdx)])
unitary_width = unitary_width * bar_w
}
# When DO_STACK==TRUE and numAxis==FALSE:
# the width is equal to the nber of moderators
unitary_width_top = unitary_width * (1 + (moderator_cases - 1) * DO_STACK * (!numAxis))
top.cex = 1
minCex = 0.5
v_max = top.value2display[which.max(strwidth(top.value2display, units = "in"))]
while(top.cex > minCex && strwidth(v_max, units = "in", cex = top.cex) > unitary_width_top*0.9){
top.cex = top.cex * 0.95
}
# the height we need to add to the plot
hauteur_caractere_top = strheight("W", "in", cex = top.cex) * 1.35 + strwidth("W", "in", cex = 1) * top.cex
# If DO_STACK==TRUE: the "other" columns have another width
if(DO_STACK == TRUE && nrow(info_top_other) > 0){
if(top == "nb"){
top.value2display_other = formatAxisValue(info_top_other$value)
} else {
top.value2display_other = paste0(mysignif(info_top_other$share_top, d = 2, r = 0), "%")
}
top.cex_other = 1
minCex = 0.5
v_max = top.value2display_other[which.max(strwidth(top.value2display_other, units = "in"))]
while(top.cex_other > minCex && strwidth(v_max, units = "in", cex = top.cex_other) > unitary_width*0.9){
top.cex_other = top.cex_other * 0.95
}
hauteur_caractere_top_other = strheight("W", "in", cex = top.cex_other) * 1.35 + strwidth("W", "in", cex = 1) * top.cex_other
# We are careful to add only the necessary height to the plot
hauteur_caractere_top_usr = hauteur_caractere_top / par("pin")[2] * diff(ylim)
hauteur_caractere_top_other_usr = hauteur_caractere_top_other / par("pin")[2] * diff(ylim)
height_normal = max(info_top$ytop) + hauteur_caractere_top_usr
height_other = max(info_top_other$ytop) + hauteur_caractere_top_other_usr
hauteur_caractere_top = (max(height_normal, height_other) - ylim[2]) * par("pin")[2] / diff(ylim)
}
}
# We transform the character heights into user format (produit en croix)
add_usr = (hauteur_caractere_top + total_height) / par("pin")[2] * diff(ylim)
ylim[2] = ylim[2] + add_usr
dots$ylim = ylim
}
dots$xlim = xlim
dots$axes = FALSE
dots$x = dots$y = dots$col = 0
# ylab
prefix = ""
if(cumul) prefix = "Cumulative "
if(moderator_cases > 1){
if(nchar(moderator_name) == 0){
# text = switch(mod.method, sideWithin = paste0("% Within"), sideTotal = "% Total", splitWithin = paste0("% Within"), splitTotal = "% Total")
text = ifelse(within, "% Within", "% Total")
} else {
# text = switch(mod.method, sideWithin = paste0("% Within (", moderator_name, ")"), sideTotal = "% Total", splitWithin = paste0("% Within (", moderator_name, ")"), splitTotal = "% Total")
text = ifelse(within, paste0("% Within ", moderator_name, ""), "% Total")
}
if(USE_WEIGHT){
if(yaxis.num == TRUE){
ylab = paste0(prefix, weight_name)
} else {
if(MOD_IS_WEIGHT){
ylab = paste0("Distribution of ", weight_name)
} else {
ylab = paste0("Distribution of ", weight_name, " (", prefix, text, ")")
}
}
} else {
if(yaxis.num == TRUE){
ylab = paste0(prefix, "Frequency")
} else {
ylab = paste0(prefix, text)
}
}
} else if(USE_WEIGHT){
ylab = paste0(prefix, "Distribution of ", weight_name)
} else if(cumul){
ylab = "Cumulative %"
}
# xlab
# user providing xlab?
noXlab = is.null(dots$xlab)
noSub = is.null(dots$sub)
# if(!noXlab || isNum){
# # if data is numeric, we will want the x-label, even if tilted
# line.max = 1 + 1
# } else if(!noSub){
# line.max = 2 + 1
# } else {
# line.max = 3 + 1
# }
line.max = 3 + 1
# Update of checkForTilting
if(checkForTilting){
# Only for the standard case
checkForTilting = !numAxis & !log
}
# if(checkForTilting){
# # "at risk" of tilting => we don't show the name
# listDefault(dots, "xlab", "")
# } else if(labels.tilted){
# # we don't show the name of the variable if lab is tilted
# # except if the data is numeric
#
# if(isNum){
# listDefault(dots, "xlab", x_name)
# } else {
# listDefault(dots, "xlab", "")
# }
#
# # listDefault(dots, "sub", x_name)
# } else {
# listDefault(dots, "xlab", x_name)
# }
if("xlab" %in% names(dots)){
xlab = dots$xlab
} else {
xlab = x_name
}
dots$xlab = ""
if("sub" %in% names(dots)){
sub = dots$sub
} else {
sub = ""
}
dots$sub = ""
#
# Margins setting ####
#
if(yaxis.show){
# we get the "nice" points display
# y_points = axis(2, lwd=0, col.axis = 0)
y_points = pretty(c(data_freq$ytop, 0))
# We don't go above 100%
y_points = y_points[y_points <= 100]
if(yaxis.num){
# x1 = data_freq[x_nb == 1][1, ]
x1 = data_freq[ytop > 0][1, ]
y_labels = formatAxisValue(y_points / x1$ytop * x1$value)
} else {
y_labels = paste0(y_points, "%")
}
} else {
y_labels = ""
}
if("ylab" %in% names(dots)){
ylab.resize = FALSE # we don't resize user-provided ylab
ylab = dots$ylab
} else {
ylab.resize = TRUE
}
dots$ylab = ""
# listDefault(dots, "ylab", ylab)
listDefault(dots, "yaxs", "i")
mleft = find_margins_left(ylab, y_labels, ylab.resize = ylab.resize)
# finding the bottom margin is a pain in the neck!!!
mbot = find_margins_bottom(xlab = xlab, sub = sub, data_freq = data_freq, log = log, isNum = isNum, numLabel = numLabel, numAxis = numAxis, nbins = nbins, DO_SPLIT = DO_SPLIT, ADD_OTHER = ADD_OTHER, ADD_OTHER_LEFT = ADD_OTHER_LEFT, sorted = sorted, labels.tilted = labels.tilted, delayLabelsTilted = delayLabelsTilted, checkForTilting = checkForTilting, checkNotTilted = checkNotTilted, noSub = noSub, binned_data = binned_data, line.max = line.max, trunc = trunc, trunc.method = trunc.method, cex.axis = cex.axis, labels.angle = labels.angle, at_5 = at_5, xlim = xlim)
if(mleft$total_width > par("mai")[2] || mbot$total_height > par("mai")[1]){
new_mai = par("mai")
if(mleft$total_width > par("mai")[2]){
new_mai[2] = mleft$total_width + 0.05
}
if(mbot$total_height > par("mai")[1]){
new_mai[1] = mbot$total_height + 0.05
}
op = par(mai = new_mai)
on.exit(par(op))
}
do.call("plot", dots)
# ylab
title(ylab = mleft$ylab, line = mleft$ylab.line)
hgrid(ymax = ymax_grid)
rect(xleft = data_freq$xleft, ybottom = data_freq$ybottom, xright = data_freq$xright, ytop = data_freq$ytop, col = col[data_freq$moderator_nb], border = border)
#
# "cutting" the bar (if other is very big)
#
if(CUT_BAR){
data_other = data_freq[share_top > share]
ytop = data_other$ytop[1]
x_span = (data_other$xright - data_other$xleft)[1] / diff(get_x_lim()) * diff(get_y_lim())
y_span = ytop * .03
y_all = ytop - 2:1*x_span
for(i in 1:nrow(data_other)){
shade_area(y_all, y_all - y_span, x = c(data_other$xleft[i], data_other$xright[i]), col = "white", border = "white")
}
}
#
# x-labels ####
#
# Information on plot
at_info = data_freq[, list(mid_point = (max(xright) + min(xleft)) / 2), by = list(x_nb, x)]
myat = at_info$mid_point
if(log){
if(DO_SPLIT){
data_freq_valid = data_freq[isOther == FALSE, ]
data_freq_valid[, x_num := as.numeric(x)]
x_all = data_freq_valid$x_num
exp_value = ceiling(exp(x_all))
exp_value[x_all == -1] = 0
if(sorted){
if(delayLabelsTilted){
# better display
labels.tilted = TRUE
}
myat = data_freq_valid[, (xleft+xright)/2]
exp_value_right = ceiling(exp(x_all + 1))
# Formatting
exp_value_format = formatAxisValue(exp_value)
exp_value_right_format = formatAxisValue(exp_value_right)
label_displayed = paste0("[", exp_value_format, "; ", exp_value_right_format, ")")
# finding the location
if(labels.tilted){
info_tilt = xaxis_biased(at = myat, line.max = line.max, labels = label_displayed)
} else {
xaxis_labels(at = myat, labels = label_displayed)
}
} else {
# we draw the axes with nice display
moreLine = any(data_freq$isOther) * .25
# Displaying the ticks "all at once" (including the last one)
mysep = (data_freq$xleft[2] - data_freq$xright[1]) / 2
exp_value_right = ceiling(exp(x_all + 1))
# on the right
myat = data_freq_valid$xright + mysep
# We add the first tick on the left
data_first = data_freq_valid[x_nb == 1]
myat = c(data_first$xleft - mysep, myat)
# exp_value = c(ceiling(exp(data_first$x_num - 1)), exp_value)
first_val = ceiling(exp(data_first$x_num - 1))
first_val[data_first$x_num == -1] = 0
exp_value = c(first_val, exp_value_right)
exp_value_format = formatAxisValue(exp_value)
# tick location
# 1) the ticks
axis(1, at = myat, labels = NA, lwd.ticks = 1, lwd = 0, line = moreLine)
# 2) The labels
# Tilted labels not implemented for this axis
if(delayLabelsTilted){
if(strwidth(paste0(exp_value_format, collapse = " ")) / diff(get_x_lim()) > 0.9){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
}
}
if(labels.tilted){
xaxis_biased(at = myat, labels = exp_value_format, yadj = 2, angle = 25)
} else {
axis(1, at = myat, labels = exp_value_format, line = moreLine, lwd = 0)
}
# for extra ticking when info is ambiguous
tck_location = par("usr")[3] - strheight("W")*81/100
qui = which(exp_value < 10)
if(length(qui) > 0){
# tick location
axis(2, at = tck_location - moreLine, pos = myat[qui], labels = NA, tcl = 0.15, xpd = TRUE)
}
}
# Now the "other" ticks
if(any(data_freq$isOther)){
data_freq_other = data_freq[isOther == TRUE]
if(sorted){
text2show = rep("Other", nrow(data_freq_other))
my_font = 1
} else {
text2show = rep(">", nrow(data_freq_other))
text2show[grepl("<", data_freq_other$otherValue)] = "<"
my_font = 2
}
myat = data_freq_other[, (xleft + xright)/2]
if(sorted && labels.tilted){
xaxis_biased(at = myat, line.max = line.max, labels = text2show, angle = info_tilt$angle, cex = info_tilt$cex)
} else {
axis(1, at = myat, labels = text2show, lwd = 0, line = -0.8, font = my_font)
}
}
} else {
#
# formatting of the values
#
x_unik = at_info[x_nb %in% 1:nbins, x]
x_cases = length(x_unik)
myat = at_info[x_nb %in% 1:nbins, mid_point]
exp_value = ceiling(exp(x_unik))
exp_value[x_unik == -1] = 0
if(is.unsorted(x_unik) || x_cases == 1 || any(diff(x_unik) != 1)){
exp_value_right = ceiling(exp(x_unik + 1))
# Formatting
exp_value_format = substr(exp_value, 1, 7)
exp_value_right_format = substr(exp_value_right, 1, 7)
# finding the location
location = xaxis_labels(at = myat, labels = paste0("[", exp_value_format, "; ", exp_value_right_format, "["), only.params = TRUE)
# drawing
for(i in 1:length(x_unik)){
value = substitute(group("[",list(x1, x2),")"), list(x1 = formatAxisValue(exp_value[i]), x2 = formatAxisValue(exp_value_right[i])))
axis(1, at = myat[i], lwd = 0, labels = value, cex = location$cex, line = 1 + location$line[i])
}
} else {
# we draw the axes with nice display
moreLine = (ADD_OTHER || ADD_OTHER_LEFT) * .25
tck_location = par("usr")[3] - strheight("W")*81/100
# Displaying the ticks "all at once" (including the last one)
val = c(exp_value, ceiling(exp(tail(x_unik, 1) + 1)))
exp_value_format = formatAxisValue(val)
# tick location
loc = (1:length(val)-1)*(moderator_cases+sep) - sep/2
axis(1, at = loc, labels = exp_value_format, line = moreLine, lwd.ticks = 1, lwd = 0)
for(i in 1:x_cases){
# tick location
loc = (i-1)*(moderator_cases+sep) - sep/2
# Ticks showing "strictly" inferior
if(x_unik[1] < 2){
axis(2, at = tck_location - moreLine, pos = loc, labels = NA, tcl = 0.15, xpd = TRUE)
}
}
}
# We find the right cex to fit the "other" group
unitary_width = par("pin")[1] / usr_width(xlim)
too_large = function(text, the_cex, the_shift) strwidth(text, units = "in", cex = the_cex)/2 - the_shift*unitary_width > unitary_width*(1/2+sep/2*1.5)
if(ADD_OTHER){
axis(1, at = at_info[x_nb == nbins + 1, mid_point], line = -0.8, labels = ">", lwd = 0, font = 2)
}
if(ADD_OTHER_LEFT){
axis(1, at = at_info[x_nb == 0, mid_point], line = -0.8, labels = "<", lwd = 0, font = 2)
}
}
} else if(numLabel){
# moderator > 1 + split + numeric axis
current_lim = get_x_lim()
for(i in 1:moderator_cases){
x_sub = data_freq[moderator == moderator_unik[i]]
real_range = c(min(x_sub$xleft_real), max(x_sub$xright_real))
x_show = pretty(real_range, 3)
x_show = x_show[x_show >= real_range[1] & x_show <= real_range[2]]
current_range = c(min(x_sub$xleft), max(x_sub$xright))
x_show_current = to01(x_show, real_range) * diff(current_range) + min(current_range)
axis(1, x_show_current, formatAxisValue(x_show), lwd = 0, lwd.ticks = 1)
axis(1, current_range, NA, lwd = 1, lwd.ticks = 0)
}
# We add the bin information
if(noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
# now the Other
if(any(data_freq$isOther)){
data_freq_other = data_freq[isOther == TRUE]
# we get the right cex
w = data_freq_other[1, xright - xleft]
max_lab = data_freq_other$otherValue[which.max(strwidth(data_freq_other$otherValue))]
minCex = 0.7
myCex = 1
while(myCex > minCex && strwidth(max_lab, cex = myCex) > w){
myCex = myCex * 0.95
}
axis(1, at = data_freq_other[, (xleft + xright)/2], labels = data_freq_other$otherValue, lwd = 0, line = -0.8, cex.axis = myCex)
}
} else if(numAxis){
# We add the bin information
if(noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
myBox(1)
axis_at = axis(1, lwd = 0, labels = NA)
if(ADD_OTHER){
# the axis without last tick
at = axis_at[1:(length(axis_at) - 1)]
if(abs(bin.size) > 1e4){
axis(1, at, formatAxisValue(at))
} else {
axis(1, at)
}
# the "other" text
axis(1, at = at_info[x_nb == nbins + 1, mid_point], line = -1, labels = otherText, lwd = 0)
} else {
if(abs(bin.size) > 1e4){
axis(1, axis_at, formatAxisValue(axis_at))
} else {
axis(1, axis_at)
}
}
if(ADD_OTHER_LEFT){
# the "other" text
axis(1, at = at_info[x_nb == 0, mid_point], line = -1, labels = otherTextLeft, lwd = 0)
}
# ticks at the right place
if(moderator_cases > 1){
# we add the rectangles to replicate a histogram
# axis(1, at = data_freq[moderator_nb == 1, xleft], labels = NA, lwd = 0, lwd.ticks = 1, tcl = -0.45)
rect_info = data_freq[, list(xleft = min(xleft), xright = max(xright), ytop = max(ytop)), by = x_nb]
rect(xleft = rect_info$xleft, ybottom = 0, xright = rect_info$xright, ytop = rect_info$ytop, density = 0, lty = 2)
}
} else if(DO_SPLIT){
# we need to display all xs
# We add the bin information => specific case: max first + numeric data
if(binned_data && noSub){
title(sub = substitute(paste("Bin size", phantom()==b), list(b = formatAxisValue(bin.size))), cex.sub = 0.9, line = mbot$sub.line)
sub = ""
}
data_freq[, mid_point := (xleft + xright) / 2]
myLabels = data_freq$x
myAt = data_freq$mid_point
if(checkNotTilted){
# If very short labels => we don't tilt them // allows to reintroduce xlab
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(length(unique(axis_info$line)) == 1){
labels.tilted = FALSE
} else {
labels.tilted = TRUE
}
}
if(labels.tilted){
xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
} else {
xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method)
}
} else if(isNum){
# we can have the "other" column both left and right
x_unik = at_info[x_nb %in% 1:nbins, x]
myLabels = x_unik
myAt = at_info[x_nb %in% 1:nbins, mid_point]
info_axis = NULL
if(labels.tilted == FALSE && mean(diff(x_unik)) == 1){
# This is a "normal" axis
# everything number follows, this is fine
# # we still add the xlab we removed before (we could not anticipate the result)
# if(checkForTilting){
# if(noXlab) title(xlab = x_name)
# }
axis(1, myAt, labels = myLabels)
} else {
if(checkForTilting){
# If normal axis does not fit => tilt
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(axis_info$failed){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
# # we add the label only if the user didn't provide it before
# if(noXlab) title(xlab = x_name)
}
}
if(labels.tilted){
info_axis = xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
} else {
info_axis = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, line.max = line.max)
}
}
if(ADD_OTHER){
# the "other" text
if(is.null(info_axis)){
axis(1, at = at_info[x_nb == nbins + 1, mid_point] + strwidth("> "), line = -0.85, labels = otherText, lwd = 0, cex.axis = 1)
} else if(labels.tilted){
xaxis_biased(at = at_info[x_nb == nbins + 1, mid_point], labels = otherText, angle = info_axis$angle, cex = info_axis$cex, trunc = trunc, trunc.method = trunc.method)
} else {
all_lines = sort(unique(info_axis$line))
my_line = min(info_axis$line[info_axis$line != tail(info_axis$line, 1)])
axis(1, at = at_info[x_nb == nbins + 1, mid_point] + strwidth("> ", cex = info_axis$cex), line = my_line, labels = otherText, lwd = 0, cex.axis = info_axis$cex)
}
}
if(ADD_OTHER_LEFT){
# the "other" text on the left
if(is.null(info_axis)){
axis(1, at = at_info[x_nb == 0, mid_point] - strwidth("< "), line = -0.85, labels = otherTextLeft, lwd = 0)
} else if(labels.tilted){
xaxis_biased(at = at_info[x_nb == 0, mid_point], labels = otherTextLeft, angle = info_axis$angle, cex = info_axis$cex, trunc = trunc, trunc.method = trunc.method)
} else {
my_line = min(info_axis$line[info_axis$line != info_axis$line[1]])
axis(1, at = at_info[x_nb == 0, mid_point] - strwidth("< ", cex = info_axis$cex), line = my_line, labels = otherTextLeft, lwd = 0, cex.axis = info_axis$cex)
}
}
} else {
if(ADD_OTHER){
nbins = nbins + 1
at_info$x[nbins] = otherText
}
x_unik = at_info[x_nb %in% 1:nbins, x]
myLabels = x_unik
myAt = at_info[x_nb %in% 1:nbins, mid_point]
if(checkForTilting){
# If normal axis does not fit => tilt
axis_info = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, only.params = TRUE)
if(axis_info$failed){
labels.tilted = TRUE
} else {
labels.tilted = FALSE
# # we add the label only if the user didn't provide it before
# if(noXlab) title(xlab = x_name)
}
}
# We also add ticks every 5/10 bins to help counting
if(missnull(at_5)){
at_5 = ifelse(max(at_info$x_nb) > 10, TRUE, FALSE)
if(at_5) {
at_5 = ifelse(labels.tilted, "line", "roman")
}
} else {
at_5 = at_5[1]
}
if(at_5 == "roman"){
roman_dict = toupper(c("v", "x", "xv", "xx", "xxv", "xxx", "xxxv", "xL", "xLv", "L", "Lv", "Lx", "lxv", "lxx", "lxxv", "lxxx", "lxxxv", "xc", "xcv", "c"))
names(roman_dict) = (1:20) * 5
}
if(at_5 != "FALSE"){
qui = which(at_info$x_nb %% 5 == 0)
# qui = qui[qui != nrow(at_info)]
if(length(at_info)){
# quoi data_freq[qui]
for(i in qui){
if(at_5 == "roman"){
axis(1, at = at_info[i, mid_point], lwd = 0, labels = roman_dict[as.character(i)], line = -1.3, cex.axis = 0.6)
} else {
axis(1, at = data_freq[i, list(xleft, xright)], lwd = 3, lwd.ticks = 0, labels = NA)
}
}
}
}
if(labels.tilted){
info_axis = xaxis_biased(at = myAt, labels = myLabels, angle=labels.angle, cex = cex.axis, trunc = trunc, trunc.method = trunc.method, line.max = line.max, line.min = 0.45 * (at_5 == "roman"))
} else {
info_axis = xaxis_labels(at = myAt, labels = myLabels, trunc = trunc, trunc.method = trunc.method, line.min = 0.25 * (at_5 == "roman"))
}
}
#
# yaxis and topping ####
#
if(xlab != ""){
title(xlab = xlab, line = mbot$xlab.line)
}
if(sub != ""){
title(sub = sub, line = mbot$sub.line)
}
if(yaxis.show){
# we get the "nice" points display
y_points = axis(2, lwd=0, col.axis = 0)
# We don't go above 100%
y_points = y_points[y_points <= 100]
if(yaxis.num){
# x1 = data_freq[x_nb == 1][1, ]
x1 = data_freq[ytop > 0][1, ]
y_labels = formatAxisValue(y_points / x1$ytop * x1$value)
} else {
y_labels = paste0(y_points, "%")
}
# we find the proper width
if("ylab" %in% names(dots)){
ylab = dots$ylab
}
if(nchar(ylab) == 0){
myWidth = strwidth("7775%")
} else {
myWidth = strwidth("75%")
}
# We find the right cex
minCex = 0.7
myCex = 1
y_labels_max = y_labels[which.max(strwidth(y_labels))]
while(strwidth(y_labels_max, cex = myCex) > myWidth && myCex >= minCex){
myCex = 0.95 * myCex
}
axis(2, at=y_points, labels = y_labels, las = 2, cex.axis = myCex)
}
# Legend if needed
if(isLegend){
#
# finding the cex of the legend => depends on the longuest character
#
# legendFit("top", moderator_unik, bg="white", bty="o", box.col="white", fill=col[1:moderator_cases])
legendFit("top", moderator_unik, fill=col[1:moderator_cases], title = mod.title, title_out = TRUE, trunc = info_legend$trunc)
}
# On top of the bars:
if(top != "none"){
qui = strwidth(top.value2display, units = "in", cex = top.cex) <= unitary_width_top*1.5 | nchar(top.value2display) <= 3
# we don't display the 0s only
qui = qui & info_top$value > 0
# if(top.cex < 0.95 && top == "frac"){
# # we don't display the small numbers
# max_height = max(info_top$ytop)
# qui = qui & (info_top$ytop > max_height*0.05 | info_top$share_top >= 1)
# }
if(any(qui)){
# text(data_freq[, (xleft+xright)/2][qui], data_freq$ytop[qui], label = top.value2display[qui], pos = 3, cex = top.cex, offset = 0.5 * top.cex * 0.9)
text(info_top$x_mid[qui], info_top$ytop[qui], label = top.value2display[qui], pos = 3, cex = top.cex, offset = 0.5 * top.cex * 0.9)
}
if(DO_STACK && nrow(info_top_other) > 0){
text(info_top_other$x_mid[qui], info_top_other$ytop[qui], label = top.value2display_other[qui], pos = 3, cex = top.cex_other, offset = 0.5 * top.cex_other * 0.9)
}
}
return(invisible(data_freq))
}
#' Display means conditionnally on some other values
#'
#' The typical use of this function is to represents trends of average along some
#' categorical variable.
#'
#' @inheritParams plot_distr
#'
#' @param fml A formula of the type \code{variable ~ time | moderator}. Note that
#' the moderator is optional. Can also be a vector representing the elements of
#' the variable If a formula is provided, then you must add the argument \sQuote{data}.
#' You can use multiple variables. If so, you cannot use a moderator at the same time.
#' @param data Data frame containing the variables of the formula. Used only if the
#' argument \sQuote{fml} is a formula.
#' @param time Only if argument \sQuote{fml} is a vector. It should be the vector
#' of \sQuote{time} identifiers to average over.
#' @param moderator Only if argument \sQuote{fml} is a vector. It should be a vector
#' of conditional values to average over. This is an optional parameter.
#' @param fun Function to apply when aggregating the values on the time variable.
#' Default is \code{mean}.
#' @param mod.select Which moderators to select. By default the top 5 moderators
#' in terms of frequency (or in terms of the value of fun in case of identical
#' frequencies) are displayed. If provided, it must be a vector of moderator values
#' whose length cannot be greater than 10. Alternatively, you can put an integer between 1 and 10.
#' @param smoothing_window Default is 0. The number of time periods to average over.
#' Note that if it is provided the new value for each period is the average of
#' the current period and the \code{smoothing_window} time periods before and after.
#' @param col The colors. Either a vector or a keyword (\dQuote{Set1} or \dQuote{paired}).
#' By default those are the \dQuote{Set1} colors colorBrewer. This argument is
#' used only if there is a moderator.
#' @param lty The line types, in the case there are more than one moderator.
#' By default it is equal to 1 (ie no difference between moderators).
#' @param pch The form types of the points, in the case there are more than one
#' moderator. By default it is equal to \8code{c(19, 17, 15, 8, 5, 4, 3, 1)}.
#' @param pt.cex Default to 2. The \code{cex} of the points.
#' @param lwd Default to 2. The width of the lines.
#' @param legend_options A list containing additional parameters for the function
#' \code{\link[graphics]{legend}} -- only concerns the moderator. Note that you can
#' set the additionnal arguments \code{trunc} and \code{trunc.method} which relates
#' to the number of characters to show and the truncation method. By default the
#' algorithm truncates automatically when needed.
#' @param ... Other arguments to be passed to the function \code{plot}.
#'
#' @author Laurent Berge
#'
#' @return
#' This function returns *invisibly* the output data.table containing the processed data
#' used for plotting.
#'
#' @examples
#'
#' data(airquality)
#'
#' plot_lines(Ozone ~ Day, airquality)
#'
#' plot_lines(Ozone ~ Day | Month, airquality)
#'
#' plot_lines(Ozone ~ Month | cut(Day, 8), airquality)
#'
#'
plot_lines = function(fml, data, time, moderator, mod.select, mod.NA = TRUE,
smoothing_window = 0, fun, col = "set1", lty = 1,
pch = c(19, 17, 15, 8, 5, 4, 3, 1), legend_options = list(),
pt.cex = 2, lwd = 2, dict = NULL, mod.title = TRUE, ...){
# This functions plots the means of x wrt the id
# we can also add a moderator
# DT VARS
moderator_cases = xleft = xright = ybottom = ytop = value = NULL
check_arg(fml, "formula right(,2) | vector mbt")
# old params
style = "line"
outCol = "black"
# style = match.arg(style)
mc = match.call()
fml_in = fml
check_arg(smoothing_window, "integer scalar GE{0}")
check_arg(mod.NA, "logical scalar")
check_arg(dict, "null named character vector | logical scalar")
# The color
if(length(col) == 1){
check_value_plus(col, "match(set1, paired) | scalar")
if(col == "set1"){
col = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999")
} else if(col == "paired"){
col = c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00")
}
}
#
# Extracting the variables ####
#
moderator_name = ""
if(inherits(fml_in, "formula")){
# Control of the formula
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
vars = all.vars(fml_in)
if(any(!vars %in% names(data))){
stop("The variable", enumerate_items(setdiff(vars, names(data)), "s.is")," not in the data set (", deparse(mc$data), ").")
}
# Creation of x and the condition
if(!length(fml_in) == 3){
stop("The formula must be of the type 'var ~ time' or 'var ~ time | moderator'.")
}
info = extract_pipe(fml_in)
lhs_fml = info$lhs_fml
fml = info$fml
pipe = info$pipe
x = extract_df(lhs_fml, data)
time = eval(fml[[3]], data)
moderator = eval(pipe, data)
# other info
MULTI_VAR = FALSE
if(length(x) > 1){
# multiple variables to plot
MULTI_VAR = TRUE
if(length(moderator) > 1){
stop("You cannot use a moderator and multiple variables at the same time. If you intended to use one variable only, wrap it into I(): I(", deparse(lhs_fml[[2]]), ").")
}
# check
for(i in 1:length(x)){
if(is.logical(x[[i]])){
x[[i]] = x[[i]] * 1
} else if(!is.numeric(x[[i]])){
stop("All variables must be numeric, but variable '", names(x)[i], "' isn't.")
}
}
if("Frequency" %in% names(x) && missing(fun)){
fun = sum
fun_name = "sum"
} else if(missing(fun)){
fun_name = "mean"
} else {
fun_name = deparse(substitute(fun))
}
# We need to stack the data
n = length(x[[1]])
all_x_names = dict_apply(names(x), dict)
# x_name = paste0(fun_name, "(Variables)")
x_name = substitute(paste(f, phantom(1), group("(", italic(Variables), ")")), list(f = fun_name))
moderator = factor(rep(all_x_names, each = n), levels = all_x_names)
time = rep(time, length(x))
x = unlist(x)
if(missing(mod.title)){
mod.title = "Variables"
}
moderator_name = ""
} else {
x = unlist(x)
if(is.logical(x)) x = x * 1
x_name = deparse(fml[[2]])
if(!is.numeric(x)) stop("The variable to plot must be numeric, currently '", x_name, "' isn't.")
if(x_name == "1"){
x_name = "Frequency"
fun = sum
}
}
# if(length(x) == 1){
# # this is a shortcut to say that we want to display the frequency
# fun = sum
# x_name = "Frequency"
# x = rep(1, length(time))
# } else {
# x_name = deparse(fml[[2]])
# }
if(is.null(moderator)){
moderator = rep(1, length(x))
} else if(MULTI_VAR == FALSE){
moderator_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
}
time_name = deparse(fml[[3]])
} else {
# Wow, the next two lines replace 10 lines of code, and error-handling is much better done
check_arg(time, "vector len(data) mbt", .data = fml)
check_arg(moderator, "safe NULL vector len(data)", .data = fml)
x = fml_in
if(!missnull(moderator)){
moderator_name = deparse(substitute(moderator))
} else {
moderator = rep(1, length(x))
}
x_name = deparse(substitute(fml_in))
time_name = deparse(substitute(time))
# 14 lignes
# if(missing(time)){
# stop("You must provide the argument 'time'.")
# } else {
# x = fml_in
#
# if(length(x) != length(time)){
# stop("The arguments 'x' and 'time' must be of the same length.")
# }
#
# if(!missnull(moderator)){
# if(length(x) != length(moderator)){
# stop("If provided, the argument 'moderator' must be of the same length of 'x'.")
# }
# moderator_name = deparse(substitute(moderator))
# } else {
# moderator = rep(1, length(x))
# }
#
# # other info
# x_name = deparse(substitute(fml_in))
# time_name = deparse(substitute(time))
# }
# 22 lignes
}
# Naming
if(!is.call(x_name)) x_name = dict_apply(x_name, dict)
time_name = dict_apply(time_name, dict)
moderator_name = dict_apply(moderator_name, dict)
# Dropping NAs
quiNA_x = is.na(x)
quiNA_time = is.na(time)
quiNA_mod = is.na(moderator)
if(mod.NA){
quiNA = quiNA_x | quiNA_time
total_NA = sum(quiNA | quiNA_mod)
if(is.factor(moderator)){
moderator = addNA(moderator)
} else {
# The following will coerce moderator into character
moderator[is.na(moderator)] = "NA"
}
} else {
quiNA = quiNA_x | quiNA_time | quiNA_mod
total_NA = sum(quiNA)
}
if(total_NA > 0){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_time))
msg_na = paste0(c("x: ", "moderator: ", "time: "), nb_na)
message("NOTE: ", total_NA, " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ")")
x = x[!quiNA]
moderator = moderator[!quiNA]
time = time[!quiNA]
}
# We first check the function is valid
if(missing(fun)){
fun = mean
} else {
if(!is.function(fun)){
stop("Argument 'fun' must be a function. A the moment its class is ", class(fun)[1], ".")
}
test = try(fun(head(x, 5)), silent = TRUE)
if(inherits(test, "try-error")){
stop("Evaluation of the function '", deparse_long(mc$fun), "' leads to an error\n", test)
} else if(length(test) > 1){
stop("The function in argument 'fun' MUST be an aggregating function: that is returning something of length 1. This is not the case currently.")
}
}
n_moderator = length(unique(moderator))
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
if(anyNA(moderator_unik)){
# This is such a pain in the neck...
moderator_unik[is.na(moderator_unik)] = "NA"
moderator = factor(unclass(moderator), labels = moderator_unik)
}
} else {
moderator_unik = sunique(as.character(moderator))
}
if(n_moderator == 1){
isLegend = FALSE
} else {
isLegend = TRUE
}
# mod.select
if(n_moderator > 1){
n_select = NA
do_recreate = FALSE
if(missing(mod.select)){
if(n_moderator <= 8){
# FINE!
} else {
# We select the top 5
n_select = 5
}
} else if(length(mod.select) > 10){
stop("The argument 'mod.select' (currently of length ", length(mod.select), ") cannot be of a length greater than 10.")
} else if(length(mod.select) == 1 && is.numeric(mod.select) && mod.select %% 1 == 0){
# Possibly a number
if(mod.select %in% 1:10){
n_select = mod.select
} else {
stop("Argument 'mod.select' does not have a valid value: if it is an integer, its value must lie between 1 and 10.")
}
} else {
# mod.select must contain moderator values
# 1) checking the problems
mod_pblm = setdiff(mod.select, moderator_unik)
if(length(mod_pblm) > 0){
# We recreate mod.select
if(length(mod_pblm) != length(mod.select) && any(quiNA)){
# means some are OK
suggestion = " Maybe because it has only NA values?"
} else {
suggestion = " It must be either a moderator value, or a regular expression to select moderator values."
}
# Maybe they're regular expressions => we keep the order desired by the user
mod.ok = c()
for(r in mod.select){
if(r %in% moderator_unik){
mod.ok = c(mod.ok, r)
} else {
qui = grepl(r, moderator_unik)
n_qui = sum(qui)
if(n_qui == 0){
stop("In the argument 'mod.select', the value ", r, " could not be found in the moderator variable.", suggestion)
} else if(n_qui > 10){
stop("In the argument 'mod.select', the value ", r, " leads to more than 10 moderator values to be selected (e.g. ", enumerate_items(moderator_unik[qui], nmax = 15), ".")
}
mod.ok = c(mod.ok, moderator_unik[qui])
}
}
mod.ok = unique(mod.ok)
if(length(mod.ok) > 10){
stop("Argument mod.select leads to ", length(mod.ok), " moderator values to be selected. But the maximum is 10.")
}
mod.select = mod.ok
}
# 2) selection
do_recreate = TRUE
}
# AUTOMATIC selection
if(!is.na(n_select) && n_moderator > n_select){
# We proceed with the "automatic" selection
info_mod = data.table(moderator = moderator)
info_mod = info_mod[, list(value = .N), by = moderator]
if(var(info_mod$value) == 0){
# same freq for all
info_mod = data.table(x = x, moderator = moderator)
info_mod = info_mod[, list(value = fun(x)), by = moderator]
info_method = paste0("the max of ", deparse(mc$fun)[1], ".")
} else {
info_method = "frequency."
}
# We inform on the method for the choice
message(ifelse(n_select == 1, "The moderator was", paste0("The ", n_select, " moderators were")), " chosen based on ", info_method)
info_mod = info_mod[order(-value)]
mod.select = info_mod[1:n_select, moderator]
do_recreate = TRUE
if(n_select == 1){
isLegend = FALSE
}
}
if(mod.NA){
if(missing(mod.select)){
do_recreate = FALSE
} else {
mod.select = unique(c(mod.select, "NA"))
do_recreate = TRUE
}
}
if(do_recreate){
# We recreate the variables
qui_select = which(moderator %in% mod.select)
# we recreate all the values
x = x[qui_select]
moderator = moderator[qui_select]
time = time[qui_select]
# Re-Dealing with the moderator
if(is.factor(moderator)){
moderator_unik = levels(moderator[drop = TRUE])
} else {
moderator_unik = sunique(moderator)
}
n_moderator = length(moderator_unik)
}
}
#
# Aggregation
#
quoi = data.table(x=x, time=time, moderator=moderator)
base_agg = quoi[!is.na(x), list(x = fun(x)), by = list(time, moderator)]
#
# Preparation
#
dots = list(...)
# ylim
if(!is.null(dots$ylim)){
ylim = dots$ylim
} else if(style == "line"){
ylim = range(base_agg[!is.na(moderator), x])
} else if(style == "bar"){
rx = range(base_agg[!is.na(moderator), x])
ylim = c(min(0, rx[1]), max(0, rx[2]))
}
ymax = Inf
if(isLegend){
ymax = ylim[2]
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
info_legend = legendFit(legend = moderator_unik, title = mod.title, plot = FALSE)
legend_height = info_legend$total_height
ylim[2] = ylim[2] + (legend_height + 0.25) / par("pin")[2] * diff(ylim)
}
listDefault(dots, "ylim", ylim)
is_ylab = "ylab" %in% names(dots)
my_ylab = ifelse(is_ylab, dots$ylab, "")
dots$ylab = ""
# listDefault(dots, "ylab", x_name)
listDefault(dots, "xlab", time_name)
dots$type = "n"
dots$y = rep(1, length(unique(base_agg$time)))
makeAxes = FALSE
if(is.numeric(base_agg$time)){
dots$x = sunique(base_agg$time)
} else {
# for character vectors, we need to recreate the xaxis
if(is.null(dots$axes) || dots$axes){
makeAxes = TRUE
}
dots$axes = FALSE
if(is.factor(base_agg$time)){
time_unik = levels(unique(base_agg$time)[, drop=TRUE])
dict_time = 1:length(time_unik)
names(dict_time) = time_unik
dots$x = 1:length(time_unik)
base_agg[, time := dict_time[as.character(time)]]
} else {
time_unik = sunique(base_agg$time)
dots$x = quickUnclassFactor(time_unik)
base_agg[, time := quickUnclassFactor(time)]
}
}
if(style == "bar"){
total_width = min(diff(sort(unique(base_agg$time))))
bar_width = total_width / (n_moderator + 1)
if(is.null(dots$xlim)){
total_width = min(diff(sort(unique(base_agg$time))))
dots$xlim = c(min(base_agg$time) - total_width * 0.5 + bar_width/2, max(base_agg$time) + total_width * 0.5 - bar_width/2)
}
} else {
if(smoothing_window > 0){
quoi = sort(dots$x)
if(length(quoi) < 2*smoothing_window + 2){
stop("To smooth with smoothing_window=", smoothing_window, " you need at least ", 2*smoothing_window + 2, " periods. Yet at the moment there is only ", length(quoi), " periods.")
}
dots$xlim = quoi[c(smoothing_window + 1, length(quoi) - smoothing_window)]
}
}
do.call("plot", dots)
dots$axes = NULL # not a graphical parameter afterwards
# ylab: because of problems with calls
if(is_ylab){
title(ylab = my_ylab)
} else {
title(ylab = x_name)
}
if(style == "bar"){
hgrid(ymax = ymax)
} else {
hgrid(ymax = ymax)
vgrid(ymax = ymax)
# grid(col = "darkgray")
}
if(makeAxes){
box()
axis(2)
axis(1, dots$x, time_unik)
}
#
# The plot_line
#
all_col = col[1 + (0:(n_moderator-1))%%length(col)]
all_lty = lty[1 + (0:(n_moderator-1))%%length(lty)]
all_pch = pch[1 + (0:(n_moderator-1))%%length(pch)]
if(style == "line"){
for(i in 1:n_moderator){
dots$col = all_col[i]
dots$lty = all_lty[i]
dots$pch = all_pch[i]
dots$cex = pt.cex
dots$add = TRUE
dots$smoothing_window = smoothing_window
dots$x = base_agg[moderator == moderator_unik[i], time]
dots$y = base_agg[moderator == moderator_unik[i], x]
do.call("plot_line", dots)
}
} else if(style == "bar"){
# Style is barplot
base_agg[, moderator_cases := quickUnclassFactor(moderator)]
base_agg[, xleft := time - total_width/2 + bar_width/2 + bar_width*(moderator_cases - 1)]
base_agg[, xright := xleft + bar_width]
base_agg[, ybottom := pmin(0, x)]
base_agg[, ytop := pmax(0, x)]
# the call to rect
all_outcol = outCol[1 + (0:(n_moderator-1))%%length(outCol)]
dots$xleft = base_agg$xleft
dots$ybottom = base_agg$ybottom
dots$xright = base_agg$xright
dots$ytop = base_agg$ytop
if(n_moderator > 1){
dots$col = all_col[base_agg$moderator_cases]
dots$border = all_outcol[base_agg$moderator_cases]
} else {
dots$col = col
dots$border = outCol
}
dots = dots[names(dots) %in% names(formals(rect))]
do.call("rect", dots)
}
#
# Legend
#
if(n_moderator > 1 && isLegend){
if(!is.list(legend_options)){
stop("Argument'legend_options' must be a list.")
}
# mandatory
legend_options$legend = moderator_unik
if(style == "line"){
legend_options$col = all_col
legend_options$lty = all_lty
legend_options$pch = all_pch
legend_options$pt.cex = pmin(pt.cex, 1.5)
legend_options$lwd = lwd
} else if(style == "bar"){
legend_options$fill = all_col
}
legend_options$title = mod.title
legend_options$trunc = info_legend$trunc
# options
listDefault(legend_options, "x", "top")
# the call
do.call("legendFit", legend_options)
}
invisible(base_agg)
}
#' Boxplots with possibly moderators
#'
#' This function allows to draw a boxplot, with possibly separating different moderators.
#'
#' @inheritParams plot_distr
#'
#' @param fml A numeric vector or a formula of the type: \code{vars ~ moderator_1 | moderator_2}. Note that if a formula is provided then the argument \sQuote{data} must be provided. You can plot several variables, if you don't want a moderator, use 1 instead: e.g. \code{plot_box(Petal.Width +Petal.Length ~ 1, iris)}. You can plot all numeric variables from a data set using \code{"."}: \code{plot_box(. ~ 1, iris)}.
#' @param data A data.frame/data.table containing the relevant information.
#' @param case When argument fml is a vector, this argument can receive a vector of cases.
#' @param moderator When argument fml is a vector, this argument can receive a vector of moderators.
#' @param inCol A vector of colors that will be used for within the boxes.
#' @param outCol The color of the outer box. Default is black.
#' @param pch The patch of the outliers. Default is 18.
#' @param addLegend Default is \code{TRUE}. Should a legend be added at the top of the graph is there is more than one moderator?
#' @param lwd The width of the lines making the boxes. Default is 2.
#' @param outlier Default is \code{TRUE}. Should the outliers be displayed?
#' @param dict_case A named character vector. If provided, it changes the values of the variable \sQuote{case} to the ones contained in the vector \code{dict_case}. Example: I want to change my variable named "a" to "Australia" and "b" to "Brazil", then I used \code{dict=c(a="Australia",b="Brazil")}.
#' @param dict_moderator A named character vector. If provided, it changes the values of the variable \sQuote{moderator} to the ones contained in the vector \code{dict_moderator}. Example: I want to change my variable named "a" to "Australia" and "b" to "Brazil", then I used \code{dict=c(a="Australia",b="Brazil")}.
#' @param order_case Character vector. This element is used if the user wants the \sQuote{case} values to be ordered in a certain way. This should be a regular expression (see \code{\link[base]{regex}} help for more info). There can be more than one regular expression. The variables satisfying the first regular expression will be placed first, then the order follows the sequence of regular expressions.
#' @param order_moderator Character vector. This element is used if the user wants the \sQuote{moderator} values to be ordered in a certain way. This should be a regular expression (see \code{\link[base]{regex}} help for more info). There can be more than one regular expression. The variables satisfying the first regular expression will be placed first, then the order follows the sequence of regular expressions.
#' @param addMean Whether to add the average for each boxplot. Default is true.
#' @param mean.col The color of the mean. Default is darkred.
#' @param mean.cex The cex of the mean, default is 2.
#' @param mean.pch The patch of the mean, default is 18.
#' @param line.max Option for the x-axis, how far should the labels go. Default is 1 for normal labels, 2 for tilted labels.
#' @param density The density of lines within the boxes. By default it is equal to -1, which means the boxes are filled with color.
#' @param lty The type of lines for the border of the boxes. Default is 1 (solid line).
#' @param ... Other parameters to be passed to \code{plot}.
#'
#' @author Laurent Berge
#'
#' @return
#' Invisibly returns the coordinates of the x-axis.
#'
#' @examples
#'
#' # Simple iris boxplot
#' plot(1:10)
#'
#' # All numeric variables
#' plot_box(. ~ 1, iris)
#'
#' # All numeric variable / splitting by species
#' plot_box(. ~ Species, iris)
#'
#' # idem but with renaming
#' plot_box(. ~ Species, iris, dict = c(Species="Iris species",
#' setosa="SETOSA", Petal.Width="Width (Petal)"))
#'
#' # Now using two moderators
#' base = iris
#' base$period = sample(1:4, 150, TRUE)
#'
#' plot_box(Petal.Length ~ period | Species, base)
#'
#'
#'
#'
#'
plot_box = function(fml, data, case, moderator, inCol, outCol = "black", density = -1, lty = 1, pch = 18, addLegend = TRUE, legend_options = list(), lwd=2, outlier, dict = NULL, dict_case, dict_moderator, order_case, order_moderator, addMean, mean.col = "darkred", mean.pch = 18, mean.cex = 2, mod.title = TRUE, labels.tilted, trunc = 20, trunc.method = "auto", line.max, ...){
# DT VARS USED
case_nb = moderator_nb = span = q3 = q1 = min_whisker = y_min = max_whisker = y_max = NULL
fml_in = fml
mc = match.call()
# Controls
check_arg("logical scalar", addLegend, outlier, addMean, labels.tilted)
check_arg(order_moderator, order_case, "character vector", .message = "Argument '__ARG__' must be a vector of regular expressions.")
check_arg(trunc, "integer scalar GE{5}")
check_arg(dict, "null named character vector | logical scalar")
#
# Extracting the information
#
moderator_name = case_name = ""
if(inherits(fml_in, "formula")){
check_arg(data, "data.frame mbt", .message = "If you provide a formula, a data.frame must be given in the argument 'data'.")
# we check the variables data are there
check_arg(fml, "ts formula left(1) right(,2)", .message = "Argument 'fml' must be a formula of the type variables ~ moderator_1 | moderator_2 ('moderator_1' [you can replace it by 1] and 'moderator_2' are optional).")
# "." is a special variable
vars = setdiff(all.vars(fml_in), ".")
if(any(!vars %in% names(data))){
stop("The variable", enumerate_items(setdiff(vars, names(data)), "s.is")," not in the data set (", deparse_long(mc$data), ").")
}
info = extract_pipe(fml_in)
fml = info$fml
pipe = info$pipe
case = eval(fml[[3]], data)
moderator = eval(pipe, data)
if(deparse(fml[[2]]) == "."){
x_all = as.data.frame(data)
qui_num = sapply(x_all, function(z) is.numeric(z) || is.logical(z))
if(length(case) > 1){
# There is a case variable => we drop it from the variables
case_name = deparse(fml[[3]])
qui_num = qui_num & !names(x_all) == case_name
}
if(!any(qui_num)){
stop("Only numeric variables can be displayed, and none were found in the data set.")
}
x_all = x_all[, qui_num]
} else {
x_all = extract_df(info$lhs_fml, data)
qui_num = sapply(x_all, function(z) is.numeric(z) || is.logical(z))
if(any(!qui_num)){
stop("Only numeric variables can be displayed, the variable", enumerate_items(names(x_all), "s.quote.is"), " not numeric.")
}
}
mod_is_set = FALSE
if(length(x_all) == 1){
x = x_all[[1]]
x_name = names(x_all)
if(length(case) > 1){
case_name = deparse(fml[[3]])
} else {
case = rep(x_name, length(x))
x_name = ""
}
} else {
if(is.data.frame(x_all)){
n = nrow(x_all)
K = ncol(x_all)
} else {
n = length(x_all[[1]])
K = length(x_all)
}
if(length(case) == 1){
# No case => the variables become the case
case = rep(names(x_all), each = n)
} else if(length(moderator) > 1){
stop("When plotting multiple variables, you cannot use a second moderator. This is possible only when plotting kust one variable.")
} else {
# case: the variables // moderator: the cases
mod_is_set = TRUE
moderator_name = deparse(fml[[3]])
moderator = rep(case, K)
case = rep(names(x_all), each = n)
}
x = unlist(x_all)
x_name = ""
case_name = ""
}
if(is.null(moderator)){
moderator = rep(1, length(x))
} else if(mod_is_set == FALSE){
moderator_name = gsub("^.*\\| *", "", deparse(fml_in[[3]]))
}
# other info
# x_name = deparse(fml[[2]])
# case_name = deparse(fml[[3]])
} else {
if(missing(case)){
stop("You must provide the argument 'case'.")
} else {
x = fml_in
if(length(x) != length(case)){
stop("The arguments 'x' and 'case' must be of the same length.")
}
if(!missing(moderator) && !is.null(moderator)){
if(length(x) != length(moderator)){
stop("If provided, the argument 'moderator' must be of the same length of 'x'.")
}
moderator_name = clean_name(deparse(substitute(moderator)))
} else {
moderator = rep(1, length(x))
}
# other info
x_name = clean_name(deparse(substitute(fml)))
case_name = clean_name(deparse(substitute(case)))
}
}
# Dropping NAs
quiNA_x = is.na(x)
quiNA_mod = is.na(moderator)
quiNA_case = is.na(case)
quiNA = quiNA_x | quiNA_mod | quiNA_case
if(any(quiNA)){
nb_na = c(sum(quiNA_x), sum(quiNA_mod), sum(quiNA_case))
msg_na = paste0(c("x: ", "moderator: ", "case: "), nb_na)
message("NOTE: ", sum(quiNA), " observations with NAs (", enumerate_items(msg_na[nb_na>0]), ").")
x = x[!quiNA]
moderator = moderator[!quiNA]
case = case[!quiNA]
}
# Renaming: dict
x_name = dict_apply(x_name, dict)
case_name = dict_apply(case_name, dict)
moderator_name = dict_apply(moderator_name, dict)
#
# Aggregation
#
mod_num_logical = is.logical(moderator) || is.numeric(moderator) # used later
moderator = as.character(moderator)
CASE_FACTOR = FALSE
if(is.factor(case)){
CASE_FACTOR = TRUE
case_unik = levels(case[, drop = TRUE])
case = unclass(case[, drop = TRUE])
}
quoi = data.table(x=as.numeric(x), case=case, moderator=moderator)
delayAddMean = FALSE
if(missnull(addMean)){
addMean = TRUE
delayAddMean = TRUE
}
if(addMean){
base_agg = quoi[, list(y_min = min(x), q1 = quantile(x, 0.25), m = median(x), q3 = quantile(x, 0.75), y_max = max(x), avg = mean(x)), keyby = list(case, moderator)]
} else {
base_agg = quoi[, list(y_min = min(x), q1 = quantile(x, 0.25), m = median(x), q3 = quantile(x, 0.75), y_max = max(x)), keyby = list(case, moderator)]
}
#
# the coordinates
#
# putting case to number and formatting it
if(!missnull(order_case)){
case_unik = sunique(case)
for(var2order in rev(order_case)){
who = grepl(var2order, case_unik)
case_unik = c(case_unik[who], case_unik[!who])
}
base_agg[, case_nb := dict2number(case_unik, case)]
} else {
base_agg[, case_nb := quickUnclassFactor(case)]
if(CASE_FACTOR == FALSE){
what = unique(base_agg[, list(case, case_nb)])
what = what[order(case_nb)]
case_unik = what$case
}
}
aliasCase = dict_apply(case_unik, dict)
if(!missing(dict_case) && !is.null(dict_case)){
if(!is.character(dict_case) || is.null(names(dict_case))) stop("the arg. 'dict_case' must be a named character vector.")
qui = which(case_unik %in% names(dict_case))
aliasCase[qui] = dict_case[case_unik[qui]]
}
# putting moderator to number and formatting it
if(!missnull(order_moderator)){
moderator_unik = sunique(moderator)
for(var2order in rev(order_moderator)){
who = grepl(var2order, moderator_unik)
moderator_unik = c(moderator_unik[who], moderator_unik[!who])
}
base_agg[, moderator_nb := dict2number(moderator_unik, moderator)]
} else {
base_agg[, moderator_nb := quickUnclassFactor(moderator)]
moderator_unik = sunique(moderator)
}
aliasModerator = dict_apply(moderator_unik, dict)
if(!missing(dict_moderator) && !is.null(dict_moderator)){
if(!is.character(dict_moderator)|| is.null(names(dict_moderator))) stop("The argument 'dict_moderator' must be a named character vector.")
qui = which(moderator_unik %in% names(dict_moderator))
aliasModerator[qui] = dict_moderator[moderator_unik[qui]]
}
n_case = length(case_unik)
n_moderator = length(moderator_unik)
# The colors
if(missnull(inCol)){
if(n_moderator == 1){
inCol = "#1F78B4"
} else {
inCol = c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
}
}
# separation between the different cases:
sep = 0.30 + 0.15 * (n_moderator-1)
base_agg[, x := (case_nb-1)*(n_moderator+sep) + moderator_nb]
# Information on the length of the whiskers
base_agg[, span := 1.5 * (q3 - q1)]
base_agg[, min_whisker := pmax(y_min, q1 - span)]
base_agg[, max_whisker := pmin(y_max, q3 + span)]
# the legend
isLegend = FALSE
if(addLegend && n_moderator > 1){
isLegend = TRUE
}
# mod.title
# if(isLegend){
# if(missing(mod.title)){
# if(mod_num_logical){
# mod.title = moderator_name
# } else {
# mod.title = NULL
# }
# } else if(isTRUE(mod.title)){
# mod.title = moderator_name
# } else if(isFALSE(mod.title)){
# mod.title = NULL
# }
# }
if(isTRUE(mod.title)){
mod.title = moderator_name
} else if(isFALSE(mod.title)){
mod.title = NULL
}
#
# Preparation
#
dots = list(...)
# default for outlier
if(missnull(outlier)){
outlier = FALSE
if(diff(range(x)) < 2 * diff(range(c(base_agg$min_whisker, base_agg$max_whisker)))){
outlier = TRUE
}
}
# default for addMean
if(delayAddMean){
addMean = !outlier
}
# ylim
if(outlier){
ylim = range(quoi$x)
} else {
ylim = c(min(base_agg$min_whisker), max(base_agg$max_whisker))
}
ymax = Inf
if(isLegend){
ymax = ylim[2]
info_legend = legendFit(legend = aliasModerator, plot = FALSE, title = mod.title)
# hauteur_caractere = strheight("W", "in", cex = info_legend$cex)
# ylim[2] = ylim[2] + 2.5*hauteur_caractere / par("pin")[2] * diff(ylim)
total_height = info_legend$total_height
ylim[2] = ylim[2] + total_height / par("pin")[2] * diff(ylim)
}
if(missnull(labels.tilted)){
if(sum(nchar(aliasCase)) * strwidth("w", "in") > 2 * par("pin")[1]){
labels.tilted = TRUE
case_name = ""
} else {
labels.tilted = FALSE
}
} else if(labels.tilted){
case_name = ""
}
listDefault(dots, "ylim", ylim)
listDefault(dots, "ylab", x_name)
listDefault(dots, "xlab", case_name)
dots$type = "n"
isAxes = TRUE
if(!is.null(dots$axes)){
isAxes = dots$axes
}
dots$axes = FALSE
dots$x = dots$y = 1
# xlim
dots$xlim = range(base_agg$x) + c(-1, 1) * ifelse(nrow(base_agg) == 1, 1, 0.5)
do.call("plot", dots)
if(isAxes){
box()
las = dots$las
axis(2, las = las)
at_labels = 1 + (n_moderator-1)/2 + (0:(n_case-1))*(n_moderator+sep)
# axis(1, at = at_labels, labels = aliasCase, lwd = 0)
if(labels.tilted){
if(missnull(line.max)) line.max = 2 + 1
xaxis_biased(at = at_labels, labels = aliasCase, trunc = trunc, trunc.method = trunc.method, line.max=line.max)
} else {
if(missnull(line.max)) line.max = 1 + 1
xaxis_labels(at = at_labels, labels = aliasCase, trunc = trunc, trunc.method = trunc.method, line.max=line.max)
}
}
hgrid(ymax = ymax)
#
# The plot_line
#
if(n_moderator == 1){
all_inCol = inCol[1 + (0:(n_case-1))%%length(inCol)]
all_outCol = outCol[1 + (0:(n_case-1))%%length(outCol)]
all_densities = density[1 + (0:(n_case-1))%%length(density)]
} else {
all_inCol = inCol[1 + (0:(n_moderator-1))%%length(inCol)]
all_outCol = outCol[1 + (0:(n_moderator-1))%%length(outCol)]
all_densities = density[1 + (0:(n_moderator-1))%%length(density)]
}
for(i in 1:n_moderator){
quoi = base_agg[moderator == moderator_unik[i]]
if(n_moderator == 1){
box_single(x = quoi$x, quoi$y_min, quoi$q1, quoi$m, quoi$q3, quoi$y_max, width = 1, inCol = all_inCol, lwd = lwd, outCol = all_outCol, density = all_densities)
} else {
box_single(x = quoi$x, quoi$y_min, quoi$q1, quoi$m, quoi$q3, quoi$y_max, width = 1, inCol = all_inCol[i], lwd = lwd, outCol = all_outCol[i], density = all_densities[i])
}
# now the outliers
if(outlier){
for(j in 1:n_case){
if(CASE_FACTOR){
# values have been unclassed
quoi_case = quoi[case == j]
y = x[moderator == moderator_unik[i] & case == j]
} else {
quoi_case = quoi[case == case_unik[j]]
y = x[moderator == moderator_unik[i] & case == case_unik[j]]
}
qui_outlier = y > quoi_case$max_whisker | y < quoi_case$min_whisker
if(any(qui_outlier)){
points(rep(quoi_case$x, sum(qui_outlier)), y[qui_outlier], pch=pch)
}
}
}
if(addMean){
quoi = base_agg[moderator == moderator_unik[i]]
points(quoi$x, quoi$avg, col = mean.col, pch = mean.pch, cex = mean.cex)
}
}
#
# Legend
#
if(isLegend){
if(!is.list(legend_options)){
stop("Argument'legend_options' must be a list.")
}
# mandatory
legend_options$legend = aliasModerator
legend_options$fill = all_inCol
legend_options$title = mod.title
# options
listDefault(legend_options, "x", "top")
# the call
do.call("legendFit", legend_options)
}
invisible(1 + (n_moderator-1)/2 + (0:(n_case-1))*(n_moderator+sep))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.