# startup-functions.r
# Created:      July 2008
# Modified:     30-Nov-2016
# Author:       Daniel Mastropietro
# Description: 	Startup settings to be invoked when R starts
# R version:    R-2.8.0 (used in SPSS 18.0.0 @NAT starting 2013)

# AUXILIARY functions
# GRAPHICAL functions

# HISTORY: (keep track of created functions, but no updates. For updates see the Git history)
# - 2013/09/19: Created function safeLog(x) to compute sign(x)*log10(constant + abs(x))
# - 2013/12/23: Created function getAxisLimits() to retrieve the correct X and Y limits of the active plot, since
# 							the values returned by par("usr") are NOT correct for the plots done by default, which use xaxs="r"
#								and yaxs="r" (regular axis) which imply that the actual "usr" limits are extended from the xlim and
#								ylim extremes by 4% of the limits range!! (Ref: documentation of par() and the 'xaxs' and 'yaxs'
#								options therein).
# - 2014/03/12: Created functions:
#								- safeLogInv(): computes the inverse of the safeLog() function
#								- logitInv(): computes the inverse of the logit() function available in the car package
#								- plot.cdf(): plots the CDF of a variable
# - 2014/03/26: Created function:
#								- plot.bar(), plot.bars(): makes a bar plot of a target variable in terms of categorical variable
#									where the bar widths are proportional to the number of cases in each category.
# - 2014/05/07: Created function cumplot() used to plot cumulative y vs. cumulative x.
#								It can be used for continuous model fit analysis for the predicted value and by input variable
#								A binary 0/1 target variable is allowed.
# - 2014/05/11: Created functions:
#								- plot.outliers2d(), panel.outliers2d(): plot 2D outliers based on estimated multivariate gaussian density (Ref: Machine Learning course)
#								- ellipse(), ellipsem(), angle() (taken from the internet and used by plot.outliers2d()
# - 2014/05/25: Created function:
#								- midpoints(): returns the midpoint of intervals generated by the cut() function.
# - 2014/07/22: Created functions:
#								- panel.image(): same as function plot.image() but with add=TRUE always.
#								- Replaced all calls to plot.image() with panel.image() (e.g. pairs.custom() now defaults to lower.panel=panel.image)
# - 2016/11/30: Created function:
#								- parseVariables(): calls unlist(strsplit(vars, "[ \n]+"))
#								Renamed function:
#								- parseVariable() -> extractVariable() (in order to avoid confusion with the newly created parseVariables() since extractVariable()
#								extracts a variable from a data frame.

# - [DONE-2013/08/05: Function checkVariables()] 2013/08/02: Write a function to check the existence of variables in a dataset so that the user knows which variables are NOT found when passed as parameters
# - 2014/07/09: In ALL plotting functions, add the functionality to transform any of the variables plotted by the safe log function. Note that this
# may not be so easily done because this wish may be different for each plotted variable, which is especially so when doing pairs plot.
# - 2014/08/25: It seems that using list(...) it is possible to read the parameters passed as part of the ... input parameters section! Try using this
# method instead of match.call() when parsing input parameters.
# - 2016/01/17: Rewrite functions so that newer powerful packages for data manipulation and plotting are used:
#		- dplyr for data manipulation (see http://scicomp2014.edc.uri.edu/posts/2014-04-14-Smith.html for an example and how it compares with plyr)
#			(dplyr uses the %>% pipe operator!)
#		- ggplot2 or ggvis for better quality graphs. In the end, alghough ggplot seems complicated, it may not be so...
# 		(ggvis uses the %>% pipe operator!)
#		HOWEVER, ggvis does NOT support all plots that ggplot2 supports, e.g. facetted (i.e. grid) plots (ref: http://stats.stackexchange.com/questions/117078/for-plotting-with-r-should-i-learn-ggplot2-or-ggvis)
#		See also NAT Tools -> Deployment for naming conventions and references for the ggplot and ggvis packages (dated 04-Apr-2015)

# Necessary Libraries
require(gdata)		# For rename.vars() etc. (used at least in plot.binned) (note: gdata also has a trim() function to remove leading and trailing blanks in strings)
require(graphics)	# For symbols(), filled.contour(), etc. (used at least in plot.binned, biplot.custom)
require(grDevices)# For colorRampPalette() (used in biplot.custom)
require(MASS)     # For kde2d() etc. (used at least in pairs.custom)

########################################## FUNCTIONS TO ADD ###################################

# Smoohting spline fit by a grouping variable
panel.smooth <- function(x, y, group, ...)
# Created: 2009/02/02
# Modified: 2009/02/02

# Use the graphics::panel.smooth() function to plot points and add a LOESS fit line to them.

########################################## FUNCTIONS TO ADD ###################################

####################################### AUXILIARY functions ###################################
# INDEX (sorted in the order they are defined)
# cdf (auxiliary function for plot.cdf)
# getAxisLimits
# getBounds
# getParamsList
# getVarType
# pairsXAxisPosition
# pairsYAxisPosition
# checkVariables
# ellipse, ellipsem, angle (auxiliary functions for plot.outliers2d)
# extract
# extractVariable
# parseVariables
# setDefaultOptions
# who
# whos

# getAxisLimits
# Get the axis limits of the current plot
getAxisLimits = function(ext=0.04, xaxs=par("xaxs"), yaxs=par("yaxs"))
# Created: 			2013/12/23
# Modified: 		2013/12/23
# Author: 			Daniel Mastropietro
# Description: 	Returns the axis limits of the active plot by taking care of the fact that by default the user coordinates
#							 	do NOT represent the xlim and ylim values of a plot!!! Since by default the value of graphical parameters
#							 	xaxs and yaxs is "r" (regular) which means that the user coordinates are extended below and above the xlim
#							 	and ylim values by 4% of the range.
#						   	For example, if xlim=c(1,10) then the user coordinates for the X axis is set to:
#							 	[1 - 4%*(10-1), 1 + 4%*(10-1)] = [0.64, 1.36]
#							 	This does not happen when xaxs and yaxs are equal to "i" (internal) the other possible value for these parameters.
# Parameters:		ext: proportion of the axis limits range by which the user coordinates are extended.
#								This is currently (version 3.0.1) equal to 4% as indicated by the 'xaxs' and 'yaxs' options of the par() documentation.
#								NOTE: (2017/03/09) When creating a barplot with a line plot on a secondary axis
#								(typically for showing #cases (first axis) and another variable on a different scale (secondary axis)), use xaxs="i"
#								in order to make the x-axis positions of the two plots to match!
#								For more info on this x-axis matching, see:
#								http://stackoverflow.com/questions/29182758/how-to-get-the-x-coordinate-of-bars-in-barplot-in-r
#								although here they do not mention the xaxs/yaxs option, but give other useful tips.
# Examples:		 	axisLimits = getAxisLimits()
	# Read useful graphical parameters of the active plot
	usr = par("usr")

	xlim = usr[1:2]
	ylim = usr[3:4]
	if (xaxs == "r") {		# Note: the only implemented values of the xaxs and yaxs options in R are "r", "i". The other 3 options are NOT implemented.
		# Range in the extended coordinates
		xdelta = diff(xlim)
		# Correct xlim values
		# (the lower end gets increased and the upper end gets decreased by a factor of ext/(1+2*ext) --e.g. 0.04/1.08 when ext = 0.04)
		xlim = xlim + c(+1,-1) * xdelta * ext/(1 + 2*ext)
	if (yaxs == "r") {
		# Range in the extended coordinates
		ydelta = diff(ylim)
		# Correct ylim values
		# (the lower end gets increased and the upper end gets decreased by a factor of ext/(1+2*ext) --e.g. 0.04/1.08 when ext = 0.04)
		ylim = ylim + c(+1,-1) * ydelta * ext/(1 + 2*ext)

	return(c(xlim, ylim))

# getBounds
# Return the lower and upper bound of intervals of the type (x1,x2] or [x1,x2), etc. and also the brackets
# (Inspired by the midpoints() function published on Matt's Stats around May-2014, which I have implemented
# as function extract())
getBounds <- function(x)
	x = as.character(x)
	left <- substr(x,1,1); 								opleft = ""; 	if (left == "(") { opleft = "<" } else if (left == "[") { opleft = "<=" }
	right <- substr(x,nchar(x),nchar(x)); opright = ""; if (right == ")") { opright = "<" } else if (right == "]") { opright = "<=" }
	lower <- as.numeric(gsub(",.*","",gsub("\\(|\\[|\\)|\\]","", x)))
	upper <- as.numeric(gsub(".*,","",gsub("\\(|\\[|\\)|\\]","", x)))
	return(list(lower=lower, upper=upper, left=left, right=right, opleft=opleft, opright=opright))

# getVarType
# Return the type of a variable in a data frame (character or numeric)
getVarType = function(x)
	# Check if the variable is factor otherwise, typeof() returns numeric!! (since factor levels are converted or stored as numbers)
	if (is.factor(x)) {
		vartype = typeof(levels(x))
	} else {
		vartype = typeof(x)


# getParamsList
#' Return the parameter list of a function call, including optional parameters
#' @param call function call from which the parameter list should be returned. This can be retrieved using match.call()
#' from within the function whose parameter list is of interest.
#' @param ... optional parameters passed to the function call of interest.
#' @return a list containing two attributes:
#' <ul>
#' <li> a list with all parameters passed to the function.
#' <li> a list with the optional parameters passed to the function (through ...), which is a subset of the 'all parameters' list.
#' </ul>
getParamsList = function(call, ...) {
  # Put all parameters into a parameter list (this includes the optional parameters)
  paramsList = as.list(call)
  # Get the optional parameters passed as ...
  optionsList = list(...)
  # Remove the function name from the all parameters list
  paramsList[[1]] = NULL

  return(list(allParams=paramsList, optionalParams=optionsList))

# pairsXAxisPosition 
#' Define the position of the x-axis in a pairs plot
pairsXAxisPosition = function(xaxt="s")
# Created: 2013/09/16
# Modified: 2013/09/16
# Description: Defines the position in a pairs panel to place the x-axis ticks using the same logic as pairs()
  mfg = par("mfg")  # 4-element array with the following info: current panel position, panel structure (e.g. (2,3,4,4))

  # X-axis
  if (xaxt == "n") {
    # Place the axis ticks next to the axis when no axis ticks have been required at the pairs() call
    outer = FALSE
    line = 0
    side = 1
  } else {
    # Place the axis ticks on the outer panel side o.w.
    # The side is chosen based on the mfg option value, in order to mimic the same logic as the pairs() function
    outer = TRUE
    line = 0  # This number should be -gap, where gap is the space between the subplots in margin lines. However, I don't know yet how to retrieve the value of gap in the current pairs() call!
    if (mfg[2] %% 2 == 1) {
      side = 1
    } else {
      side = 3

  return(list(side=side, outer=outer, line=line))

# pairsYAxisPosition 
#' Define the position of the y-axis in a pairs plot
pairsYAxisPosition = function(yaxt="s")
# Created: 2013/09/16
# Modified: 2013/09/16
# Description: Defines the position in a pairs panel to place the y-axis ticks using the same logic as pairs()
  mfg = par("mfg")  # 4-element array with the following info: current panel position, panel structure (e.g. (2,3,4,4))
  # Y-axis
  if (yaxt == "n") {
    # Place the axis ticks next to the axis when no axis ticks have been required at the pairs() call
    outer = FALSE
    line = 0
    side = 2
  } else {
    # Place the axis ticks on the outer panel side o.w.
    # The side is chosen based on the mfg option value, in order to mimic the same logic as the pairs() function
    outer = TRUE
    line = 0  # This number should be -gap, where gap is the space between the subplots in margin lines. However, I don't know yet how to retrieve the value of gap in the current pairs() call!
    if (mfg[1] %% 2 == 1) {
      side = 4
    } else {
      side = 2
  return(list(side=side, outer=outer, line=line))

# CheckVariables
# Check existence of variables in a data frame
CheckVariables <- checkVariables <- check.variables <- function(data, vars, print=FALSE)
# Created: 			05-Aug-2013
# Modified: 		30-Mar-2014
# Author: 			Daniel Mastropietro
# Description: 	Checks whether a set of variables exist in a data frame
# Parameters:
#             	data: matrix or data frame where the existence of the variables (columns) is checked
#             	vars: vector of variable names given as strings
# Output: 			A vector with the names of the variables not found in the dataset is returned.
# Examples:			varsNotFound = checkVariables(df, c("x1","z","tt"), print=TRUE)
#  if (!is.data.frame(data)) {
#    stop("CHECKVARIABLES - ERROR: The dataset '", deparse(substitute(data)), "' is not a data frame")
#  }
  # Start
  varsNotFound = NULL
  varsInData = colnames(data)
  for (v in vars) {
    if (!v %in% varsInData) {
      varsNotFound = c(varsNotFound, v)
  if (!is.null(varsNotFound)) {
    cat("CHECKVARIABLES: The following variables do not exist in data frame '", deparse(substitute(data)), "':\n", sep="")
    for (v in varsNotFound)
      cat(v, "\n")

  if (!is.null(varsNotFound)) {
  } else {
  	# Show a message either when print=TRUE or when this function was called from the parent most environment (i.e. from the GlobalEnv environment)
#    if (print | grepl("GlobalEnv", environmentName(sys.frame(sys.parent())))) # Note the use of grepl() instead of grep() so that a boolean value is returned; grep() instead returns numeric(0) when the string is not found.
    if (print | length(grep("GlobalEnv", environmentName(sys.frame(sys.parent())))) > 0) # HOWEVER, grepl() does NOT exist in R-2.8.0!! (used in SPSS 18.0.1) So in SPSS I use length(grep()) > 0 as an equivalent condition to grepl() == TRUE.
      cat("All variables found in dataset '", deparse(substitute(data)), "'\n")

# extractVariable
# Parse a variable passed to a function so that either the actual variable or a string representing a data frame's variable name are accepted
extractVariable <- extract.variable <- function(dat, var)
# Created: 			30-Mar-2014
# Modified: 		30-Mar-2014
# Author: 			Daniel Mastropietro
# Description: 	Parses variables passed to functions so that variable names are converted to the actual variable in the data frame.
# Parameters:
#             	data: matrix or data frame that should contain variable 'var'.
#             	var: string (as in "x") or variable name (as in dat$x, or dat[,"x"] or dat[,target] where target="x") that should be parsed.
# Output: 			Either an error that the variable does not exist or the actual variable taken from data frame 'dat'
# Examples:			target = "y"; target = extractVariable(toplot, target);
#								target = dat$y; target = extractVariable(toplot, target);
	# Check if 'var' evaluates to character (presumably meaning that the variable is given as a variable NAME (as opposed to a true R variable)
  var_eval = try( eval(var) )
  if (inherits(var_eval, "try-error")) {
  } else {
    if (class(var_eval) == "character" && length(var_eval) == 1) {
    	## NOTE: This function works as expected ALSO in the case when variable 'var' is given as the column of type character in a data frame with just one row
    	## (in which case the above IF will return TRUE --and this is FINE because dat[,var] below evaluates to e.g. dat[,dat[,"x"]] which is the same as dat[,"x"]! --which is what we want, the column "x" in data frame dat)
    	## HOWEVER, it will probably fail when we call extractVariable(dat, df[,"x"]) since dat[, df[,"x"]] is NOT equivalent to df[, df[,"x"]] or to dat[, dat[,"x"]]...
    	## so this is the only scenario where the function will not return what we expect --we expect it to return simply df[,"x"] since df[,"x"] is an actual variable NOT a string naming a variable
    	## --recall this is only a problem when df[,"x"] is of length 1, which is particularly a problem when we call e.g. df[ind, "x"] and ind has only one index)
      if (!is.null(checkVariables(dat, var))) {
        cat("The variables indicated above were not found in dataset '", deparse(substitute(dat)), "'\n", sep="")
      var = dat[,var] 

# parseVariables
# Convert a list of variables which can be given as a blank/tab/line separated list or as an array into an array
parseVariables <- function(vars) {
# Created: 			30-Nov-2016
# Modified: 		30-Nov-2016
# Author: 			Daniel Mastropietro
	# First remove any potential initial blanks or new lines (o.w. the first returned variable will be empty)
	vars = sub("^[ \n]+", "", vars)
	return(unlist(strsplit(vars, "[ \n]+")))	# Note: the white space matches one or more blanks or one or more tabs (see ?regex)

# extract
# Extract the mid point, lower value or upper value of open, closed, semi-open intervals like those returned by the cut() function.
# Taken from an e-mail received from R-Bloggers on 24-May-2014, originally published on Matt's Stats and stuff (not saved)
extract <- function(intervals, digits=2, what=c("midpoint", "lower", "upper"))
	# Parse input parameters
	what = match.arg(what) 

	# Parse x
	lower = as.numeric(gsub(",.*","",gsub("\\(|\\[|\\)|\\]","", intervals)))
	upper = as.numeric(gsub(".*,","",gsub("\\(|\\[|\\)|\\]","", intervals)))

	# Define the output value
	out =	switch (tolower(what),
								midpoint 	= round(lower+(upper-lower)/2, digits),
								lower			= round(lower, digits),
								upper			= round(upper, digits))

# setDefaultOptions
#' Sets a list to a set of default values
#' @param optionsList named list of options to set to default values if the option is not given.
#' @param optionsList_default named list of default option values to set in \code{optionsList}.
#' @return updated optionsList with the unset names appearing in optionsList_default set to their default value.
setDefaultOptions = function(optionsList, optionsList_default) {
	for (prop in setdiff( names(optionsList_default), names(optionsList) )) {
		optionsList[[prop]] = optionsList_default[[prop]]

# who
# List the names of the variables existing in an environment in a matrix-like form
who = function(envir=.GlobalEnv)
# Created: 			2008
# Description: 	Function that implements the Matlab-like function 'who' listing the objects defined in memory

# whos
# List the names of the variables existing in an environment and their sizes in a matrix-like form
whos = function(envir=.GlobalEnv, sortby=c("name","size"), decreasing=FALSE)
# Created: 			2008
# Modified: 		07-Jul-2015
# Author: 			Daniel Mastropietro
# Description: 	Function that implements the Matlab-like function 'whos' showing the objects defined in memory
#								along with their sizes, optionally sorting by size in ascending or descending order.
  if (length(sortby) == 2) { sortby = "name" }
	size = sapply(ls(envir=envir) , function(x) object.size(get(x)))
	if (tolower(sortby) == "size") 
		size = sort(size, decreasing=decreasing)
	else if (decreasing)
		size = size[order(names(size), decreasing=TRUE)]
	cat("Size of objects in bytes\n")
	cat("Size of objects in bytes\n")

# ellipse
# Three auxiliary functions used by plot.outliers2d
# ellipse() and ellipsem() were taken from: http://ms.mcmaster.ca/peter/s4c03/s4c03_0506/classnotes/DrawingEllipsesinR.pdf
# The function ellipsem() addsthe ellipse t(x-mu)%*%amat%*%(x-mu) = c2 to the current plot or a new plot; 
# function ellipse() draws an ellipse given the half-lengths of the axes, the angle of orientation, and the centre; 
# angle() computes the angle between the X-axis and a vector drawn from the origin to the point (x, y).
ellipse = function (hlaxa = 1, hlaxb = 1, theta = 0, xc = 0, yc = 0, newplot = F, npoints = 100, ...) 
  a <- seq(0, 2 * pi, length = npoints + 1) 
  x <- hlaxa * cos(a) 
  y <- hlaxb * sin(a) 
  alpha <- angle(x, y) 
  rad <- sqrt(x^2 + y^2) 
  xp <- rad * cos(alpha + theta) + xc 
  yp <- rad * sin(alpha + theta) + yc 
  if (newplot) 
    plot(xp, yp, type = "l", ...) 
  else lines(xp, yp, ...)
ellipsem = function (mu, amat, c2, npoints = 100, showcentre = T, ...) 
  if (all(dim(amat) == c(2, 2))) { 
    eamat <- eigen(amat) 
    hlen <- sqrt(c2/eamat$val) 
    theta <- angle(eamat$vec[1, 1], eamat$vec[2, 1]) 
    ellipse(hlen[1], hlen[2], theta, mu[1], mu[2], npoints = npoints, 
    if (showcentre) 
      points(mu[1], mu[2], pch = 3) 
  invisible() # This allows for an object to be returned when they are assigned but which do not print when they are not assigned
angle = function (x, y) 
  angle2 <- function(xy) { 
    x <- xy[1] 
    y <- xy[2] 
    if (x > 0) { 
    else { 
      if (x < 0 & y != 0) { 
        atan(y/x) + sign(y) * pi 
      else { 
        if (x < 0 & y == 0) { 
        else { 
          if (y != 0) { 
            (sign(y) * pi)/2 
          else { 
  apply(cbind(x, y), 1, angle2) 

# cdf
# 2014/07/17: Compute empirical CDF of a numeric variable
# Application: initially tried to use to determine the sampling probabilities for a reject inference modeling simulation based on the distribution
# of a score variable. But it turned out that this only works when the population from which the sample should be drawn has a uniform distribution
# on this score variable. So I turned into using the histogram on pre-specified bins in order to determine such sampling probabilities.
# Ref: NAT -> NBC-2014 -> M2014-v2SBR-01RejectInference_v2.sps
cdf = function(x, include.lowest=FALSE)
# Created:			2015/02/04
# Modified:			2015/02/04
# Author:				Daniel Mastropietro
# Description:	Computes the empirical CDF of a numeric variable.
#	Parameters:		x: 								Numeric vector. Missing or NA values are allowed.
#								include.lowest: 	Flag indicating whether to include the lowest value of x as 0% percentile.
# Output: 			A data frame with 4 columns:
# 							- x:		distinct values of x occurring in the input vector x.
# 							- freq:	number of cases in the data taking the value x.
# 							- pdf:	probability density function: proportion of non-missing cases in the data taking the value x.
# 							- cdf:	cumulative density function: proportion of non-missing cases in the data taking the value <= x.

	# Frequency table of the data values
	x.tab = as.data.frame(table(x))			# Note that NAs are not counted by table()
	colnames(x.tab) = c("x", "freq")
	# Remove the factor property from column "x" (which is generated by table(x))
	x.tab$x = as.numeric( as.character(x.tab$x) )
	if (include.lowest)
		# Add the minimum value of x one more time so that it is regarded as 0% percentile
		x.tab = rbind(c(0,0), x.tab)

	# Count the total number of non-missing cases
	n = sum(x.tab$freq)
	# Check if there are missing values
	if (n < length(x) && !grep("plot.cdf", sys.call(1))) {
		# Only show the warning when this function was not called from plot.cdf() because that function already issues
		# a warning when there are missing values in x.
		na.pct = formatC((length(x)-n)/length(x)*100, format="f", digits=1)
		cat("CDF: WARNING -", length(x)-n, "NA(s) (", na.pct, "% ) found in array", deparse(substitute(x)), ".\n")
		cat("CDF: Analysis based on", n, "valid values out of", length(x), ".\n")

	# Compute PDF and CDF
	x.tab$pdf = x.tab$freq / n
	x.tab$cdf = cumsum(x.tab$freq) / n

####################################### AUXILIARY functions ###################################

################################## DATA TRANSFORMATION functions ##############################
# INDEX (sorted alphabetically)
# quantile.weight
# logitInv
# safeLog
# safeLogInv

# 2013/08/18
# Group of numeric variable using equal-size groups similar to the quantile() function but using weights for each observation.
# Motivation: create quantile groups based on the estimated density of a numeric variable so that the histogram bins are smaller
# where the density is larger.
# For an example see: dev\Density&Histogram.r
quantile.weight = function(x, weight, probs=seq(0,1,0.25), type=4, precision=4)
  # Parameters:
  # x:          numeric vector for which weighted quantiles are computed
  # weight:     weights to use for each value in x (any non-negative real number is allowed)
  # probs:      vector of probabilities defining the quantiles (same meaning as 'probs' in quantile() function in R)
  # precision:  number of significant digits to keep in the quantile values
  # Output:     A data frame with three columns:
  #							- non weighted probabilities corresponding to the weighted x quantiles
  #							- the weighted x quantiles
  #							- the weight quantiles
  # 						The row names of the data frame are the probabilities on which the quantiles are computed.
  #             The method used for the quantile computation is the method implemented as type=4 in the quantile() function in R,
  #             i.e. a linear interpolation is performed on the empirical weighted cdf.
  # Data example to try this out:
# x = c(0.532911970399581 , 0.343041113563211 , 0.386032839804403 , 0.700412550080721 , 0.532742005037833 , 0.58089531748196 , -0.217689730141691 , 0.22090301819094 , 0.623072695280153 , 0.566896956080503 , 0.750391260907804 , 0.516034289364722 , 0.302135940489829 , 0.489357487843298 , 0.778118464095309 , 0.28906342442205 , 0.0515489742182842 , 0.603788130986552 , 0.403093379536764 , 0.650808860183577)
# weight = c(1 , 0 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 1 , 2 , 3 , 1 , 4 , 3 , 5 , 1 , 8 , 3 , 2)
  if (length(weight) != length(x))
    stop("The weight variable ", deparse(substitute(weight))," and the analysis variable ", deparse(substitute(x))," have different lengths:", length(weight), "and", length(x), "respectively.")
  if (type != 4)
    stop("The type parameter value ", type, " is not valid. The only accepted value is 4, which corresponds to the type=4 method described in R function quantile().")
  # Note that order() is different from rank()!! order() gives the order of the indices of x so that x is in sorted.
  # rank() gives the rank of each x in the order that x is arranged!
  # This implies the x[order(x)] returns x in ascending order, but x[rank(x)] is something completely different!
  xorder = order(x)
  xsort = x[xorder]
  wsort = weight[xorder]
  wtotal = sum(as.numeric(weight), na.rm=TRUE)	# as.numeric() is used to avoid overflow... this overflow error happened once when using large values for weight!
  # Compute the CDF of x weighted by variable weight (i.e. each x is represented as many times as the corresponding value given in weight)
  X = as.data.frame(cbind(x=xsort, w=wsort, p=cumsum(wsort/wtotal)))
  # p is cdf with weights, q is cdf w.o. weights!
  X$q = as.numeric(1:nrow(na.omit(X)))/nrow(na.omit(X))
  # Look for the cut points of the weight-based quantile groups
  xsubset = X$x
  wsubset = X$w
  psubset = X$p
  qsubset = X$q
  indX = 0
  X$group = NA
  # Non-weighted quantiles (i.e. percentage of values of x smaller than the weighted quantile)
  quantPrev = qsubset[1]
  quantiles = quantPrev
  # Quantiles for x
  xquantilePrev = xsubset[1]
  xquantiles = xquantilePrev
  # Quantiles for the weight
  wquantilePrev = wsubset[1]
  wquantiles = wquantilePrev
  for (p in probs[probs>0]) {
    ind = (psubset<=p)
    last = sum(ind)
    # Check if any value in psubset was found to be <= p (recall that psubset is updated at every loop so that in the next loop
    # it contains the values that were not already analyzed in the loop just processed)
    if (last > 0) {
    	# x and weight values of the cases in the current quantile
			qgroup = qsubset[ind]
			xgroup = xsubset[ind]
			wgroup = wsubset[ind]
			# q and x quantile
			qmax = qgroup[last]
			xmax = xgroup[last]
			if (psubset[last] < p & p < 1) {
				# Interpolate to get the quantile
				quant = qsubset[last] + (p - psubset[last]) / (psubset[last+1] - psubset[last]) * (qsubset[last+1] - qsubset[last])
				xquantile = xsubset[last] + (p - psubset[last]) / (psubset[last+1] - psubset[last]) * (xsubset[last+1] - xsubset[last])
			} else {
				# Assign the quantile to be equal to xmax when the value of X$p coincides exactly with the current probability being processed in probs
				quant = qmax
				xquantile = xmax
			# Weight quantile is the cumulative sum of the weights UP TO the current group!
			wquantile = wquantilePrev + sum(as.numeric(wgroup))
			X$group[indX+1:last] = rep(paste("(", signif(xquantilePrev, digits=precision), ",", signif(xquantile, digits=precision), "]", sep=""), last)
			# Update X index, subset vectors and quantiles
			indX = indX + last
			quantPrev = quant
			xquantilePrev = xquantile
			wquantilePrev = wquantile
			qsubset = qsubset[-(1:last)]
			xsubset = xsubset[-(1:last)]
			wsubset = wsubset[-(1:last)]
			psubset = psubset[-(1:last)]
		} else {
			# When no value was found satisfying psubset <= p, assign the same x value as for the previous quantile
			quant = quantPrev
			xquantile = xquantilePrev
			wquantile = wquantilePrev
		quantiles = c(quantiles, quant)
		xquantiles = c(xquantiles, xquantile)
		wquantiles = c(wquantiles, wquantile)
  output = as.data.frame(cbind(quantiles, xquantiles, wquantiles))
  colnames(output) = c("q", deparse(substitute(x)), deparse(substitute(weight)))
  rownames(output) = paste(probs*100,"%", sep="")

# Compute the safe-log in base 10, which accepts ALL real values as input
safeLog = function(x, constant=1)
  return(sign(x)*log10(constant + abs(x)))

# Compute the inverse of the safe-log function in base 10 implemented in function safeLog()
safeLogInv = function(x, constant=1)
  return(sign(x)*(10^abs(x) - constant))

# Compute the inverse of the logit() function defined in the car package
logitInv = function(x, adjust=0)
# The logit() function in the car package allows an adjustment of the 0 and 1 probabilities to
# avoid -Inf and +Inf, by using an adjustment factor 'adj' and computes the logit as:
# log(padj/(1-padj))
# where padj = p*(1-2*adj) + adj, linear mapping from [0,1] to [adj,1-adj]
# Note: This was verified by intuition and comparing the output from logit() with my own
# computation as described above.
# Note that there is an inverse logit function also in the gtools package.
	y = 1/(1+exp(-x))
	yadj = (y - adjust)/(1-2*adjust)
################################## DATA TRANSFORMATION functions ##############################

###################################### GRAPHICAL functions ####################################
# INDEX (sorted alphabetically)
# cumplot (NEW May-2014)
# hist.log
# biplot.custom
# pairs.custom
# plot.axis
# plot2				(NEW Mar-2017)
# plot.bar		(NEW Mar-2014)
# plot.binned
# plot.cdf		(NEW Mar-2014)
# plot.dist
# plot.hist
# plot.image
# plot.log
# plot.outliers2d	(NEW May-2014)

# PANEL FUNCTIONS (functions that only make sense when ADDED to a panel --such as the panels on a pairs plot)
# panel.cor
# panel.dist
# panel.hist
# panel.outliers2d	(NEW May-2014)

# 2013/09/21
# plot.dist: Scatter plot with boxplot and density displayed on the side of each axis
plot.dist = function(dat, plot.fun=plot, respect=FALSE, histogram=FALSE, xlim=NULL, ylim=NULL, ...)
# This function uses the layout() function to distribute the differenet plots and calls panel.dist() (defined below)
# Parameters: respect is the aspect ratio respecting of the layout() function
  ### Parse the function call
  call = match.call()
  # Put all calling parameters into a parameter list
  paramsList = as.list(call)
  # Remove the function name from the parameter list
  paramsList[[1]] = NULL
  # Remove also the data frame because this name is not recognized by plot() or similar functions, whose first argument is named 'x'
  paramsList$dat = NULL

  ### Define the graphical layout, as follows:
  # Plot #1: will be done on the bottom-left tile (scatter plot)
  # Plot #2: will be done on the top-left tile (distribution of variable 1)
  # Plot #3: will be done on the bottom-right tile (distribution of variable 2)
  # No plot on the upper-right tile
  # This position is defined by the numbers 0, 1, 2, 3, where 0 means NO plot.
  # The widths and heights of each subplot are different, accordingly to what is plotted
  op = par(mar=c(4,4,0,0), no.readonly=TRUE);
  layout = layout(matrix(c(2,0,1,3), ncol=2, byrow=TRUE), widths=c(4/5,1/5), heights=c(1/5,4/5), respect=respect)
  on.exit({ par(op); layout(1) })
  x1 = dat[,1]
  x2 = dat[,2]
  # Add these variables to the parameter list to pass to the plot.fun function
  # Note the use of quote() to avoid having the values plotted shown as labels! (see comment below prior to the do.call() command)
  paramsList$x = quote(x1)
  paramsList$y = quote(x2)
  paramsList$plot.fun = NULL
  paramsList$respect = NULL
  paramsList$histogram = NULL
  varnames = colnames(dat)
  if (!is.null(varnames)) {
  	paramsList$xlab = varnames[1]
  	paramsList$ylab = varnames[2]
  if (!is.null(xlim)) {
  	paramsList$xlim = xlim
  	paramsList$ylim = ylim
  } else {
  	# Set the xlim and ylim that are used when generating the distributions of x1 and x2 on the side of the plot
  	# so that there is alignment with what is plotted in the main plot of x2 vs. x1.
  	xlim = range(x1)
  	ylim = range(x2)
  ### Main plot
  # Still cannot make the do.call() work. The problem I have is that instead of points, the plot is filled with the data values!! (strange)
  # (2014/05/12) SOLVED! The issue was solved by using the quote() function above to define the paramsList$x and paramsList$y elements.
  # The reason behind this issue is the following: prior to the do.call() execution below, all parameters are EVALUATED! Therefore
  # when deparse(substitute(x)) is used inside the plot() function in order to show the label for the x axis, the label is NOT shown
  # because x HAS ALREADY BEEN EVALUATED. Therefore the evaluated values of x are shown as labels!
  # The solution is to assign quote(x) instead of just 'x' to paramsList$x (as done above when creating the paramsList object).
  do.call(deparse(substitute(plot.fun)), paramsList)
#  plot(x1, x2, xlim=xlim, ylim=ylim, ...)	# This is no longer needed if the above do.call() works... I still leave it in case there is a problem with the above

  ### Distribution of x1
  par(mar=c(0,4,0,0), pty="m")	# The non-zero margin should be the same as the margins defined above (using mar) when doing the main plot of x2 vs. x1
    ## Make the horizontal axis of the distribution stick to the top axis of the scatter plot
    ## Also make the plot region maximal so that the whole subplot is occupied by the distribution
  panel.dist(x1, histogram=histogram, horizontal=TRUE, add=FALSE, main="", axes=FALSE, xlim=xlim, ylim=ylim)

  ### Distribution of x2
  par(mar=c(4,0,0,0), pty="m")	# The non-zero margin should be the same as the margins defined above (using mar) when doing the main plot of x2 vs. x1
    ## Make the horizontal axis of the distribution stick to the right axis of the scatter plot
    ## Also make the plot region maximal so that the whole subplot is occupied by the distribution
  panel.dist(x2, histogram=histogram, horizontal=FALSE, add=FALSE, main="", axes=FALSE, xlim=xlim, ylim=ylim)

# panel.cor: Correlations as an off-diagonal panel function in pairs(). The font size is proportional to the correlation.
# - 2013/12/13: Make the background color of the whole panel vary according to the correlation level, e.g. blue for negative
# correlation, white for no correlation, red for positive correlation.
panel.cor <- function(x, y, digits=2, prefix="", use="pairwise.complete.obs", cex.cor, ...)
# Created: 2008/06/20
# Modified: 2013/06/25
# Function to use as a panel function in pairs() that writes the correlations whose font size is ~ correlations.
# Typically used as the lower.panel function in pairs().
# (taken from help(pairs))
# Note the inclusion of the '...' parameter in order to avoid any errors when additional parameters
# are passed to other functions used in the pairs() call to be used for the different panels
# (e.g. plot.binned() below which accepts many parameters)
# HISTORY: (2013/06/25) added parameter 'use' passed in the cor() function call to define how missing values should be treated for the computation of the correlation.
  opar <- par(usr=c(0, 1, 0, 1)); on.exit(par(opar))
  r <- abs(cor(x, y, use=use))
  txt <- format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex <- 1/strwidth(txt)
  text(0.5, 0.5, txt, cex=cex*r)

# panel.dist: 1D density estimation and boxplot as a diagonal panel of a pairs() plot.
# (Ref: Uwe Ligges, https://stat.ethz.ch/pipermail/r-help/2011-July/282611.html)
# - 2013/09/21: Replace the use of the barplot() function with the use of the rect() function to draw the histogram bars
# horizontally (when horizontal=FALSE --because 'horizontal' refers to the boxplot horizontality) and
# thus avoid the problem with the barplot() function which does NOT use the x scale but a simple 1 ... #bins scale, and
# this makes the overlaying of the histogram and the density not always coincide!
# (see ane example of the rect() function in panel.hist())
panel.dist = function(x, add=TRUE, histogram=FALSE, horizontal=TRUE, xlim=NULL, ylim=NULL, ...)
# Created: 2013/07/08
# Modified: 2013/09/21
# Description: Shows a density estimation and a boxplot overlayed
# Parameters: x: numeric variable of interest
# HISTORY:  (2013/09/21) Added parameters 'add', 'hist', 'horizontal' to control whether to:
#                        - create a new plot (add=FALSE)
#                        - plot also a histogram (histogram=TRUE)
#                        - show the density and boxplot in vertical position (horizontal=FALSE)
  # Density estimation
  dens = density(x, na.rm=TRUE)
  # Histogram
  if (histogram) {
    hist = hist(x, plot=FALSE)
  # Create a new plot if requested
  if (!add) {
    if (horizontal) { # boxplot is horizontal => density is horizontal
      if (missing(xlim) | is.null(xlim)) xlim = range(dens$x)
      ylim = range(dens$y)
      if (histogram) ylim = range(c(ylim), hist$density*1.5)
    } else {  # boxplot is vertical => density is rotated 180 degrees (to show the density on a vertical axis on the side of a scatter plot)
      xlim = range(dens$y)
      if (missing(ylim) | is.null(ylim)) ylim = range(dens$x)
      if (histogram) xlim = range(c(xlim), hist$density*1.5)
    plot(dens, type="n", xlab="", ylab="", xlim=xlim, ylim=ylim, ...)
  # Start plotting
  usr <- par("usr"); on.exit(par(usr))
  if (histogram) {
    ### Plot the histogram using barplot()
    ### Note that I need to use barplot() and NOT plot(hist, ...) because I want to plot the histogram
    ### horizontally (horizontal=FALSE, because this parameter refers to the boxplot horizontality!) and
    ### plot.hist() function does NOT have an option to show the plot rotated from the normal display)
    ### However, I could consider using plot(hist$density, hist$mid, type="s") to show the histogram rotated but this
    ### still needs some development because the way it is shown is not in the normal histogram way... 
    ### A possible evolution of this is using the rect() function to create bars, as is done by panel.hist() in this code! (by Uwe Ligges)
    # IMPORTANT: Define the scale for the barplot (so that the x values coincide with the x values of the plot!)
    # This is necessary because the barplot positions the bars ALWAYS on the values 0.5, 1.5, ..., length(hist$mid)-0.5,
    # regardless of the values of x!!!
    # Note also that the computation of the scale is different depending on whether the histogrma is shown horizontally or vertically
    # (This was done by trial and error and it is not assured that it works in all situations!!)
    scale = hist$mid[length(hist$mid)] / (length(hist$mid) + (as.numeric(horizontal)-0.5)*2)
    if (horizontal) {
      par(usr=c(usr[1:2]/scale, usr[3:4]))
    } else {
      par(usr=c(usr[1:2], usr[3:4]/scale))
    barplot(hist$density, axes=FALSE, space=0, horiz=!horizontal, add=TRUE)
    # Restore the user coordinates
  if (horizontal) {
    boxplot(x, at=0.5, boxwex=0.3, horizontal=TRUE, add=TRUE, ...)
  } else {
    lines(dens$y, dens$x)
    boxplot(x, at=0.5, boxwex=0.3, horizontal=FALSE, add=TRUE, ...)

# panel.hist: Histogram as a diagonal panel function in pairs()
panel.hist <- function(x, ...)
# Created: 2008/06/20
# Modified: 2008/06/20
# Function to use as a panel function in pairs() that plots the histogram of the variables.
# Typically used as the diag.panel function in pairs().
# (taken from help(pairs))
# Note the inclusion of the '...' parameter in order to avoid any errors when additional parameters
# are passed to other functions used in the pairs() call to be used for the different panels
# (e.g. plot.binned() below which accepts many parameters) 
	# Change the user coordinates for the panel so that the vertical axis varies from 0 to 1.5.
	# This allows a percent histogram to fit in the panel and leaves space above the histogram
	# to show the variable name (added by the text.panel= function specified in the pairs() plot)
	# What I don't understand is how the variable name appears on top of the histogram and not OVER the histogram...
	# (I haven't found anything in the default text.panel= function that suggests what happens with the variable name)
  usr <- par("usr"); on.exit(par(usr=usr))
  par(usr=c(usr[1:2], 0, 1.5) )
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y/max(y)
  # Use the rect() function (rect(xleft, ybottom, xright, ytop)) to ADD the histogram to the already generated plot for the panel
  # Recall that panel plots should only ADD plots, not create a new plot (which gives an error).
  rect(breaks[-nB], 0, breaks[-1], y, col="light cyan")

# plot.image: Image showing the 2D density estimation
# (Ref: Uwe Ligges, https://stat.ethz.ch/pipermail/r-help/2011-July/282611.html)
plot.image = function(x, y, col="red", nlevels=12, xaxt="s", yaxt="s", addXAxis=FALSE, addYAxis=FALSE, add=FALSE, ...)
# Created: 2013/07/08
# Modified: 2013/11/14
# Description: Shows an image that represents the levels of the 2D density estimate of a pair of numeric variables using kde2d.
# Parameters: 
#             x, y:     numeric variables whose 2D density we want to plot
#             nlevels:  number of colors used in the image representing the 2D density estimation 
#                       nlevels=12 is the value used by default by the image() function to define the number of colors used.
# Note: the xaxt and yaxt parameters are needed in order to receive the value passed by the user when calling pairs()
# For some reason the xaxt and yaxt options are not passed to this function when not explicitly listed here! (I guess this happens with all other graphical parameters passed in ...???)
# HISTORY:  (2013/09/15) Converted the function to a 'plot' function rather than 'panel' function in the sense that
#           it can be called to create a NEW plot. For this I simply added parameter 'add' (which defaults to TRUE anyway, so that  it still works with pairs())
#           and added the '...' parameter list on the call to image(), so that the user can pass customized graphical parameters.
#           (2013/09/23) Added parameter 'col' to define the color of the image intensity.
#						(2013/11/14) Fixed an error happening when the 2D density estimation produced by kde2d yields ALL NA values for xy$z.
#						This may happen when the x and y data have e.g. too many repeated values (it seems this is the reason of all NA values!)
# 					The fix was done by jittering the data a bit (i.e. adding noise with the jitter() function)
  # Color definition for the different levels of the image plot
  whitered.palette = colorRampPalette(c("white", col))  # This is a function so it can be used as the 'col' argument of image()

  # Remove any missing values
  indok = !is.na(x) & !is.na(y)
  xy <- try( kde2d(x[indok],y[indok]) )
  if (!inherits(xy, "try-error")) {
    if (all(is.na(xy$z))) {
    	# If the above density estimation fails (i.e. all z values are NA because of flawed data --e.g. too many repeated values)
    	# create a new density estimation based on jittered x and y values (i.e. with added noise)
    	# Otherwise calling image() on all NA data gives an error. Note that it suffices for ONE value to be non-NA for the image() function to work.
    	xy <- kde2d(jitter(x[indok]),jitter(y[indok]))
    # Show the image plus contour lines
		# NOTE: (2015/02/09) The add=TRUE feature does NOT add the image to an existing plot as expected,
		# because image() paints on the existing graphics (see documentation)
    image(xy, add=add, col=whitered.palette(nlevels), ...)
    contour(xy, add=TRUE)
    # Add axis ticks if requested
    if (addXAxis) {
      axisProperty = pairsXAxisPosition(xaxt=xaxt)
      axis(at=pretty(x[indok]), side=axisProperty$side, outer=axisProperty$outer, line=axisProperty$line)
    if (addYAxis) {
      axisProperty = pairsYAxisPosition(yaxt=yaxt)
      axis(at=pretty(y[indok]), side=axisProperty$side, outer=axisProperty$outer, line=axisProperty$line)

# panel.image: Same function as plot.image but as a panel function (i.e. always uses add=TRUE)
panel.image = function(x, y, col="red", nlevels=12, xaxt="s", yaxt="s", addXAxis=FALSE, addYAxis=FALSE, ...)
# Created: 22-Jul-2014
	plot.image(x, y, col=col, nlevels=nlevels, xaxt=xaxt, yaxt=yaxt, addXAxis=addXAxis, addYAxis=addYAxis, add=TRUE, ...)

# plot.binned: Binned plot
# - [DONE-2016/12/01: Partially done: the log transformation is implemented on the x variable but the x axis does not show the original
# x values] 2008/09/09: Add a parameter that allows to show the labels of the original scale when the plotted variable
# is log-transformed. OR BETTER (2015/04/03): add a parameter that requests using the log of the variable to compute the bin centers
# (in order to avoid a too stretched x scale to the rigth when the original variable has too large values), then use log(x) for the plot
# but use the original x values for the x labels.
# - 2013/09/17: Add parameter 'lmbands' which can take values NULL, "confidence", "prediction" which are the possible
# values of parameter 'interval' of the predict.lm() function in order to add the confidence or prediction bands to the lm fit.
# The code to compute those bands would be the following:
# lmfit.pred = predict(lmfit, interval=lmbands)
# lines(toplot$x_center, lmfit.pred[,"upr"], col="blue", lty=0.8)
# lines(toplot$x_center, lmfit.pred[,"lwr"], col="blue", lty=0.8)
# - 2013/09/21: Solve the minor issue of correctly placing the secondary axis for 'target' when plot.binned() is called from
# pairs(): We should set parameter 'line' in the axis() function --used to indicate the position of the secondary axis-- equal to -gap,
# where 'gap' is the space (in margin lines) between the pairs subplot. However, I don't know how to read the value of such parameter.
# One option would be to use the sys.calls() function (already used at the beginning of plot.binned()), identify the pairs() call and
# check whether the 'gap' parameter is listed in that call. If not, assume that the default 'gap' value is in use (gap=1).
# - 2013/11/14: If requested, have the function call InformationValue() to compute the information value of the analyzed variable
# on the target and in that case display it as the title of the plot and also show a table of the WOE by bin on the right-hand side
# of the plot (but only when add=FALSE, which is the case for instance when the plot is NOT part of a pairs plot)
# - 2015/04/03: Add the possibility of NOT using CIRCLES to plot each point (i.e. points that are proportional to the bin size) but to
# show just the point labels indicating the bin size. This is to mimic what the PartialPlot() function does in SPSS/Python where we can
# easily appreciate the size of the point because we can see the pointlabel indicating the bin size. However, SPSS has a smart way of
# placing the labels when there is overlap... and R does not!
# - 2016/12/01: Add the possibility of specifying xlim="both" and ylim="both" in which case two plots are shown in parallel (mfrow=c(1,2)),
# the left plot showing the original axis limits and the second plot showing the new (reduced after grouping) axis limits)
plot.binned = function(
	x, y, pred=NULL, target=NULL, grouped=FALSE, size=NULL,	# 'grouped' whether the data is already grouped; 'size' variable to use as symbol size in the plot (e.g. x_n)
	xtransform=NULL,	# transformation to apply to the x values. Either NULL, "", "orig" or "log" which applies the safe log function to the original values BEFORE grouping (this limits the influence of outliers)
	center=mean, scale=sd, groups=20, breaks=NULL, rank=TRUE,
  lm=TRUE, loess=TRUE,
  circles=NULL, thermometers=NULL,  # 'circles' and 'thermometers' must be EXPRESSIONS that parameterize the corresponding symbol based on the computed grouped values (e.g. circles=expression(x_n) or thermometers=expression(cbind(x_n,y_n,target_center)), since these values would not exist during the function call)
  col="light blue", col.pred="black", col.target="red", col.lm="blue", col.loess="green",
	pointlabels=TRUE, limits=FALSE,		# 'limits' indicates whether to show the x limits of each bin as vertical gray lines (so that we know from where to where each points spans in terms of the x values in the bin)
	bands=FALSE, width=1, stdCenter=TRUE, # stdCenter: whether to show the bands using the standard deviation of the *summarized* y value (calculated as scale/sqrt(n), where n is the number of cases in the x category), instead of showing the standard deviation of the y value, i.e. +/- 'scale'.
  boxplots=FALSE, add=FALSE, offset=1, inches=0.5, size.min=0.05, cex.label=0.5,
	xlab=NULL, ylab=NULL, xlim=NULL, ylim=NULL, ylim2=NULL,  # ylim for the secondary axis used to plot the target values
	xlimProperty=xlim, ylimProperty=ylim, ylim2Property=ylim,
		# xlim, ylim, ylim2 can be either a regular c(minvalue, maxvalue) or a character value: "new", "orig" or "both"
		# in order to specify whether the new scale coordinates (after binning), or the ORIGINAL values should be used as axis limits.
		# Within a pairs() call the same axis scales are expected to be shown for ALL panels, unless missing values vary across variables.
  type2="o", pch2=21,	  # type and pch par() options defining the form and symbol of the target variable
  xaxt="s", yaxt="s",
	addXAxis=FALSE, addYAxis=(ylim=="new" || ylimProperty=="new"), addY2Axis=(ylim2=="new" || ylim2Property=="new"),
		# addXAxis, addYAxis, addY2Axis: Whether to force showing the x/y/y2-axis tick marks next to EACH axis in a pairs plot.
		# Note that these parameters are ONLY used when plot.binned() is called as a panel function of a pairs plot.
		# This is important when plot.binned() is called from within a pairs plot, particularly for the y-axes. In fact the vertical axes on the different pair
		# combinations may be potentially different because of the grouping done on the x variable, and we may want to know what the actual vertical axes values are.
		# These parameters were added because it was not possible to check the value of graphical parameter 'yaxt' (which defines whether the y axes are shown)
		# from within a function called as a pairs() panel function because its value is always returned as NULL... this means that I cannot decide whether
		# to force showing the axis by first checking whether the axis tick marks have already been placed by the pairs() function.
   # addY2Axis: whether to force showing the secondary y-axis tick marks next to EACH axis, because the axis on the different pair combinations may be potentially different (this is set to TRUE when the secondary axis limits are NOT requested to be in the original coordinates --in which case the secondary y-axis have the same range for all pairs plots)
	clip="region",				# clipping area, either "figure" or "region". This parameter affects the xpd option of par(). Note that "region" is the default in par() but here I set "figure" to the default so that big bubbles are shown full --instead of being partially hidden by the axes.
	plot=TRUE, print=TRUE, ...)
# Created: 			09-Sep-2008
# Modified: 		11-Mar-2014
# Description:	Plots on binned variables, optionally with a third variable that for example represents a fit from a model.
# HISTORY:  (2013/06/25) Added parameter cex to change minimum label size for the points (added using text()) and to define overall label size (such as for labels added by mtext())
#           (2013/07/08) Added parameter pairsFlag so that this function can be used for the plots in the panels of a pairs() call (which does NOT allow the generation of a new plot in the panels)
#           (2013/08/14) Added parameter groups which plays the role of the previously names 'breaks', and I use now 'breaks' in case the user wants to define their own break points (instead of specifying 'groups')
#           (2013/09/03) Took care of missing values in the data (i.e. removed cases that have missing values in ANY of the analyzed variables x, y and pred.
#           (2013/09/15) Added parameter 'target' to define a target variable to plot on secondary axis
#                        Renamed parameter 'new' to 'add' to avoid confusion of the use of the 'new' parameter of par(), since until now I was setting par(new=!new), which is rather confusing...
#                        Renamed parameter 'cex' to 'cex.label' to reflect that this cex affects only labels and NOT point sizes.
#                        Renamed parameter 'value' with 'center' to be in accordance with the R convention given by function scale()
#                        Added parameter 'scale' that defines the function to use for the scale statistic. This can be any scale statistic accepted by an apply() function. E.g. 'sd', 'mad'.
#                        Added parameters 'thermometers' and 'circles' to define what is plotted by the symbols() function and parameterize them.
#                        Added parameters 'lm' and 'loess' to define whether to show the linear fit and the loess fit.
#                        Added parameter 'ylim2' for the limits of the secondary axis where the target variable is plotted.
#                        Added parameters 'xlimProperty', 'ylimProperty', 'ylim2Property' which can be either "new" or "orig" to indicate whether to use the new (after x categorization) range of values or the original range of values
#                        respectively for the x-axis scale, the y-axis scale and the secondary y-axis scale used for the target variable.
#                        Added parameters 'addXAxis' and 'addYAxis' to solve the problem of manually adding the axis ticks on the pairs() plots. The idea is that the axis ticks are manually added when pairs() is run with xaxt="n" and/or yaxt="n" and the user explicitly requests to add the axis ticks.
#                        Added parameter 'addY2Axis' to make sure that the secondary y-axis is shown on every panel subplot when the secondary y-axis scale is NOT the same (which happens --unless different missing patterns occur in the data-- when ylim2Property="orig") 
#                        Added parameter 'plot' that defines wheter the plot is produced or not. This is useful when we just want to have the binned data generated by this function.
#                        Eliminated parameter 'pairsFlag'. Now its value is computed automatically by the function by checking whether this function was called from a pairs() function, using sys.calls().
#                        Eliminated parameter 'return'. Now I ALWAYS return the binned data used for plotting.
#                        Redefined the way xlim and ylim are computed:
#                        - in both cases the newly added parameters xlimProperty and ylimProperty are used to define whether the scale used is the scale of the new categorized values or the scale of the original x and/or y values.
#                        - for xlim in addition I had to make sure that its value is used in the symbols() and the plot() functions in order to have
#                        the plot of the target vs. x_cat values and the plot of the y vs. x_cat values aligned. At first this seems not necessary,
#                        but it is required because the way that the symbols() and the plot() functions compute the user coordinates based on range of values of x
#                        are different! (generating therefore a desalignment unless the xlim option is explicitly defined with the same value in both function)
#           (2013/09/21) Added parameter 'pointlabels' to define whether labels showing the group size are shown.
#                        Added parameters 'col', 'col.lm', 'col.loess' to define the colors of the points and the line fits to draw.
#                        Changed the behaviour of xlim, ylim, ylim2 with xlimProperty, ylimProperty and ylim2Property, respectively,
#                        so that now any value given in xlim, ylim or ylim2 has precedence over the value given in xlimProperty, ylimProperty and ylim2Property.
#						(2013/11/15) Added parameter 'clip' which indicates the area to which plotting should be clipped. This affects the xpd
#												 option of par() and is set to "figure" by default in order to avoid having large bubbles near the axis being
#												 partially hidden by the axis itself.
#						(2013/12/31) Added the min and max values of x and y for each x category as part of the data returned by the function as output$data.
#												 NOTE: THIS STILL NEEDS FIXING BECAUSE IT DOES NOT WORK WHEN 'pred' OR 'target' ARE PASSED. For now it is commented out.
#						(2014/01/04) Added parameters col.pred and col.target to define the colors of the 'pred' and 'target' lines in the plot.
#						(2014/03/04) Changed the color parameter for the pointlabels and error bars from "blue" to "black", so that they are in neutral color which does not depend on the color used for the points.
#						(2014/03/11) Added parameters 'type2' and 'pch2' defining the form and symbols for the plot of the target variable.
#						(2014/03/31) Fixed errors occurring with target variable, when: target variable has missing values (NA) and error message showing up when target=NULL was explictly passed (this was solved by replacing condition "!missing(target)" with "!is.null(target)")
#												 Simplified the specification of the axis formats by removing parameters xlimProperty, ylimProperty, ylim2Property. Now the "new" or "orig" specifications should be directly passed to xlim, ylim, ylim2.
#												 Updated ylim range when pred variable is passed and ylim="new".
#						(2014/05/11) Added logical parameter 'limits' which indicates whether to show the x limits of each bin as vertical gray lines.
#												 Changed the default value of 'clip' from "figure" to "region" and at the same time forced the use of par(xpd=TRUE) before showing the point labels.
#						(2015/02/03) Added parameter 'size' to specify the variable that should be used as size for the symbols() plot when grouped=TRUE.
#												 Although the circles= and thermometers= options already existed for this purpose (by way of expressions) its functionality
#												 doesn't seem to be working properly (since the bubbles are not plotted proportinal to the variable specified in circles=,
#												 e.g. circles=expression(x_n), and in addition the size information specified in circles= is NOT used when computing the
#												 lm() and loess() fits!)
#						(2016/12/01) Added parameter 'xtransform' which allows requesting a safe log transformation for the x values prior to the analysis.
  #----------------------------------- Parse input parameters ---------------------------------#
  ### Check whether this function is called by pairs, in order to set parameter add=TRUE, so a new plot is NOT created (which gives an error in pairs())
  # Get the list of all calling functions (since the very beginning)
  callingFun = sys.calls()
  # Extract just the function name from each call (which includes also parameter names)
  # Note the use of the [[]] operator instead of the [] operator because [1] returns the function name followed by parenthesis '()' and [[1]] returns just the function name.
  callingFun = lapply(callingFun, "[[", 1)
  # Compute the flag variable that indicates whether this function was called by pairs()
  pairsFlag = length(unlist(sapply(callingFun, grep, "$pairs"))) > 0

  ### Parse parameters regarding pairs() function call, add option, etc.
  if (print)	{	cat("Function call...\n"); print(match.call()) }
  if (pairsFlag) {
    # Set add=TRUE to avoid this function from creating a new plot since the pairs() function does NOT allow
    # the creation of a new plot. Plots can only be ADDED to each panel.
    # Note that add=TRUE used in plotting functions is *logically* equivalent to new=TRUE in the par() function.
    # In fact, add=TRUE means "add a plot to the current graph *respecting the current user coordinates*"
    # while par(new=TRUE) means "create a new plot on the current graph but update the user coordinates".
    add = TRUE
  if (add) {
    # Do not show the axis ticks when the plot is added to an existing graph
    xaxt = "n"
    yaxt = "n"
  } else {
    if (!exists("xaxt", envir=sys.nframe(), mode="name")) { xaxt = par("xaxt") }
    if (!exists("yaxt", envir=sys.nframe(), mode="name")) { yaxt = par("yaxt") }
    # Axis labels
    if (missing(xlab)) {
   		xlab = deparse(substitute(x))
    # Add the (log) scale info to the x-label
		if (!is.null(xtransform) && tolower(xtransform) == "log") {
			xlab = paste(xlab, "(log)")
    if (missing(ylab)) { ylab = deparse(substitute(y)) }

  ### Parse graphical options
  # Read the current mfg property (used when this function is called by pairs() to define where to place the secondary axis for the target values (when passed)
  mfg = par("mfg")  # The structure of mfg is: (current panel (row, col), whole panel size (rows, cols))
  # Set the plot clipping to the area indicated by the 'clip' parameter, which can be either "figure" or "region".
  # By default the clipping area is set to "figure" so that when bubbles fall outside of the axis ranges (e.g. when they are too big),
  # the whole bubble is seen in its entire (instead of being hidden by the axis)
  xpd = par(xpd=(tolower(clip)=="figure"))
  #----------------------------------- Parse input parameters ---------------------------------#

  #------------------------------------ Data preparation --------------------------------------#
  ### NOTE: (2013/09/16) This section of reading the data could be radically improved by:
  ### - allowing missing values for the y, target and pred variables
  ### - creating first the data frame (as I do below) and then removing missing values using the na.omit() function for example or other useful function to deal with missing values in matrices.

  ### Collapse x, y, pred and target into numeric matrices.
  # The numeric conversion is motivated because we need to compute averages on them and only numeric values are accepted for that
  # (note that factor variables do not accept averages to be computed on them and as.numeric() removes the factor property)
  # (note also the need to use as.character() inside the as.numeric() function and this is to preserve the original values of a
  # potential factor variable because for example as.numeric(factor(c(0,1))) returns '1 2' and NOT '0 1'!!! Note that it seems that no precision is lost on numeric variables when their values are converted to character first with as.character())
  # The matrix conversion is to avoid having data frames to deal with because the handling done below may lead to errors on data frames.
  # Store the original names of x and y for later use when returning the object with the plotted data.
	if (!is.null(xtransform) && tolower(xtransform) == "log") {
	  x = as.matrix( safeLog(as.numeric(as.character(x))) );
	} else {
		# No transformation when the xtransform is NULL or takes an invalid value
		if (!is.null(xtransform) && xtransform != "" && tolower(xtransform) != "orig") cat("WARNING: The given transformation for the x axis in parameter xtransform is invalid (", xtransform, ")\nOnly 'log' is accepted.\n", sep="")
		x = as.matrix(as.numeric(as.character(x)));
	xname = colnames(x);
  y = as.matrix(as.numeric(as.character(y))); yname = colnames(y);
  if (!is.null(pred))	  { pred = as.matrix(as.numeric(as.character(pred))) }
  if (!is.null(target))	{ target = as.matrix(as.numeric(as.character(target))) }

  # Consider only valid observations, i.e. the observations not having missing values in any of x, y and pred
  indok = complete.cases(cbind(x, y, pred, target))
  	## Note that the above doesn't cause any trouble when either pred or target are NULL because
  	## cbind(x, y, NULL, NULL) is ok and is the same as cbind(x, y)
  x = x[indok]
  y = y[indok]
	if (!is.null(pred)) pred = pred[indok]
	if (!is.null(target)) target = target[indok]

	# Sort x for plotting reasons
	xorder = order(x)
	x = x[xorder]
	y = y[xorder]
	if (!is.null(pred))   { pred = pred[xorder] }
	if (!is.null(target)) { target = target[xorder] }
	if (!grouped)
	  if (is.null(breaks)) {
	    breaks = groups
	  } else  {
	    # Add -Inf and Inf to the breaks vector so that all values in the input variable x are mapped to a category
	    breaks = unique(c(-Inf, breaks, Inf))
		# Group the values of x when it is not already grouped.
		if (rank)
		  # Group the values of x based on its rank (this is equivalent to using the quantile() function to define equally-sized groups!)
		  # Note that here the breaks apply to the rank values of x, NOT to the original x values! (the rank values look like 1, 2, ..., n)
		  x_cat = as.numeric(cut(rank(x), breaks=breaks, right=FALSE))
		} else {
		  # Group the values of x in equally size distances (but not in equal size groups!)
		  x_cat = as.numeric(cut(x, breaks=breaks, right=FALSE))
	} else {
		x_cat = x
	  # Define the value of x_n to be used in the plots and lm() and loess() fits below
	  if (!is.null(size)) x_n = size
  dat = data.frame(x=x, y=y)
	if (!is.null(pred))   { dat$pred = pred }
  if (!is.null(target)) { dat$target = target }

  # Aggregated values using 'center' as the function to compute such value (e.g. center=mean)
  # Min and Max values are also computed for x and y, as well as the scale, using 'scale' as the function to compute the scale (e.g. scale=sd)
  # Note that the column names of each aggregate() function coincide with the the original variable names... (e.g. x, y, target, pred)
  # Note also that I use the trick of always using "" as second value of parameter 'suffixes' except at the very last merge() which allows
  # me to have all variable names in 'toplot' as I finally need them!
	dat_center = aggregate(dat, by=list(x_cat), FUN=center, na.rm=TRUE)
	dat_min = aggregate(dat, by=list(x_cat), FUN=min, na.rm=TRUE)
	dat_max = aggregate(dat, by=list(x_cat), FUN=max, na.rm=TRUE)
	dat_scale = aggregate(dat, by=list(x_cat), FUN=scale, na.rm=TRUE)
	dat_n = aggregate(dat, by=list(x_cat), length)
	toplot = merge(dat_center, dat_min, by="Group.1", suffixes=c("_center",""))
	toplot = merge(toplot, dat_max, by="Group.1", suffixes=c("_min",""))
	toplot = merge(toplot, dat_scale, by="Group.1", suffixes=c("_max",""))
	toplot = merge(toplot, dat_n, by="Group.1", suffixes=c("_scale","_n"))
	# 2014/05/11: This is no longer needed as I automatically name correctly all the columns in toplot by using the trick shown above
	# of always using "" as the second value of 'suffixes' except at the very last step!
#	toplot = rename.vars(toplot, from=c("Group.1", "x", "y"), to=c("x_cat", "x_n", "y_n"), info=FALSE)
#	if (!is.null(pred)) { toplot = rename.vars(toplot, from=c("pred"), to=c("pred_n"), info=FALSE) }
#	if (!is.null(target)) { toplot = rename.vars(toplot, from=c("target"), to=c("target_n"), info=FALSE) }
	# 2013/12/31: New way but this does NOT work when pred and target are NOT empty because pred_center and target_center are not found.
	# The aggregate with FUN=center should be the first to be computed!
#	dat_min = aggregate(dat, by=list(x_cat), FUN=min, na.rm=TRUE); dat_min = rename.vars(dat_min, from=c("x", "y"), to=c("x_min", "y_min"), info=FALSE);
#	dat_max = aggregate(dat, by=list(x_cat), FUN=max, na.rm=TRUE); dat_max = rename.vars(dat_max, from=c("x", "y"), to=c("x_max", "y_max"), info=FALSE);
#	dat_center = aggregate(dat, by=list(x_cat), FUN=center, na.rm=TRUE); dat_center = rename.vars(dat_center, from=c("x", "y"), to=c("x_center", "y_center"), info=FALSE);
#	dat_scale = aggregate(dat, by=list(x_cat), FUN=scale, na.rm=TRUE); dat_scale = rename.vars(dat_scale, from=c("x", "y"), to=c("x_scale", "y_scale"), info=FALSE);
#	dat_n = aggregate(dat, by=list(x_cat), length); dat_n = rename.vars(dat_n, from=c("x", "y"), to=c("x_n", "y_n"), info=FALSE);
#	toplot = merge(dat_min, dat_max, by="Group.1")
#	toplot = merge(toplot, dat_center, by="Group.1")
#	toplot = merge(toplot, dat_scale, by="Group.1")
#	toplot = merge(toplot, dat_n, by="Group.1")
#	toplot = rename.vars(toplot, from="Group.1", to="x_cat", info=FALSE)
#	toplot = toplot[,c("x_cat","x_min","x_max","x_center","y_center","y_min","y_max","x_scale","y_scale","x_n","y_n")]
  #------------------------------------ Data preparation --------------------------------------#

  #----------------------------------------- Fits --------------------------------------------#
	# Linear fit (weighted by the number of cases in each group)
	if (lm)     { lmfit = try( lm(y_center ~ x_center, weights=x_n, data=toplot) ) }
	# Loess fit (local regression, weighted by the number of cases in each group)
	if (loess)  { loessfit = try( loess(y_center ~ x_center, weights=x_n, data=toplot) ) }
  #----------------------------------------- Fits --------------------------------------------#

  #----------------------------------------- Plots -------------------------------------------#
  if (plot) {
		#--- Prepare to plot
		on.exit(detach(toplot), add=TRUE)		# The add=TRUE option adds the current command to the list of commands to execute "on exit" that has been started already.
		# Parse the *limProperty variables and if passed assign their values to the corresponding *lim variables
		# This was done to circumvect the problem when this function is called as a panel function in pairs() and trying to pass
		# the parameter e.g. ylim="new". The pairs() function also receives the ylim parameter and it does not accept a non-numeric value!
		if (!missing(xlimProperty)) { xlim = xlimProperty }
		if (!missing(ylimProperty)) { ylim = ylimProperty }
		if (!missing(ylim2Property)) { ylim2 = ylim2Property }

		# X-axis limits (use the original range?)
		if (is.null(xlim) || tolower(xlim) == "orig") {
			# Use the range of the ORIGINAL x values (before computing the summary statistic value for each x_cat)
			# either when xlim is NULL or when its value is "orig"
			xlim = range(x)
		} else if (is.character(xlim)) {
			# Initialize xlim to the limits AFTER the x categorization is performed when it has a character value other than "orig"
			xlim = range(x_center)
		## In all other cases (i.e. when xlim takes the usual c(xmin, xmax) form) use those values as xlim.

		# Y-axis limits (use the original range?)
		if (is.null(ylim) || tolower(ylim) == "orig") {
			# Use the range of the ORIGINAL y values (before computing the summary statistic value for each x_cat)
			# either when ylim is NULL or when its value is "orig"
			# and update it with the range of the pred variable (if any)
			ylim = range(c(y, pred))					# Note that this also works when pred=NULL
		} else if (is.character(ylim)) {
			# Initialize ylim to the limits AFTER the x categorization is performed when it has a character value other than "orig"
			if (!is.null(pred)) {							# Need to check first if pred is not NULL because pred_center is not part of dataset 'toplot' when pred=NULL
				ylim = range(c(y_center, pred_center))
			} else {
				ylim = range(y_center)
		## In all other cases (i.e. when ylim takes the usual c(xmin, xmax) form) use those values as ylim.

		# Secondary Y-axis limits (use the original range of target?)
		if (!is.null(target)) {
			if (is.null(ylim2) || tolower(ylim2) == "orig") {
				# Use the range of the ORIGINAL target values (before computing the summary statistic value for each x_cat)
				# either when ylim2 is NULL or when its value is "orig"
				ylim2 = range(target)
			} else if (is.character(ylim2)) {
				# Initialize ylim2 to the limits AFTER the x categorization is performed when it has a character value other than "orig"
				ylim2 = range(target_center)
		## In all other cases (i.e. when ylim2 takes the usual c(xmin, xmax) form) use those values as ylim2.

		# Update ylim if bands for the y values are requested for each x_cat
		if (bands)
			# If requested, compute bands based on +- stdev or +- stdev/sqrt(n) around the summarized y values. Use either one depending on the value of stdCenter
			if (stdCenter) {
				toplot$y_upper = y_center + width * y_scale / sqrt(y_n)
				toplot$y_lower = y_center - width * y_scale / sqrt(y_n)
			} else {
				toplot$y_upper = y_center + width * y_scale
				toplot$y_lower = y_center - width * y_scale

			# Update ylim so that the upper and lower bands fit in the graph.
			ylim = range(c(ylim, y_center, toplot$y_lower, toplot$y_upper), na.rm=TRUE)
			# Extend the y axis range so that the labels fit in the plot (specially this extension is necessary for the
			# lower limit if pos=1 is used in the text() function below. If pos=3 is used, the upper limit should be extended)
			# Note that in package grDevices there is also the function extendrange() but I am not using it here because
			# I don't know if grDevices is included with the base installation of R.
			ylim[1] = ylim[1] - 0.1*(ylim[2] - ylim[1])
			ylim[2] = ylim[2] + 0.1*(ylim[2] - ylim[1])

		# Update ylim if boxplots of the y values are requested for each x_cat
		if (boxplots)
			box = boxplot(y ~ x_cat, plot=FALSE)
			# Update ylim so that the boxplots fit in the graph.
			ylim = range(c(ylim, box$stats, box$out))

		# Compute the parameter inches so that the symbol's minimum size is the one given in size.min
		nmin = min(x_n)
		nmax = max(x_n)
		inches = min(size.min * nmax / nmin, inches)
		#--- Prepare to plot

    # Plot symbols
    if (missing(xlim)) {
      # Explicitly set the xlim values because the symbols() and the plot() function behave differently
      # when setting the user coordinates (par$usr) for the plot.
      # Since I use the plot() function to plot the potentially existing target values on top of the
      # y vs. x values, I need to explicitly set the xlim option both in symbols() and in plot()
      xlim = range(x_center)
    if (is.null(circles) & is.null(thermometers)) {
      # Plot circles by default whose size is defined by the number of cases in each categorized x point
      symbols(x_center, y_center, circles=x_n, inches=inches, fg="black", bg=col, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt=xaxt, yaxt=yaxt, ...)
    } else {
        if (!is.null(circles)) {
          # Plot circles whose size is defined by the variable passed by the user
          symbols(x_center, y_center, circles=eval(circles), inches=inches, fg="black", bg=col, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt=xaxt, yaxt=yaxt, ...)
        if (!is.null(thermometers)) {
          # Plot thermometers
          # If a target variable was passed, the thermometers color is defined by the target value
          if (!is.null(target)) {
            # Coefficient of variation of the target values (weighted by the bin the x_cat category sizes!)
            # This is done to scale the colors so that when the variation is not so much (<< 100%) the colors are not so much spread out about the average yellow color
            target_mean =  weighted.mean(target_center, x_n)
            cv = sqrt(cov.wt(as.matrix(target_center), wt=x_n)$cov) / target_mean
            r = 0.5 + (target_center - target_mean) / diff(range(target_center)) * cv * 2
            g = 0.5 + (target_mean - target_center) / diff(range(target_center)) * cv * 2
            # Bound the values of r between 0 and 1
            fg = rgb(r,g,0,1)
          } else {
            fg = "black"
          symbols(x_center, y_center, thermometers=eval(thermometers), inches=inches, col="black", fg=fg, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt=xaxt, yaxt=yaxt, ...)
          # Add a text showing the target values
          xpd = par(xpd=TRUE)
          text(x_center, y_center, pos=4, offset=offset, label=as.character(signif(target_center,2)), cex=cex.label*log10(1 + x_n/10), col=rgb(r,g,0,1))
            ## use pos=4 because in the other text() function below I use pos=1 to add the values of the category sizes
          # Add information at the top regarding the overall average of target
          mtext(paste("Target average:", signif(target_mean,2)), side=3, cex=0.8*cex.label)
    # Decide whether to add axis tick marks when this function is called from pairs()
    # Note that when this is not the case and the user passed parameter add=TRUE,
    # no axis ticks are added here because it is the user's responsibility to add them wherever they want.
    # (in fact since we don't know where the primary axes are located we cannot decide where to add the axes for the plot currently being added)
    if (pairsFlag) {
      # If requested, manually place the x-axis ticks
      if (addXAxis) {
        axisProperty = pairsXAxisPosition(xaxt=xaxt)
        axis(at=pretty(c(xlim,x_center)), side=axisProperty$side, outer=axisProperty$outer, line=axisProperty$line)
      # If requested, manually place the y-axis ticks
      if (addYAxis) {
        axisProperty = pairsYAxisPosition(yaxt=yaxt)
        axis(at=pretty(c(ylim,y_center)), side=axisProperty$side, outer=axisProperty$outer, line=axisProperty$line)
    # Add x limits for each bin if requested
    if (limits) {
    	abline(v=x_min, col="gray", lty=2)
    if (lm & !inherits(lm, "try-error")) { lines(x_center, lmfit$fitted, col=col.lm,  lwd=2, lty=2) }
    if (loess & !inherits(loessfit, "try-error")) { lines(x_center, loessfit$fitted, col=col.loess, lwd=2) }
    # If non-null, plot the fitted values (given in pred).
    if (!is.null(pred))
    	lines(x_center, pred_center, col=col.pred, lwd=2)
    # If requested plot boxplots of y for each x_cat
    # (note that boxplots should come BEFORE any 'bands' because boxplots hide any previously plotted bands (because they normally are larger than the bands!)
    if (boxplots)
      boxplot(y ~ x_cat, add=TRUE, at=x_center, boxwex=0.01, xaxt="n", top=TRUE)
      ## xaxt="n" to avoid showing the x values that may cause confusion in the plot.
    # If requested, plot the +- stdev or +- stdev/sqrt(n) bands around the summarized y values.
    if (bands)
    	points(x_center, toplot$y_upper, col=col, pch="_", lwd=4, lty=2)
    	points(x_center, toplot$y_lower, col=col, pch="_", lwd=4, lty=2)
    	segments(x_center, toplot$y_lower, x_center, toplot$y_upper, col=col, lwd=2)
    # Reference line and labels
    if (is.null(thermometers) & pointlabels) {
      # Only add labels when thermometers are NOT used as symbols (because they already show the target value as labels)
			xpd = par(xpd=TRUE)		# Clip the text to the figure instead of to the plotting region so that all point labels are seen.
      text(x_center, y_center, x_n, pos=1, offset=offset, cex=cex.label*log10(1 + x_n/10), col="black")
    # If a target variable was passed and the requested symbols plot is NOT thermometers,
    # plot the summarized target values by x_cat on a secondary axis.
    # If thermometers symbols are used above, the target variable has already been used to define the fill level of the thermometers
    # This plot must come AFTER plotting everything else related to x and y because a new plot is made --over the existing graph-- with potentially *new* user coordinates (i.e. new y axis limits)
    if (!is.null(target) & is.null(thermometers))
      # Keep the same user coordinates as the current plot on the x axis so that the x positions of the target points coincide with the existing points
      # (this may not be the case even if the x variable is the same as the previous plot!!)
      # Note that setting the same user coordinates does NOT have the same effect as setting the same xlim range!!
      plot(x_center, target_center, type=type2, pch=pch2, col=col.target, bg=col.target, lwd=2, xlab="", ylab="", xlim=xlim, ylim=ylim2, xaxt="n", yaxt="n")
      abline(h=weighted.mean(target_center), col=col.target, lty=2)
      # Add the secondary axis ticks only when the panel is at one of the edge columns
      # This is useful when this function is called by pairs() as one of the pair functions (e.g. through upper.panel option) and it mimics what pairs() does.
      # Note that this condition does NOT affect the case when the plot is done on one single panel (i.e. when it is not called by pairs() as one of the pair functions)
      if (mfg[2] == 1 | mfg[2] == mfg[4] | addY2Axis) {
        if (pairsFlag & !addY2Axis) {
          # When this function is called by pairs() and the user did NOT request to explicitly add the secondary y-axis explicitly on every panel subplot,
          # decide where to place the secondary axis ticks so that they do not overlap with the axis used by pairs().
          # pairs() places the vertical axis on the right when the panel row is odd and on the left when the panel row is even.
          outer = TRUE
          line = 0  # This number should be -gap, where gap is the space between the subplots in margin lines. However, I don't know yet how to retrieve the value of gap in the current pairs() call!
          if (mfg[1] %% 2 == 1) {
            side = 2
          } else {
            side = 4
        } else {
          outer = FALSE
          line = 0
          side = 4
        axis(side=side, at=pretty(c(ylim2,target_center)), col=col.target, col.ticks=col.target, outer=outer, line=line)
        ## Note the combined use of outer=TRUE and line=-1 which makes the secondary axis tickmarks to be shown on the sides of the big pairs() plot, as opposed to on the sides of the current panel
        ## It seems it is not possible to also color the tick labels because there is no option col.lab in axis()...
  #----------------------------------------- Plots -------------------------------------------#
#	detach(toplot)
#	# Restore plotting parameters changed above
#	par(xpd=xpd)
	if (print) { cat("Data plotted...\n"); print(toplot) }
  output = list(data=toplot)
  if (lm)    { output$lm.fit = lmfit }
  if (loess) { output$loess.fit = loessfit }

# pairs.custom: Pairs plot of a set of variables where customized information is shown on the lower, diagonal and upper panels
# CHECK ALSO the car package (companion to applied regression) for interesting functions to enhance plots commonly used for regression)
pairs.custom = function(x, lower.panel=panel.image, diag.panel=panel.dist, upper.panel=points, max.panels=6, pause=TRUE, target=NULL, ...)
# Created:      2013/07/08
# Modified:     2013/07/08
# Description:  A manual pairs plot is produced. The plot is manual in that no call to the pairs() function is done but rather
#               the tiling of plots is generated using the par(mfrow) option.
#               (In fact the pairs() function does NOT allow a plotting function as panel function to make a NEW plot.
#               This implies that the axis limits cannot be redefined for e.g. a binned plot where the minimum and maximum values
#               of the plotted variables change after binning! => I cannot call plot.binned() as a panel function of pairs() and redefine the axis limits at the same time)
# Details:      The customized pairs plot show:
#               - Diagonal: density estimation with overimposed boxplot
#               - Upper diagonal: binned scatter plots produced by plot.binned() function defined above.
#               - Lower diagonal: correlation values produced by panel.cor() function defined above or the 2D kernel density estimation shown as image intensities
# Parameters:  
#               x:        	Matrix or data frame whose columns are used to produce the pairs plot
#								max.panels: Max number of panels to show per pairs plot. Several pairs plot are constructed when the number of variables in x is larger than max.panels.
#								target:			Name of the variable in x containing a target variable of interest. In this case the target variable is shown at the first row of the panel on EVERY pairs plot generated (when max.panels < number of columns in x)
# HISTORY:  (2013/09/03) Took care of missing values in the data (i.e. removed cases that have missing values in ANY of the analyzed variables x, y and pred.
#           (2013/09/15) Placed the two functions used by default for the lower and diagonal panels outside of this function so that they can be called by any pairs() call.
#						(2014/03/11) Added two new parameters: 'max.panels' and 'pause'.
#												 max.panels specifies the maximum number of panels to show per pairs plot.
#												 pause indicates whether to pause between plots and wait for a key pressed to continue.
#												 NOTE: If there is a target variable, it should be placed FIRST in the x matrix, so that it is plotted in ALL pairs plot at the first row.
#												 Also, for some reason the target variable CANNOT be passed as e.g. dat[,"target"] but it should be passed as dat$target.
#												 In the former case, I got the error "only 0's can be mixed with negative subscripts"... Not yet clear why I get this error.
#						(2014/03/30) Added parameter 'target' which contains the name of a target variable of interest.
#												 This implied the redesign of a large part of the function.
  # Store the list of parameters explicitly defined in the signature of this function
  # (note that this information is not used but was left here as a reference of how to parse the parameters explicitly defined in the function signature)
  funParams = formals(sys.function(sys.parent()))

  # Read the function call
  call = match.call()
#  params = names(call)  # The parameters are listed as follows: first the explictily defined parameter, then the parameters passed in '...'
#  # Remove the function name and the explicitly defined parameters from the parameter list
#  paramsList[[1]] = NULL
#  paramsListToRemove = c(1, which(params %in% funParams))
#   ## Index 1 is the function name (i.e. pairs.custom), the following indices are consecutive indices
#   ## since 'params' contains the parameter list in the order explained above (first explicit parameters, then parameters passed in '...')
#  # Remove the function name and the other parameters explicitly listed in this function from the parameter list
#  for (i in 1:length(paramsListToRemove)) {
#    p = paramsListToRemove[i]
#    paramsList[[p]] = NULL
#    # Next parameter to remove is located at paramsListToRemove[p+1]-1 since after removing element 'p' there is one less element in the list!!
#    paramsListToRemove = paramsListToRemove - 1
#  }
  #-------------------------- Parse Input parameters ---------------------------
  # Put all calling parameters into a parameter list for the pairs() function call with do.call()
  paramsList = as.list(call)
  # Remove the function name from the parameter list
  paramsList[[1]] = NULL

  ### Set parameters not passed by the user to the custom value I want to use here.
  # Note that I need to check if they were not passed by the user before assigning the custom value
  # because I don't want to override the user's specification!
  # *** Note also that NOT all parameters defined by the function are included in the parameter list 				 	***
  # *** returned by match.call() because match.call() only includes parameters explicitly passed by the user! ***
  if (is.null(paramsList$lower.panel)) paramsList$lower.panel = panel.image
  if (is.null(paramsList$diag.panel))  paramsList$diag.panel  = panel.dist
  ## Note that I do not assign a value to paramsList$upper.panel because the default custom value assigned to it is the pairs() default, i.e. "points".
  if (is.null(paramsList$gap)) paramsList$gap = 0       # pairs() parameter specifying the distance between panel plots in margin lines
  if (is.null(paramsList$new)) paramsList$new = FALSE
  if (is.null(paramsList$oma)) paramsList$oma = c(2,3,2,3)

	### Settings for the pairs plot
	# Limit the number of panels on each plot to the value specified in 'max.panels'
  imax = ncol(x)
  nplots = ceiling(imax/max.panels)
  # Adjust 'cex' and 'inches' when plot.binned() is used as panel function in the lower or upper diagonal,
  # and when they are not passed by the user, so that they become smaller as the number of panels per pairs plot grows.
  # The adjustment based on max.panels was obtained by trial and error.
  if (deparse(substitute(lower.panel)) == "plot.binned" | deparse(substitute(upper.panel)) == "plot.binned") {
		## Note that the deparse(substitute()) function needs to be applied to the function parameters 'lower.panel' and 'upper.panel'
		## and NOT to the elements of parmasList (paramsList$lower.panel and paramsList$upper.panel) because in the latter case
		## the result is a complicated structure representing something like the function call and having the function name
		## (e.g. "plot.binned") appearing at the end (as e.g. $plot.binned)
		## The output of this can be seen by printing the output as a list as follows:
		## print(as.list(deparse(substitute(paramsList$upper.panel))))
		if (is.null(paramsList$cex)) paramsList$cex = 1/(max.panels^(1/3))
	 	if (is.null(paramsList$inches)) paramsList$inches = 0.5/(max.panels^(1/3))
	 	if (is.null(paramsList$print)) paramsList$print = FALSE	# Do not show the plotted data of each pairs plot!
		if (is.null(paramsList$addXAxis)) paramsList$addXAxis = FALSE
		if (is.null(paramsList$addYAxis)) paramsList$addYAxis = FALSE
		if (is.null(paramsList$addY2Axis)) paramsList$addY2Axis = FALSE

	### Target variable
	# If a target parameter is passed, place it as first column of x so that it is plotted at the first row of the pairs plot 
	# in EVERY generated pairs plot.
	if (!is.null(target)) {
		# Check the existence of the variable indicated in 'target' in matrix or data frame 'x'.
		# (recall that 'target' contains the NAME of the target variable)
		if (!is.null(checkVariables(x, target))) {
			stop("The variables indicated above were not found in dataset '", deparse(substitute(x)), "'")
		# Place the target variable at the first column of matrix or data frame x so that it is more easily handled below when generating the pairs plots
		x = cbind(x[,target,drop=FALSE], x[,-match(target, colnames(x)),drop=FALSE])
		# Update the 'x' component of paramsList so that the correct order of variables is passed to pairs()
		paramsList$x = x

		# Generate the target variable as a matrix (with as.matrix()) and preserve the original variable name (with drop=FALSE)
		# Note that if x is a data frame, there will be an error in plot.binned() (if called) because in that function the target variable
		# is transformed with as.matrix(as.numeric(as.character(target))) which gives an error when 'target' is a data frame.
		target = as.matrix(x[,1,drop=FALSE])
		# Update the 'target' component of paramsList so that it is now an array (or matrix) of values (as opposed to a variable name)
		# since this is how it is received by panel functions (such as plot.binned())
		paramsList$target = target

		# Settings for the generation of the pairs plot below when number of pairs plot to generate is more than 1
		if (nplots > 1) {
			# Reduce the value of max.panels because this will be used as the number of INDEPENDENT variables being plotted in each pairs plot
			max.panels = max.panels - 1
			# First INDEPENDENT variable to plot in every pairs plot
			imin = 2
			# The number of panels per pairs plot is max.panels + 1
			cat("For better view, the pairs plot is divided into", nplots, "plots of", max.panels+1, "panels each.\n")
	} else {
		# Remove 'target' from paramsList so that it is not passed to the calling functions which may not receive 'target' as a parameter
		paramsList$target = NULL
		# Settings for the generation of the pairs plot below when number of pairs plot to generate is more than 1
		if (nplots > 1) {
			# First INDEPENDENT variable to plot in every pairs plot
			imin = 1
			# The number of panels per pairs plot is max.panels
			cat("For better view, the pairs plot is divided into", nplots, "plots of", max.panels, "panels each.\n")
  #-------------------------- Parse Input parameters ---------------------------

	if (nplots > 1) {
		plotnum = 0
		for (i in seq(imin, imax, max.panels)) {
			plotnum = plotnum + 1
			cat("Plot", plotnum, "of", nplots, "...\n")
			if (!is.null(target)) {
				paramsList$x = x[,c(1,i:min(i+max.panels-1, imax))]
			} else {
				paramsList$x = x[,i:min(i+max.panels-1, imax)]
		  # Call the pairs() function
		  do.call("pairs", paramsList)
		  if (pause & plotnum < nplots) {
		  	cat("Press ENTER for next plot:")
	} else {
		# Call the pairs() function
		do.call("pairs", paramsList)	

# hist.log: Histogram in log scale
hist.log = function(x, constant=1, format="E", cex=1, las=1, ...)
# Created: 		    2008/09/19
# Modified: 	    2008/09/19
# Description:	  Histogram of a variable using log scale, with the original scale used as labels.
# Parameters:	
#				x			    Vector of values to use in the histogram.
# 			constant	Constant to shift the log transformation in sign(x)*log10(constant + x).
# 			format		Format to use for the numbers in the axis labels.
# 			cex		  	Character EXpantion option for the labels.
# 			las			  las option for the label definining the text orientation (1 = horizontal, 2 = vertical)
#				...		  	Additional parameters to pass to the hist() function.
	xlog = safeLog(x, constant)

	xmin = floor(min(x))
	xmax = ceiling(max(x))
	hist(x, xaxt="n", ...)
	axis(1, at=xmin:xmax, labels=FALSE);
	labels = as.character(formatC(10^(xmin:xmax), format=format, digits=0))
	mtext(labels, side=1, at=xmin:xmax, line=1, cex=cex, las=las);

# plot.log: Plot in log scale (the log transformation is safeLog(x) = sign(x)*log10(constant + x))
# NOTE (2008/10/03): Could use function get() to decide what kind of plot to do depending on parameter (e.g. plot or hist)
#fname <- "pchisq";
#fname <- get(fname , mode="function");
#fname(x,3);				# This is exactly the same as doing pchisq(x,3);
plot.log = function(x, y, log="x", constant=c(1,1), format=c("E","E"), cex=c(1,1), las=c(1,1), ...)
# Created: 		  2008/09/19
# Modified:   	2008/09/19
# Description: 	Plot with log scale in one or the two axis, with the original scale used as labels.
# Parameters:	
#				        x			    Vector of values to plot in the horizontal axis.
#				        y			    Vector of values to plot in the vertical axis.
# 				      log		    1-D or 2-D vector listing the axes to be log-transformed
#							            Ex: c("x","y")
# 				      constant	1-D or 2-D vector specifying the constant to use in shifting the log transformation
#							            in safeLog(x) = sign(x)*log10(constant + x).
# 				      format		1-D or 2-D vector specifying the format to use for the numbers in the axis tick labels.
# 				      cex 			1-D or 2-D vector specifying the Character EXpantion option for the axis tick labels.
# 				      las	  		1-D or 2-D vector specifying the orientation of the axis tick labels
#							            (1 = horizontal, 2 = vertical)
#				        ...			Additional parameters to pass to the plot() function.
	# Get the range of values to be plotted for each axis
	xmin = floor(min(x))
	xmax = ceiling(max(x))
	ymin = floor(min(y))
	ymax = ceiling(max(y))

	#------------------------------- Parse input parameters ----------------------------------#
	x = x; logx = FALSE; xaxt = par("xaxt")
	y = y; logy = FALSE; yaxt = par("yaxt") 
	if ("x" %in% tolower(log))
		logx = TRUE
		xlog = safeLog(x, constant[1])
		x = xlog
		xaxt = "n"
	if ("y" %in% tolower(log))
		logy = TRUE
		ylog = safeLog(y, constant[2])
		y = ylog
		yaxt = "n"
	xcex = ycex = 1
	if (!is.null(cex))
		xcex = cex[1]
		if (length(cex) > 1)	{ ycex = cex[2] }
	xlas = ylas = 1
	if (!is.null(las))
		xlas = las[1]
		if (length(las) > 1)	{ ylas = las[2] }
	#------------------------------- Parse input parameters ----------------------------------#

	#--------------------------------------- Plot --------------------------------------------#
	plot(x, y, xaxt=xaxt, yaxt=yaxt, ...)
	if (logx)
		axis(1, at=xmin:xmax, labels=FALSE);
		labels = as.character(formatC(10^(xmin:xmax), format=format[2], digits=0))
		mtext(labels, side=1, at=xmin:xmax, line=1, cex=xcex, las=xlas);
	if (logx)
		axis(2, at=ymin:ymax, labels=FALSE);
		labels = as.character(formatC(10^(ymin:ymax), format=yformat, digits=0))
		mtext(labels, side=2, at=ymin:ymax, line=1, cex=ycex, las=ylas);
	#--------------------------------------- Plot --------------------------------------------#

# plot.axis: Plot using specified x tick marks
plot.axis = function(x, y, xticks=NULL, xlabels=NULL, las=1, grid=TRUE, lty.grid=2, lwd.grid=1, ...)
# Created: 		    2008/09/25
# Modified:   	  2008/09/25
# Description: 	  Plot using specified x tick marks. Optionally a grid is shown.
# Parameters:	
#				x			    Vector of values to plot in the horizontal axis.
#				y			    Vector of values to plot in the vertical axis.
#				xticks	  Ticks to use in the x axis
#				xlabels	  Labels to use in the x axis
#				las			  Orientation for the labels (1 = horizontal, 2 = vertical)
#				grid		  Whether to show a vertical grid
# 			lty.grid	Line type for the grid lines
#				lwd.grid	Line width for the grid lines
#				...			  Additional parameters to pass to the plot() function.
	xaxt = par("xaxt")
	if (!is.null(xticks))	{ xaxt = "n" }

	plot(x, y, xaxt=xaxt, ...)

	if (xaxt == "n")
		axis(1, at=xticks, labels=xlabels, las=las)
	if (grid)
		abline(v=xticks, lty=lty.grid, lwd=lwd.grid)

# 2017/03/09
# plot2
#' Make a dual plot, typically a bar plot on the left axis and a scatter plot on the right axis.
#' @param x x-axis values for the primary plot.
#' @param y y-axis values for the primary plot. If NULL, the 'x' values are plotted on the primary axis.
#' @param x2 x-axis values for the secondary plot. If NULL, the primary x-axis values are used.
#' @param y2 y-axis values for the secondary plot. If NULL, no secondary plot is produced.
#' @param plot1 function to use for the primary axis plot.
#' @param plot2 function to use for the secondary axis plot.
#' @param plot1_options list containing the named options for the plot on the primary axis.
#' @param plot2_options list containing the named options for the plot on the secondary axis.
#' @param ... common options to use for both plots
#' @return (invisibly) a list with the parameters for the x axes of the two plots, and the mar parameter value.
#' The list has the following entries:
#' x: The x values for the plot on the left axix
#' x2: The x values for the plot on the right axis
#' xlim: The min and max values used for the axis (for both plots)
#' xaxs: The xaxs property for the x axis (for both plots)
#' mar: the mar graphical parameter of par()
# TODO: Fix this problem: the y2 values appear on the left vertical axis when plot1=plot and no x is given, even if I use quote(y)!
plot2 = function(x, y=NULL, x2=NULL, y2=NULL, sortx=TRUE, plot1=barplot, plot2=plot, plot1_options=list(), plot2_options=list(type="o", pch=21, col="red", bg="red", col.axis="red"), ...) {
	#paramsList = getParamsList(match.call(), ...)
	plot1.name = deparse(substitute(plot1))
	plot2.name = deparse(substitute(plot2))

	#---------- Parse input parameters
	# Set default plotting options
	plot1_options_default = list()
	plot1_options = setDefaultOptions(plot1_options, plot1_options_default)
	plot2_options_default = list(type="o", pch=21, col="red", bg="red", col.axis="red")
	plot2_options = setDefaultOptions(plot2_options, plot2_options_default)
	# Secondary axis label
	y2lab = plot2_options$ylab
	plot2_options$ylab = NULL
	if (!is.null(y2lab)) {
		# Leave space for the label of the secondary axis
		op.mar = par("mar", no.readonly=TRUE); on.exit(par(mar=op.mar))
		mar = op.mar + c(0,0,0,1)
	#---------- Parse input parameters

	if (plot1.name == "barplot") {
		# Generate the barplot and set the x values to be the x-axis position of the barplots
		xaxs = "i"
		x = do.call(plot1.name, c(list(x, xaxs=xaxs), plot1_options, ...))			# Use xaxs="i" (i.e. the x-axis spans just the data plotted, with no extension as is the case with default xaxs="s") so that the x-axis of the second plot matches the x-axis of the barplot!
		# Get the x-axis limits of the above plot so that they can be used on the second plot
		xlim = getAxisLimits(xaxs=xaxs)[1:2]

		# Define the x2 values as x if NULL was given (i.e. the plot on the secondary axis shares the same x-axis values as the plot on the primary axis)
		if (is.null(x2)) {
			x2 = x
	} else {
		if (sortx) {
			# Sort the x values in case the user asked to connect the points with lines
			x = sort(x)
  	do.call(plot1.name, c(list(x, y), plot1_options, ...))
		# Get the x-axis limits of the above plot so that they can be used on the second plot
		xlim = getAxisLimits()[1:2]
		xaxs = par("xaxs")

		# Define the x and y values to plot on the secondary axis
		if (is.null(y)) {
			# This means that the primary plot is the plot of 'x' and the x-axis values are just indices (i.e. consecutive integers from 1 to length(x))
			# => the plot on the secondary axis should still have those values as x-axis
			x2 = 1:length(y2)
		if (is.null(x2)) {
			# The x-axis values are shared with the primary plot
			x2 = x

	# Generate the plot on the secondary axis
	if (!is.null(y2)) {
		do.call(plot2.name, c(list(quote(x2), quote(y2), xlim=xlim, xaxs=xaxs, xaxt="n", yaxt="n", xlab="", ylab=""), plot2_options, ...))
		axis(side=4, at=pretty(y), col=plot2_options$col, col.axis=plot2_options$col.axis)
		# Add the label for the secondary axis
		if (!is.null(y2lab)) {
			mtext(side=4, text=y2lab, line=2, col=plot2_options$col.axis)
	return(invisible(list(x=x, x2=x2, xlim=xlim, xaxs=xaxs, mar=mar)))

# plot.hist: Plot a histogram (i.e. plot boxes like the ones generated by hist())
plot.hist = function(x, y, ...)
# Created: 		  2008/10/03
# Modified: 	  2008/10/03
# Description: 	Plot a histogram.
# Parameters:	
#				x			  Breaks of histogram
#				y			  Counts or density of histogram
#				...		  Additional parameters to pass to the plot() function.
# NOTE: (2014/03/28) This same plot could be much more easily done with the rect() function (see panel.hist() in this module)
	toplot = cbind(x[-1], y)
	tp = matrix(nrow=3*nrow(toplot)+1, ncol=2)
	tp[1,] = c(x[1],0)
	xprev = tp[1,1]
	for (i in 1:nrow(toplot))
		tp[i*3-1,] = c(xprev, toplot[i,2])
		tp[i*3,] = toplot[i,]
		tp[i*3+1,] = c(toplot[i,1], 0)
		xprev = tp[i*3,1]
	# Plot the histogram
	polygon(tp, ...)

# plot.outliers2d: Plot 2D outliers based on the estimation of a multivariate gaussian distribution (Ref: Machine Learning course at Coursera)
plot.outliers2d = function(x, y=NULL, id=NULL, center=median, scale=cov, cutoff=0.01, add=FALSE, plot=TRUE, plot.kde=FALSE, plot.id=TRUE, lwd=1, col.outlier="red", cex=0.6, pos=3, srt=0, ...)
# Created: 			20-Apr-2014
# Author: 			Daniel Mastropietro
# Parameters:   x: variable x to analyze or matrix/data.frame containing both variables to analyze
#								y: variable y to analyze or empty (when x is a matrix/data.frame)
#								cutoff: when (estimated gaussian density) / max(estimated gaussian density) < cutoff => flag the point as outlier
#								srt: degrees of rotation for the text shown by text() (e.g. srt=90). This is a par() option that applies only to text information on a plots.
  ### Data prep
  # Remove missing values in data
  if (is.null(y)) {
    varnames = colnames(x)
    y = x[,2]
    x = x[,1]
  } else {
    varnames = c(deparse(substitute(x)), deparse(substitute(y)))
  indok = !is.na(x) & !is.na(y)
  # Store the data good for analyzing
  xy = cbind(x[indok],y[indok])
  # Name the rows using the id variable
  if (!is.null(id)) rownames(xy) = id[indok]
  ### Plot
  if (plot) {
		if (add) {
			points(xy, ...)
		} else {
			plot(xy, ...)
		#  do.call(plot.fun, list(xy, ...))
			## This do.call() does NOT work properly!! it generates a plot showing all the values to be plotted and it's a mess!!!!
			## (2014/05/11) OK! I found why! The reason is that the plot() function shows deparse(substitute(x)) and deparse(substitute(y)) as labels
			## for the x and y axis. In a do.call(), the parameters passed in the parameter list() are EVALUATED, meaning that the parameters x and y
			## (or the matrix xy as in this case) contain the VALUES to be plotted, NOT the NAME of the matrix/vectors containing the values to be plotted...
			## Therefore, the result of deparse(substitute(x)) will be the plotted values and NOT the variable names.
			## This should be solved by replacing xy with substitute(xy). See 'R Help.txt' under entry "PARSE FUNCTION PARAMETERS".
  # Compute the centroid and the covariance matrix
  centroid = apply(xy, 2, FUN=center)
  SIGMA = do.call(scale, list(xy))
  # Assume multivariate normality to compute density function to compare with cutoff value
  # I could Use the mahalanobis() function which already returns the square of the mahalanobis distance
  # but here I do it myself to show how to avoid computing non-relevant innter products in the computation of such distance!
  # (taken from mahalanobis() code)
  xyc = sweep(xy, 2, centroid)  # The sweep operator takes care of doing the right operation along the different rows of xy (this is equivalent to the more cumbersome t(t(xy) - centroid)
  maha2 = rowSums((xyc %*% solve(SIGMA)) * xyc) # VERY SMART CALCULATION!
  gaussian.density = exp(-0.5*maha2) / (2*pi*sqrt(det(SIGMA)))
  names(gaussian.density) = rownames(xy)
  # Plot the estimated density surface
  #   redblue.palette = colorRampPalette(c("red", "blue"))
  #   ncol = 50
  #   colors = redblue.palette(ncol)
  #   denscol = cut(dens2, quantile(dens2, probs=seq(0,1,1/ncol)))
  #   persp(gx, gy, dens2, col=colors[denscol], ticktype="detailed", theta=40, phi=20)
  ### 2D density estimation
  if (plot & plot.kde) {
    xy.kde = kde2d(xy[,1], xy[,2])
    contour(xy.kde$x, xy.kde$y, xy.kde$z, col="green", nlevels=5, add=TRUE, lwd=lwd)
  ### Compute the normal density in order to generate a contour plot (ellipses)
  # Generate the grid
  xrange = range(x)
  yrange = range(y)
  gx = seq.int(xrange[1], xrange[2], length.out=25)
  gy = seq.int(yrange[1], yrange[2], length.out=25)
  # Create all possible combinations of gx and gy where the normal density should be computed
  xgrid = rep(gx, length(gy))
  ygrid = sort(rep(gy, length(gx)))
  xygrid = cbind(xgrid, ygrid)
  dens2 = exp(-0.5*mahalanobis(xygrid, centroid, SIGMA)) / sqrt(2*pi*det(SIGMA))
  # Convert dens2 to a matrix of size (length(xgrid) * length(ygrid))
  dens2 = matrix(dens2, nrow=length(gx), ncol=length(gy))
  if (plot) {
		contour(gx, gy, dens2, nlevels=5, add=TRUE, lwd=lwd, col="blue")
		points(centroid[1], centroid[2], pch="x", lwd=lwd, cex=2, col="blue")
  # Using the ellipse() function...
  gaussian.cutoff = cutoff*max(gaussian.density)
  #  print(quantile(gaussian.density, probs=seq(0,1,1/100)))
  cat("Max of estimated gaussian density:", formatC(max(gaussian.density), format="e", digits=4), "\n")
  cat("Cut off value ( based on cutoff level", cutoff, "):", formatC(gaussian.cutoff, format="e", digits=4), "\n")
  cat("Min of estimated gaussian density:", formatC(min(gaussian.density), format="e", digits=4), "\n")
  maha2.cutoff = -2 * log(2*pi*sqrt(det(SIGMA)) * gaussian.cutoff)
  if (plot) try( ellipsem(centroid, solve(SIGMA), maha2.cutoff , col=col.outlier, lwd=lwd) )
  # Show the cutoff ellipse
  # NOTE: the above call to ellipse() is more reliable in terms of showing the actual cutoff
  #  contour(gx, gy, dens2, levels=cutoff*max(gaussian.value), add=TRUE, lwd=lwd, col="red")

  outliers = which(gaussian.density < gaussian.cutoff)
  if (length(outliers) > 0) {
    cat("Number of outliers detected for variables", varnames, ":", length(outliers), "(", formatC(length(outliers) / nrow(xy) * 100, format="g", digits=1), "%)\n")
    if (plot) {
			points(xy[outliers,], pch=21, bg=col.outlier, col=col.outlier)
			if (plot.id) {
				op = par(xpd=TRUE)
				text(xy[outliers,], labels=rownames(xy)[outliers], offset=0.5, pos=pos, srt=srt, cex=cex)
  } else {
    cat("Number of outliers detected for variables", varnames, ":", length(outliers), "(", formatC(length(outliers) / nrow(xy) * 100, format="g", digits=1), "%)\n")
    cat("No outliers detected.\n")

# panel.outliers2d: Same function as plot.outliers2d but as a panel function (i.e. always uses add=TRUE)
panel.outliers2d = function(x, y, id=NULL, center=median, scale=cov, cutoff=0.01, plot.kde=FALSE, plot.id=TRUE, lwd=1, cex=0.6, pos=3, srt=0, ...)
	plot.outliers2d(x, y, id=id, center=center, scale=scale, cutoff=cutoff, plot.kde=plot.kde, plot.id=plot.id, lwd=lwd, cex=cex, pos=pos, srt=srt, add=TRUE, ...)

# biplot.custom: Customized biplot where 2D density estimation of points is shown instead of the actual points (suitable for large number of points)
# - 2013/11/15: Extend this function so that biplots of a selected set of principal components (more than 2!) are produced in a pairs
# plot fashion. Currently this is not possible because I am using function filled.contour() which always generates a new plot
# (thus making it unsuitable for a pairs plot...). However, we could try using the contour() function which can be used to ADD a
# graph to an existing plot (using add=TRUE). The contour() function does not produce colors to indicate the height of the contour,
# however I think the colors can be produced by calling the image() function first, or perhaps better my own plot.image() or panel.image() function.
# For examples of using contour() in this way, see the examples of the hist2d() function in the gplots package.
# - 2016/01/17: Generate the plots using ggplot2 package for better quality. See this post for an example of generating biplots with ggplot2:
# http://stackoverflow.com/questions/6578355/plotting-pca-biplot-with-ggplot2
# Here we can also see the difference between the regular biplot() function... the ggplot output is much better!!
biplot.custom = function(x, pc=1:2, arrowsFlag=TRUE, pointsFlag=FALSE, outliersFlag=TRUE, pointlabels=NULL, thr=4, outliersMaxPct=0.5, ...)
# Created:      2013/08/28
# Modified:     2013/08/28
# Description:  Custom biplot suitable for large number of points
# Details:      The customized biplot shows:
#               - 2D estimated desnsity instead of actual points
#               - Axis limits are computed properly
#               - [NOT IMPLEMENTED YET!] Optionally, a panel plot of the first k principal components in pairs (e.g. top PCs explaining 80% of the variation)
# Parameters:  
#               x:  An object of class princomp
# Output:       When outliersFlag=TRUE, a data frame containing the points identified as outliers on the analyzed principal components
#								is returned.
# Assumptions:  There are NO missing values (NA) in the data!
  plot.ArrowsAndPoints = function(x, arrowsFlag=TRUE, pointsFlag=FALSE, outliersFlag=TRUE, pointlabels=NULL, thr=4, outliersMaxPct=0.5) {
    # Function to plot the arrows representing the projection of the variable axes into the PCs and optionally the observations as points
    # x:              An object of class princomp
    # pointlabels:    Vector whose values define the symbols to show instead of points
    # thr:            Multiplier threshold to use to define an outlier based on the matrix diagonals, h, as follows:
    #                 flag a case as outlier if h > thr*p/n, where p is the number of analyzed PCs and n is the number of cases
    # outliersMaxPct: Maximum percentage of cases to flag as outliers

    # Read the scores for the PCs to analyze
    # (which represent the points to be plotted or the data whose mean vector define the centroid of the points --always shown)
    X = x$scores[,pc]

    # Restore axis ticks and labels for the density contour plot (which are removed by using this plot.arrows() function as plot.axes option of filled.contour() below)
    xaxp = par("xaxp")
    xticks = seq(xaxp[1], xaxp[2], length.out=xaxp[3]+1)
    yaxp = par("yaxp")
    yticks = seq(yaxp[1], yaxp[2], length.out=yaxp[3]+1)
    axis(1, at=xticks, labels=xticks, col="blue", col.ticks="blue")
    axis(2, at=yticks, labels=yticks, col="blue", col.ticks="blue")
    mtext(paste("Comp.", pc[1], sep=""), side=1, line=3, las=1)   # xlabel
    mtext(paste("Comp.", pc[2], sep=""), side=2, line=3, las=0)   # ylabel
    ### Variables on which the PCA was applied (they are given by the row names of the loadings matrix provided by the princomp object x)
    vars = rownames(x$loadings)

    ### Points
    if (pointsFlag) {
      axis(1); axis(2);
      if (is.null(pointlabels) | outliersFlag) {
      	# Show just points either when the pointlabels vector is NULL or the user wishes to see outliers highlighted
      	# (it which case it is only relevant to know the ID of the outliers, not of ALL the cases!)
        points(X, pch=21, col=rgb(0,0,0,0.2), bg=rgb(0,0,0,0.2), cex=0.3)
      } else {
        text(X, label=pointlabels, adj=0, offset=0, col=rgb(0,0,0,0.8), bg=rgb(0,0,0,0.8), cex=0.5)

    ### Label the outliers based on the hat matrix diagonal elements
    # Create the indOutliers variable in the environment of biplot.custom(), which is TWO (2) environment positions before the current function, since this function is called by filled.contour())
    if (outliersFlag) {
    	#unlockBinding("indOutliers", sys.frame(sys.nframe()-2))
    	assign("indOutliers", pos=sys.nframe()-2, NULL)
    if (outliersFlag) {
      # Identify the outliers on the analyzed PCs based on the hat matrix (i.e. distance from the centroid relative to disperson of the data)
      H = X %*% solve(t(X) %*% X) %*% t(X)
      hat = diag(H)
      p = length(pc)
      n = nrow(X)
      if (!is.null(outliersMaxPct)) {
        hat.quant = quantile(hat, probs=1-outliersMaxPct/100)
      } else {
        hat.quant = 0
      if (outliersFlag) {
	      #unlockBinding("hat", sys.frame(sys.nframe()-2))
				assign("indOutliers", pos=sys.nframe()-2, which((hat>thr*p/n) & (hat> hat.quant)))  # NOTE: It seems it's important to use parenthesis to enclose each > condition!
	      assign("hat", pos=sys.nframe()-2, hat)	# Diagonal of hat matrix to show leverage of outliers
      if (is.null(pointlabels)) {
        label = 1:length(indOutliers)
      } else {
        label = pointlabels[indOutliers]
      # Clip the points and its labels to the figure region
      op = par()
      points(X[indOutliers,], pch=21, col="black", bg="black", cex=0.5)
      text(X[indOutliers,], label=label, offset=0.1, pos=3, cex=0.5)
      mtext(paste(signif(length(indOutliers)/n*100,1), "% of the observations are identified as outliers"), side=3, cex=0.8)

    ### Position of centroid of analyzed PCs
    # Add a star representing the average value of the analyzed PCs
    # (note that I compute the average of the PCs coordinates, not the projection onto the PCs of the data average.
    # This is so because the 2D density estimation is based on the PC coordinates and it is NOT the project of the
    # n-dimensional density estimation on the n-dimensional data!)
    meanPC = apply(X, 2, FUN=mean)
    points(meanPC[1], meanPC[2], pch="x", col="blue", cex=1.5)

    ### Arrows representing the original axes on the analyzed PCs
    if (arrowsFlag) {
      # Define the arrow start and end positions
      npcs = ncol(x$loadings)
      arrows.start = matrix(data=0, nrow=npcs, ncol=2) # column 1 is x coordinate of the start, column 2 is y coordinate
      arrows.end = x$loadings[,pc]
      # Compute contributed percentage of each variable on the analyzed PCs w.r.t. all PCs
      contrPct = x$loadings / apply(abs(x$loadings), 1, FUN=sum) * 100
      contrPct.pc = apply(abs(contrPct[,pc]), 1, FUN=sum)
      # Jitter arrow ends for the position of variable labels to increase readability
      pos.labels = jitter(arrows.end, factor=10)
      # Define symmetric axis so that the (0,0) of the arrows coincide with the (0,0) of the density contours already plotted!
      xlim = range(c(arrows.start[,1], arrows.end[,1], pos.labels[,1])); xlim = c(-max(abs(xlim)), max(abs(xlim)))
      ylim = range(c(arrows.start[,2], arrows.end[,2], pos.labels[,2])); ylim = c(-max(abs(ylim)), max(abs(ylim)))
      # Create a new plot on the existing density contour
      plot(arrows.end, type="n", xaxt="n", yaxt="n", xlim=xlim, ylim=ylim, xlab="", ylab="")
      arrows(arrows.start[,1], arrows.start[,2], arrows.end[,1], arrows.end[,2], length=0.15, col="blue")
      # Add the labels indicating the variable names corresponding to each arrow
      usr = par("usr")
      xticks = axTicks(1, usr=usr[1:2], log=FALSE)
      yticks = axTicks(2, usr=usr[3:4], log=FALSE)
      axis(3, at=xticks, labels=xticks)
      axis(4, at=yticks, labels=yticks)
      par(xpd=TRUE) # Clip plotting to the figure region so that if labels fall outside the plot they are still shown!
      text(pos.labels, labels=paste(vars, " (", signif(contrPct.pc, digits=2), "%)", sep=""), offset=0, cex=0.7)

  #--------------------------- Parse input parameters ---------------------------
  # Color definition for the different levels of the filled contour plot
  whitered.palette = colorRampPalette(c("white", "red"))
    ## White color is used as starting color so that it shades out with the default background color of plots

  # Compute the 2D estimated density of the first PCs
  x.kde = kde2d(x$scores[,pc[1]], x$scores[,pc[2]], n=25)  # n=25 is the default number of grid points to generate for the density plotting
  # Set symmetric plot limits so that the (0,0) for the density contour (of observations) and variables match on the same position!
  xlim = range(x.kde$x); xlim = c(-max(abs(xlim)), max(abs(xlim)))
  ylim = range(x.kde$y); ylim = c(-max(abs(ylim)), max(abs(ylim)))
  # filled.contour: Note the use of the plot.axes option to add the points, so that they are correctly added
  # on the same graphical region as the contours --o.w. they fall outside the plotting region because of the color scale shown on the right, see help() for more info)
  filled.contour(x.kde$x, x.kde$y, x.kde$z, xlim=xlim, ylim=ylim,
                              color.palette=my.colors<-function(nlevels) { whitered.palette(nlevels) },  # nlevels is a parameter defined by default in filled.contour()
                              plot.axes=plot.ArrowsAndPoints(x, arrowsFlag=arrowsFlag, pointsFlag=pointsFlag, outliersFlag=outliersFlag, pointlabels=pointlabels, thr=thr, outliersMaxPct=outliersMaxPct), ...)
  if (outliersFlag) {
    # indOutliers is created in function plot.ArrowsAndPoints()
    outliers = data.frame(V1=x$scores[indOutliers,pc[1]], V2=x$scores[indOutliers,pc[2]], V3=hat[indOutliers])
    names(outliers) = c(paste("Comp.", pc[1], sep=""), paste("Comp.", pc[1], sep=""), "leverage")
    if (!is.null(pointlabels)) {
      rownames = pointlabels[indOutliers]
    } else {
      rownames = indOutliers
    rownames(outliers) = rownames
    # Sort the outliers by decreasing hat value (so that I can easily label them in the leverage plot below and for convenience for the user)
    outliers = outliers[order(outliers$leverage, decreasing=TRUE),]
    # Show the distribution of leverage values
    cat("Distribution of leverage values\n")
    ord = order(hat,decreasing=TRUE)
#    if (!is.null(pointlabels)) {
#    	ind = pointlabels
#    } else {
#    	ind = 1:length(hat)
#    }
		# Use as x values the percentage accumulated 
		ind = (1:length(hat))/length(hat)*100
    plot(ind, hat[ord], main="Leverage values", xlab="Cumulative Percent of Cases", ylab="Standardized Mahalanobis distance", type="p", pch=21, col="black", bg="black", xaxt="n")
    lines(ind, hat[ord], col="black")
    #mtext(ind[ord], side=1, at=1:length(hat), las=3, cex=0.5)
		axis(side=1, at=seq(0,100,10), labels=FALSE)
		mtext(side=1, at=seq(0,100,10), line=1, text=paste(formatC(seq(0,100,10), format="g", digits=3), "%", sep=""))
    text(ind[1:nrow(outliers)], outliers$leverage, labels=rownames(outliers), offset=0.3, pos=4, cex=0.7)

# plot.cdf: Plot CDF of a numeric variable
plot.cdf = function(x, weight=NULL, probs=seq(0,1,0.01), empirical=TRUE, include.lowest=FALSE, freq=FALSE, reverse=FALSE, add=FALSE, ...)
# Created:      12-Mar-2014
# Modified:     04-Feb-2015
# Description:  Plot CDF of a numeric variable in the quantiles specified in parameter 'probs'
# Details:			The parameter probs corresponds to the parameter of the same name in function quantile()
# Parameters:   x: 							A numeric array (missing values are accepted and a warning is shown in that case)
#								weight:					A numeric array containing weights for each value in x (used in the CDF computation).
#																CURRENTLY IT IS ONLY USED WHEN empirical=TRUE.
#								probs: 					Array of values between 0 and 1 defining the quantiles of x to compute. Only used if empirical=FALSE.
#								empirical:			Flag indicating whether to compute the empirical CDF (as opposed to a fixed-quantile based CDF, where the quantiles are specified in 'probs').
#								include.lowest: Flag indicating whether to include the lowest value of x as 0% percentile in the emprical=TRUE case.
#								freq:						Flag indicating whether to show the cumulative number of cases on the vertical axis (instead of the cumulative percent).
#								reverse:				Flag indicating whether to compute the CDF on the reversed values of x.
#																(i.e. larger values of x correspond to smaller quantiles)
#								add: 						Flag indicating whether to add the plot to an existing plot.
#								...:						Additional parameters passed to the plot() function.
# Output:       A matrix containing the CDF of x where the row names are the quantile values defined in 'probs'.
# Assumptions:  None
# HISTORY:	(2014/04/15)
#						Fixed a bug related to the variables xlab and ylab that were found as functions in the ggplot2 namespace.
#						Added for this the option mode="name" in the exists() function that checks for existence of xlab and ylab.
#						(2015/02/04)
#						Added parameter 'empirical' so that the empirical CDF is computed instead of the "quantile" CDF using the 'probs' values.
#						(2015/02/05)
#						Added parameters 'freq' and 'weight'.
	# Stop if X is not numeric (e.g. stop if it is character or if it is a matrix of more than one column)

  # Parse the function call
  call = match.call()
  # Put all calling parameters into a parameter list
  paramsList = as.list(call)
  # Get the parameters and its values passed as part of ...
  optionsList = list(...)
  # Remove the function name from the parameter list
  paramsList[[1]] = NULL

	# Check if NAs are present in the data and issue a warning
	na.check = sum(is.na(x))
	if (na.check > 0) {
		na.pct = formatC(na.check/length(x)*100, format="f", digits=1)
		cat("PLOT.CDF: WARNING -", na.check, "NA(s) (", na.pct, "% ) found in array", deparse(substitute(x)), ".\n")
		cat("PLOT.CDF: Analysis based on", length(x)-na.check, "valid values out of", length(x), ".\n")

	### Compute CDF
	# Check parameter EMPIRICAL
	if (empirical) {
		# Compute the empirical CDF using the cdf() function defined in this file
		cdf.tab = cdf(x, include.lowest=include.lowest)
		# Store x and its CDF separately for plotting
		x.cdf = cdf.tab$x
		cdf.values = cdf.tab$cdf*100
		# Assign the quantiles/CDF of each x value as record label (for output purposes)
		# (2017/01/25) These names are no longer used, since they are not output as row names of the output data frame.
#		names(x.cdf) = paste(signif(cdf.values, digits=3), "%", sep="")
	} else {
		if (!is.null(weight)) {
			indok = !is.na(x)
			# Quantiles for x and the weight
			xw.cdf = quantile.weight(x[indok], probs=probs, weight=weight[indok])
			x.cdf = xw.cdf[,2]		# column 2 contains the weighted quantiles
			cdf.values = probs*100
		} else {
			x.cdf = quantile(x, probs=probs, na.rm=TRUE)
			# Construct the CDF values to plot on the vertical axis from the names of the x.dist array
			cdf.values = as.numeric(gsub("%","",names(x.cdf)))
	# Check parameter REVERSE
	# If reverse=TRUE, plot (1-CDF) and reverse the x limits when plotting
	if (reverse) {
		cdf.values = 100 - cdf.values		# recall that cdf.values is in percent scale (not proportion)
		# Udpate the record labels to reflect the reverse of the CDF computation
		# (2017/01/25) These names are no longer used, since they are not output as row names of the output data frame.
#		names(x.cdf) = paste(signif(cdf.values, digits=digits), "%", sep="")
		if (is.null(paramsList$xlim)) {
			paramsList$xlim = rev(range(x.cdf))
		} else {
			paramsList$xlim = rev(optionsList$xlim)	# NOTE that here we need to use optionsList$xlim and NOT paramsList$xlim because the latter has TEXT value (e.g. "c(0,1)")!!
	# Check parameter FREQ
	if (freq) {
		# Plot the cumulative number of cases, instead of the cumulative % cases for each value x
		if (!is.null(weight)) {
			cdf.values = cdf.values/100 * sum(as.numeric(na.omit(weight)))	# Use as.numeric() to avoid overflow which happened once when dealing with large weights! (e.g. DeudaTotal in Nilo project at NAT, 2015)
		} else {
			cdf.values = cdf.values/100 * sum(!is.na(x))

	# Plot
	# Define default values for graphical parameters (by checking whether the user passed any of those defined)
  if (is.null(paramsList$type))
  	if (empirical) { paramsList$type = "s" } else { paramsList$type = "l" }
	if (is.null(paramsList$xaxt)) paramsList$xaxt = par("xaxt")
	if (is.null(paramsList$yaxt)) paramsList$yaxt = par("yaxt")
	if (is.null(paramsList$xlab)) paramsList$xlab = deparse(substitute(x))
	if (is.null(paramsList$ylab)) paramsList$ylab = "cdf"
	if (is.null(paramsList$main)) paramsList$main = paste("Total cases:", length(x)-na.check)
  if (add) {
    # Do not show the axis ticks nor the x label when the plot is added to an existing graph
    paramsList$xaxt = "n"
    paramsList$yaxt = "n"
    paramsList$xlab = ""
    paramsList$ylab = ""
    paramsList$main = ""
#	if (!exists("type", envir=sys.nframe(), mode="name")) { type = "l" } 
#	if (!exists("xaxt", envir=sys.nframe(), mode="name")) { xaxt = par("xaxt") }
#	if (!exists("yaxt", envir=sys.nframe(), mode="name")) { yaxt = par("yaxt") }
#	if (!exists("xlab", envir=sys.nframe(), mode="name")) { xlab = deparse(substitute(x)) }
#	if (!exists("ylab", envir=sys.nframe(), mode="name")) { ylab = "cdf" } 
#	if (!exists("main", envir=sys.nframe(), mode="name")) { main = paste("Total cases:", length(x)-na.check) }

	# Update paramsList for a proper call to plot() below
	paramsList$x = x.cdf
	paramsList$y = cdf.values
	paramsList$probs = NULL
	paramsList$add = NULL
	paramsList$empirical = NULL
	paramsList$include.lowest = NULL
	paramsList$freq = NULL
	paramsList$weight = NULL
	paramsList$reverse = NULL
	# Use do.call() to mimic the following plot:
	# plot(x.cdf, cdf.values, type=type, xlab=xlab, ylab=ylab, xaxt=xaxt, yaxt=yaxt, main=main, ...)
	do.call("plot", paramsList)

	if (!is.null(weight)) {
		output = xw.cdf
		colnames(output) = c("q", deparse(substitute(x))[1], deparse(substitute(weight))[1])	# I use [1] after deparse(substitute()) because I got once an error where x was given as a long condition on a data frame and the result of deparse(substitute()) had length 2!!
	} else {
		output = as.data.frame(cbind(cdf.values/100, x.cdf))
		# Set the row names to be 1:nrows and the colum names to ("percentile", <analyzed variable name>)
		# Note that the as.data.frame() call above sets the row names to be equal to the names in the x.cdf vector
		# (but does NOT do so if I just convert x.cdf to a data frame!! --i.e. without column binding it to cdf.values/100 as done above)
		rownames(output) = 1:nrow(output)
		colnames(output) = c("quantile", deparse(substitute(x))[1])


# plot.bar: Bar plot of a categorical variable with categories sorted by a target variable
plot.bar <- plot.bars <- function(x, y, event="1", FUN="table", na.rm=FALSE, decreasing=TRUE, horiz=FALSE, bars=TRUE, width=1, cex.names=0.6, las=3, col.bars="black", plot=TRUE, ...)
# Created:      26-Mar-2014
# Modified:   	26-Mar-2014
# Description: 	Make a Bar Plot of a categorical variable x showing the average of a target variable y,
#								where the width of the bars are proportional to the size of each x category.
# Parameters:   - x: Either a character or numeric array corresponding to a categorical variable, or a matrix/data frame containing the
#									data (already aggregated) to plot.
#									In the latter case, x is assumed to be the output of a previously run plot.bar(), which depends on parameter FUN:
#									- When FUN == "table", x is expected to have been generated for a CATEGORICAL y variable, therefore it should have
#										the following columns:
#										- column 'event': the target variable event rate per category to show on the plot
#										- column "n": the number of cases per category
# 									- column "sd": the standard deviation of the target variable per category (NOTE that this value
#																	 should NOT be already divided by sqrt(n))
#										In addition the ROW NAMES of x should contain the x categories to show on the plot!
#									- When FUN != "table", x is expected to have been generated for a CONTINUOUS y variable, therefore it should have
#										the following columns:
#										- column 1: the x categories to show on the horizontal axis
#										- column 2: the representative value (e.g. mean) of the target variable per category to show on the plot
#										- column "n": the number of cases per category
# 									- column "sd": the standard deviation of the target variable per category (NOTE that this value
#																	 should NOT be already divided by sqrt(n))
#								- y: A character or numeric variable that represents some target variable of interest.
#								- event: Value taken by the y variable representing an event of interest (e.g. "1" for a binary 0/1 variable)
#								- FUN: Function to use to apply to the (x, y) pair. It can be either "table" or any other function that is
#									accepted by the aggregate() function that applies to numeric variables. In the latter case, the y variable
#									must be numeric.
#									When FUN=="table", table(x,y) is computed and the proportion of occurrence of 'event' in the y variable is plotted as a bar plot.
#									In all other cases, aggregate(y ~ x, FUN=FUN, na.rm=TRUE) is computed and the resulting value of the y variable is plotted as a bar plot.
#								- na.rm: Whether to remove NAs in the x and y variables from the analysis. If NAs are present in the x variable, it is considered
#									as a separate category. If NAs are present in the y variable, they are ignored.
#									This parameter simply sets the useNA= option of the table() function to "ifany" when TRUE and to "no" when FALSE.
#								- decreasing: how the x categories should be sorted in the bar plot based on target variable y.
#									Either TRUE (by decreasing y), FALSE (by increasing y) or NA (alphabetical order).
#								- horiz: Whether the bar plot should show horizontal or vertical bars (same parameter used by barplot())
#								- bars: Whether to show error bars proportional to the SE of y on each bar.
#								- width: Width of the error bars as multiplier of the SE of y.
#								- cex.names: Character expansion value to use in the bar plot and annotations (parameter cex.names of barplot(), cex of text())
#								- las: Label orientation for the x category axis (parameter las of barplot())
#								- col.bars: Color to use for the error bars
#								- plot: Whether to generate the plot or just return the table with the data to be plotted
#								- ...: Additional parameters to be passed to graphical functions called (bars(), segments(), points())
# Output:       A bar plot is generated where the x categories are sorted by decreasing number of cases.
#								A matrix containing the following columns:
#								- The x categories
#								- The number of non-missing cases in each category
#								- The value resulting from applying FUN to y in each category
#								- The standard error of:
#									- the 'event' rate in each category when FUN == "table"
#									- the representative value of y in each category when FUN != "table"
# Assumptions:  None
  # Graphical settings
  opar = par(no.readonly=TRUE)
  par(mar=c(5.5,4,2,1), xpd=TRUE)		# xpd=TRUE => clip all the plotting to the figure region (as opposed to the plot region) so that labels showing the number of cases in each bar are always seen)

  # Initialize tab so that we can easily check whether the users passed a matrix as x or a vector
	tab = NULL

	# Get the input variable names to use on messages to the user and as column names of the output table
	# (I must do this here because below I may change the value of x and/or y)
	xvarname = deparse(substitute(x))
	yvarname = deparse(substitute(y))

	# Check if parameter x is already a table containing the raw numbers of what should be plotted
	if (is.matrix(x) || is.data.frame(x)) {
		# The input matrix is considered to be already a computed table (i.e. the output of 
		tab = x
		if (!("n" %in% colnames(tab)) | (bars & !("sd" %in% colnames(tab)))) {
			stop("Input parameter '", deparse(substitute(x)), "' does NOT contain the necessary column names ('n' and/or 'sd').\nColumn names are: ", colnames(x))
		if (FUN == "table") {
			xnames = row.names(tab)
			ycol = event
			if (!(event %in% colnames(tab))) {
				stop("Input parameter '", deparse(substitute(x)), "' does NOT contain a column named '", event, "'\n")
		} else {
			xnames = tab[,1]
			ycol = 2

	# Start computation in the case x is NOT a matrix or data frame
	if (FUN == "table" & is.null(tab)) {
	  x = as.character(x)       # Remove the factor condition if existing from the categorical variable in order to avoid having categories with 0 cases!
	  # Check the na.rm= parameter
	  if (na.rm) { useNA = "no" } else { useNA = "ifany" }
	  tab.counts = table(x, y, useNA=useNA)
    # Compute percentages of events and number of cases in each category
		tab = cbind(prop.table(tab.counts, margin=1), n=apply(tab.counts, 1, sum))
		# Standard deviation of y using the probability of occurrence of 'event'
		# (note that I don't yet divide by n this happens at the moment of plotting below)
		tab = cbind(tab, sd=sqrt( tab[,2] * (1 - tab[,2]) ))
		# Names to be used as x categories in the bar plot
		xnames = row.names(tab)
		# Column of 'tab' containing the values of y to plot
		ycol = event
	} else if (is.null(tab)) {
		# The y variable is numeric and FUN is applied to its values to get a representative value for each x category
		if (!is.numeric(y)) {
			stop("Variable '", deparse(substitute(y)), "' is NOT numeric.\n")
		} else {
		  x = as.character(x)       # Remove the factor condition if existing from the categorical variable in order to avoid having categories with 0 cases!
		  tab = aggregate(y ~ x, FUN=FUN, na.rm=TRUE)
			# Compute the number of non-missing rows in the matrix obtained as cbind(x,y)
			# Note that the variable y can have missing values but these are not excluded from the count of non-missing values,
			# only the non-missing values in x are of interest.
			tab = cbind(tab, n=aggregate(y ~ x, FUN=function(x){ sum(!is.na(x)) })[,2])
			tab = cbind(tab, sd=aggregate(y ~ x, FUN=sd, na.rm=TRUE)[,2])
      # Replace missing values of sd with 0
      tab[is.na(tab[,"sd"]),"sd"] = 0
			# Names to be used as x categories in the bar plot
			xnames = tab[,1]
			# Column of 'tab' containing the values of y to plot
			ycol = 2
			colnames(tab)[1:2] = c(xvarname, yvarname)

  # Sort x categories as requested through the 'decreasing' parameter
  if (!is.na(decreasing)) {
    ord = order(tab[,ycol], decreasing=decreasing)
  } else {
    ord = 1:nrow(tab)
  tab = tab[ord,,drop=FALSE]  # Note the use of drop=FALSE so that tab is still a matrix even when it has only one row
  xnames = xnames[ord]

  if (plot) {
		### Bar plot
		xlim = NULL
		ylim = NULL
		# Set appropriate limits before calling barplot() when error bars are requested
		if (bars) {
			if (horiz) {
				xlim = range(c(0, tab[,ycol], tab[,ycol] + width*tab[,"sd"]/sqrt(tab[,"n"])))
			} else {
				ylim = range(c(0, tab[,ycol], tab[,ycol] + width*tab[,"sd"]/sqrt(tab[,"n"])))
		# Generate the bar plot and store the position of the center of each bar in 'barpos' so that we can add the error bars if requested
    # Note that I check whether the user passed the parameters xlim (when horiz=TRUE) or ylim (when horiz=FALSE)
    # so that there is no error indicating that the same parameter has been passed twice to the barplot() function.
    # This is not the best way to do this, but it's the quickest (otherwise I would have to do a whole parsing of the input parameters
    # involving a call to match.call(), etc. --see function pairs.custom() for an example)
    if (horiz & !missing(xlim)) {
      barpos = barplot(tab[,ycol], names.arg=xnames, horiz=horiz, cex.names=cex.names, width=tab[,"n"], ylim=ylim, las=las, ...)
    } else if (!horiz & !missing(ylim)) {
      barpos = barplot(tab[,ycol], names.arg=xnames, horiz=horiz, cex.names=cex.names, width=tab[,"n"], xlim=xlim, las=las, ...)
    } else {
      barpos = barplot(tab[,ycol], names.arg=xnames, horiz=horiz, cex.names=cex.names, width=tab[,"n"], xlim=xlim, ylim=ylim, las=las, ...)
		# Add the number of cases per category above each bar
		if (horiz) {
			# Horizontal bars
			text(tab[,ycol], barpos, labels=tab[,"n"], col="blue", offset=0.5, pos=2, cex=cex.names)
		} else {
			text(barpos, tab[,ycol], labels=tab[,"n"], col="blue", offset=0.5, pos=1, cex=cex.names)
		if (bars) {
			if (horiz) {
				# Horizontal bars
				x.upper = tab[,ycol] + width*tab[,"sd"]/sqrt(tab[,"n"])
				x.lower = pmax(0, tab[,ycol] - width*tab[,"sd"]/sqrt(tab[,"n"]))
				segments(x.lower, barpos, x.upper, barpos, lwd=2, col=col.bars, ...)
				points(x.upper, barpos, pch="|", lwd=2, col=col.bars, ...)
				points(x.lower, barpos, pch="|", lwd=2, col=col.bars, ...)
			} else {
				# Vertical bars
				y.upper = tab[,ycol] + width*tab[,"sd"]/sqrt(tab[,"n"])
				y.lower = pmax(0, tab[,ycol] - width*tab[,"sd"]/sqrt(tab[,"n"]))
				segments(barpos, y.lower, barpos, y.upper, lwd=2, col=col.bars, ...)
				points(barpos, y.upper, pch="_", lwd=2, col=col.bars, ...)
				points(barpos, y.lower, pch="_", lwd=2, col=col.bars, ...)

	# When x is not a matrix and FUN == "table" add the counts of each y value to the output table
	if (!is.matrix(x) & FUN == "table") {
		tab = cbind(tab.counts[ord, ,drop=FALSE], tab)


# cumplot: Cumulative plots of y vs. cumulative x
cumplot = function(x, y, decreasing=TRUE, plot=TRUE, type="l", col="blue", ...)
# Created:      07-May-2014
# Modified:     07-May-2014
# Description:  Plot cumulative y vs. cumulative x values. Target variable y can be continuous or binary 0/1.
# Details:			Missing values are allowed in x and/or y. Only cases with no missing values in both variables are used.
# Parameters:   - x: A numeric array representing the continuous variable to analyze
#								- y: A numeric array representing the target variable to analyze
#								- decreasing: Whether to sort the x values in decreasing order when computing its cumulative values and cumulative cases.
#									Default: TRUE, so that a CONCAVE/CONVEX curve on the left plot means a CONCAVE/CONVEX relationship between y and x.
#								- plot: Whether to generate the plot or just returned the data frame containing the data to plot
#								- type: Value of 'type' parameter of the generated plot()
#								- col: Value of 'col' parameter of the generated plot()
#								- ...: Additional parameters received by the plot() function, except type, col, xlab, ylab.
# Output:				A data frame containing the following columns:
#								- x:				x values sorted increasingly
#								- y:				y values sorted by increasing order of x
#								- xcum:			cumulative x values
#								- xcumpct:	% cumulative y values
#								- ycum:			cumulative y values
#								- ycumpct:	% cumulative y values
#								- ncum:			cumulative number of cases
#								- ncumpct:	% cumulative number of cases
#								When plot=TRUE, a plot is generated of ycumpct vs. xcumpct.
# Assumptions:  None
# HISTORY:	(2014/05/07)
#						Function created.
  # Sort the data by increasing x
  toplot = data.frame(x=x, y=y)
  # Keep cases with non missing values
  toplot = na.omit(toplot)
  n = nrow(toplot)

  # Sort by increasing/decreasing values of x
  ord = order(toplot$x, decreasing=decreasing)
  toplot = toplot[ord,]
  if (decreasing) {
  	TopBottom = "Top"
  } else {
  	TopBottom = "Bottom"
  # % Cumulative x values
  toplot$xcum = cumsum(toplot$x)
  toplot$xcumpct = toplot$xcum / toplot$xcum[n] * 100
  # % Cumulative y values
  toplot$ycum = cumsum(toplot$y)
  toplot$ycumpct = toplot$ycum / toplot$ycum[n] * 100
  # % Cumulative number of cases
  toplot$ncum = 1:n
  toplot$ncumpct = 1:n/n * 100

  # Plot
  if (plot) {
  	op = par(mfrow=c(1,2), mar=c(4,0,0,2), oma=c(0,4,3,0), no.readonly=TRUE); on.exit(par(op))
    #plot(toplot$x, toplot$ycumpct, type="l")		# This plot allows SEEING the distribution of x
    # Plot of Cumulative y values vs. Cumulative x values
    plot(toplot$xcumpct, toplot$ycumpct, type=type, col=col, xlab=paste(TopBottom, "% values"), ylab=NULL, cex.lab=0.9, ...)
    ### NO! UNFORTUNATELY THIS DOES NOT SHOW WHETHER THE RELATIONSHIP IS LINEAR! What it shows is whether the density of points of y
    ### is similar to the density of points of x... (because it means that the accumulation of x is like the accumulation of y
    ### meaning that the increase in x is similar to the increase in y values) In fact, if a few values of x are too large
    ### the % accumulation of x will be very different than the % accumulation of y. This effect can easily be seen when
    ### x = salary and compare the curve obtained with the one obtained when x = safeLog(salary): the former curve has more "panza"
    ### than the latter curve, but it doesn't mean that the relationship between y and salary is less linear than the relationship
    ### with safeLog(salary).
    title(sub="(to check linear/non-linear relationship(?))", line=2, cex.sub=0.7)
    # Plot of Cumulative y values vs. percentile of x (where lower percentiles correspond to lower x values)
    plot(toplot$ncumpct, toplot$ycumpct, type=type, col=col, xlab=paste(TopBottom, "% cases"), ylab=NULL, cex.lab=0.9, ...)
    title(sub="(to check predictive power)", line=2, cex.sub=0.7)
    title(deparse(substitute(x)), outer=TRUE)
		mtext(paste("% Cumulative", deparse(substitute(y)), "values"), side=2, line=2, adj=0, cex=0.9)	# adj=0 means left alignment

###################################### GRAPHICAL functions ####################################
mastropi/dmtools documentation built on May 21, 2019, 12:44 p.m.