Nothing
##########################
#' Visually checking a fitted quantile model
#'
#' @description Given an additive quantile model, fitted using \code{qgam}, \code{cqcheck} provides some plots
#' that allow to check what proportion of responses, \code{y}, falls below the fitted quantile.
#'
#' @param obj the output of a \code{qgam} call.
#' @param v if a 1D plot is required, \code{v} should be either a single character or a numeric vector. In the first case
#' \code{v} should be the names of one of the variables in the dataframe \code{X}. In the second case, the length
#' of \code{v} should be equal to the number of rows of \code{X}. If a 2D plot is required, \code{v} should be
#' either a vector of two characters or a matrix with two columns.
#' @param X a dataframe containing the data used to obtain the conditional quantiles. By default it is NULL, in which
#' case predictions are made using the model matrix in \code{obj$model}.
#' @param y vector of responses. Its i-th entry corresponds to the i-th row of X. By default it is NULL, in which
#' case it is internally set to \code{obj$y}.
#' @param nbin a vector of integers of length one (1D case) or two (2D case) indicating the number of bins to be used
#' in each direction. Used only if \code{bound==NULL}.
#' @param bound in the 1D case it is a numeric vector whose increasing entries represent the bounds of each bin.
#' In the 2D case a list of two vectors should be provided. \code{NULL} by default.
#' @param lev the significance levels used in the plots, this determines the width of the confidence
#' intervals. Default is 0.05.
#' @param scatter if TRUE a scatterplot is added (using the \code{points} function). FALSE by default.
#' @param ... extra graphical parameters to be passed to \code{plot()}.
#' @return Simply produces a plot.
#' @details Having fitted an additive model for, say, quantile \code{qu=0.4} one would expect that about 40% of the
#' responses fall below the fitted quantile. This function allows to visually compare the empirical number
#' of responses (\code{qu_hat}) falling below the fit with its theoretical value (\code{qu}). In particular,
#' the responses are binned, which the bins being constructed along one or two variables (given be arguments
#' \code{v}). Let (\code{qu_hat[i]}) be the proportion of responses below the fitted quantile in the ith bin.
#' This should be approximately equal to \code{qu}, for every i. In the 1D case, when \code{v} is a single
#' character or a numeric vector, \code{cqcheck} provides a plot where: the horizontal line is \code{qu},
#' the dots correspond to \code{qu_hat[i]} and the grey lines are confidence intervals for \code{qu}. The
#' confidence intervals are based on \code{qbinom(lev/2, siz, qu)}, if the dots fall outside them, then
#' \code{qu_hat[i]} might be deviating too much from \code{qu}. In the 2D case, when \code{v} is a vector of two
#' characters or a matrix with two columns, we plot a grid of bins. The responses are divided between the bins
#' as before, but now don't plot the confidence intervals. Instead we report the empirical proportions \code{qu_hat[i]}
#' for the non-empty bin, and with colour the bins in red if \code{qu_hat[i]<qu} and in green otherwise. If
#' \code{qu_hat[i]} falls outside the confidence intervals we put an * next to the numeric \code{qu_hat[i]} and
#' we use more intense colours.
#' @author Matteo Fasiolo <matteo.fasiolo@@gmail.com>.
#' @examples
#' #######
#' # Bivariate additive model y~1+x+x^2+z+x*z/2+e, e~N(0, 1)
#' #######
#' \dontrun{
#' library(qgam)
#' set.seed(15560)
#' n <- 500
#' x <- rnorm(n, 0, 1); z <- rnorm(n)
#' X <- cbind(1, x, x^2, z, x*z)
#' beta <- c(0, 1, 1, 1, 0.5)
#' y <- drop(X %*% beta) + rnorm(n)
#' dataf <- data.frame(cbind(y, x, z))
#' names(dataf) <- c("y", "x", "z")
#'
#' #### Fit a constant model for median
#' qu <- 0.5
#' fit <- qgam(y~1, qu = qu, data = dataf)
#'
#' # Look at what happens along x: clearly there is non linear pattern here
#' cqcheck(obj = fit, v = c("x"), X = dataf, y = y)
#'
#' #### Add a smooth for x
#' fit <- qgam(y~s(x), qu = qu, data = dataf)
#' cqcheck(obj = fit, v = c("x"), X = dataf, y = y) # Better!
#'
#' # Lets look across x and z. As we move along z (x2 in the plot)
#' # the colour changes from green to red
#' cqcheck(obj = fit, v = c("x", "z"), X = dataf, y = y, nbin = c(5, 5))
#'
#' # The effect look pretty linear
#' cqcheck(obj = fit, v = c("z"), X = dataf, y = y, nbin = c(10))
#'
#' #### Lets add a linear effect for z
#' fit <- qgam(y~s(x)+z, qu = qu, data = dataf)
#'
#' # Looks better!
#' cqcheck(obj = fit, v = c("z"))
#'
#' # Lets look across x and y again: green prevails on the top-left to bottom-right
#' # diagonal, while the other diagonal is mainly red.
#' cqcheck(obj = fit, v = c("x", "z"), nbin = c(5, 5))
#'
#' ### Maybe adding an interaction would help?
#' fit <- qgam(y~s(x)+z+I(x*z), qu = qu, data = dataf)
#'
#' # It does! The real model is: y ~ 1 + x + x^2 + z + x*z/2 + e, e ~ N(0, 1)
#' cqcheck(obj = fit, v = c("x", "z"), nbin = c(5, 5))
#' }
#'
cqcheck <- function(obj, v, X = NULL, y = NULL, nbin = c(10, 10), bound = NULL, lev = 0.05, scatter = FALSE, ...)
{
#### Set up
if( is.null(X) ){
X <- obj$model
if( is.null(y) ){ y <- obj$y }
} else {
if( is.null(y) ){ stop("If you provide X you must provide also the corresponding vector of responses y") }
}
if( length(y)!=nrow(X) ){ stop("length(y)!=nrow(X)") }
####### Setting up 1D and 2D cases
if( is.character(v) ){ # Name(s) of variable(s) in X provided OR ...
if(length(v) == 1){ ## 1D CASE ##
if( !(v %in% names(X)) ) stop("(v %in% names(X)) == FALSE")
x1 <- X[[v]]
x2 <- NULL
} else {
if(length(v) == 2){ ## 2D CASE ##
if( !(v[1] %in% names(X)) ) stop("(v[1] %in% names(X)) == FALSE")
if( !(v[2] %in% names(X)) ) stop("(v[2] %in% names(X)) == FALSE")
x1 <- X[[v[1]]]
x2 <- X[[v[2]]]
} else {
stop("If is.character(v)==TRUE, then length(v) should be either 1 or 2.")
}
}
} else { # ... actual numeric value of the variable(s) provided
if(is.vector(v)){ ## 1D CASE ##
x1 <- v
x2 <- NULL
if(length(v) != nrow(X)){ stop("length(v) != ncol(X)") }
} else {
if(is.matrix(v)){ ## 2D CASE ##
if(ncol(v)!=2){ stop("In the 2D case, v should be a matrix with 2 columns or a vector of 2 characters") }
x1 <- v[ , 1]
x2 <- v[ , 2]
} else {
stop("In the 2D case, v should be a matrix with 2 columns or a vector of 2 characters")
}
}
}
# Discard NAs from X, y, x1 and x2. We don't do this on v, hence if v is numeric it is dangerous to use it from here onwards
good <- complete.cases(X, y, x1, x2)
y <- y[ good ]
X <- X[good, , drop = FALSE]
x1 <- x1[ good ]
x2 <- x2[ good ]
# Calculating proportion of observation falling below estimate quantile curve
n <- nrow(X)
mu <- as.matrix(predict(obj, newdata = X, type = "response"))[ , 1]
res <- (mu - y) > 0
qu <- obj$family$getQu()
# Now branching for main computation
if( is.null(x2) ) ################ ONE VARIABLE
{
if(length(x1) != n) stop("length(x1) != ncol(X)")
if( is.null(bound )){ # Create bounds OR ...
nbin1 <- nbin[1]
bound <- seq(min(x1), max(x1), length.out = nbin1 + 1)
} else { # ... use those already given
nbin1 <- length(bound-1)
}
# For each bin: count number of responses that are smaller than the fitted quantile
indx <- as.factor( .bincode(x1, bound, TRUE, TRUE) ) # Attribute data to bins
levels(indx) <- 1:nbin1
bsize <- as.vector( table(indx) ) # Count number of data in each bin
indx <- as.integer(indx)
bins <- numeric(nbin1)
for(ii in 1:nbin1){ bins[ii] <- sum(res[indx==ii]) } # Count number of 1s in each bin
# Remove empty bins
while( any(bsize == 0) )
{
bad <- which(bsize == 0)[1]
bsize <- bsize[-bad]
bins <- bins[-bad]
bound <- bound[ -min(bad+1, nbin1) ]
if(bad<nbin1){
indx[indx>bad] <- indx[indx>bad]-1
}
nbin1 <- nbin1 - 1
}
ub <- qbinom(lev/2, bsize, qu, lower.tail = FALSE) / bsize
lb <- qbinom(lev/2, bsize, qu) / bsize
#svpar <- par(no.readonly = TRUE)
par(mar = c(5.1, 4.6, 4.1, 2.1))
x <- sort(x1)
tmp <- rep(bins/bsize, bsize)
plot(x, tmp, ylim = range(ub, lb, tmp), type = 'l', col = "white", ylab = expression(hat(P)(y<hat(mu))), ...)
abline(h = qu, lty = 2)
for(ii in 1:nbin1){
prop <- bins[ii]/bsize[ii]
lines(c(bound[ii], bound[ii+1]), rep(ub[ii], 2), col = "grey")
points(c(bound[ii], bound[ii+1]), rep(ub[ii], 2), pch = 3, lwd = 0.2)
lines(c(bound[ii], bound[ii+1]), rep(lb[ii], 2), col = "grey")
points(c(bound[ii], bound[ii+1]), rep(lb[ii], 2), pch = 3, lwd = 0.2)
points((bound[ii] + bound[ii+1])/2, prop,
col = ifelse(prop < ub[ii] && prop > lb[ii], 1, 2))
rug(x1[indx==ii], col = ifelse(ii%%2, 3, 4))
}
#par(svpar) # Warning: this causes problems with shiny
} else { ################ ... TWO VARIABLES
if(length(x1) != n) stop("length(x1) != ncol(X)")
if(length(x2) != n) stop("length(x2) != ncol(X)")
if( is.null(bound) ){ # Bounds created OR ...
if( length(nbin) != 2 ){ stop("In the 2D case, nbin should be a vector of length 2") }
bound1 <- seq(min(x1), max(x1), length.out = nbin[1] + 1)
bound2 <- seq(min(x2), max(x2), length.out = nbin[2] + 1)
} else { # ... already provided
if( length(bound) != 2 ){ stop("In the 2D case, bound a list of two numeric vectors") }
bound1 <- bound[[1]]
bound2 <- bound[[2]]
nbin <- c(length(bound1)-1, length(bound2)-1)
}
# For each bin: count number of responses that are smaller than the fitted quantile
bins <- bsize <- matrix(0, nbin[2], nbin[1])
for(ir in 1:nbin[2]){ # From bottom upward
inRow <- which( x2 >= bound2[ir] & x2 <= bound2[ir+1] )
if(length(inRow)){
x1In <- x1[inRow]
resIn <- res[inRow]
for(ic in 1:nbin[1]){ # From left to right
tmp <- which( x1In >= bound1[ic] & x1In <= bound1[ic+1] )
bsize[ir, ic] <- length(tmp)
bins[ir, ic] <- sum( resIn[tmp] )
}}}
# Plot!
plot(x1, x2, pch = ".", col = "white", ylim = range(bound2), xlim = range(bound1),
main = expression(hat(P)(y<hat(mu))), ...)
for(tmp in bound2){ segments(x0 = min(bound1), x1 = max(bound1), y0 = tmp, y1 = tmp) }
for(tmp in bound1){ segments(y0 = min(bound2), y1 = max(bound2), x0 = tmp, x1 = tmp) }
prop <- bins / bsize
for(ir in 1:nbin[2]){
for(ic in 1:nbin[1]){
if( is.finite(prop[ir, ic]) )
{
pr <- prop[ir, ic]
siz <- bsize[ir, ic]
b <- qbinom(lev/2, siz, qu, lower.tail = (pr<=qu)) / siz
sig <- if(pr>qu){ pr>b } else{ pr<b }
polygon(x = c(bound1[ic], bound1[ic], bound1[ic+1], bound1[ic+1], bound1[ic]),
y = c(bound2[ir], bound2[ir+1], bound2[ir+1], bound2[ir], bound2[ir]),
col = rgb(pr<qu, pr>qu, 0, alpha = ifelse(sig, 0.8, 0.3)))
text(x = (bound1[ic]+bound1[ic+1])/2, y = (bound2[ir]+bound2[ir+1])/2,
paste(round(pr, 3), ifelse(sig, "*", ''), sep = ""))
}
}}
rug(x1, side = 1)
rug(x2, side = 2)
if(scatter){ points(x1, x2, pch = ".") }
}
return( invisible(NULL) )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.