R/plot.saeTrafo.R
In saeTrafo: Transformations for Unit-Level Small Area Models

Documented in plot.saeTrafo

#' Plots for an saeTrafo object
#'
#' Diagnostic plots of the nested error regression model
#' (see also \code{\link{NER_Trafo}}) are obtained. These include Q-Q plots and
#' density plots of residuals and random effects, a Cook's distance plot for
#' detecting outliers and the log-likelihood of the estimation of the
#' optimal parameter in log-shift transformations. The return depends on the
#' transformation, such that a plot for the optimal parameter is only returned
#' in case if a transformation with transformation parameter is chosen.
#' The range of the x-axis is optional but necessary to change if there are
#' convergence problems. All plots are obtained by
#' \code{\link[ggplot2]{ggplot}}.
#' @param x an object of type "NER", representing point
#' and, if chosen, MSE estimates obtained by the (transformed) nested error
#' regression model (see also \code{\link{NER_Trafo}}).
#' @param label argument that enables to customize title and axis labels. There
#' are three instant options to label the diagnostic plot: (i) original labels
#' ("orig"), (ii) axis lables but no title ("no_title"), (iii) neither axis
#' labels nor title ("blank"), (iv) individual labels by a list that needs to
#' have below structure.
#' Six elements can be defined called \code{qq_res, qq_ran, d_res, d_ran, cooks}
#' and \code{opt_lambda} for the six different plots and these list elements
#' need to have three elements each called \code{title, y_lab and x_lab}.
#' Only the labels for the plots that should be different to the original
#' need to be specified. Please see the details section for an example
#' with the default labels.
#' @param color a character vector with two elements. The first element defines
#' the color for the line in the QQ-plots, for the Cook's Distance plot and for
#' the optimal parameter plot. The second element defines the color for the
#' densities.
#' @param gg_theme \code{\link[ggplot2]{theme}} list from package \pkg{ggplot2}.
#' For using this argument, package \pkg{ggplot2} must be loaded via
#' \code{library(ggplot2)}.
#' @param cooks optional logical. If \code{TRUE}, a Cook's distance plot is
#' returned. The used method \code{mdffits.default} from the package
#' \pkg{HLMdiag} struggles when data sets get large. In these cases,
#' \code{cooks} should be set to \code{FALSE}. It defaults to \code{TRUE}.
#' @param range optional sequence determining the range of the x-axis for plots
#' of the optimal transformation parameter that defaults to \code{NULL}. In that
#' case a range of the default interval is used for the plots of the
#' optimal parameter. This leads in some cases to convergence problems such that
#' it should be changed to e.g. the selected \code{interval}. The default value
#' depends on the chosen data driven transformation and equals the default
#' interval for the estimation of the optimal parameter.
#' @param ... optional arguments passed to generic function.
#' @return Two Q-Q plots in one grid, two density plots, a Cook's distance plot
#' and a likelihood plot for the optimal parameter of transformations with
#' transformation parameter obtained by \code{\link[ggplot2]{ggplot}}.
#' @details The default settings of the \code{label} argument are as follows:\cr
##' \describe{
##' \item{list(}{}
##' \item{qq_res =}{c(title="Error term",
##'                   y_lab="Quantiles of pearson residuals",
##'                   x_lab="Theoretical quantiles"),}
##' \item{qq_ran =}{c(title="Random effect",
##'                   y_lab="Quantiles of random effects",
##'                   x_lab="Theoretical quantiles"),}
##' \item{d_res =}{c(title="Density - Pearson residuals",
##'                  y_lab="Density",
##'                  x_lab="Pearson residuals"),}
##' \item{d_ran =}{c(title="Density - Standardized random effects",
##'                  y_lab="Density",
##'                  x_lab="Standardized random effects"),}
##' \item{cooks =}{c(title="Cook's Distance Plot",
##'                  y_lab="Cook's Distance",
##'                  x_lab="Index"),}
##' \item{opt_lambda =}{c(title="Log-Shift - REML",
##'                       y_lab="Log-Likelihood",
##'                       x_lab="expression(lambda)"))}
##' }
##'
#' @examples
#'
#' \donttest{
#' # Examples for diagnostic plots
#'
#' # Load Data
#' data("eusilcA_smp")
#' data("pop_area_size")
#' data("pop_mean")
#' data("pop_cov")
#'
#' # Nested error regression model
#' NER_model <- NER_Trafo(fixed = eqIncome ~ gender + eqsize + cash +
#'                        self_empl + unempl_ben + age_ben + surv_ben +
#'                        sick_ben + dis_ben + rent + fam_allow + house_allow +
#'                        cap_inv + tax_adj,
#'                        smp_domains = "district",
#'                        pop_area_size = pop_area_size,
#'                        pop_mean = pop_mean, pop_cov = pop_cov,
#'                        smp_data = eusilcA_smp)
#'
#' # Example 1: Default diagnostic plot
#' plot(NER_model)
#'
#' # Example 2: Creation of diagnostic plots without labels and titles,
#' # different colors and without Cook's distance plot.
#'
#' plot(NER_model, label = "no_title", color = c("red", "yellow"),
#'      cooks = FALSE)
#' }
#'
#'
#' @seealso \code{\link{saeTrafoObject}}, \code{\link{NER_Trafo}}
#' @importFrom ggplot2 qplot geom_abline ggtitle ylab xlab ggplot stat_qq
#' @importFrom ggplot2 aes geom_point geom_smooth coord_fixed geom_line
#' @importFrom ggplot2 scale_color_manual scale_fill_manual geom_segment
#' @importFrom ggplot2 scale_linetype_discrete geom_density geom_text
#' @importFrom ggplot2 geom_line geom_vline stat_function geom_qq
#' @importFrom nlme ranef random.effects
#' @importFrom gridExtra arrangeGrob grid.arrange
#' @importFrom stats shapiro.test logLik cooks.distance dnorm residuals sd qnorm
#' @importFrom HLMdiag mdffits
#' @importFrom stringr str_to_title
#' @name plot.saeTrafo
#' @export

plot.saeTrafo <- function(x,
                          label = "orig",
                          color = c("blue", "lightblue3"),
                          gg_theme = NULL,
                          cooks = TRUE,
                          range = NULL, ...) {

  plot_check(x = x, label = label, color = color, cooks = cooks, range = range)

  # Preparation for plots ------------------------------------------------------
  residuals <- residuals(x$model, level = 0, type = "pearson")
  rand.eff <- ranef(x$model)$"(Intercept)"
  srand.eff <- (rand.eff - mean(rand.eff)) / sd(rand.eff)
  tmp <- as.matrix(random.effects(x$model))[, 1]
  model <- x$model
  model$call$fixed <- x$fixed
  cook_df <- NULL
  indexer <- NULL
  likelihoods <- NULL
  opt_lambda <- FALSE

  if (cooks == TRUE) {
    cooksdist <- NULL
    try(cooksdist <- as.vector(suppressWarnings(cooks.distance(model))),
        silent = TRUE)
    if (is.null(cooksdist)) {
      cooks <- FALSE
      warning(paste("Cook's distance could not be calculated, this is usually",
                    "due to exceedence of available memory. Try using",
                    "cooks = FALSE to avoid this message and improve",
                    "computation time."))
    } else {
      cook_df <- data.frame(index = seq_along(cooksdist), cooksdist)
      indexer <- cook_df[order(cooksdist, decreasing = TRUE), ][seq_len(3), ]
    }
  }

  if (x$transformation == "log.shift") {

    opt_lambda <- TRUE
    if (is.null(range)) {
      range <- seq(from   = x$transform_param$optimal_lambda -
                               (0.5 * x$transform_param$optimal_lambda),
                   to     = x$transform_param$optimal_lambda +
                               (0.5 * x$transform_param$optimal_lambda),
                   length = 50
      )
    } else {
      range <- range
    }

    likelihoods <- vapply(X              = range,
                          FUN            = likelihoods_f,
                          numeric(1),
                          fixed          = x$fixed,
                          smp_data       = x$framework$smp_data,
                          smp_domains    = x$framework$smp_domains,
                          transformation = x$transformation
    )

    if (any(is.na(likelihoods))) {
      warning(paste("For some lambda in the chosen range, the likelihood does",
                    "not converge. For these lambdas no likelihood is plotted.",
                    "Choose a different range to avoid this behaviour"))
    }
  }

  Residuals <- Random <- index <- lambda <- log_likelihood <- cooksdist <-  NULL

  plotList <- vector(mode = "list", length = 5)
  plotList <- lapply(plotList, function(x) NA)
  names(plotList) <- c("qq_plots", "density_res", "density_ran",
                       "cooks_distance", "likelihood")

  label <- define_label(x = x, label = label)

  ## QQ Plots
  # Residuals
  res <- qplot(sample = residuals) +
    geom_abline(colour = color[1]) +
    ggtitle(label$qq_res["title"]) +
    ylab(label$qq_res["y_lab"]) +
    xlab(label$qq_res["x_lab"]) +
    gg_theme

  # Random effects
  ran <- ggplot(data.frame(tmp), aes(sample = tmp)) +
    stat_qq(distribution = qnorm,
            dparams      = list(mean = mean(tmp), sd = sd(tmp))
    ) +
    geom_abline(intercept = 0, slope = 1, na.rm = TRUE, col = color[1]) +
    ggtitle(label$qq_ran["title"]) +
    ylab(label$qq_ran["y_lab"]) +
    xlab(label$qq_ran["x_lab"]) +
    gg_theme

  plotList[[1]] <- arrangeGrob(res, ran, ncol = 2)
  grid.arrange(plotList[[1]])
  cat("Press [enter] to continue")
  line <- readline()

  print((plotList[[2]] <-
           ggplot(data    = data.frame(Residuals = residuals),
                  mapping = aes(x = Residuals),
                  fill    = color[2],
                  color   = color[2]
           ) +
           geom_density(fill = color[2], color = color[2], alpha = 0.4) +
           stat_function(fun = dnorm) +
           ylab(label$d_res["y_lab"]) +
           xlab(label$d_res["x_lab"]) +
           ggtitle(label$d_res["title"]) +
           gg_theme
  ))
  cat("Press [enter] to continue")
  line <- readline()

  print((plotList[[3]] <-
           ggplot(data    = data.frame(Random = srand.eff),
                  mapping = aes(x = Random),
                  fill    = color[2],
                  color   = color[2]
           ) +
           geom_density(fill = color[2], color = color[2], alpha = 0.4) +
           stat_function(fun = dnorm) + ylab(label$d_ran["y_lab"]) +
           xlab(label$d_ran["x_lab"]) +
           ggtitle(label$d_ran["title"]) +
           gg_theme
  ))

  if (cooks == TRUE) {

    cat("Press [enter] to continue")
    line <- readline()

    print((plotList[[4]] <-
             ggplot(data    = cook_df,
                    mapping = aes(x = index, y = cooksdist)
             ) +
             geom_segment(mapping = aes(x    = index,
                                        y    = 0,
                                        xend = index,
                                        yend = cooksdist),
                          colour = color[1]
             ) +
             xlab("Index") +
             ylab(label$cooks["y_lab"]) +
             geom_text(label = indexer[, 1], data = indexer) +
             ggtitle(label$cooks["title"]) +
             gg_theme
     ))
  }

  if (opt_lambda == TRUE) {

    cat("Press [enter] to continue")
    line <- readline()

    if (any(label$opt_lambda["x_lab"] == "expression(lambda)") ||
        any(label$opt_lambda["x_lab"] == "expression(Lambda)")) {

      x_lab <- expression(lambda)

    } else {
      x_lab <- label$opt_lambda["x_lab"]
    }

    if (any(is.na(likelihoods))) {
      warning(paste("For some lambda in the chosen range, the likelihood does",
                    "not converge. For these lambdas no likelihood is plotted.",
                    "Choose a different range to avoid this behaviour"))
    }

    print((plotList[[5]] <-
             ggplot(data    = data.frame(lambda         = range,
                                         log_likelihood = likelihoods
                              ),
                    mapping = aes(x = lambda, y = log_likelihood)
             ) +
             geom_line() +
             xlab(x_lab) +
             ylab(label$opt_lambda["y_lab"]) +
             geom_vline(xintercept = x$transform_param$optimal_lambda,
                        colour     = color[1]) +
             ggtitle(label$opt_lambda["title"]) +
             gg_theme
    ))
  }
  invisible(plotList)
}



# Definition of the labels

define_label <- function(x, label) {

   if (!inherits(label, "list")) {
    if (label == "orig") {
      if (inherits(x, "NER")) {
        label <- list(qq_res = c(title = "Error term",
                                 y_lab = "Quantiles of pearson residuals",
                                 x_lab = "Theoretical quantiles"),
                      qq_ran = c(title = "Random effect",
                                 y_lab = "Quantiles of random effects",
                                 x_lab = "Theoretical quantiles"),
                      d_res = c(title = "Density - Pearson residuals",
                                y_lab = "Density",
                                x_lab = "Pearson residuals"),
                      d_ran = c(title = "Density - Standardized random effects",
                                y_lab = "Density",
                                x_lab = "Standardized random effects"),
                      cooks = c(title = "Cook's Distance Plot",
                                y_lab = "Cook's Distance",
                                x_lab = "Index"),
                      opt_lambda = c(title = paste0(str_to_title(
                                              gsub("\\.", "-", x$transformation)
                                             ), " - REML"),
                                     y_lab = "Log-Likelihood",
                                     x_lab = "expression(lambda)")
        )
      }
    } else if (label == "blank") {
      label <- list(qq_res = c(title = "",
                               y_lab = "",
                               x_lab = ""),
                    qq_ran = c(title = "",
                               y_lab = "",
                               x_lab = ""),
                    d_res = c(title = "",
                              y_lab = "",
                              x_lab = ""),
                    d_ran = c(title = "",
                              y_lab = "",
                              x_lab = ""),
                    cooks = c(title = "",
                              y_lab = "",
                              x_lab = ""),
                    opt_lambda = c(title = "",
                                   y_lab = "",
                                   x_lab = "")
      )
    } else if (label == "no_title") {
      if (inherits(x, "NER")) {
        label <- list(qq_res = c(title = "",
                                 y_lab = "Quantiles of pearson residuals",
                                 x_lab = "Theoretical quantiles"),
                      qq_ran = c(title = "",
                                 y_lab = "Quantiles of random effects",
                                 x_lab = "Theoretical quantiles"),
                      d_res = c(title = "",
                                y_lab = "Density",
                                x_lab = "Pearson residuals"),
                      d_ran = c(title = "",
                                y_lab = "Density",
                                x_lab = "Standardized random effects"),
                      cooks = c(title = "",
                                y_lab = "Cook's Distance",
                                x_lab = "Index"),
                      opt_lambda = c(title = "",
                                     y_lab = "Log-Likelihood",
                                     x_lab = "expression(lambda)")
        )
      }
    }
  } else if (inherits(label, "list")) {

    if (!any(names(label) %in% c("qq_res", "qq_ran", "d_res", "d_ran", "cooks",
                                 "opt_lambda")) ||
        !any(names(label) %in% c("qq_res", "qq_ran", "d_res", "d_ran", "cooks",
                                 "box_cox"))) {
     stop(paste("List elements must have following names even though not all",
                "must be included: qq_res, qq_ran, d_res, d_ran, cooks,",
                "opt_lambda. Every list element must have the elements title,",
                "y_lab and x_lab. See also help(plot.saeTrafo)."))
    }

    for (i in names(label)) {
      if (!all(names(label[[i]]) == c("title", "y_lab", "x_lab"))) {
        stop(paste("Every list element must have the elements title, y_lab and",
                   "x_lab in this order. See also help(plot.saeTrafo)."))
      }
    }

    orig_label <- list(qq_res = c(title = "Error term",
                                  y_lab = "Quantiles of pearson residuals",
                                  x_lab = "Theoretical quantiles"),
                       qq_ran = c(title = "Random effect",
                                  y_lab = "Quantiles of random effects",
                                  x_lab = "Theoretical quantiles"),
                       d_res = c(title = "Density - Pearson residuals",
                                 y_lab = "Density",
                                 x_lab = "Pearson residuals"),
                       d_ran = c(title = paste("Density - Standardized random",
                                               "effects"),
                                 y_lab = "Density",
                                 x_lab = "Standardized random effects"),
                       cooks = c(title = "Cook's Distance Plot",
                                 y_lab = "Cook's Distance",
                                 x_lab = "Index"),
                       opt_lambda = c(title = paste0(str_to_title(
                                              gsub("\\.", "-", x$transformation)
                                              ), " - REML"),
                                      y_lab = "Log-Likelihood",
                                      x_lab = "expression(lambda)")
    )

    if (any(names(label) == "qq_res")) {
      label$qq_res <- label$qq_res
    } else {
      label$qq_res <- orig_label$qq_res
    }
    if (any(names(label) == "qq_ran")) {
      label$qq_ran <- label$qq_ran
    } else {
      label$qq_ran <- orig_label$qq_ran
    }
    if (any(names(label) == "d_res")) {
      label$d_res <- label$d_res
    } else {
      label$d_res <- orig_label$d_res
    }
    if (any(names(label) == "d_ran")) {
      label$d_ran <- label$d_ran
    } else {
      label$d_ran <- orig_label$d_ran
    }
    if (any(names(label) == "cooks")) {
      label$cooks <- label$cooks
    } else {
      label$cooks <- orig_label$cooks
    }
    if (any(names(label) == "opt_lambda") || any(names(label) == "box_cox")) {
      if (any(names(label) == "opt_lambda")) {
        label$opt_lambda <- label$opt_lambda
      } else if (any(names(label) == "box_cox")) {
        label$opt_lambda <- label$box_cox
      }
    } else {
      label$opt_lambda <- orig_label$opt_lambda
    }
  }
  if (any(!(names(label) %in%  c("qq_res", "qq_ran", "d_res", "d_ran", "cooks",
                                 "opt_lambda", "box_cox")))) {
    warning(paste("One or more list elements are not called qq_res, qq_ran,",
                  "d_res, d_ran, cooks or opt_lambda. The changes are for",
                  "this/these element(s) is/are not done. Instead the original",
                  "labels are used."))
  }
  return(label)
}