R/plot_gpt.R
In sjPlot: Data Visualization for Statistics in Social Science

Documented in plot_gpt

utils::globalVariables("n")

#' @title Plot grouped proportional tables
#' @name plot_gpt
#'
#' @description Plot grouped proportional crosstables, where the proportion of
#'                each level of \code{x} for the highest category in \code{y}
#'                is plotted, for each subgroup of \code{grp}.
#'
#' @param x Categorical variable, where the proportion of each category in
#'            \code{x} for the highest category of \code{y} will be printed
#'            along the x-axis.
#' @param y Categorical or numeric variable. If not a binary variable, \code{y}
#'            will be recoded into a binary variable, dichtomized at the highest
#'            category and all remaining categories.
#' @param grp Grouping variable, which will define the y-axis
#' @param shape.fill.color Optional color vector, fill-color for non-filled shapes
#' @param shapes Numeric vector with shape styles, used to map the different
#'          categories of \code{x}.
#' @param show.total Logical, if \code{TRUE}, a total summary line for all aggregated
#'          \code{grp} is added.
#' @param annotate.total Logical, if \code{TRUE} and \code{show.total = TRUE},
#'          the total-row in the figure will be highlighted with a slightly
#'          shaded background.
#' @param axis.lim Numeric vector of length 2, defining the range of the plot axis.
#'          Depending on plot type, may effect either x- or y-axis, or both.
#'          For multiple plot outputs (e.g., from \code{type = "eff"} or
#'          \code{type = "slope"} in \code{\link{plot_model}}), \code{axis.lim} may
#'          also be a list of vectors of length 2, defining axis limits for each
#'          plot (only if non-faceted).
#' @param show.p Logical, adds significance levels to values, or value and
#'          variable labels.
#'
#' @return A ggplot-object.
#'
#' @inheritParams plot_scatter
#' @inheritParams plot_grpfrq
#' @inheritParams plot_xtab
#'
#' @details The p-values are based on \code{\link[stats]{chisq.test}} of \code{x}
#'            and \code{y} for each \code{grp}.
#'
#' @examples
#' if (requireNamespace("haven")) {
#'   data(efc)
#'
#'   # the proportion of dependency levels in female
#'   # elderly, for each family carer's relationship
#'   # to elderly
#'   plot_gpt(efc, e42dep, e16sex, e15relat)
#'
#'   # proportion of educational levels in highest
#'   # dependency category of elderly, for different
#'   # care levels
#'   plot_gpt(efc, c172code, e42dep, n4pstu)
#' }
#' @export
plot_gpt <- function(
  data,
  x,
  y,
  grp,
  colors = "metro",
  geom.size = 2.5,
  shape.fill.color = "#f0f0f0",
  shapes = c(15, 16, 17, 18, 21, 22, 23, 24, 25, 7, 8, 9, 10, 12),
  title = NULL,
  axis.labels = NULL,
  axis.titles = NULL,
  legend.title = NULL,
  legend.labels = NULL,
  wrap.title = 50,
  wrap.labels = 15,
  wrap.legend.title = 20,
  wrap.legend.labels = 20,
  axis.lim = NULL,
  grid.breaks = NULL,
  show.total = TRUE,
  annotate.total = TRUE,
  show.p = TRUE,
  show.n = TRUE)
{

  # get data

  name.x <- deparse(substitute(x))
  name.y <- deparse(substitute(y))
  name.grp <- deparse(substitute(grp))

  pl <- NULL

  if (inherits(data, "grouped_df")) {
    # get grouped data
    grps <- get_grouped_data(data)

    # now plot everything
    for (i in seq_len(nrow(grps))) {
      # copy back labels to grouped data frame
      tmp <- sjlabelled::copy_labels(grps$data[[i]], data)

      # prepare argument list, including title
      tmp.title <- get_grouped_plottitle(data, grps, i, sep = "\n")

      # copy data

      x <- tmp[[name.x]]
      y <- tmp[[name.y]]
      grp <- tmp[[name.grp]]

      # plot

      plots <- gpt_helper(
        x,
        y,
        grp,
        colors,
        geom.size,
        shape.fill.color,
        shapes,
        title = tmp.title,
        axis.labels,
        axis.titles,
        legend.title,
        legend.labels,
        wrap.title,
        wrap.labels,
        wrap.legend.title,
        wrap.legend.labels,
        axis.lim,
        grid.breaks,
        show.total,
        annotate.total,
        show.p,
        show.n,
        name.x,
        name.y,
        name.grp
      )

      # add plots, check for NULL results
      pl <- c(pl, list(plots))
    }
  } else {
    # copy data
    x <- data[[name.x]]
    y <- data[[name.y]]
    grp <- data[[name.grp]]

    # plot

    pl <- gpt_helper(
      x,
      y,
      grp,
      colors,
      geom.size,
      shape.fill.color,
      shapes,
      title,
      axis.labels,
      axis.titles,
      legend.title,
      legend.labels,
      wrap.title,
      wrap.labels,
      wrap.legend.title,
      wrap.legend.labels,
      axis.lim,
      grid.breaks,
      show.total,
      annotate.total,
      show.p,
      show.n,
      name.x,
      name.y,
      name.grp
    )
  }

  pl
}


gpt_helper <- function(
  x,
  y,
  grp,
  colors,
  geom.size,
  shape.fill.color,
  shapes,
  title,
  axis.labels,
  axis.titles,
  legend.title,
  legend.labels,
  wrap.title,
  wrap.labels,
  wrap.legend.title,
  wrap.legend.labels,
  axis.lim,
  grid.breaks,
  show.total,
  annotate.total,
  show.p,
  show.n,
  name.x,
  name.y,
  name.grp
) {
  # any missing names?

  if (is.null(name.x) || name.x == "NULL") {
    name.x <- ""
  }
  if (is.null(name.y) || name.y == "NULL") {
    name.y <- ""
  }
  if (is.null(name.grp) || name.grp == "NULL") {
    name.grp <- ""
  }

  # copy titles
  if (is.null(axis.titles)) {
    axisTitle.x <- NULL
    axisTitle.y <- NULL
  } else {
    axisTitle.x <- axis.titles[1]
    if (length(axis.titles) > 1) {
      axisTitle.y <- axis.titles[2]
    } else {
      axisTitle.y <- NULL
    }
  }

  # try to automatically set labels if not passed as argument
  x <- suppressMessages(sjmisc::to_factor(x))
  ylabels <-
    sjlabelled::get_labels(
      y,
      attr.only = FALSE,
      values = NULL,
      non.labelled = TRUE
    )

  # get only value label for hightest category
  ylabels <- ylabels[length(ylabels)]
  if (is.null(axis.labels)) {
    axis.labels <-
      sjlabelled::get_labels(
        grp,
        attr.only = FALSE,
        values = NULL,
        non.labelled = TRUE
      )
  }

  if (is.null(axisTitle.y)) {
    axisTitle.y <-
      paste0(
        "Proportion of ",
        sjlabelled::get_label(x, def.value = name.x),
        " in ",
        sjlabelled::get_label(y, def.value = name.y),
        " (",
        ylabels,
        ")"
      )
  }

  if (is.null(legend.title)) {
    legend.title <- sjlabelled::get_label(x, def.value = name.x)
  }

  if (is.null(legend.labels)) {
    legend.labels <-
      sjlabelled::get_labels(
        x,
        attr.only = FALSE,
        values = NULL,
        non.labelled = TRUE
      )
  }

  # set labels that are still missing, but which need values
  if (is.null(axis.labels)) {
    axis.labels <- as.character(seq_len(length(grp)))
  }

  # wrap titles and labels
  if (!is.null(legend.labels)) {
    legend.labels <- sjmisc::word_wrap(legend.labels, wrap.legend.labels)
  }
  if (!is.null(legend.title)) {
    legend.title <- sjmisc::word_wrap(legend.title, wrap.legend.title)
  }
  if (!is.null(title)) {
    title <- sjmisc::word_wrap(title, wrap.title)
  }
  if (!is.null(axisTitle.x)) {
    axisTitle.x <- sjmisc::word_wrap(axisTitle.x, wrap.title)
  }
  if (!is.null(axisTitle.y)) {
    axisTitle.y <- sjmisc::word_wrap(axisTitle.y, wrap.title)
  }
  if (!is.null(axis.labels)) {
    axis.labels <- sjmisc::word_wrap(axis.labels, wrap.labels)
  }

  # final data frame for plot
  newdf <- data.frame()
  group.p <- character()
  group.n <- character()

  # create data frame, for dplyr-chain
  mydf <-
    stats::na.omit(data.frame(
      grp = sjlabelled::as_numeric(grp, keep.labels = FALSE),
      xpos = x,
      dep = sjlabelled::as_numeric(y, keep.labels = FALSE)
    ))

  # recode dependent variable's categorues
  # max and all others, so we have proportion
  # between maximux value and rest
  mydf$dep <- sjmisc::rec(mydf$dep, rec = "max=1;else=0", append = FALSE)

  # group data by grouping variable, and inside
  # groups, group the x-variable

  newdf <- mydf |>
    dplyr::group_by(.data$grp, .data$xpos) |>
    dplyr::summarise(ypos = mean(.data$dep))

  # group data by grouping variable,
  # and summarize N per group and chisq.test
  # of grp and x within each group

  pvals <- mydf |>
    dplyr::group_by(.data$grp) |>
    dplyr::summarise(
      N = dplyr::n(),
      p = suppressWarnings(
        stats::chisq.test(table(.data$xpos, .data$dep))$p.value
      )
    )

  # copy p values
  for (i in seq_len(length(pvals$grp))) {
    group.p[i] <- get_p_stars(pvals$p[i])
  }

  # copy N
  for (i in seq_len(length(pvals$grp))) {
    group.n[i] <- prettyNum(pvals$N[i], big.mark = ",", scientific = FALSE)
  }

  # if we want total line, repeat all for
  # complete data frame
  if (show.total) {
    tmp <- mydf |>
      dplyr::group_by(.data$xpos) |>
      dplyr::summarise(ypos = mean(.data$dep))

    # pvalues and N
    pvals <- mydf |>
      dplyr::summarise(
        N = dplyr::n(),
        p = suppressWarnings(
          stats::chisq.test(table(.data$xpos, .data$dep))$p.value
        )
      )

    # bind total row to final df
    newdf <- dplyr::bind_rows(newdf, tmp)

    # copy p values
    group.p <- c(group.p, get_p_stars(pvals$p))
    # copy N
    group.n <- c(
      group.n,
      prettyNum(pvals$N, big.mark = ",", scientific = FALSE)
    )
    # add "total" to axis labels
    axis.labels <- c(axis.labels, "Total")
  }

  # make group variables categorical
  newdf$grp <- suppressMessages(sjmisc::to_factor(newdf$grp))
  newdf$xpos <- suppressMessages(sjmisc::to_factor(newdf$xpos))

  # proportion needs to be numeric
  newdf$ypos <- sjlabelled::as_numeric(newdf$ypos, keep.labels = FALSE)

  # add N and p-values to axis labels?
  if (show.n) {
    axis.labels <- paste0(axis.labels, " (n=", group.n, ")")
  }
  if (show.p) {
    axis.labels <- paste0(axis.labels, " ", group.p)
  }

  # Set up axis limits
  if (is.null(axis.lim)) {
    axis.lim <- c(0, max(pretty(max(newdf$ypos, na.rm = TRUE), 10)))
  }

  # Set up grid breaks
  if (is.null(grid.breaks)) {
    gridbreaks <- ggplot2::waiver()
  } else {
    gridbreaks <- seq(axis.lim[1], axis.lim[2], by = grid.breaks)
  }

  # Set up geom colors
  pal.len <- length(legend.labels)
  geom.colors <- col_check2(colors, pal.len)

  # Set up plot
  p <- ggplot2::ggplot(
    newdf,
    ggplot2::aes(
      x = rev(.data$grp),
      y = .data$ypos,
      colour = .data$xpos,
      shape = .data$xpos
    )
  ) +
    ggplot2::geom_point(size = geom.size, fill = shape.fill.color) +
    ggplot2::scale_y_continuous(
      labels = scales::percent,
      breaks = gridbreaks,
      limits = axis.lim
    ) +
    ggplot2::scale_x_discrete(labels = rev(axis.labels)) +
    ggplot2::scale_shape_manual(
      name = legend.title,
      labels = legend.labels,
      values = shapes[1:pal.len]
    ) +
    ggplot2::scale_colour_manual(
      name = legend.title,
      labels = legend.labels,
      values = geom.colors
    ) +
    ggplot2::labs(x = axisTitle.x, y = axisTitle.y, title = title) +
    ggplot2::coord_flip()

  # Annotate total line?
  if (show.total && annotate.total) {
    p <- p +
      ggplot2::annotate(
        "rect",
        xmin = 0.5,
        xmax = 1.5,
        ymin = -Inf,
        ymax = Inf,
        alpha = 0.15
      )
  }

  p
}