R/plot_pattern.R

Defines functions pat_to_chr plot_pattern

Documented in plot_pattern

#' Plot the missing data pattern of an incomplete dataset
#'
#' @param data An incomplete dataset of class `data.frame` or `matrix`.
#' @param vrb String, vector, or unquoted expression with variable name(s), default is "all".
#' @param square Logical indicating whether the plot tiles should be squares, defaults to squares to mimick `mice::md.pattern()`.
#' @param rotate Logical indicating whether the variable name labels should be rotated 90 degrees.
#' @param cluster Optional character string specifying which variable should be used for clustering (e.g., for multilevel data).
#' @param npat Optional numeric input specifying the number of missing data patterns to be visualized, defaults to all patterns.
#' @param caption Logical indicating whether the figure caption should be displayed.
#'
#' @return An object of class [ggplot2::ggplot].
#'
#' @examples
#' # plot missing data pattern for all columns
#' plot_pattern(mice::nhanes)
#'
#' # plot missing data pattern for specific columns by supplying a character vector
#' plot_pattern(mice::nhanes, c("chl", "hyp"))
#'
#' # plot missing data pattern for specific columns by supplying unquoted variable names
#' plot_pattern(mice::nhanes, c(chl, hyp))
#'
#' # plot missing data pattern for specific columns by passing an object with variable names
#' # from the environment, unquoted with `!!`
#' my_variables <- c("chl", "hyp")
#' plot_pattern(mice::nhanes, !!my_variables)
#' # object with variable names must be unquoted with `!!`
#' try(plot_pattern(mice::nhanes, my_variables))
#'
#' @export
plot_pattern <-
  function(data,
           vrb = "all",
           square = TRUE,
           rotate = FALSE,
           cluster = NULL,
           npat = NULL,
           caption = TRUE) {
    if (is.matrix(data) && ncol(data) > 1) {
      data <- as.data.frame(data)
    }
    verify_data(data, df = TRUE)
    vrb <- rlang::enexpr(vrb)
    vrb_matched <- match_vrb(vrb, names(data))
    if (length(vrb_matched) < 2) {
      cli::cli_abort("The number of variables should be two or more to compute missing data patterns.")
    }
    if (".x" %in% vrb_matched || ".y" %in% vrb_matched) {
      cli::cli_abort(
        c(
          "The variable names '.x' and '.y' are used internally to produce the missing data pattern plot.",
          "i" = "Please exclude or rename your variable(s)."
        )
      )
    }
    if (!is.null(cluster)) {
      if (cluster %nin% names(data[, vrb_matched])) {
        cli::cli_abort(
          c("Cluster variable not recognized.",
            "i" = "Please provide the variable name as a character string.")
        )
      }
    }
    if (!is.null(npat)) {
      if (!is.numeric(npat) || npat < 1) {
        cli::cli_abort(
          c("The minimum number of patterns to display is one.",
            "i" = "Please provide a positive integer.")
        )
      }
    }

    # get missing data pattern
    pat <- mice::md.pattern(data[, vrb_matched], plot = FALSE)
    rows_pat_full <-
      (nrow(pat) - 1) # full number of missing data patterns


    # filter npat most frequent patterns
    if (!is.null(npat)) {
      if (npat < rows_pat_full) {
        top_n_pat <-
          sort(as.numeric(row.names(pat)), decreasing = TRUE)[1:npat]
        pat <-
          pat[rownames(pat) %in% c(top_n_pat, ""), , drop = FALSE]

        if (npat != (nrow(pat) - 1)) {
          # if npat != number of missing patterns
          # show number of requested, shown, and hidden missing data patterns
          cli::cli_inform(
            c(
              "i" = "{npat} missing data patterns were requested.",
              "i" = "{nrow(pat) - 1} missing data patterns are shown.",
              "i" = "{rows_pat_full - (nrow(pat)-1)} missing data patterns are hidden."
            )
          )
        }
      } else {
        cli::cli_warn(
          c(
            "Number of patterns specified is equal to or greater than the total number of patterns.",
            "i" = "All missing data patterns are shown."
          )
        )
      }
    }

    # extract pattern info
    rws <- nrow(pat)
    cls <- ncol(pat)
    vrb <- colnames(pat)[-cls]
    frq <- row.names(pat)[-rws]
    na_row <- pat[-rws, cls]
    na_col <- pat[rws, -cls]

    # add opacity for clustering
    if (is.null(cluster)) {
      pat_clean <- cbind(.opacity = 1, pat[-rws, vrb, drop = FALSE])
    } else {
      pats <- purrr::map(split(data[, vrb], ~ get(cluster)), ~ {
        mice::md.pattern(., plot = FALSE) %>%
          pat_to_chr(., ord = vrb)
      })
      pat_used <- purrr::map_dfr(pats, ~ {
        pat_to_chr(pat) %in% .x
      }) %>%
        rowMeans()
      pat_clean <- data.frame(.opacity = pat_used, pat[-rws, vrb])
    }

    # tidy the pattern
    long <-
      as.data.frame(cbind(.y = 1:(rws - 1), pat_clean)) %>%
      tidyr::pivot_longer(
        cols = tidyselect::all_of(vrb),
        names_to = "x",
        values_to = ".where"
      ) %>%
      dplyr::mutate(
        .x = as.numeric(factor(
          .data$x,
          levels = vrb, ordered = TRUE
        )),
        .where = factor(
          .data$.where,
          levels = c(0, 1),
          labels = c("missing", "observed")
        ),
        .opacity = as.numeric(.data$.opacity)
      )

    # create the plot
    gg <-
      ggplot2::ggplot(
        long,
        ggplot2::aes(
          .data$.x,
          .data$.y,
          fill = .data$.where,
          alpha = 0.1 + .data$.opacity / 2
        )
      ) +
      ggplot2::geom_tile(color = "black") +
      ggplot2::scale_fill_manual(values = c(
        "observed" = "#006CC2B3",
        "missing" = "#B61A51B3"
      )) +
      ggplot2::scale_alpha_continuous(limits = c(0, 1), guide = "none") +
      ggplot2::scale_x_continuous(
        breaks = 1:(cls - 1),
        labels = na_col,
        sec.axis = ggplot2::dup_axis(labels = vrb,
                                     name = "Column name")
      ) +
      ggplot2::scale_y_reverse(
        breaks = 1:(rws - 1),
        labels = frq,
        sec.axis = ggplot2::dup_axis(labels = na_row,
                                     name = "Number of missing entries\nper pattern")
      ) +
      ggplot2::labs(
        x = "Number of missing entries\nper column",
        y = "Pattern frequency",
        fill = "",
        alpha = ""
      ) +
      theme_minimice()

    # additional arguments
    if (square) {
      gg <- gg + ggplot2::coord_fixed(expand = FALSE)
    } else {
      gg <- gg + ggplot2::coord_cartesian(expand = FALSE)
    }
    if (rotate) {
      gg <-
        gg + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90))
    }
    if (caption) {
      if (!is.null(npat) && npat < rows_pat_full) {
        gg <- gg +
          ggplot2::labs(
            x = "Number of missing entries\nper column*",
            y = "Pattern frequency**",
            caption = paste0(
              "*total number of missing entries: ",
              pat[rws, cls],
              "\n**number of patterns shown: ",
              (nrow(pat) - 1),
              " out of ",
              rows_pat_full,
              ""
            )
          )
      } else {
        gg <- gg +
          ggplot2::labs(x = "Number of missing entries\nper column*",
                        caption = paste0("*total number of missing entries: ",
                                         pat[rws, cls]))
      }
    }

    return(gg)
  }

#' Utils function to process missing data pattern
#'
#' @param pat Numeric matrix with a missing data pattern.
#' @param ord Vector with variable names.
#' @keywords internal
#' @noRd
pat_to_chr <- function(pat, ord = NULL) {
  if (is.null(ord)) {
    ord <- colnames(pat)[-ncol(pat)]
  }
  apply(pat[-nrow(pat), ord], 1, function(x) {
    paste(as.numeric(x), collapse = "")
  })
}
amices/ggmice documentation built on July 28, 2024, 5:27 p.m.