#' Visualise a data.frame to display missingness.
#'
#' `vis_miss` provides an at-a-glance ggplot of the missingness inside a
#' dataframe, colouring cells according to missingness, where black indicates
#' a missing cell and grey indicates a present cell. As it returns a ggplot
#' object, it is very easy to customize and change labels.
#'
#' The missingness summaries in the columns are rounded to the nearest integer.
#' For more detailed summaries, please see the summaries in the `naniar` R
#' package, specifically, `naniar::miss_var_summary()`.
#'
#' @param x a data.frame
#'
#' @param cluster logical. TRUE specifies that you want to use hierarchical
#' clustering (mcquitty method) to arrange rows according to missingness.
#' FALSE specifies that you want to leave it as is. Default value is FALSE.
#'
#' @param sort_miss logical. TRUE arranges the columns in order of missingness.
#' Default value is FALSE.
#'
#' @param show_perc logical. TRUE now adds in the \% of missing/complete data
#' in the whole dataset into the legend. Default value is TRUE.
#'
#' @param show_perc_col logical. TRUE adds in the \% missing data in a given
#' column into the x axis. Can be disabled with FALSE. Default value is TRUE.
#' No missingness percentage column information will be presented when `facet`
#' argument is used. Please see the `naniar` package to provide missingness
#' summaries over groups.
#'
#' @param warn_large_data logical - warn if there is large data? Default is
#' TRUE. See note for more details.
#'
#' @param large_data_size integer default is 900000 (given by
#' `nrow(data.frame) * ncol(data.frame)`). This can be changed. See
#' note for more details.
#'
#' @param facet (optional) bare variable name, if you want to create a faceted
#' plot, with one plot per level of the variable. No missingness percentage
#' column information will be presented when `facet` argument is used. Please
#' see the `naniar` package to provide missingness summaries over groups.
#'
#' @return `ggplot2` object displaying the position of missing values in the
#' dataframe, and the percentage of values missing and present.
#'
#' @seealso [vis_dat()] [vis_guess()] [vis_expect()] [vis_cor()] [vis_compare()]
#'
#' @note Some datasets might be too large to plot, sometimes creating a blank
#' plot - if this happens, I would recommend downsampling the data, either
#' looking at the first 1,000 rows or by taking a random sample. This means
#' that you won't get the same "look" at the data, but it is better than
#' a blank plot! See example code for suggestions on doing this.
#'
#' @examples
#'
#' vis_miss(airquality)
#'
#' vis_miss(airquality, cluster = TRUE)
#'
#' vis_miss(airquality, sort_miss = TRUE)
#'
#' vis_miss(airquality, facet = Month)
#'
#' \dontrun{
#' # if you have a large dataset, you might want to try downsampling:
#' library(nycflights13)
#' library(dplyr)
#' flights %>%
#' sample_n(1000) %>%
#' vis_miss()
#'
#' flights %>%
#' slice(1:1000) %>%
#' vis_miss()
#' }
#'
#' @export
vis_miss <- function(
  x,
  cluster = FALSE,
  sort_miss = FALSE,
  show_perc = TRUE,
  show_perc_col = TRUE,
  large_data_size = 900000,
  warn_large_data = TRUE,
  facet
) {
  # Validate the input before doing any work.
  test_if_dataframe(x)
  test_if_large_data(x, large_data_size, warn_large_data)

  # `facet` has no default, so it counts as supplied exactly when it is not
  # missing. Computed once and reused below.
  has_facet <- !missing(facet)

  # Order of the columns along the x axis: either the order reported by
  # `n_miss_col(sort = TRUE)` or the data's own column order.
  if (sort_miss) {
    col_order_index <- names(n_miss_col(x, sort = TRUE))
  } else {
    col_order_index <- names(x)
  }

  # Build the plotting data, grouped by the facet variable when one is given.
  if (has_facet) {
    vis_miss_data <- x %>%
      dplyr::group_by({{ facet }}) %>%
      data_vis_miss(cluster)
    # `facet` and this function's environment are handed over as-is so the
    # helper can resolve the bare variable name.
    # NOTE(review): presumably this drops the facet variable from the x-axis
    # ordering - confirm against `update_col_order_index()`.
    col_order_index <- update_col_order_index(
      col_order_index,
      facet,
      environment()
    )
  } else {
    vis_miss_data <- data_vis_miss(x, cluster)
  }

  # calculate the overall % missingness to display in legend -------------------
  # make a TRUE/FALSE matrix of the data.
  # This tells us whether it is missing (true) or not (false)
  x_fingerprinted <- fingerprint_df(x)

  # Legend labels: with percentages when `show_perc` is TRUE, plain otherwise.
  if (show_perc) {
    guide_labels <- miss_guide_label(x_fingerprinted)
    p_miss_lab <- guide_labels$p_miss_lab
    p_pres_lab <- guide_labels$p_pres_lab
  } else {
    p_miss_lab <- "Missing"
    p_pres_lab <- "Present"
  }

  # then we plot it
  vis_miss_plot <- vis_create_(vis_miss_data) +
    ggplot2::scale_fill_manual(
      name = "",
      values = c(
        "grey80",
        "grey20"
      ),
      labels = c(
        p_pres_lab,
        p_miss_lab
      )
    ) +
    ggplot2::guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
    ggplot2::theme(legend.position = "bottom") +
    # fix up the location of the text
    ggplot2::theme(axis.text.x = ggplot2::element_text(hjust = 0))

  # add the missingness column labels
  # if there is only one column you don't need to sort the columns
  # this is perhaps a bit of a hacky way around, but I can't see another
  # way around it. Related issue: https://github.com/ropensci/visdat/issues/72
  if (ncol(x) == 1) {
    if (show_perc_col) {
      single_col_labels <- label_col_missing_pct(
        x_fingerprinted,
        col_order_index
      )
    } else {
      single_col_labels <- col_order_index
    }
    vis_miss_plot <- vis_miss_plot +
      ggplot2::scale_x_discrete(
        position = "top",
        labels = single_col_labels
      )
    # Return the plot object itself rather than `return(p <- ...)`: an
    # assignment inside `return()` yields an invisible value, which stops the
    # plot from auto-printing at the console.
    return(vis_miss_plot)
  }

  if (has_facet) {
    vis_miss_plot <- vis_miss_plot +
      ggplot2::facet_wrap(facets = dplyr::vars({{ facet }}))
  }

  # Thanks to
  # http://www.markhneedham.com/blog/2015/02/27/rggplot-controlling-x-axis-order/
  # For the tip on using scale_x_discrete
  if (show_perc_col && !has_facet) {
    # flip the axes, add the info about limits
    vis_miss_plot <- vis_miss_plot +
      ggplot2::scale_x_discrete(
        position = "top",
        limits = col_order_index,
        labels = label_col_missing_pct(
          x_fingerprinted,
          col_order_index
        )
      )
  } else {
    # Per-column percentages are not shown when faceting or when disabled;
    # only the column order is enforced.
    vis_miss_plot <- vis_miss_plot +
      ggplot2::scale_x_discrete(
        position = "top",
        limits = col_order_index
      )
  }

  vis_miss_plot
} # end of function
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.