# R/vis_miss.R

#' Visualise a data.frame to display missingness.
#'
#' `vis_miss` provides an at-a-glance ggplot of the missingness inside a
#'   dataframe, colouring cells according to missingness, where dark grey
#'   indicates a missing cell and light grey indicates a present cell. As it
#'   returns a ggplot object, it is very easy to customize and change labels.
#'
#' @param x a data.frame
#'
#' @param cluster logical. TRUE specifies that you want to use hierarchical
#'   clustering (mcquitty method) to arrange rows according to missingness.
#'   FALSE specifies that you want to leave it as is. Data with fewer than two
#'   rows cannot be clustered and is left in its original order.
#'   Default value is FALSE.
#'
#' @param sort_miss logical. TRUE arranges the columns in order of missingness
#'   (most missing first). Default value is FALSE.
#'
#' @param show_perc logical. TRUE adds the \% of missing/complete data
#'   in the whole dataset into the legend. Default value is TRUE.
#'
#' @param show_perc_col logical. TRUE adds the \% missing data in a given
#'   column into the x axis. Can be disabled with FALSE. Default value is TRUE.
#'
#' @param warn_large_data logical. If TRUE, an error is raised when the data
#'   has more than `large_data_size` cells. Set to FALSE to plot the data
#'   anyway. Default is TRUE. See note for more details.
#'
#' @param large_data_size integer. The number of cells
#'   (`nrow(x) * ncol(x)`) above which the data is considered too large to
#'   plot. Default is 900000. This can be changed. See note for more details.
#'
#' @return `ggplot2` object displaying the position of missing values in the
#'   dataframe, and the percentage of values missing and present.
#'
#' @seealso [vis_dat()] [vis_guess()] [vis_expect()] [vis_cor()] [vis_compare()]
#'
#' @note Some datasets might be too large to plot, sometimes creating a blank
#'   plot - if this happens, I would recommend downsampling the data, either
#'   looking at the first 1,000 rows or by taking a random sample. This means
#'   that you won't get the same "look" at the data, but it is better than
#'   a blank plot! See example code for suggestions on doing this.
#'
#' @examples
#'
#' vis_miss(airquality)
#'
#' \dontrun{
#' vis_miss(airquality, cluster = TRUE)
#'
#' vis_miss(airquality, sort_miss = TRUE)
#'
#' # if you have a large dataset, you might want to try downsampling:
#' library(nycflights13)
#' library(dplyr)
#' flights %>%
#'   sample_n(1000) %>%
#'   vis_miss()
#'
#' flights %>%
#'   slice(1:1000) %>%
#'   vis_miss()
#'
#' }
#'
#' @export
vis_miss <- function(x,
                     cluster = FALSE,
                     sort_miss = FALSE,
                     show_perc = TRUE,
                     show_perc_col = TRUE,
                     large_data_size = 900000,
                     warn_large_data = TRUE){

  # throw error if x not data.frame
  test_if_dataframe(x)

  # refuse to plot data with too many cells, unless the check is disabled
  if (ncol(x) * nrow(x) > large_data_size && warn_large_data) {
    stop("Data exceeds recommended size for visualisation, please consider
         downsampling your data, or set argument 'warn_large_data' to FALSE.")
  }

  # make a TRUE/FALSE matrix of the data.
  # This tells us whether it is missing (true) or not (false)
  x.na <- is.na(x)

  # row order -----------------------------------------------------------------

  # hierarchical clustering needs at least two rows; with fewer there is
  # nothing to reorder, so the original order is kept
  if (cluster && nrow(x) > 1) {

    # this retrieves a row order of the clustered missingness
    row_order_index <-
      stats::dist(x.na * 1) %>%
      stats::hclust(method = "mcquitty") %>%
      stats::as.dendrogram() %>%
      stats::order.dendrogram()

  } else {
    row_order_index <- seq_len(nrow(x))
  }

  # column order --------------------------------------------------------------

  if (sort_miss) {

    # arrange by the columns with the highest missingness
    # code inspired from https://r-forge.r-project.org/scm/viewvc.php/ ...
    # pkg/R/missing.pattern.plot.R?view=markup&root=mi-dev
    na_sort <- order(colSums(x.na), decreasing = TRUE)

    # get the names of those columns
    col_order_index <- names(x)[na_sort]

  } else {
    col_order_index <- names(x)
  }

  # gather the variables together for plotting --------------------------------

  # Arrange the rows by the (possibly clustered) row order.
  # `drop = FALSE` keeps the result a matrix even when there is only one row
  # or one column; otherwise it collapses to a vector and the data.frame
  # built from it loses its shape and its column names.
  dat_pre_vis <- as.data.frame(x.na[row_order_index, , drop = FALSE])

  # here we now have a column of the row number (row),
  # then the variable (variables),
  # then the contents of that variable (value)
  d <- dat_pre_vis %>%
    vis_gather_() %>%
    # add info for plotly mouseover
    dplyr::mutate(value = vis_extract_value_(dat_pre_vis))

  # calculate the overall % missingness to display in legend -------------------

  if (show_perc) {

    temp <- miss_guide_label(x)
    p_miss_lab <- temp$p_miss_lab
    p_pres_lab <- temp$p_pres_lab

  } else {

    p_miss_lab <- "Missing"
    p_pres_lab <- "Present"

  }

  # then we plot it -----------------------------------------------------------

  vis_miss_plot <- vis_create_(d) +
    ggplot2::scale_fill_manual(name = "",
                               values = c("grey80",
                                          "grey20"),
                               labels = c(p_pres_lab,
                                          p_miss_lab)) +
    ggplot2::guides(fill = ggplot2::guide_legend(reverse = TRUE)) +
    ggplot2::theme(legend.position = "bottom") +
    # fix up the location of the text
    ggplot2::theme(axis.text.x = ggplot2::element_text(hjust = 0))

  # add the missingness column labels -----------------------------------------

  # Thanks to
  # http://www.markhneedham.com/blog/2015/02/27/rggplot-controlling-x-axis-order/
  # for the tip on using scale_x_discrete to control the column order.
  #
  # If there is only one column there is nothing to order, so the axis is only
  # relabelled and no limits are set.
  # Related issue: https://github.com/ropensci/visdat/issues/72
  single_column <- ncol(x) == 1

  x_scale <- if (show_perc_col) {

    col_labels <- label_col_missing_pct(x, col_order_index)

    if (single_column) {
      ggplot2::scale_x_discrete(position = "top",
                                labels = col_labels)
    } else {
      ggplot2::scale_x_discrete(position = "top",
                                limits = col_order_index,
                                labels = col_labels)
    }

  } else if (single_column) {
    ggplot2::scale_x_discrete(position = "top",
                              labels = col_order_index)
  } else {
    ggplot2::scale_x_discrete(position = "top",
                              limits = col_order_index)
  }

  vis_miss_plot + x_scale

} # end of function
# jixing475/tableOne documentation built on June 17, 2019, 1:05 a.m.