R/miss-x-summary.R

Defines functions miss_summary miss_case_summary.grouped_df miss_case_summary.default miss_case_summary miss_var_summary.grouped_df miss_var_summary.default miss_var_summary

Documented in miss_case_summary miss_summary miss_var_summary

#' Summarise the missingness in each variable
#'
#' Provide a summary for each variable of the number and percent of missing
#'   values and, optionally, the cumulative sum of missings taken over the
#'   variables in the order they appear in the data. By default, the result
#'   is ordered by the number of missings in each variable, most first.
#'
#' @param data a data.frame
#' @param order a logical indicating whether to order the result by `n_miss`.
#'     Defaults to TRUE. If FALSE, order of variables is the order input.
#' @param add_cumsum logical indicating whether or not to add the cumulative
#'   sum of missings to the data. This can be useful when exploring patterns
#'   of nonresponse. These are calculated as the cumulative sum of the missings
#'   in the variables as they are first presented to the function.
#' @param ... extra arguments
#'
#' @note `n_miss_cumsum` is calculated as the cumulative sum of missings in the
#'     variables in the order that they are given in the data when entering
#'     the function
#'
#' @return a tibble of the number and percent of missing data in each variable
#'
#' @seealso  [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @examples
#'
#' miss_var_summary(airquality)
#' miss_var_summary(oceanbuoys, order = TRUE)
#'
#' \dontrun{
#' # works with group_by from dplyr
#' library(dplyr)
#' airquality %>%
#'   group_by(Month) %>%
#'   miss_var_summary()
#' }
#' @export
miss_var_summary <- function(data,
                             order = TRUE,
                             add_cumsum = FALSE,
                             ...) {

  # Input checks run before dispatch; both helpers are defined elsewhere in
  # the package.
  test_if_null(data)
  test_if_dataframe(data)

  # `order = TRUE` above mirrors the default of every method. On S3 dispatch
  # an unsupplied argument takes the *method's* default, so this keeps the
  # documented usage in line with what actually runs.
  UseMethod("miss_var_summary")
}

# Default method: per-variable missingness summary of a data frame.
#
# Returns a tibble with one row per column of `data` and the columns
# `variable` (column name), `n_miss` (integer count of NA values) and
# `pct_miss` (percentage of NA values, on a 0-100 scale). When
# `add_cumsum = TRUE` an `n_miss_cumsum` column is appended; it is computed
# in the original column order, before any reordering. When `order = TRUE`
# rows are sorted by decreasing `n_miss`.
#' @export
miss_var_summary.default <- function(data,
                                     order = TRUE,
                                     add_cumsum = FALSE,
                                     ...) {

  # Build the NA indicator matrix once and reuse it for count and percent.
  na_flags <- is.na(data)

  col_n_miss <- colSums(na_flags)

  # With zero columns `names()` is NULL, and tibble() silently drops NULL
  # inputs; fall back to an empty character vector so the `variable` column
  # is always present.
  variable <- names(col_n_miss)
  if (is.null(variable)) {
    variable <- character(length(col_n_miss))
  }

  res <- tibble::tibble(variable = variable,
                        n_miss = as.integer(col_n_miss),
                        pct_miss = as.numeric(colMeans(na_flags) * 100))

  if (add_cumsum) {
    # Cumulative sum follows the input column order (see ordering below).
    res[["n_miss_cumsum"]] <- cumsum(res[["n_miss"]])
  }

  if (order) {
    res <- dplyr::arrange(res, dplyr::desc(n_miss))
  }

  res
}

#' @export
miss_var_summary.grouped_df <- function(data,
                                        order = TRUE,
                                        add_cumsum = FALSE,
                                        ...) {

  # Method for grouped data frames: hands `data` to `group_by_fun()` (defined
  # elsewhere in the package) with `miss_var_summary` as the function to
  # apply, passing `order` and `add_cumsum` through unchanged.
  # Presumably the summary is computed once per group -- confirm against
  # `group_by_fun()`.
  # NOTE(review): anything supplied through `...` is not forwarded.
  group_by_fun(data,
               .fun = miss_var_summary,
               order = order,
               add_cumsum = add_cumsum)

}

#' Summarise the missingness in each case
#'
#' Provide a summary for each case (row) in the data of the number and percent
#'     of missing values and, optionally, the cumulative sum of missings taken
#'     over the cases in the order they appear in the data. By default, the
#'     result is ordered by the number of missings in each case, most first.
#'
#' @param data a data.frame
#' @param order a logical indicating whether or not to order the result by
#'     n_miss. Defaults to TRUE. If FALSE, order of cases is the order input.
#' @param ... extra arguments
#' @param add_cumsum logical indicating whether or not to add the cumulative
#'   sum of missings to the data. This can be useful when exploring patterns
#'   of nonresponse. These are calculated as the cumulative sum of the missings
#'   in the cases as they are first presented to the function.
#'
#' @return a tibble of the number and percent of missing data in each case.
#'
#' @seealso  [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @export
#'
#' @examples
#'
#' miss_case_summary(airquality)
#'
#' \dontrun{
#' # works with group_by from dplyr
#' library(dplyr)
#' airquality %>%
#'   group_by(Month) %>%
#'   miss_case_summary()
#' }
#'
miss_case_summary <- function(data,
                              order = TRUE,
                              add_cumsum = FALSE,
                              ...){

  # Input checks run before dispatch; both helpers are defined elsewhere in
  # the package.
  test_if_null(data)

  test_if_dataframe(data)

  # S3 dispatch on the class of `data`.
  UseMethod("miss_case_summary")
}

# Default method: per-case (row) missingness summary of a data frame.
#
# Returns a tibble with one row per row of `data` and the columns `case`
# (row position), `n_miss` (integer count of NA values in that row) and
# `pct_miss` (percentage of NA values in that row, on a 0-100 scale). When
# `add_cumsum = TRUE` an `n_miss_cumsum` column is appended; it is computed
# in the original row order, before any reordering. When `order = TRUE` rows
# are sorted by decreasing `n_miss`.
#' @export
miss_case_summary.default <- function(data,
                                      order = TRUE,
                                      add_cumsum = FALSE,
                                      ...){

  # Take the NA indicator matrix from the untouched input, once. Deriving
  # both summaries from the same matrix keeps `n_miss` and `pct_miss`
  # consistent even when `data` already has a column named `pct_miss`,
  # `n_miss` or `case`, and stops a NaN percentage (zero-column input) from
  # being counted as a missing value.
  na_flags <- is.na(data)

  # as.numeric() drops any row names carried over by rowMeans().
  res <- tibble::tibble(case = seq_len(nrow(data)),
                        n_miss = as.integer(rowSums(na_flags)),
                        pct_miss = as.numeric(rowMeans(na_flags) * 100))

  if (add_cumsum) {
    # Cumulative sum follows the input row order (see ordering below).
    res[["n_miss_cumsum"]] <- cumsum(res[["n_miss"]])
  }

  if (order) {
    res <- dplyr::arrange(res, dplyr::desc(n_miss))
  }

  res
}

#' @export
miss_case_summary.grouped_df <- function(data,
                                         order = TRUE,
                                         add_cumsum = FALSE,
                                         ...){

  # Method for grouped data frames: hands `data` to `group_by_fun()` (defined
  # elsewhere in the package) with `miss_case_summary` as the function to
  # apply, passing `order` and `add_cumsum` through unchanged.
  # Presumably the summary is computed once per group -- confirm against
  # `group_by_fun()`.
  # NOTE(review): anything supplied through `...` is not forwarded.
  group_by_fun(data,
               .fun = miss_case_summary,
               order = order,
               add_cumsum = add_cumsum)

}

#' Collate summary measures from naniar into one tibble
#'
#' `miss_summary` runs each of the missing data helper summaries on `data` and
#'   gathers the results into a single tibble, storing the tabular summaries
#'   as list-columns.
#'
#' @param data a dataframe
#' @param order whether or not to order the result by n_miss. Passed on to
#'   `miss_var_summary()` and `miss_case_summary()`.
#'
#' @return a tibble of missing data summaries
#'
#' @seealso  [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @export
#'
#' @examples
#'
#' s_miss <- miss_summary(airquality)
#' s_miss$miss_df_prop
#' s_miss$miss_case_table
#' s_miss$miss_var_summary
#' # etc, etc, etc.
#'
#' \dontrun{
#' library(dplyr)
#' s_miss_group <- group_by(airquality, Month) %>% miss_summary()
#' s_miss_group$miss_df_prop
#' s_miss_group$miss_case_table
#' # etc, etc, etc.
#' }
#'
miss_summary <- function(data, order = TRUE){

  # Input checks; both helpers are defined elsewhere in the package.
  test_if_null(data)
  test_if_dataframe(data)

  # Proportion-style summaries go into the result as plain columns.
  df_prop <- prop_miss(data)
  var_prop <- prop_miss_var(data)
  case_prop <- prop_miss_case(data)

  # Tabular summaries are each wrapped in a list so that they become
  # list-columns of the result.
  case_table <- list(miss_case_table(data))
  var_table <- list(miss_var_table(data))
  var_summary <- list(miss_var_summary(data, order))
  case_summary <- list(miss_case_summary(data, order))

  tibble::tibble(
    miss_df_prop = df_prop,
    miss_var_prop = var_prop,
    miss_case_prop = case_prop,
    miss_case_table = case_table,
    miss_var_table = var_table,
    miss_var_summary = var_summary,
    miss_case_summary = case_summary
  )
}

Try the naniar package in your browser

Any scripts or data that you put into this service are public.

naniar documentation built on Feb. 16, 2023, 5:11 p.m.