R/cb_summary_stats_time.R

Defines functions cb_summary_stats_time

Documented in cb_summary_stats_time

#' Compute Summary Statistics for Date or Time Variables
#'
#' @param df Data frame of interest
#' @param .x Column of interest
#' @param digits Number of digits after decimal to display
#'
#' @return A tibble
#' @family add_summary_stats
#' @importFrom dplyr %>%
#' @keywords internal
cb_summary_stats_time <- function(df, .x, digits = 2) {

  # ===========================================================================
  # Prevents R CMD check: "no visible binding for global variable ‘.’"
  # ===========================================================================
  Value = Frequency = Percentage = n = Statistic = .data = NULL

  # ===========================================================================
  # All NA check
  # If all values of .x are NA, then populate the table in an informative way.
  # Else calculate the min, mode, and max of non-missing values.
  # ===========================================================================
  unqiue_nonmissing_vals <- unique(df[[.x]])[!is.na(unique(df[[.x]]))]
  if (length(unqiue_nonmissing_vals) == 0) {
    summary <- data.frame(
      Statistic = c("Minimum", "Mode", "Maximum"),
      Value = "All values are missing",
      Frequency = length(df[[.x]]),
      Percentage = 100.00
    )

  } else {

    # ===========================================================================
    # Get the minimum value, and the number and percentage of times that value occurs
    # ===========================================================================
    min <- df %>%
      dplyr::summarise(
        Statistic  = "Minimum",
        Value      = min(.data[[.x]], na.rm = TRUE),
        Frequency  = (.data[[.x]] == Value) %>% sum(na.rm = TRUE),
        Percentage = Frequency / nrow(df) * 100
      ) %>%
      # Format output
      dplyr::mutate(
        Frequency  = format(Frequency, big.mark = ","),
        Percentage = round(Percentage, digits = digits),
        Percentage = format(Percentage, nsmal = digits)
      ) %>%
      dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

    # ===========================================================================
    # Get the mode value(s), and the number and percentage of times that value occurs
    # ===========================================================================
    counts <- df %>%
      # Filter out missing so that `NA` isn't returned as the mode value.
      dplyr::filter(!is.na(.data[[.x]])) %>%
      dplyr::count(.data[[.x]]) %>%
      dplyr::pull(n)

    # ===========================================================================
    # If all values appear the same number of times, we don't want them all listed out
    # ===========================================================================
    if (length(unique(counts)) == 1) {
      mode <- tibble::tibble(
        Statistic  = "Mode",
        Value      = paste("All", format(length(counts), big.mark = ","), "values"),
        Frequency  = unique(counts),
        Percentage = Frequency / nrow(df) * 100
      ) %>%
        # Format output
        dplyr::mutate(
          Frequency  = format(Frequency, big.mark = ","),
          Percentage = round(Percentage, digits = digits),
          Percentage = format(Percentage, nsmal = digits)
        )%>%
        dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

    } else {
      mode <- df %>%
        dplyr::filter(!is.na(.data[[.x]])) %>%
        dplyr::count(.data[[.x]], name = "Frequency") %>%
        dplyr::filter(Frequency == max(Frequency)) %>%
        dplyr::mutate(
          Statistic = "Mode",
          Percentage = Frequency / nrow(df) * 100,
          Value = .data[[.x]]
        ) %>%
        # Format output
        dplyr::mutate(
          Frequency  = format(Frequency, big.mark = ","),
          Percentage = round(Percentage, digits = digits),
          Percentage = format(Percentage, nsmal = digits)
        ) %>%
        dplyr::select(Statistic, Value, Frequency, Percentage) %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), as.character))
    }

    # ===========================================================================
    # Get the maximum value, and the number and percentage of times that value occurs
    # ===========================================================================
    max <- df %>%
      dplyr::summarise(
        Statistic  = "Maximum",
        Value      = max(.data[[.x]], na.rm = TRUE),
        Frequency  = (.data[[.x]] == Value) %>% sum(na.rm = TRUE),
        Percentage = Frequency / nrow(df) * 100
      ) %>%
      # Format output
      dplyr::mutate(
        Frequency  = format(Frequency, big.mark = ","),
        Percentage = round(Percentage, digits = digits),
        Percentage = format(Percentage, nsmal = digits)
      ) %>%
      dplyr::mutate(dplyr::across(dplyr::everything(), as.character))

    # ===========================================================================
    # Append the stats of interest into a single data frame and return
    # ===========================================================================
    summary <- dplyr::bind_rows(min, mode, max)

    # ===========================================================================
    # Issue 15: Make time appear in a human readable format
    # If .x is hms, we need to convert it to numeric before calculating min,
    # mode, max. We will then need to convert it back to hms before returning
    # the summary table.
    # ===========================================================================
    if ("hms" %in% class(df[[.x]])) {
      min_hms <- as.character(hms::as_hms(as.numeric(summary$Value[summary$Statistic == "Minimum"])))
      max_hms <- as.character(hms::as_hms(as.numeric(summary$Value[summary$Statistic == "Maximum"])))
      # If there is no mode, the value will be "All X values" and grepl will
      # TRUE
      mode_char <- grepl("All", summary$Value[summary$Statistic == "Mode"])
      # If mode_char is TRUE, then do nothing so that the value for mode will still
      # "All X values"
      if (!all(mode_char)) {
        # Otherwise, convert the number into an hms value
        summary$Value[summary$Statistic == "Mode"] <- as.character(hms::as_hms(summary$Value[summary$Statistic == "Mode"]))
      }
      summary$Value[summary$Statistic == "Minimum"] <- min_hms
      summary$Value[summary$Statistic == "Maximum"] <- max_hms
    }
  }

  # ===========================================================================
  # Return data frame of summary stats
  # ===========================================================================
  summary
}

# For testing
# devtools::load_all()
# data(study)
# study$date_visit[2] <- study$date_visit[1]
# study$time[2] <- study$time[1]
# study$time[4] <- study$time[3]
# cb_summary_stats_time(study, "time")

Try the codebookr package in your browser

Any scripts or data that you put into this service are public.

codebookr documentation built on June 25, 2024, 1:14 a.m.