# R/colstats.R

#' Column-wise statistics for data.frames
#'
#' Profiles every column of `df` and returns the results as a named list of
#' tibbles, each holding one row per profiled column with the column name in
#' `variable`.
#'
#' @param df a data.frame or a tibble
#' @return A named list with up to three tibbles:
#'   \describe{
#'     \item{global}{Always present. Columns `variable`, `class.x`, `count.x`,
#'       `distinct.x`, `NA.x` and `pctNA.x` for every column of `df`.}
#'     \item{numeric}{Present only when `df` has at least one numeric column.
#'       Columns `variable`, `min.x`, `quartile1.x`, `mean.x`, `median.x`,
#'       `quartile3.x`, `max.x` and `sd.x`, all computed with missing values
#'       removed.}
#'     \item{categorical}{Present only when `df` has at least one non-numeric
#'       column. Columns `variable`, `mode.x`, `countMode.x`, `pctMode.x` and
#'       `countNonAlphanum.x`.}
#'   }
#' @examples
#' library(colstats)
#' colstats(iris)
#' @export
colstats <- function(df) {

  if (!is.data.frame(df)) {
    stop("`df` must be a data.frame or a tibble.", call. = FALSE)
  }

  # Apply `profiler` to each column, stack the per-column results and put the
  # column name first. Shared by the three profiles below.
  profile_columns <- function(columns, profiler) {
    profiles <- dplyr::bind_rows(purrr::map(columns, profiler))
    profiles <- dplyr::mutate(profiles, variable = names(columns))
    dplyr::select(profiles, "variable", dplyr::everything())
  }

  output_list <- list()

  # profiling applied to all variables regardless of class

  output_list[["global"]] <- profile_columns(df, function(x) {
    dplyr::tibble(
      # class() can return several elements (e.g. ordered factors, POSIXct);
      # collapse them so every variable contributes exactly one row.
      class.x = paste(class(x), collapse = "/"),
      count.x = length(x),
      distinct.x = count_n_distinct(x),
      NA.x = count_na(x),
      pctNA.x = percent_na(x)
    )
  })

  # One logical flag per column, reused to decide which profiles to build.
  is_numeric_column <- purrr::map_lgl(df, is.numeric)

  # profiling numeric variables

  if (any(is_numeric_column)) {

    numeric_features <- dplyr::select_if(df, is.numeric)

    output_list[["numeric"]] <- profile_columns(numeric_features, function(x) {
      dplyr::tibble(
        min.x = min(x, na.rm = TRUE),
        # names = FALSE keeps the "25%" / "75%" labels out of the columns.
        quartile1.x = stats::quantile(x, 0.25, na.rm = TRUE, names = FALSE),
        mean.x = mean(x, na.rm = TRUE),
        median.x = stats::median(x, na.rm = TRUE),
        quartile3.x = stats::quantile(x, 0.75, na.rm = TRUE, names = FALSE),
        max.x = max(x, na.rm = TRUE),
        sd.x = stats::sd(x, na.rm = TRUE)
        #normalP.x = get_shapiro_wilk_pvalue(x),
        #kurtosis.x = kurtosis(x, type = 3)
      )
    })
  }

  # profile categorical variables (every column that is not numeric)

  if (any(!is_numeric_column)) {

    # A plain predicate replaces the deprecated, un-namespaced `funs()` helper.
    categorical_features <- dplyr::select_if(df, function(x) !is.numeric(x))

    output_list[["categorical"]] <- profile_columns(
      categorical_features,
      function(x) {
        dplyr::tibble(
          mode.x = get_mode(x),
          countMode.x = count_mode(x),
          pctMode.x = pct_mode(x),
          countNonAlphanum.x = count_non_alphanum(x)
        )
      }
    )
  }

  output_list

}
# dannymorris/colstats documentation built on May 31, 2019, 5:40 a.m.