R/cat_unique_counts.R

Defines functions cat_unique_count

Documented in cat_unique_count

#' @details Count unique values in categorical columns
#' @name cat_unique_count
#' @title cat_unique_count
#'
#' @param df A dataframe of type data.frame
#'
#' @description Counts unique values in a data frame's categorical feature
#'
#'
#' @return A dataframe with two columns: feature name and unique count
#' @export
#'
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr select_if select summarise_all n_distinct
#' @importFrom tibble as_tibble
#' @importFrom purrr map keep
#' @importFrom lubridate parse_date_time
#'
#' @examples
#' cat_unique_count( df = data.frame(
#' name = c("Amy","Tony","Jessica"),
#' age = c(18,21,30),
#' hobby = c("lab","quiz","swim")))

library('dplyr')
library('purrr')
library('lubridate')

cat_unique_count <- function(df) {

  # check if input is a data frame
  if ( !is.data.frame(df))
  {
    stop("Please provide a DataFrame for the 'df' argument")
  }

  #check if input is empty
  else if ( length(df) == 0) {
    stop ("Data frame cannot be empty")
  }
  else{
    # get only character/factor object in df
    df_char <- df|>
      select_if(is.character )

    df_fact <- df |>
      select_if(is.factor)

    df_cats <- rbind(df_char, df_fact)

    # get column names
    cat_names <- colnames(df_cats)

    # check if its date and get only non date-like columns
    df_is_date <-  as_tibble(map(df_cats , is_date))
    df_cat_cols <- names(keep(df_is_date, ~all(!.x)))

    # use summarize to get unique value counts
    df_value_counts <- df|>
      select(df_cat_cols)|>
      summarise_all(list(~n_distinct(.)))

    # pivot resulting table
    df_cat_unique  <- pivot_longer( df_value_counts,
                                    cols = names( df_value_counts),
                                   names_to = "feature name",
                                   values_to ="unique count" )

    df_cat_unique
  }

}

#' Check for date like string: Return true for matches date format
#'
#' @param string A string
#'
#' @return logical vector
#' @export
#'
#' @examples
#' is_date("20-03-2022")

is_date <- function (string) {
  tryCatch(!is.na(parse_date_time(string, "%d-%m-%y")) ,
           warning = function(w){
             FALSE
           })
}
UBC-MDS/slimreda documentation built on Feb. 7, 2022, 9:12 a.m.