R/ff_summ_percentiles.R

Defines functions ff_summ_percentiles

Documented in ff_summ_percentiles

ff_summ_percentiles <- function(df = iris, bl_statsasrows = TRUE, col2varname = FALSE) {
    #' Summarize each numeric variable in a dataset
    #'
    #' @description
    #' For every numeric column of \code{df}, computes the number of rows, the
    #' number of distinct values, the counts of missing and of zero
    #' observations, the mean, standard deviation, coefficient of variation
    #' (sd / mean), minimum, maximum and the 1st, 5th, 10th, 25th, 50th, 75th,
    #' 90th, 95th and 99th percentiles. Missing values are ignored by every
    #' statistic except \code{n}, \code{unique} and \code{NAobs}.
    #'
    #' Generates similar statistics as what is generated by the distributional
    #' statistics calculator from dynamic asset webpage's distributional codes:
    #' \url{https://fanwangecon.github.io/CodeDynaAsset/}
    #'
    #' @details
    #' Non-numeric columns are dropped. Underscores in variable names are
    #' replaced by dots in the output, and variables are reported in sorted
    #' name order.
    #'
    #' @param df dataframe input dataframe of interest; must contain at least
    #'   one numeric column
    #' @param bl_statsasrows boolean if true then rotate table, so that each
    #'   statistic is a row and each variable is a column
    #' @param col2varname boolean if true move the label column (variable
    #'   names, or statistic names when the table is rotated) to row names; the
    #'   result is then a plain data.frame rather than a tibble
    #' @return a dataframe with summary statistics.
    #'   With \code{bl_statsasrows = FALSE} there is one row per variable, a
    #'   \code{var} column (or row names) and one numeric column per statistic.
    #'   With \code{bl_statsasrows = TRUE} and \code{col2varname = FALSE} there
    #'   is a \code{stats} column followed by one character column per
    #'   variable holding the formatted values. With both flags \code{TRUE} a
    #'   numeric data.frame is returned with statistic names as row names.
    #' @author Fan Wang, \url{http://fanwangecon.github.io}
    #' @references
    #' \url{https://fanwangecon.github.io/REconTools/reference/ff_summ_percentiles.html}
    #' \url{https://github.com/FanWangEcon/REconTools/blob/master/R/ff_summ_percentiles.R}
    #' @export
    #' @import dplyr tidyr tibble
    #' @examples
    #' ff_summ_percentiles(iris)
    #' ff_summ_percentiles(iris, FALSE)
    #' ff_summ_percentiles(iris, bl_statsasrows = FALSE, col2varname = TRUE)

    if (!is.data.frame(df)) {
        stop("`df` must be a data frame.", call. = FALSE)
    }

    # Strip tibble / grouping classes so plain list-style column indexing
    # applies, then keep numeric columns only.
    df <- as.data.frame(df)
    df <- df[vapply(df, is.numeric, logical(1))]
    if (ncol(df) == 0L) {
        stop("`df` must contain at least one numeric column.", call. = FALSE)
    }

    # Earlier versions split "<variable>_<statistic>" labels on "_" and hence
    # replaced underscores in variable names by dots; the dotted names and the
    # sorted variable order are kept so that existing callers see the same
    # output.
    names(df) <- gsub("_", ".", names(df), fixed = TRUE)
    df <- df[order(names(df))]

    # Percentiles to report, labelled the way they appear in the output.
    ar_probs <- c(p01 = 0.01, p05 = 0.05, p10 = 0.10, p25 = 0.25, p50 = 0.50,
                  p75 = 0.75, p90 = 0.90, p95 = 0.95, p99 = 0.99)
    ar_stat_names <- c("n", "unique", "NAobs", "ZEROobs", "mean", "sd", "cv",
                       "min", names(ar_probs), "max")
    it_rows <- nrow(df)

    # All statistics for one column, as a named double vector in output order.
    ffi_stats_one <- function(x) {
        fl_mean <- mean(x, na.rm = TRUE)
        fl_sd <- stats::sd(x, na.rm = TRUE)
        ar_pctl <- stats::quantile(x, probs = ar_probs, na.rm = TRUE, names = FALSE)
        names(ar_pctl) <- names(ar_probs)
        c(n = it_rows,
          unique = length(unique(x)),
          NAobs = sum(is.na(x)),
          # na.rm is required: otherwise one missing value turns the count into NA
          ZEROobs = sum(x == 0, na.rm = TRUE),
          mean = fl_mean,
          sd = fl_sd,
          cv = fl_sd / fl_mean,
          min = min(x, na.rm = TRUE),
          ar_pctl,
          max = max(x, na.rm = TRUE))
    }

    # vapply() returns statistics x variables; the named template makes it
    # verify both the length and the labels of every result.
    mt_stats <- t(vapply(df, ffi_stats_one,
                         FUN.VALUE = stats::setNames(numeric(length(ar_stat_names)),
                                                     ar_stat_names)))

    # One row per variable, one double column per statistic.
    tb_summ_stats <- tibble::as_tibble(
        data.frame(var = names(df), mt_stats,
                   row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE))

    # first column to row names, not encouraged in tibble
    if (col2varname) {
        tb_summ_stats <- tibble::column_to_rownames(tb_summ_stats, var = "var")
    }

    # Show stats as rows and variables as columns
    if (bl_statsasrows) {
        if (col2varname) {
            # Every column is numeric and labels live in the row names, so a
            # plain transpose keeps the values numeric.
            tb_summ_stats <- as.data.frame(t(tb_summ_stats))
        } else {
            # t() on a data frame that still holds the character `var` column
            # yields a character matrix in which numbers are formatted per
            # statistic; its first row carries the variable names.
            mt_rotated <- t(tb_summ_stats)
            mt_values <- mt_rotated[-1, , drop = FALSE]
            colnames(mt_values) <- mt_rotated[1, ]
            tb_summ_stats <- tibble::as_tibble(
                cbind(stats = rownames(mt_values), mt_values))
        }
    }

    tb_summ_stats
}
FanWangEcon/REconTools documentation built on Jan. 21, 2022, 10:28 p.m.