R/summarisers.R

Defines functions summarise_variables summarise_missingness

Documented in summarise_missingness summarise_variables

#' Compute many univariate summary statistics for all variables.
#'
#' @param input A dataframe of numeric and/or non-numeric variables to summarize.
#'
#' @return A long-formatted dataframe (class tbl_df, data.frame) of summary values.
#' 
#' @importFrom magrittr "%>%"
#' @export

summarise_variables <- function(input) {
    
    # divide input dataframe into numeric and discrete input
    # 1. numerics
    numeric_variables <- input %>%
        select_if(is.numeric)
    
    if (ncol(numeric_variables) > 0) {
        
        numeric_input <- numeric_variables %>%
            gather(variable, value)
        
    } else {
        
        numeric_input <- tibble::tibble(.rows = 0)
    }
    
    # 2. others
    discrete_variables <- input %>%
        select_if(negate(is.numeric))
    
    if (ncol(discrete_variables) > 0) {
        
        discrete_input <- discrete_variables %>%
            gather(variable, value) %>%
            # to group_by value to summarize values
            mutate(values = value)
        
    } else {
        
        discrete_input <- tibble::tibble(.rows = 0)
    }
    
    # define summary functions for numeric variables
    # ". %>%" is a magrittr-package short-hand for an anonymous function
    numeric_summary_functions <- list(
        sample_median = . %>% median(na.rm = TRUE),
        sample_mean = . %>% mean(na.rm = TRUE),
        sample_minimum = . %>% min(na.rm = TRUE),
        sample_maximum = . %>% max(na.rm = TRUE),
        type8_25_centile = . %>% quantile(probs = c(0.25), na.rm = TRUE, type = 8),
        type8_75_centile = . %>% quantile(probs = c(0.75), na.rm = TRUE, type = 8),
        sample_sd = . %>% sd(na.rm = TRUE),
        sample_mode = . %>% modeest::mlv1(method = "mfv", na.rm = TRUE),
        density_mode = . %>% modeest::mlv(method = "density", na.rm = TRUE),
        sample_sum = . %>% sum(na.rm = TRUE),
        sample_variance = . %>% var(na.rm = TRUE)
    )
    
    # apply summaries for numeric variables
    if (ncol(numeric_variables) > 0) {
        
        numeric_by_variable <- numeric_input %>%
            group_by(variable) %>%
            summarise_all(numeric_summary_functions) %>%
            tidyr::gather("summary_measure", "summary_value", -variable) %>%
            dplyr::mutate(summary_value = as.character(summary_value)) %>%
            dplyr::mutate(value = NA_character_) %>%
            dplyr::select(variable, value, summary_measure, summary_value)
    
    } else {
        
        numeric_by_variable <- tibble::tibble(.rows = 0)
    }
    
    # apply summaries for discrete variables
    # this can be done both by unique value (one number per unique value) 
    # and by variable (one number per variable)
    if (ncol(discrete_variables) > 0) {
        
        discrete_by_value <- discrete_input %>%
            group_by(variable, value) %>%
            summarise(n = n()) %>%
            ungroup() %>%
            # TODO: computes proportions wrong
            mutate(
                proportion = n / sum(n),
                odds = proportion / (1 - proportion),
                value = dplyr::if_else(is.na(value), "Missing", value)
            ) %>%
            tidyr::gather("summary_measure", "summary_value", -variable, -value) %>%
            dplyr::mutate(summary_value = as.character(summary_value))
    
        discrete_by_variable <- discrete_input %>%
            select(-values) %>%
            group_by(variable) %>%
            summarise(
                n_all = n(),
                n_observed = sum(!is.na(value)),
                n_distinct = n_distinct(value),
                sample_mode = modeest::mfv1(value, na.rm = TRUE)
            ) %>%
            tidyr::gather("summary_measure", "summary_value", -variable)
        
    } else {
        
        discrete_by_value <- tibble::tibble(.rows = 0)
        discrete_by_variable <- tibble::tibble(.rows = 0)
    }
    
    # combine everything back together
    summary_df <- discrete_by_value %>%
        dplyr::bind_rows(discrete_by_variable) %>%
        dplyr::bind_rows(numeric_by_variable) %>%
        dplyr::mutate(value = dplyr::if_else(is.na(value), "Any value", value)) %>%
        mutate(summary_measure = str_replace_all(summary_measure, "_", " ")) %>%
        mutate(summary_measure = str_to_sentence(summary_measure)) %>%
        mutate(summary_measure = str_replace_all(summary_measure, "N", "Count")) %>%
        arrange(variable, value, summary_measure)

    return(summary_df)

}

#' Summarise missing values in datasets.
#'
#' Note: this function has not been optimized yet and doesn't scale well to large (>10000 variables) datasets.
#'
#' @param data A dataset object (data.frame, tibble) or a list of datasets to summarise.
#'
#' @importFrom magrittr "%>%"
#' @export

summarise_missingness <- function(data) {

    # convert data.frames to lists to use purrr::map for both inputs
    if (("list" %in% class(data))) {
        input <- data
    } else {
        input <- list(data)
    }
    
    # a summariser for a single data.frame
    summarise_missingness <- function(dataset) {
        dataset %>%
            dplyr::mutate_all(function(x) dplyr::if_else(is.na(x), TRUE, FALSE)) %>%
            dplyr::summarise_all(list(
                missing_proportion = function(x) sum(x) / length(x)
            )) %>%
            tidyr::gather(variable, missing_proportion) %>%
            dplyr::mutate(
                variable = stringr::str_remove(variable, "_missing_proportion")
            )
    }

    summary_table <- input %>%
        purrr::map_dfr(summarise_missingness) %>%
        dplyr::arrange(dplyr::desc(missing_proportion))

    return(summary_table)
}
eteppo/tvs-project documentation built on Aug. 13, 2019, 8:53 a.m.