# R/kubimities.R

#' @title Data path manager
#'
#' @description
#' \code{fp} is a simple utility to save / include the path to your data.
#' It can be used to shorten the calls of functions working with files,
#' if your data is not in the project directory.
#'
#' @usage
#' fp(filename)
#'
#' @param filename character. A name of the file.
#'
#' @details This function looks for a variable called path2data in your
#' Global environment. If no such variable is present, it asks you to specify it.
#' After specification, you can use \code{fp} as wrapper around the name of the
#' files in your data directory.
#'
#' @return Returns the full path to the file.
#'
#' @author Michal Kubista
#'
#' @examples
#' \dontrun{
#' fp("name_of_my_file.csv")
#' }
#' @export
fp = function(filename) {
    # Ask for the data directory only when no `path2data` is visible yet;
    # the answer is stored in the global environment so later calls reuse it.
    if (!exists("path2data")) {
        # readline() returns "" immediately outside interactive sessions,
        # which used to store an empty path silently; fail loudly instead.
        if (!interactive()) {
            stop("Variable 'path2data' is not defined; set it before calling ",
                 "fp() in a non-interactive session.")
        }

        path2data = readline("Please specify the path to the data directory:")

        # Reject an empty answer so that a useless path is never persisted.
        if (!nzchar(path2data)) {
            stop("No path to the data directory was specified.")
        }

        assign("path2data", path2data, envir = .GlobalEnv)
    }

    return(file.path(path2data, filename))
}

#' @title Cardinality utility
#'
#' @description
#' \code{lenuq} is an extremely simple utility to assess the cardinality
#' (number of unique levels) in your data.
#'
#' @usage
#' lenuq(x)
#'
#' @param x a vector.
#'
#' @details This function simply calls a combination of base functions
#' \code{length} and \code{unique}.
#'
#' @return Returns the number of unique levels.
#'
#' @author Michal Kubista
#'
#' @examples
#' lenuq(mtcars$cyl)
#'
#' @export
lenuq = function(x) {
    # Drop repeated values first, then count what is left.
    distinct_values = unique(x)
    length(distinct_values)
}

# Internal helper: finds the most frequent value in `x` and how often it
# occurs. Returns a one-row data.table holding the grouping column(s) and
# a `freq` column with the number of occurrences.
freqs = function(x) {
    # Convert the input to a data.table and assign it back to `x`.
    # NOTE(review): the name of the resulting value column presumably depends
    # on this exact `%<>%` call form, and sumup's `paste0(., ": ", freq)`
    # appears to rely on it -- confirm before rewriting this line.
    x %<>% as.data.table()
    # Count rows per group (.N) and keep only the group(s) with the top count.
    # NOTE(review): `by = x` names the data.table itself, not a column;
    # presumably this groups by all of its columns -- TODO confirm.
    x = x[, .(freq = .N), by = x][freq == max(freq)]
    # Ties for the top count: keep only the first of the tied rows.
    if(nrow(x) > 1) {
        x = x[1]
    }
    return(x)
}

# Internal helper: summary statistics of a single vector.
# Returns a named list with the minimum, first quartile, median, mean,
# third quartile and maximum (all computed with missing values removed)
# plus `na`, the number of missing values in `x`.
dtsum = function(x) {
    # TRUE / FALSE are spelled out: T and F are ordinary variables that can
    # be reassigned, which would silently flip `na.rm` / `names`.
    return(
        list(
            min = min(x, na.rm = TRUE),
            Q1 = quantile(x, 0.25, na.rm = TRUE, names = FALSE),
            median = median(x, na.rm = TRUE),
            mean = mean(x, na.rm = TRUE),
            Q3 = quantile(x, 0.75, na.rm = TRUE, names = FALSE),
            max = max(x, na.rm = TRUE),
            na = sum(is.na(x))
        )
    )
}

#' @title Summarisation function
#'
#' @description
#' \code{sumup} is an alternative to the base \code{summary} function.
#' It aims to provide more information in a more convenient format.
#'
#' @usage
#' sumup(x)
#'
#' @param x data. An object inheriting from data.frame (data.table, tibble).
#'
#' @details This function automatically splits the dataset into numerical
#' and categorical variables. For numerical columns, the five-number summary,
#' mean and number of NAs are reported. For categorical columns, the number of
#' unique levels and the most frequent level are provided.
#'
#' @return List of three:
#' \itemize{
#' \item{dimensions:} a vector of dimensions (a result of call \code{dim(x)})
#' \item{continuous:} a data.frame of enhanced five-number-summary (see Details) of
#' numerical variables
#' \item{categorical:} a data.frame of frequency and cardinality summary (see Details) of
#' categorical variables
#' }
#'
#' @author Michal Kubista
#'
#' @examples
#' sumup(mtcars)
#'
#' @export
sumup = function(x) {
    if (!inherits(x, "data.frame")) {
        stop("Object x is not a data.table, data.frame or tibble.")
    }

    # Classify columns with inherits() instead of comparing sapply(x, class):
    # a column carrying several classes (POSIXct/POSIXt, ordered factors)
    # makes sapply() return a list or matrix, which never matched the class
    # names below and silently dropped such columns from the summary.
    ch_classes = c("factor", "character")
    num_classes = c("numeric", "integer", "Date", "POSIXct", "POSIXlt")
    is_ch = vapply(x, inherits, logical(1), what = ch_classes)
    is_num = vapply(x, inherits, logical(1), what = num_classes)
    ch_cols = names(x)[is_ch]
    num_cols = names(x)[is_num]

    x %<>% as.data.table()

    # Numerical part: one column per variable, one row per metric of dtsum().
    if (length(num_cols) > 0) {
        sum_num =
            x[,lapply(.SD, dtsum)
              ,.SDcols = num_cols
              ][, metric := c("min", "Q1", "median", "mean", "Q3", "max", "NA")
                ][, .SD, .SDcols = c("metric", num_cols)]
    } else {
        sum_num = NA
    }

    # Categorical part: cardinality and the most frequent level per variable.
    if (length(ch_cols) > 0) {
        # "<level>: <count>" label of the most frequent level of each column.
        # NOTE(review): inside `.[...]` the `.` presumably resolves to the
        # value column produced by freqs() -- confirm before restructuring.
        top_freq =
            lapply(x[, .SD, .SDcols = ch_cols], freqs) %>%
            rbindlist() %>%
            .[, paste0(.,": ", freq)] %>%
            t()

        sum_ch = x[,lapply(.SD, lenuq), .SDcols = ch_cols]

        colnames(top_freq) = colnames(sum_ch)
        sum_ch = rbind(sum_ch, top_freq)

        # Transpose so that every variable becomes one row of the result.
        sum_ch %<>% t() %>% as.data.frame()
        colnames(sum_ch) = c("unique", "most_frequent")
        # The conversion result used to be discarded; assign it back.
        # as.character() first, so a factor column yields its labels rather
        # than its internal integer codes.
        sum_ch$unique = as.numeric(as.character(sum_ch$unique))
    } else {
        sum_ch = NA
    }

    return(list(dimensions = dim(x), continuous = sum_num, categorical = sum_ch))
}
# kubistmi/kubimities documentation built on May 14, 2019, 2:07 p.m.