R/pretty-cuts.R

Defines functions pretty_cuts

Documented in pretty_cuts

# Pretty cuts
#' @title Pretty Cuts
#'
#' @description Function removing set notation used by \code{\link[base]{cut}}
#'   or by \code{\link[Hmisc]{cut2}}.
#'
#' @details For a factor generated by \code{\link[Hmisc]{cut2}}
#'   \code{"[2057,2652)", "[2652,3092)", "..."}
#'   the function will return a character vector
#'   \code{"2057 - 2651", "2652 - 3091", "..." }
#'
#' @param cut_str A character or factor vector produced by \code{\link[base]{cut}}
#'   or by \code{\link[Hmisc]{cut2}} or a similar function. The set values have
#'   to be expressed as integer strings.
#'
#' @param only_cuts A logical, defaults to \code{FALSE}. If \code{TRUE} the
#'   function will return only cut boundaries. For \code{FALSE} the function
#'   replaces each value with modified set value.
#'
#' @return A character vector.
#'
#' @section Acknowledgements:
#' The following function was referenced on Stacks Overflow on a number of
#'   occasions, in reference to
#'   \href{https://stackoverflow.com/q/31771810/1655567}{obtaining nicer cuts},
#'   \href{https://stackoverflow.com/q/34812228/1655567}{improving function performance} and
#'   \href{https://stackoverflow.com/q/34589927/1655567}{reducing small groups}.
#'   The linked discussion provide a more comprehensive solutions than the function
#'   offered in this package.
#'
#' @export
#'
#' @importFrom stringr str_extract str_replace str_squish
#' @importFrom checkmate assert_character
#'
#' @examples
#' set.seed(123)
#' pretty_cuts(cut(x = runif(n = 1e3, min = 1, 1e4), breaks = 5))
#'
pretty_cuts <- function(cut_str, only_cuts = FALSE) {
    # Check if passed as factor and de-factorize if needed
    if (is.factor(cut_str)) {
        cut_str <- as.character(cut_str)
    }

    # Check if passed vector is character vector and has the required
    # characteristics
    assert_character(
        x = cut_str,
        min.chars = 3,
        pattern = ".*(\\d*).*\\,.*(\\d*).*",
        ignore.case = TRUE,
        any.missing = TRUE,
        all.missing = FALSE,
        min.len = 1,
        unique = FALSE,
        null.ok = FALSE
    )

    # Split on comma to get two parts of each string
    lst_chunks <- strsplit(x = cut_str,
                           split = ",",
                           fixed = TRUE)

    # Fix boundary numbers
    lapply(
        X = lst_chunks,
        FUN = function(chunk_group) {
            sapply(chunk_group, function(chunk) {
                clean_num <-
                    as.integer(str_extract(string = chunk, pattern = "\\d{1,}"))
                if (grepl(pattern = "\\(|\\)", x = chunk)) {
                    clean_num - 1
                } else {
                    clean_num
                }
            })
        }
    ) -> lst_chunks_fxd

    # Collapse each chunk to create a nice group with - in the middle
    lapply(
        X = lst_chunks_fxd,
        FUN = function(chunk) {
            # Clear white spaces from the chunk side
            trimws(chunk) -> chunk
            # Remove potential superfluous white spaces inside string
            chunk <- str_squish(chunk)
            # Collapse with a nice hyphen notation
            paste0(chunk, collapse = " - ")
        }
    ) -> lst_chunks_replaced

    #  Prepare vector to return
    res <- unlist(lst_chunks_replaced)

    # Check whether to return only cuts
    if (only_cuts) {
        return(unique(res))
    } else {
        return(res)
    }
}
konradedgar/KEmisc documentation built on April 15, 2021, 1:50 p.m.