R/misc.R

Defines functions find_interval quantilize upper_first_char round_to_char reorder_labels df_to_message

Documented in df_to_message find_interval quantilize reorder_labels round_to_char upper_first_char

#' Print data.frame as message
#' 
#' Print \code{data.frame} or \code{data.table} object as a message. 
#' 
#' @param x \code{data.frame} or \code{data.table} object
#' @param digits number of significant digits to be printed; defaults to 3
#' @param row_names boolean; if TRUE, row names of \code{x} are printed. Defaults to FALSE
#' @param col_names boolean; if TRUE, column names of \code{x} are printed. Defaults to TRUE
#' @return prints out the content of the data.table as a \code{message}
#' @export
df_to_message = function(x, digits = 3L, row_names = FALSE, col_names = TRUE) 
{
    
    if (!col_names)
        names(x) = NULL
    
    message(
        paste(
            utils::capture.output(
                print(round(x, digits), row.names = row_names, digits = digits)
            ),
            collapse = "\n"
        )
    )
    
}


#' Reorder categories according to frequency
#' 
#' Takes a integer, character, or factor vector and reorder the labels according to the frequency with which they appear.
#' 
#' @param x integer or character vector, or factor variable
#' @param decreasing boolean; if TRUE, orders in decreasing order, otherwise in increasing order
#' @return returns \code{x} with labels ordered according to frequencies
#' @details It is assumed that the labels of \code{x} have an inherent ordering. \code{reorder_labels} reorders the labels such that the first label is assigned to the category with the highest frequency, the second label to the second most frequent category, and so on.
#' @export
reorder_labels = function(x, decreasing = TRUE)
{
    
    # get class of x
    x_class = class(x)
    
    if (x_class == "character") {    
        
        freq = table(x)
        labs = names(freq)
        l_tab = labs[order(freq, decreasing = decreasing)]
        
        return(labs[match(x, l_tab)])
        
    } else if (x_class == "integer") {
        
        match(x, order(table(x), decreasing = decreasing))

    } else if (x_class == "factor") {
        
        freq = table(x)
        labs = names(freq)[order(freq, decreasing = decreasing)]
        
        return(factor(x, levels = labs))
        
        
    } else {
        
        stop("x must be either a character or integer vector, or a factor variable")
        
    }

}



#' Round numeric object to string while keeping trailing zeros
#' 
#' Wrapper function to round a numeric value (or vector) to character string (or vector) for printing purposes keeping trailing zeros
#' 
#' @param x a numeric vector
#' @param digits integer; number of decimals to keep
#' @return character string/vector with \code{x} rounded to \code{digit} digits while keeping trailing zeros
#' @export
round_to_char = function(x, digits = 1L) 
{
 
    if (!is.null(dim(x)))
        stop("dimension of x has to be NULL")
   
    if (digits %% 1 != 0) 
        stop("digits has to be an integer value")
    
    return(sprintf(paste0("%.", digits, "f"), round(x, digits)))

}



#' Change first character of string to upper case
#' 
#' @param x string to modify
#' @param rest_to_lower boolean; if TRUE, changes all other characters to lower case
#' @return returns \code{x} with the first character changed to upper-case
upper_first_char = function(x, rest_to_lower = FALSE) 
{
    
    x1 = toupper(substr(x, 1, 1))
    x2 = if (rest_to_lower) {
            tolower(substr(x, 2, nchar(x)))
        } else {
            substr(x, 2, nchar(x))
        }
    
    return(paste0(x1, x2))

}

#' Generate factor using quantiles
#' 
#' Generates a factor out of a numeric vector, where the levels are determined based on the quantiles
#' 
#' @param x a numeric vector
#' @param n the number of categories to create
#' @param return_labels whether the labels containing the intervals should be returned; if \code{FALSE} a \code{numeric} variable is returned
#' @return returns a factor variable where the levels are intervals of \code{x} determined by the quantiles of \code{x}. The number of quantiles to use is determined by \code{n}.
quantilize = function(x, n, return_labels = TRUE) 
{
    if (!is.numeric(x)) 
        stop("x has to be numeric")
    
    if (n < 0) 
        stop("n has to be positive")
    
    cut(
        x,
        quantile(
            x, 
            probs = seq(0, 1, by = 1/n),
            na.rm = TRUE
        ),
        include.lowest = TRUE,
        labels = if (return_labels)  NULL  else FALSE
    )
    
}

#' Find interval containing a specific value from an \code{quantilefactor} object
#' 
#' @param x a \code{factor} object created with \code{gen_quantiles}
#' @param val a \code{numeric} vector containing the value(s) to search for
#' @param return_labels if \code{TRUE}; returns the label of the quantile, otherwise the number
#' @return returns the number of the quantile to which each element in \code{x} belongs. The lowest quantile has number 1 (not 0).
#' @details function checks the pattern of the factor labels, which have to be in the form of an interval. The specific regular expression that is checked is \code{^(\\(|\\[)-*[0-9.]+(e-\\d+)*,-*[0-9.]+(e-\\d+)*(\\)|\\])$}.
find_interval = function(x, val, return_labels = FALSE) 
{
    
    if (!is.factor(x)) 
        stop("x has to a factor")
    
    if (!is.numeric(val))
        stop("val has to be numeric")
    
    # get levels
    l = levels(x)
    
    # check pattern
    if (!all(grepl("^(\\(|\\[)-*[0-9.]+(e-\\d+)*,-*[0-9.]+(e-\\d+)*(\\)|\\])$", l)))
        stop("factor levels have to be a pattern of an interval (see details in ?find_quantile)")
    
    # drop (,[ at start and end
    l = gsub("^\\(|^\\[|\\)$|\\]$", "", levels(x))
    
    # get cutpoints for the intervals
    cuts = as.numeric(unique(do.call(c, strsplit(l, ","))))

    # find intervals
    res = findInterval(val, cuts, left.open = TRUE)
    
    # check values
    if (any(val < min(cuts) | val > max(cuts))) {
        
        warning("some elements of val are outside of the data range")

        # adjust when values are outside of data-range
        res[res == 0] = 1
        res[res == length(cuts)] = length(cuts) - 1L
        
    }
    
    if (return_labels)
        return(levels(x)[res])
    
    return(res)
        
}
baruuum/btoolbox documentation built on Aug. 17, 2020, 1:29 a.m.