R/occurrencesLessThan.R

Defines functions occurrencesLessThan.tbl_df occurrencesLessThan.data.frame occurrencesLessThan.matrix occurrencesLessThan.factor occurrencesLessThan.POSIXct occurrencesLessThan.Date occurrencesLessThan.numeric occurrencesLessThan.integer occurrencesLessThan.logical occurrencesLessThan.character vectorOption occurrencesLessThan.default occurrencesLessThan

Documented in occurrencesLessThan

##' Identifies Infrequent inverseRegex Patterns in an R Object.
##' 
##' Calls \code{inverseRegex} on the input object and identifies values that
##' occur infrequently.
##' 
##' @param x Object to analyse for infrequent regex patterns.
##' @param fraction Fraction of the R object size; regex patterns that occur less
##' (or equal) often than this will be identified. For a vector this fraction will
##' be multiplied by the length of the object; for a matrix it will be multiplied by
##' the total number of entries; and for a data frame or tibble it will be multiplied
##' by the number of rows. Defaults to \code{0.05}.
##' @param n Alternative to the \code{fraction} argument which allows a literal
##' number of occurrences to be searched for. Defaults to NULL, in which case
##' \code{fraction} will be used.
##' @param ... Other arguments to be passed to \code{inverseRegex}.
##' 
##' @return A collection of logical values with \code{TRUE} indicating entries with
##' an infrequent regex pattern. The class of the return value will depend on the
##' input object; matrices, data frames, and tibbles will be returned in kind; all
##' others are returned as vectors.
##' 
##' @details This function is essentially a wrapper around calling \code{table()} on
##' the return value of \code{inverseRegex}. It can be used to identify the indices
##' of values that consist of a regex pattern different to others in the R object.
##' 
##' @note NA values are not considered and will need to be identified separately.
##' 
##' @examples
##' occurrencesLessThan(c(LETTERS, 1))
##' 
##' x <- iris
##' x$Species <- as.character(x$Species)
##' x[27, 'Species'] <- 'set0sa'
##' apply(occurrencesLessThan(x), 2, which)
##' 
##' @seealso inverseRegex, regex
##' 
##' @author Jasper Watson
##' 
##' @export
##' 
##
occurrencesLessThan <- function(x,
                                fraction = 0.05,
                                n = NULL,
                                ...
                                ){
    
    UseMethod('occurrencesLessThan')
    
}

##' @export
occurrencesLessThan.default <- function(...)
    stop('Input class not supported')

vectorOption <- function(x, fraction = 0.05, n = NULL, ...){
    
    out <- rep(FALSE, length(x))

    if (is.null(n))
        n <- fraction * length(x)
    
    y <- inverseRegex(x, ...)
    
    tab <- table(y)
    
    tmp <- y %in% names(tab)[tab <= n]
    
    if (length(tmp) > 0)
        out <- tmp
    
    out
    
}
##' @export
occurrencesLessThan.character <- function(x, fraction = 0.05, n = NULL,  ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.logical <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.integer <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.numeric <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.Date <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.POSIXct <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.factor <- function(x, fraction = 0.05, n = NULL, ...)
    vectorOption(x, fraction, n, ...)

##' @export
occurrencesLessThan.matrix <- function(x, fraction = 0.05, n = NULL, ...){
    
    out <- matrix(FALSE, nrow = nrow(x), ncol = ncol(x), dimnames = dimnames(x))
    
    if (is.null(n))
        n <- nrow(x) * ncol(x) * fraction
     
    y <- inverseRegex(as.vector(x), ...)
    
    tab <- table(y)
    
    tmp <- y %in% names(tab)[tab <= n]
    
    if (length(tmp) > 0)
        out[] <- tmp
        
    out
    
}

##' @export
occurrencesLessThan.data.frame <- function(x, fraction = 0.05, n = NULL, ...){

    if (is.null(n))
        n <- nrow(x) * fraction
    
    out <- as.data.frame(matrix(FALSE, nrow = nrow(x), ncol = ncol(x),
                                dimnames = dimnames(x)),
                         stringsAsFactors = FALSE)
    
    for (jj in 1:ncol(x) ){
        
        y <- as.vector(unlist(inverseRegex(x[, jj, drop = FALSE], ...)))
        
        tab <- table(y)
        
        tmp <- y %in% names(tab)[tab <= n]
        
        if (length(tmp) > 0)
            out[, jj] <- tmp
        
    }
    
    out    
    
}

##' @export
occurrencesLessThan.tbl_df <- function(x, fraction = 0.05, n = NULL, ...){

    if (!requireNamespace('tibble', quietly = TRUE))
        stop('Package tibble not available. Install or provide input as a data.frame.')

    if (is.null(n))
        n <- nrow(x) * fraction
    
    out <- tibble::as_tibble(matrix(FALSE, nrow = nrow(x), ncol = ncol(x),
                                    dimnames = dimnames(x)),
                             stringsAsFactors = FALSE)
    
    for (jj in 1:ncol(x) ){
        
        y <- as.vector(unlist(inverseRegex(x[, jj, drop = FALSE], ...)))
        
        tab <- table(y)
        
        tmp <- y %in% names(tab)[tab <= n]
        
        if (length(tmp) > 0)
            out[, jj] <- tmp
        
    }
    
    out    
    
}

Try the inverseRegex package in your browser

Any scripts or data that you put into this service are public.

inverseRegex documentation built on Oct. 23, 2022, 5:06 p.m.