R/cut3.R

Defines functions cut3

Documented in cut3

#' @title cut3
#'
#' @description Hmisc::cut2 bones repackaged to remove errors with importing Hmisc
#'
#' @param x numeric vector to classify into intervals.
#' @param cuts cut points.
#' @param m desired minimum number of observations in a group. The algorithm does not guarantee that all groups will have at least m observations.
#' @param g number of quantile groups
#' @param digits number of significant digits to use in constructing levels.
#' @param minmax if cuts is specified but min(x)<min(cuts) or max(x)>max(cuts), augments cuts to include min and max x
#' @param oneval if an interval contains only one unique value, the interval will be labeled with the formatted version of that value instead of the interval endpoints, unless oneval=FALSE
#' @param onlycuts set to TRUE to only return the vector of computed cuts. This consists of the interior values plus outer ranges.
#' @param formatfun format function
#' @param ... additional arguments passed to formatfun
#'
#' @return vector of cut
#'
#'
#' @export
#' @importFrom stats "approx"

cut3 <- function(x, cuts, m=150, g, digits, minmax=TRUE,
                oneval=TRUE, onlycuts=FALSE, formatfun = format, ...){

  method <- 1
  formatfun <- format
  x.unique <- base::sort(base::unique(c(x[!is.na(x)],if(!missing(cuts))cuts)))
  min.dif <- base::min(diff(x.unique))/2
  min.dif.factor <- 1

  ## Make formatted values look good
  if(missing(digits))
    digits <- 5

  ## add digits to formatfun's arguments if relevant
  format.args <-
    if (any(c("...","digits") %in%  names(formals(args(formatfun))))) {
       c(digits = digits, list(...))
    } else {
     list(...)
    }

  oldopt <- options('digits')
  options(digits=digits)
  on.exit(options(oldopt))

  xlab <- attr(x, 'label')

  if(missing(cuts)) {
    nnm <- sum(!is.na(x))
    if(missing(g)) g <- max(1,floor(nnm/m))
    if(g < 1)
      stop('g must be >=1, m must be positive')

    options(digits=15)
    n <- table(x)
    xx <- as.double(names(n))
    options(digits=digits)
    cum <- cumsum(n)
    m <- length(xx)

    y <- as.integer(ifelse(is.na(x),NA,1))
    labs <- character(g)
    cuts <- approx(cum, xx, xout=(1:g)*nnm/g,
                   method='constant', rule=2, f=1)$y
    cuts[length(cuts)] <- max(xx)
    lower <- xx[1]
    upper <- 1e45
    up <- low <- double(g)
    i <- 0
    for(j in 1:g) {
      cj <- if(method==1 || j==1) cuts[j] else {
        if(i==0)
          stop('program logic error')
        s <- if(is.na(lower)) FALSE else xx >= lower
        cum.used <- if(all(s)) 0 else max(cum[!s])
        if(j==m) max(xx) else if(sum(s)<2) max(xx) else
          approx(cum[s]-cum.used, xx[s], xout=(nnm-cum.used)/(g-j+1),
                 method='constant', rule=2, f=1)$y
      }

      if(cj==upper) next

      i <- i + 1
      upper <- cj
      y[x >= (lower-min.dif.factor*min.dif)]  <- i
      low[i] <- lower
      lower <- if(j==g) upper else min(xx[xx > upper])

      if(is.na(lower)) lower <- upper

      up[i]  <- lower
    }

    low  <- low[1:i]
    up   <- up[1:i]
    variation <- logical(i)
    for(ii in 1:i) {
      r <- range(x[y==ii], na.rm=TRUE)
      variation[ii] <- diff(r) > 0
    }
    if(onlycuts) return(unique(c(low, max(xx))))
    flow <- do.call(formatfun,c(list(low), format.args))
    fup  <- do.call(formatfun,c(list(up),  format.args))
    bb   <- c(rep(')',i-1),']')
    labs <- ifelse(low==up | (oneval & !variation), flow,
                   paste('[',flow,',',fup,bb,sep=''))
    ss <- y==0 & !is.na(y)
    if(any(ss))
      stop(paste('categorization error in cut2.  Values of x not appearing in any interval:\n',
                 paste(format(x[ss],digits=12),collapse=' '),
                 '\nLower endpoints:',
                 paste(format(low,digits=12), collapse=' '),
                 '\nUpper endpoints:',
                 paste(format(up,digits=12),collapse=' ')))

    y <- structure(y, class='factor', levels=labs)
  } else {
    if(minmax) {
      r <- range(x, na.rm=TRUE)
      if(r[1]<cuts[1]) cuts <- c(r[1], cuts)
      if(r[2]>max(cuts)) cuts <- c(cuts, r[2])
    }

    l <- length(cuts)
    k2 <- cuts-min.dif
    k2[l] <- cuts[l]
    y <- base::cut(x, k2)
  }

  means <- tapply(x, y, function(w)mean(w,na.rm=TRUE))
  levels(y) <- do.call(formatfun,c(list(means), format.args))

  attr(y,'class') <- "factor"
  if(length(xlab)) label(y) <- xlab
  y
}

Try the prettyglm package in your browser

Any scripts or data that you put into this service are public.

prettyglm documentation built on Sept. 8, 2023, 5:56 p.m.