R/cutYears.R

#' Transform a vector of year values into an ordered factor of year groups.
#'
#' This function is a wrapper around \linkInt{cut}. Given a vector of 
#' strings or integers that represent years, and a vector of breakpoints, it
#' returns a factor in which each level represents a group of years. Unlike 
#' \code{cut()}, it returns pretty labels for the levels: "1975-79" instead 
#' of "[1975,1980)", and so on. Also unlike \code{cut()}, it ensures that 
#' all of the data are accounted for in the levels of the factor that it 
#' creates: data will never be dropped from a factor that \code{cutYears()}
#' returns.
#' 
#' By default, \code{cutYears()} differs from \code{cut()} in the following 
#' ways:
#' 
#' * Accepts only `x` vectors in which every value has four characters or 
#'   four digits.
#' * Returns a factor that has better labels for groups of years: for 
#'   example, "1975-80" rather than "\[1975,1980)".
#' * Returns factor levels that encompass all values of `x`. Consequently, 
#'   `cutYears()` will never convert year values to `NA`, as `cut()` will 
#'   often do. 
#' * Returns an ordered factor by default.
#' * By default, `cutYears()` drops levels that are outside the bounds
#'   of `x`. For example, if `x` ranges from 1975 to 1985, the factor 
#'   returned by `cut()` may have an infinite number of levels, including,
#'   say, "(1900-1905]". (The exact levels returns by `cut()` depend on the 
#'   arguments passed to it, especially the `breaks` argument.) But in a 
#'   case like this, the lowest factor level returned by `cutYears()` will
#'   include 1975, and the highest factor level returned by `cutYears()` 
#'   will contain 1985.
#' @md 


#' @param x Vector of four-digit integers, or of four-character strings 
#' that can be converted to integers, e.g., "1900".
#' 
#' @param breaks Numeric vector of cutpoints
#' 
#' @param levelsBoundedByData Logical. Ensures that the lowest and highest 
#' levels of the returned factor will contain some data. Also ensures that
#' the label for the highest factor level reports the maximum year in 
#' \code{x}, rather than a higher year.
#' 
#' @param shortLabels Logical. If \code{FALSE}, the second year in each 
#' label will always have four digits: for example, "1975-1999". If
#' \code{TRUE} (the default), the second year in each label will typically 
#' have two digits: for example, "1975-99". But even if \code{shortLabels}
#' is \code{TRUE}, the second year in a label will have four digits if it 
#' isn't in the same century as the first year. For example, 
#' \code{cutYears()} will always produce a label like "1975-2001" instead of
#' "1975-01".
#'    
#' 
#' @examples
#' years <- rep(1975:1993, each = 3)
#' fac1a <- cut(     years, breaks = seq(1975, 1993, by = 3))
#' fac1b <- cutYears(years, breaks = seq(1975, 1993, by = 3))
#' fac1c <- cutYears(years, breaks = seq(1975, 1993, by = 3), shortLabels = FALSE)
#'  
#' table(fac1a)
#' table(fac1b)
#' table(fac1c)
#' 
#' fac2a <- cut(     years, breaks = seq(1975, 1990, by = 3))
#' fac2b <- cutYears(years, breaks = seq(1975, 1990, by = 3))
#' table(fac2a)
#' table(fac2b)
#' 
#' fac3a <- cut(     years, breaks = seq(1955, 1990, by = 3))
#' fac3b <- cutYears(years, breaks = seq(1955, 1990, by = 3))
#' table(fac3a)
#' table(fac3b)

 

 
#' @seealso 
#' \linkInt{cut}, \code{\link[Hmisc:cut2]{Hmisc::cut2()}}


#' @export 
cutYears <- function (x, breaks, levelsBoundedByData = TRUE, shortLabels = TRUE) {
  
  if (! inherits(x, qw("character integer numeric"))) {
    stop('"x" must be of class "character", "integer", or "numeric".')
  }
  
  x <- as.integer(x)
  
  if (minNA(x) < 1000 | maxNA(x) > 9999) {
    stop("'x' must range from 1000 to 9999")
  }
  
    
  # If min(breaks) is greater than the earliest year, add a new breakpoint 
  # that is earlier than any others.  [2021 01 16]
  if (minNA(x) < minNA(breaks)) {
    # message("You have set min(x) < min(breaks). Revising factor levels accordingly.")
    breaks <- c(minNA(x), breaks)
  }
  
  # If "breaks" doesn't include the maximum year, add a new breakpoint.
  if (maxNA(breaks) < maxNA(x)) {
    # message("You have set max(x) > max(breaks). Revising factor levels accordingly.")
    breaks <- c(breaks, maxNA(x))
  }
  
  # If levelsBoundedByData is TRUE and the maximum breakpoint exceeds the 
  # maximum year, reduce the maximum breakpoint to the maximum year.
  if (levelsBoundedByData & maxNA(breaks) > maxNA(x)) {

    # If multiple breakpoints are greater than the maximum year, keep only 
    # the smallest of them.
    while (length(which(breaks > maxNA(x))) > 1) {
      breaks <- breaks[-length(breaks)]
    } 
    
    # If the second-highest breakpoint is the same as the maximum year, 
    # eliminate the highest breakpoint. Otherwise, reduce the highest 
    # breakpoint to the maximum year.
    if (breaks[length(breaks) - 1] == maxNA(x)) {
    } 
    else {
      breaks[which.max(breaks)] <- maxNA(x)
    }
  }
 
  
  
  # CONSTRUCT LABELS FOR EACH LEVEL OF THE FACTOR
  yearLabels <- NULL
  for (i in 1:(length(breaks) - 1)) {
    # We are constructing labels of the form {breaks[1]-(breaks[2]-1)}, 
    # except that the last label is simply of the form {breaks[1]-(breaks[2])}.
    # For example, if the breaks are c(1975, 1979, 1983), the labels are
    # "1975-78" and "1979-83".
    secondYearOffset <- 1 * (i+1 != length(breaks))  # always 0 or 1
    firstYearLabel  <- breaks[i]
    secondYearLabel <- breaks[i+1] - secondYearOffset
    if (shortLabels & substr(firstYearLabel, 1, 2) == substr(secondYearLabel, 1, 2)) {
      secondYearLabel <- substr(secondYearLabel, 3, 4)
    }
    newLabel   <- paste(firstYearLabel, secondYearLabel, sep = "-")
    yearLabels <- c(yearLabels, newLabel)    
  }
  
  
  # CREATE ORDERED FACTOR
  yearFac <- cut(
    x              = x, 
    breaks         = breaks, 
    labels         = yearLabels, 
    include.lowest = TRUE, 
    right          = FALSE,
    ordered_result = TRUE)
  
  
  # DROP EMPTY LEVELS THAT ARE OUTSIDE THE RANGE OF THE DATA
  if (levelsBoundedByData) {
    
    levelIndices_missing    <- which(table(yearFac) == 0)
    levelIndices_nonMissing <- which(table(yearFac) > 0)
    
    # drop levels that have no data and that are before the first level 
    if (any(levelIndices_missing < min(levelIndices_nonMissing))) {
      levels(yearFac)[which(levelIndices_missing < min(levelIndices_nonMissing))] <- NA
    }
    
    # drop levels that have no data and that are after the last level
    if (max(levelIndices_nonMissing) < length(levels(yearFac))) {
      levels(yearFac)[(max(levelIndices_nonMissing)+1):length(levels(yearFac))] <- NA
    }
  }
  
  yearFac
}
jbullock35/Bullock documentation built on April 1, 2022, 6:21 p.m.