R/stratified.R

#' Sample from a \code{data.frame} according to a stratification variable
#' 
#' The \code{\link{stratified}} function samples from a
#' \code{\link{data.frame}} in which one of the columns can be used as a
#' "stratification" or "grouping" variable. The result is a new
#' \code{data.frame} with the specified number of samples from each group.
#' 
#' 
#' @param df The source \code{data.frame}.
#' @param group Your grouping variables. Generally, if you are using more than
#' one variable to create your "strata", you should list them in the order of
#' \emph{slowest} varying to \emph{quickest} varying. This can be a vector of
#' names or column indexes.
#' @param size The desired sample size. \itemize{ \item If \code{size} is a
#' value between \code{0} and \code{1} expressed as a decimal, size is set to
#' be proportional to the number of observations per group. \item If
#' \code{size} is a single positive integer, it will be assumed that you want
#' the same number of samples from each group. \item If \code{size} is a
#' vector, the function will check to see whether the length of the vector
#' matches the number of groups and use those specified values as the desired
#' sample sizes. The values in the vector should be in the same order as you
#' would get if you tabulated the grouping variable (usually alphabetic order);
#' alternatively, you can name each value to ensure it is properly matched. }
#' @param select A named list containing levels from the "group" variables in
#' which you are interested. The list names must be present as variable names
#' for the input \code{data.frame}.
#' @param replace Logical. Should the sampling be done with replacement?
#' @param bothSets Logical. Should just the samples be returned, or a \code{list}
#' with two items: the sampled subset and the unsampled subset?
#' @note \emph{Slightly different sizes than requested}
#' 
#' Because of how computers deal with floating-point arithmetic, and because R
#' uses a "round to even" approach, the size per strata that results when
#' specifying a proportionate sample may be slightly higher or lower per strata
#' than you might have expected.
#' 
#' @author Ananda Mahto
#' @examples
#' 
#' # Generate a couple of sample data.frames to play with
#' set.seed(1)
#' dat1 <- data.frame(ID = 1:100,
#'               A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, replace = TRUE),
#'               B = rnorm(100), C = abs(round(rnorm(100), digits=1)),
#'               D = sample(c("CA", "NY", "TX"), 100, replace = TRUE),
#'               E = sample(c("M", "F"), 100, replace = TRUE))
#' dat2 <- data.frame(ID = 1:20,
#'               A = c(rep("AA", 5), rep("BB", 10),
#'                     rep("CC", 3), rep("DD", 2)))
#' # What do the data look like in general?
#' summary(dat1)
#' summary(dat2)
#' 
#' # Let's take a 10% sample from all -A- groups in dat1
#' stratified(dat1, "A", .1)
#' 
#' # Let's take a 10% sample from only "AA" and "BB" groups from -A- in dat1
#' stratified(dat1, "A", .1, select = list(A = c("AA", "BB")))
#' 
#' # Let's take 5 samples from all -D- groups in dat1,
#' #   specified by column number
#' stratified(dat1, group = 5, size = 5)
#' 
#' # Let's take a sample from all -A- groups in dat1, 
#' #   where we specify the number wanted from each group
#' stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
#' 
#' # Use a two-column strata: -E- and -D-
#' #   -E- varies more slowly, so it is better to put that first
#' stratified(dat1, c("E", "D"), size = .15)
#' 
#' # Use a two-column strata (-E- and -D-) but only interested in
#' #   cases where -E- == "M"
#' stratified(dat1, c("E", "D"), .15, select = list(E = "M"))
#' 
#' ## As above, but where -E- == "M" and -D- == "CA" or "TX"
#' stratified(dat1, c("E", "D"), .15,
#'      select = list(E = "M", D = c("CA", "TX")))
#' 
#' # Use a three-column strata: -E-, -D-, and -A-
#' s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
#' 
#' list(head(s.out), tail(s.out))
#' 
#' # How many samples were taken from each strata?
#' table(interaction(s.out[c("E", "D", "A")]))
#' 
#' # Can we verify the message about group sizes?
#' names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
#' 
#' names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
#' \dontshow{rm(dat1, dat2, s.out)}
#' 
#' @export stratified
stratified <- function(df, group, size, select = NULL, 
                       replace = FALSE, bothSets = FALSE) {
  if (is.null(select)) {
    df <- df
  } else {
    if (is.null(names(select))) stop("'select' must be a named list")
    if (!all(names(select) %in% names(df)))
      stop("Please verify your 'select' argument")
    temp <- sapply(names(select),
                   function(x) df[[x]] %in% select[[x]])
    df <- df[rowSums(temp) == length(select), ]
  }
  df.interaction <- interaction(df[group], drop = TRUE)
  df.table <- table(df.interaction)
  df.split <- split(df, df.interaction)
  if (length(size) > 1) {
    if (length(size) != length(df.split))
      stop("Number of groups is ", length(df.split),
           " but number of sizes supplied is ", length(size))
    if (is.null(names(size))) {
      n <- setNames(size, names(df.split))
      message(sQuote("size"), " vector entered as:\n\nsize = structure(c(",
              paste(n, collapse = ", "), "),\n.Names = c(",
              paste(shQuote(names(n)), collapse = ", "), ")) \n\n")
    } else {
      ifelse(all(names(size) %in% names(df.split)),
             n <- size[names(df.split)],
             stop("Named vector supplied with names ",
                  paste(names(size), collapse = ", "),
                  "\n but the names for the group levels are ",
                  paste(names(df.split), collapse = ", ")))
    }
  } else if (size < 1) {
    n <- round(df.table * size, digits = 0)
  } else if (size >= 1) {
    if (all(df.table >= size) || isTRUE(replace)) {
      n <- setNames(rep(size, length.out = length(df.split)),
                    names(df.split))
    } else {
      message(
        "Some groups\n---",
        paste(names(df.table[df.table < size]), collapse = ", "),
        "---\ncontain fewer observations",
        " than desired number of samples.\n",
        "All observations have been returned from those groups.")
      n <- c(sapply(df.table[df.table >= size], function(x) x = size),
             df.table[df.table < size])
    }
  }
  temp <- lapply(
    names(df.split),
    function(x) df.split[[x]][sample(df.table[x],
                                     n[x], replace = replace), ])
  set1 <- do.call("rbind", temp)
  
  if (isTRUE(bothSets)) {
    set2 <- df[!rownames(df) %in% rownames(set1), ]
    list(SET1 = set1, SET2 = set2)
  } else {
    set1
  }
}
NULL
vagnerfonseca/fifer documentation built on May 3, 2019, 4:06 p.m.