R/classify.R

Defines functions print.classified as.matrix.classified as.data.table.classified as.data.frame.classified eval_condition classify.data.table classify.data.frame classify.codified classify.default classify

Documented in as.data.frame.classified as.data.table.classified as.matrix.classified classify classify.codified classify.data.frame classify.data.table classify.default print.classified

#' Classify codified data
#'
#' This is the second step of `codify() %>% classify() %>% index()`.
#' Hence, the function takes a codified data set and classify each case based on
#' relevant codes as identified by the classification scheme provided by a
#' `classcodes` object.
#'
#' @inheritParams set_classcodes
#' @param codified output from [codify()]
#' @param code,id name of code/id columns (in `codified`).
#' @param cc_args List with named arguments passed to
#'   [set_classcodes()]
#' @param ... arguments passed between methods
#'
#' @return Object of class "classified". Inheriting from a Boolean matrix with
#'   one row for each element/row of `codified`
#'   and columns for each class with corresponding class names (according to the
#'   [`classcodes`] object). Note, however, that [print.classified()] preview
#'   this output as a tibble.
#'
#' @seealso [as.data.frame.classified()], [as.data.table.classified()] and
#' [as.matrix.classified()], [print.classified()]
#'
#' @export
#' @name classify
#' @example man/examples/classify.R
#' @family verbs
classify <- function(codified, cc, ..., cc_args = list()) {
  UseMethod("classify")
}

# Default method ----------------------------------------------------------

#' @export
#' @rdname classify
classify.default <- function(codified, cc, ..., cc_args = list()) {
  .cc <- cc

  if (!is.null(cc_args) | !is.classcodes(cc)) {
    ccargs <- cc_args
    ccargs$cc <- cc
    cc  <- do.call(set_classcodes, ccargs)
  }

  y <-
    vapply(
      cc[[attr(cc, "regexpr")]],
      grepl,
      logical(length(codified)),
      x = as.character(codified)
    )

  structure(
    if (length(codified) == 1) as.matrix(t(y)) else y,
    dimnames   = list(codified, cc$group),
    classcodes = .cc,
    id         = "id",
    class      = c("classified", "matrix")
  )
}


# codified ----------------------------------------------------------------

#' @export
#' @rdname classify
classify.codified <- function(codified, ...) {

  if (methods::hasArg("id") || methods::hasArg("code")) {
    stop("Arguments `id` and `code` should not be specified in `classify` if
         already set by a previous call to `codify`!")
  }

  classify.data.table(
    codified, ...,
    id = attr(codified, "id"), code = attr(codified, "code")
  )
}


# data.frame --------------------------------------------------------------

#' @export
#' @rdname classify
classify.data.frame <- function(codified, ...) {
  classify(as.data.table(codified), ...)
}


# data.table --------------------------------------------------------------

#' @export
#' @rdname classify
classify.data.table <- function(
  codified, cc, ..., id, code, cc_args = list()) {

  if (!is.character(codified[[id]]))
    stop("Id column '", id, "' must be of type character!")

  # Warning if not called from categorize
  if (!any(grepl("(coder)?categorize",
                 vapply(sys.calls(), function(x) deparse(x[[1]])[[1]], "")))) {
    warning(
      "'classify()' does not preserve row order ('categorize()' does!)"
      , call. = FALSE
    )
  }

  cc_name <- cc
  # Do not reset cc if already set from categorize
  if (!is.null(cc_args) | !is.classcodes(cc)) {
    ccargs <- cc_args
    ccargs$cc <- cc
    cc <- do.call(set_classcodes, ccargs)
  }
  names(codified)[names(codified) == code] <- "code"

  # Codes without class (treated separately for speed): FALSE matrix
  i_nocl           <- !is.na(codified$code) &
                      !grepl(paste(cc[[attr(cc, "regexpr")]],
                                   collapse = "|"), codified$code)
  i_na             <- is.na(codified$code)
  nocl             <- matrix(FALSE, sum(i_nocl), nrow(cc))
  rownames(nocl)   <- codified[[id]][i_nocl]
  nacl             <- matrix(NA, sum(i_na), nrow(cc))
  rownames(nacl)   <- codified[[id]][i_na]
  nocl             <- rbind(nocl, nacl)
  colnames(nocl)   <- cc$group
  codified         <- codified[!i_nocl & !i_na, ]

  # Classify all cases with at least one class
  y                <- classify(codified$code, cc, cc_args = NULL)
  if ("condition" %in% names(cc)) {
    y <- y & vapply(cc$condition, eval_condition,
                    logical(nrow(codified)), x = codified)
  }
  rownames(y)      <- codified[[id]]
  y                <- rbind(nocl, y) # Rejoin with no class cases

  # Identify if unit has more than one code
  ids              <- rownames(y)
  uni              <- !ids %in% ids[duplicated(ids)]

  # Cases with only one code (separate for speed)
  clu              <- y[uni, , drop = FALSE]
  rownames(clu)    <- ids[uni]

  # Cases with multiple codes (slower but necessary)
  idx              <- as.factor(ids[!uni])
  # tply is a simplified and therefore faster version of tapply
  tply             <- function(X) unlist(lapply(split(X, idx), any))
  clm              <- apply(y[!uni, , drop = FALSE], 2, tply)
  # apply returns a vector in case we have only one case
  # We then lose the row name and have to turn it back
  if (is.vector(clm)) {
    clm <- matrix(clm, 1, dimnames = list(idx[1]))
  }

  structure(
    rbind(clu, clm),
    classcodes = cc_name,
    id         = id,
    class      = c("classified", "matrix")
  )
}


# Evaluate extra conditions -----------------------------------------------

eval_condition <- function(cond, x) {
  if (is.na(cond)) !logical(nrow(x))
  else tryCatch(eval(parse(text = cond), envir = x), error = function(e) stop(
    "Classification is conditioned on variables not found in the data set!")
    )
}


# Convert output to data.frame --------------------------------------------

#' Convert output from classify() to matrix/data.frame/data.table
#'
#' @param x output from [classify()]
#' @param ... ignored
#' @return data frame/data table with:
#'
#' - first column named as "id" column specified as input
#'   to [classify()] and with data from `row.names(x)`
#' - all columns from `classified`
#' - no row names
#'
#' or simply the input matrix without additional attributes
#'
#' @export
#' @family classcodes
#' @examples
#' x <- classify(c("C80", "I20", "unvalid_code"), "elixhauser")
#'
#' as.matrix(x)[, 1:3]
#' as.data.frame(x)[, 1:3]
#' data.table::as.data.table(x)[, 1:3]
#'
#' # `as_tibble()` works automatically due to internal use of `as.data.frame()`.
#' tibble::as_tibble(x)
as.data.frame.classified <- function(x, ...) {
  y            <- NextMethod()
  id           <- attr(x, "id")
  y[[id]]      <- row.names(x)
  row.names(y) <- NULL
  y            <- y[, c(id, setdiff(names(y), id))]
  attr(y, "classcodes") <- attr(x, "classcodes")
  y
}


# Convert output to data.table --------------------------------------------

#' @export
#' @rdname as.data.frame.classified
as.data.table.classified <- function(x, ...) {
  structure(
    as.data.table(as.data.frame(x, ...)),
    classcodes = attr(x, "classcodes")
  )
}

#' @export
#' @rdname as.data.frame.classified
as.matrix.classified <- function(x, ...) {
  class(x) <- setdiff("classified", class(x))
  mostattributes(x) <- NULL
  x
}


# print classified --------------------------------------------------------

#' Printing classified data
#'
#' Preview first `n` rows as tibble
#'
#' @param x output from [classify()]
#' @param ... additional arguments passed to printing method for a `tibble`.
#'    `n` is the number of rows to preview.
#'    Set `n = NULL` to disable the `tibble`
#'    preview and print the object as is (a matrix).
#' @export
#' @family classcodes
#' @examples
#' # Preview all output
#'classify(c("C80", "I20", "unvalid_code"), "elixhauser")
#'
#'# Preview only the first row
#' print(classify(c("C80", "I20", "unvalid_code"), "elixhauser"), n = 1)
#'
#' # Print object as is (matrix)
#' print(classify(c("C80", "I20", "unvalid_code"), "elixhauser"), n = NULL)
print.classified <- function(x, ...) {
  print_tibble(x, ...)
}
eribul/classifyr documentation built on March 23, 2023, 2 a.m.