R/util.R

Defines functions .trc .dbg .msg .m .stopifnot_year capitalize_first str_extract na_to_false guess_icd_col_by_class guess_icd_pc_col_by_name guess_icd_col_by_name get_icd_pc_name get_icd_dx_name get_visit_name.matrix get_visit_name.data.frame get_visit_name binary_to_logical logical_to_binary strim

# save this in package environment so it doesn't need to be done on the fly
.have_regexec_perl <- "perl" %in% names(as.list(regexec))

#' Trim leading and trailing white space from a single string
#'
#' \code{NA} is accepted (and returned as \code{NA_character_})
#' @param x character vector of length one
#' @return character vector of length one
#' @keywords internal character
#' @noRd
strim <- function(x) {
  stopifnot(is.character(x), length(x) == 1)
  if (!is.na(x)) {
    strimCpp(as.character(x))
  } else {
    return(NA_character_)
  }
}

"%nin%" <- function(x, table) {
  match(x, table, nomatch = 0L) == 0L
}

#' Use faster matching for %in%
#' @keywords internal
#' @noRd
"%fin%" <- function(x, table) {
  fin(x, table)
}

#' Use faster matching for %nin%
#' @keywords internal
#' @noRd
"%fnin%" <- function(x, table) {
  !fin(x, table)
}

#' Encode \code{TRUE} as 1, and \code{FALSE} as 0 (integers)
#'
#' When saving data as text files for distribution, printing large amounts of
#' text containing \code{TRUE} and \code{FALSE} is inefficient. Converting to
#' binary allows more compact output to screen. Most functions in \R will
#' convert \code{TRUE} and \code{FALSE} logical flags to \code{1} and \code{0},
#' respectively.
#' @param x \code{data.frame} which may contain logical fields
#' @examples
#' mat <- matrix(sample(c(TRUE, FALSE), size = 9, replace = TRUE), nrow = 3)
#' mat
#' icd:::logical_to_binary(mat)
#' icd:::binary_to_logical(icd:::logical_to_binary(mat))
#' @return \code{data.frame} without logical fields
#' @keywords internal manip logical
#' @noRd
logical_to_binary <- function(x) {
  stopifnot(is.data.frame(x) || is.matrix(x))
  if (is.matrix(x)) {
    assert_matrix(x, min.rows = 1, min.cols = 1)
    mode(x) <- "integer"
    return(x)
  }
  assert_data_frame(x, min.rows = 1, min.cols = 1)
  logical_fields <- names(x)[vapply(x, is.logical, logical(1))]
  if (any(is.na(logical_fields)) || length(logical_fields) == 0) {
    return(x)
  }
  # update just the logical fields with integers
  x[, logical_fields] <-
    vapply(
      X = x[, logical_fields],
      FUN = function(y) ifelse(y, 1L, 0L),
      FUN.VALUE = integer(length = dim(x)[1])
    )
  x
}

#' @describeIn logical_to_binary Convert integer columns to logical values
#' @noRd
binary_to_logical <- function(x) {
  stopifnot(is.data.frame(x) || is.matrix(x))
  if (is.matrix(x)) {
    stopifnot(is.matrix(x), nrow(x) > 0, ncol(x) > 0)
    mode(x) <- "logical"
    return(x)
  }
  stopifnot(nrow(x) > 0, ncol(x) > 0)
  integer_fields <- names(x)[vapply(x, is.integer, logical(1))]
  if (any(is.na(integer_fields)) || length(integer_fields) == 0) {
    return(x)
  }
  # update just the logical fields with integers
  x[, integer_fields] <-
    vapply(
      X = x[, integer_fields],
      FUN = function(y) ifelse(y, TRUE, FALSE),
      FUN.VALUE = logical(length = dim(x)[1])
    )
  x
}

#' Get or guess the name of the visit ID column
#'
#' The guess depends on the data, working through a list of likely candidates.
#' If the visit ID is known, it should be specified, ideally instead of calling
#' this function, but if unavoidable, using the \code{visit_name} parameter.
#' @param x input data, typically a data frame
#' @template visit_name
#' @keywords internal
#' @noRd
get_visit_name <- function(x, visit_name = NULL) {
  UseMethod("get_visit_name")
}

#' Guess or get visit/patient column from data frame
#' @keywords internal
#' @export
#' @noRd
get_visit_name.data.frame <- function(x, visit_name = NULL) {
  stopifnot(is.data.frame(x))
  stopifnot(is.null(visit_name) ||
    (is.character(visit_name) && length(visit_name) == 1L))
  visit_name_guesses <- c(
    "visit.?Id", "patcom", "encounter.?id", "enc.?id",
    "in.*enc", "out.*enc", "encounter", "visit", "^id$",
    "^enc"
  )
  if (is.null(visit_name)) {
    for (guess in visit_name_guesses) {
      guess_matched <- grep(guess, names(x), ignore.case = TRUE, value = TRUE)
      if (length(guess_matched) == 1) {
        visit_name <- guess_matched
        break
      }
    }
    if (is.null(visit_name)) {
      visit_name <- names(x)[1]
    }
  }
  assert_string(visit_name)
  stopifnot(visit_name %in% names(x))
  visit_name
}

#' Give useful error message if matrix passed, as we assume it is a comorbidity
#' matrix. It is possible you have a character matrix with all your patient
#' data, and if so, please convert it to a \code{data.frame} and file an issue
#' on github.
#' @keywords internal
#' @export
#' @noRd
get_visit_name.matrix <- function(x, visit_name = NULL) {
  if (is.logical(x)) {
    stop(
      "matrices of comorbidity data are expected to be of logical type, ",
      "and have row names corresponding to the visit or patient."
    )
  }
  get_visit_name.data.frame(as.data.frame(head(x, 1000)))
}

#' Get the name of a \code{data.frame} column which is most likely to contain
#' the ICD codes
#'
#' Guess which field contains the (only) ICD code, in order of preference, the
#' column name has an icd code class, case-insensitive regular expressions of
#' commonly used names for ICD code fields, a single column has more than 10%
#' valid ICD codes. If the result is not specified by class, or exactly with
#' \code{icd_name} being given, we confirm there are at least some valid ICD
#' codes in there
#' @param x data frame
#' @param icd_name Usually \code{NULL} but if specified, will be checked it is
#'   valid (i.e. a character vector of length one, which is indeed a name of one
#'   of \code{x}'s columns) and returned unchanged
#' @param multi If \code{TRUE}, allow multiple ICD field names to be returned.
#' @keywords internal
#' @noRd
get_icd_dx_name <- function(x,
                            icd_name = NULL,
                            valid_codes = TRUE,
                            defined_codes = FALSE,
                            multi = FALSE) {
  if (!is.null(icd_name)) {
    stopifnot(all(icd_name %in% names(x)))
    return(icd_name)
  }
  if (any(grepl(pattern = "poa", icd_name, ignore.case = TRUE))) {
    warning("'POA' Present-on-arrival fields in 'icd_name'.")
  }
  icd_name <- guess_icd_col_by_class(x, pattern = icd_dx_not_generic)
  if (!is.null(icd_name)) {
    return(icd_name)
  }
  icd_pc_name <- guess_icd_col_by_class(x, pattern = icd_pc_not_generic)
  icd_generic <- guess_icd_col_by_class(x, pattern = c("icd9", "icd10"))
  if (is.null(icd_pc_name) && !is.null(icd_name)) {
    return(icd_generic)
  }
  icd_name <- guess_icd_col_by_name(x,
    valid_codes = valid_codes,
    defined_codes = defined_codes
  )
  if (is.null(icd_name)) {
    icd_name <- character()
    for (n in names(x)) {
      pc <- get_icd_defined_percent(x[[n]])
      if (pc$icd9 > 25 || pc$icd10 > 25) {
        icd_name <- c(icd_name, n)
      }
    }
  }
  if (nrow(x) < 2 || (!valid_codes && !defined_codes)) {
    return(icd_name)
  }
  pc <- if (defined_codes) {
    get_icd_defined_percent(x[icd_name[1]])
  } # TODO vectorize this function
  else {
    get_icd_valid_percent(x[icd_name[1]])
  }
  if (pc$icd9 < 10 && pc$icd10 < 10) {
    stop(
      "identified field with ICD codes as: '", icd_name,
      "' but fewer than 10% of codes are valid ICD-9 or ICD-10. ",
      "If this really is a valid column, identify the field containing ",
      "ICD codes in the input data using 'icd_name=\"my_icd_field\"' or ",
      "set the class using something like",
      " x[[icd_name]] <- as.icd9[[x[[icd_name]]"
    )
  }
  icd_name
}

#' deprecated synonym
#' @keywords internal
#' @noRd
get_icd_name <- get_icd_dx_name

#' Uses the columns which contain ICD-9 or ICD-10-CM procedure codes
#'
#' Will also guess procedure codes from other national ICD versions, when
#' supported.
#' @param x Data frame in which to look for the procedure name columns
#' @param icd_name character vector of the column names containing the procedure
#'   codes. This is usually not known in advance, but if known, will be passed
#'   through.
#' @seealso \code{get_icd_dx_name}
#' @keywords internal
#' @noRd
get_icd_pc_name <- function(x, icd_name = NULL) {
  if (!is.null(icd_name)) {
    stopifnot(all(icd_name %in% names(x)))
    return(icd_name)
  }
  if (any(grepl(pattern = "poa", icd_name, ignore.case = TRUE))) {
    warning("'POA' Present-on-arrival field name in 'icd_name'.")
  }
  icd_name <- guess_icd_col_by_class(x, pattern = icd_pc_not_generic)
  if (!is.null(icd_name)) {
    return(icd_name)
  }
  guess_icd_pc_col_by_name(x)
}

#' Get candidate column(s) from wide or long data frame frame, using hints
#' @examples
#' wide_df <- data.frame(
#'   a = letters,
#'   dx0 = icd9_map_elix$CHF[1:26],
#'   dx1 = icd9_map_elix$PVD[1:26],
#'   dx2 = icd9_map_elix$HTN[1:26]
#' )
#' icd:::guess_icd_col_by_name(wide_df)
#' wide_dc <- data.frame(
#'   a = letters,
#'   dx0 = as.icd9cm(icd9_map_elix$CHF[1:26]),
#'   dx1 = as.icd9cm(icd9_map_elix$PVD[1:26]),
#'   dx2 = as.icd9cm(icd9_map_elix$HTN[1:26]),
#'   stringsAsFactors = FALSE
#' )
#' icd:::guess_icd_col_by_name(wide_dc)
#' @return Zero, one or many names of columns likely to contain ICD codes based
#'   on the column names.
#' @keywords internal
#' @noRd
guess_icd_col_by_name <- function(x,
                                  valid_codes = TRUE,
                                  defined_codes = FALSE,
                                  guesses = c(
                                    "icd.?(9|10)",
                                    "icd.?(9|10).?Code",
                                    "icd",
                                    "diagnos",
                                    "diag.?code",
                                    "diag",
                                    "dx",
                                    "i(9|10)",
                                    "code"
                                  ),
                                  class_pattern = icd_dx_not_generic) {
  stopifnot(is.data.frame(x))
  stopifnot(is.logical(valid_codes), length(valid_codes) == 1L)
  stopifnot(is.logical(defined_codes), length(defined_codes) == 1L)
  stopifnot(is.character(guesses))
  # if one column exactly has a class like icd9, then we're done.
  icd_name_by_class <- guess_icd_col_by_class(x, pattern = class_pattern)
  if (!is.null(icd_name_by_class)) {
    return(icd_name_by_class)
  }
  guessed <- lapply(guesses,
    grep,
    x = names(x),
    ignore.case = TRUE,
    value = TRUE
  )
  guess_counts <- vapply(guessed, length, integer(1))
  guesses_logical <- as.logical(guess_counts)
  if (sum(guesses_logical) == 1L) {
    return(unlist(guessed[guesses_logical]))
  }
  best_guess <- which(guess_counts == max(guess_counts))
  if (any(guess_counts > 0L) && length(best_guess) > 0L) {
    return(guessed[[best_guess[1]]])
  }
  NULL
}

guess_icd_pc_col_by_name <- function(x,
                                     valid_codes = TRUE,
                                     defined_codes = FALSE,
                                     guesses = c(
                                       "icd.?(9|10).?(proc|p).?(code|c)?",
                                       "icd.*pc",
                                       "proced.*",
                                       "proc.?code",
                                       "pc",
                                       "i(9|10).*pc",
                                       "pc.*i(9|10)",
                                       "proc"
                                     ),
                                     class_pattern = icd_dx_not_generic) {
  guess_icd_col_by_name(
    x = x,
    guesses = guesses,
    class_pattern = icd_pc_not_generic
  )
}

#' @describeIn guess_icd_col_by_name Just use the class of columns
#' @keywords internal
#' @noRd
guess_icd_col_by_class <- function(x, pattern) {
  cls <- lapply(x, class)
  clg <- vapply(cls, function(z) any(z %in% pattern), logical(1))
  if (any(clg)) {
    return(names(x)[clg])
  }
  NULL
}

na_to_false <- function(x) {
  stopifnot(is.logical(x))
  x[is.na(x)] <- FALSE
  x
}

# nocov start

#' \CRANpkg{stringr} does this, but here we have a small amount of base R code
#' @param ... passed to regexec, e.g. \code{perl = TRUE}.
#' @noRd
#' @keywords internal
str_extract <- function(string,
                        pattern,
                        fun = `[[`,
                        ...) {
  vapply(
    regmatches(string,
      m = regexec(
        pattern = pattern,
        text = string,
        ...
      )
    ),
    FUN = fun,
    1,
    FUN.VALUE = character(1L)
  )
}

capitalize_first <- function(x) {
  trimws(paste0(toupper(substr(x, 1, 1)), substr(x, 2, nchar(x))))
}
# nocov end

.stopifnot_year <- function(year) {
  if (!all(grepl("^[[:digit:]]{4}$", as.character(year)))) {
    stop(year, " is not a four-digit year.", call. = FALSE)
  }
}

.m <- function(threshold, ..., print = FALSE) {
  v <- as.integer(.verbose())
  if (v < threshold) {
    return()
  }
  if (print) {
    dots <- list(...)
    print(
      lapply(dots, paste, collapse = ", ")
    )
    return()
  }
  message(...)
  invisible()
}

.msg <- function(...) {
  .m(1, ...)
}

.dbg <- function(...) {
  .m(2, ...)
}

.trc <- function(...) {
  .m(3, ...)
}
jackwasey/icd documentation built on Nov. 23, 2021, 9:56 a.m.