R/sas.R
In icd: Comorbidity Calculations and Tools for ICD-9 and ICD-10 Codes

# nocov start

#' Extract assignments from a SAS FORMAT definition
#'
#' This is modelled entirely on a single chunk of SAS code, but hopefully can be
#' generalised. It relies heavily on lists and regular expression, but, as you
#' will see from the code, R is not a great language with which to write a SAS
#' parser.
#' @param sas_lines is a character vector, with one item per line, e.g. from
#'   \code{readLines}
#' @references
# nolint start
#' \url{http://support.sas.com/documentation/cdl/en/proc/61895/HTML/default/viewer.htm#a002473474.htm}
# nolint end
#' @return list (of lists)
#' @keywords programming list internal
#' @noRd
sas_format_extract <- function(sas_lines) {
  # collapse everything onto one big line, so we can filter multi-line
  # commments. No ability to do multiline regex along a vector.
  sas_lines <- paste(sas_lines, collapse = " \\n")
  # sas comments are in the form /* ... */ inline/multiline, or * ... ;
  sas_lines <- gsub(pattern = "/\\*.*?\\*/", replacement = "", x = sas_lines) # nolint
  sas_lines <- gsub(pattern = "\\n\\*.*?;", replacement = "\\n", x = sas_lines) # nolint
  sas_lines <- strsplit(sas_lines, split = "\\;")[[1]]
  # strip white space and ?undetected newline characters, replace with single
  # spaces.
  sas_lines <- gsub(pattern = "\\\\n", "", sas_lines) # nolint
  sas_lines <- gsub(pattern = "[[:space:]]+", " ", sas_lines)
  sas_lines <- trimws(sas_lines)
  # drop everything except VALUE statements
  sas_lines <- grep(
    pattern = "^VALUE.*", x = sas_lines, ignore.case = TRUE,
    value = TRUE
  )
  # put each VALUE declaration in a vector element
  sma1 <- .str_match_all(
    string = sas_lines,
    pattern =
      "^V(?:ALUE|alue)[[:space:]]+([[:graph:]]+)[[:space:]]+(.+)[[:space:]]*$"
  )
  all_sas_assignments <- lapply(sma1, `[`, c(2, 3))
  out <- list()
  for (m in all_sas_assignments) {
    out[m[[1]]] <- list(sas_parse_assignments(m[[2]]))
  }
  out
}

#' @describeIn sas_format_extract Get just the \code{$RCOMFMT} assignment, which
#'   contains all the ICD (not DRG) data. The problem is \code{RENLFAIL} appears
#'   twice:
#'
#'   \code{"N183", "N184", "N185", "N186", "N189", "N19", "Z4901", "Z4902",
#'   "Z9115", "Z940", "Z992"="RENLFAIL"         /*Dependence on renal dialysis*/
#'
#'   "Z4931", "Z4932"="RENLFAIL"       /*Encounter for adequacy testing for
#'   peritoneal dialysis*/ }
#'
#'   so \code{RENLFAIL} needs special treatment
#' @keywords internal
#' @noRd
sas_format_extract_rcomfmt <- function(sas_lines) {
  # ignore DRG assignments
  sas_format_extract(sas_lines)[["$RCOMFMT"]]
}

# ICD-10 SAS code seems to be literal with all possible (ICD-10-CM for given
# year) cihldren listed. No ranges are specified (unlike the ICD-9 equivalentt)
sas_icd10_assignments_to_list <- function(x) {
  x["NONE"] <- NULL
  x[" "] <- NULL
  x
}

#' Get assignments from a character string strings
#'
#' The format of assignments is best seen in the SAS source files.
#' @param x is a character string containing space delimited assignments, in SAS
#'   declaration format.
#' @param strip_whitespace will strip all white space from the returned values
#' @param strip_quotes will strip all double quotation marks from the returned
#'   values
#' @return list with each list item containing a matrix of "char ranges",
#'   "assigned value" pairs
#' @keywords internal programming list
#' @noRd
sas_parse_assignments <- function(x, strip_whitespace = TRUE,
                                  strip_quotes = TRUE) {
  stopifnot(is.character(x), length(x) == 1)
  assert_flag(strip_whitespace)
  assert_flag(strip_quotes)
  # splitting with clever regex to separate each pair of assignments seems
  # tricky, so doing it in steps.
  # n.b. this is a list with list per input row.
  halfway <- as.list(unlist(
    strsplit(x, split = "[[:space:]]*=[[:space:]]*")
  ))

  # we need to match the first unquoted space to get the boundary between the
  # previous definition and the next variable name
  if (length(halfway) == 2) {
    # we have just a single name value pair so just set name to value and return
    # list of one item.
    if (strip_whitespace) {
      halfway <- gsub(
        pattern = "[[:space:]]*",
        replacement = "",
        halfway
      )
    }
    if (strip_quotes) halfway <- gsub(pattern = '"', replacement = "", halfway)
    out <- list()
    out[[halfway[[2]]]] <- unlist(strsplit(x = halfway[[1]], split = ","))
    return(out)
  }
  mid_tmp <- unlist(
    lapply(
      .str_match_all(
        halfway[seq(2, length(halfway) - 1)],
        pattern = '^([^"]|"[^"]*")*? (.*)'
      ),
      `[`, -1
    )
  )
  threequarters <- c(
    halfway[[1]],
    mid_tmp,
    halfway[[length(halfway)]]
  )
  if (strip_quotes) {
    threequarters <- gsub(pattern = '"', replacement = "", threequarters)
  }
  # spaces may matter still, so don't randomly strip them?
  out <- list()
  for (pair in seq(from = 1, to = length(threequarters), by = 2)) {
    if (strip_whitespace) {
      outwhite <- gsub(
        pattern = "[[:space:]]*",
        replacement = "",
        threequarters[pair]
      )
    } else {
      outwhite <- threequarters[pair]
    }
    # combine here in case there are duplicate labels, e.g. RENLFAIL twice in
    # ICD-10 AHRQ
    out[[threequarters[pair + 1]]] <-
      c(
        out[[threequarters[pair + 1]]],
        unlist(strsplit(x = outwhite, split = ","))
      )
  }
  out
}

#' Extract quoted or unquoted SAS string definitions
#'
#' Finds all the LET statements in some SAS code and writes them to an R list.
#' The list item name is the SAS variable name, and each list item is a
#' character vector of the contents. This is specifically for string
#' assignments, but probably easy to adapter to numbers if ever needed.
#' @param x is a vector of character strings, typically taken from something
#'   like \code{readLines(some_sas_file_path)}
#' @keywords internal programming list
#' @noRd
sas_extract_let_strings <- function(x) {
  let_rex <-
    "%LET ([[:alnum:]]+)[[:space:]]*=[[:space:]]*%STR\\(([[:print:]]+?)\\)"
  a <- .str_match_all(x, let_rex)
  a <- lapply(a, trimws)
  a <- a[vapply(a, FUN = function(x) length(x) != 0, FUN.VALUE = logical(1))]

  vls <- vapply(a, FUN = `[[`, 3, FUN.VALUE = "")
  splt <- strsplit(vls, split = ",")
  result <- lapply(splt, strip, pattern = "'") # strip single quotes
  result <- lapply(result, strip, pattern = '"') # strip double quotes
  names(result) <- vapply(a, FUN = function(x) x[[2]], FUN.VALUE = "")
  result
}

# horrible kludge for difficult source data
sas_expand_range <- function(start, end) {
  if (end == "0449") {
    end <- start
  } # HIV codes changed
  # hmmm, maybe get the diff and test all children of ambigs present later
  reals <- expand_range.icd9(start, end,
    short_code = TRUE, defined = TRUE,
    ex_ambig_start = FALSE, ex_ambig_end = TRUE
  )
  real_parents <- condense.icd9(reals, defined = TRUE, short_code = TRUE)
  merged <- unique(c(reals, real_parents))
  real_parents_of_merged <- condense.icd9(merged,
    defined = TRUE,
    short_code = TRUE
  )
  halfway <- children.icd9(real_parents_of_merged,
    defined = FALSE,
    short_code = TRUE
  )
  nonrealrange <- expand_range.icd9(start, end,
    defined = FALSE,
    short_code = TRUE,
    ex_ambig_start = TRUE,
    ex_ambig_end = TRUE
  )
  sort.icd9(unique(c(halfway, nonrealrange)), short_code = TRUE)
}

# nocov end