R/extract_var.R

Defines functions zmatch extract_var

Documented in extract_var

#' Extract arbitrary variable from data pack
#' @param DT A molten version of the raw data pack files.
#' @param var A string, one of the names \code{MEASURE_VARS}, the variable to be extracted.
#' @param ... Passed to \code{zmatch}.
#' @export extract_var

extract_var <- function(DT, var, ...) {
  stopifnot(var %chin% names(MEASURE_VARS),
            "permitted_values" %chin% names(MEASURE_VARS[[var]]),
            var %chin% names(DT) || "variable" %chin% names(DT))

  if (var %chin% names(DT)) {
    set(DT,
        j = var,
        value = zmatch(.subset2(DT, var),
                       y = MEASURE_VARS[[var]][["permitted_values"]],
                       ...))
  } else {
    DT[, (var) := zmatch(.subset2(DT, "variable"),
                         y = MEASURE_VARS[[var]][["permitted_values"]],
                         ...)]
  }

  DT[]
}

zmatch <- function(x, y,
                   extract1 = "^(.*)$",
                   delete.penalty = 0.01,
                   sub.penalty = 0.2,
                   y.complete = TRUE,
                   fixed = FALSE,
                   ignore.case = FALSE) {
  if (anyDuplicated(x)) {
    input <- data.table(x = x)
    decoder <-
      unique(input, by = "x")[, out := zmatch(x, y,
                                              extract1 = extract1,
                                              delete.penalty = delete.penalty,
                                              sub.penalty = sub.penalty,
                                              y.complete = y.complete,
                                              fixed = fixed,
                                              ignore.case = ignore.case)]
    return(decoder[input, on = "x"][["out"]])
  } else {
    x <- sub(extract1, "\\1", x, perl = TRUE)
    Y <- gsub("[^A-Za-z]+", "_", y)
    distance_matrix <-
      adist(x, Y,
            costs = list(deletions = delete.penalty,
                         substitutions = sub.penalty))

    indexes <- apply(distance_matrix, 1, which.min)
    out <- y[indexes]
    if (y.complete && any(y[!is.na(y)] %notin% out)) {
      x <- x[!is.na(x)]
      y <- y[!is.na(y)]

      unmatched_x <- x[x %notin% y]
      stop("Not all y were matched.\n\t",
           paste0(unique(y[y %notin% out]), sep = "\n\t"))
    }

    out
  }
}
HughParsonage/Census2016.spec documentation built on July 16, 2022, 11:40 p.m.