R/utils.R
In unsum: Reconstruct Raw Data from Summary Statistics

# Avoid NOTEs in R-CMD saying "no visible binding for global variable".
utils::globalVariables(c(".", "value", ".data"))


# Error if input is not an unchanged CLOSURE list.
check_closure_generate <- function(data) {
  tibbles_all <- c("inputs", "metrics", "frequency", "results")
  top_level_is_correct <-
    is.list(data) &&
    length(data) == 4L &&
    identical(names(data), tibbles_all) &&
    inherits(data$inputs, "closure_generate")

  if (!top_level_is_correct) {
    msg_tibbles_all <- paste0("\"", tibbles_all, "\"")
    cli::cli_abort(
      message = c(
        "Input must be the output of `closure_generate()` \
        or `closure_read()`.",
        "!" = "Such output is a list with the elements \
        {msg_tibbles_all}."
      ),
      call = rlang::caller_env()
    )
  }

  # Check the formats of the three tibbles that are elements of `data`, i.e., of
  # the output of `closure_generate()`:

  # Inputs (1 / 4)
  check_component_tibble(
    x = data$inputs,
    name = "inputs",
    dims = c(1L, 7L),
    col_names_types = list(
      "mean" = "character",
      "sd" = "character",
      "n" = c("integer", "double"),
      "scale_min" = c("integer", "double"),
      "scale_max" = c("integer", "double"),
      "rounding" = c("character"),
      "threshold" = c("integer", "double")
    )
  )

  # (Intermezzo to make sure that the assumptions in the second check hold)
  check_scale(
    scale_min = data$inputs$scale_min,
    scale_max = data$inputs$scale_max,
    mean = data$inputs$mean,
    warning = "Don't change CLOSURE results before this step.",
    n = 2
  )

  # Metrics (2 / 4)
  check_component_tibble(
    x = data$metrics,
    name = "metrics",
    dims = c(1L, 5L),
    col_names_types = list(
      "samples_initial" = "integer",
      "samples_all" = "integer",
      "values_all" = "integer",
      "horns" = "double",
      "horns_uniform" = "double"
    )
  )

  # Frequency (3 / 4)
  check_component_tibble(
    x = data$frequency,
    name = "frequency",
    dims = c(data$inputs$scale_max - data$inputs$scale_min + 1, 4),
    col_names_types = list(
      "value" = "integer",
      "f_average" = "double",
      "f_absolute" = "integer",
      "f_relative" = "double"
    )
  )

  # Results (4 / 4)
  check_component_tibble(
    x = data$results,
    name = "results",
    dims = c(data$metrics$samples_all, 2L),
    col_names_types = list(
      "id" = "integer",
      "sample" = "list"
    )
  )

  # Additional checks:

  if (!is_seq_linear_basic(data$frequency$value)) {
    cli::cli_abort(
      message = c(
        "The `value` column in `frequency` must be a linear sequence.",
        "x" = "It is actually {data$frequency$value}."
      ),
      call = rlang::caller_env()
    )
  }

  # The relative frequencies must sum up to 1 or 0. In the latter case, the
  # absolute frequencies must also sum up to 0: it only makes sense if no values
  # at all were found. These comparisons use `near()`, copied from dplyr, to
  # account for accidental floating-point inaccuracies.
  f_relative_sums_up <- near(
    sum(data$frequency$f_relative),
    1
  ) || (
      near(
        sum(data$frequency$f_relative),
        0
      ) && near(
          sum(data$frequency$f_absolute),
          0
        )
    )

  if (!f_relative_sums_up) {
    cli::cli_abort(
      message = c(
        "The `f_relative` column in `frequency` must sum up to 1 \
        (or 0, if `f_absolute` does).",
        "x" = "It actually sums up to {sum(data$frequency$f_relative)}."
      ),
      call = rlang::caller_env()
    )
  }

  all_results_integer <- data$results$sample |>
    vapply(
      FUN = function(x) typeof(x) == "integer",
      FUN.VALUE = logical(1)
    ) |>
    all()

  if (!all_results_integer) {
    cli::cli_abort("All `results` elements must be integer vectors.")
  }

  n <- data$inputs$n

  all_results_length_n <- data$results$sample |>
    vapply(
      FUN = function(x) length(x) == n,
      FUN.VALUE = logical(1)
    ) |>
    all()

  if (!all_results_length_n) {
    cli::cli_abort(
      message = "All `results` must have length `n` ({n}).",
      call = rlang::caller_env()
    )
  }
}


# Check each element of `closure_generate()` for correct format.
check_component_tibble <- function(
  x,
  name,
  dims,
  col_names_types,
  msg_main = NULL,
  n = 2
) {
  tibble_is_correct <-
    inherits(x, "tbl_df") &&
    all(dim(x) == dims) &&
    identical(names(x), names(col_names_types)) &&
    all(
      mapply(
        function(a, b) any(a == b),
        vapply(x, typeof, character(1)),
        col_names_types
      )
    )

  if (!tibble_is_correct) {
    cols_msg <- paste0(
      "\"",
      names(col_names_types),
      "\" (",
      unname(col_names_types),
      ")"
    )
    this_these <- if (length(col_names_types) == 1L) {
      "This column name and type"
    } else {
      "These column names and types"
    }
    if (is.null(msg_main)) {
      msg_main <- "CLOSURE data must not be changed before passing them \
        to other `closure_*()` functions."
    }
    cli::cli_abort(
      message = c(
        msg_main,
        "!" = "Specifically, `{name}` must be a tibble with:",
        "*" = "{dims[1]} row{?s} and {dims[2]} column{?s}",
        "*" = "{this_these}: {cols_msg}"
      ),
      call = rlang::caller_env(n)
    )
  }
}


# Borrowed from scrutiny's internals and used within `check_closure_generate()`,
# this checks whether a vector is a linear sequence (1, 2, 3) or not (3, 1, 7).
is_seq_linear_basic <- function(x) {
  if (length(x) < 3L) {
    return(TRUE)
  }
  diff_first <- x[2L] - x[1L]
  for (i in 3L:length(x)) {
    if (x[i] - x[i - 1L] != diff_first) {
      return(FALSE)
    }
  }
  TRUE
}


# Functions like `closure_generate()` that take `scale_min` and `scale_max`
# arguments need to make sure that min <= max. Functions that take the mean into
# account also need to check that it is within these bounds. Such functions
# include `closure_generate()` but not `closure_count_initial()`.
check_scale <- function(
  scale_min,
  scale_max,
  mean = NULL,
  warning = NULL,
  n = 1
) {
  if (scale_min > scale_max) {
    cli::cli_abort(
      message = c(
        "Scale minimum can't be greater than scale maximum.",
        "!" = warning,
        "x" = "`scale_min` is {scale_min}.",
        "x" = "`scale_max` is {scale_max}."
      ),
      call = rlang::caller_env(n)
    )
  }
  # Coercing mean and scale bounds to avoid a false-positive error
  if (!is.null(mean)) {
    if (as.numeric(mean) < as.numeric(scale_min)) {
      cli::cli_abort(
        message = c(
          "Mean can't be less than scale minimum.",
          "!" = warning,
          "x" = "`mean` is {mean}.",
          "x" = "`scale_min` is {scale_min}."
        ),
        call = rlang::caller_env(n)
      )
    }
    if (as.numeric(mean) > as.numeric(scale_max)) {
      cli::cli_abort(
        message = c(
          "Mean can't be greater than scale maximum.",
          "!" = warning,
          "x" = "`mean` is {mean}.",
          "x" = "`scale_max` is {scale_max}."
        ),
        call = rlang::caller_env(n)
      )
    }
  }
}


# Make sure a value has the right type (or one of multiple allowed types), has
# length 1, and is not `NA`. Multiple allowed types are often `c("double",
# "integer")` which allows any numeric value, but no values of any other types.
check_value <- function(x, type) {
  name <- deparse(substitute(x))
  check_type(x, type, n = 2, name = name)
  if (length(x) != 1L) {
    cli::cli_abort(
      message = c(
        "`{name}` must have length 1.",
        "x" = "It has length {length(x)}."
      ),
      call = rlang::caller_env()
    )
  }
  if (is.na(x)) {
    cli::cli_abort(
      message = "`{name}` can't be `NA`.",
      call = rlang::caller_env()
    )
  }
}


check_type <- function(x, t, n = 1, name = NULL) {
  if (any(typeof(x) == t) || (is.integer(x) && t == "double")) {
    return(invisible(NULL))
  }
  if (is.null(name)) {
    name <- deparse(substitute(x))
  }
  msg_type <- if (length(t) == 1L) {
    "be of type"
  } else {
    "be one of the types"
  }
  if (length(t) == 1 && t == "double") {
    t <- "double or integer"
  }
  cli::cli_abort(
    message = c(
      `!` = "`{name}` must {msg_type} {t}.",
      x = "It is {typeof(x)}."
    ),
    call = rlang::caller_env(n)
  )
}


# This helper creates the `frequency` part of `closure_generate()`'s output.
summarize_frequencies <- function(results, scale_min, scale_max, samples_all) {
  # Flatten the list of integer vectors into a single integer vector, then
  # create a frequency table for the values in that vector.
  f_absolute <- results |>
    unlist(use.names = FALSE) |>
    table()

  # Extract the scale values found in the samples. Then, remove them from
  # their source, `f_absolute`, as they are no longer needed.
  value <- as.integer(names(f_absolute))
  f_absolute <- as.integer(f_absolute)

  # Compute the share of each individual value in the sum of all values.
  f_relative <- f_absolute / sum(f_absolute)

  # Divide by the number of samples instead to get the average number of values
  # in each bin.
  f_average <- f_absolute / samples_all

  # Reconstruct the complete vector of possible scale values as a sequence from
  # scale minimum to scale maximum.
  value_completed <- scale_min:scale_max
  n_completed <- length(value_completed)

  # If each possible value is instantiated in the values that were found in the
  # samples, the results are complete and will be returned here. If not,
  # the zero counts of the uninstantiated values must be added to `value`, and
  # their zero frequencies to `f_absolute` and `f_relative`. This is what the
  # rest of the function will then do.
  if (length(value) == n_completed) {
    return(
      tibble::new_tibble(
        x = list(
          value = value,
          f_average = f_average,
          f_absolute = f_absolute,
          f_relative = f_relative
        ),
        nrow = n_completed
      )
    )
  }

  # At which indices in the complete vector of possible values are those values
  # that were actually found?
  indices_found <- which(value_completed %in% value)

  # Construct full-length vectors where each value is zero
  f_average_completed <- double(n_completed)
  f_absolute_completed <- integer(n_completed)
  f_relative_completed <- double(n_completed)

  # Fill in the non-zero values where appropriate
  f_average_completed[indices_found] <- f_average
  f_absolute_completed[indices_found] <- f_absolute
  f_relative_completed[indices_found] <- f_relative

  tibble::new_tibble(
    x = list(
      value = value_completed,
      f_average = f_average_completed,
      f_absolute = f_absolute_completed,
      f_relative = f_relative_completed
    ),
    nrow = n_completed
  )
}


# Copied from `dplyr::near()`
near <- function(x, y, tol = .Machine$double.eps^0.5) {
  abs(x - y) < tol
}


# Transform unsum's CLOSURE results into the "n"-column format of the CSV files
# made by closure-core's test harness or the original Python implementation.
# This is also the format in which `closure_write()` saves the Parquet files.
as_wide_n_tibble <- function(samples_all) {
  samples_all |>
    tibble::as_tibble(.name_repair = "minimal") |>
    t() |>
    tibble::as_tibble(.name_repair = function(x) paste0("n", seq_along(x)))
}


# This is the reverse operation of `as_wide_n_tibble()` except it also
# constructs a full "results" tibble, as in `closure_generate()`'s output.
as_results_tibble <- function(n_cols) {
  n_samples_all <- nrow(n_cols)
  tibble::new_tibble(
    x = list(
      id = seq_len(n_samples_all),
      sample = n_cols |>
        t() |>
        tibble::as_tibble(.name_repair = "minimal") |>
        as.list() |>
        unname()
    ),
    nrow = n_samples_all
  )
}