R/curve_interpolation.R

Defines functions curve_interpolation

Documented in curve_interpolation

#' Interpolates of growth curves along reference centiles
#'
#' By convention, we connect the points of a growth curve by
#' straight lines. This is OK when ages are relatively close
#' together. However, for ages that are more apart, the straight
#' line gives an ugly representation that doesn't follow the real
#' growth during the interval.
#'
#' The `curve_interpolation()` function provides a solution
#' for this. It defines a number of additional points (specified
#' through the `xout` argument), and calculates the linear
#' interpolation between these points in the Z-scores metric. After
#' that, the procedure transforms back the interpolated Z-scores
#' to the original scale, so the curve points follow the curvy
#' centiles.
#'
#' @param data A data frame in long format, with columns for person
#'   number (named `id`), the x and y variables.
#' @param xname Name of the variable on the horizontal axis.
#' @param yname Name of the variable on the vertical axis.
#' @param xout A vector of values for the horizontal axis in the scale
#'  of the variable `xname`. Only `xout` points within the
#'  data range of `xname` are interpolated.
#' @param refcode The name of reference needed to calculate the Z-score
#'  scores and the back-transformation to measured scores. There can only
#'  be one reference.
#' @param rule The `rule` argument passed down to `approx`. The
#' default here is `rule = 2L`, so any extrapolations beyond the
#' ranges of the reference take the closest value (min or max).
#' @return A `tibble` with five columns: `id`, xname, yname,
#' zname and `obs`. The `obs` variables signals whether
#' the point is observed or not.
#' @seealso [centile::y2z()], [centile::z2y()]
#' @author Stef van Buuren, 2021
#' @examples
#' data <- data.frame(
#'   id = c(1, 1, 1, 2, 3, 3, 3),
#'   age = c(0, 0.2, 1.2, 0.5, 0.1, 1, 1.3),
#'   hgt = c(52, 60, 78, 69, 62, 78, 82)
#' )
#' refcode <- "nl_1997_hgt_male_nl"
#' xout <- seq(0, 3, 0.2)
#' int <- chartplotter:::curve_interpolation(data,
#'   xname = "age", yname = "hgt",
#'   xout = xout, refcode = refcode
#' )
#' int
#' @export
curve_interpolation <- function(data, xname = "x", yname = "y",
                                xout = numeric(0),
                                refcode = NULL,
                                rule = 2L) {
  if (nrow(data) == 0L) {
    return(tibble(
      id = numeric(0),
      x = numeric(0), y = numeric(0), z = numeric(0),
      obs = logical(0)
    ))
  }

  # Initialisations
  if (is.null(refcode)) {
    message("No reference given.")
  }
  refcode <- refcode[[1L]]
  zname <- paste(yname, "z", sep = "_")

  # Transform to Z-values (analysis metric)
  observed <- data %>%
    select(all_of(c("id", xname, yname))) %>%
    mutate(
      obs = TRUE,
      !!zname := centile::y2z(
        y = data[[yname]], x = data[[xname]],
        refcode = refcode, pkg = "nlreferences", rule = rule
      )
    )

  # create bending points
  rng <- suppressWarnings(range(data[, xname, drop = TRUE], finite = FALSE))
  if (any(sapply(rng, is.infinite))) rng <- c(0, 0)

  grid <-
    as_tibble(expand.grid(
      id = unique(data$id),
      xout = xout
    )) %>%
    mutate(
      obs = FALSE,
      !!xname := .data$xout
    ) %>%
    select(-"xout") %>%
    bind_rows(observed) %>%
    arrange(!!!rlang::syms(c("id", xname, yname, zname))) %>%
    distinct(!!!rlang::syms(c("id", xname)), .keep_all = TRUE)

  # remove any bending points outside the observation intervals
  grid <- grid %>%
    group_by(.data$id) %>%
    mutate(
      mn = suppressWarnings(min(.data[[xname]][.data$obs & !is.na(.data[[zname]])], na.rm = TRUE)),
      mx = suppressWarnings(max(.data[[xname]][.data$obs & !is.na(.data[[zname]])], na.rm = TRUE)),
      lower = .data[[xname]] < .data$mn & !.data$obs,
      higher = .data[[xname]] > .data$mx & !.data$obs
    ) %>%
    filter(!.data$lower & !.data$higher) %>%
    select(-c("mn", "mx", "lower", "higher"))

  # isolate data with fewer than two valid xy pairs (approx fails)
  minimal2 <- grid %>%
    summarise(do_approx = sum(!(is.na(.data[[zname]][.data$obs])) | is.na(.data[[xname]][.data$obs])) >= 2L)

  grid <- grid %>%
    left_join(minimal2, by = "id")

  singletons <- grid %>%
    filter(!.data$do_approx) %>%
    ungroup() %>%
    select(all_of(c("id", xname, yname, zname, "obs")))

  # select subset with bending points
  grid <- grid %>%
    filter(.data$do_approx)

  # if there are bending points left:
  # interpolate and back-transform to y
  if (nrow(grid) > 0L) {
    grid <- grid %>%
      mutate(!!zname := approx(
        x = .data[[xname]], y = .data[[zname]],
        xout = .data[[xname]], rule = rule
      )$y) %>%
      ungroup()

    grid <- grid %>%
      mutate(yt = centile::z2y(
        z = .data[[zname]], x = .data[[xname]],
        refcode = refcode, pkg = "nlreferences", rule = rule
      ))

    # overwrite any NA's in yname
    ov <- grid %>%
      filter(is.na(.data[[yname]])) %>%
      mutate(!!yname := .data$yt)
    grid <- grid %>%
      filter(!is.na(.data[[yname]])) %>%
      bind_rows(ov) %>%
      select(all_of(c("id", xname, yname, zname, "obs")))
  }

  # append singletons and sort
  grid <- grid %>%
    bind_rows(singletons) %>%
    arrange(!!!rlang::syms(c("id", xname)))
}
growthcharts/chartplotter documentation built on May 12, 2024, 8:43 a.m.