R/polypoly-package.R
In polypoly: Helper Functions for Orthogonal Polynomials

Documented in poly_add_columns poly_melt poly_plot poly_plot_data poly_rescale

#' polypoly: Helper functions for orthogonal polynomials
#'
#' This package provides helpful functions for orthogonal polynomials created by
#' [stats::poly()]. These include plotting [poly_plot()], tidying [poly_melt()],
#' rescaling [poly_rescale()], and manipulating a dataframe
#' [poly_add_columns()].
#'
#' @name polypoly
#' @docType package
#' @author Tristan Mahr
NULL




#' Melt a polynomial matrix
#'
#' @param x a matrix created by [stats::poly()]
#' @return a [tibble::tibble()] with three columns: `observation` (row number of
#'   the matrix), polynomial `degree`, and `value`.
#' @export
#' @details The `degree` values are returned as a character vector because they
#'   should be treated categorically (as when plotting). Moreover, matrices
#'   made with multiple vectors (e.g., `poly(rnorm(10), rnorm(10), degree = 2)`)
#'   have names that are not numerically meaningful (e.g., `1.0`, `2.0`, `0.1`,
#'   `1.1`, `0.2`),
#'
#' @examples
#' m <- poly(rnorm(10), degree = 3)
#' poly_melt(m)
poly_melt <- function(x) {
  df <- reshape2::melt(x, c("observation", "degree"), "value", as.is = TRUE)
  tibble::as_tibble(df)
}




#' Plot a polynomial matrix
#'
#' @param x a matrix created by [stats::poly()]
#' @param by_observation whether the x axis should be mapped to the
#'   observation/row number (`TRUE`, the default) or to the degree-1 terms of
#'   the matrix (`FALSE`)
#' @param x_col integer indicating which column to plot as the x-axis when
#'   `by_observation` is `FALSE`. Default is 1 (assumes the first column is the
#'   linear polynomial term).
#' @return a [ggplot2::ggplot()] plot of the degree terms from the matrix. For
#'   `poly_plot_data()`, the dataframe used to create the plot is returned
#'   instead.
#' @export
#' @examples
#' # Defaults to plotting using the row number as x-axis
#' m <- poly(1:100, degree = 3)
#' poly_plot(m)
#'
#' # Not good because observations were not sorted
#' m2 <- poly(rnorm(100), degree = 3)
#' poly_plot(m2)
#'
#' # Instead set by_observation to FALSE to plot along the degree 1 values
#' poly_plot(m2, by_observation = FALSE)
#'
#' # Get a dataframe instead of plot
#' poly_plot_data(m2, by_observation = FALSE)
poly_plot <- function(x, by_observation = TRUE, x_col = 1) {
  poly_plot_backend(x, by_observation, x_col, just_data = FALSE)
}

#' @rdname poly_plot
#' @export
poly_plot_data <- function(x, by_observation = TRUE, x_col = 1) {
  poly_plot_backend(x, by_observation, x_col, just_data = TRUE)
}

poly_plot_backend <- function(x, by_observation = TRUE, x_col = 1, just_data) {
  df <- poly_melt(x)
  df$degree <- factor(df$degree, levels = colnames(x))
  x_var <- "observation"

  if (!by_observation) {
    df1 <- df[df$degree == colnames(x)[x_col], , drop = FALSE]
    x_var <- paste0("degree ", colnames(x)[x_col])
    df1[x_var] <- df1$value
    df1$degree <- NULL
    df1$value <- NULL
    df <- merge(df, df1, by = "observation")
  }

  # Should I try to avoid loading ggplot2?
  `%+p%` <- ggplot2::`%+%`
  p <- ggplot2::ggplot(df) %+p%
    ggplot2::aes_(x = as.name(x_var), y = ~ value, color = ~ degree) %+p%
    ggplot2::geom_line()

  if (just_data) {
    tibble::as_tibble(df)
  } else {
    p
  }
}




#' Add orthogonal polynomial columns to a dataframe
#' @param .data a dataframe
#' @param .col a bare column name
#' @param degree number of polynomial terms to add to the dataframe
#' @param prefix prefix for the names to add to the dataframe. default is the
#'   name of `.col`.
#' @param scale_width optionally rescale the dataframe using [poly_rescale()].
#'   Default behavior is not to perform any rescaling.
#' @param na_values How to handle missing values. Default is `"error"` which
#'   raises an error. Other options include `"warn"` to raise a warning and
#'   `"allow"` to silently accept missing values.
#' @return the dataframe with additional columns of orthogonal polynomial terms
#'   of `.col`
#' @export
#' @examples
#' df <- data.frame(time = rep(1:5, 3), y = rnorm(15))
#'
#' # adds columns "time1", "time2", "time3"
#' poly_add_columns(df, time, degree = 3)
#'
#' # adds columns "t1", "t2", "t3 and rescale
#' poly_add_columns(df, time, degree = 3, prefix = "t", scale_width = 1)
poly_add_columns <- function(
  .data,
  .col,
  degree = 1,
  prefix = NULL,
  scale_width = NULL,
  na_values = c("error", "warn", "allow")
) {
  # Support nonstandard evaluation
  .col <- rlang::enquo(.col)
  .col_name <- rlang::as_name(.col)
  stopifnot(.col_name %in% names(.data))

  # Name the new columns
  prefix <- ifelse(is.null(prefix), .col_name, prefix)
  names <- paste0(prefix, seq_len(degree))

  # Fail if name already exists
  if (any(names %in% colnames(.data))) {
    example_name <- colnames(.data)[which(colnames(.data) %in% names)][1]
    message <- paste0(
      "The column `",
      example_name,
      "` already exists. Try a different prefix."
    )
    rlang::abort(message)
  }

  # Get the unique values
  x <- rlang::eval_tidy(.col, .data)
  has_na <- anyNA(x)
  na_values <- match.arg(na_values)

  if (has_na) {
    if (na_values == "error") {
      rlang::abort(
        c(
          sprintf("NA values found in `%s`.", .col_name),
          "i" = "`stats::poly()` cannot handle missing values."
        )
      )
    }
    if (na_values == "warn") {
      rlang::warn(
        c(
          sprintf("NA values found in `%s`.", .col_name),
          "i" = "`stats::poly()` cannot handle missing values, so those values will be ignored.",
          "i" = sprintf("Derived columns like `%s` will contain missing values.", names[1])
        )
      )
    }
  }

  # Remove missing values
  x <- sort(unique(x), na.last = NA)

  # Create the polynomial basis
  m <- stats::poly(x, degree = degree, simple = TRUE)
  m <- poly_rescale(m, scale_width)

  # Merge orthogonal terms into data-set
  df <- as.data.frame(cbind(x, m))
  df <- rlang::set_names(df, c(.col_name, names))
  .data2 <- .data
  .data2["...rowid"] <- seq_len(nrow(.data2))

  # Preserve order of original data-frame
  merged <- merge(.data2, df, by = .col_name, all.x = TRUE)
  merged <- merged[order(merged[["...rowid"]]), , drop = FALSE]
  merged[["...rowid"]] <- NULL

  cols_to_add <- as.list(merged)[names]
  tibble::add_column(.data, !!! cols_to_add)
}




#' Rescale the range of a polynomial matrix
#'
#' @param x a matrix created by [stats::poly()]
#' @param scale_width the desired range (max - min) for the first column of the matrix
#' @return the rescaled polynomial matrix (as a plain matrix with `coefs`
#'   attribute removed)
#' @export
#' @details This function strips away the `poly` class and the `coefs`
#'   attribute of the matrix. This is because those attributes no longer
#'   describe the transformed matrix.
#' @examples
#' m <- poly(1:10, degree = 4)
#'
#' # Difference between min and max values of first column is 10
#' scaled <- poly_rescale(m, scale_width = 10)
#' scaled
#'
#' # Rescaled values are still orthogonal
#' zapsmall(cor(scaled))
poly_rescale <- function(x, scale_width = 1) {
  d1 <- x[, 1]
  current_width <- max(d1) - min(d1)

  if (is.null(scale_width)) {
    scale_width <- current_width
  }

  scale_by <- scale_width / current_width
  x <- x * scale_by

  # Confirm new scaling
  new_d1 <- x[, 1]
  new_width <- max(new_d1) - min(new_d1)
  stopifnot(all.equal(new_width, scale_width))

  poly_strip_info(x)
}




# Strip some metadata from a polynomial matrix
poly_strip_info <- function(x) {
  class(x) <- "matrix"
  attr(x, "coefs") <- NULL
  x
}