R/orderNorm.R
In bestNormalize: Normalizing Transformation Functions

Documented in orderNorm predict.orderNorm print.orderNorm

#' Calculate and perform Ordered Quantile normalizing transformation
#'
#' @name orderNorm
#' @aliases predict.orderNorm
#'
#' @description The Ordered Quantile (ORQ) normalization transformation,
#'   \code{orderNorm()}, is a rank-based procedure by which the values of a
#'   vector are mapped to their percentile, which is then mapped to the same
#'   percentile of the normal distribution. Without the presence of ties, this
#'   essentially guarantees that the transformation leads to a uniform
#'   distribution.
#'
#'   The transformation is: \deqn{g(x) = \Phi ^ {-1} ((rank(x) - .5) /
#'   (length(x)))}
#'
#'   Where \eqn{\Phi} refers to the standard normal cdf, rank(x) refers to each
#'   observation's rank, and length(x) refers to the number of observations.
#'
#'   By itself, this method is certainly not new; the earliest mention of it
#'   that I could find is in a 1947 paper by Bartlett (see references). This
#'   formula was outlined explicitly in Van der Waerden, and expounded upon in
#'   Beasley (2009). However there is a key difference to this version of it, as
#'   explained below.
#'
#'   Using linear interpolation between these percentiles, the ORQ normalization
#'   becomes a 1-1 transformation that can be applied to new data. However,
#'   outside of the observed domain of x, it is unclear how to extrapolate the
#'   transformation. In the ORQ normalization procedure, a binomial glm with a
#'   logit link is used on the ranks in order to extrapolate beyond the bounds
#'   of the original domain of x. The inverse normal CDF is then applied to
#'   these extrapolated predictions in order to extrapolate the transformation.
#'   This mitigates the influence of heavy-tailed distributions while preserving
#'   the 1-1 nature of the transformation. The extrapolation will provide a
#'   warning unless warn = FALSE.) However, we found that the extrapolation was
#'   able to perform very well even on data as heavy-tailed as a Cauchy
#'   distribution (paper to be published).
#'
#'   The fit used to perform the extrapolation uses a default of 10000
#'   observations (or length(x) if that is less). This added approximation
#'   improves the scalability, both computationally and in terms of memory used.
#'   Do not set this value to be too low (e.g. <100), as there is no benefit to
#'   doing so. Increase if your test data set is large relative to 10000 and/or 
#'   if you are worried about losing signal in the extremes of the range.
#'
#'   This transformation can be performed on new data and inverted via the
#'   \code{predict} function.
#'
#' @param x A vector to normalize
#' @param n_logit_fit Number of points used to fit logit approximation
#' @param newdata a vector of data to be (reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param object an object of class 'orderNorm'
#' @param warn transforms outside observed range or ties will yield warning
#' @param ... additional arguments
#'
#' @return A list of class \code{orderNorm} with elements
#'
#'   \item{x.t}{transformed original data} \item{x}{original data}
#'   \item{n}{number of nonmissing observations} \item{ties_status}{indicator if
#'   ties are present} \item{fit}{fit to be used for extrapolation, if needed}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'
#'   The \code{predict} function returns the numeric value of the transformation
#'   performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#'
#' x <- rgamma(100, 1, 1)
#'
#' orderNorm_obj <- orderNorm(x)
#' orderNorm_obj
#' p <- predict(orderNorm_obj)
#' x2 <- predict(orderNorm_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#' @references
#'
#' Bartlett, M. S. "The Use of Transformations." Biometrics, vol. 3, no. 1,
#' 1947, pp. 39-52. JSTOR www.jstor.org/stable/3001536.
#'
#' Van der Waerden BL. Order tests for the two-sample problem and their power.
#' 1952;55:453-458. Ser A.
#'
#' Beasley TM, Erickson S, Allison DB. Rank-based inverse normal transformations
#' are increasingly used, but are they merited? Behav. Genet. 2009;39(5):
#' 580-595. pmid:19526352
#'
#'
#' @seealso  \code{\link{boxcox}}, \code{\link{lambert}},
#'   \code{\link{bestNormalize}}, \code{\link{yeojohnson}}
#' @importFrom stats qnorm glm
#' @export
orderNorm <- function(x, n_logit_fit = min(length(x), 10000), ..., warn = TRUE) {
  stopifnot(is.numeric(x))
  ties_status <- 0
  nunique <- length(unique(x))
  na_idx <- is.na(x)
  
  if (nunique < length(x)) {
    if(warn) warning('Ties in data, Normal distribution not guaranteed\n')
    ties_status <- 1
  }
  
  q.x <- (rank(x, na.last = 'keep') - .5) / (length(x) - sum(na_idx))
  x.t <- qnorm(q.x)
  
  # fit model for future extrapolation
  # create "reduced" x with n_logit_fit equally spaced observations
  keep_idx <- round(seq(1, length(x), length.out = min(length(x), n_logit_fit)))
  x_red <- x[order(x[!na_idx])[keep_idx]]
  n_red = length(x_red)
  q_red <- (rank(x_red, na.last = 'keep') - .5) / n_red
  
  # fit model for future extrapolation
  fit <- suppressWarnings(
    glm(q_red ~ x_red, family = 'binomial', 
        weights = rep(n_red, n_red))
  )
  
  ptest <- nortest::pearson.test(x.t)
  
  val <- list(
    x.t = x.t,
    x = x,
    n = length(x) - sum(is.na(x)),
    n_logit_fit =  n_logit_fit,
    ties_status = ties_status,
    fit = fit,
    norm_stat =  unname(ptest$statistic / ptest$df)
  )
  
  class(val) <- 'orderNorm'
  val
}

#' @rdname orderNorm
#' @method predict orderNorm
#' @export
predict.orderNorm <- function(object,
                              newdata = NULL,
                              inverse = FALSE, 
                              warn = TRUE,
                              ...) {
  stopifnot(is.null(newdata) || is.numeric(newdata))
  
  # Perform transformation
  if(!inverse) {
    if(is.null(newdata)) newdata <- object$x
    na_idx <- is.na(newdata)
    
    newdata[!na_idx] <- orderNorm_trans(object, newdata[!na_idx], warn)
    return(newdata)
  } 
  
  # Perform reverse transformation
  if (is.null(newdata)) newdata <- object$x.t
  
  na_idx <- is.na(newdata)
  newdata[!na_idx] <- inv_orderNorm_trans(object, newdata[!na_idx], warn)
  
  return(newdata)
}

#' @rdname orderNorm
#' @method print orderNorm
#' @importFrom stats quantile
#' @export
print.orderNorm <- function(x, ...) {
  cat('orderNorm Transformation with', x$n, 
      'nonmissing obs and', 
      ifelse(
        x$ties_status == 1, 
        paste0('ties\n - ', length(unique(x$x)), ' unique values'),
        'no ties'), '\n',
      '- Original quantiles:\n')
  print(round(quantile(x$x, na.rm = TRUE), 3))
}

#' @importFrom stats approx fitted predict.glm qnorm
orderNorm_trans <- function(orderNorm_obj, new_points, warn) {
  x_t <- orderNorm_obj$x.t
  old_points <- orderNorm_obj$x
  vals <- suppressWarnings(
    approx(old_points, x_t, xout = new_points, rule = 1)
  )
  
  # If predictions have been made outside observed domain
  if (any(is.na(vals$y))) {
    if (warn) warning('Transformations requested outside observed domain; logit approx. on ranks applied')
    fit <- orderNorm_obj$fit
    p <- qnorm(fitted(fit, type = "response"))
    l_idx <- vals$x < min(old_points, na.rm = TRUE)
    h_idx <- vals$x > max(old_points, na.rm = TRUE)
    
    # Check 
    if (any(l_idx)) {
      xx <- data.frame(x_red = vals$x[l_idx])
      vals$y[l_idx] <- qnorm(predict(fit, newdata = xx, type = 'response')) - 
        (min(p, na.rm = TRUE) - min(x_t, na.rm = TRUE))
      
    }
    if (any(h_idx)) {
      xx <- data.frame(x_red = vals$x[h_idx])
      vals$y[h_idx] <- qnorm(predict(fit, newdata = xx, type = 'response')) - 
        (max(p, na.rm = TRUE) - max(x_t, na.rm = TRUE))
    }
  }
  
  vals$y
}

#' @importFrom stats approx fitted qnorm pnorm
inv_orderNorm_trans <- function(orderNorm_obj, new_points_x_t, warn) {
  x_t <- orderNorm_obj$x.t
  old_points <- orderNorm_obj$x
  vals <- suppressWarnings(
    approx(x_t, old_points, xout = new_points_x_t, rule = 1)
  )
  
  # If predictions have been made outside observed domain
  if (any(is.na(vals$y))) {
    if(warn) warning('Transformations requested outside observed domain; logit approx. on ranks applied')
    
    fit <- orderNorm_obj$fit
    p <- qnorm(fitted(fit, type = "response"))
    l_idx <- vals$x < min(x_t, na.rm = TRUE)
    h_idx <- vals$x > max(x_t, na.rm = TRUE)
    
    # Check 
    if (any(l_idx)) {
      # Solve algebraically from original transformation
      logits <- log(pnorm(vals$x[l_idx] + min(p, na.rm = TRUE) - min(x_t, na.rm = TRUE)) / 
                      (1 - pnorm(vals$x[l_idx] + min(p, na.rm = TRUE) - min(x_t, na.rm = TRUE))))
      vals$y[l_idx] <- 
        unname((logits - fit$coef[1]) / fit$coef[2])
    }
    if (any(h_idx)) {
      logits <- log(pnorm(vals$x[h_idx] + max(p, na.rm = TRUE) - max(x_t, na.rm = TRUE)) / 
                      (1 - pnorm(vals$x[h_idx] + max(p, na.rm = TRUE) - max(x_t, na.rm = TRUE))))
      vals$y[h_idx] <- 
        unname((logits - fit$coef[1]) / fit$coef[2])
    }
  }
  
  vals$y
}