R/yeo_johnson.R

Defines functions yeo_johnson_trans estimate_yeo_johnson_lambda predict.yeo_johnson yeo_johnson

Documented in predict.yeo_johnson yeo_johnson

# Most code inspired by https://github.com/petersonR/bestNormalize/blob/master/R/yeojohnson.R

#' Yeo-Johnson transformation
#'
#' Perform Yeo-Johnson transformation to attempt normalization.
#'
#' @param x a numeric vector of values to make normal.
#' @param lambda the value of the lambda parameter. When NULL, an appropriate value is estimated from the data.
#' @param eps the tolerance value under which lamba is considered to be 0 and that is used to choose the appropriate formula.
#'
#' @details The Yeo-Johnson is similar to the Box-Cox method, however it allows for the transformation of non-positive data as well.
#'
#' @return A vector of transformed values, with an attribute to store the lambda value. The object if of class 'yeo_johnson' and can be used with [predict.yeo_johnson()].
#'
#' @references Yeo, I. K., & Johnson, R. A. (2000). A new family of power transformations to improve normality or symmetry. Biometrika.
#'
#' @seealso [predict.yeo_johnson()] to apply the same transformation to a new dataset.
#'
#' @export
#' @examples
#' # simulate non-normal data
#' x <- rgamma(100, 1, 1)
#' hist(x)
#' # and make it more normal looking
#' hist(yeo_johnson(x))
yeo_johnson <- function(x, lambda=NULL, eps=0.001) {
  stopifnot(is.numeric(x))

  # estimate the value of lambda if needed
  if (is.null(lambda)) {
    lambda <- estimate_yeo_johnson_lambda(x, eps=eps)
  }

  # transform non-missing data
  x_t <- x
  na_idx <- is.na(x)
  x_t[!na_idx] <- yeo_johnson_trans(x[!na_idx], lambda=lambda, eps=eps)

  # prepare output
  attr(x_t, "lambda") <- lambda
  class(x_t) <- c("yeo_johnson", class(x_t))
  return(x_t)
}

#' Transform new data using an already fitted Yeo-Johnson object
#'
#' @param object an object of class 'yeo_johnson'
#' @param newdata a numeric vector of data to transform.
#' @param ... further arguments passed to or from other methods.
#'
#' @export
#' @examples
#' # fit the Yeo=-Johnson transfomration on non-normal data
#' x <- rgamma(100, 1, 1)
#' hist(x)
#' xt <- yeo_johnson(x)
#' # apply the same transformation to new data
#' x2 <- rgamma(100, 1, 1)
#' hist(x2)
#' hist(predict(xt, newdata=x2))
predict.yeo_johnson <- function(object, newdata=NULL, ...) {
  if (is.null(newdata)) {
    # just output the already transformed data
    newdata <- object
  } else {
    # run the transformation
    na_idx <- is.na(newdata)
    newdata[!na_idx] <- yeo_johnson_trans(newdata[!na_idx], attr(object, "lambda"))
  }
  return(newdata)
}

# Helper function that estimates the lambda parameter
#' @importFrom stats var optimize
estimate_yeo_johnson_lambda <- function(x, lower=-5, upper=5, eps=0.001) {
  n <- length(x)
  ccID <- !is.na(x)
  x <- x[ccID]

  # See Yeo & Johnson Biometrika (2000)
  yj_loglik <- function(lambda) {
    x_t <- yeo_johnson_trans(x, lambda, eps)
    x_t_bar <- mean(x_t)
    x_t_var <- var(x_t) * (n - 1) / n
    constant <- sum(sign(x) * log(abs(x) + 1))
    -0.5 * n * log(x_t_var) + (lambda - 1) * constant
  }

  results <- optimize(yj_loglik, lower=lower,
                      upper=upper, maximum=TRUE,
                      tol=.0001)

  return(results$maximum)
}

# Workhouse function that performs the transformation
yeo_johnson_trans <- function(x, lambda, eps=0.001) {
  pos_idx <- x >= 0
  neg_idx <- x < 0

  # Transform positive (non-negative) values
  if (any(pos_idx)) {
    if (abs(lambda) < eps) {
      x[pos_idx] <- log(x[pos_idx] + 1)
    } else {
      x[pos_idx] <- ((x[pos_idx] + 1) ^ lambda - 1) / lambda
    }
  }

  # Transform negative values
  if (any(neg_idx)){
    if (abs(lambda - 2) < eps) {
      x[neg_idx] <- - log(-x[neg_idx] + 1)
    } else {
      x[neg_idx] <- - ((-x[neg_idx] + 1) ^ (2 - lambda) - 1) / (2 - lambda)
    }
  }

  return(x)
}
jiho/morphr documentation built on May 11, 2024, 9:32 p.m.