R/scale.R

#' Scale a data frame or matrix
#'
#' @details an improvement of the base R scale function. This function takes a data frame
#' or matrix of mixed variable types and scales the numeric columns according to one of
#' several options.
#' @export
#' @param data a data frame or matrix
#' @param scale.type one of "sample.sd" (default),"pop.sd", "median", "huber", "YJ", or "npn". pop.sd centers by the mean
#' and population standard deviation, while sample.sd uses the R default of the sample standard deviation formula as
#' in base::scale. "median" uses the median as the center and median absolute deviation as the scale. "huber"
#' uses Huber's robust estimates of location and scale (relying on the MASS::hubers function). "YJ" performs the Yeo-Johnson
#' transformation on the numeric variables. This is a generalization of the Box-Cox transformation
#' which allows for transforming variables with negative values. "npn" performs the non-paranormal transform, which
#' in my opinion is preferable to the Yeo-Johnson/Box-Cox transform. The non-paranormal transform takes the rank of
#' each value in a vector X and divides it by the number of samples+1 to obtain a vector Q, ie Q=rank(x)/(n + 1). It then plugs
#' each observation in Q into the normal quantile function and scaled by the standard deviation. This results in a normally distributed,
#' scaled, and centered, data set.
#'
#' @return A matrix or data frame
#' @examples
#' scale(data)
#'

scale = function (data, scale.type = "sample.sd")
{

  if (isTRUE(is.vector(data))) {
    Vector = "YES"
    data = cbind.data.frame(x = data)
  } else {
    Vector = "NO"
  }


  if (scale.type=="YJ") {



    yj = function(y, lambda.seq=c(-2,2)) {


      yeo.johnson = function (y, lambda, derivative = 0, epsilon = sqrt(.Machine$double.eps),
                              inverse = FALSE)
      {
        if (!is.numeric(derivative) ||
            derivative < 0)
          stop("argument 'derivative' must be a non-negative integer")
        ans <- y
        if (!is.numeric(epsilon))
          stop("argument 'epsilon' must be a single positive number")
        L <- max(length(lambda), length(y))
        if (length(y) != L)
          y <- rep_len(y, L)
        if (length(lambda) != L)
          lambda <- rep_len(lambda, L)

        if (derivative == 0) {
          if (any(index <- y >= 0 & abs(lambda) > epsilon))
            ans[index] <- ((y[index] + 1)^(lambda[index]) - 1)/lambda[index]
          if (any(index <- y >= 0 & abs(lambda) <= epsilon))
            ans[index] <- log1p(y[index])
          if (any(index <- y < 0 & abs(lambda - 2) > epsilon))
            ans[index] <- -((-y[index] + 1)^(2 - lambda[index]) -
                              1)/(2 - lambda[index])
          if (any(index <- y < 0 & abs(lambda - 2) <= epsilon))
            ans[index] <- -log1p(-y[index])
        }
        else {
          psi <- Recall(y = y, lambda = lambda, derivative = derivative -
                          1, epsilon = epsilon, inverse = inverse)
          if (any(index <- y >= 0 & abs(lambda) > epsilon))
            ans[index] <- ((y[index] + 1)^(lambda[index]) * (log1p(y[index]))^(derivative) -
                             derivative * psi[index])/lambda[index]
          if (any(index <- y >= 0 & abs(lambda) <= epsilon))
            ans[index] <- (log1p(y[index]))^(derivative + 1)/(derivative +
                                                                1)
          if (any(index <- y < 0 & abs(lambda - 2) > epsilon))
            ans[index] <- -((-y[index] + 1)^(2 - lambda[index]) *
                              (-log1p(-y[index]))^(derivative) - derivative *
                              psi[index])/(2 - lambda[index])
          if (any(index <- y < 0 & abs(lambda - 2) <= epsilon))
            ans[index] <- (-log1p(-y[index]))^(derivative + 1)/(derivative +
                                                                  1)
        }
        ans
      }


      trans = sapply(lambda.seq, function(x) yeo.johnson(y, lambda=x))
      skew = apply(trans, 2, function(x) psych::skew(x))
      wch = which.min(sapply(X=1:length(skew), function(x) abs(0 - skew[x])))
      results = trans[,wch]
      lamb = lambda.seq[wch]
      cat(crayon::blue(crayon::bold(paste("The chosen lambda for the Yeo-Johnson transform is", lamb, "\n"))))
      return(results)
    }

    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], yj)

    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

    if (isTRUE(is.matrix(data))) {
      return(as.matrix(scaled.data))
    }
    else {
      return(as.data.frame(scaled.data))
    }

  }

  if (scale.type == "npn") {

    Scale0 = function(x) {
      X = x
      n = length(x)
      Q = qt(rank(x)/(n + 1), df = 29)
      Q = Q/sd(Q)
      Q = as.vector(base::scale(Q))
      return(Q)
    }

    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], Scale0)

    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

      if (isTRUE(is.matrix(data))) {
        return(as.matrix(scaled.data))
      }
    else {
      return(as.data.frame(scaled.data))
    }
  }

  if (scale.type == "pop.sd") {
    Scale1 = function(x) {
      pop.sd = function(x) {
        n <- length(x)
        sd(x) * sqrt((n - 1)/n)
      }
      Mean = mean(x)
      Sd = pop.sd(x)
      x = x - Mean
      x = x/Sd
      x = qnorm(pnorm(x, mean = mean(x), sd = sd(x)))
      return(x)
    }
    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], Scale1)

    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

      if (isTRUE(is.matrix(data))) {
        return(as.matrix(scaled.data))
      }
    else {
      return(as.data.frame(scaled.data))
    }
  }
  if (scale.type == "sample.sd") {
    Scale2 = function(x) {
      Mean = mean(x)
      Sd = sd(x)
      x = x - Mean
      x = x/Sd
      return(x)
    }
    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], Scale2)

    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

      if (isTRUE(is.matrix(data))) {
        return(as.matrix(scaled.data))
      }
    else {
      return(as.data.frame(scaled.data))
    }
  }
  if (scale.type == "median") {
    Scale3 = function(x) {
      Median = median(x)
      Mad = mad(x)
      x = x - Median
      x = x/Mad
      return(x)
    }
    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], Scale3)
    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

      if (isTRUE(is.matrix(data))) {
        return(as.matrix(scaled.data))
      }
    else {
      return(as.data.frame(scaled.data))
    }
  }
  if (scale.type == "huber") {
    Scale4 = function(x) {
      huber = MASS::hubers(x, k = 1.482648, initmu = median(x),
                           tol = 1e-06)
      mu = huber$mu
      s = huber$s
      x = x - mu
      x = x/s
      return(x)
    }
    ind <- sapply(data, is.numeric)
    scaled.data = data
    scaled.data[ind] <- lapply(scaled.data[ind], Scale4)

    if (Vector=="YES") {
      return(as.vector(as.matrix(scaled.data)))
    } else

      if (isTRUE(is.matrix(data))) {
        return(as.matrix(scaled.data))
      }
    else {
      return(as.data.frame(scaled.data))
    }
  }
}
abnormally-distributed/abdisttools documentation built on May 5, 2019, 7:07 a.m.