bestNormalize.R
In bestNormalize: Normalizing Transformation Functions

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.height = 5, fig.width = 7)
library(bestNormalize)

## ----orq_vis, echo = FALSE----------------------------------------------------

x <- iris$Petal.Width
on <- orderNorm(x, warn = FALSE)
xx <- seq(min(x) - 1, max(x) + 1, length = 1000)
yy <- suppressWarnings(predict(on, xx))
r <- ((rank(x) - .5) / (length(x)))
f  <- suppressWarnings(glm(r ~ x, family = "binomial"))
p <- qnorm(predict(f, newdata = data.frame(x = xx), type = 'response'))

plot(x, on$x.t, pch = 20, xlim = range(xx), ylim = range(p, yy), main = "ORQ transformation",
      xlab = "Original Value", ylab = "Transformed Value")
lines(xx, p, col = '1', lwd = 1, lty =2)
lines(xx, yy, col = 'slateblue', lwd = 2)

# Add legend
legend('bottomright', 
       c('Original data', 'Tranformed values for new data', 
         'Approximation for Extrapolation'), 
       bty = 'n', lty = c(0, 1, 2), lwd = c(0, 2,1), 
       pch = c(20, NA, NA), 
       col = c(1, "slateblue", 1, 1))

## ----par_vis, echo = FALSE, out.width = "75%"---------------------------------
knitr::include_graphics("parallel_timings.jpg")

## ----gen_data-----------------------------------------------------------------

# Generate some data
set.seed(100)
x <- rgamma(250, 1, 1)
MASS::truehist(x, nbins = 12)

## ----vis_code-----------------------------------------------------------------
# Perform some tranformations individually

# arcsinh transformation
(arcsinh_obj <- arcsinh_x(x))
# Box Cox's Transformation
(boxcox_obj <- boxcox(x))
# Yeo-Johnson's Transformation
(yeojohnson_obj <- yeojohnson(x))
# orderNorm Transformation
(orderNorm_obj <- orderNorm(x))

# Pick the best one automatically
(BNobject <- bestNormalize(x))

# Last resort - binarize
(binarize_obj <- binarize(x))


## ----vis_data-----------------------------------------------------------------
xx <- seq(min(x), max(x), length = 100)
plot(xx, predict(arcsinh_obj, newdata = xx), type = "l", col = 1, ylim = c(-4, 4),
     xlab = 'x', ylab = "g(x)")
lines(xx, predict(boxcox_obj, newdata = xx), col = 2)
lines(xx, predict(yeojohnson_obj, newdata = xx), col = 3)
lines(xx, predict(orderNorm_obj, newdata = xx), col = 4)

legend("bottomright", legend = c("arcsinh", "Box Cox", "Yeo-Johnson", "OrderNorm"), 
       col = 1:4, lty = 1, bty = 'n')

## ----hist_trans, fig.height=8, fig.width = 7----------------------------------
par(mfrow = c(2,2))
MASS::truehist(arcsinh_obj$x.t, main = "Arcsinh transformation", nbins = 12)
MASS::truehist(boxcox_obj$x.t, main = "Box Cox transformation", nbins = 12)
MASS::truehist(yeojohnson_obj$x.t, main = "Yeo-Johnson transformation", nbins = 12)
MASS::truehist(orderNorm_obj$x.t, main = "orderNorm transformation", nbins = 12)

## ----hist_best----------------------------------------------------------------
par(mfrow = c(1,2))
MASS::truehist(BNobject$x.t, 
               main = paste("Best Transformation:", 
                            class(BNobject$chosen_transform)[1]), nbins = 12)
plot(xx, predict(BNobject, newdata = xx), type = "l", col = 1, 
     main = "Best Normalizing transformation", ylab = "g(x)", xlab = "x")

## ----boxplot, fig.width=10, out.width="100%"----------------------------------
boxplot(log10(BNobject$oos_preds), yaxt = 'n')
axis(2, at=log10(c(.1,.5, 1, 2, 5, 10)), labels=c(.1,.5, 1, 2, 5, 10))

## ----bn_output----------------------------------------------------------------
bestNormalize(x, allow_orderNorm = FALSE, out_of_sample = FALSE)

## ----scales_ex----------------------------------------------------------------
library(ggplot2)

x <- rgamma(1000, 1, .1)
bn <- bestNormalize(x)

bn

# say y is related linearly to the transformed x
y <- bn$x.t * 1 + rnorm(1000)

# A log transformation does OK...
ggplot(data.frame(x=x,y=y), aes(x, y)) +
  geom_point() + 
  scale_x_continuous(trans = "log", breaks = scales::log_breaks())

# Create bestNormalize scale for use in ggplot (using bestNormalize object)
bn_trans <- scales::trans_new(
  name = "bn_trans",
  trans = function(x) predict(bn, newdata = x),
  inverse = function(x) predict(bn, newdata = x, inverse = TRUE)
)


ggplot(data.frame(x=x,y=y), aes(x, y)) +
  geom_point() + 
  scale_x_continuous(trans = bn_trans)


## ----load_appdata-------------------------------------------------------------
data("autotrader")
autotrader$yearsold <- 2017 - autotrader$Year
### Using best-normalize
(priceBN <- bestNormalize(autotrader$price, r = 1, k = 5))

## ----bn_mileage---------------------------------------------------------------
(mileageBN <- bestNormalize(autotrader$mileage, r = 1, k = 5))

## ----bn_yearsold--------------------------------------------------------------
(yearsoldBN <- bestNormalize(autotrader$yearsold, r = 1, k = 5))

## ----hist_app, fig.height=8, fig.width=7--------------------------------------
par(mfrow = c(3, 2))
MASS::truehist(autotrader$price)
MASS::truehist(priceBN$x.t)
MASS::truehist(autotrader$mileage)
MASS::truehist(mileageBN$x.t)
MASS::truehist(autotrader$yearsold)
MASS::truehist(yearsoldBN$x.t)

## ----hist_app2----------------------------------------------------------------
par(mfrow = c(2, 2))
price.xx <- seq(min(autotrader$price), max(autotrader$price), length = 100)
mileage.xx <- seq(min(autotrader$mileage), max(autotrader$mileage), length = 100)
yearsold.xx <- seq(min(autotrader$yearsold), max(autotrader$yearsold), length = 100)

plot(price.xx, predict(priceBN, newdata = price.xx), type = "l", 
     main = "Price bestNormalizing transformation", 
     xlab = "Price ($)", ylab = "g(price)")
plot(mileage.xx, predict(mileageBN, newdata = mileage.xx), type = "l", 
     main = "Mileage bestNormalizing transformation", 
     xlab = "Mileage", ylab = "g(Mileage)")
plot(yearsold.xx, predict(yearsoldBN, newdata = yearsold.xx), type = "l", 
     main = "Years-old bestNormalizing transformation", 
     xlab = "Years-old", ylab = "g(Years-old)")

## ----app_vis------------------------------------------------------------------
autotrader$price.t <- priceBN$x.t
autotrader$mileage.t <- mileageBN$x.t
autotrader$yearsold.t <- yearsoldBN$x.t

fit4 <- lm(price.t ~ mileage.t + yearsold.t,
           data = autotrader)
summary(fit4)

miles.t <- predict(mileageBN, newdata = mileage.xx)
c1 <- coef(fit4)["mileage.t"]
plot_idx <- sample(1:nrow(autotrader), size = 1000)

par(mfrow = c(1, 1))
plot(
    mileageBN$x.t[plot_idx],
    priceBN$x.t[plot_idx],
    pch = 16,
    col = grey(.1, alpha = .2),
    main = "Estimated linear effect (using transformed data)",
    xlab = "g(Mileage)",
    ylab = "g(Price)"
)
lines(miles.t,
      coef(fit4)[1] + c1 * miles.t,
      col = "slateblue",
      lwd = 2)

## Mileage effect
plot(
  autotrader$mileage[plot_idx],
  autotrader$price[plot_idx],
  pch = 16,
  col = grey(.1, alpha = .2),
  main = "Mileage effect (re-transformed to original unit)",
  xlab = "Mileage",
  ylab = "Price"
)
line_vals <- miles.t * c1 + coef(fit4)[1]
lines(
    mileage.xx,
    y = predict(priceBN, newdata = line_vals, inverse = TRUE),
    lwd = 2,
    col = "slateblue"
)
# Compare to GAM fit
fit_gam <- mgcv::gam(price ~ s(yearsold) + s(mileage), data = autotrader)
p_gam <- predict(fit_gam, newdata = data.frame(yearsold = mean(autotrader$yearsold), 
                                               mileage = mileage.xx))
lines(mileage.xx, p_gam, lwd = 2, col = 'green3')

legend(
  'topright',
  c("GAM fit", "Transformed linear fit"),
  lwd = 2,
  col = c("green3", "slateblue"),
  bty = "n"
  )

## Years Old effect
yo.t <- predict(yearsoldBN, newdata = yearsold.xx)
c2 <- coef(fit4)["yearsold.t"]

plot(
    jitter(autotrader$yearsold[plot_idx], 1.5),
    autotrader$price[plot_idx],
    pch = 16,
    col = grey(.1, alpha = .2), 
    main = "Years old effect (re-transformed to original unit)",
    xlab = "Age (Jittered)",
    ylab = "Price"
)
line_vals <- yo.t * c2 + coef(fit4)[1]
lines(
    yearsold.xx,
    y = predict(priceBN, newdata = line_vals, inverse = TRUE),
    lwd = 2,
    col = "slateblue"
)

# Compare to GAM fit
p_gam <- predict(fit_gam, newdata = data.frame(yearsold = yearsold.xx, 
                                               mileage = mean(autotrader$mileage)))
lines(yearsold.xx, p_gam, lwd = 2, col = 'green3')

legend(
  'topright',
  c("GAM fit", "Transformed linear fit"),
  lwd = 2,
  col = c("green3", "slateblue"),
  bty = "n"
  )