Nothing
## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, fig.height = 5, fig.width = 7)
library(bestNormalize)
## ----orq_vis, echo = FALSE----------------------------------------------------
x <- iris$Petal.Width
on <- orderNorm(x, warn = FALSE)
xx <- seq(min(x) - 1, max(x) + 1, length = 1000)
yy <- suppressWarnings(predict(on, xx))
r <- ((rank(x) - .5) / (length(x)))
f <- suppressWarnings(glm(r ~ x, family = "binomial"))
p <- qnorm(predict(f, newdata = data.frame(x = xx), type = 'response'))
plot(x, on$x.t, pch = 20, xlim = range(xx), ylim = range(p, yy), main = "ORQ transformation",
xlab = "Original Value", ylab = "Transformed Value")
lines(xx, p, col = '1', lwd = 1, lty =2)
lines(xx, yy, col = 'slateblue', lwd = 2)
# Add legend
legend('bottomright',
c('Original data', 'Tranformed values for new data',
'Approximation for Extrapolation'),
bty = 'n', lty = c(0, 1, 2), lwd = c(0, 2,1),
pch = c(20, NA, NA),
col = c(1, "slateblue", 1, 1))
## ----par_vis, echo = FALSE, out.width = "75%"---------------------------------
knitr::include_graphics("parallel_timings.jpg")
## ----gen_data-----------------------------------------------------------------
# Generate some data
set.seed(100)
x <- rgamma(250, 1, 1)
MASS::truehist(x, nbins = 12)
## ----vis_code-----------------------------------------------------------------
# Perform some tranformations individually
# arcsinh transformation
(arcsinh_obj <- arcsinh_x(x))
# Box Cox's Transformation
(boxcox_obj <- boxcox(x))
# Yeo-Johnson's Transformation
(yeojohnson_obj <- yeojohnson(x))
# orderNorm Transformation
(orderNorm_obj <- orderNorm(x))
# Pick the best one automatically
(BNobject <- bestNormalize(x))
# Last resort - binarize
(binarize_obj <- binarize(x))
## ----vis_data-----------------------------------------------------------------
xx <- seq(min(x), max(x), length = 100)
plot(xx, predict(arcsinh_obj, newdata = xx), type = "l", col = 1, ylim = c(-4, 4),
xlab = 'x', ylab = "g(x)")
lines(xx, predict(boxcox_obj, newdata = xx), col = 2)
lines(xx, predict(yeojohnson_obj, newdata = xx), col = 3)
lines(xx, predict(orderNorm_obj, newdata = xx), col = 4)
legend("bottomright", legend = c("arcsinh", "Box Cox", "Yeo-Johnson", "OrderNorm"),
col = 1:4, lty = 1, bty = 'n')
## ----hist_trans, fig.height=8, fig.width = 7----------------------------------
par(mfrow = c(2,2))
MASS::truehist(arcsinh_obj$x.t, main = "Arcsinh transformation", nbins = 12)
MASS::truehist(boxcox_obj$x.t, main = "Box Cox transformation", nbins = 12)
MASS::truehist(yeojohnson_obj$x.t, main = "Yeo-Johnson transformation", nbins = 12)
MASS::truehist(orderNorm_obj$x.t, main = "orderNorm transformation", nbins = 12)
## ----hist_best----------------------------------------------------------------
par(mfrow = c(1,2))
MASS::truehist(BNobject$x.t,
main = paste("Best Transformation:",
class(BNobject$chosen_transform)[1]), nbins = 12)
plot(xx, predict(BNobject, newdata = xx), type = "l", col = 1,
main = "Best Normalizing transformation", ylab = "g(x)", xlab = "x")
## ----boxplot, fig.width=10, out.width="100%"----------------------------------
boxplot(log10(BNobject$oos_preds), yaxt = 'n')
axis(2, at=log10(c(.1,.5, 1, 2, 5, 10)), labels=c(.1,.5, 1, 2, 5, 10))
## ----bn_output----------------------------------------------------------------
bestNormalize(x, allow_orderNorm = FALSE, out_of_sample = FALSE)
## ----scales_ex----------------------------------------------------------------
library(ggplot2)
x <- rgamma(1000, 1, .1)
bn <- bestNormalize(x)
bn
# say y is related linearly to the transformed x
y <- bn$x.t * 1 + rnorm(1000)
# A log transformation does OK...
ggplot(data.frame(x=x,y=y), aes(x, y)) +
geom_point() +
scale_x_continuous(trans = "log", breaks = scales::log_breaks())
# Create bestNormalize scale for use in ggplot (using bestNormalize object)
bn_trans <- scales::trans_new(
name = "bn_trans",
trans = function(x) predict(bn, newdata = x),
inverse = function(x) predict(bn, newdata = x, inverse = TRUE)
)
ggplot(data.frame(x=x,y=y), aes(x, y)) +
geom_point() +
scale_x_continuous(trans = bn_trans)
## ----load_appdata-------------------------------------------------------------
data("autotrader")
autotrader$yearsold <- 2017 - autotrader$Year
### Using best-normalize
(priceBN <- bestNormalize(autotrader$price, r = 1, k = 5))
## ----bn_mileage---------------------------------------------------------------
(mileageBN <- bestNormalize(autotrader$mileage, r = 1, k = 5))
## ----bn_yearsold--------------------------------------------------------------
(yearsoldBN <- bestNormalize(autotrader$yearsold, r = 1, k = 5))
## ----hist_app, fig.height=8, fig.width=7--------------------------------------
par(mfrow = c(3, 2))
MASS::truehist(autotrader$price)
MASS::truehist(priceBN$x.t)
MASS::truehist(autotrader$mileage)
MASS::truehist(mileageBN$x.t)
MASS::truehist(autotrader$yearsold)
MASS::truehist(yearsoldBN$x.t)
## ----hist_app2----------------------------------------------------------------
par(mfrow = c(2, 2))
price.xx <- seq(min(autotrader$price), max(autotrader$price), length = 100)
mileage.xx <- seq(min(autotrader$mileage), max(autotrader$mileage), length = 100)
yearsold.xx <- seq(min(autotrader$yearsold), max(autotrader$yearsold), length = 100)
plot(price.xx, predict(priceBN, newdata = price.xx), type = "l",
main = "Price bestNormalizing transformation",
xlab = "Price ($)", ylab = "g(price)")
plot(mileage.xx, predict(mileageBN, newdata = mileage.xx), type = "l",
main = "Mileage bestNormalizing transformation",
xlab = "Mileage", ylab = "g(Mileage)")
plot(yearsold.xx, predict(yearsoldBN, newdata = yearsold.xx), type = "l",
main = "Years-old bestNormalizing transformation",
xlab = "Years-old", ylab = "g(Years-old)")
## ----app_vis------------------------------------------------------------------
autotrader$price.t <- priceBN$x.t
autotrader$mileage.t <- mileageBN$x.t
autotrader$yearsold.t <- yearsoldBN$x.t
fit4 <- lm(price.t ~ mileage.t + yearsold.t,
data = autotrader)
summary(fit4)
miles.t <- predict(mileageBN, newdata = mileage.xx)
c1 <- coef(fit4)["mileage.t"]
plot_idx <- sample(1:nrow(autotrader), size = 1000)
par(mfrow = c(1, 1))
plot(
mileageBN$x.t[plot_idx],
priceBN$x.t[plot_idx],
pch = 16,
col = grey(.1, alpha = .2),
main = "Estimated linear effect (using transformed data)",
xlab = "g(Mileage)",
ylab = "g(Price)"
)
lines(miles.t,
coef(fit4)[1] + c1 * miles.t,
col = "slateblue",
lwd = 2)
## Mileage effect
plot(
autotrader$mileage[plot_idx],
autotrader$price[plot_idx],
pch = 16,
col = grey(.1, alpha = .2),
main = "Mileage effect (re-transformed to original unit)",
xlab = "Mileage",
ylab = "Price"
)
line_vals <- miles.t * c1 + coef(fit4)[1]
lines(
mileage.xx,
y = predict(priceBN, newdata = line_vals, inverse = TRUE),
lwd = 2,
col = "slateblue"
)
# Compare to GAM fit
fit_gam <- mgcv::gam(price ~ s(yearsold) + s(mileage), data = autotrader)
p_gam <- predict(fit_gam, newdata = data.frame(yearsold = mean(autotrader$yearsold),
mileage = mileage.xx))
lines(mileage.xx, p_gam, lwd = 2, col = 'green3')
legend(
'topright',
c("GAM fit", "Transformed linear fit"),
lwd = 2,
col = c("green3", "slateblue"),
bty = "n"
)
## Years Old effect
yo.t <- predict(yearsoldBN, newdata = yearsold.xx)
c2 <- coef(fit4)["yearsold.t"]
plot(
jitter(autotrader$yearsold[plot_idx], 1.5),
autotrader$price[plot_idx],
pch = 16,
col = grey(.1, alpha = .2),
main = "Years old effect (re-transformed to original unit)",
xlab = "Age (Jittered)",
ylab = "Price"
)
line_vals <- yo.t * c2 + coef(fit4)[1]
lines(
yearsold.xx,
y = predict(priceBN, newdata = line_vals, inverse = TRUE),
lwd = 2,
col = "slateblue"
)
# Compare to GAM fit
p_gam <- predict(fit_gam, newdata = data.frame(yearsold = yearsold.xx,
mileage = mean(autotrader$mileage)))
lines(yearsold.xx, p_gam, lwd = 2, col = 'green3')
legend(
'topright',
c("GAM fit", "Transformed linear fit"),
lwd = 2,
col = c("green3", "slateblue"),
bty = "n"
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.