Nothing
ppMethods <- c("BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", "knnImpute", "bagImpute", "pca", "ica", "spatialSign", "medianImpute")
preProcess <- function(x, ...) UseMethod("preProcess")
preProcess.default <- function(x, method = c("center", "scale"),
thresh = 0.95,
pcaComp = NULL,
na.remove = TRUE,
k = 5,
knnSummary = mean,
outcome = NULL,
fudge = .2,
numUnique = 3,
verbose = FALSE,
...)
{
ppMethods <- c("BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", "knnImpute", "bagImpute", "medianImpute", "pca", "ica", "spatialSign")
if(any(!(method %in% ppMethods))) stop(paste("'method' should be one of:", paste(ppMethods, collapse = ", ")))
if(is.null(method)) stop("NULL values of 'method' are not allowed")
if(any(method %in% "range") & any(method %in% c("center", "scale", "BoxCox")))
stop("centering, scaling and/or Box-Cox transformations are inconsistent with scaling to a range of [0, 1]")
if(all(c("pca", "ica") %in% method))
{
warning("fastICA automatically uncorrelates the data using PCA. method = 'pca' is not needed")
method <- method[method != "pca"]
}
if(any(method %in% c("pca", "ica", "knnImpute", "spatialSign")) & !(any(method == "scale"))) method <- c(method, "scale")
if(any(method %in% c("pca", "ica", "knnImpute", "spatialSign")) & !(any(method == "center"))) method <- c(method, "center")
if(sum(c("knnImpute","bagImpute", "medianImpute") %in% method) > 1)
stop("please pick only one imputation method")
method <- unique(method)
## the row.norm option in fastICA states: "logical value indicating whether rows
## of the data matrix X should be standardized beforehand." Basically, this means that
## we would center *and* scale before the ICA step, so let's adjust the "scale" method too
if(any(method == "ica"))
{
theDots <- list(...)
row.norm <- if(is.null(list(...)$row.norm)) FALSE else list(...)$row.norm
if(row.norm & !(any(method == "scale"))) method <- c(method, "scale")
}
if(is.matrix(x))
{
if(!is.numeric(x)) stop("x must be numeric")
}
if(is.data.frame(x))
{
isFactor <- unlist(lapply(x, is.factor))
isChar <- unlist(lapply(x, is.character))
if(any(isFactor | isChar)) stop("all columns of x must be numeric")
}
theCall <- match.call(expand.dots = TRUE)
if(any(method == "BoxCox"))
{
if(verbose) cat("Estimating Box-Cox transformations for the predictors...")
bc <- lapply(x,
BoxCoxTrans,
fudge = fudge,
na.rm = na.remove,
numUnique = numUnique,
x = if(is.null(outcome)) rep(1, nrow(x)) else outcome)
if(verbose) cat(" applying them to training data\n")
## Find a better way of doing this
for(i in seq(along = bc)) x[,i] <- predict(bc[[i]], x[,i])
} else bc <- NULL
if(any(method == "YeoJohnson"))
{
yjWrap <- function(x, numUnique = numUnique)
{
if(length(unique(x)) >= numUnique)
{
out <- try(powerTransform(y ~ 1,
data = data.frame(y = x[!is.na(x)]),
family = "yjPower"),
silent = TRUE)
if(class(out)[1] == "try-error") out <- NA
} else out <- NA
out
}
if(verbose) cat("Estimating Yeo-Johnson transformations for the predictors...")
yj <- lapply(x, yjWrap, numUnique = numUnique)
if(verbose) cat(" applying them to training data\n")
## Find a better way of doing this
lam <- unlist(lapply(yj, function(x) if(class(x) == "powerTransform") x$lambda else NA))
lam <- lam[!is.na(lam)]
if(length(lam) > 0)
{
for(i in seq(along = lam))
{
who <- gsub("\\.Y1$", "", names(lam)[i])
x[,who] <- yjPower(x[,who], yj[[who]]$lambda)
}
}
} else yj <- NULL
if(any(method == "expoTrans"))
{
if(verbose) cat("Estimating exponential transformations for the predictors...")
et <- lapply(x, expoTrans.default, numUnique = numUnique)
if(verbose) cat(" applying them to training data\n")
for(i in seq(et)) x[,i] <- predict(et[[i]], x[,i])
} else et <- NULL
if(any(method %in% c("center")))
{
if(verbose) cat("Calculating means for centering\n")
centerValue <- apply(x, 2, mean, na.rm = na.remove)
} else centerValue <- NULL
if(any(method %in% c("scale")))
{
if(verbose) cat("Calculating standard deviations for scaling\n")
scaleValue <- apply(x, 2, sd, na.rm = na.remove)
} else scaleValue <- NULL
if(any(method == "range"))
{
ranges <- apply(x, 2, function(x) c(min(x, na.rm = na.remove), max(x, na.rm = na.remove)))
} else ranges <- NULL
if(any(scaleValue == 0))
{
warning(
paste(
"These variables have zero variances:",
paste(
names(scaleValue)[which(scaleValue == 0)],
collapse = ", ")))
scaleValue[which(scaleValue == 0)] <- 1
}
cols <- if(any(method == "knnImpute")) which(apply(x, 2, function(x) !any(is.na(x)))) else NULL
if(any(method == "bagImpute"))
{
if(verbose) cat("Computing bagging models for each predictor...")
bagModels <- as.list(colnames(x))
names(bagModels) <- colnames(x)
bagModels <- lapply(bagModels,
bagImp,
x = x)
if(verbose) cat(" done\n")
} else bagModels <- NULL
x <- x[complete.cases(x),,drop = FALSE]
if (any(method == "medianImpute"))
{
if(verbose) cat("Computing medians for each predictor...")
median <- sapply(x, median, na.rm=TRUE)
names(median) <- names(x)
if(verbose) cat(" done\n")
}
if(any(method == "pca"))
{
if(verbose) cat("Computing PCA loadings\n")
tmp <- prcomp(x, scale = TRUE, retx = FALSE)
if(is.null(pcaComp))
{
cumVar <- cumsum(tmp$sdev^2/sum(tmp$sdev^2))
numComp <- max(2, which.max(cumVar > thresh))
} else numComp <- pcaComp
rot <- tmp$rotation[,1:numComp]
} else {
rot <- NULL
numComp <- NULL
}
if(any(method == "ica"))
{
if(verbose) cat("Computing ICA loadings\n")
library(fastICA)
x <- sweep(x, 2, centerValue, "-")
if(!row.norm & any(method == "scale")) x <- sweep(x, 2, scaleValue, "/")
tmp <- fastICA(x, ...)
ica <- list(
## S = tmp$S, ## was used for debugging
row.norm = row.norm,
K = tmp$K,
W = tmp$W)
} else {
ica <- NULL
}
out <- list(call = theCall,
dim = dim(x),
bc = bc,
yj = yj,
et = et,
mean = centerValue,
std = scaleValue,
ranges = ranges,
rotation = rot,
method = method,
thresh = thresh,
pcaComp = pcaComp,
numComp = numComp,
ica = ica,
k = k,
knnSummary = knnSummary,
bagImp = bagModels,
median = median,
cols = cols,
data = if(any(method == "knnImpute")) scale(x[complete.cases(x),,drop = FALSE]) else NULL)
structure(out, class = "preProcess")
}
predict.preProcess <- function(object, newdata, ...)
{
dataNames <- colnames(newdata)
## For centering and scaling, we can be flexible if a column in the
## original data set is not in newdata
if(!is.null(object$mean))
{
if(!all(names(object$mean) %in% dataNames))
{
if(all(dataNames %in% names(object$mean)))
{
object$mean <- object$mean[names(object$mean) %in% dataNames]
warning("newdata does not contain some variables")
} else {
vars <- dataNames[!(dataNames %in% names(object$mean))]
stop(paste("The following variables were not pre-processed:",
paste(vars, collapse = ",")))
}
}
}
if(!is.null(object$std))
{
if(!all(names(object$std) %in% dataNames))
{
if(all(dataNames %in% names(object$std)))
{
object$std <- object$std[names(object$std) %in% dataNames]
}
}
}
if(!is.null(object$rotation))
{
if(!all(names(object$rotation) %in% dataNames))
{
stop("newdata does not contain some variables")
}
}
oldClass <- class(newdata)
if(!is.null(object$bc))
{
lam <- unlist(lapply(object$bc, function(x) x$lambda))
lamIndex <- which(!is.na(lam))
if(length(lamIndex) > 0)
{
for(i in names(lamIndex))
{
tt<- newdata[,i]
tt <- tt[!is.na(tt)]
if(any(tt <= 0))
{
cat(i, "\n")
}
newdata[,i] <- predict(object$bc[[i]], newdata[,i])
}
}
}
if(!is.null(object$yj))
{
lam <- unlist(lapply(object$yj, function(x) if(class(x) == "powerTransform") x$lambda else NA))
lam <- lam[!is.na(lam)]
if(length(lam) > 0)
{
for(i in seq(along = lam))
{
who <- gsub("\\.Y1$", "", names(lam)[i])
newdata[,who] <- yjPower(newdata[,who], object$yj[[who]]$lambda)
}
}
}
if(!is.null(object$et))
{
for(i in seq(along = object$et))
{
who <- names(object$et)[i]
newdata[,who] <- predict(object$et[[who]], newdata[,who])
}
}
if(any(object$method == "range"))
{
newdata <- sweep(newdata, 2, object$ranges[1,], "-")
newdata <- sweep(newdata, 2, object$ranges[2,] - object$ranges[1,], "/")
}
if(any(object$method == "center")) newdata <- sweep(newdata, 2, object$mean, "-")
if(any(object$method %in% c("scale"))) newdata <- sweep(newdata, 2, object$std, "/")
cc <- complete.cases(newdata)
if(any(object$method == "knnImpute") && any(!cc))
{
hasMiss <- newdata[!cc,,drop = FALSE]
hasMiss <- apply(hasMiss,
1,
nnimp,
old = object$data,
cols = object$cols,
k = object$k,
foo = object$knnSummary)
hasMiss <- t(hasMiss)
newdata[!cc,] <- hasMiss
}
if(any(object$method == "bagImpute") && any(!cc))
{
library(ipred)
hasMiss <- newdata[!cc,,drop = FALSE]
missingVars <- apply(hasMiss,
2,
function(x) any(is.na(x)))
missingVars <- names(missingVars)[missingVars]
## ipred's bagging procedure only allows for data frames
if(!is.data.frame(hasMiss)) hasMiss <- as.data.frame(hasMiss)
for(i in seq(along = missingVars))
{
preds <- predict(object$bagImp[[missingVars[i]]]$model,
hasMiss[, !colnames(hasMiss) %in% missingVars[i]])
hasMiss[is.na(hasMiss[,missingVars[i]]),
missingVars[i]] <- preds[is.na(hasMiss[,missingVars[i]])]
}
newdata[!cc,] <- hasMiss
}
if (any(object$method == "medianImpute") && any(!cc)) {
hasMiss <- newdata[!cc, , drop = FALSE]
missingVars <- apply(hasMiss, 2, function(x) any(is.na(x)))
missingVars <- names(missingVars)[missingVars]
if (!is.data.frame(hasMiss))
hasMiss <- as.data.frame(hasMiss)
for (i in seq(along = missingVars)) {
hasMiss[is.na(hasMiss[, missingVars[i]]), missingVars[i]] <- object$median[missingVars[i]]
}
newdata[!cc, ] <- hasMiss
}
if(any(object$method == "pca"))
{
newdata <-if(is.matrix(newdata)) newdata %*% object$rotation else as.matrix(newdata) %*% object$rotation
}
if(any(object$method == "ica"))
{
if(!is.matrix(newdata)) newdata <- as.matrix(newdata)
##if(object$ica$row.norm) newdata <- apply(newdata, 1, function(u) u/sd(u))
newdata <- newdata %*% object$ica$K %*% object$ica$W
colnames(newdata) <- paste("ICA", 1:ncol(object$ica$W), sep = "")
}
if(any(oldClass == "data.frame")) newdata <- as.data.frame(newdata)
if(any(object$method == "spatialSign")) newdata <- spatialSign(newdata)
if(!(any(object$method %in% c("pca", "ica")))) colnames(newdata) <- dataNames
newdata
}
print.preProcess <- function(x, ...)
{
printCall(x$call)
cat("Created from", x$dim[1], "samples and", x$dim[2], "variables\n")
pp <- x$method
pp <- gsub("BoxCox", "Box-Cox transformation", pp)
pp <- gsub("YeoJohnson", "Yeo-Johnson transformation", pp)
pp <- gsub("scale", "scaled", pp)
pp <- gsub("center", "centered", pp)
pp <- gsub("pca", "principal component signal extraction", pp)
pp <- gsub("ica", "independent component signal extraction", pp)
pp <- gsub("spatialSign", "spatial sign transformation", pp)
pp <- gsub("knnImpute", paste(x$k, "nearest neighbor imputation"), pp)
pp <- gsub("bagImpute", "bagged tree imputation", pp)
pp <- gsub("range", "re-scaling to [0, 1]", pp)
ppText <- paste("Pre-processing:", paste(pp, collapse = ", "))
cat(truncateText(ppText), "\n\n")
if(any(x$method == "BoxCox"))
{
cat("Lambda estimates for Box-Cox transformation:\n")
if(length(x$bc) < 11)
{
lmbda <- unlist(lapply(x$bc, function(x) x$lambda))
naLmbda <- sum(is.na(lmbda))
cat(paste(round(lmbda[!is.na(lmbda)], 2), collapse = ", "))
if(naLmbda > 0) cat(" (#NA: ", naLmbda, ")\n", sep = "")
} else print(summary(unlist(lapply(x$bc, function(x) x$lambda))))
cat("\n")
}
if(any(x$method == "YeoJohnson"))
{
cat("Lambda estimates for Yeo-Johnson transformation:\n")
if(length(x$yj) < 11)
{
lmbda <- unlist(lapply(x$yj, function(x) if(class(x) == "powerTransform") x$lambda else NA))
naLmbda <- sum(is.na(lmbda))
cat(paste(round(lmbda[!is.na(lmbda)], 2), collapse = ", "))
if(naLmbda > 0) cat(" (#NA: ", naLmbda, ")\n", sep = "")
} else print(summary(unlist(lapply(x$yj, function(x) if(class(x) == "powerTransform") x$lambda else NA))))
cat("\n")
}
if(any(x$method == "pca"))
{
if(is.null(x$pcaComp))
{
cat("PCA needed", x$numComp, ifelse(x$numComp > 1, "components", "component"),
"to capture", round(x$thresh*100, 2),
"percent of the variance\n")
} else {
cat("PCA used", x$pcaComp, ifelse(x$pcaComp > 1, "components", "component"), "as specified.\n")
}
}
if(any(x$method == "ica"))
{
cat("ICA used", ncol(x$ica$W), "components\n")
}
}
nnimp <- function(new, old, cols, k, foo)
{
library(RANN)
nms <- names(new)
cols2 <- which(!is.na(new))
new <- matrix(new, ncol = length(new))
colnames(new) <- nms
cols <- sort(intersect(cols2, cols))
nn <- nn2(old[, cols, drop = FALSE],
new[, cols, drop = FALSE],
k = k)
tmp <- old[nn$nn.idx, -cols, drop = FALSE]
##TODO deal with cases where training set has missing data
subs <- apply(tmp, 2, foo, na.rm = TRUE)
new[, -cols] <- subs
new
}
bagImp <- function(var, x, B = 10)
{
library(ipred)
## The formula interface is much slower than the
## (y, X) interface, but the latter would have to
## do case-wise deletion of samples from the
## training set.
if(!is.data.frame(x)) x <- as.data.frame(x)
mod <- bagging(as.formula(paste(var, "~.")),
data = x,
nbagg = B)
list(var = var,
model = mod)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.