Nothing
#' General Imputation Framework in R
#'
#' @description Impute missing values under the general framework in R
#' @details This function can impute several kinds of data, including continuous-only
#' data, categorical-only data and mixed-type data. Many methods can be used, including
#' regularisation method like LASSO and ridge regression, tree-based model and dimensionality
#' reduction method like PCA and PLS.
#'
#' @param missdata data matrix with missing values encoded as NA.
#' @param lmFun the variable selection method for continuous data.
#' @param cFun the variable selection method for categorical data.
#' @param ini the method for initilisation. It is a length one character if
#' missdata contains only one type of variables only. For continous only data,
#' ini can be "mean" (mean imputation), "median" (median imputation) or "random"
#' (random guess), the default is "mean". For categorical data, it can be
#' either "majority" or "random", the default is "majority". If missdata is
#' mixed of continuous and categorical data, then ini has to be a vector of two
#' characters, with the first element indicating the method for continous
#' variables and the other element for categorical variables, and the default
#' is c("mean", "majority".)
#' @param maxiter is the maximum number of interations
#' @param verbose is logical, if TRUE then detailed information will
#' be printed in the console while running.
#' @param conv logical, if TRUE, the convergence details will be returned
#'
#' @return if conv = FALSE, it returns a completed data matrix with no
#' missing values; if TRUE, it rrturns a list of components including:
#'
#' \item{imp}{the imputed data matrix with no missing values}
#' \item{conv}{the convergence status during the imputation}
#'
#' @seealso \code{\link{SimIm}} for missing value simulation.
#' @importFrom stats na.omit predict
#' @export
#' @examples
#' data(parkinson)
#' # introduce 10% random missing values into the parkinson data
#' missdata <- SimIm(parkinson, 0.1)
#' # impute the missing values by LASSO
#' \donttest{
#' impdata <- impute(missdata, lmFun = "lassoR")
#' # calculate the normalised RMSE for the imputation
#' Rmse(impdata$imp, missdata, parkinson, norm = TRUE)
#' }
impute <- function(missdata, lmFun = NULL, cFun = NULL, ini = NULL,
maxiter = 100, verbose = TRUE, conv = TRUE) {
Type <- Detect(missdata)
if(all(Type == "numeric")) {
task <- 1
if(is.null(ini)) {
ini = "mean"
} else {
stopifnot(ini %in% c("mean", "median", "random"))
}
names(task) <- "Regression"
} else if(all(Type == "character")) {
task <- 2
if(is.null(ini)) {
ini = "majority"
} else {
stopifnot(ini %in% c("majority", "random"))
}
names(task) <- 'Classification'
} else {
task <- 3
if(is.null(ini)) {
ini = c("mean", "majority")
} else {
if (length(ini) !=2) {
stop ("Data are mixed-type, provide two initial methods")
} else {
if (!(ini[1] %in% c("mean", "median", "random")) ||
!(ini[2] %in% c("majority", "random"))) {
stop ("ini has to be a two length character vector, check the help
about this argument and make sure you provide valid names")
}
}
}
names(task) <- "Regression and Classification mixed"
}
if (verbose) {
cat("Imputation task is:", names(task), "\n")
}
if (task == 1) {
stopifnot(!is.null(lmFun))
lmFUN <- match.fun(lmFun)
cFUN <- NULL
} else if (task == 2) {
stopifnot(!is.null(cFun))
lmFUN <- NULL
cFUN <- match.fun(cFun)
} else {
stopifnot(!is.null(cFun), !is.null(lmFun))
lmFUN <- match.fun(lmFun)
cFUN <- match.fun(cFun)
}
n <- nrow(missdata)
p <- ncol(missdata)
## remove completely missing variables
if (any(apply(is.na(missdata), 2, sum) == n)) {
ind <- which(apply(is.na(missdata), 2, sum) == n)
missdata <- missdata[, -ind]
p <- ncol(missdata)
cat('removed variable(s)', ind,
'due to the missingness of all entries\n')
}
## perform initial guess on miss
ximp <- missdata
if (task == 1) {
ximp <- guess(ximp, type = ini)
} else if (task == 2) {
ximp <- guess(ximp, type = ini)
} else {
for (i in seq_along(Type)) {
if (Type[i] == "numeric") {
ximp[, i] <- guess(missdata[, i], type = ini[1])
} else {
if (ini[2] == "majority") {
ximp[, i] <- as.numeric(major(missdata[, i]))
} else {
ximp[, i] <- guess(missdata[, i], type = "random")
}
}
}
}
# extract missingness pattern
NAloc <- is.na(missdata)
noNAvar <- apply(NAloc, 2, sum) # how many are missing in the vars
sort.j <- order(noNAvar, decreasing = TRUE) # indices of increasing amount of NA in vars
sort.noNAvar <- noNAvar[sort.j]
# ready for output
Ximp <- vector('list', maxiter)
iter <- 0
k <- length(unique(Type))
convNew <- rep(0, k)
convOld <- rep(Inf, k)
# setup convergence container w.r.t. task types
if (k == 1) {
if (unique(Type) == 'numeric'){
names(convNew) <- c('numeric')
} else {
names(convNew) <- c('character')
}
Converg <- rep(NA, maxiter)
} else {
names(convNew) <- c('numeric', 'character')
Converg <- matrix(NA, nrow = maxiter, ncol = 2)
}
# stopping function for the loop
stopCriterion <- function(Type, convNew, convOld, iter, maxiter) {
k <- length(unique(Type))
if (k == 1) {
(convNew < convOld) & (iter < maxiter)
} else {
((convNew[1] < convOld[1]) | (convNew[2] < convOld[2])) & (iter < maxiter)
}
}
while (stopCriterion(Type, convNew, convOld, iter, maxiter)) {
if (iter != 0){
convOld <- convNew
}
if (verbose) {
if (task == 1) {
cat("iteration", iter + 1, "using", lmFun,
"in progress...")
} else if (task == 2) {
cat("iteration", iter + 1, "using", cFun,
"in progress...")
} else {
cat("iteration", iter + 1, "using", lmFun, "and", cFun,
"in progress...")
}
}
ximp.old <- ximp
for (s in 1:p) {
varInd <- sort.j[s]
if (noNAvar[[varInd]] != 0) {
obsi <- !NAloc[, varInd] # which i's are observed
misi <- NAloc[, varInd] # which i's are missing
obsY <- ximp[obsi, varInd] # training response
obsX <- as.matrix(ximp[obsi, seq(1, p)[-varInd]]) # training variables
colnames(obsX) <- paste0("x", 1:ncol(obsX))
misX <- as.matrix(ximp[misi, seq(1, p)[-varInd]]) # predictors
# as.df can be overwritten if some known functions that require data.frame
# by their preidction function are called.
if (task == 1 && (lmFun %in%
c("stepR", "ridgeR", "stepBothR", "stepBackR", "stepForR"))) {
# some class of predction functions require that the newdata be
# a data.frame rather than a matrix, so we need to tranform the
# new data (matrix) into a data frame
misX <- as.data.frame(misX)
} else if (task == 2 && (cFun %in%
c("stepBothC", "stepBackC", "stepForC", "rpartC",
"treeC", "gbmC", "ridgeC"))) {
misX <- as.data.frame(misX)
} else if (task == 3) {
if ((lmFun %in% c("stepR", "ridgeR", "stepBothR", "stepBackR", "stepForR")) ||
(cFun %in% c("stepBothC", "stepBackC", "stepForC", "rpartC",
"treeC", "gbmC", "ridgeC")))
misX <- as.data.frame(misX)
}
colnames(misX) <- colnames(obsX)
typeY <- Type[varInd]
## train model (with automated variable selction) on observed data
if (typeY == "numeric") {
Miss <- lmFUN(x = obsX, y = obsY)
if (lmFun %in% c("pcrR", "plsR")) {
misY <- predict(Miss, misX, ncomp = 2, type = "response")
} else if (lmFun == "CubistR") {
misY <- predict(Miss$model, misX, neighbors = Miss$neighbors)
} else if (lmFun == "glmboostR") {
misY <- predict(Miss, newdata = as.data.frame(misX))
} else {
misY <- predict(Miss, misX)
}
} else {
obsY2 <- factor(obsY)
summarY <- summary(obsY2)
if (length(summarY) == 1) {
# if all values of obsY is the same then using model would be
# unnecessary
misY <- factor(rep(names(summarY), sum(misi)))
} else {
if (cFun %in% c("randomForest", "RRF")) {
obsY <- factor(obsY)
}
Miss <- cFUN(x = obsX, y = obsY)
if (cFun %in% c("stepBothC", "stepBackC", "stepForC",
"lassoC", "ridgeC")) {
misY <- ifelse(predict(Miss, misX, type = "response") < 0.5, 0, 1)
} else if (cFun %in% c("rpartC", "randomForest", "RRF")) {
try <- predict(Miss, misX, type = "prob")
misY <- ifelse(try[, 1] > try[, 2], 0, 1)
} else if (cFun == "treeC") {
misY <- predict(Miss, misX, type = "class")
} else if (cFun == "gbmC") {
try <- predict(Miss$model, misX, Miss$best, type = "response")
misY <- ifelse(try < 0.5, 0, 1)
} else {
misY <- predict(Miss, misX)
}
}
}
## replace old imputed value with prediction
ximp[misi, varInd] <- misY
}
}
if (verbose) {
cat('done!\n')
}
iter <- iter + 1
Ximp[[iter]] <- ximp
# check the difference between iteration steps
# This implementation is really smart and is derived from the brillian MissForest
# package source.
t.co2 <- 1
for (t.type in names(convNew)) {
t.ind <- which(Type == t.type)
if (t.type == "numeric") {
convNew[t.co2] <- sum((ximp[, t.ind] - ximp.old[, t.ind])^2)/sum(ximp[, t.ind]^2)
} else {
dist <- sum(as.character(as.matrix(ximp[, t.ind])) !=
as.character(as.matrix(ximp.old[, t.ind])))
convNew[t.co2] <- dist/(n * sum(Type == "character"))
}
t.co2 <- t.co2 + 1
}
if (conv) {
if (k == 1) {
Converg[iter] <- convNew
} else {
Converg[iter, ] <- convNew
}
}
if (verbose) {
if (task == 3) {
cat("Difference after iteration", iter, "is", convNew[1],
"and", convNew[2], "\n")
} else {
cat("Difference after iteration", iter, "is", convNew, "\n")
}
}
}
## produce output w.r.t. stopping rule
if (iter == maxiter){
ximp = Ximp[[iter]]
} else {
ximp = Ximp[[iter - 1]]
}
if (conv) {
Converg <- na.omit(Converg)
return(list(imp = ximp, conv = Converg))
}
return(ximp)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.