#' Fast imputation of missing values by extreme gradien boosting
#'
#' @importFrom stats var predict
#' @importFrom xgboost xgboost
#'
#' @description Uses the "xgboost" package to do fast missing value imputation by extreme gradien boosting.
#' Between the iterative model fitting, it offers the option of predictive mean matching. This firstly avoids imputation
#' with values not present in the original mat (like a value 0.3334 in a 0-1 coded variable). Secondly, predictive mean
#' matching tries to raise the variance in the resulting conditional distributions to a realistic level and, as such,
#' allows to do multiple imputation when repeating the call to impute_xgboost(). The iterative chaining stops as soon as \code{max_iterations}
#' is reached or if the average out-of-bag estimate of performance stops improving. In the latter case, except for the first iteration,
#' the second last (i.e. best) imputed matrix is returned.
#' @param mat A \code{matrix} with missing values to impute.
#' @param max_iterations Maximum number of chaining iterations.
#' @param pmm_k Number of candidate non-missing values to sample from in the predictive mean matching step. Defaults to 5L. 0 to avoid this step.
#' @param seed Integer seed to initialize the random generator.
#' @param verbose Controls how much info is printed to screen. 0 to print nothing. 1 (default) to print a "." per iteration and
#' standardized prediction error , 2 to print model convergences.
#' @param eta eta control the learning rate: scale the contribution of each tree by a factor of 0 < eta < 1 when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for eta implies larger value for nrounds: low eta value means model more robust to overfitting but slower to compute. Default: 0.3
#' @param nrounds max number of boosting iterations.
#' @param max_depth maximum depth of a tree. Default: 6
#' @param objective specify the learning task and the corresponding learning objective, default 'reg:linear'
#' @param eval_metric evaluation metrics for validation data. Default 'rmse'
#' @param ... Arguments passed to \code{xgboost}.
#'
#' @return An imputed \code{matrix}.
#' @export
#'
#' @examples
#' mat = as.matrix(iris[,1:4])
#' mis_mat = generate_na(mat , 0.3)
#' imp_mat = impute_xgboost(mis_mat)
#'
impute_xgboost = function(mat,
max_iterations = 10L ,
seed = NULL ,
verbose = 1 ,
pmm_k = 5L ,
nrounds = 40 ,
eta = 0.4 ,
max_depth = 6 ,
objective = 'reg:linear' ,
eval_metric = 'rmse' ,
...) {
stopifnot(verbose %in% 0:3 )
if (verbose > 0) {
cat("\nMissing value imputation by extreme gradient boosting\n")
}
stopifnot(
is.matrix(mat) | 'dgeMatrix' %in% is(mat) | 'dgcMatrix' %in% is(mat)
)
stopifnot(
dim(mat) >= 1L ,
is.numeric(max_iterations) ,
length(max_iterations) == 1L ,
max_iterations >= 1L ,
is.numeric(pmm_k) ,
length(pmm_k) == 1L ,
pmm_k >= 0L
)
if(is.matrix(mat)){
stopifnot(
is.numeric(mat)
)
}
if( !any(is.na(mat)) ){
message('No missing values in matrix')
return(mat)
}
if (!is.null(seed)) {
set.seed(seed)
}
all_missing = function(vec){
all(is.na(vec))
}
imputation_variables = apply(mat, 2, function(x) !all_missing(x) )
if (verbose > 0 && length(imputation_variables) < ncol(mat)) {
cat("\n Variables ignored because all values missing: ")
cat(setdiff(names(mat), imputation_variables), sep = ", ")
}
stopifnot(length(imputation_variables) > 1L)
missingness_indicator_matrix = is.na(mat[, imputation_variables, drop = FALSE])
count_sequence = sort(colSums(missingness_indicator_matrix))
visit_sequence = names(count_sequence)[count_sequence > 0]
if (!length(visit_sequence)) {
return(mat)
}
verbose_digits = 4 # prediction of OOB prediction errors (if verbose = 2)
iteration_count = 1L # iterator
prediction_error = rep( Inf, length(visit_sequence))
names(prediction_error) = visit_sequence
prediction_error_decreased = TRUE # prediction_error on OOB prediction error to keep iterating
completed = setdiff(imputation_variables, visit_sequence)
if (verbose >= 2) {
cat("\n", abbreviate(visit_sequence, minlength = verbose_digits + 2), sep = "\t")
}
while (prediction_error_decreased && iteration_count <= max_iterations ) {
if (verbose > 0) { cat("\niter ", iteration_count , ":\t", sep = "") }
last_mat = mat
last_prediction_error = prediction_error
for( response_variable in visit_sequence ) {
response_variable_index = which(colnames(mat) == response_variable)
is_na_response = missingness_indicator_matrix[, response_variable]
if (length(completed) == 0L) {
mat[,response_variable] = impute_univariate(mat[ , response_variable])
if(verbose >= 2) cat('u')
} else {
if(verbose >= 2) cat('m')
this_prediction_data = mat[!is_na_response , -response_variable_index]
non_na_responses = mat[!is_na_response , response_variable_index]
fit = xgboost(
data = this_prediction_data ,
label = non_na_responses,
eta = eta ,
max_depth = max_depth ,
objective = objective ,
eval_metric = eval_metric ,
nrounds = nrounds ,
verbose = max( 0 , verbose - 1 ) ,
...
)
this_prediction_error = fit$evaluation_log$train_rmse[nrounds]
this_mean_abs = mean(abs(non_na_responses))
this_cv_rmse = this_prediction_error / this_mean_abs
pred = predict( fit , mat[ , -response_variable_index] )
if(pmm_k > 0){
pred = pmm(xtrain = pred[ !is_na_response ] ,
xtest = mat[!is_na_response , response_variable_index],
ytrain = non_na_responses ,
k = pmm_k)
}
mat[is_na_response, response_variable] = pred[is_na_response]
prediction_error[[response_variable]] = this_cv_rmse
}
completed = union(completed, response_variable)
if (verbose == 1) {
cat(".")
} else if (verbose >= 2) {
cat(format(round(prediction_error[[response_variable]], verbose_digits), nsmall = verbose_digits), "\t")
}
}
if(verbose > 0){
cat('\n mean coefficient of variation of root mean square error:' , mean(prediction_error) )
}
if(verbose > 1) {
cat('\n')
print(prediction_error)
}
iteration_count= iteration_count + 1L
prediction_error_decreased = mean(prediction_error) < mean(last_prediction_error)
}
if (verbose > 0) {
cat("\n")
}
if (iteration_count== 2L || (iteration_count== max_iterations && prediction_error_decreased)) {
last_mat = mat
}
return( last_mat )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.