#' hmi: Hierarchical Multilevel Imputation.
#'
#' The user has to pass his data to the function.
#' Optionally he passes his analysis model formula so that \code{hmi} runs the imputation model
#' in line with his analysis model formula.\cr
#' And of course he can specify some parameters for the imputation routine
#' (like the number of imputations and iterations and the number of iterations
#' within the Gibbs sampling).\cr
#' @param data A \code{data.frame} with all variables appearing in \code{model_formula}.
#' @param model_formula A \code{\link[stats]{formula}} used for the analysis model.
#' Currently the package is designed to handle formula used in
#' \code{lm}, \code{glm}, \code{lmer} and \code{glmer}. The formula is also used for the default pooling.
#' @param family To improve the default pooling, a family object supported by \code{glm} (resp. \code{glmer}) an be given.
#' See \code{family} for details.
#' @param additional_variables A character with names of variables (separated by "+", like "x8+x9")
#' that should be included in the imputation model as fixed effects variables,
#' but not in the analysis model.
#' An alternative would be to include such variable names into the \code{model_formula}
#' and run a reduced analysis model with \code{hmi_pool} or the functions provide by \code{mice}.
#' @param list_of_types a list where each list element has the name of a variable
#' in the data.frame. The elements have to contain a single character denoting the type of the variable.
#' See \code{get_type} for details about the variable types.
#' With the function \code{list_of_types_maker}, the user can get the framework for this object.
#' In most scenarios this is should not be necessary.
#' One example where it might be necessary is when only two observations
#' of a continuous variable are left - because in this case \code{get_type}
#' interpret is variable to be binary. Wrong is it in no case.
#' @param M An integer defining the number of imputations that should be made.
#' @param maxit An integer defining the number of times the imputation cycle
#' (imputing \eqn{x_1|x_{-1}} then \eqn{x_2|x_{-2}}, ... and finally \eqn{x_p|x_{-p}}) shall be repeated.
#' The task of checking convergence is left to the user, by evaluating the chainMean and chainVar!
#' @param nitt An integer defining number of MCMC iterations (see \code{MCMCglmm}).
#' @param burnin An integer for the desired number of
#' Gibbs samples that shall be regarded as burnin.
#' @param pvalue A numeric between 0 and 1 denoting the threshold of p-values a variable in the imputation
#' model should not exceed. If they do, they are excluded from the imputation model.
#' @param mn An integer defining the minimum number of individuals per cluster.
#' @param k An integer defining the allowed maximum of levels in a factor covariate.
#' @param spike A numeric value saying which value in the semi-continuous data might be the spike.
#' Or a list with with such values and names identical to the variables with spikes
#' (see \code{list_of_spikes_maker} for details.) In versions earlier to 0.9.0 it was called \code{heap}.
#' @param heap Use spike instead. \code{heap} is only included due to backwards compatibility
#' and will be removed with version 1.0.0
#' @param rounding_degrees A numeric vector with the presumed rounding degrees of rounded variables.
#' Or a list with rounding degrees, where each list element has the name of a rounded continuous variable.
#' Such a list can be generated using \code{list_of_rounding_degrees_maker(data)}.
#' Note: it is presupposed that the rounding degrees include 1 meaning that there
#' is a positive probability that e.g. 3500 was only rounded to the nearest integer
#' (and not rounded to the nearest multiple of 100 or 500).
#' @param rounding_formula A formula with the model formula for the latent rounding tendency G.
#' Or a list with model formulas for G, where each list element has the name of a rounded continuous variable.
#' Such a list can be generated
#' using \code{list_of_rounding_formulas_maker(data)}
#' @param pool_with_mice A Boolean indicating whether the user wants to pool the \code{M} data sets by mice
#' using his \code{model_formula}. The default value is \code{FALSE} because this tampers the
#' \code{mids} object as it adds an argument \code{pooling} not found in "normal" \code{mids} objects
#' generated by \code{mice}.
#' @return The function returns a \code{mids} object. See \code{mice} for further information.
#' @examples
#' \dontrun{
#' require("lme4")
#' require("mice")
#' data(Gcsemv, package = "hmi")
#'
#' model_formula <- written ~ 1 + gender + coursework + (1 + gender|school)
#'
#' set.seed(123)
#' dat_imputed <- hmi(data = Gcsemv, model_formula = model_formula, M = 2, maxit = 2)
#'
#' my_analysis <- function(complete_data){
#' # In this list, you can write all the parameters you are interested in.
#' # Those will be averaged.
#' # So make sure that averaging makes sense and that you only put in single numeric values.
#' parameters_of_interest <- list()
#'
#' # ---- write in the following lines, what you are interetest in to do with your complete_data
#' # the following lines are an example where the analyst is interested in the fixed intercept
#' # and fixed slope and the random intercepts variance,
#' # the random slopes variance and their covariance
#' my_model <- lmer(model_formula, data = complete_data)
#'
#' parameters_of_interest[[1]] <- fixef(my_model)
#' parameters_of_interest[[2]] <- lme4::VarCorr(my_model)[[1]][,]
#' ret <- unlist(parameters_of_interest)# This line is essential if the elements of interest
#' #should be labeled in the following line.
#' names(ret) <-
#' c("beta_intercept", "beta_gender", "beta_coursework", "sigma0", "sigma01", "sigma10", "sigma1")
#'
#' return(ret)
#' }
#' hmi_pool(mids = dat_imputed, analysis_function = my_analysis)
#' #if you are interested in fixed effects only, consider pool from mice:
#' pool(with(data = dat_imputed, expr = lmer(written ~ 1 + gender + coursework + (1 + gender|school))))
#' }
#' @export
hmi <- function(data,
model_formula = NULL,
family = stats::gaussian,
additional_variables = NULL,
list_of_types = NULL,
M = 5,
maxit = NULL,
nitt = 22000,
burnin = 2000,
pvalue = 1,
mn = 1,
k = NULL,
spike = NULL,
heap = NULL,
rounding_degrees = NULL,
rounding_formula = ~ .,
pool_with_mice = TRUE){
options(error = expression(NULL))
if(is.matrix(data)) data <- as.data.frame(data)
if (!is.null(heap)) {
warning("argument heap is deprecated; please use spike instead.",
call. = FALSE) #In future version heap will be entirely replaced by spike.
if(is.null(spike)) spike <- heap
}
#restore the original value of k. By this later it is possible whether a value of k = Inf,
#was given by the user, or set as default by hmi.
k_org <- k
if(is.null(k)){
k <- Inf
}
if(is.null(spike)){
spike <- list_of_spikes_maker(data)
}
if(is.null(spike)) spike <- 0
if(is.null(list_of_types)){
tmp_list_of_types <- list_of_types_maker(data, spike = spike, rounding_degrees = rounding_degrees)
}else{
if(!is.list(list_of_types)) stop("We need list_of_types to be a list.")
if(any(!names(list_of_types) %in% colnames(data))){
stop("At least one variable in list_of_types isn't found in data.")
}
#Currently we allow list_of_types to specify only a subset of variables found in data.
#For a strict 1:1 relation, the following code can be used.
#if(!isTRUE(all.equal(sort(names(list_of_types)), sort(colnames(data))))){
# stop("At least one variable in list_of_types isn't found in data or vica versa.")
#}
if(any(lapply(list_of_types, class) != "character")){
stop("We need the elements of list_of_types to be of class character.")
}
if(any(lapply(list_of_types, length) != 1)){
stop("We need that each element of list_of_types only contains one character.")
}
tmp_list_of_types <- list_of_types
}
#Check for implicit specifications for a variable to be semicont or roundedcont
if(is.list(spike)){
for(varindex in colnames(data)){
if(is.null(list_of_types[[varindex]]) & !is.null(spike[[varindex]])){
tmp_list_of_types[[varindex]] <- "semicont"
}
if(is.null(list_of_types[[varindex]]) & !is.null(rounding_degrees[[varindex]])){
tmp_list_of_types[[varindex]] <- "roundedcont"
}
}
}
my_data <- data
if(is.matrix(my_data)) stop("We need your data in a data.frame (and not a matrix).")
if(!is.data.frame(my_data)) stop("We need your data in a data.frame.")
if(any(apply(my_data, 1, function(x) all(is.na(x))))){
warning("Some observations have a missing value in every variable.
This can increase your estimate's variance.
We strongly suggest to remove those observations.")
}
if(any(apply(my_data, 2, function(x) all(is.na(x))))){
warning("Some variables have a missing values in every observation.
Remove those variables from your data.frame (e.g. by setting them to NULL).")
}
if(!class(model_formula) %in% c("character", "formula", "NULL")){
stop("model_formula has to be a formula!")
}
# recode "-Inf;Inf" in interval data to NA
#intervals from -Inf to Inf shall be recoded as NA.
for(j in 1:ncol(my_data)){
data[, j] <- sna_interval(my_data[, j])
my_data[, j] <- sna_interval(my_data[, j])
}
#get missing rates
missing_rates <- apply(my_data, 2, function(x) sum(is.na(x)))/nrow(my_data)
# it's not necessary to divide by nrow(my_data),
# but we keep this for more intuitive debugging
#get variables with missings
incomplete_variables <- names(my_data)[missing_rates > 0]
variables_to_impute <- union(incomplete_variables,
names(tmp_list_of_types)[tmp_list_of_types == "roundedcont" |
tmp_list_of_types == "interval"])
# Set a default value of maxit.
# This default value should be 1 in case of only one variable to impute.
# (In such cases the imputation cycle cannot further converge, so 1 is efficient.)
# In the other cases, it is set to 10.
if(is.null(maxit)){
if(length(variables_to_impute) <= 1){
maxit <- 1
}else{
maxit <- 10
}
}
#Check nitt. To check whether nitt is a whole number, the example from ?integer is used.
is.wholenumber <-
function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol
if(!is.wholenumber(nitt) | nitt < 1){
warning("nitt must be an integer >= 1")
}
if(any(missing_rates > 0.9)){
many_NAs <- names(my_data)[missing_rates > 0.9]
writeLines(paste("The variable(s) >>",
paste(many_NAs, collapse = " and "),
"<< have missing rates above 90%.
This can result in instable imputation results for this variable
and later in failing imputation routines for other variables (like count variables).
How do you want to proceed: \n
[c]ontinue using these variables
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
}
# # # # # # # # # # # get the formula elements (fe) and do consistency checks # # # # # # # # # # # # # # # #
constant_variables <- apply(my_data, 2, function(x) length(unique(x)) == 1)
if(sum(constant_variables) >= 2){
print(paste("Your data has two or more constant variables: ",
paste(names(my_data)[constant_variables], collapse = ", "),
". For the imputation we kept only the first to avoid multicolinearity.", sep = ""))
my_data[, names(my_data)[constant_variables][-1]] <- NULL
}
# Searches for intercepts
# In a model there a three ways to specify an intercept:
# 1: Explicitly by writing ~ 1 + X1 + ...
# 2: Implicitly by writing ~ X1 + ...
# 3: By refering to an exisiting intercept variable
# (eg. if called "Int"): ~ X1 + Int + ...
model_formula_org <- model_formula
if(is.null(model_formula)){
model_has_intercept <- TRUE
}else{
if(is.null(additional_variables)){
model_formula <- stats::as.formula(format(model_formula))
}else{
model_formula <- stats::as.formula(paste(format(model_formula), additional_variables, sep = "+"))
}
model_has_intercept <- fixed_intercept_check(model_formula) |
random_intercept_check(model_formula) |
any(names(which(constant_variables)) %in% all.vars(stats::delete.response(stats::terms(lme4::nobars(model_formula)))))
}
dataset_has_intercept <- any(constant_variables)
if(model_has_intercept){
if(dataset_has_intercept){
#nothing to do in this case: just use this intercept
}else{
#in the case an intercept is needed but not present in the data: add an intercept
# generate a variable name that is guaranteed not used in the data
# by adding something to the longest variable name.
intercept_varname <- paste("Intercept_",
colnames(my_data)[which.max(nchar(colnames(my_data)))], sep = "")
my_data[, intercept_varname] <- 1
}
}else{
if(dataset_has_intercept){
#We decided always to include an intercept, even if the analysis model excludes one
#for the target variable. Reason: For the imputation of another (co)variable,
#it is not clear if the user also want to exclude an intercept.
#And we think an intercept is generally beneficial.
#The following code might be used if intercepts should be excluded:
#intercept_varname <- names(which(constant_variables))
#warning(paste("We droped the constant variable >>", intercept_varname,
#"<< in the imputation process as the model_formula excluded an intercept.", sep = ""))
#my_data[, intercept_varname] <- NULL
}else{
#nothing to do in this case
}
}
constant_variables <- apply(my_data, 2, function(x) length(unique(x)) == 1)
if(sum(constant_variables) >= 2){
print(paste("Your data has two or more constant variables: ",
paste(names(my_data)[constant_variables], collapse = ", "),
". For the imputation we kept only the first to avoid multicolinearity.", sep = ""))
my_data[, names(my_data)[constant_variables][-1]] <- NULL
}
variable_names_in_data <- colnames(my_data)
fe <- extract_varnames(model_formula = model_formula,
constant_variables = constant_variables,
variable_names_in_data = variable_names_in_data,
data = data)
# check if all fix effects covariates are available in the data set
if(all(fe$fixedeffects_varname != "") & !all(fe$fixedeffects_varname %in% names(my_data))){
writeLines(paste("We didn't find >>",
paste(fe$fixedeffects_varname[!fe$fixedeffects_varname %in% names(my_data)],
collapse = " and "),
"<< in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula and running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
#if the model_formula is ignored every extracted value of the model_formula is ignored...
model_formula <- NULL
fe[1:length(fe)] <- ""
#... and every variable is used as a fixed effects variable, because in this case a single level imputation
#on every variable is run
fe$fixedeffects_varname <- colnames(my_data)
}
# check if all random effects covariates are available in the data set
if(all(fe$randomeffects_varname != "") & !all(fe$randomeffects_varname %in% names(my_data))){
writeLines(paste("We didn't find >>",
paste(fe$randomeffects_varname[!fe$randomeffects_varname %in% names(my_data)],
collapse = " and "),
"<<in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula and running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
if(all(c("0", "1") %in% fe$randomeffects_varname)){
warning("Your model_formula includes 0 for no random intercept and 1 for a random intercept at the same time.
Please decide whether you want a random intercept or not.")
}
#it could be the case that the user specifies several cluster ids.
#As the tool doesn't support this yet, we stop the imputation routine.
if(length(fe$clID_varname) >= 2){
writeLines("The package only supports one cluster ID in the model_formula. \n
How do you want to proceed: \n
[c]ontinue with ignoring the model_formula and running a single level imputation
or [e]xiting the imputation?")
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
#check whether the formula matches the data
if(fe$clID_varname != "" & !(fe$clID_varname %in% names(my_data))){
writeLines(paste("We didn't find >>", fe$clID_varname,
"<< in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula and running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
#Check whether there are missing values in the cluster variable.
if(fe$clID_varname != ""){
missings_in_clid <- is.na(my_data[, fe$clID_varname, drop = FALSE])
if(any(missings_in_clid)){
writeLines(paste("Missing values were found in the cluster variable >>", fe$clID_varname, "<<.",
"\\nHow do you want to proceed? \n
[r]emoving these missing cases from the whole data set (recommended),
[i]mputing these missing observations using a categorical imputation routine (not recommended)
or [e]xiting the imputation?\n"))
proceed <- readline("Type 'r', 'i or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
if(proceed == "r"){
my_data <- my_data[!missings_in_clid, , drop = FALSE]
data <- data[!missings_in_clid, , drop = FALSE]
}
#if the NAs should be imputed, no special action has be be taken here.
}
}
#check whether the constant variable in the dataset are labbeld "1" if this is the label of the
#intercept variable in the model formula.
if(any(constant_variables)){
if((any(fe$fixedeffects_varname == "1") &
variable_names_in_data[constant_variables] != "1")|
(any(fe$randomeffects_varname == "1") &
variable_names_in_data[constant_variables] != "1")){
warning(paste("Your model_formula includes '1' for an intercept.
The constant variable in your data is named ", variable_names_in_data[constant_variables], ".
Consider naming the variable also '1' or 'Intercept'.", sep = ""))
}
}
#make sure that the intercept variable exist, and has the value 1
if(fe$intercept_varname != ""){
my_data[, fe$intercept_varname] <- 1
}
if(fe$clID_varname != ""){
my_data[, fe$clID_varname] <- as.factor(my_data[, fe$clID_varname])
}
#generate interaction variable(s), which will be updated after each imputation of a variable
interaction_names <- list()
if(fe$interactions != ""){
for(i in 1:length(fe$interactions)){
#get the product of the interaction variables
interaction <- apply(my_data[, strsplit(fe$interactions[i], ":")[[1]], drop = FALSE], 1, prod)
#get a name for the interaction
tmp_interaction_name <- paste("Interaction_", i, "_",
colnames(my_data)[which.max(nchar(colnames(my_data)))],
sep = "")
#add the interaction to the dataset
my_data[, tmp_interaction_name] <- interaction
#This step has to be repeated after each imputation of a variable.
#To avoid ever groing names, we fix the names of the interaction variables in a list
interaction_names[[i]] <- tmp_interaction_name
}
}
# The rounding_formula shall be transformed into a list with vectors of covariates names mentioned in the
#rounding_formula
if(is.list(rounding_formula)){
rounding_covariates <- rounding_formula
}else{
rounding_covariates <- list()
for(tmpindex in names(data)){
rounding_covariates[[tmpindex]] <- rounding_formula
}
rm(tmpindex)
}
for(i in names(rounding_covariates)){
ge <- extract_varnames(model_formula = rounding_covariates[[i]],
constant_variables = constant_variables,
variable_names_in_data = variable_names_in_data,
data = data)
# check if all fix effects covariates are available in the data set
tmp <- ge$fixedeffects_varname
if(all(tmp != "") & !all(tmp %in% names(my_data))){
writeLines(paste("We didn't find >>",
paste(tmp[!tmp %in% names(my_data)],
collapse = " and "),
"<< in your data. How do you want to proceed: \n
[c]ontinue with ignoring the rounding_formula and run a minimal rounding_formula
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
#if the rounding_formula is ignored the rounding d
ge$fixedeffects_varname <- i
}
rounding_covariates[[i]] <- ge$fixedeffects_varname
}
########################### MERGE SMALL CLUSTERS ####################
if(fe$clID_varname != ""){
tab_1 <- table(my_data[, fe$clID_varname])
if(!is.numeric(mn)) stop("mn has to be an integer (whole number).")
if(mn %% 1 != 0) stop("mn has to be an integer (whole number).")
#merge small clusters to a large big cluster
if(any(tab_1 < mn)){
writeLines(paste(sum(tab_1 < mn), "clusters have less than", mn, "observations.",
"How do you want to proceed: \n
[c]ontinue with merging these clusters with others
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
# The procedure is the following:
# 1. determine the smallest and the second smallest cluster.
# 2. take the observations of the smallest cluster and combine it with observations of the second smallest cluster.
# this is be done by giving both clusters the name "[name of second smallest cluster] forced merge".
# 3. repeat with 1 until there is no invalid cluster left
while(any(tab_1 < mn)){
#step 1: determine the smallest and the second smallest cluster.
smallest_cluster <- tab_1[tab_1 == sort(as.numeric(tab_1))[1]][1]
tab_1_without_smallest_cluster <- tab_1[names(tab_1) != names(smallest_cluster)]
second_smallest_cluster <- tab_1_without_smallest_cluster[tab_1_without_smallest_cluster == sort(as.numeric(tab_1_without_smallest_cluster))[1]][1]
#step 2: new cluster name
new_name <- names(second_smallest_cluster)
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(smallest_cluster)] <- new_name
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(second_smallest_cluster)] <- new_name
#step 3: repeat.
tab_1 <- table(my_data[, fe$clID_varname])
}
}
if(length(tab_1) <= 2){
writeLines("The package currently requires more than two clusters\n
How do you want to proceed: \n
[c]ontinue with ignoring the model_formula and running a single level imputation
or [e]xiting the imputation?")
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
}
######################################## START THE IMPUTATION #####################
# get the variable types:
types <- unlist(list_of_types_maker(my_data, spike = spike, rounding_degrees = rounding_degrees))
categorical <- types == "categorical"
if(any(categorical)){
more_than_10_levels <- colnames(my_data[, categorical, drop = FALSE])[
apply(my_data[, categorical, drop = FALSE], 2, function(x) nlevels(factor(x))) > 10]
if(length(more_than_10_levels) > 0 & !is.null(k_org)){
warning(paste("More than 10 categories found in ", paste(more_than_10_levels, collapse = ", "),
"\n consider setting k = 10.
\n Or if only insignificant categories should be dropped during imputation:
\n set alpha = 0.2 (for example).", sep = ""))
}
for(catcount in 1:sum(categorical)){
my_data[, names(my_data)[categorical][catcount]] <-
as.factor(my_data[, names(my_data)[categorical][catcount]])
}
}
data_prepared <- my_data
NA_locator <- is.na(data_prepared)
#################### Preparation for setting up a mids-object
alldata <- list()#this object saves the M datasets.
my_chainMean <- array(dim = c(length(variables_to_impute), maxit, M))
dimnames(my_chainMean) <- list(variables_to_impute,
as.character(1:maxit),
paste("Chain", 1:M))
my_chainVar <- my_chainMean
cat("Imputation progress:\n")
cat("0% 20% 40% 60% 80% 100%\n")
cat("|")
progress_plot <- "----|----|----|----|----|"
tickts_drawn <- 0
if(is.null(list_of_types)){
tmp_list_of_types <- list_of_types_maker(my_data, spike = spike, rounding_degrees = rounding_degrees)
#Note: here it can happen that a variable in the original_data is assumed to be
#semicont, but after the first imputation it is considered to be continuous.
#Therefore the list of types is updated based on the data_before
#and not on the original data.
}else{
tmp_list_of_types <- list_of_types
}
#initialize the list for the chains of Gibbs-samplings
gibbs <- list()
for(i in 1:M){
gibbs[[paste("imputation", i, sep = "")]] <- list()
for(l1 in 1:maxit){
#do the imputation cycle
tmp <- imputationcycle(data_before = my_data,
original_data = data,
NA_locator = NA_locator,
fe = fe,
interaction_names = interaction_names,
list_of_types = tmp_list_of_types,
nitt = nitt,
burnin = burnin,
thin = 1,
pvalue = pvalue,
mn = mn,
k = k,
spike = spike,
rounding_degrees = rounding_degrees,
rounding_covariates = rounding_covariates)
my_data <- tmp$data_after
gibbs[[paste("imputation", i, sep = "")]][[paste("cycle", l1, sep = "")]] <-
tmp$chains
if(is.null(list_of_types)){
tmp_list_of_types <- list_of_types_maker(data, spike = spike, rounding_degrees = rounding_degrees)
#Note: here it can happen that a variable in the original_data is assumed to be
#semicont, but after the first imputation it is considered to be continuous.
#Therefore we update the list of types absed on the data_before
#and not on the original data.
}else{
tmp_list_of_types <- list_of_types
}
incomplete_variables <- colnames(NA_locator)[colSums(NA_locator) > 0]
variables_to_impute <- union(incomplete_variables,
names(tmp_list_of_types)[tmp_list_of_types == "roundedcont" |
tmp_list_of_types == "interval"])
#evaluate the imputed variables
for(l2 in variables_to_impute){
#for some variable types these imputed data cannot be directly evaluated by taking the mean.
# For example categorical variables (including binary variables which might not be 0-1 coded but
# for example "m"-"f".)
#in other variables (interval and rounded continuous) not only the missing variables,
# but all values which received a new value shall be evaluated.
#Therefore the types of the variables are needed
if(is.null(list_of_types)){
tmp_type <- list_of_types_maker(data[, l2, drop = FALSE],
spike = spike, rounding_degrees = rounding_degrees)[[1]]
}else{
tmp_type <- list_of_types[[l2]]
}
to_evaluate <- my_data[is.na(data[, l2]), l2, drop = TRUE]
if(tmp_type == "roundedcont"){
#For rounded continuous variables three types of observations can be evaluated:
#1. the observations roundable by a factor from the rounding degrees.
#(Note: rounding to the nearest integer is counted as well, as those observations are imputed too)
#2. Interval observations.
#3. Missing observations.
if(is.list(rounding_degrees)){
rounding_degrees_tmp <- rounding_degrees[[l2]]
}else{
rounding_degrees_tmp <- suggest_rounding_degrees(data[, l2])
}
if(is.null(rounding_degrees_tmp)){
rounding_degrees_tmp <- c(1, 10, 100, 1000)
}
eval_index <-
apply(outer(decompose_interval(interval = data[, l2])[, "precise"], rounding_degrees_tmp, '%%') == 0, 1, any) |
!is.na(decompose_interval(interval = data[, l2])[, "lower_imprecise"]) | is.na(data[, l2])
to_evaluate <- my_data[eval_index, l2, drop = TRUE]
}
if(tmp_type == "interval"){
eval_index <- !is.na(decompose_interval(interval = data[, l2])[, "lower_imprecise"]) | is.na(data[, l2])
to_evaluate <- my_data[eval_index, l2, drop = TRUE]
}
if(tmp_type == "binary" |
tmp_type == "categorical" | tmp_type == "ordered_categorical"){
eval_index <- is.na(data[, l2])
to_evaluate <- as.numeric(as.factor(my_data[, l2]))[eval_index]
}
if(tmp_type == "intercept"){
to_evaluate <- 1
}
my_chainMean[l2, l1, i] <- mean(to_evaluate)
my_chainVar[l2, l1, i] <- stats::var(to_evaluate)
}
}
#Show the progress of the imputation:
progress <- floor((i/M) / 0.04)
if(progress > tickts_drawn){
cat(substr(progress_plot,
start = tickts_drawn + 1, stop = progress))
tickts_drawn <- progress
}
#Store the data, but only those variables that were also in the original data,
#even if they were droped because of colinearity.
#An added intercept variable wont be exported.
alldata[[i]] <- data
variables_to_export <- colnames(my_data) %in% colnames(data)
alldata[[i]][, variables_to_export] <- my_data[, variables_to_export, drop = FALSE]
# Restore the original data, so that the next imputation round with its maxit cycles
# are independent of the current runs.
my_data <- data_prepared
}
# -------------- SET UP MIDS OBJECT --------------
imp_hmi <- stats::setNames(vector("list", ncol(data)), colnames(data))
method <- unlist(list_of_types_maker(data, spike = spike, rounding_degrees = rounding_degrees))
names(method) <- colnames(data)
for(l2 in variables_to_impute){
tmp_type <- list_of_types_maker(data[, l2, drop = FALSE],
spike = spike, rounding_degrees = rounding_degrees)[[1]]
if(tmp_type == "interval" |tmp_type == "roundedcont"){
data[, l2] <- NA
}
n_mis_j <- sum(is.na(data[, l2]))
imp_hmi[[l2]] <- as.data.frame(matrix(NA, nrow = n_mis_j, ncol = M,
dimnames = list(NULL, 1:M)))
for(i in 1:M){
# save only the imputed values of each variable across the different imputation runs.
# remember: alldata is a list with M elements, each element contains the whole data.frame
# which is the original dataframe, where the NAs have been replaced by imputed values.
# We want imputed_values to be a data.frame, so that we can store its colname
var_of_interest <- alldata[[i]][, l2, drop = FALSE]
imputed_values <- var_of_interest[is.na(data[, l2]), , drop = FALSE]
imp_hmi[[l2]][, i] <- imputed_values
rownames(imp_hmi[[l2]]) <- rownames(imputed_values)
}
}
nmis <- apply(is.na(data), 2, sum)
predictorMatrix <- matrix(0, nrow = ncol(data), ncol = ncol(data))
rownames(predictorMatrix) <- colnames(data)
colnames(predictorMatrix) <- colnames(data)
predictorMatrix[variables_to_impute, ] <- 1
diag(predictorMatrix) <- 0
visitSequence <- (1:ncol(data))[apply(is.na(data), 2, any)]
names(visitSequence) <- colnames(data)[visitSequence]
loggedEvents <- data.frame(it = 0, im = 0, co = 0, dep = "",
meth = "", out = "")
#Not supported yet
form <- vector("character", length = ncol(data))
post <- vector("character", length = ncol(data))
midsobj <- list(call = call, data = data,
m = M, nmis = nmis, imp = imp_hmi, method = method,
predictorMatrix = predictorMatrix, visitSequence = visitSequence,
form = form, post = post, seed = NA, iteration = maxit,
lastSeedValue = .Random.seed,
chainMean = my_chainMean,
chainVar = my_chainVar,
loggedEvents = loggedEvents)
oldClass(midsobj) <- "mids"
# If the user wants to use mice as a pooling function, and not hmi_pool,
# we conduct the pooling right away using the model_formula given by the user
# ! Note: here something not found in mids-objects from mice is added!
tmp_type <- list_of_types_maker(data[, fe$target_varname, drop = FALSE], spike = spike,
rounding_degrees = rounding_degrees)[[1]]
if(is.null(tmp_type)){
tmp_type <- "unknown"
}
#For cautious pooling, include:
#& (tmp_type == "cont" | tmp_type == "semicont" | tmp_type == "roundedcont" |
# tmp_type == "interval" | tmp_type == "count")
if(pool_with_mice & !is.null(model_formula_org)){
if(fe$clID_varname == ""){ # if no cluster ID was found, run a single level model
if(tmp_type == "ordered_categorical"){
cat("\n Currently, mice does not support the pooling of ordered categorical models. Use hmi_pool instead.")
}else{
# Run a glm or in case of a categorical variable, a multinomial model
if(tmp_type == "categorical"){
mira <- with(data = midsobj,
expr = nnet::multinom(formula = formula(format(model_formula_org))))
}else{
mira <- with(data = midsobj,
expr = stats::glm(formula = formula(format(model_formula_org)), family = family))
}
# mice::pool cannot deal with missing coefficients, so pool is only called,
#if no coefficitent is NA.
if(!any(unlist(lapply(mira$analyses, function(x) any(is.na(stats::coefficients(x))))))){
midsobj$pooling <- mice::pool(mira)
}
}
}else{ # otherwise a multilevel model
if(tmp_type == "categorical" | tmp_type == "ordered_categorical"){
print("Currently glmer does not support (ordered) categorical variables. Use hmi_pool instead.")
}else if(tmp_type == "cont"){
midsobj$pooling <- mice::pool(with(data = midsobj,
expr = lme4::lmer(formula = format(model_formula_org))))
}else{
oldw <- getOption("warn")
options(warn = -1)
midsobj$pooling <- mice::pool(with(data = midsobj,
expr = lme4::glmer(formula = format(model_formula_org),
family = family)))
options(warn = oldw)
}
}
}
#In addition to classical mids objects we add the chains of the gibbs-samplings from MCMCglmm.
midsobj$gibbs <- gibbs
midsobj$gibbs_para <- c(nitt, burnin, 1) #The last parameter is the default thinning parameter.
return(midsobj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.