#' Hierarchical Multilevel Imputation (hmi). The main wrapper function called by the user.
#'
#' The user has to pass his data to the function.
#' Optionally he passes his analysis model formula so that \code{hmi} runs the imputation model
#' in line with his analysis model formula.\cr
#' And of course he can specify some parameters for the imputation routine
#' (like the number of imputations and iterations) including Gibbs-sampler parameters
#' (number of iterations, burnin and thinning.\cr
#' @param data A \code{data.frame} with all variables appearing in \code{model_formula}.
#' @param model_formula A \code{\link[stats]{formula}} used for the analysis model.
#' Currently the package is designed to handle formula used in
#' \code{lm}, \code{glm} and \code{lmer}.
#' @param M An integer defining the number of imputations that should be made.
#' @param maxit An integer defining the number of times the imputation cycle
#' (imputing \eqn{x_1|x_{-1} then x_2|x_{-2}, ... x_p|x_{-p}}) shall be repeated.
#' The task of checking convergence is left to the user, by evaluating the chainMean and chainVar!
#' @param nitt An integer defining number of MCMC iterations (see \code{MCMCglmm}).
#' @param thin An integer defining the thinning interval (see \code{MCMCglmm}).
#' @param burnin An integer defining the percentage of draws from the gibbs sampler
#' that should be discarded as burn in (see \code{MCMCglmm}).
#' @return The function returns a \code{mids} object. See \code{mice} for further information.
#' @examples
#' my.formula <- Reaction ~ Days + (1 + Days|Subject)
#' my_analysis <- function(complete_data){
#' # In this list, you can write all the parameters you are interested in.
#' # Those will be averaged.
#' # So make sure that averaging makes sense and that you only put in single numeric values.
#' parameters_of_interest <- list()
#'
#' # ---- write in the following lines, what you are interetest in to do with your complete_data
#' # the following lines are an example where the analyst is interested in the fixed intercept
#' # and fixed slope and the random intercepts variance, the random slopes variance and their covariance
#' my_model <- lmer(my.formula, data = complete_data)
#'
#' parameters_of_interest[[1]] <- fixef(my_model)[1]
#' parameters_of_interest[[2]] <- fixef(my_model)[2]
#' parameters_of_interest[[3]] <- VarCorr(my_model)[[1]][1, 1]
#' parameters_of_interest[[4]] <- VarCorr(my_model)[[1]][1, 2]
#' parameters_of_interest[[5]] <- VarCorr(my_model)[[1]][2, 2]
#' names(parameters_of_interest) <- c("beta_intercept", "beta_Days", "sigma0", "sigma01", "sigma1")
#'
#' # ---- do change this function below this line.
#' return(parameters_of_interest)
#'}
#'
#' test <- sleepstudy
#' test$Intercept <- 1
#' test[sample(1:nrow(test), size = 20), "Reaction"] <- NA
#' hmi_imp <- hmi(data = test, model_formula = my.formula)
#' hmi_pool(mids = hmi_imp, analysis_function = my_analysis)
#' #if you are interested in fixed effects only, consider using \code{pool} from \code{mice}.
#' pool(with(data = hmi_imp, expr = lmer(Reaction ~ Days + (1 + Days | Subject))))
hmi <- function(data, model_formula = NULL,
M = 10,
maxit = 5,
nitt = 3000,
thin = 100,
burnin = 1000){
my_data <- data #wird spaeter ausgegeben. Entspricht von der Form dem
#original Datensatz, nur, dass alle fehlenden Werte (mit Ausnahme der design
#bedingt fehlenden) durch imputierte Werte ersetzt wurden.
#des weitern gibt es noch einen tmp_data Datensatz
if(is.matrix(my_data)) stop("We need your data in a data.frame (and not a matrix).")
if(!is.data.frame(my_data)) stop("We need your data in a data.frame.")
#"Beobachtungen" bei denen alle Angaben fehlen, sollten im Sinne des Anwenders
#entfernt werden. Da er aber vielleicht einen Grund hat sie zu uebergeben, wird
#nur eine Warnung ausgegeben.
if(any(apply(my_data, 1, function(x) all(is.na(x))))){
warning("Some observations have a missing value in every variable.
This can increase your estimate's variance.
We strongly suggest to remove those observations.")
}
#get missing rates
missing_rates <- apply(my_data, 2, function(x) sum(is.na(x)))/nrow(my_data)
only_one_variable_to_impute <- sum(missing_rates != 0) == 1
#MS: it's not necessary to divide by nrow(my_data)
#get variables with missings
incomplete_variables <- names(my_data)[missing_rates > 0]
#MS: Nonresponse-Matrix aufbauen: (wird später für die Convergenz Diagnostic gebraucht)
nonresponse_matrix <- is.na(my_data)
# # # # # # # # # # # get the formula elements (fe) and do consistency checks # # # # # # # # # # # # # # # #
constant_variables <- apply(my_data, 2, function(x) length(unique(x)) == 1)
if(sum(constant_variables) >= 2){
stop(paste("Your data has two or more constant variables: ", names(my_data)[constant_variables],
". One needs to be dropped to avoid multicolinearity.", sep = ""))
}
variable_names_in_data <- colnames(my_data)
fe <- extract_varnames(model_formula = model_formula, constant_variables = constant_variables,
variable_names_in_data = variable_names_in_data)
# check if target variable is available
#check if this variable name realy occurs in the dataset
if(all(fe$target_varname != "") & !all(fe$target_varname %in% names(my_data))){
writeLines(paste("We didn't find",
paste(fe$target_varname[!fe$target_varname %in% names(my_data)],
collapse = " and "),
"in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula an running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
#if the model_formula is ignored every extracted value of the model_formula is ignored...
model_formula <- NULL
fe[1:length(fe)] <- ""
#... and every variable is used as a fixed effects variable, because in this case a single level imputation
#on every variable is run
fe$fixedeffects_varname <- colnames(my_data)
}
#check if this variable name realy occurs in the dataset
if(all(fe$fixedeffects_varname != "") & !all(fe$fixedeffects_varname %in% names(my_data))){
writeLines(paste("We didn't find",
paste(fe$fixedeffects_varname[!fe$fixedeffects_varname %in% names(my_data)],
collapse = " and "),
"in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula an running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
#if the model_formula is ignored every extracted value of the model_formula is ignored...
model_formula <- NULL
fe[1:length(fe)] <- ""
#... and every variable is used as a fixed effects variable, because in this case a single level imputation
#on every variable is run
fe$fixedeffects_varname <- colnames(my_data)
}
#check if this variable name realy occurs in the dataset
if(all(fe$randomeffects_varname != "") & !all(fe$randomeffects_varname %in% names(my_data))){
writeLines(paste("We didn't find",
paste(fe$randomeffects_varname[!fe$randomeffects_varname %in% names(my_data)],
collapse = " and "),
"in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula an running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
if(all(c("0", "1") %in% fe$randomeffects_varname)){
warning("Your model_formula includes 0 for no random intercept and 1 for a random intercept at the same time.
Please decide whether you want a random intercept or not.")
}
#it could be the case that the user specifies several cluster ids.
#As the tool doesn't support this yet, we stop the imputation routine.
if(length(fe$clID_varname) >= 2){
writeLines("The package only supports one cluster ID in the model_formula. \n
How do you want to proceed: \n
[c]ontinue with ignoring the model_formula an running a single level imputation
or [e]xiting the imputation?")
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
#check whether the formula matches the data
if(fe$clID_varname != "" & !(fe$clID_varname %in% names(my_data))){
writeLines(paste("We didn't find", fe$clID_varname,
"in your data. How do you want to proceed: \n
[c]ontinue with ignoring the model_formula an running a single level imputation
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
model_formula <- NULL
fe[1:length(fe)] <- ""
fe$fixedeffects_varname <- colnames(my_data)
}
#check whether the constant variable in the dataset are labbeld "1" if this is the label of the
#intercept variable in the model formula.
if(any(constant_variables)){
if((any(fe$fixedeffects_varname == "1") &
variable_names_in_data[constant_variables] != "1")|
(any(fe$randomeffects_varname == "1") &
variable_names_in_data[constant_variables] != "1")){
warning(paste("Your model_formula includes '1' for an intercept.
The constant variable in your data is named ", variable_names_in_data[constant_variables], ".
Consider naming the variable also '1' or 'Intercept'.", sep = ""))
}
}
#make shure that the random effects variable exist, and has the value 1
if(fe$intercept_varname != ""){
print(paste("We interprete", fe$intercept_varname,
"as the intercept variable and set its value to 1."))
my_data[, fe$intercept_varname] <- 1
}
if(fe$clID_varname != ""){
print(paste("We interprete", fe$clID_varname,
"as the cluster indicator and treat it as a factor."))
my_data[, fe$clID_varname] <- as.factor(my_data[, fe$clID_varname])
}
########################### MERGE SMALL CLUSTERS ####################
if(fe$clID_varname != ""){
tab_1 <- table(my_data[, fe$clID_varname])
# set a minumim size for the clusters
mn <- 6
#merge small clusters to a large big cluster
if(any(tab_1 < mn)){
writeLines(paste(sum(tab_1 < mn), "clusters have less than", mn, "observations.",
"How do you want to proceed: \n
[c]ontinue with merging these clusters with others
or [e]xiting the imputation?"))
proceed <- readline("Type 'c' or 'e' into the console and press [enter]: ")
if(proceed == "e") return(NULL)
# The procedure is the following:
# 1. determine the smallest and the second smallest cluster.
# 2. take the observations of the smallest cluster and combine it with observations of the second smallest cluster.
# this is be done by giving both clusters the name "[name of second smallest cluster] forced merge".
# 3. repeat with 1 until there is no invalid cluster left
while(any(tab_1 < mn)){
#step 1: determine the smallest and the second smallest cluster.
smallest_cluster <- tab_1[tab_1 == sort(as.numeric(tab_1))[1]][1]
tab_1_without_smallest_cluster <- tab_1[names(tab_1) != names(smallest_cluster)]
second_smallest_cluster <- tab_1_without_smallest_cluster[tab_1_without_smallest_cluster == sort(as.numeric(tab_1_without_smallest_cluster))[1]][1]
#step 2: new cluster name
new_name <- paste(names(second_smallest_cluster), "forced_merge", sep = "_")
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(smallest_cluster)] <- new_name
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(second_smallest_cluster)] <- new_name
#step 3: repeat.
tab_1 <- table(my_data[, fe$clID_varname])
}
}
}
######################################## START THE IMPUTATION #####################
#In order to have a rectangular data.set (without NAs), do the sample_imp:
types <- array(dim = ncol(my_data))#get the variable types
for(j in 1:ncol(my_data)){
my_data[, j] <- sample_imp(my_data[, j])#apply would be smarter, but it returns a matrix.
types[j] <- get_type(my_data[, j])
}
categorical <- types == "categorical"
if(any(categorical)){
for(catcount in 1:sum(categorical)){
print(paste("We interprete", names(my_data)[categorical][catcount],
"as the categorical variable and force it to be a factor."))
my_data[, names(my_data)[categorical][catcount]] <-
as.factor(my_data[, names(my_data)[categorical][catcount]])
}
}
#MS: now safe the variables used for modeling fixed effects
X <- my_data[, fe$fixedeffects_varname[fe$fixedeffects_varname %in% names(my_data)], drop = FALSE]
#MS: now safe the variables used for modeling random effects
Z <- my_data[, fe$randomeffects_varname[fe$randomeffects_varname %in% names(my_data)] , drop = FALSE]
#################### Preparation for setting up a mids-object
alldata <- list()#this object saves the M datasets.
my_chainMean <- array(dim = c(length(incomplete_variables), maxit, M))
dimnames(my_chainMean) <- list(incomplete_variables,
as.character(1:maxit),
paste("Chain", 1:M))
my_chainVar <- my_chainMean
for(i in 1:M){
for(l1 in 1:maxit){
for(l2 in incomplete_variables){#do the imputation cycle
print(Sys.time())
print(paste("Imputing variable ", l2, " at iteration ", l1,
" for imputation ", i, " (out of ", M, ")", sep = ""))
#find the smallest positive missing rate.
#If there is not *the one* smallest positive missing rate, sample from those who have the smallest pos. MR.
tmp_variable <- l2
#get type
tmp_type <- get_type(my_data[, tmp_variable])
#generate temporary data set, that contains only the variable to impute and
#the covariates where no missing values occurs
#tmp_data <- my_data
#make currrent variable which is about to be imputed again containing
#the original missing values
my_data[, l2] <- data[, l2]
tmp_X <- X[, names(X) != tmp_variable, drop = FALSE]
tmp_Z <- Z[, names(Z) != tmp_variable, drop = FALSE]
#check number of observed values in each cluster.
if(fe$clID_varname != ""){
tab_1 <- table(my_data[!is.na(my_data[, tmp_variable]), fe$clID_varname])
#merge small clusters to a large big cluster
safety_counter <- 0
while(any(tab_1 < mn) & safty_counter < 100){
safety_counter <- safety_counter + 1
#step 1: determine the smallest and the second smallest cluster.
smallest_cluster <- tab_1[tab_1 == sort(as.numeric(tab_1))[1]][1]
tab_1_without_smallest_cluster <- tab_1[names(tab_1) != names(smallest_cluster)]
second_smallest_cluster <- tab_1_without_smallest_cluster[tab_1_without_smallest_cluster == sort(as.numeric(tab_1_without_smallest_cluster))[1]][1]
#step 2: new cluster name
new_name <- paste(names(second_smallest_cluster), "forced_merge", sep = "_")
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(smallest_cluster)] <- new_name
levels(my_data[, fe$clID_varname])[levels(my_data[, fe$clID_varname]) == names(second_smallest_cluster)] <- new_name
#step 3: repeat.
tab_1 <- table(my_data[!is.na(my_data[, tmp_variable]), fe$clID_varname])
}
}
if(tmp_type == "binary"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_binary(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X)
}else{
imp <- imp_binary_multi(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
M = 1,
nitt = nitt,
thin = thin,
burnin = burnin)
}
}
if(tmp_type == "cont"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_cont(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X)
}else{
imp <- imp_cont_multi(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
M = 1,
nitt = 3000,
thin = 10,
burnin = 1000)
}
}
if(tmp_type == "semicont"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_semicont(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
heap = 0,
M = 1)
}else{
imp <- imp_semicont_multi(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
model_formula = model_formula,
heap = 0,
M = 1,
nitt = 3000,
thin = 10,
burnin = 1000)
}
}
if(tmp_type == "roundedcont"){
#yet, we don't have a multilevel imputation for rounded incomes
imp <- imp_roundedcont(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
intercept_varname = fe$intercept_varname,
M = 1)
}
if(tmp_type == "count"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_count_single(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X)
}else{
imp <- imp_count_multi(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
M = 1,
nitt = nitt,
thin = thin,
burnin = burnin)
}
}
if(tmp_type == "categorical"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_cat_cart(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X)
}else{
imp <- imp_cat_multi(y_imp_multi = as.factor(my_data[, tmp_variable]),
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
model_formula = model_formula,
M = 1,
nitt = 3000,
thin = 10,
burnin = 1000)
}
}
if(tmp_type == "ordered_categorical"){
if(fe$clID_varname == "" || !(fe$clID_varname %in% names(my_data)) || tmp_variable != fe$target_varname){
imp <- imp_orderedcat(y_imp_multi = my_data[, tmp_variable],
X_imp_multi = tmp_X)
}else{
imp <- imp_orderedcat_multi(y_imp_multi = as.factor(my_data[, tmp_variable]),
X_imp_multi = tmp_X,
Z_imp_multi = tmp_Z,
clID = my_data[, fe$clID_varname],
model_formula = model_formula,
M = 1,
nitt = 3000,
thin = 10,
burnin = 1000)
}
}
#MS: update my_data with imputed variable
my_data[, tmp_variable] <- imp
#MS: update Z, because it could be the case that a variable contained in Z
#MS: was udated by imputation
if(tmp_variable %in% names(Z)) Z[, tmp_variable] <- imp
#MS: update X, because it could be the case that a variable contained in Z
#MS: was udated by imputation
if(tmp_variable %in% names(X)) X[, tmp_variable] <- imp
to_evaluate <- imp[is.na(data[, tmp_variable]), ]
if(tmp_type == "categorical" | tmp_type == "ordered_categorical"){
to_evaluate_numerical <- as.numeric(as.factor(my_data[, tmp_variable]))
}else{
to_evaluate_numerical <- to_evaluate
}
my_chainMean[l2, l1, i] <- mean(to_evaluate_numerical)
my_chainVar[l2, l1, i] <- var(to_evaluate_numerical)
}
}
alldata[[i]] <- my_data
}
############################ SET UP MIDS OBJECT ####################
imp_hmi <- setNames(vector("list", ncol(data)), colnames(data))
for(l2 in incomplete_variables){
n_mis_j <- sum(is.na(data[, l2]))
imp_hmi[[l2]] <- as.data.frame(matrix(NA, nrow = n_mis_j, ncol = M, dimnames = list(NULL, 1:M)))
for(i in 1:M){
imputed_values <- alldata[[i]][l2][is.na(data[, l2]), , drop = FALSE]
imp_hmi[[l2]][, i] <- imputed_values
rownames(imp_hmi[[l2]]) <- rownames(imputed_values)
}
}
nmis <- apply(is.na(data), 2, sum)
method <- rep("multi", times = ncol(data)) #!!! DAS MUSS ICH NOCH AUSBESSERN JE NACH DEM OB DIE METHODE WIRKLICH MULTI WAR ODER NICHT
#UND AUCH DANACH OB ES CONT; SEMICONT; CAT ETC WAR!!!
names(method) <- colnames(data)
predictorMatrix <- matrix(0, nrow = ncol(my_data), ncol = ncol(my_data))
rownames(predictorMatrix) <- colnames(data)
colnames(predictorMatrix) <- colnames(data)
predictorMatrix[incomplete_variables, ] <- 1
diag(predictorMatrix) <- 0
visitSequence <- (1:ncol(data))[apply(is.na(data), 2, any)]
names(visitSequence) <- colnames(data)[visitSequence]
loggedEvents <- data.frame(it = 0, im = 0, co = 0, dep = "",
meth = "", out = "")
#Not supported yet
form <- vector("character", length = ncol(data))
post <- vector("character", length = ncol(data))
midsobj <- list(call = call, data = data,
m = M, nmis = nmis, imp = imp_hmi, method = method,
predictorMatrix = predictorMatrix, visitSequence = visitSequence,
form = form, post = post, seed = NA, iteration = maxit,
lastSeedValue = .Random.seed,
chainMean = my_chainMean,
chainVar = my_chainVar,
loggedEvents = loggedEvents)
#!!!NEED TO ADD vcov() method to midsobj!!!
oldClass(midsobj) <- "mids"
return(midsobj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.