hmi: Hierarchical Multiple Imputation

Documented in gibbs.coef imp_multi

# You can learn more about package authoring with RStudio at:
#
#   http://r-pkgs.had.co.nz/
#
# Some useful keyboard shortcuts for package authoring:
#
#   Build and Reload Package:  'Ctrl + Shift + B'
#   Check Package:             'Ctrl + Shift + E'
#   Test Package:              'Ctrl + Shift + T'
#
# MS: All other shortcuts are shown when pressing 'Alt + Shift + K'
# MS: Hinweis: es wird zu jeder R-Datei im Projekordner\R, die eine Documentation hat,
#Mit devtools::document() auch eine .RD Datei erstellt.


#library(MASS) 'MS: DURCH BESSEREN AUFRUF require (etc) ersetzen
#library(coda) #is needed for gelman.diag()
#library(stats)

#########three different parts:

#########Gibbs sampler functions: contains all functions to draw from
#########				    the conditional distributions
#########Gibbs sampler: 	    the actual Gibbs sampler function
#########Imputation function:     the function that prepares the data before
#########				   calling the Gibbs sampler and uses
#########			          the parameters from the Gibbs for imputation







#################################################
#########Imputation function#####################
#################################################



#' The main function called by the user. LATER THE USER WILL USE THE WRAPPER.
#'
#' The user has to passes to the function his data.
#' Optionally he pass his analysis model formula so that hmi runs the imputation model
#' in line with his analysis model formula.
#' And of course he can specify some parameters for the imputation routine
#' (like the number of imputations and iterations and the burn in percentage.)
#' The standard usage should be that the user gives his complete dataset
#' and his analysis model. But he also could just give y, X and Z and the cluser ID.
#' @param y_imp_multi A Vector with the variable to impute.
#' @param X_imp_multi A data.frame with the fixed effects variables.
#' @param Z_imp_multi A data.frame with the random effects variables.
#' @param y_variable_name A character naming the variable to impute.
#' @param clID A vector with the cluster ID.
#' @param n.iter An integer defining the number of
#' iterations that should be run in each bunch of iterations.
#' @param M An integer defining the number of imputations that should be made.
#' @param n.chains An integer defining the number of Markov chains to be made.
#' @param burn.in A numeric between 0 and 1 defining the percentage of draws from the gibbs sampler
#' that should be discarded as burn in.
#' @param max.iter An integer defining the maximum number of
#' iterations that should be run in total.
#' @param allowed_max_value A single numeric Value which shall not be exceeded
#' when values are imputed (e.g. the age of a person can be limited to 125).
#' @param allowed_max_variable A character naming a variable V.
#' For each Y_i the value of V_i shall not exceeded
#' (e.g. the net income shall not exceed the gross income).
#' Note that a new imputed value has to satisfy both conditions of \code{allowed_max_value}
#' and \code{allowed_max_variable} at the same time.
#' @param allowed_min_value Analog to \code{allowed_max_value}.
#' @param allowed_min_variable Analog to \code{allowed_max_variable}.
#' @return A n x M matrix. Each column is one of M imputed y-variables.
imp_multi <- function(y_imp_multi,
                      X_imp_multi,
                      Z_imp_multi,
                      clID,
                      n.iter = 100, M = 10, n.chains = 3, burn.in = 1/3,
                      max.iter = 5000,
                      allowed_max_value = Inf,
                      allowed_max_variable = NULL,
                      allowed_min_value = -Inf,
                      allowed_min_variable = NULL){

  a <- Sys.time()

  # -----------------------------preparing the data ------------------
  # -- standardise the covariates in X (which are numeric and no intercept)
  need_stand <- apply(X_imp_multi, 2, get_type) == "cont"
  X_imp_multi_stand <- X_imp_multi
  X_imp_multi_stand[, need_stand] <- scale(X_imp_multi[, need_stand])
  #X_imp_multi %>% mutate_each_(funs(scale), vars = names(need_stand)[need_stand])
  #generate model.matrix (from the class matrix)
  n <- nrow(X_imp_multi_stand)
  X_model_matrix <- model.matrix(rnorm(n) ~ 0 + ., data = X_imp_multi_stand)
  # Remove ` from the variable names
  colnames(X_model_matrix) <- gsub("`", "", colnames(X_model_matrix))

  # -- standardise the covariates in Z (which are numeric and no intercept)
  need_stand <- apply(Z_imp_multi, 2, get_type) == "cont"
  Z_imp_multi_stand <- Z_imp_multi
  Z_imp_multi_stand[, need_stand] <- scale(Z_imp_multi[, need_stand])

  # MS: If the user wants a fixed intercept, the wrapper function assures that X_imp_multi
  # includes such a variable

  # Get the number of random effects variables
  n.par.rand <- ncol(Z_imp_multi_stand)
  length.alpha <- length(table(clID)) * n.par.rand


  # -------------- calling the gibbs sampler to get imputation parameters----
  pars <- gibbs.coef(y_gibbs = y_imp_multi, X_gibbs = X_model_matrix, Z_gibbs = Z_imp_multi_stand,
                     clID = clID,
                     n.iter = n.iter, M = M, n.chains = n.chains, burn.in = burn.in,
                     max.iter = max.iter)

  ###sample m values for each parameter
  select.record <- sample(1:dim(pars)[2], M, replace = TRUE)
  select.chain <- sample(1:dim(pars)[1], M, replace = TRUE)

  # -------------------- drawing samples with the parameters from the gibbs sampler --------
  y.imp <- array(NA, dim = c(n, M))
  ###start imputation
  for (j in 1:M){

    rand.eff.imp <- matrix(pars[select.chain[j], select.record[j], 1:length.alpha], ncol = n.par.rand)
    fix.eff.imp <- pars[select.chain[j], select.record[j], (length.alpha + 1):(length.alpha + ncol(X))]
    sigma.y.imp <- pars[select.chain[j], select.record[j], length.alpha + ncol(X) + 1]
    y.temp <- rnorm(n, X_model_matrix %*% fix.eff.imp + apply(Z * rand.eff.imp[clID,], 1, sum), sigma.y.imp)

    y.imp[,j] <- ifelse(is.na(y_imp_multi), y.temp, y_imp_multi)
  }
  b <- Sys.time()
  print(b - a)
  # --------- returning the imputed data --------------
  return(y.imp)
}


############################################################
######Gibbs sampler for random coefficient model############
############################################################

#' Gibbs sampler for random coefficient model
#'
#' It generates the Markov chains of the imputation parameters by drawing from their conditional distributions
#' until convergence
#' @param y_gibbs A vector or data.frame with \code{ncol = 1} containing the target variable with the missing values.
#' @param X_gibbs A matrix containing the covariates influencing \code{y} via fixed effects.
#' If rows with missing values in \code{X} should also be imputed, put all your variables in a data.frame (or matrix)
#' @param Z_gibbs A data.frame containing the covariates influencing \code{y} via random effects
#' @param clID A factor (should come as data.frame or vector) containing the cluster IDs.
#' @param n.iter An integer defining the number of
#' iterations that should be run in each bunch of iterations.
#' @param max.iter An integer defining the maximum number of
#' iterations that should be run in total.
#' @param M An integer defining the number of imputations that should be made.
#' @param n.chains An integer defining the number of Markov chains to be made.
#' @param burn.in A numeric between 0 and 1 defining the percentage of draws from the gibbs sampler
#' that should be discarded as burn in.
#' @return It returns a multidimensional vector with the Markov chains
#' containing the imputation parameters needed for \code{imp_multi}.
gibbs.coef <- function(y_gibbs, X_gibbs, Z_gibbs, clID, n.iter = 100, M = 10,
                       n.chains = 3, burn.in = 1/3, max.iter = 5000){

  ##calculate quantities that don't change in the Gibbs sampler
  y_obs <- y_gibbs[!is.na(y_gibbs)] ###only observed part of y
  X_obs <- X_gibbs[!is.na(y_gibbs), , drop = FALSE]
  Z_obs <- Z_gibbs[!is.na(y_gibbs), , drop = FALSE]
  clID_obs <- clID[!is.na(y_gibbs)]
  clID_obs <- droplevels(clID_obs)

  n.obs <- sum(!is.na(y_gibbs))
  nclass <- length(table(clID))
  n.par.rand <- ncol(Z_gibbs)
  length.alpha <- length(table(clID)) * n.par.rand
  n.par <- length.alpha + ncol(X_gibbs) + 1 + n.par.rand^2
  # MS: Explanation of the number of parameters
  # length.alpha: for each cluster (length(table(clID))) there are n.par.rand random effects variables.
  # ncol(X_gibbs): There are ncol(X_gibbs) fixed effects variables.
  # 1: There is one residual variance.
  # n.par.rand^2: The random effects covariance matrix has n.par.rand * n.par.rand entries
  # (we ignore that the upper triangle is identical to the lower triangle).

  xtx <- solve(t(X_obs) %*% X_obs) #!!! 2016-07-13 war einmal nicht invertierbar!!!
  b.hat.part1 <- xtx %*% t(X_obs)

  data_obs <- data.frame(y_obs = y_obs,
                         clID_obs = clID_obs)
  # MS: get all variables from X and Z into data_obs,
  # but only once (often Z is a subset of X and Zs variables should not appear again in the data set
  # if they are already brought to data_obs by X).

  data_obs[, colnames(X_obs)] <- X_obs
  tmp <- names(Z_obs)[!(names(Z_obs) %in%  colnames(X_obs))]
  data_obs[, tmp] <- Z_obs[, tmp]
  rm(tmp)

  tmp_model_formula <- formula(paste("y_obs ~ 0 + ",
                                     paste(colnames(X_obs), collapse = " + "),
                                     "+ (0 + ",
                                     paste(colnames(Z_obs), collapse = " + "), "| clID_obs)"))
#DEBUG: EINE INTERAKTION BEHEBT DAS PROBLEM AUCH NICHT!!!
 # tmp_model_formula <- formula(y_obs ~ 0 + Intercept + X * cl.var + (0 + Intercept + X | clID_obs))

  #ohne explizite intercept variable änder tauch nichts.
  #tmp_model_formula <- formula(y_obs ~ X * cl.var + (1 + X | clID_obs))
  # MS: Was soll hier passieren???
  res_obs <- lme4::lmer(tmp_model_formula, data = data_obs) #MS: Die Interaktion X * cl.var
  #wird hier nicht beruecksichtig!!!
  vcov.rand <- lme4::VarCorr(res_obs)[[1]][,]
  hyper.df1 <- nclass + dim(vcov.rand)
  hyper.df2 <- vcov.rand * dim(vcov.rand)

  sims <- array(NA, c(n.chains, max.iter, n.par))

  ###generate starting values
#DEBUGCOUNTER <- 0
#DEBUG_COR <- array(dim = max.iter)
  converged <- FALSE
  iter <- 0
  while(!converged & iter < max.iter){
#    DEBUGCOUNTER <- DEBUGCOUNTER + 1
#    set.seed(DEBUGCOUNTER)
    for (k in 1:n.chains){
      if (iter == 0){

        ###generate starting values
        alpha.new <- matrix(rnorm(n = length.alpha), ncol = n.par.rand)
        beta.new <- rnorm(n = ncol(X_obs))
        sigma.y.new <- runif(1)
        sigma.alpha.new <- solve(rWishart(1, df = n.par.rand + 1, Sigma = diag(n.par.rand))[,,1])
      }

      if (iter != 0){

        # MS: use the latest parameters
        alpha.new <- matrix(sims[k, iter, 1:length.alpha], ncol = n.par.rand)
        beta.new <- sims[k, iter, (length.alpha + 1):(length.alpha + ncol(X_obs))]
        sigma.y.new <- sims[k, iter, length.alpha + ncol(X_obs) + 1]
        sigma.alpha.new <- matrix(sims[k, iter,
                (length.alpha + ncol(X_obs) + 2):(length.alpha + ncol(X_obs) + 1 + n.par.rand^2)],
                ncol = n.par.rand)
      }



      draws <- array(NA, dim = c(n.iter, n.par))

      for (s in 1:n.iter){
#        DEBUGCOUNTER <- DEBUGCOUNTER + 1
#        set.seed(DEBUGCOUNTER)
#        save(Z_obs, alpha.new, b.hat.part1, clID_obs, y_obs, sigma.y.new,xtx, file ="updatesigmaalphacoefcrashes.rda")
#        print(DEBUGCOUNTER)#1904 war letzer angezeigter seed
#    DEBUG_COR[DEBUGCOUNTER] <- cor(alpha.new)[1,2]
#    save(DEBUG_COR, file = "DEBUG_COR.rda")
#        print(paste("Correlation of alpha.new:", round(DEBUG_COR[DEBUGCOUNTER], 3)))
        #jetzt wars 1047
#        load("updatesigmaalphacoefcrashes.rda")

        if(FALSE){
          #pdf("CorrelationAlphanew_Over1000runs.pdf")
          jpeg("CorrelationAlphanew_Over1000runs.jpg")
          plot(x = 2:1000, y = DEBUG_COR[2:1000], type = "l")
          dev.off()
        }

        # Updata fixed effects
        beta.new <- update.beta.coef(Z_obs = Z_obs, alpha.new = alpha.new, b.hat.part1 = b.hat.part1,
                                     clID_obs = clID_obs, y_obs = y_obs,
                                     sigma.y.new = sigma.y.new, xtx = xtx)

        # Update the residual variance
        sigma.y.new <- update.sigma.y.coef(y_obs = y_obs, X_obs = X_obs,
                                           beta.new = beta.new, Z_obs = Z_obs,
                                           alpha.new = alpha.new, clID_obs = clID_obs, n.obs = n.obs)

        # Update random effects covariance matrix
        # !!! 2016-07-13  reciprocal condition number
        #save(hyper.df1, hyper.df2, alpha.new, file ="updatesigmaalphacoefcrashes.rda")
        #MACHT PROBLEME!!!
        #Ich denke das Problem ist die (quasi) perfekte Korrelation in alpha.new
        # (oder in hyper.df2)
        sigma.alpha.new <- update.sigma.alpha.coef(hyper.df1 = hyper.df1,
                                                   hyper.df2 = hyper.df2, alpha.new = alpha.new)

        # Update the cluster specific random effects
        #
        alpha.new <- update.alpha.coef(Z_obs = Z_obs, y_obs = y_obs, X_obs = X_obs,
                                       beta.new = beta.new, clID_obs = clID_obs,
                                       sigma.y.new = sigma.y.new,
                                       sigma.alpha.new.inv = solve(sigma.alpha.new))

        # Store the drawn parameters
        draws[s, ] <- c(alpha.new, beta.new, sigma.y.new, sigma.alpha.new)
      }

      sims[k, (iter + 1):(iter + n.iter), ] <- draws
    }

    iter <- iter + n.iter

    ###discard first part of the MCMC chain

    chains <- sims[, ceiling(iter * burn.in):iter, ]

    ###turn into coda usable object

    get.mcmc <- vector("list", n.chains)

    for (k in 1:n.chains){
      get.mcmc[[k]] <- coda::as.mcmc(chains[k,,])
    }

    ###calculate R.hat
    R.hat <- coda::gelman.diag(get.mcmc, autoburnin = FALSE, multivariate = FALSE)
    converged <- max(R.hat$psrf[,1]) < 1.05
  }

  print(paste("converged after", iter, "iterations."))

  return(chains)
}


# Generate documentation with devtools::document()
# Build package with devtools::build() and devtools::build(binary = TRUE) for zips