R/mice.factorize.R
In miceExt: Extension Package to 'mice'

Documented in mice.factorize

#'Transform Imputations of Binarized Data Into Their Corresponding Factors
#'
#'This function acts as the counterpart to \code{mice.binarize}, as it
#'effectively retransforms imputations of binarized data that \code{mice} has
#'been run on and that has been post-processed via \code{mice.post.matching}
#'after. The post-processing is usually necessary as \code{mice} is very likely
#'to impute multiple ones among the dummy columns belonging to to a single
#'factor entry. The resulting \code{mice::mids} object is not suited for further
#'\code{mice.mids()} iterations or the use of \code{plot}, but works well as
#'input to \code{with()}.
#'
#'@param obj \code{mice::mids} object resulting from a call of
#'  \code{mice.post.matching()} and whose underlying data frame results from a
#'  call of \code{mice::binarize()}.
#'@param par_list List that has been returned in a previous call of
#'  \code{mice::binarize()} next to the underlying data of the argument
#'  \code{obj}.
#'  
#'@return A \code{mice::mids} object in which data and imputations have been
#'  retransformed from their respective binarized versions in the input
#'  \code{obj}. As this isn't a proper result of a mice iteration and many of
#'  the attributes of \code{obj} cannot be transformed well, only the slots
#'  \code{data}, \code{nmis}, \code{where} and \code{imp}, which are needed in
#'  \code{with()}, are not \code{NULL}. In particular, it would not work as
#'  input for \code{mice.mids()}.
#'
#'@author Tobias Schumacher, Philipp Gaffert
#'
#'@examples
#'
#'\dontrun{
#'#------------------------------------------------------------------------------
#'# Example that illustrates the combined functionalities of mice.binarize(),
#'# mice.factorize() and mice.post.matching() on the data set 'boys_data', which
#'# contains the column blocks ('hgt','bmi') and ('hc','gen','phb') that have
#'# identical missing value patterns, and out of which the columns 'gen' and
#'# 'phb' are factors. We are going to impute both tuples blockwise, while
#'# binarizing the factor columns first. Note that we never need to specify any
#'# blocks or columns to binarize, as these are all determined automatically 
#'#------------------------------------------------------------------------------
#'
#'# By default, mice.binarize() expands all factor columns that contain NAs,
#'# so the columns 'gen' and 'phb' are automatically binarized
#'boys_bin <- mice.binarize(boys_data)
#'
#'# Run mice on binarized data, note that we need to use boys_bin$data to grab
#'# the actual binarized data and that we use the output predictor matrix
#'# boys_bin$pred_matrix which is recommended for obtaining better imputation
#'# models
#'mids_boys <- mice(boys_bin$data, predictorMatrix = boys_bin$pred_matrix)
#'
#'# It is very likely that mice imputed multiple ones among one set of dummy
#'# variables, so we need to post-process. As recommended, we also use the output
#'# weights from mice.binarize(), which yield a more balanced weighting on the
#'# column tuple ('hc','gen','phb') within the matching. As in previous examples,
#'# both tuples are automatically discovered and imputed on
#'post_boys <- mice.post.matching(mids_boys, weights = boys_bin$weights)
#'
#'# Now we can safely retransform to the original data, with non-binarized
#'# imputations
#'res_boys <- mice.factorize(post_boys$midsobj, boys_bin$par_list)
#'
#'# Analyze the distribution of imputed variables, e.g. of the column 'gen',
#'# using the mice version of with()
#'with(res_boys, table(gen))
#'
#'
#'
#'#------------------------------------------------------------------------------
#'# Similar example to the previous, that also works on 'boys_data' and
#'# illustrates some more advanced funtionalities of all three functions in miceExt: 
#'# This time we only want to post-process the column block ('gen','phb'), while
#'# weighting the first of these tuples twice as much as the second. Within the
#'# matching, we want to avoid matrix computations by using the euclidian distance
#'# to determine the donor pool, and we want to draw from three donors only.
#'#------------------------------------------------------------------------------
#'
#'# Binarize first, we specify blocks in list format with a single block, so we 
#'# can omit an enclosing list. Similarly, we also specify weights in list format.
#'# Both blocks and weights will be expanded and can be accessed from the output
#'# to use them in mice.post.matching() later
#'boys_bin <- mice.binarize(boys_data, 
#'                          blocks = c("gen", "phb"), 
#'                          weights = c(2,1))
#'
#'# Run mice on binarized data, again use the output predictor matrix from
#'# mice.binarize()
#'mids_boys <- mice(boys_bin$data, predictorMatrix = boys_bin$pred_matrix)
#'
#'# Post-process the binarized columns. We use blocks and weights from the previous
#'# output, and set 'distmetric' and 'donors' as announced in the example
#'# description
#'post_boys <- mice.post.matching(mids_boys,
#'                                blocks = boys_bin$blocks,
#'                                weights = boys_bin$weights,
#'                                distmetric = "euclidian",
#'                                donors = 3L)
#'
#'# Finally, we can retransform to the original format
#'res_boys <- mice.factorize(post_boys$midsobj, boys_bin$par_list)
#'}
#'
#'
#'
#'@seealso \code{\link[miceExt]{mice.binarize}},
#'  \code{\link[miceExt]{mice.post.matching}}, \code{\link[mice]{mice}}
#'@export
mice.factorize <- function(obj, par_list)
{
  ## check whether input is valid
  if (!is.mids(obj))
    stop("Object should be of type mids.")

  check_par_list(obj, par_list)

  # grab function call
  call <- match.call()

  ## make local copies of some frequently used parameters
  src_factor_cols <- par_list$src_factor_cols
  dummy_cols <- par_list$dummy_cols
  n_src_cols <- par_list$n_src_cols

  # get number of columns in padded data frame
  n_padded_cols <- ncol(obj$data)

  # initialize result imputation list, nmis and where
  res_imp <- vector("list", length = n_src_cols)
  res_method <- vector(mode = "character", length = n_src_cols)
  res_nmis <- vector(mode = "numeric", length = n_src_cols)
  res_where <- matrix(FALSE, nrow = nrow(obj$data), ncol = n_src_cols)


  ## first copy imputations of non-categorical variables

  # grab numeric columns in source and padded data
  src_numeric_cols <- setdiff(1:n_src_cols,src_factor_cols)
  pad_data_numeric_cols <- setdiff(1:n_padded_cols, unlist(dummy_cols))

  # copy from mids$imp
  res_imp[src_numeric_cols] <- obj$imp[pad_data_numeric_cols]
  res_method[src_numeric_cols] <- obj$method[pad_data_numeric_cols]
  res_nmis[src_numeric_cols] <- obj$nmis[pad_data_numeric_cols]
  res_where[,src_numeric_cols] <- obj$where[,pad_data_numeric_cols]

  ###
  # MAIN STEP: transform binarized imputations back to categorical, use three convoluted iterations, where the inner two are carried out
  #    with apply/lapply functionalities
  # -> outermost iteration runs on target elements of result imputation list, gathering imputations for each eement
  # -> mid-level iteration runs over number of imputations which form the column of current entry in result imputation list
  # -> innermost iteration runs over all impuations of dummy atrributes of current target attribute, building a binary matrix representing
  #    the encoding of current target attribute, which is then transformed via another call of apply that scans each row of the binary
  #    matrix for the entry that is equal to 1
  ###

  #res_imp[src_factor_cols] <- lapply(seq_along(src_factor_cols),
  for(col_index in seq_along(src_factor_cols))
  {
    ## outer iteration: iterate over index of target column tuple

    # grab tmp values of current target column, corrensponding tuple of dummy columns, and list of levels of target column
    curr_src_index <- src_factor_cols[[col_index]]
    curr_dummy_tuple <- dummy_cols[[col_index]]
    curr_levels <- par_list$src_levels[[col_index]]

    # grab imputation of first dummy variable in current dummy tuple as reference
    # for naming and null-checking purposes
    ref_imp <- obj$imp[[curr_dummy_tuple[1]]]

    # if reference imputation is null, there are no imputed values -> skip to next iteration
    if(is.null(ref_imp))
      next

    # get imputations of current attribute
    curr_imp <- data.frame(lapply(1:obj$m,
      function(m)
      {
        ## mid-level iteration: iterate over imputation index m

        ## build binary matrix representing the encoding of current target attribute of current imputation
        # -> use inner iteration over current dummy columns
        bin_matrix <- do.call(cbind, lapply(obj$imp[curr_dummy_tuple], function(imp) imp[,m]))

        # return transformed factor column
        # -> transformation is obtained by scanning each row in binary matrix for the entry that is "1" via the apply function
        factor(apply(bin_matrix, MARGIN = 1,
          function(row)
          {
            # check whether binary encoding is correct, as mice() might return rows with multiple '1's and therefore requires use of mice.post.matching
            if(!all(row %in% c(0,1)) || sum(row) != 1)
              stop("The imputed values in the binarized columns are not in proper format. Maybe you forgot to run mice.post.matching() on input mids object.\n")

            # transform, column index that equals '1' indicates level to use
            return(curr_levels[which(row == 1)])
          }),
          levels = curr_levels)
      }))

    # copy row and column names from reference imputation
    colnames(curr_imp) <- colnames(ref_imp)
    rownames(curr_imp) <- rownames(ref_imp)

    res_imp[[curr_src_index]] <- curr_imp
    res_method[curr_src_index] <- "pmm"
    res_where[,curr_src_index] <- obj$where[,curr_dummy_tuple[1]]
    res_nmis[curr_src_index] <- obj$nmis[curr_dummy_tuple[1]]
  }

  # set names of elements of resulting imputation list
  names(res_imp) <- par_list$src_names

  # now build result mids object that can be used within mids
  ## save, and return
  midsobj <- list(call = call,
                  data = par_list$src_data,
                  where = res_where,
                  m = obj$m,
                  nmis = res_nmis,
                  imp = res_imp,
                  method = res_method,
                  predictorMatrix = NULL,
                  visitSequence = NULL,
                  post = NULL,
                  seed = obj$seed,
                  iteration = obj$iteration,
                  lastSeedValue = obj$lastSeedValue,
                  chainMean = NULL,
                  chainVar = NULL,
                  loggedEvents = obj$loggedEvents)

  oldClass(midsobj) <- "mids"

  # return result
  return(midsobj)
}