R/complete_results.R

Defines functions complete_results

Documented in complete_results

#' Completed data sets as long data.frame
#'
#' Completed (imputed) data sets given incomplete data passed to
#' \code{\link{smirf}} and the output thereof, returned in a long-format
#' data.frame.
#'
#' Description
#'
#' @param X data.frame;
#'     incomplete data set passed to \code{\link{smirf}} in construction of
#'     \code{mf}.
#' @param mf list;
#'     imputed data sets and output of missForest procedure as returned by
#'     \code{\link{smirf}}.
#' @param k integer;
#'     number of completed data sets to generate, default is the number of
#'     imputed data sets in \code{mf}.
#' @param to.complete character;
#'     vector of names of variables to include, default is all variables in
#'     imputed data sets in \code{mf}.
#' @param replace boolean;
#'     indicator of sampling from imputed data sets with replacement or not.
#' @return data.frame;
#'     completed long-format data containing the columns of \code{X} and two
#'     more columns, \describe{
#'        \item{\code{.source}}{integer; id of the source data set in \code{mf}
#'            of the imputed data in the row (if any), otherwise \code{NA},
#'            and;}
#'        \item{\code{.imp}}{integer; id of the complete data set between 1 and
#'            \code{k}, or \code{NA} if it belongs to all (i.e. contains no
#'            imputed data),}
#'     } and each row is either \itemize{
#'         \item a complete case whose values are unaltered, or;
#'         \item an incomplete case with a sample from the imputed data
#'               replacing any missing values.
#'     }
#'
#' @seealso \code{\link{smirf}}
#'
#' @section To-do:
#' \itemize{
#'   \item document \code{complete_results}
#'   \item write tests for \code{complete_results}
#' }
#'
#' @export
complete_results <- function(X,
                             mf,
                             k=length(mf$results),
                             to.complete=names(mf$which_imputed),
                             replace=F) {

    # TODO: perhaps write check arguments function
    if (inherits(X, 'grouped_df'))
        warning('groups in X will be ignored and discarded.')

    not_found <- names(to.complete)[!(names(to.complete) %in%
                                          names(mf$which_imputed))]
    if (length(not_found) > 0)
        stop(paste0('variables in to.complete not found in imputed data set:\n',
                    '  - ', paste0(not_found, collapse=', '), '.'))

    # number of imputed data sets
    n <- length(mf$results)
    to_use <- 1L + sapply(mf$results, getElement, 'iterations')

    which_imputed <- unique(do.call(c, unname(mf$which_imputed[to.complete])))

    J <- samples_as_matrix(n, k, size=length(which_imputed), replace=replace)

    X_ <- X[rep(which_imputed, times=k),,drop=F]

    # over each data set generated by smirf
    for (source_set in seq_len(n)) {
        # observations to which this imputed data set is assigned
        to_j <- which(J == source_set, arr.ind=T)

        # rows of X_ to which the imputed data is assigned
        to_X_ <- (to_j[,2] - 1L) * length(which_imputed) + to_j[,1]
        # record 'id' of imputed data set (source) and the final data set
        X_[to_X_, '.source'] <- source_set
        X_[to_X_, '.imp'] <- to_j[,2]

        # over each to-be completed variable
        for (v in to.complete) {
            # indices of imputed data from which value is taken
            # could be taken outside loop
            from_j <- match(which_imputed[to_j[,1]],
                            mf$which_imputed[[v]],
                            nomatch=0L)
            X_[to_X_[from_j > 0L],v] <- with(mf$results[[source_set]],
                                             imputed[[
                                                 to_use[source_set]
                                             ]][[v]][from_j])
        }
    }

    X[,'.source'] <- X[,'.imp'] <- NA_integer_
    rbind(X[-which_imputed,], X_)

}
stephematician/miForang documentation built on July 23, 2019, 5:11 p.m.