R/measure_stekhoven_2012.R

Defines functions measure_stekhoven_2012

Documented in measure_stekhoven_2012

#' Measure change between completed data sets as per Stekhoven and Buehlmann 
#'
#' Measures a relationship between two supplied completed data sets, typically
#' generated by two sequential iterations of the missForest procedure. Given by
#' a Frobenius norm-based relative difference or the proportion of stationary
#' values for continuous and categorical (including ordered) data respectively,
#' as in Stekhoven and Buehlmann (2012).
#'
#' Measures a relationship between two supplied completed data sets, typically
#' generated by two sequential iterations of the missForest procedure. Intended
#' to be used with the stop criterion that as soon as all values (see below)
#' remain constant or decrease at once, then the missForest procedure is deemed
#' to have converged. These are as per the original Stekhoven and Buehlmann
#' (2012) paper.
#'
#' The two measures are the;
#' \itemize{
#'    \item sum of the differences squared of all continuous data divided by the
#'          sum of the squares of the continuous data in the second supplied
#'          completed data set, and;
#'    \item proportion of stationary categorical data between the two supplied
#'          completed data sets (see \code{\link{stationary_rate}}).
#' }
#'
#' The first item is referred to here as a Frobenius norm-based relative
#' difference in the completed data.
#'
#' @inheritParams perform_missforest
#' @param X named list;
#'            imputed values of each variable (named) from one iteration within
#'            missForest procedure.
#' @param Y list;
#'            imputed values of each variable (named) from the iteration within
#'            the missForest procedure succeeding that used to determine
#'            \code{X}.
#' @return named numeric;
#'             two named values: \describe{
#'                 \item{\code{continuous}}{a Frobenius norm-based relative
#'                      difference of the continuous data between the two
#'                      completed data sets, and;}
#'                 \item{\code{categorical}}{proportion of stationary values of
#'                     categorical (including ordered) data between the two
#'                     completed data sets (see \code{\link{stationary_rate}}).}
#'             }
#'
#' @seealso \code{\link{measure_correlation}} \code{\link{smirf}} 
#'          \code{\link{stationary_rate}}
#'
#' @references
#'
#' Stekhoven, D.J. and Buehlmann, P., 2012. MissForest--non-parametric missing
#' value imputation for mixed-type data. \emph{Bioinformatics, 28}(1), pp.
#' 112-118.
#' \href{https://dx.doi.org/10.1093/bioinformatics/btr597}{doi:10.1093/bioinformatics/btr597}
#'
#' @examples
#' \dontrun{
#' # simply pass to smirf
#' smirf(iris, stop.measure=measure_stekhoven_2012)
#' }
#' @export
measure_stekhoven_2012 <- function(X, Y, X_init, indicator) {

    # a measure is left as NULL, and therefore dropped from the returned
    # vector, when X holds no variable of the corresponding type
    continuous <- categorical <- NULL

    # vapply() always yields a logical vector; sapply() returns list() for an
    # empty X, which cannot be negated and would raise an error
    is_cat <- vapply(X, is.factor, logical(1))

    cts_data <- names(X)[!is_cat]

    # includes ordered data
    cat_data <- setdiff(names(X), cts_data)

    if (length(cts_data) > 0) {

        # sum over continuous variables of the squared change between X and Y
        sq_change <- sum(mapply(function(x, y) sum((x - y)^2),
                                X[cts_data],
                                Y[cts_data]))

        # sum of squares of the values in Y together with those entries of
        # X_init that are not flagged by the indicator
        sq_total <- sum(mapply(function(y, d, ind)
                                   sum(c(y, d[!ind])^2),
                               Y[cts_data],
                               X_init[cts_data],
                               indicator[cts_data]))

        # reversed sign here due to form of stop_condition()
        continuous <- -sq_change / sq_total

    }

    if (length(cat_data) > 0) {
        categorical <- stationary_rate(X[cat_data],
                                       Y[cat_data],
                                       X_init[cat_data],
                                       indicator[cat_data])
    }

    # NULL entries vanish, so only the measures that were computed are returned
    c(categorical=categorical, continuous=continuous)

}
stephematician/miForang documentation built on July 23, 2019, 5:11 p.m.