R/impreciseImputation.R

# Copyright (C) 2018  Paul Fink, Eva Endres
#
# This file is part of impimp.
#
# imptree is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# imptree is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with imptree.  If not, see <https://www.gnu.org/licenses/>.

#' @title Imprecise Imputation for Statistical Matching
#'
#' @description Impute a data frame imprecisely
#'
#' @param recipient a data.frame acting as recipient; see details.
#' @param donor a data.frame acting as donor; see details.
#' @param method 1-character string of the desired imputation method.
#' The following values are possible, see details for an explanantion:
#' \code{"variable_wise"} (default), \code{"case_wise"} and
#' \code{"domain"}.
#' @param matchvars a character vector containing the variable names
#' to be used as matching variables. If \code{NULL} (default) all
#' variables, present in both \code{donor} and \code{recipient} are
#' used as matching variables.
#' @param vardomains a named list containing the possible values of
#' all variable in \code{donor} that are not present in
#' \code{recipient}.\cr
#' If set to \code{NULL} (default) the list is generated by first
#' coercing all those variables to type \code{\link[base]{factor}}
#' and then storing their levels.
#'
#' @details
#' As in the context of statistical matching the data.frames
#' \code{recipient} and \code{donor} are assumed to contain an
#' overlapping set of variables.
#'
#' The missing values in \code{recipient} are subsituted with
#' observed values in \code{donor} for approaches based on donation
#' classes and otherwise with the set of all possible values for
#' the variable in question.
#'
#' For \code{method = "domain"} a missing value of a variable in
#' \code{recipient} is imputed by the set of all possible values
#' of that variable.
#'
#' The other methods are based on donation classes which are formed
#' based on the matching variables whose names are provided by
#' \code{matchvars}. They need to be present in both \code{recipient}
#' and \code{donor}:
#' For \code{method = "variable_wise"} a missing value of a variable
#' in \code{recipient} is imputed by the set of all observed values
#' of that variable in \code{donor}.
#' For \code{method = "case_wise"} the variables only present in
#' \code{donor} are represented as tuples. A missing tuple in
#' \code{recipient} is then imputed by the set of all observed
#' tuples in \code{donor}.
#'
#' @section Reserved characters:
#' The variable names and observations in \code{recipient} and
#' \code{donor} must not contain characters that are reserved for
#' internal purpose.
#' The actual characters that are internally used are stored in the
#' options \code{options("impimp.obssep")} and
#' \code{options("impimp.varssep")}. The former is used to separate
#' the values of a set-valued observation, while the other is used
#' for a concise tupel representation.
#'
#' @note
#' This method does not require that all variables in \code{recipient}
#' and \code{donor} are \code{\link[base]{factor}} variables, however,
#' the imputation methods apply coercion to factor, so purely
#' numerical variables will be treated as factors eventually.
#' It does assume (and test for it) that there are no missing
#' values present in the matching variables.
#'
#' @return
#' The data.frame resulting in an imprecise imputation
#' of \code{donor} into \code{recipient}.
#' It is also of class \code{"impimp"} and stores the imputation
#' method in its attribute \code{"impmethod"}, the names of the
#' variables of the resulting object containing imputed values
#' in the attribute \code{"imputedvarnames"}, as well as the
#' list of (guessed) levels of each underlying variable in
#' \code{"varlevels"}.
#'
#' @keywords robust datagen
#'
#' @seealso for the estimation of probabilities \code{\link{impest}}
#' and \code{\link{impestcond}}; \code{\link{rbindimpimp}} for
#' joining two \code{impimp} objects
#'
#' @references Endres, E., Fink, P. and Augustin, T. (2018),
#' Imprecise Imputation: A Nonparametric Micro Approach Reflecting
#' the Natural Uncertainty of Statistical Matching with Categorical
#' Data, \emph{Department of Statistics (LMU Munich): Technical Reports},
#' No. 214. URL \url{https://epub.ub.uni-muenchen.de/42423/}.
#'
#' @examples
#' A <- data.frame(x1 = c(1,0), x2 = c(0,0),
#'                 y1 = c(1,0), y2 = c(2,2))
#' B <- data.frame(x1 = c(1,1,0), x2 = c(0,0,0),
#'                 z1 = c(0,1,1), z2 = c(0,1,2))
#' impimp(A, B, method = "variable_wise")
#'
#' ## Specifically setting the possible levels of 'z1'
#' impimp(A, B, method = "domain", vardomains = list(z1 = c(0:5)))
#'
#' @importFrom stats setNames
#' @export
impimp <- function(recipient, donor, method = c("variable_wise",
                                    "case_wise",
                                    "domain"),
                   matchvars = NULL, vardomains = NULL) {

  # Check the environment
  varsep <- getOption("impimp.varsep", ",")
  obssep <- getOption("impimp.obssep",  "|")
  if(varsep == obssep) {
    stop(gettextf("option values %s and %s need to be different characters",
                  sQuote("impimp.varsep"), sQuote("impimp.obssep"),
                  domain = "R-impimp"))
  }

  # temporarily set stringsAsFactors to FALSE
  # and reset it to old value after exiting
  oldsAF <- options(stringsAsFactors = FALSE)
  on.exit(options(oldsAF))

  # function argument matching
  method <- match.arg(method)

  # extract common variables
  cnames <- intersect(names(donor), names(recipient))

  # Test if there is an non-empty intersection in the names
  if(!length(cnames)) {
    stop(gettextf("%s and %s do not contain any variables present in both",
                  sQuote("recipient"), sQuote("donor"),
                  domain = "R-impimp"))
  }

  if(is.null(matchvars)) {
    matchvars <- cnames
  } else if(is.character(matchvars)){
    if(any(nm <- (match(matchvars, cnames, nomatch = 0L) == 0L))) {
      stop(gettextf("%s contains variable(s) which are not present in both %s and %s: %s",
                    sQuote("matchvars"), sQuote("donor"),
                    sQuote("recipient"),
                    paste(sapply(matchvars[nm], dQuote), collapse = ", "),
                    domain = "R-impimp"))
    }
  } else {
    stop(gettextf("%s must be NULL or a character vector",
                  sQuote("matchvars"),
                  domain = "R-impimp"))
  }
  # Test if the matching variables do not contain NA
  lapply(matchvars, function(x) {
    if(anyNA(recipient[ ,x])) {
      stop(gettextf("missing values in variable %s in %s",
                    sQuote(x), sQuote("recipient"),
                    domain = "R-impimp"))
    }
    if(anyNA(donor[ ,x])) {
      stop(gettextf("missing values in variable %s in %s",
                    sQuote(x), sQuote("donor"),
                    domain = "R-impimp"))
    }
  })


  rnames <- setdiff(names(recipient), names(donor))
  dnames <- setdiff(names(donor), names(recipient))

  allnames <- c(rnames, cnames, dnames)

  # check for special package-reserved characters in variable names
  if(length(grep(varsep, allnames, fixed = TRUE))) {
    stop(gettextf(c("some variable names contain the character %s, reserved for internal purpose.",
                    "\nRename the variable(s) or change the internal character by setting the option %s"),
                  c(sQuote(varsep), sQuote("impimp.varsep")),
                  domain = "R-impimp"))
  }


  # Do nothing if there are no variables in donor that aren't in recipient
  if(!length(dnames)) {
    warning(gettextf(c("no variable present only in %s and not in %s; ",
                       "returning %s unmodified"),
                     c(sQuote("donor"), sQuote("recipient")),
                     sQuote("recipient"),
                     domain = "R-impimp"))
    return(recipient)
  }

  # Construct the possible values for the variables from the
  # (partially) supplied argument 'vardomains'
  if(!is.null(vardomains)){
    # partially match available levels
    lvls <- vardomains[allnames]
  } else {
    # else generate a list of empty ones
    lvls <- vector(length = length(allnames), mode = "list")
  }
  # generate the potentially missing levels
  # by using the factor based approach
  lvls <- lapply(stats::setNames(nm = allnames),
                 function(varname) {
                   varlevels <- lvls[[varname]]
                   # if variable is not present in one df,
                   # then NULL is returned for that df
                   gvarlevels <- gather_levels(c(as.character(recipient[[varname]]),
                                                 as.character(donor[[varname]])))
                   if(is.null(varlevels)) {
                     varlevels <- gvarlevels
                   } else if(length(lvldiff <- setdiff(gvarlevels,
                                                       varlevels))) {
                     varlevels <- c(lvldiff, varlevels)
                   }
                   varlevels
                 })

  # check for special package-reserved characters in variable values
  lapply(names(lvls), function(x) {
    if(length(grep(varsep, lvls[[x]], fixed = TRUE))) {
      stop(gettextf(c("variable %s contains the character %s, reserved for internal purpose.",
                      "\nChange the internal character by setting the option %s"),
                    c(sQuote(x), sQuote("impimp.varsep")),
                    sQuote(varsep),  domain = "R-impimp"))
    }
    if(length(grep(obssep, lvls[[x]], fixed = TRUE))) {
      stop(gettextf(c("variable %s contains the character %s, reserved for internal purpose.",
                      "\nChange the internal character by setting the option %s"),
                    c(sQuote(x), sQuote("impimp.obssep")),
                    sQuote(obssep), domain = "R-impimp"))
    }
  })

  dlvls <- lvls[dnames]
  # impute the domain for every missing cell

  if(method == "domain") {

    # add columns with NA to the data
    impRecipient <- cbind(recipient, matrix(NA, ncol = length(dnames),
                                    nrow = nrow(recipient),
                                    dimnames = list(c(), dnames)))
    # impute all the levels
    impRecipient[, dnames] <- imputation_values(dlvls, dnames)

  } else {
    # impute cell-wise within donor classes

    # create new variable to index the x structure
    # This is for donation classes
    recipient$cfactor <- factor(apply(recipient[, matchvars], MARGIN = 1,
                              FUN = paste, collapse =","))
    donor$cfactor <- factor(apply(donor[, matchvars], MARGIN = 1,
                              FUN = paste, collapse =","))

    ## transform into tuple notation for method == case_wise
    if(method == "case_wise") {
      donor <- cbind(donor, collapse_variables(donor, dnames))
      dlvls <- collapse_variables(
        do.call("expand.grid", dlvls), dnames)
      dnames <- names(dlvls)

    }

    # extract level combinations of common variables
    # which are present in recipient
    clvls <- levels(recipient$cfactor)

    # initialize the resulting data.frame with NAs
    # in the variable sto be imputed
    impRecipient <- cbind(recipient, matrix(NA, ncol = length(dnames),
                                    nrow = nrow(recipient),
                                    dimnames = list(c(), dnames)))

    for(clvl in clvls) {
      ## Generate the levels to impute
      donorclass_donor <- donor[donor$cfactor == clvl, ]
      if(NROW(donorclass_donor) == 0) {
        # empty donor class in donor, use collection
        # of levels of the variables
        ##### Shall we leave this a warning or a message instead?
        ##### We can also opt for a 'verbose' option
        warning(gettextf("No donor found for donation class: %s",
                         sQuote(clvl),
                         domain = "R-impimp"))
        donor_dlvls <- dlvls
      } else {
        # extract observed donor values
        donor_dlvls <- lapply(stats::setNames(nm = dnames),
                              function(x) {
                                gather_levels(donorclass_donor[,x])
        })
      }

      # impute observed donor values
      impRecipient[impRecipient$cfactor == clvl, dnames] <-
        imputation_values(donor_dlvls, dnames)
    }
  }
  impRecipient <- impRecipient[, c(rnames, cnames, dnames)]
  if(!length(grep(class(impRecipient), "impimp", fixed = TRUE))) {
    class(impRecipient) <- c("impimp", class(impRecipient))
  }
  attr(impRecipient, "impmethod") <-
    c(method, attr(impRecipient, "impmethod"))
  attr(impRecipient, "varlevels") <- lvls
  attr(impRecipient, "imputedvarnames") <- dnames
  impRecipient
}

#' @rdname impimp
#' @param x object of class 'impimp'
#' @param ... further arguments passed down to
#' \code{\link[base]{print.data.frame}}
#' @export
print.impimp <- function(x, ...) {
  cat(gettextf("result of imprecise imputation with method %s\n",
                sQuote(attr(x, "impmethod")),
                domain ="R-impimp"))
  NextMethod(x, ...)
}

#' @rdname impimp
#' @param z object to test for class \code{"impimp"}
#' @export
is.impimp <- function(z) {
  inherits(x = z, what = "impimp")
}

Try the impimp package in your browser

Any scripts or data that you put into this service are public.

impimp documentation built on May 1, 2019, 10:13 p.m.