Nothing
# Copyright (C) 2018 Paul Fink, Eva Endres
#
# This file is part of impimp.
#
# imptree is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# imptree is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with imptree. If not, see <https://www.gnu.org/licenses/>.
#' @title Imprecise Imputation for Statistical Matching
#'
#' @description Impute a data frame imprecisely
#'
#' @param recipient a data.frame acting as recipient; see details.
#' @param donor a data.frame acting as donor; see details.
#' @param method 1-character string of the desired imputation method.
#' The following values are possible, see details for an explanantion:
#' \code{"variable_wise"} (default), \code{"case_wise"} and
#' \code{"domain"}.
#' @param matchvars a character vector containing the variable names
#' to be used as matching variables. If \code{NULL} (default) all
#' variables, present in both \code{donor} and \code{recipient} are
#' used as matching variables.
#' @param vardomains a named list containing the possible values of
#' all variable in \code{donor} that are not present in
#' \code{recipient}.\cr
#' If set to \code{NULL} (default) the list is generated by first
#' coercing all those variables to type \code{\link[base]{factor}}
#' and then storing their levels.
#'
#' @details
#' As in the context of statistical matching the data.frames
#' \code{recipient} and \code{donor} are assumed to contain an
#' overlapping set of variables.
#'
#' The missing values in \code{recipient} are subsituted with
#' observed values in \code{donor} for approaches based on donation
#' classes and otherwise with the set of all possible values for
#' the variable in question.
#'
#' For \code{method = "domain"} a missing value of a variable in
#' \code{recipient} is imputed by the set of all possible values
#' of that variable.
#'
#' The other methods are based on donation classes which are formed
#' based on the matching variables whose names are provided by
#' \code{matchvars}. They need to be present in both \code{recipient}
#' and \code{donor}:
#' For \code{method = "variable_wise"} a missing value of a variable
#' in \code{recipient} is imputed by the set of all observed values
#' of that variable in \code{donor}.
#' For \code{method = "case_wise"} the variables only present in
#' \code{donor} are represented as tuples. A missing tuple in
#' \code{recipient} is then imputed by the set of all observed
#' tuples in \code{donor}.
#'
#' @section Reserved characters:
#' The variable names and observations in \code{recipient} and
#' \code{donor} must not contain characters that are reserved for
#' internal purpose.
#' The actual characters that are internally used are stored in the
#' options \code{options("impimp.obssep")} and
#' \code{options("impimp.varssep")}. The former is used to separate
#' the values of a set-valued observation, while the other is used
#' for a concise tupel representation.
#'
#' @note
#' This method does not require that all variables in \code{recipient}
#' and \code{donor} are \code{\link[base]{factor}} variables, however,
#' the imputation methods apply coercion to factor, so purely
#' numerical variables will be treated as factors eventually.
#' It does assume (and test for it) that there are no missing
#' values present in the matching variables.
#'
#' @return
#' The data.frame resulting in an imprecise imputation
#' of \code{donor} into \code{recipient}.
#' It is also of class \code{"impimp"} and stores the imputation
#' method in its attribute \code{"impmethod"}, the names of the
#' variables of the resulting object containing imputed values
#' in the attribute \code{"imputedvarnames"}, as well as the
#' list of (guessed) levels of each underlying variable in
#' \code{"varlevels"}.
#'
#' @keywords robust datagen
#'
#' @seealso for the estimation of probabilities \code{\link{impest}}
#' and \code{\link{impestcond}}; \code{\link{rbindimpimp}} for
#' joining two \code{impimp} objects
#'
#' @references Endres, E., Fink, P. and Augustin, T. (2018),
#' Imprecise Imputation: A Nonparametric Micro Approach Reflecting
#' the Natural Uncertainty of Statistical Matching with Categorical
#' Data, \emph{Department of Statistics (LMU Munich): Technical Reports},
#' No. 214. URL \url{https://epub.ub.uni-muenchen.de/42423/}.
#'
#' @examples
#' A <- data.frame(x1 = c(1,0), x2 = c(0,0),
#' y1 = c(1,0), y2 = c(2,2))
#' B <- data.frame(x1 = c(1,1,0), x2 = c(0,0,0),
#' z1 = c(0,1,1), z2 = c(0,1,2))
#' impimp(A, B, method = "variable_wise")
#'
#' ## Specifically setting the possible levels of 'z1'
#' impimp(A, B, method = "domain", vardomains = list(z1 = c(0:5)))
#'
#' @importFrom stats setNames
#' @export
impimp <- function(recipient, donor, method = c("variable_wise",
"case_wise",
"domain"),
matchvars = NULL, vardomains = NULL) {
# Check the environment
varsep <- getOption("impimp.varsep", ",")
obssep <- getOption("impimp.obssep", "|")
if(varsep == obssep) {
stop(gettextf("option values %s and %s need to be different characters",
sQuote("impimp.varsep"), sQuote("impimp.obssep"),
domain = "R-impimp"))
}
# temporarily set stringsAsFactors to FALSE
# and reset it to old value after exiting
oldsAF <- options(stringsAsFactors = FALSE)
on.exit(options(oldsAF))
# function argument matching
method <- match.arg(method)
# extract common variables
cnames <- intersect(names(donor), names(recipient))
# Test if there is an non-empty intersection in the names
if(!length(cnames)) {
stop(gettextf("%s and %s do not contain any variables present in both",
sQuote("recipient"), sQuote("donor"),
domain = "R-impimp"))
}
if(is.null(matchvars)) {
matchvars <- cnames
} else if(is.character(matchvars)){
if(any(nm <- (match(matchvars, cnames, nomatch = 0L) == 0L))) {
stop(gettextf("%s contains variable(s) which are not present in both %s and %s: %s",
sQuote("matchvars"), sQuote("donor"),
sQuote("recipient"),
paste(sapply(matchvars[nm], dQuote), collapse = ", "),
domain = "R-impimp"))
}
} else {
stop(gettextf("%s must be NULL or a character vector",
sQuote("matchvars"),
domain = "R-impimp"))
}
# Test if the matching variables do not contain NA
lapply(matchvars, function(x) {
if(anyNA(recipient[ ,x])) {
stop(gettextf("missing values in variable %s in %s",
sQuote(x), sQuote("recipient"),
domain = "R-impimp"))
}
if(anyNA(donor[ ,x])) {
stop(gettextf("missing values in variable %s in %s",
sQuote(x), sQuote("donor"),
domain = "R-impimp"))
}
})
rnames <- setdiff(names(recipient), names(donor))
dnames <- setdiff(names(donor), names(recipient))
allnames <- c(rnames, cnames, dnames)
# check for special package-reserved characters in variable names
if(length(grep(varsep, allnames, fixed = TRUE))) {
stop(gettextf(c("some variable names contain the character %s, reserved for internal purpose.",
"\nRename the variable(s) or change the internal character by setting the option %s"),
c(sQuote(varsep), sQuote("impimp.varsep")),
domain = "R-impimp"))
}
# Do nothing if there are no variables in donor that aren't in recipient
if(!length(dnames)) {
warning(gettextf(c("no variable present only in %s and not in %s; ",
"returning %s unmodified"),
c(sQuote("donor"), sQuote("recipient")),
sQuote("recipient"),
domain = "R-impimp"))
return(recipient)
}
# Construct the possible values for the variables from the
# (partially) supplied argument 'vardomains'
if(!is.null(vardomains)){
# partially match available levels
lvls <- vardomains[allnames]
} else {
# else generate a list of empty ones
lvls <- vector(length = length(allnames), mode = "list")
}
# generate the potentially missing levels
# by using the factor based approach
lvls <- lapply(stats::setNames(nm = allnames),
function(varname) {
varlevels <- lvls[[varname]]
# if variable is not present in one df,
# then NULL is returned for that df
gvarlevels <- gather_levels(c(as.character(recipient[[varname]]),
as.character(donor[[varname]])))
if(is.null(varlevels)) {
varlevels <- gvarlevels
} else if(length(lvldiff <- setdiff(gvarlevels,
varlevels))) {
varlevels <- c(lvldiff, varlevels)
}
varlevels
})
# check for special package-reserved characters in variable values
lapply(names(lvls), function(x) {
if(length(grep(varsep, lvls[[x]], fixed = TRUE))) {
stop(gettextf(c("variable %s contains the character %s, reserved for internal purpose.",
"\nChange the internal character by setting the option %s"),
c(sQuote(x), sQuote("impimp.varsep")),
sQuote(varsep), domain = "R-impimp"))
}
if(length(grep(obssep, lvls[[x]], fixed = TRUE))) {
stop(gettextf(c("variable %s contains the character %s, reserved for internal purpose.",
"\nChange the internal character by setting the option %s"),
c(sQuote(x), sQuote("impimp.obssep")),
sQuote(obssep), domain = "R-impimp"))
}
})
dlvls <- lvls[dnames]
# impute the domain for every missing cell
if(method == "domain") {
# add columns with NA to the data
impRecipient <- cbind(recipient, matrix(NA, ncol = length(dnames),
nrow = nrow(recipient),
dimnames = list(c(), dnames)))
# impute all the levels
impRecipient[, dnames] <- imputation_values(dlvls, dnames)
} else {
# impute cell-wise within donor classes
# create new variable to index the x structure
# This is for donation classes
recipient$cfactor <- factor(apply(recipient[, matchvars], MARGIN = 1,
FUN = paste, collapse =","))
donor$cfactor <- factor(apply(donor[, matchvars], MARGIN = 1,
FUN = paste, collapse =","))
## transform into tuple notation for method == case_wise
if(method == "case_wise") {
donor <- cbind(donor, collapse_variables(donor, dnames))
dlvls <- collapse_variables(
do.call("expand.grid", dlvls), dnames)
dnames <- names(dlvls)
}
# extract level combinations of common variables
# which are present in recipient
clvls <- levels(recipient$cfactor)
# initialize the resulting data.frame with NAs
# in the variable sto be imputed
impRecipient <- cbind(recipient, matrix(NA, ncol = length(dnames),
nrow = nrow(recipient),
dimnames = list(c(), dnames)))
for(clvl in clvls) {
## Generate the levels to impute
donorclass_donor <- donor[donor$cfactor == clvl, ]
if(NROW(donorclass_donor) == 0) {
# empty donor class in donor, use collection
# of levels of the variables
##### Shall we leave this a warning or a message instead?
##### We can also opt for a 'verbose' option
warning(gettextf("No donor found for donation class: %s",
sQuote(clvl),
domain = "R-impimp"))
donor_dlvls <- dlvls
} else {
# extract observed donor values
donor_dlvls <- lapply(stats::setNames(nm = dnames),
function(x) {
gather_levels(donorclass_donor[,x])
})
}
# impute observed donor values
impRecipient[impRecipient$cfactor == clvl, dnames] <-
imputation_values(donor_dlvls, dnames)
}
}
impRecipient <- impRecipient[, c(rnames, cnames, dnames)]
if(!length(grep(class(impRecipient), "impimp", fixed = TRUE))) {
class(impRecipient) <- c("impimp", class(impRecipient))
}
attr(impRecipient, "impmethod") <-
c(method, attr(impRecipient, "impmethod"))
attr(impRecipient, "varlevels") <- lvls
attr(impRecipient, "imputedvarnames") <- dnames
impRecipient
}
#' @rdname impimp
#' @param x object of class 'impimp'
#' @param ... further arguments passed down to
#' \code{\link[base]{print.data.frame}}
#' @export
print.impimp <- function(x, ...) {
cat(gettextf("result of imprecise imputation with method %s\n",
sQuote(attr(x, "impmethod")),
domain ="R-impimp"))
NextMethod(x, ...)
}
#' @rdname impimp
#' @param z object to test for class \code{"impimp"}
#' @export
is.impimp <- function(z) {
inherits(x = z, what = "impimp")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.