Nothing
### fill.NAs parses out either a data frame or a formula
### and then hands each column over to the appropriate fill.column
### method. Missingness indicators are added for each column that is
### imputed.
#' Given a \code{data.frame} or \code{formula} and data,
#' \code{fill.NAs()} returns an expanded data frame, including a new
#' missingness flag for each variable with missing values and
#' replacing each missing entry with a value representing a reasonable
#' default for missing values in its column. Functions in the formula
#' are supported, with transformations happening before \code{NA}
#' replacement. The expanded data frame is useful for propensity
#' modeling and balance checking when there are covariates with
#' missing values.
#'
#' \code{fill.NAs} prepares data for use in a model or matching
#' procedure by filling in missing values with minimally invasive
#' substitutes. Fill-in is performed column-wise, with each column
#' being treated individually. For each column that is missing, a new
#' column is created of the form \dQuote{ColumnName.NA} with
#' indicators for each observation that is missing a value for
#' \dQuote{ColumnName}. Rosenbaum and Rubin (1984, Sec. 2.4 and
#' Appendix B) discuss propensity score models using this data
#' structure.
#'
#' The replacement value used to fill in a missing value is simple
#' mean replacement. For transformations of variables, e.g. \code{y ~
#' x1 * x2}, the transformation occurs first. The transformation
#' column will be \code{NA} if any of the base columns are
#' \code{NA}. Fill-in occurs next, replacing all missing values with
#' the observed column mean. This includes transformation columns.
#'
#' Data can be passed to \code{fill.NAs} in two ways. First, you can
#' simply pass a \code{data.frame} object and \code{fill.NAs} will
#' fill every column. Alternatively, you can pass a \code{formula} and
#' a \code{data.frame}. Fill-in will only be applied to columns
#' specifically used in the formula. Prior to fill-in, any functions
#' in the formula will be expanded. If any arguments to the functions
#' are \code{NA}, the function value will also be \code{NA} and
#' subject to fill-in.
#'
#' By default, \code{fill.NAs} does not impute the response
#' variable. This is to encourage more sophisticated imputation
#' schemes when the response is a treatment indicator in a matching
#' problem. This behavior can be overridden by setting \code{all.covs
#' = TRUE}.
#'
#' @title Create missingness indicator variables and non-informatively fill in missing values
#'
#' @param x Can be either a data frame (in which case the data
#' argument should be \code{NULL}) or a formula (in which case data
#' must be a data.frame)
#' @param data If x is a formula, this must be a data.frame. Otherwise
#' it will be ignored.
#' @param all.covs Should the response variable be imputed? For
#' formula \code{x}, this is the variable on the left hand side. For
#' \code{data.frame} \code{x}, the response is considered the first
#' column.
#' @param contrasts.arg (from \code{model.matrix}) A list, whose
#' entries are values (numeric matrices or character strings naming
#' functions) to be used as replacement values for the
#' \code{\link{contrasts}} replacement function and whose names are
#' the names of columns of \code{data} containing
#' \code{\link{factor}}s.
#'
#' @return A \code{data.frame} with all \code{NA} values replaced with
#' mean values and additional indicator columns for each column
#' including missing values. Suitable for directly passing to
#' \code{\link{lm}} or other model building functions to build
#' propensity scores.
#'
#' @author Mark M. Fredrickson and Jake Bowers
#'
#' @references
#' Rosenbaum, Paul R. and Rubin, Donald B. (1984) \sQuote{Reducing
#' Bias in Observational Studies using Subclassification on the
#' Propensity Score,} \emph{Journal of the American Statistical
#' Association}, \bold{79}, 516 -- 524.
#'
#' Von Hipple, Paul T. (2009) \sQuote{How to impute interactions,
#' squares, and other transformed variables,} \emph{Sociological
#' Methodology}, \bold{39}(1), 265 -- 291.
#'
#' @seealso \code{\link{match_on}}, \code{\link{lm}}
#'
#' @examples
#' data(nuclearplants)
#' ### Extract some representative covariates:
#' np.missing <- nuclearplants[c('t1', 't2', 'ne', 'ct', 'cum.n')]
#'
#' ### create some missingness in the covariates
#' n <- dim(np.missing)[1]
#' k <- dim(np.missing)[2]
#'
#' for (i in 1:n) {
#' missing <- rbinom(1, prob = .1, size = k)
#' if (missing > 0) {
#' np.missing[i, sample(k, missing)] <- NA
#' }
#' }
#'
#' ### Restore outcome and treatment variables:
#' np.missing <- data.frame(nuclearplants[c('cost', 'pr')], np.missing)
#'
#' ### Fit a propensity score but with missing covariate data flagged
#' ### and filled in, as in Rosenbaum and Rubin (1984, Appendix):
#' np.filled <- fill.NAs(pr ~ t1 * t2, np.missing)
#' # Look at np.filled to establish what missingness flags were created
#' head(np.filled)
#' (np.glm <- glm(pr ~ ., family=binomial, data=np.filled))
#' (glm(pr ~ t1 + t2 + `t1:t2` + t1.NA + t2.NA,
#' family=binomial, data=np.filled))
#' # In a non-interactive session, the following may help, as long as
#' # the formula passed to `fill.NAs` (plus any missingness flags) is
#' # the desired formula for the glm.
#' (glm(formula(terms(np.filled)), family=binomial, data=np.filled))
#'
#' ### produce a matrix of propensity distances based on the propensity model
#' ### with fill-in and flagging. Then perform pair matching on it:
#' pairmatch(match_on(np.glm, data=np.filled), data=np.filled)
#'
#' ## fill NAs without using treatment contrasts by making a list of contrasts for
#' ## each factor ## following hints from https://stackoverflow.com/a/4569239/161808
#'
#' np.missing$t1F<-factor(np.missing$t1)
#' cov.factors <- sapply(np.missing[,c("t1F","t2")],is.factor)
#' cov.contrasts <- lapply(
#' np.missing[,names(cov.factors)[cov.factors],drop=FALSE],
#' contrasts, contrasts = FALSE)
#'
#' ## make a data frame filling the missing covariate values, but without
#' ## excluding any levels of any factors
#' np.noNA2<-fill.NAs(pr~t1F+t2,data=np.missing,contrasts.arg=cov.contrasts)
#'
#' @keywords manip
#'
#' @export
fill.NAs <- function(x, data = NULL, all.covs = FALSE, contrasts.arg=NULL) {
# if x is present alone, it must be a data.frame
# if x is present with data, x is a formula
# all.covs is logical indicating whether or not we want a response variable
if (is.null(data)) {
if (!is.data.frame(x) && !is.matrix(x)) {
stop("Single argument must be a data frame")
}
# swap the arguments around
data <- as.data.frame(x) # in case it is a matrix
response <- colnames(data)[1] # the name of the response var
if(!all.covs && dim(data)[2] > 1){
# Added ticks in case response has "+" or other formula characters in it
x <- terms(as.formula(paste0("`", response, "` ~ .")), data = data)
} else {
x <- terms(as.formula(~ .), data = data) # everything, including the response should be imputed
}
} else {
if (inherits(x, "formula")) {
if(!is.data.frame(data) && !is.matrix(data)) {
stop("Data argument required for formulas")
}
}
}
# Adding informative error for #104 prior to fixing the overarching issue
if (any(vapply(data, function(x) !is.null(dim(x)), logical(1)))) {
stop("Data with matrix columns not supported; potentially generated by `cbind` inside formula.")
}
withStrata <- findStrata(x, data)
data <- as.data.frame(data) # in case it is a matrix
ttt <- terms(withStrata$newx, data = data)
response <- all.vars(ttt)[attr(ttt, "response")]
data.trimmed <- data[all.vars(ttt)]
# not.response <- colnames(data)[colnames(data) != response]
# find missingness indicators for each column
original.NAs <- vapply(data.trimmed, function(i) { any(is.na(i)) }, logical(1))
original.names <- colnames(data.trimmed)[original.NAs]
# create a model matrix from the data.trimmed; transforms of NA should be NA
modmat <- model.matrix(withStrata$newx, model.frame(withStrata$newx, data.trimmed, na.action = na.pass), contrasts.arg=contrasts.arg)
modmat <- as.data.frame(modmat)
# remove the intercept, if any
modmat["(Intercept)"] <- NULL
# shortcircuit if there are no additional NAs to add
# if(!any(original.NAs)) {
# result <- cbind(data.trimmed[response], modmat)
# return(result)
# }
result <- modmat
if(any(original.NAs)) {
# indicator columns for missing data.trimmed, only for original missing columns, not transforms
NA.columns <- vapply(data.trimmed[original.names], function(column) {
is.na(column)
}, logical(nrow(data.trimmed)))
colnames(NA.columns) <- paste(colnames(NA.columns), "NA", sep = ".")
# of the remaining columns, find those with missingness
expanded.NAs <- colnames(modmat)[apply(modmat, 2, function(i) { any(is.na(i))})]
# fill in the columns with missingness
# NB: fill.column.numeric is hard coded as value of model.matrix is always numeric. no need for a generic fn.
if (length(withStrata$strata) > 0 ) {
sformula <- as.formula(paste("~", paste(withStrata$strata, collapse = "+")))
tmp <- interaction(model.frame(sformula, data = data, na.action = na.pass))
for (l in levels(tmp)) {
idx <- tmp == l & !is.na(tmp)
modmat[idx, expanded.NAs] <- lapply(modmat[expanded.NAs][idx, , drop = FALSE], fill.column.numeric)
}
} else {
modmat[expanded.NAs] <- lapply(modmat[expanded.NAs], fill.column.numeric)
}
result <- cbind(modmat, NA.columns)
}
if(!all.covs){
result <- cbind(data.trimmed[response], result)
newfmla <- formula(result)
} else {
p <- paste0("`", names(result), "`")
p <- gsub("``", "`", p)
newfmla <- formula(paste("~", paste(p, collapse="+")))
}
if (length(withStrata$strata) > 0) {
sformula <- as.formula(paste("~", paste(withStrata$strata, collapse = "+")))
tmp <- model.frame(sformula, data = data, na.action = na.pass)
colnames(tmp) <- all.vars(sformula)
result <- cbind(result, tmp)
newfmla <- update(newfmla, paste0(".~.+", paste(withStrata$strata, collapse = "+")))
}
attr(result, "terms") <- terms(newfmla, data = data, specials = "strata")
return(result)
}
### Column imputation: takes a column and returns filled in values
### Adding NA indicator columns happens elsewhere
### NB: the only that will probably ever be called in numeric now
### that model.matrix gets called ahead of column filling
#' @keywords internal
fill.column <- function(column) {
UseMethod("fill.column", column)
}
#' @keywords internal
fill.column.numeric <- function(column) {
nas <- is.na(column)
cm <- mean(column[!nas])
column[nas] <- cm
return(column)
}
#' @keywords internal
fill.column.logical <- function(column) {
nas <- is.na(column)
cm <- mean(column[!nas]) > .5
column[nas] <- cm
return(column)
}
#' @keywords internal
fill.column.factor <- function(column) {
# following RITools' imputation function, this adds a level
# another option would be imputing based on the modal factor
levels(column) <- c(levels(column),'.NA')
column[is.na(column)] <- ".NA"
return(column)
}
#' @keywords internal
fill.column.ordered <- function(column) {
# uses a median imputation scheme -- finds the middle most
# level and uses that
sorted <- sort(column, NA.last = NA)
imputed.value <- sorted[floor(length(sorted))/2]
column[is.na(column)] <- imputed.value
return(column)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.