Nothing
aregImpute <- function(formula, data, subset, n.impute=5,
group=NULL, nk=3, tlinear=TRUE,
type=c('pmm','regression','normpmm'), pmmtype=1,
match=c('weighted','closest','kclosest'),
kclosest=3, fweighted=0.2, curtail=TRUE, constraint=NULL,
boot.method=c('simple', 'approximate bayesian'),
burnin=3, x=FALSE,
pr=TRUE, plotTrans=FALSE,
tolerance=NULL, B=75) {
acall <- match.call()
type <- match.arg(type)
match <- match.arg(match)
boot.method <- match.arg(boot.method)
if(pmmtype %nin% 1:3) stop('pmmtype must be 1, 2, or 3')
if(pmmtype == 3) boot.method <- 'approximate bayesian'
lgroup <- length(group)
if(type == 'normpmm' && lgroup)
stop('group makes no sense when type="normpmm"')
if(type == 'normpmm' && ! tlinear)
stop('type="normpmm" not implemented when tlinear=FALSE because no covariance matrix is available for right hand side beta for first canonical variate')
if(length(constraint) && type != 'pmm')
stop("constraint only works with type='pmm'")
if(! inherits(formula,'formula'))
stop('formula must be a formula')
nam <- all.vars(formula)
m <- match.call(expand.dots = FALSE)
Terms <- terms(formula, specials='I')
m$formula <- formula
m$match <- m$fweighted <- m$curtail <- m$x <- m$n.impute <- m$nk <-
m$tlinear <- m$burnin <- m$type <- m$pmmtype <- m$group <- m$pr <-
m$plotTrans <- m$tolerance <- m$boot.method <- m$B <- m$constraint <- NULL
m$na.action <- na.retain
m[[1]] <- as.name("model.frame")
z <- eval(m, sys.parent())
p <- length(z)
n <- nrow(z)
rnam <- row.names(z)
if(length(rnam) == 0) rnam <- as.character(1:n)
if(lgroup) {
if(boot.method == 'approximate bayesian')
stop('group not implemented for boot.method="approximate bayesian"')
if(lgroup != n)
stop('group should have length equal to number of observations')
ngroup <- length(unique(group[! is.na(group)]))
}
linear <- nam[attr(Terms,'specials')$I]
cat.levels <- vector('list',p)
names(cat.levels) <- nam
vtype <- rep('s', p); names(vtype) <- nam
dof <- rep(NA, p); names(dof) <- nam
na <- vector('list',p)
names(na) <- nam
nna <- integer(p); names(nna) <- nam
xf <- matrix(as.double(1), nrow=n, ncol=p, dimnames=list(rnam,nam))
imp <- vector('list',p)
names(imp) <- nam
if(lgroup) group.inds <- imp
for(i in 1:p) {
xi <- z[[i]]
ni <- nam[i]
nai <- is.na(xi)
na[[i]] <- (1:n)[nai]
nna[i] <- nnai <- sum(nai)
if(nnai > 0) imp[[ni]] <- matrix(NA, nrow=nnai, ncol=n.impute,
dimnames=list(rnam[nai],NULL))
if(lgroup) {
if(any(is.na(group[! nai])))
stop('NAs not allowed in group')
if(length(unique(group[! nai])) != ngroup)
stop(paste('not all',ngroup,
'values of group are represented in\n',
'observations with non-missing values of',
ni))
group.inds[[i]] <- split((1:n)[! nai], group[! nai])
}
iscat <- FALSE
if(is.character(xi)) {
xi <- as.factor(xi)
lev <- levels(xi)
iscat <- TRUE
} else if(is.factor(xi)) {
lev <- levels(xi)
iscat <- TRUE
}
if(iscat) {
if(length(lev) < 2) stop(paste(ni,'is constant'))
tab <- table(xi)
if(any(tab == 0))
stop(paste(ni,'has the following levels with no observations:',
paste(names(tab)[tab == 0],collapse=' ')))
if(any(tab < 5))
warning(paste(ni,'has the following levels with < 5 observations:',
paste(names(tab)[tab < 5],collapse=' '),
'\nConsider using the group parameter to balance bootstrap samples'))
cat.levels[[ni]] <- lev
xi <- as.integer(xi)
vtype[ni] <- 'c'
}
else {
u <- unique(xi[! nai])
if(length(u) == 1)
stop(paste(ni,'is constant'))
else
if((length(nk) == 1 && nk == 0) || length(u) == 2 || ni %in% linear)
vtype[ni] <- 'l'
}
xf[,i] <- xi
## Initialize imputed values to random sample of non-missings
if(nnai > 0) xf[nai, i] <-
sample(xi[! nai], nnai, replace=nnai > (n-nnai))
}
Countqual <- NULL
if(length(constraint)) {
## Define a list of lists of vectors
## For each target variable referenced in constraint, make a list
## having as each of its elements corresponding to missing observations
## on the target variable, with the element containing a list of
## row numbers of the set of non-missing target variables with
## corresponding original data meeting the constraint
## Expressions within constraint must reference a single target (recipient)
## observation using r$ followed by original variable names (and
## expecting their original coding) and vectors of donor observation
## variable values using d$ followed by original variable names (and
## expecting their original coding also).
cons <- Countqual <- vector('list', length(constraint))
ncons <- names(constraint)
names(cons) <- names(Countqual) <- ncons
if(any(duplicated(ncons)))
stop('constraint contains duplicate entries for a variable')
for(v in ncons) {
if(v %nin% names(z)) stop('variable ', v, ' in constraint not in data')
xi <- z[, v]
nai <- which(is.na(xi))
nnai <- length(nai)
if(! nnai) warning('variable ', v, ' is in constraints but has no NAs')
j <- which(! is.na(xi))
if(! length(j)) stop('variable ', v, ' has no non-missing values')
w <- vector('list', nnai)
## Get data frame of donors
d <- z[- nai, , drop=FALSE]
countqual <- integer(nnai)
jj <- 0
conv <- constraint[[v]]
for(l in nai) {
jj <- jj + 1
r <- z[l, , drop=FALSE] # single original data row
qual <- eval(conv) # must evaluate to have # rows = # non-NA on xi
if(! is.logical(qual) || (length(qual) != nrow(d)))
stop('result of constraint expression for ', v,
' is not a logical vector of length equal to number of ',
' rows in donor data frame d')
if(any(is.na(qual)))
stop('result of constraint expression for ', v, ' contains an NA')
countqual[jj] <- sum(qual)
if(! any(qual)) {
cat('\nFor the following observation with a missing target variable value\n\n')
print(r)
stop('No observations meet the constraint for variable ', v,
' with constraint ', conv)
}
## w elements are indexed 1, 2, 3, ... for 1st, 2nd, 3rd, ...
## row where target xi was NA
## Vector value of a w element is indexed to row numbers of
## dataset after removing ALL rows with NA in xi
w[[jj]] <- which(qual)
}
cons[[v]] <- w
Countqual[[v]] <- countqual
}
}
z <- NULL
wna <- (1:p)[nna > 0]
if(length(wna) < 2 && missing(burnin)) burnin <- 0
nu <- apply(xf, 2, function(z)length(unique(z)))
## xf = original data matrix (categorical var -> integer codes)
## with current imputations
rsq <- double(length(wna));
names(rsq) <- nam[wna]
resampacc <- list()
if(curtail) xrange <- apply(xf, 2, range)
fits <- NULL
## Compute vector of subscripts of the first argument corresponding
## to closest match of values of second argument, for PMM
## Both arguments represent predicted transformed values
## Returned vector has length = length(precip)
nearest <-
switch(match,
kclosest = function(x, y) whichClosek (x, y, k=kclosest),
closest = function(x, y) whichClosest(x, y),
weighted = function(x, y) whichClosePW(x, y, f=fweighted) )
findclose <- function(pdonor, precip, v) {
if(! length(constraint) || v %nin% names(constraint))
return(nearest(pdonor, precip))
## When constraint is to be applied, different target observations
## may have different donor observations taken out of consideration
## due to constraints, so we need to find closest matches
## individually by target (missing) observations
## Non-qualifying donor observations have 1000 replacing their
## predicted values so that they will not be found as nearest
## matches
co <- cons[[v]]
nrecipients <- length(precip)
found <- integer(nrecipients)
## Elements of co are indexed by 1, 2, 3, ... for 1st, 2nd, 3rd, ...
## row in which target variable was NA
## Values of an element of co are row numbers after subsetting
## original dataset to non-NA target variable values
for(i in 1 : nrecipients) {
qualifying <- co[[i]]
pd <- pdonor
pd[- qualifying] <- 1000.
found[i] <- nearest(pd, precip[i])
}
if(any(pdonor[found] == 1000.)) {
z <- list(pdonor=pdonor, precip=precip, v=v, qualifying=qualifying)
tf <- tempfile()
saveRDS(z, file=tf)
stop('program logic error 5 for constrained variable ', v,
' data saved in file ', tf)
}
found
}
for(iter in 1:(burnin + n.impute)) {
if(pr) cat('Iteration',iter,'\r')
## wna = indexes of variables with > 0 NAs
for(i in wna) { ## i=target variable (recipient)
nai <- na[[i]] ## subscripts of NAs on xf[,i]
j <- (1:n)[-nai] ## subscripts of non-NAs on xf[,i]
npr <- length(j)
ytype <- if(tlinear && vtype[i] == 's')'l' else vtype[i]
if(iter == (burnin + n.impute) && length(nk) > 1) {
rn <- c('Bootstrap bias-corrected R^2',
'10-fold cross-validated R^2',
'Bootstrap bias-corrected mean |error|',
'10-fold cross-validated mean |error|',
'Bootstrap bias-corrected median |error|',
'10-fold cross-validated median |error|')
racc <- matrix(NA, nrow=6, ncol=length(nk),
dimnames=list(rn, paste('nk=', nk, sep='')))
jj <- 0
for(k in nk) {
jj <- jj + 1
f <- areg(xf[,-i,drop=FALSE], xf[,i],
xtype=vtype[-i], ytype=ytype,
nk=k, na.rm=FALSE,
tolerance=tolerance, B=B, crossval=10)
w <- c(f$r2boot, f$rsquaredcv, f$madboot, f$madcv,
f$medboot, f$medcv)
racc[,jj] <- w
}
resampacc[[nam[i]]] <- racc
}
if(lgroup) {
## insure orig. no. obs from each level of group
s <- rep(NA, npr)
for(ji in 1:ngroup) {
gi <- (group.inds[[i]])[[ji]]
s[gi] <- sample(gi, length(gi), replace=TRUE)
}
s <- s[! is.na(s)]
}
else { ## sample of non-NAs
if(type == 'normpmm') s <- j
else {
# Sample subscripts of all non-NA xi
s <- sample(j, npr, replace=TRUE)
if(boot.method == 'approximate bayesian') {
sorig <- s
s <- sample(s, replace=TRUE)
}
}
}
nami <- nam[i]
nm <- c(nami, nam[-i])
if(type != 'normpmm') {
xch <- which(vtype == 'c')
if(length(xch)) {
nus <- apply(xf[s, xch, drop=FALSE], 2,
function(z)length(unique(z)))
xtf <- nus < nu[xch]
if(any(xtf))
stop(paste('a bootstrap resample had too few unique values of the following variables:',paste(nam[xch[xtf]],collapse=','),sep='\n'))
}
}
X <- xf[,-i,drop=FALSE] # non-target variables (donors)
## If there is only one variable that has any NAs, fits on
## non-bootstrapped samples will not vary across multiple imputations.
## Otherwise, fits vary because across the multiple imputations,
## predictors are updated from previous spells as target variables
if(type == 'normpmm' && length(wna) < 2) {
if(iter == 1)
fits[[i]] <- f <-
areg(X[s,], xf[s,i], xtype=vtype[-i], ytype=ytype,
nk=min(nk), na.rm=FALSE, tolerance=tolerance)
else f <- fits[[i]]
}
else f <- areg(X[s,], xf[s,i], xtype=vtype[-i], ytype=ytype,
nk=min(nk), na.rm=FALSE, tolerance=tolerance)
xdf <- f$xdf
dof[nam[-i]] <- xdf
dof[nami] <- f$ydf
if(plotTrans) plot(f)
rsq[nami] <- f$rsquared
## residuals off of transformed predicted values
res <- f$residuals
pti <- predict(f, X) # predicted transformed xf[,i], all observations
## if type is normpmm only those elements corresponding to
## complete cases are used
if(type == 'normpmm') {
xpxi <- f$xpxi
if(! length(xpxi)) stop('type="normpmm" cannot be used when any variable needing imputation is categorical or nonlinear')
## See mice package .norm.draw function
px <- sum(xdf)
sigma.star <- sqrt(sum(res^2)/rchisq(1, length(res) - px))
beta.star <- f$xcoefficients + t(chol(xpxi)) %*% rnorm(1 + px) *
sigma.star
pti[nai] <- predict(f, X[nai,,drop=FALSE], type='x') %*% beta.star
} else if(type == 'pmm') {
if(pmmtype %in% c(1,3)) {
## Match predicted complete cases using non-bootstrap
## beta with incomplete cases using bootstrap beta
## For pmmtype=3 use bootstrap vs. sample w/replacement bootstrap
ss <- if(pmmtype == 1) j else sorig
g <- areg(X[ss,], xf[ss,i], xtype=vtype[-i], ytype=ytype,
nk=min(nk), na.rm=FALSE, tolerance=tolerance)
## This would not need to be run fresh at each mult. imp.
## iteration if only one variable were ever NA
pti[j] <- predict(g, X[j,])
}
}
if(type != 'regression') {
if(ytype == 'l') pti <- (pti - mean(pti)) / sqrt(var(pti))
## Jitter predicted transformed values for non-NAs to randomly
## break ties in matching with predictions for NAs in xf[,i]
## Because of normalization used by fitter, pti usually ranges
## from about -4 to 4
if(match == 'closest') pti[j] <- pti[j] + runif(npr,-.0001,.0001)
whichclose <- j[findclose(pti[j], pti[nai], nami)]
## j = non-NA rows nai = NA rows on xi
impi <- xf[whichclose, i]
}
else { ## type='regression'
## predicted transformed target var + random sample of res,
## for NAs
r <- sample(res, length(nai),
replace=length(nai) > length(res))
ptir <- pti[nai] + r
## predicted random draws on untransformed scale
impi <- f$yinv(ptir, what='sample', coef=f$ycoefficients)
if(curtail) impi <- pmin(pmax(impi, xrange[1,i]), xrange[2,i])
}
xf[nai,i] <- impi
if(iter > burnin) imp[[nam[i]]][,iter-burnin] <- impi
}
}
if(pr)
cat('\n')
if(! x) xf <- NULL
structure(list(call=acall, formula=formula,
match=match, fweighted=fweighted, pmmtype=pmmtype,
constraint=constraint, countqual=Countqual,
n=n, p=p, na=na, nna=nna,
type=vtype, tlinear=tlinear, nk=min(nk),
cat.levels=cat.levels, df=dof,
n.impute=n.impute, imputed=imp, x=xf, rsq=rsq,
resampacc=resampacc),
class='aregImpute')
}
print.aregImpute <- function(x, digits=3, ...) {
cat("\nMultiple Imputation using Bootstrap and PMM\n\n")
dput(x$call)
cat("\n")
cat('n:',x$n,'\tp:',x$p,
'\tImputations:',x$n.impute,' \tnk:',x$nk,'\n')
cat('\nNumber of NAs:\n'); print(x$nna); cat('\n')
info <- data.frame(type=x$type, d.f.=x$df,
row.names=names(x$type))
print(info)
if(x$tlinear)
cat('\nTransformation of Target Variables Forced to be Linear\n')
cat('\nR-squares for Predicting Non-Missing Values for Each Variable\nUsing Last Imputations of Predictors\n')
print(round(x$rsq, digits))
racc <- x$resampacc
if(length(racc)) {
cat('\nResampling results for determining the complexity of imputation models\n\n')
for(i in 1:length(racc)) {
cat('Variable being imputed:', names(racc)[i], '\n')
print(racc[[i]], digits=digits)
cat('\n')
}
cat('\n')
}
cq <- x$countqual
if(length(cq)) {
cat('\nFrequency distributions of number of potential donor observations\nmeeting constraints\n\n')
for(v in names(cq)) {
cat(v, '\n')
print(table(cq[[v]]))
cat('\n')
}
}
invisible()
}
plot.aregImpute <- function(x, nclass=NULL, type=c('ecdf','hist'),
datadensity=c("hist","none","rug","density"),
diagnostics=FALSE, maxn=10, ...) {
type <- match.arg(type)
datadensity <- match.arg(datadensity)
i <- x$imputed
catg <- x$categorical
lev <- x$cat.levels
n.impute <- x$n.impute
for(n in names(i)) {
xi <- i[[n]]
if(! length(xi))
next
if(diagnostics) {
r <- range(xi)
for(j in 1:min(maxn,nrow(xi))) {
plot(1:n.impute, xi[j,], ylim=r, xlab='Imputation',
ylab=paste("Imputations for Obs.",j,"of",n))
}
}
ix <- as.vector(i[[n]])
lab <- paste('Imputed',n)
if(n %in% catg) {
tab <- table(ix)
dotchart3(tab, lev[[n]], auxdata=tab, xlab='Frequency',
ylab=lab)
}
else {
if(type == 'ecdf')
Ecdf(ix, xlab=lab, datadensity=datadensity, subtitles=FALSE)
else {
if(length(nclass))
hist(ix, xlab=n, nclass=nclass, main='')
else
hist(ix, xlab=lab, main='')
scat1d(ix)
}
}
}
invisible()
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.