Missing Data {#chapter-missing}

require(Matching)
require(mice)
data(lalonde, package='Matching')
Tr <- lalonde$treat
Y <- lalonde$re78
X <- lalonde[,c('age','educ','black','hisp','married','nodegr','re74','re75')]
lalonde.glm <- glm(treat ~ ., family=binomial, data=cbind(treat=Tr, X))
summary(lalonde.glm)

Create a copy of the covariates to simulate missing at random (mar) and not missing at random (nmar).

lalonde.mar <- X
lalonde.nmar <- X

missing.rate <- .2 # What percent of rows will have missing data
missing.cols <- c('nodegr', 're75') # The columns we will add missing values to

# Vectors indiciating which rows are treatment and control.
treat.rows <- which(lalonde$treat == 1)
control.rows <- which(lalonde$treat == 0)

Add missingness to the existing data. For the not missing at random data treatment units will have twice as many missing values as the control group.

set.seed(2112)
for(i in missing.cols) {
    lalonde.mar[sample(nrow(lalonde), nrow(lalonde) * missing.rate), i] <- NA
    lalonde.nmar[sample(treat.rows, length(treat.rows) * missing.rate * 2), i] <- NA
    lalonde.nmar[sample(control.rows, length(control.rows) * missing.rate), i] <- NA
}

The proportion of missing values for the first covariate

prop.table(table(is.na(lalonde.mar[,missing.cols[1]]), lalonde$treat, useNA='ifany'))
prop.table(table(is.na(lalonde.nmar[,missing.cols[1]]), lalonde$treat, useNA='ifany'))

Create a shadow matrix. This is a logical vector where each cell is TRUE if the value is missing in the original data frame.

shadow.matrix.mar <- as.data.frame(is.na(lalonde.mar))
shadow.matrix.nmar <- as.data.frame(is.na(lalonde.nmar))

Change the column names to include "_miss" in their name.

names(shadow.matrix.mar) <- names(shadow.matrix.nmar) <- paste0(names(shadow.matrix.mar), '_miss')

Impute the missing values using the mice package

set.seed(2112)
mice.mar <- mice(lalonde.mar, m=1)
mice.nmar <- mice(lalonde.nmar, m=1)

Get the imputed data set.

complete.mar <- complete(mice.mar)
complete.nmar <- complete(mice.nmar)

Estimate the propensity scores using logistic regression.

lalonde.mar.glm <- glm(treat~., data=cbind(treat=Tr, complete.mar, shadow.matrix.mar))
lalonde.nmar.glm <- glm(treat~., data=cbind(treat=Tr, complete.nmar, shadow.matrix.nmar))

We see that the two indicator columns from the shadow matrix are statistically significant predictors suggesting that the data is not missing at random.

summary(lalonde.mar.glm)
summary(lalonde.nmar.glm)


jbryer/psa documentation built on Nov. 17, 2023, 8:21 a.m.