knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
The package can be loaded with the command:
library("ILSE")
First, we generate a small simulated data set.
set.seed(1)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho = 0.5))
beta0 <- rep(c(1, -1), times = 3)
Y <- -2 + X %*% beta0 + rnorm(n, sd = 1)
Then we fit the linear regression model to the data without missing values using ILSE.
ilse1 <- ilse(Y ~ X)
print(ilse1)
We can also provide a data.frame object as the input for ILSE.
dat <- data.frame(Y = Y, X = X)
ilse1 <- ilse(Y ~ ., data = dat)
print(ilse1)
Coef(ilse1)  # access the coefficients
Fitted.values(ilse1)[1:5]
Residuals(ilse1)[1:5]
Check which variables are significant using the bootstrap.
s1 <- summary(ilse1)
s1
First, we randomly remove some entries in X.
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n * p), n * p * mis_rate)
Xmis <- X
Xmis[na_id] <- NA
ncomp <- sum(complete.cases(Xmis))
message("Number of complete cases is ", ncomp, '\n')
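As a quick sanity check (not part of the original workflow), we can also inspect the per-column missing rates of the design matrix:
# optional check: per-column missing rate of Xmis
colMeans(is.na(Xmis))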
Second, we use lm to fit the linear regression model based only on the complete cases, i.e., complete-case (CC) analysis. It cannot detect any significant covariates.
lm1 <- lm(Y ~ Xmis)
s_cc <- summary.lm(lm1)
s_cc
Third, we use ILSE to fit the linear regression model based on all of the data. We can fit a model without an intercept by setting the formula as follows:
ilse2 <- ilse(Y ~ Xmis + 0, data = NULL, verbose = TRUE)
print(ilse2)
Then we fit a linear regression model with an intercept using the following command:
ilse2 <- ilse(Y ~ Xmis, data = NULL, verbose = TRUE)
print(ilse2)
Fourth, the bootstrap is applied to evaluate the standard error and p-value of each coefficient estimated by ILSE. We observe four significant coefficients.
s2 <- summary(ilse2, Nbt = 20)
s2
The ILSE package also provides full information maximum likelihood (FIML) for linear regression via fimlreg. We show how to use it to handle the missing data above.
fimllm <- fimlreg(Y ~ Xmis)
print(fimllm)
We also use the bootstrap to evaluate the standard error and p-value of each coefficient estimated by FIML. We observe only one significant coefficient.
s_fiml <- summary(fimllm, Nbt = 20)
s_fiml
We visualize the p-values of each method, where the red line marks 0.05 on the y-axis and the blue line marks 0.1.
pMat <- cbind(CC = s_cc$coefficients[, 4], ILSE = s2[, 4], FIML = s_fiml[, 4])
library(ggplot2)
df1 <- data.frame(
  Pval = as.vector(pMat[-1, ]),
  Method = factor(rep(c("CC", "ILSE", "FIML"), each = p)),
  covariate = factor(rep(paste0("X", 1:p), times = 3))
)
ggplot(data = df1, aes(x = covariate, y = Pval, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5) +
  geom_hline(yintercept = 0.05, color = "red") +
  geom_hline(yintercept = 0.1, color = "blue")
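For a quick numeric summary of the same comparison, we can count how many covariates each method flags at the two thresholds. This is a minimal sketch that reuses the pMat matrix built above:
# count significant covariates per method at the 0.05 and 0.1 levels
colSums(pMat[-1, ] < 0.05)
colSums(pMat[-1, ] < 0.1)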
Based on the above data, we add a new column, a categorical variable (Sex), to the data.frame. This variable is not associated with the outcome variable.
dat <- data.frame(Y = Y, X = Xmis)
dat$Sex <- factor(rep(c("male", "female"), times = n / 2))
dat$Sex[sample(1:n, n * mis_rate)] <- NA
ilse1 <- ilse(Y ~ ., data = dat, verbose = TRUE)
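As in the earlier examples, print() can be used to inspect the fitted coefficients, including the estimated effect of the dummy-coded Sex variable:
print(ilse1)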
We can change the number of bootstrap replications used to calculate the standard errors, Z values, and p-values of the coefficients.
s3 <- summary(ilse1, Nbt = 40)
s3
First, we generate data from a linear regression model with three important variables (1, 3, 5) and three unimportant variables (2, 4, 6).
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho = 0.5))
beta0 <- rep(c(1, 0), times = 3)
Y <- -2 + X %*% beta0 + rnorm(n, sd = 1)
message("The true regression coefficients are: ", paste0(beta0, ' '))
We randomly introduce missing values into the design matrix.
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n * p), n * p * mis_rate)
Xmis <- X
Xmis[na_id] <- NA
Next, we use ILSE to fit the model.
dat <- data.frame(Y = Y, X = Xmis)
ilse1 <- ilse(Y ~ ., data = dat, verbose = TRUE)
s3 <- summary(ilse1)
s3
Then we fit the model using lm (CC analysis) and FIML, and finally compare ILSE with these two methods.
lm1 <- lm(Y ~ Xmis)
s_cc <- summary.lm(lm1)
fimllm <- fimlreg(Y ~ Xmis)
s_fiml <- summary(fimllm)
We visualize the p-values of each method, where the red line marks 0.05 on the y-axis. At the 0.05 significance level, both ILSE and FIML identify all of the important variables (X1, X3 and X5), while the CC method identifies only X1 and X5.
library(ggthemes)
pMat <- cbind(CC = s_cc$coefficients[, 4], ILSE = s3[, 4], FIML = s_fiml[, 4])
df1 <- data.frame(
  Pval = as.vector(pMat[-1, ]),
  Method = factor(rep(c("CC", "ILSE", "FIML"), each = p)),
  covariate = factor(rep(paste0("X", 1:p), times = 3))
)
ggplot(data = df1, aes(x = covariate, y = Pval, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5) +
  geom_hline(yintercept = 0.05, color = "red") +
  scale_fill_economist()
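The same conclusion can be read off numerically. A minimal check, again reusing the pMat matrix built above, lists for each method the covariates with p-values below 0.05:
# covariates flagged at the 0.05 level by each method
apply(pMat[-1, ] < 0.05, 2, function(sig) paste0("X", which(sig)))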
Here, we generate data with 80% missing values, then use ILSE to fit the model.
# generate data from a linear model
set.seed(10)
n <- 100
p <- 6
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho = 0.5))
beta0 <- rep(c(1, -1), times = 3)
Y <- -2 + X %*% beta0 + rnorm(n, sd = 1)
# generate missing values
mis_rate <- 0.8
set.seed(1)
na_id <- sample(1:(n * p), n * p * mis_rate)
Xmis <- X
Xmis[na_id] <- NA
# retain 4 complete cases
Xmis[1:4, ] <- X[1:4, ]
sum(complete.cases(Xmis))
The CC method fails.
lm1 <- lm(Y ~ Xmis)
summary.lm(lm1)
However, ILSE still works.
ilse2 <- ilse(Y ~ Xmis, verbose = TRUE)
s2 <- summary(ilse2)
s2
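To see how close the estimates are to the truth despite 80% missingness, we can compare them with the true coefficients. This sketch assumes Coef() returns the intercept followed by the slopes, as in the complete-data example above:
# compare ILSE estimates with the true coefficients (intercept -2, slopes beta0)
cbind(truth = c(-2, beta0), ILSE = Coef(ilse2))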
We generate a large-scale data set with n = 1000 and p = 50.
n <- 1000
p <- 50
X <- MASS::mvrnorm(n, rep(0, p), cor.mat(p, rho = 0.5))
beta0 <- rep(c(1, -1), length = p)
Y <- -2 + X %*% beta0 + rnorm(n, sd = 1)
mis_rate <- 0.3
set.seed(1)
na_id <- sample(1:(n * p), n * p * mis_rate)
Xmis <- X
Xmis[na_id] <- NA
Xmis[1:10, ] <- X[1:10, ]
lm1 <- lm(Y ~ Xmis)
lm1
system.time(ilse2 <- ilse(Y ~ Xmis, data = NULL, verbose = TRUE))
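For reference, fimlreg can be timed on the same large data set in the same way; actual runtimes will depend on the machine:
# a sketch: time FIML on the same large data set for comparison
system.time(fiml_big <- fimlreg(Y ~ Xmis))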
sessionInfo()