We introduce a simple regression problem and compare the performance of mean imputation, CoCoLasso, and HMLasso. Running this vignette takes several minutes because of our simple implementation. For the details of HMLasso, please refer to the following paper.
Takada, M., Fujisawa, H., & Nishikawa, T. (2019). "HMLasso: Lasso with High Missing Rate." IJCAI.
NOTE: Because of the simple implementation, fewer than 200 variables are recommended for computational efficiency.
First, we load libraries.
knitr::opts_chunk$set(echo = TRUE)
library(hmlasso)
library(MASS)
Next, we define a simple regression problem.
# setting
n <- 2000 # number of samples
p <- 20   # number of dimensions
r <- 0.5  # correlation level
eps <- 1  # noise level

# construct covariance matrix
mu <- rep(0, length=p)
Sigma <- matrix(r, nrow=p, ncol=p) # correlation between distinct variables is r
diag(Sigma) <- 1                   # variance of each variable is 1

# generate training data
set.seed(0)
X <- mvrnorm(n, mu, Sigma)
b <- matrix(0, nrow=p, ncol=1)
b[1:10, ] <- (-1)^(1:10) * (1:10)
y <- X %*% b + eps * rnorm(n)
colnames(X) <- paste0("V", 1:p)
rownames(b) <- paste0("V", 1:p)

# introduce missing data
X_incompl <- X
mrc <- runif(p, min=0.1, max=0.9) # missing rate of each column
for (j in 1:p) {
  mr <- sample(1:nrow(X), round(nrow(X) * mrc[j]))
  X_incompl[mr, j] <- NA
}
# visualize the missing pattern (black: NA, white: observed)
image(t(apply(apply(X_incompl, c(1, 2), function(v) { ifelse(is.na(v), 1, 0) }), 2, rev)),
      col=grey(11:0/12), main="missing pattern")

# generate test data
n_ts <- 10000
X_ts <- mvrnorm(n_ts, mu, Sigma)
y_ts <- X_ts %*% b + eps * rnorm(n_ts)
colnames(X_ts) <- paste0("V", 1:p)

# set up cross-validation folds
nfolds <- 5
foldid1 <- sample(rep(1:nfolds, (n %/% nfolds)), replace=FALSE)
foldid2 <- sample(1:nfolds, (n %% nfolds), replace=FALSE)
foldid <- c(foldid1, foldid2)
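Before fitting, it helps to look at the quantity the covariance-based methods below work with: the covariance matrix estimated from pairwise complete observations. This estimate is generally not positive semidefinite, which is why cv.hmlasso applies a PSD projection (the positify option). A minimal illustration using base R's cov, for intuition only; cv.hmlasso computes its own estimate internally.

# pairwise covariance estimate from the incomplete data (intuition only)
S_pair <- cov(X_incompl, use="pairwise.complete.obs")
min(eigen(S_pair, symmetric=TRUE, only.values=TRUE)$values) # may be negative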
We apply the mean imputation method to the problem.
# MeanImp
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     impute_method="mean", direct_prediction=FALSE, positify="mean")
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
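For intuition, mean imputation simply replaces each missing entry with the mean of the observed entries in the same column before fitting the ordinary Lasso. A minimal sketch of that step; impute_method="mean" handles it internally.

# what mean imputation amounts to (handled internally by impute_method="mean")
X_meanimp <- apply(X_incompl, 2, function(v) { v[is.na(v)] <- mean(v, na.rm=TRUE); v })
anyNA(X_meanimp) # FALSE: every missing entry is now the column mean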
We apply CoCoLasso to the problem.
# CoCoLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     direct_prediction=TRUE, positify="admm_max", weight_power = 0)
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
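Unlike mean imputation, CoCoLasso fits the Lasso to the pairwise covariance estimate after projecting it onto the positive semidefinite cone; here positify="admm_max" requests an ADMM-based projection and weight_power = 0 gives all entries equal weight. For intuition only, the Frobenius-norm projection has a simple closed form via eigenvalue clipping; the package's ADMM routine solves a different, max-norm-style variant.

# Frobenius-norm PSD projection by eigenvalue clipping (intuition only;
# positify="admm_max" in cv.hmlasso uses an ADMM-based projection instead)
eig <- eigen(S_pair, symmetric=TRUE)
S_psd <- eig$vectors %*% diag(pmax(eig$values, 0)) %*% t(eig$vectors)
min(eigen(S_psd, symmetric=TRUE, only.values=TRUE)$values) # >= 0 up to rounding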
We apply HMLasso to the problem.
# HMLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     direct_prediction=TRUE, positify="admm_frob", weight_power = 1)
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
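HMLasso differs from CoCoLasso in two ways: the PSD projection is taken under a weighted Frobenius norm (positify="admm_frob"), and the weights are the pairwise observation ratios raised to weight_power = 1, so covariance entries backed by more jointly observed samples are trusted more. For intuition, the observation ratio matrix can be computed directly.

# pairwise observation ratio matrix (the weights HMLasso uses with weight_power = 1)
R_obs <- crossprod(!is.na(X_incompl)) / n # fraction of rows where both columns are observed
range(R_obs)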