We introduce a simple regression problem and compare the performance of mean imputation, CoCoLasso, and HMLasso. Running this vignette takes several minutes because of our simple implementation. For the details of HMLasso, please refer to the following paper.
Takada, M., Fujisawa, H., & Nishikawa, T. (2019). "HMLasso: Lasso with High Missing Rate." IJCAI.
NOTE: Because of the simple implementation, fewer than 200 variables are recommended for computational efficiency.
First, we load libraries.
knitr::opts_chunk$set(echo = TRUE)
library(hmlasso)
library(MASS)
Next, we define a simple regression problem.
# setting
n <- 2000 # number of samples
p <- 20   # number of dimensions
r <- 0.5  # correlation level
eps <- 1  # noise level

# construct covariance matrix
mu <- rep(0, length=p)
Sigma <- matrix(r, nrow=p, ncol=p) # correlation between distinct variables is r
diag(Sigma) <- 1                   # variance of each variable is 1

# generate training data
set.seed(0)
X <- mvrnorm(n, mu, Sigma)
b <- matrix(0, nrow=p, ncol=1)
b[1:10, ] <- (-1)^(1:10) * (1:10)
y <- X %*% b + eps * rnorm(n)
colnames(X) <- paste0("V", 1:p)
rownames(b) <- paste0("V", 1:p)

# introduce missing data
X_incompl <- X
mrc <- runif(p, min=0.1, max=0.9) # missing rate of each column
for (j in 1:p) {
  mr <- sample(1:nrow(X), round(nrow(X) * mrc[j]))
  X_incompl[mr, j] <- NA
}
# visualize the missing pattern (black: NA, white: observed)
image(t(apply(apply(X_incompl, c(1, 2), function(v) { ifelse(is.na(v), 1, 0) }), 2, rev)),
      col=grey(11:0/12), main="missing pattern")

# generate test data
n_ts <- 10000
X_ts <- mvrnorm(n_ts, mu, Sigma)
y_ts <- X_ts %*% b + eps * rnorm(n_ts)
colnames(X_ts) <- paste0("V", 1:p)

# set up cross-validation folds
nfolds <- 5
foldid1 <- sample(rep(1:nfolds, (n %/% nfolds)), replace=FALSE)
foldid2 <- sample(1:nfolds, (n %% nfolds), replace=FALSE)
foldid <- c(foldid1, foldid2)
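Before fitting, it helps to look at the quantity the covariance-based methods below work with: the covariance matrix estimated from pairwise complete observations. This estimate is generally not positive semidefinite, which is why cv.hmlasso applies a PSD projection (the positify option). A minimal illustration using base R's cov, for intuition only; cv.hmlasso computes its own estimate internally.

# pairwise covariance estimate from the incomplete data (intuition only)
S_pair <- cov(X_incompl, use="pairwise.complete.obs")
min(eigen(S_pair, symmetric=TRUE, only.values=TRUE)$values) # may be negative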
We apply the mean imputation method to the problem.
# MeanImp
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     impute_method="mean", direct_prediction=FALSE, positify="mean")
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
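For intuition, mean imputation simply replaces each missing entry with the mean of the observed entries in the same column before fitting the ordinary Lasso. A minimal sketch of that step; impute_method="mean" handles it internally.

# what mean imputation amounts to (handled internally by impute_method="mean")
X_meanimp <- apply(X_incompl, 2, function(v) { v[is.na(v)] <- mean(v, na.rm=TRUE); v })
anyNA(X_meanimp) # FALSE: every missing entry is now the column mean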
We apply CoCoLasso to the problem.
# CoCoLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     direct_prediction=TRUE, positify="admm_max", weight_power = 0)
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
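Unlike mean imputation, CoCoLasso fits the Lasso to the pairwise covariance estimate after projecting it onto the positive semidefinite cone; here positify="admm_max" requests an ADMM-based projection and weight_power = 0 gives all entries equal weight. For intuition only, the Frobenius-norm projection has a simple closed form via eigenvalue clipping; the package's ADMM routine solves a different, max-norm-style variant.

# Frobenius-norm PSD projection by eigenvalue clipping (intuition only;
# positify="admm_max" in cv.hmlasso uses an ADMM-based projection instead)
eig <- eigen(S_pair, symmetric=TRUE)
S_psd <- eig$vectors %*% diag(pmax(eig$values, 0)) %*% t(eig$vectors)
min(eigen(S_psd, symmetric=TRUE, only.values=TRUE)$values) # >= 0 up to rounding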
We apply HMLasso to the problem.
# HMLasso
cv_fit <- cv.hmlasso(X_incompl, y, nlambda=50, lambda.min.ratio=1e-1, foldid=foldid,
                     direct_prediction=TRUE, positify="admm_frob", weight_power = 1)
plot(cv_fit)     # plot lambda-MSE
plot(cv_fit$fit) # plot solution path
cv_fit$cve[cv_fit$lambda.min.index]        # minimum cross-validation error
cv_fit$fit$beta[, cv_fit$lambda.min.index] # estimated coefficients
pred <- predict(cv_fit$fit, X_ts)          # predict
sqrt(sum((cv_fit$fit$beta[, cv_fit$lambda.min.index] - as.vector(b))^2)) # estimation error
sqrt(sum((pred[, cv_fit$lambda.min.index] - y_ts)^2) / n_ts)             # prediction error
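HMLasso differs from CoCoLasso in two ways: the PSD projection is taken under a weighted Frobenius norm (positify="admm_frob"), and the weights are the pairwise observation ratios raised to weight_power = 1, so covariance entries backed by more jointly observed samples are trusted more. For intuition, the observation ratio matrix can be computed directly.

# pairwise observation ratio matrix (the weights HMLasso uses with weight_power = 1)
R_obs <- crossprod(!is.na(X_incompl)) / n # fraction of rows where both columns are observed
range(R_obs)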