hdme.R
In hdme: High-Dimensional Regression with Measurement Error

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=6
)

## -----------------------------------------------------------------------------
# Load the hdme package
library(hdme)

## -----------------------------------------------------------------------------
create_example_data <- function(n, p, s = 5, sdX = 1, sdU = 0.5, 
                                sdEpsilon = 0.1, family = "gaussian") {
  # Independent true covariates with mean zero and standard deviation sdX
  X <- matrix(rnorm(n * p, sd = sdX), nrow = n, ncol = p)
  # if Gaussian, response has standard deviation sdEpsilon, and zero intercept
  # if binomial, response is binomial with mean (1 + exp(-X %*% beta))^(-1)
  beta <- c(-2, -1, 0.5, 1, 2, rep(0, p - s))
  
  if(family == "gaussian") {
    # True coefficient vector has s non-zero elements and p-s zero elements
    y <- X %*% beta + rnorm(n, sd = sdEpsilon)  
  } else if (family == "binomial") {
    # Need an amplification in the binomial case
    beta <- beta * 3
    y <- rbinom(n, size = 1, prob = (1 + exp(-X %*% beta))**(-1))
  }
  
  # The measurements W have mean X and standard deviation sdU. 
  # We assume uncorrelated measurement errors
  W <- X + matrix(rnorm(n * p, sd = sdU), nrow = n, ncol = p)
  
  return(list(X = X, W = W, y = y, beta = beta, sigmaUU = diag(p) * sdU))  
}

## ---- message=FALSE-----------------------------------------------------------
n <- 100
p <- 500
set.seed(1000)
ll <- create_example_data(n, p)

## ---- message=FALSE-----------------------------------------------------------
library(glmnet)
library(dplyr)
# Lasso with cross-validation on data without measurement error
fit1 <- cv.glmnet(ll$X, ll$y)
# Lasso with cross-validation on data with measurement error
fit2 <- cv.glmnet(ll$W, ll$y)
# Create a data frame with results ([-1] because we drop the intercept)
lassoEstimates <- tibble(
  index = rep(1:p, times = 3),
  beta = c(ll$beta, as.numeric(coef(fit1)[-1]), coef(fit2)[-1]),
  label = c(rep("True values", p), rep("No measurement error", p), rep("Measurement error", p))
  )


## -----------------------------------------------------------------------------
library(ggplot2)
theme_set(theme_bw())
theme_update(
  panel.grid = element_blank()
)
ggplot(lassoEstimates, aes(x = index, y = beta, color = label)) +
  geom_point() +
  xlab("p") +
  scale_color_brewer(type = "qualitative", palette = "Paired") +
  theme(legend.title=element_blank()) + 
  ggtitle("Measurement error leading to false positives")

## ---- message=FALSE, warning=FALSE--------------------------------------------
library(tidyr) 
estimatesOfNonzero <- lassoEstimates %>% 
  spread(key = label, value = beta) %>% 
  filter(`True values` != 0) %>% 
  gather(key = label, value = beta, -index)

ggplot(estimatesOfNonzero, aes(x = index, y = beta, color = label)) +
  geom_point() +
  xlab("p") +
  scale_color_brewer(type = "qualitative", palette = "Paired") +
  theme(legend.title=element_blank()) + 
  ggtitle("Measurement error leading to attenuation")

## -----------------------------------------------------------------------------
# Number of samples
n <- 1000
# Number of covariates
p <- 50
# Create example data
ll <- create_example_data(n, p, family = "binomial")

## -----------------------------------------------------------------------------
args(gds)

## -----------------------------------------------------------------------------
# Fit the Generalized Dantzig Selector
gds_estimate <- gds(ll$X, ll$y, family = "binomial")

## -----------------------------------------------------------------------------
class(gds_estimate)

## -----------------------------------------------------------------------------
str(gds_estimate)

## -----------------------------------------------------------------------------
set.seed(1000)
# Generate example data
ll <- create_example_data(n, p)
# Fit the corrected lasso
corrected_fit <- corrected_lasso(W = ll$W, y = ll$y, sigmaUU = ll$sigmaUU)

## -----------------------------------------------------------------------------
# Class of the object
class(corrected_fit)
# The coef() method prints the number of nonzero estimates as a function of the radius
coef(corrected_fit)

## -----------------------------------------------------------------------------
args(corrected_lasso)

## -----------------------------------------------------------------------------
plot(corrected_fit)

## -----------------------------------------------------------------------------
plot(corrected_fit, type = "path")

## -----------------------------------------------------------------------------
set.seed(323)
n <- 100
p <- 50
ll <- create_example_data(n, p, sdU = 0.2, family = "binomial")

## -----------------------------------------------------------------------------
corrected_fit <- corrected_lasso(ll$W, ll$y, ll$sigmaUU, family = "binomial")

## -----------------------------------------------------------------------------
plot(corrected_fit)

## -----------------------------------------------------------------------------
plot(corrected_fit, type = "path")

## -----------------------------------------------------------------------------
set.seed(1000)
# Generate example data
ll <- create_example_data(n, p)
# Run lasso with cross-validation
cv_corrected_fit <- cv_corrected_lasso(W = ll$W, y = ll$y, sigmaUU = ll$sigmaUU)

## -----------------------------------------------------------------------------
class(cv_corrected_fit)

## -----------------------------------------------------------------------------
str(cv_corrected_fit)

## -----------------------------------------------------------------------------
plot(cv_corrected_fit)

## -----------------------------------------------------------------------------
corrected_fit <- corrected_lasso(ll$W, ll$y, ll$sigmaUU, radii = cv_corrected_fit$radius_1se)

## -----------------------------------------------------------------------------
str(corrected_fit)

## -----------------------------------------------------------------------------
set.seed(1)
# Number of samples
n <- 1000
# Number of covariates
p <- 50
# Generate data
ll <- create_example_data(n, p, sdU = 0.2)

## -----------------------------------------------------------------------------
mus_fit <- mus(ll$W, ll$y)
class(mus_fit)

## -----------------------------------------------------------------------------
coef(mus_fit)

## -----------------------------------------------------------------------------
plot(mus_fit)

## -----------------------------------------------------------------------------
mus_fit <- mus(ll$W, ll$y, delta = 0.1)

## -----------------------------------------------------------------------------
plot(mus_fit)

## -----------------------------------------------------------------------------
set.seed(323)
n <- 100
p <- 50
ll <- create_example_data(n, p, sdU = 0.2, family = "binomial")
gmus_fit <- gmus(ll$W, ll$y, family = "binomial")

## -----------------------------------------------------------------------------
class(gmus_fit)
str(gmus_fit)

## -----------------------------------------------------------------------------
plot(gmus_fit)

## -----------------------------------------------------------------------------
gmus_fit <- gmus(ll$W, ll$y, delta = 0.1, family = "binomial")

## -----------------------------------------------------------------------------
plot(gmus_fit)

## -----------------------------------------------------------------------------
set.seed(323)
n <- 100
p <- 50
ll <- create_example_data(n, p, sdU = 0.2, family = "binomial")
gmu_lasso_fit <- gmu_lasso(ll$W, ll$y, family = "binomial")

## -----------------------------------------------------------------------------
class(gmu_lasso_fit)
str(gmu_lasso_fit)

## -----------------------------------------------------------------------------
plot(gmu_lasso_fit)