R/data.R
In glmtlp: Generalized Linear Models with Truncated Lasso Penalty

Documented in gen.binomial.data gen.gaussian.data

#' A simulated binomial data set.
#'
#' A data set simulated for illustrating logistic regression models. Generated by 
#'   \code{gen.binomial.data(n = 200, p = 20, seed = 2021)}.
#'
#' @format A list with three elements: design matrix \code{X}, response \code{y}, 
#'   and the true coefficient vector \code{beta}.
#' \describe{
#'   \item{X}{design matrix}
#'   \item{y}{response}
#'   \item{beta}{the true coefficient vector}
#' }
#' 
#' @usage data(bin_data)
#' 
#' @examples
#' data("bin_data")
#' cv.fit <- cv.glmtlp(bin_data$X, bin_data$y, family = "binomial", penalty = "l1")
#' plot(cv.fit)
#' 
"bin_data"

#' A simulated gaussian data set.
#'
#' A data set simulated for illustrating linear regression models. Generated by 
#'   \code{gen.gaussian.data(n = 200, p = 20, seed = 2021)}.
#'
#' @format A list with five elements: design matrix \code{X}, response \code{y}, 
#'   correlation structure of the covariates \code{Sigma}, true beta \code{beta}, 
#'   and the noise level \code{sigma}.
#' \describe{
#'   \item{X}{design matrix}
#'   \item{y}{response}
#'   \item{beta}{true beta values}
#'   \item{sigma}{the noise level}
#' }
#' 
#' @usage data(gau_data)
#' 
#' @examples 
#' data("gau_data")
#' cv.fit <- cv.glmtlp(gau_data$X, gau_data$y, family = "gaussian", penalty = "tlp")
#' plot(cv.fit)
#' 
"gau_data"

#' Simulate a binomial data set
#'
#' @description
#' Simulate a data set with binary response following the logistic regression
#'   model.
#' 
#' @param n Sample size.
#' @param p Number of covariates.
#' @param rho The parameter defining the AR(1) correlation matrix.
#' @param kappa The number of nonzero coefficients.
#' @param beta.type Numeric indicator for choosing the beta type. For 
#'   \code{beta.type = 1}, the true coefficient vector has \code{kappa} components being 1, 
#'   roughly equally distributed between 1 to \code{p}. For \code{beta.type = 2}, 
#'   the first \code{kappa} values are 1, and the rest are 0. For \code{beta.type = 3}, 
#'   the first \code{kappa} values are equally-spaced values from 10 to 0.5, and 
#'   the rest are 0. For \code{beta.type = 4}, the first \code{kappa} values are 
#'   the first \code{kappa} values in c(-10, -6, -2, 2, 6, 10), and the rest are 
#'   0. For \code{beta.type = 5}, the first \code{kappa} values are 1, and the 
#'   rest decay exponentially to 0 with base 0.5.
#' @param seed The seed for reproducibility. Default is 2021.
#' 
#' @return A list containing the simulated data.
#' \item{X}{the covariate matrix, of dimension \code{n} x \code{p}.}
#' \item{y}{the response, of length \code{n}.}
#' \item{beta}{the true coefficients, of length \code{p}.}
#'   
#' @examples 
#' bin_data <- gen.binomial.data(n = 200, p = 20, seed = 2021)
#' head(bin_data$X)
#' head(bin_data$y)
#' head(bin_data$beta)
#' 
#' @importFrom stats rnorm
#' @importFrom stats rbinom
#' @export gen.binomial.data
#' 
gen.binomial.data <- function(n, p, rho = 0, kappa = 5, beta.type = 1, seed = 2021) {
  set.seed(seed)
  X <- matrix(rnorm(n * p), n, p)
  if (rho != 0) {
    for (j in 2:p) {
      X[, j] <- sqrt(1 - rho^2) * X[, j] + rho * X[, j - 1]
    }
  }
  beta <- gen.beta(kappa, p, beta.type)
  mu <- plogis(as.numeric(X %*% beta))
  y <- rbinom(n, 1, mu)
  
  if (p > 1) {
    colnames(X) <- paste("V", seq(p), sep = "") 
  }
  
  list(X = X, y = y, beta = beta)
  
}

#' Simulate a gaussian data set
#'
#' @description
#' Simulate a data set with gaussian response following the linear regression
#'   model.
#' 
#' @param n Sample size.
#' @param p Number of covariates.
#' @param rho The parameter defining the AR(1) correlation matrix.
#' @param kappa The number of nonzero coefficients.
#' @param beta.type Numeric indicator for choosing the beta type. For 
#'   \code{beta.type = 1}, the true coefficient vector has \code{kappa} components being 1, 
#'   roughly equally distributed between 1 to \code{p}. For \code{beta.type = 2}, 
#'   the first \code{kappa} values are 1, and the rest are 0. For \code{beta.type = 3}, 
#'   the first \code{kappa} values are equally-spaced values from 10 to 0.5, and 
#'   the rest are 0. For \code{beta.type = 4}, the first \code{kappa} values are 
#'   the first \code{kappa} values in c(-10, -6, -2, 2, 6, 10), and the rest are 
#'   0. For \code{beta.type = 5}, the first \code{kappa} values are 1, and the 
#'   rest decay exponentially to 0 with base 0.5.
#' @param snr Signal-to-noise ratio. Default is 1.
#' @param seed The seed for reproducibility. Default is 2021.
#' 
#' @return A list containing the simulated data.
#' \item{X}{the covariate matrix, of dimension \code{n} x \code{p}.}
#' \item{y}{the response, of length \code{n}.}
#' \item{beta}{the true coefficients, of length \code{p}.}
#' \item{sigma}{the standard error of the noise.}
#'   
#' @examples 
#' gau_data <- gen.gaussian.data(n = 200, p = 20, seed = 2021)
#' head(gau_data$X)
#' head(gau_data$y)
#' head(gau_data$beta)
#' gau_data$sigma
#' 
#' @importFrom stats rnorm
#' @export gen.gaussian.data
gen.gaussian.data <- function(n, p, rho=0, kappa=5, beta.type=1, snr=1, seed=2021) {
  set.seed(seed)
  X <- matrix(rnorm(n*p), n, p)
  if (rho != 0) {
    for (j in 2:p) {
      X[, j] <- sqrt(1 - rho^2) * X[, j] + rho * X[, j - 1]
    }
  }
  beta <- gen.beta(kappa, p, beta.type)
  vmu <- sum((crossprod(cholesky.ar1.root(rho, p), beta))^2)
  sigma <- sqrt(vmu/snr)
  y <- as.numeric(X %*% beta + rnorm(n) * sigma)
  
  if (p > 1) {
    colnames(X) <- paste("V", seq(p), sep = "") 
  }

  list(X = X, y = y, beta = beta, sigma = sigma)
}

gen.beta <- function(kappa, p, beta.type) {
  kappa <- min(kappa, p)
  beta <- rep(0, p)
  if (beta.type == 1) {
    beta[round(seq(1, p, length = kappa))] <- 1
  } else if (beta.type == 2) {
    beta[1:kappa] <- 1
  } else if (beta.type == 3) {
    beta[1:kappa] <- seq(10, 0.5, length = kappa)
  } else if (beta.type == 4) {
    beta[1:6] <- c(-10, -6, -2, 2, 6, 10)
  } else {
    beta[1:kappa] <- 1
    if (kappa + 1 <= p) {
      beta[(kappa + 1):p] <- 0.5^(1:(p - kappa))
    }
  }
  beta
}

cholesky.ar1.root <- function(rho, p) {
  # reference 1: https://blogs.sas.com/content/iml/2018/10/03/ar1-cholesky-root-simulation.html
  # reference 2: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4455603/ (Direct formulation to Cholesky decomposition of a general nonsingular correlation matrix)
  if (rho != 0) {
    L <- matrix(0, nrow = p, ncol = p)
    L[, 1] <- rho^(0:(p - 1))
    c <- sqrt(1 - rho^2)
    cL <- c * L[, 1]
    for (i in 2:p) {
      L[i:p, i] <- cL[1:(p - i + 1)]
    }
  } else {
    L <- diag(1, p)
  }
  L
}