cvreg: Cross Validation and Robust Estimation Utilities

Documented in rcorr sim.gauss sim.sgauss sim.student simX simY

#' Generate response variable from design matrix, coefficients, and likelihood function.
#'
#' @param X design matrix
#' @param betas coefficients
#' @param lkfun a likelihood function with appropriate arguments filled in. See example.
#'
#' @return  a vector
#' @export
#'
#'
#' @examples
#' simY(X, betas, lkfun = function(x) rnorm(length(x), x, 1))
#' simY(X, betas, lkfun = function(x) rbinom(length(x), 1, exp(x)/(exp(x)+1)))
#'
simY = function(X, betas, lkfun = function(x) rnorm(length(x), x, 1)){
  mu <- as.vector(betas %*% t(X))
  lkfun(mu)
}

#' Simulate a design matrix of correlated continuous covariates including P > N matrices.
#'
#' @param vars A vector containing the number of variables to load on each respective variable cluster. For example, if you desire three
#' clusters of variables that have 5 variables each, enter c(5, 5, 5).
#' @param Nobs The number of observations. Defaults to 50. Can be less than the number of variables to simulate P > N data.
#' @param distfunc A function with appropriate arguments filled in. See example. This is used to generate random observations
#' which are "morphed" to have the correlation structure generated by this function.
#'
#' @return
#' A matrix with the attribute "assign" indicating the cluster onto which a given variable loads.
#' @export
#'
#' @examples
#' simX((vars = c(16, 15, 9), Nobs = 28, distfunc = function(x) rt(x, df = 3)))
#'
simX = function(vars = c(9, 6, 10, 6, 9, 10, 12, 5, 3), Nobs = 50, distfunc = function(x) rt(x, df = 3) ){

  gfactor = function(N){

    matrix(round(c(runif(max(floor(N/4), 1), -0.13, 0.015),
             runif(N - max(floor(N/4), 1), -0.018, 0.66)), 2),
    nrow = N)

  }

  loading = function(vars){
    ms = function(x, N){
      if (x-1 <= 0){
        probs = c(0.32, 0.21, 0.18,  0.13, 0.05, 0.05, 0.06)
        scalars = c(1, 0.86, -0.96, -0.92, -1, 0.78, 0.82)
        rbind(c(runif(1, 0.80, 1) * sample(scalars, prob = probs, size = 1), runif(N-1, -0.06, 0.01)))
      } else {
        probs = c(0.39, 0.28, 0.170, 0.16)
        scalars = c(1, 0.76, -0.95,-0.82)
        rbind(rev(c(runif(N-x, -1e-2, 1e-2), runif(1, 0.80, 1) * sample(scalars, prob = probs, size = 1), runif(N-(N-x)-1, -0.09, 0.02))))
      }
    }

    t(zapsmall(matrix(unlist(sapply(1:length(vars), function(x) rbind(replicate(vars[x], ms(x, length(vars)))))), nrow = length(vars)), 2))

  }

  g = gfactor(length(vars))
  f = loading(vars)
  C = zapsmall(psych::sim.hierarchical(gload = g, fload = f), 2)

  s <- svd(C)
  V <- s$v
  D <- sqrt(diag(s$d))
  X <- (V %*% D) %*% matrix(sqrt(abs(distfunc(ncol(C)*Nobs))), ncol(C))
  X <- scale(t(X))
  attr(X, "scaled:center") <- NULL
  attr(X, "scaled:scale") <- NULL
  attr(X, "assign") <- unlist(sapply(1:length(vars), function(v) rep(v, vars[v])))
  return(X)
}

#' Generate a random correlation matrix
#'
#' Generates a random draw from a Wishart distribution with a specified degrees of freedom (nu)
#' and then standarizes it to obtain a correlation matrix. The degrees of freedom must
#' be equal to or greater than the number of variables. Larger degrees of freedom shrink
#' the correlations towards the diagonal. When nu = p the correlations are entirely random.
#'
#' @param p number of variables
#' @param nu the degrees of freedom of the wishart distribution
#'
#' @return a matrix
#' @export
#'
#' @examples rcorr(10, 15)
rcorr <-function(p, nu=1) {

  if (nu < p){
    cat(crayon::magenta(crayon::bold("Nu excedes the number of variables. Setting nu to nu = p.\n")))
    nu <- p
  }

  rho = cov2cor(rWishart(1, df = nu, Sigma = rWishart(1, df = nu * 3, Sigma = diag(rep(1, p)))[, ,1])[, ,1])
  return(rho)
}

#' Simulate data for benchmarking Gaussian regression models.
#'
#' @param n Number of observations.
#' @param p Number of predictors.
#' @param rho Correlation for generating correlated variables.
#' @param coefs Vector of non-zero coefficients
#' @param snr Signal to noise ratio (SNR). Defaults to 2.
#' SNR is defined as
#' \deqn{
#' \frac{Var(E(y | X))}{Var(Y - E(y | X))} =
#' \frac{Var(f(X))}{Var(\varepsilon)} =
#' \frac{Var(X^T \beta)}{Var(\varepsilon)} =
#' \frac{Var(\beta^T \Sigma \beta)}{\sigma^2}.
#' }
#' @param x.df The degrees of freedom for the multivariate distribution of covariates. Defaults to Inf (Gaussian).
#' @param seed Random seed for reproducibility.
#' @param scale should the data be scaled? Defaults to TRUE.
#' @return a data frame with an attribute "true.betas" that contains the true coefficients. If scale = TRUE, the coefficients are scaled to match.
#'
#' @author Brandon Vaughan
#' @export
#'
#' @examples
#' dat <- sim.gauss(
#'   n = 120, p = 200, rho = 0.26,
#'   coefs = c(runif(25, -4, -1), runif(25, 1, 4)), snr = 2,
#'   seed = 100)
#'
sim.gauss <- function(n = 100, p = 25, rho = 0.5, coefs = c(1.5, 4.0, -2.0, -4.0, 1.0, 2.0, -2.5), snr = 2, x.df = Inf, scale = TRUE, cormat = NULL, seed = 100) {

  call <- match.call()

  set.seed(seed)

  if (x.df >= 1997){
    x.df <- 1997
  }

  if (is.null(cormat)){
    sigma <- matrix(0, p, p)
    corvec <- function(i, p, rho) rho^(abs(i - 1L:p))
    for (i in 1:p) sigma[i, ] <- corvec(i, p, rho)
  } else {
    sigma <- cormat
  }

  X <- mvtnorm::rmvt(n, df = x.df, sigma)

  # non-zero coefficients
  betas <- matrix(c(coefs, rep(0, (p - length(coefs)))))

  snr.numerator <- as.vector(t(betas) %*% sigma %*% betas)
  snr.denominator <- snr.numerator / snr
  sd <- sqrt(snr.denominator)

  mu <- as.vector(X %*% betas)


  y <- rnorm(length(mu), mu, sd)

  if (scale){
    sds = apply(X, 2, sd)
    ysd = sd(y)
    true.betas <- betas * (sds / ysd)
    df <- Scale(cbind.data.frame(y = y , X))
    colnames(df) = c("y", paste0("Var", 1:ncol(X)))
    attr(df, "true.betas") <- true.betas

  } else {
    colnames(X) = paste0("Var", 1:ncol(X))
    df = cbind.data.frame(y = y, X)
    attr(df, "true.betas") <- betas
  }
  return(df)
}


#' Simulate data for benchmarking Student-T regression models.
#'
#' @param n Number of observations.
#' @param p Number of predictors.
#' @param rho Correlation for generating correlated variables.
#' @param coefs Vector of non-zero coefficients
#' @param snr Signal to noise ratio (SNR). Defaults to 2.
#' SNR is defined as
#' \deqn{
#' \frac{Var(E(y | X))}{Var(Y - E(y | X))} =
#' \frac{Var(f(X))}{Var(\varepsilon)} =
#' \frac{Var(X^T \beta)}{Var(\varepsilon)} =
#' \frac{Var(\beta^T \Sigma \beta)}{\sigma^2}.
#' }
#' @param noise.df The degrees of freedom for the noise distribution. Defaults to Inf (Gaussian).
#' @param seed Random seed for reproducibility.
#' @param scale should the data be scaled? Defaults to TRUE.
#' @return a data frame with an attribute "true.betas" that contains the true coefficients. If scale = TRUE, the coefficients are scaled to match.
#'
#' @author Brandon Vaughan
#' @export
#'
#' @examples
#' dat <- sim.student(
#'   n = 120, p = 200, rho = 0.26,
#'   coefs = c(runif(25, -4, -1), runif(25, 1, 4)), snr = 2,
#'   seed = 100)
#'
sim.student <- function(n = 100, p = 25, rho = 0.5, coefs = c(1.5, 4.0, -2.0, -4.0, 1.0, 2.0, -2.5), snr = 2, noise.df = 3, scale = TRUE, cormat = NULL, seed = 100) {

  call <- match.call()

  set.seed(seed)

  if (x.df >= 1997){
    x.df <- 1997
  }

  if (is.null(cormat)){
    sigma <- matrix(0, p, p)
    corvec <- function(i, p, rho) rho^(abs(i - 1L:p))
    for (i in 1:p) sigma[i, ] <- corvec(i, p, rho)
  } else {
    sigma <- cormat
  }

  X <- mvtnorm::rmvt(n, df = Inf, sigma)

  # non-zero coefficients
  betas <- matrix(c(coefs, rep(0, (p - length(coefs)))))

  snr.numerator <- as.vector(t(betas) %*% sigma %*% betas)
  snr.denominator <- snr.numerator / snr
  sd <- sqrt(snr.denominator)

  mu <- as.vector(X %*% betas)

  rst <- function (n, nu = 3, mu = 0, scale = 1)
  {
    if (any(scale <= 0))
      stop("the scale parameter must be positive.")
    if (any(nu <= 0))
      stop("the nu parameter must be positive.")
    rGamma = function(n, shape = 1, rate = 1) {
      return(qgamma(seq(1/n, 1 - 1/n, length.out = n), shape,
                    rate))
    }
    gamma.samps = rGamma(n, nu * 0.5, (scale^2) * (nu * 0.5))
    sigmas = sqrt(1/gamma.samps)
    x = rnorm(n, mu, sigmas)
    return(x)
  }

  y <- rst(length(mu), mu, scale = sd)

  if (scale){
    sds = apply(X, 2, sd)
    ysd = sd(y)
    true.betas <- betas * (sds / ysd)
    df <- Scale(cbind.data.frame(y = y , X))
    colnames(df) = c("y", paste0("Var", 1:ncol(X)))
    attr(df, "true.betas") <- true.betas

  } else {
    colnames(X) = paste0("Var", 1:ncol(X))
    df = cbind.data.frame(y = y, X)
    attr(df, "true.betas") <- betas
  }
  return(df)
}

#' Simulate data for benchmarking Skew Normal regression models.
#'
#' @param n Number of observations.
#' @param p Number of predictors.
#' @param alpha Skew parameter for noise distribution. Defaults to -3.
#' @param rho Correlation for generating correlated variables.
#' @param coefs Vector of non-zero coefficients
#' @param snr Signal to noise ratio (SNR). Defaults to 2.
#' SNR is defined as
#' \deqn{
#' \frac{Var(E(y | X))}{Var(Y - E(y | X))} =
#' \frac{Var(f(X))}{Var(\varepsilon)} =
#' \frac{Var(X^T \beta)}{Var(\varepsilon)} =
#' \frac{Var(\beta^T \Sigma \beta)}{\sigma^2}.
#' }
#' @param x.df The degrees of freedom for the multivariate distribution of covariates. Defaults to Inf (Gaussian).
#' @param seed Random seed for reproducibility.
#' @param scale should the data be scaled? Defaults to TRUE.
#' @return a data frame with an attribute "true.betas" that contains the true coefficients. If scale = TRUE, the coefficients are scaled to match.
#'
#' @author Brandon Vaughan
#' @export
#'
#' @examples
#' dat <- sim.sgauss(
#'   n = 120, p = 200, rho = 0.26,
#'   coefs = c(runif(25, -4, -1), runif(25, 1, 4)), snr = 2,
#'   seed = 100)
#'
sim.sgauss <- function(n = 100, p = 25, rho = 0.5, alpha = -3, coefs = c(1.5, 4.0, -2.0, -4.0, 1.0, 2.0, -2.5), snr = 2, x.df = Inf, scale = TRUE, cormat = NULL, seed = 100) {

  call <- match.call()

  set.seed(seed)

  if (x.df >= 1997){
    x.df <- 1997
  }

  if (is.null(cormat)){
    sigma <- matrix(0, p, p)
    corvec <- function(i, p, rho) rho^(abs(i - 1L:p))
    for (i in 1:p) sigma[i, ] <- corvec(i, p, rho)
  } else {
    sigma <- cormat
  }

  X <- mvtnorm::rmvt(n, df = x.df, sigma)

  # non-zero coefficients
  betas <- matrix(c(coefs, rep(0, (p - length(coefs)))))

  snr.numerator <- as.vector(t(betas) %*% sigma %*% betas)
  snr.denominator <- snr.numerator / snr
  sd <- sqrt(snr.denominator)

  mu <- as.vector(X %*% betas)

  y <- rsnorm(length(mu), mu = mu, sigma = sd, alpha = alpha)

  if (scale){
    sds = apply(X, 2, sd)
    ysd = sd(y)
    true.betas <- betas * (sds / ysd)
    df <- Scale(cbind.data.frame(y = y , X))
    colnames(df) = c("y", paste0("Var", 1:ncol(X)))
    attr(df, "true.betas") <- true.betas

  } else {
    colnames(X) = paste0("Var", 1:ncol(X))
    df = cbind.data.frame(y = y, X)
    attr(df, "true.betas") <- betas
  }
  return(df)
}