R/gen.data.R
In BioMark: Find Biomarkers in Two-Class Discrimination Problems

Documented in gen.data gen.data2

gen.data <- function(ncontrol, ntreated = ncontrol, nvar, nbiom = 5,
                     group.diff = .5, nsimul = 100,
                     means = rep(0, nvar), cormat = diag(nvar))
{
  nobj <- ncontrol + ntreated
  
  X <- array(0, c(nobj, nvar, nsimul))
  diffvec <- rep(c(group.diff, 0), c(nbiom, nvar-nbiom))
  means.treated <- means + diffvec

  for (i in 1:nsimul)
    X[,,i] <- rbind(mvrnorm(ncontrol, means, cormat),
                    mvrnorm(ntreated, means.treated, cormat))
  
  list(X = X,
       Y = factor(rep(c("control", "treated"), c(ncontrol, ntreated))),
       nbiomarkers = nbiom)
}

gen.data2 <- function(X, ncontrol, nbiom, spikeI,
                      type = c("multiplicative", "additive"),
                      nsimul = 100, stddev = .05) {
  dimnames(X) <- NULL
  
  ntreated <- nrow(X) - ncontrol
  X <- X[sample(1:nrow(X)),]
  X.control <- X[1:ncontrol,]
  X.treated.orig <- X[(ncontrol+1):nrow(X),]

  ## if more than one data set is to be simulated, there should be
  ## differences between the simulations. In this implementation, the
  ## differences are generated by choosing different levels for the
  ## first nbiom elements. We check that different settings indeed are
  ## possible. Naturally, if only one set is simulated
  ## (but who wants to do that?) this is less interesting.
  if (length(unique(spikeI)) < 3 & nsimul > 1)
    stop("spikeI should contain at least three different elements")
  if (length(unique(spikeI))^nbiom < nsimul)
    stop("number of simulations exceeds number of possible data sets")
  
  newI <- matrix(sample(spikeI, nbiom * nsimul, replace = TRUE),
                 ncol = nsimul)
  newI <- newI + matrix(rnorm(prod(dim(newI)), sd = stddev*mean(newI)),
                        nrow(newI), ncol(newI))

  nvar <- ncol(X)
  X.output <- array(0, c(nrow(X), nvar, nsimul))
  type <- match.arg(type)
  if (type == "multiplicative") {
    for (i in 1:nsimul) {
      Bmat <- rep(1, ncol(X))
      Bmat[1:nbiom] <- newI[,i]

      X.output[,,i] <- rbind(X.control, X.treated.orig %*% diag(Bmat))
    }
  } else {
    zeromat <- matrix(0, ntreated, ncol(X) - nbiom)
    for (i in 1:nsimul) {
      X.output[,,i] <- rbind(X.control, 
                             X.treated.orig + cbind(newI, zeromat))
    }
  }

  dimnames(X.output) <- list(c(paste("Control", 1:ncontrol),
                               paste("Treated", 1:ntreated)),
                             c(paste("Biom", 1:nbiom),
                               paste("Var", 1:(nvar - nbiom))))
  list(X = X.output,
       Y = factor(rep(c("control", "treated"), c(ncontrol, ntreated))),
       n.biomarkers = nbiom)
       
}