R/simulation.R

Defines functions convert.pec.sim.to.inbix createMixedSimulation createSimulation createMainEffects createInteractions splitDataset

Documented in convert.pec.sim.to.inbix createInteractions createMainEffects createMixedSimulation createSimulation splitDataset

# simulation.R - Trang Le and Bill White - Fall 2016/Spring 2017
#
# Simulated data for comparison of classification algorithms in:
# Classification algorithms used in the Bioinformatics paper:
# Differential privacy-based Evaporative Cooling feature selection and
# classification with Relief-F and Random Forests
# https://doi.org/10.1093/bioinformatics/btx298

# The following modification applied on data simulation functions by Saeid Parvandeh - Spring 2018
# 1.) Modifying simulation data to do quantitative outcome 
# 2.) simulate imbalanced data
# 3.) making a function that will simulate case/control with a mix of main and interaction effects.

#' Split a data set for machine learning classification
#'
#' Return data.sets as a list of training set, holdout set and validation set
#' according to the predefined percentage of each partition
#' default is a 50-50 split into training and holdout, no testing set
#' code class/label/phenotypes as 1 and -1.
#' User can manage the simulation data to be dichotomious/quantitative using label (class/qtrait)
#'
#' @param all.data A data frame of n rows by d colums of data plus a label column
#' @param pct.train A numeric percentage of samples to use for traning
#' @param pct.holdout A numeric percentage of samples to use for holdout
#' @param pct.validation A numeric percentage of samples to use for testing
#' @param label A character vector of the data column name for the outcome label. class for classification
#' and qtrait for regression.
#' @return A list containing:
#' \describe{
#'   \item{train}{traing data set}
#'   \item{holdout}{holdout data set}
#'   \item{validation}{validation data set}
#' }
#' @examples
#' data("rsfMRIcorrMDD")
#' data.sets <- splitDataset(rsfMRIcorrMDD)
#' @family simulation
#' @export
# splitDataset function
######################################################################################################
splitDataset <- function(all.data=NULL,
                         pct.train=0.5,
                         pct.holdout=0.5,
                         pct.validation=0,
                         label="class") {
  if (is.null(all.data)) {
    # stop or warning and return list of length 0?
    stop("No data passed")
  }
  if (1.0 - (pct.train + pct.holdout + pct.validation) > 0.001 ) {
    stop("Proportions of training, holdout and testing must to sum to 1")
  }
  if (!(label %in% colnames(all.data))) {
    stop("Class label is not in the column names or used more than once in data set column names")
  }
  if (label == "class"){
    if (!is.factor(all.data[, label])) {
      all.data[, label] <- factor(all.data[, label])
    }
    if (nlevels(all.data[, label]) != 2) {
      stop("Cannot split data set with more than or less than 2 factor levels in the class label")
    }
  }
  
  if (label == "class"){
    class.levels <- levels(all.data[, label])
    ind.case <- rownames(all.data)[all.data[, label] == class.levels[1]]
    ind.ctrl <- rownames(all.data)[all.data[, label] == class.levels[2]]
    
    n.case <- length(ind.case)
    n.ctrl <- length(ind.ctrl)
    
    n.validation.case <- floor(pct.validation * n.case)
    n.holdo.case <- floor(pct.holdout * n.case)
    n.train.case <- n.case - n.validation.case - n.holdo.case
    partition.case <- sample(c(rep(3, n.validation.case), rep(2, n.holdo.case),
                               rep(1, n.train.case)), n.case)
    
    n.validation.ctrl <- floor(pct.validation * n.ctrl)
    n.holdo.ctrl <- floor(pct.holdout * n.ctrl)
    n.train.ctrl <- n.ctrl - n.validation.ctrl - n.holdo.ctrl
    partition.ctrl <- sample(c(rep(3, n.validation.ctrl),
                               rep(2, n.holdo.ctrl),
                               rep(1, n.train.ctrl)), n.ctrl)
    
    all.data <- data.frame(all.data)
    all.data[, label] <- factor(all.data[, label])
    levels(all.data[, label]) <- c(-1, 1)
    data.case <- all.data[ind.case, ]
    data.ctrl <- all.data[ind.ctrl, ]
    X_train <- rbind(data.case[partition.case == 1, ], data.ctrl[partition.ctrl == 1, ])
    X_holdo <- rbind(data.case[partition.case == 2, ], data.ctrl[partition.ctrl == 2, ])
    X_validation <- rbind(data.case[partition.case == 3, ], data.ctrl[partition.ctrl == 3, ])
  } else {
    num_sample <- length(all.data[, label])
    n.train <- floor(pct.train * num_sample)
    n.holdout <- floor(pct.holdout * num_sample)
    n.validation <- floor(pct.validation * num_sample)
    partition <- sample(c(rep(3, n.validation),
                          rep(2, n.holdout),
                          rep(1, n.train)))
    X_train <- rbind(all.data[partition == 1, ])
    X_holdo <- rbind(all.data[partition == 2, ])
    X_validation <- rbind(all.data[partition == 3, ])
  }
  
  # if(nrow(X_validation) == 0) {
  #   data.sets <- list(train=X_train, holdout=X_holdo)
  # } else {
  data.sets <- list(train = X_train, holdout = X_holdo, validation = X_validation)
  # }
  #
  data.sets
}
######################################################################################################


#' Create a differentially coexpressed data set with interactions
#'
#' @param M An integer for the number of samples (rows)
#' @param N An integer for the number of variables (columns)
#' @param label A character vector for the name of the outcome column. class for classification
#' and qtrait for regression
#' @param pct.imbalance A numeric percentage to indicate proportion of the imbalaced samples. 
#' 0 means all controls and 1 mean all cases.
#' @param meanExpression A numeric for the mean expression levels
#' @param A A matrix representing a weighted, undirected network (adjacency)
#' @param randSdNoise Random noise for the background expression levels
#' @param sdNoise A numeric for the noise in the differential expression
#' @param sampleIndicesInteraction A vector of integers of significant variables
#' @param verbose A flag for sending verbose output to stdout
#' @family simulation
#' @return A matrix representing the new new data set.
# createInteractions function
######################################################################################################
createInteractions <- function(M=100,
                               N=100,
                               label="class",
                               pct.imbalance = 0.5,
                               meanExpression=7,
                               A=NULL,
                               randSdNoise=1,
                               sdNoise=0.4,
                               sampleIndicesInteraction=NULL,
                               verbose=FALSE) {
  if (is.null(A)) {
    stop("privacyEC: No adjacency matrix provided")
  }
  if (is.null(sampleIndicesInteraction)) {
    stop("privacyEC: No sample signal indices provided")
  }
  # create a random data matrix
  D <- matrix(nrow = M, ncol = N, data = stats::rnorm(M * N,
                                                      mean = meanExpression,
                                                      sd = randSdNoise))
  
  # add co-expression
  already_modified <- rep(0, N)                                                ##### BD edit
  already_modified[1] <- 1
  for (i in 1:(N - 1)) {                                                       ##### BD edit
    for (j in (i + 1):N) {                                                     ##### BD edit
      if (verbose) cat("Condidering A: row", i, "column", j, "\n")
      if ((A[i, j] == 1) && (!already_modified[j])) {
        if (verbose) cat("Making col", j, "from col", i, "\n")                 ##### BD edit
        D[, j] <- D[, i] + stats::rnorm(M, mean = 0, sd = as.numeric(sdNoise)) ##### BD edit
        already_modified[j] <- 1
      } else {
        if (already_modified[j] == 1 && !already_modified[i]) {
          # if j is already modified, we want to modify i,
          # unless i is already modified then do nothing
          D[, i] <- D[, j] + stats::rnorm(M, mean = 0, sd = sdNoise)           ##### BD edit
        }
      }
    }
  }
  
  # perturb to get differential coexpression
  n1 <- M / 2                                                 ##### BD edit
  mGenesToPerturb <- length(sampleIndicesInteraction)
  for (i in 1:mGenesToPerturb) {
    geneIdxInteraction <- sampleIndicesInteraction[i]
    
    # NOTE: bcw, these are never used?
    # g0 <- D[sampleIndicesInteraction, (n1 + 1):N]
    # g1 <- D[sampleIndicesInteraction, 1:n1]
    
    # get the group 2 gene expression and randomly order for differential coexpression
    x <- D[1:n1, geneIdxInteraction]                          ##### BD edit
    x <- x[order(stats::runif(length(x)))]
    D[1:n1, geneIdxInteraction] <- x                          ##### BD edit
  }
  
  # return a regression ready data frame
  # Modifying outcome generator to generate imbalanced data.
  dimN <- nrow(D)                                             ##### BD edit
  n1 <- round(pct.imbalance * dimN)
  n2 <- round((1 - pct.imbalance) * dimN)
  subIds <- c(paste("case", 1:n1, sep = ""), paste("ctrl", 1:n2, sep = ""))
  qtrait <- c(rep(1, n1), rep(0, n2))
  newD <- cbind(D, qtrait)                                                      ##### BD edit
  colnames(newD) <- c(paste("var", sprintf("%04d", 1:N), sep = ""), label)      ##### BD edit
  rownames(newD) <- subIds
  
  data.frame(newD)
}
######################################################################################################


# main effect with quantitative outcome using label ("class" (dichotomious) or "qtrait"(quantitative))
# main effect with imbalanced outcome using pct.imbalance (decreasing pct. will generate less control sample)
# parameters in effect n.e=num.variables, n.db=num.samples, sd.b=bias, p.b=pct.signals

#' Create a simulated data set with main effects
#'
#' \eqn{X = BS + \Gamma G + U}
#'
#' S = Biological group                                                                                                                   m
#' G = Batch
#' U = random error
#'
#' NOTE:  If you use conf=TRUE, then you must have exactly two surrogate
#' variables in the database this function only allows for confounding in
#' the database, not confounding in the new samples
#'
#' @param n.e number of variables
#' @param n.db sample size in database
#' @param n.ns sample size in newsample
#' @param label A character vector for the name of the outcome column. class for classification
#' and qtrait for regression
#' @param pct.imbalance A numeric percentage to indicate proportion of the imbalaced samples. 
#' 0 means all controls and 1 mean all cases.
#' @param sv.db batches
#' @param sv.ns batches
#' @param sd.b a numeric for sd.b?
#' @param sd.gam a numeric for sd.gam?
#' @param sd.u a numeric for sd.u?
#' @param conf a flag for conf?
#' @param distr.db a numeric for distr.db?
#' @param p.b a numeric for p.b?
#' @param p.gam a numeric for p.gam?
#' @param p.ov a numeric for p.ov?
#' @return A list with:
#' \describe{
#'   \item{db}{database creation variables}
#'   \item{vars}{variables used in simulation}
#' }
#' @references
#' Leek,J.T. and Storey,J.D. (2007) Capturing heterogeneity in gene expression
#' studies by surrogate variable analysis. PLoS Genet., 3, 1724–1735
#' @family simulation
#' @export
# createMainEffects function
######################################################################################################
createMainEffects <- function(n.e=1000,
                              n.db=70,
                              n.ns=30,
                              label = "class",
                              pct.imbalance = 0.5,
                              sv.db=c("A", "B"),
                              sv.ns=c("A", "B"),
                              sd.b=1,
                              sd.gam=1,
                              sd.u=1,
                              conf=FALSE,
                              distr.db=NA,
                              p.b=0.3,
                              p.gam=0.3,
                              p.ov=p.b / 2) {
  if(!any(label == c("class", "qtrait"))){
    stop("CreateMainEffects: name of the label should be class or qtrait")
  }
  n <- n.db + n.ns
  # Create random error
  U <- matrix(nrow = n.e, ncol = n, stats::rnorm(n.e * n, sd = sd.u))
  
  # Create index for database vs. new sample #
  ind <- as.factor(c(rep("db", n.db), rep("ns", n.ns)))
  
  if (label == "class"){
    # Create outcome, surrogate variables #
    # Use distr option to show % overlap of outcome, surrogate variables.
    # Note that .5 means no confounding between outcome, surrogate variables.
    
    # biological variable (fixed at 50% for each outcome)
    
    ##############################
    ##### imbalanced ability #####
    # ----------------------------
    S.db <- c(rep(0, round(pct.imbalance * n.db)), rep(1, round((1 - pct.imbalance) * n.db)))
    S.ns <- c(rep(0, round(pct.imbalance * n.ns)), rep(1, round((1 - pct.imbalance) * n.ns)))
    S <- c(S.db, S.ns)
    
    len0 <- sum(S.db == 0)
    len1 <- sum(S.db == 1)
    
    if (conf == FALSE) {
      # surrogate variable (no confounding in this function)
      n.sv.db <- length(sv.db)
      prop.db <- 1 / n.sv.db
      # create surrogate variables for outcome 0 in database
      x1 <- c()
      for (i in 1:n.sv.db) {
        x1 <- c(x1, rep(sv.db[i], floor(prop.db * len0)))
      }
      # If the rounding has caused a problem, randomly assign to fill out vector
      while (length(x1) != len0) {
        x1 <- c(x1, sample(sv.db, 1))
      }
      # surrogate variables for outcome 1 will NOT be the same
      x2 <- c()
      for (i in 1:n.sv.db) {
        x2 <- c(x2, rep(sv.db[i], floor(prop.db * len1)))
      }
      # If the rounding has caused a problem, randomly assign to fill out vector
      while (length(x2) != len1) {
        x2 <- c(x2, sample(sv.db, 1))
      }
    }
    
    if (conf == TRUE) {
      x1 <- c(rep("A", round(distr.db * len0)),
              rep("B", len0 - round(distr.db * len0)))
      x2 <- c(rep("A", round((1 - distr.db) * len1)),
              rep("B", len1 - round((1 - distr.db) * len1)))
    }
    
    # create surrogate variables for outcome 0 in new samples
    n.sv.ns <- length(sv.ns)
    prop.ns <- 1 / n.sv.ns
    
    len0 <- sum(S.ns == 0)
    len1 <- sum(S.ns == 1)
    
    x3 <- c()
    for (i in 1:n.sv.ns) {
      x3 <- c(x3, rep(sv.ns[i], floor(prop.ns * len0)))
    }
    # If the rounding has caused a problem, randomly assign to fill out vector
    while (length(x3) != len0) {
      x3 <- c(x3, sample(sv.ns, 1))
    }
    
    # surrogate variables for outcome 1 will NOT be the same
    x4 <- c()
    for (i in 1:n.sv.ns) {
      x4 <- c(x4, rep(sv.ns[i], floor(prop.ns * len1)))
    }
    # If the rounding has caused a problem, randomly assign to fill out vector
    while (length(x4) != len1) {
      x4 <- c(x4, sample(sv.ns, 1))
    }
    
    G <- c(x1, x2, x3, x4)
    G <- t(stats::model.matrix(~ as.factor(G)))[-1, ]
    if (is.null(dim(G))) {
      G <- matrix(G, nrow = 1, ncol = n)
    }
    # Determine which probes are affected by what:
    # 30% for biological, 30% for surrogate, 10% overlap
    # First 30% of probes will be affected by biological signal
    ind.B <- rep(0, n.e)
    ind.B[1:round(p.b * n.e)] <- 1
    # Probes 20% thru 50% will be affected by surrogate variable
    ind.Gam <- rep(0, n.e)
    ind.Gam[round((p.b - p.ov) * n.e):round((p.b - p.ov + p.gam) * n.e)] <- 1
    
    # figure out dimensions for Gamma
    
    # create parameters for signal, noise
    B <- matrix(nrow = n.e, ncol = 1, stats::rnorm(n.e, mean = 0, sd = sd.b) * ind.B)
    Gam <- matrix(nrow = n.e, ncol = dim(G)[1],
                  stats::rnorm(n.e * dim(G)[1], mean = 0, sd = sd.gam) * ind.Gam)
    
    # simulate the data
    sim.dat <- B %*% S + Gam %*% G + U
    sim.dat <- sim.dat + abs(min(sim.dat)) + 0.0001
    
    # simulate data without batch effects
    sim.dat.nobatch <- B %*% S + U
    sim.dat.nobatch <- sim.dat.nobatch + abs(min(sim.dat)) + 0.0001
    
    # divide parts into database, new samples
    db <- list()
    db$dat <- sim.dat[, ind == "db"]
    db$datnobatch <- sim.dat.nobatch[, ind == "db"]
    db$U <- U[, ind == "db"]
    db$B <- B
    db$S <- S[ind == "db"]
    db$Gam <- Gam
    db$G <- G[ind == "db"]
  } else if (label == "qtrait"){
    ##########################
    # ------- Comment --------
    # Since, we have quantitative outcome and not dichotomize outcome, we may not be able to include conf.
    # Also, simulation algorithm return a simulate data without batch effects, so we do not also need Gam.
    ##########################
    S.db <- stats::rnorm(n.db, 0, 1)
    S.ns <- stats::rnorm(n.ns, 0, 1)
    S <- c(S.db, S.ns)
    
    # Determine which probes are affected by what:
    # 30% for biological, 30% for surrogate, 10% overlap
    # First 30% of probes will be affected by biological signal
    ind.B <- rep(0, n.e)
    ind.B[1:round(p.b * n.e)] <- 1
    # Probes 20% thru 50% will be affected by surrogate variable
    ind.Gam <- rep(0, n.e)
    ind.Gam[round((p.b - p.ov) * n.e):round((p.b - p.ov + p.gam) * n.e)] <- 1
    
    # figure out dimensions for Gamma
    
    # create parameters for signal, noise
    B <- matrix(nrow = n.e, ncol = 1, stats::rnorm(n.e, mean = 0, sd = sd.b) * ind.B)
    # Gam <- matrix(nrow = n.e, ncol = dim(G)[1],
    #               stats::rnorm(n.e * dim(G)[1], mean = 0, sd = sd.gam) * ind.Gam)
    # 
    # # simulate the data
    # sim.dat <- B %*% S + Gam %*% G + U
    # sim.dat <- sim.dat + abs(min(sim.dat)) + 0.0001
    
    # simulate data without batch effects
    # since, there is no G and Gam matrix for quantitative outcome, we use without batch effects.
    sim.dat.nobatch <- B %*% S + U
    sim.dat.nobatch <- sim.dat.nobatch + abs(min(sim.dat.nobatch)) + 0.0001
    
    # divide parts into database, new samples
    db <- list()
    # db$dat <- sim.dat[, ind == "db"]
    db$datnobatch <- sim.dat.nobatch[, ind == "db"]
    db$U <- U[, ind == "db"]
    db$B <- B
    db$S <- S[ind == "db"]
    # db$Gam <- Gam
    # db$G <- G[ind == "db"]
  }
  
  vars <- list(n.e  =  n.e, n.db = n.db, n.ns = n.ns, sv.db = sv.db,
               sv.ns = sv.ns, sd.b = sd.b, sd.gam = sd.gam, sd.u = sd.u,
               conf = conf, distr.db = distr.db, p.b = p.b, p.gam = p.gam,
               p.ov = p.ov)
  
  list(db = db, vars = vars)
}
######################################################################################################


#' Create a data simulation and return train/holdout/validation data sets.
#'
#' @param num.variables An integer for the number of variables
#' @param num.samples An integer for the number of samples
#' @param pct.imbalance A numeric percentage to indicate proportion of the imbalaced samples. 
#' 0 means all controls and 1 mean all cases.
#' @param pct.signals A numeric for proportion of simulated signal variables
#' @param bias A numeric for effect size in simulated signal variables
#' @param label A character vector for the name of the outcome column. class for classification
#' and qtrait for regression
#' @param sim.type A character vector of the type of simulation:
#' mainEffect/interactionErdos/interactionScalefree
#' @param pct.train A numeric percentage of samples to use for traning
#' @param pct.holdout A numeric percentage of samples to use for holdout
#' @param pct.validation A numeric percentage of samples to use for testing
#' @param verbose A flag indicating whether verbose output be sent to stdout
#' @param save.file A filename or NULL indicating whether to save the simulations to file
#' @return A list with:
#' \describe{
#'   \item{train}{traing data set}
#'   \item{holdout}{holdout data set}
#'   \item{validation}{validation data set}
#'   \item{label}{the class label/column name}
#'   \item{signal.names}{the variable names with simulated signals}
#'   \item{elapsed}{total elapsed time}
#' }
#' @examples
#' num.variables <- 100
#' num.samples <- 100
#' pct.imbalance <- 0.5
#' pct.signals <- 0.1
#' bias <- 0.4
#' label <- "class"
#' sim.type <- "mainEffect"
#' sim.data <- createSimulation(num.samples=num.samples,
#'                              num.variables=num.variables,
#'                              pct.imbalance=pct.imbalance,
#'                              pct.signals=pct.signals,
#'                              bias=bias,
#'                              label=label,
#'                              sim.type=sim.type,
#'                              verbose=FALSE)
#' @family simulation
#' @export
# createSimulation function
######################################################################################################
createSimulation <- function(num.samples=100,
                             num.variables=100,
                             pct.imbalance = 0.5,
                             pct.signals=0.1,
                             bias=0.4,
                             label="class",
                             sim.type="mainEffect",
                             pct.train=0.5,
                             pct.holdout=0.5,
                             pct.validation=0,
                             save.file=NULL,
                             verbose=FALSE) {
  if(!any(label == c("class", "qtrait"))){
    stop("CreateMainEffects: name of the label should be class or qtrait")
  }
  ptm <- proc.time()
  nbias <- pct.signals * num.variables
  if (sim.type == "mainEffect") {
    # new simulation:
    # sd.b sort of determines how large the signals are
    # p.b=0.1 makes 10% of the variables signal, bias <- 0.5
    my.sim.data <- createMainEffects(n.e=num.variables,                   
                                     n.db=num.samples,              
                                     pct.imbalance=pct.imbalance,
                                     label = label,
                                     sd.b=bias,
                                     p.b=pct.signals)$db
    dataset <- cbind(t(my.sim.data$datnobatch), my.sim.data$S)
  } else if (sim.type == "interactionScalefree") {
    # interaction simulation: scale-free
    g <- igraph::barabasi.game(num.variables, directed = F)
    A <- igraph::get.adjacency(g)
    myA <- as.matrix(A)
    dataset <- createInteractions(M=num.samples,                             ##### BD edit
                                  N=num.variables,                           ##### BD edit
                                  pct.imbalance = pct.imbalance,
                                  meanExpression=7,
                                  A=myA,
                                  randSdNoise=1,
                                  sdNoise=bias,
                                  sampleIndicesInteraction=1:nbias)
  } else if(sim.type == "interactionErdos") {
    attach.prob <- 0.1
    g <- igraph::erdos.renyi.game(num.variables, attach.prob)
    #   foo <- printIGraphStats(g)
    A <- igraph::get.adjacency(g)
    # degrees <- rowSums(A)
    myA <- as.matrix(A)
    dataset <- createInteractions(M=num.samples,                             ##### BD edit
                                  N=num.variables,                           ##### BD edit
                                  pct.imbalance = pct.imbalance,
                                  meanExpression=7,
                                  A=myA,
                                  randSdNoise=1,
                                  sdNoise=bias,
                                  sampleIndicesInteraction=1:nbias)
  }
  # make numeric matrix into a data frame for splitting and subsequent ML algorithms
  dataset <- as.data.frame(dataset)
  # BEWARE - DO NOT BE TEMPTED TO USE COLNAMES WITH '.' IN THE NAME 'sim.var1',
  # Removed it altogether; put 'sim' together with 'var' for 'simvar1' instead
  # Not technically allowed but tolerated by R, see make.names() - bcw - 3/4/18
  # Exanple: vgboost does not allow for certain functions using these names
  signal.names <- paste("simvar", 1:nbias, sep = "")
  background.names <- paste("var", 1:(num.variables - nbias), sep = "")
  var.names <- c(signal.names, background.names, label)
  colnames(dataset) <- var.names
  split.data <- splitDataset(all.data = dataset,
                             pct.train = pct.train,
                             pct.holdout = pct.holdout,
                             pct.validation = pct.validation,
                             label = label)
  
  if (!is.null(save.file)) {
    if (verbose) cat("saving to data/", save.file, ".Rdata\n", sep = "")
    save(split.data, pct.signals, bias, sim.type, file = save.file)
  }
  
  elapsed <- (proc.time() - ptm)[3]
  if (verbose) cat("createSimulation elapsed time:", elapsed, "\n")
  
  list(train = split.data$train,
       holdout = split.data$holdout,
       validation = split.data$validation,
       label = label,
       signal.names = signal.names,
       elapsed = elapsed)
}
######################################################################################################



# To do mixed simulations, I think you can do the interaction simulation first 
# and then add main effect simulations on top of that data matrix. 
# User can handle the ratio of mixed data using pct.mixed parameter (main effect(0), interaction(1))

#' Create a data simulation with a mix of main and interaction effects.
#'
#' @param num.variables An integer for the number of variables
#' @param num.samples An integer for the number of samples
#' @param pct.imbalance A numeric percentage to indicate proportion of the imbalaced samples. 
#' 0 means all controls and 1 mean all cases.
#' @param pct.signals A numeric for proportion of simulated signal variables
#' @param bias A numeric for effect size in simulated signal variables
#' @param label A character vector for the name of the outcome column. class for classification
#' and qtrait for regression
#' @param pct.mixed A numeric percentage to indicate the proportion of each simulation type
#' @param mixed.type A character vector of the mix type of simulations:
#' mainEffect, interactionErdos/interactionScalefree
#' @param pct.train A numeric percentage of samples to use for traning
#' @param pct.holdout A numeric percentage of samples to use for holdout
#' @param pct.validation A numeric percentage of samples to use for testing
#' @param verbose A flag indicating whether verbose output be sent to stdout
#' @param save.file A filename or NULL indicating whether to save the simulations to file
#' @return A list with:
#' \describe{
#'   \item{train}{traing data set}
#'   \item{holdout}{holdout data set}
#'   \item{validation}{validation data set}
#'   \item{label}{the class label/column name}
#'   \item{signal.names}{the variable names with simulated signals}
#'   \item{elapsed}{total elapsed time}
#' }
#' @examples
#' num.variables <- 100
#' num.samples <- 100
#' pct.imbalance <- 0.5
#' pct.signals <- 0.1
#' bias <- 0.4
#' label <- "class"
#' pct.mixed <- 0.5
#' mixed.type <- c("mainEffect", "interactionScalefree")
#' sim.data <- createMixedSimulation(num.samples=num.samples,
#'                                   num.variables=num.variables,
#'                                   pct.signals=pct.signals,
#'                                   pct.imbalance=pct.imbalance,
#'                                   label = label,
#'                                   bias=bias,
#'                                   pct.mixed=pct.mixed,
#'                                   mixed.type=mixed.type,
#'                                   verbose=FALSE)
#' @family simulation
#' @export
# createMixedSimulation function
######################################################################################################
createMixedSimulation <- function(num.samples = 100,
                                  num.variables = 100,
                                  pct.imbalance = 0.5,
                                  pct.signals=0.1,
                                  bias=0.4,
                                  label = "class",
                                  pct.mixed = 0.5,
                                  mixed.type = c("mainEffect","interactionScalefree"),
                                  pct.train=0.5,
                                  pct.holdout=0.5,
                                  pct.validation=0,
                                  save.file=NULL,
                                  verbose=FALSE){
  if(!any(label == c("class", "qtrait"))){
    stop("createMixedSimulation: name of the label should be class or qtrait")
  }
  num.int.vars <- round(num.variables * pct.mixed)
  num.main.vars <- round(num.variables * (1 - pct.mixed))
  ptm <- proc.time()
  int.nbias <- pct.signals * num.int.vars
  main.nbias <- pct.signals * num.main.vars
  if(!is.null(mixed.type) && any("mainEffect" == mixed.type)){
    # new simulation:
    # sd.b sort of determines how large the signals are
    # p.b=0.1 makes 10% of the variables signal, bias <- 0.5
    my.sim.data <- createMainEffects(n.e=num.main.vars,
                                     n.db=num.samples,
                                     label=label,
                                     pct.imbalance = pct.imbalance,
                                     sd.b=bias,
                                     p.b=pct.signals)$db
    mainEffect.dataset <- cbind(t(my.sim.data$datnobatch), my.sim.data$S)
  }
  
  if (!is.null(mixed.type) && any("interactionScalefree" == mixed.type)) {
    # interaction simulation: scale-free
    g <- igraph::barabasi.game(num.int.vars, directed = F)
    A <- igraph::get.adjacency(g)
    myA <- as.matrix(A)
    interaction_dataset <- createInteractions(M=num.samples,                       ##### BD edit
                                              N=num.int.vars,                      ##### BD edit
                                              label=label,
                                              pct.imbalance = pct.imbalance,
                                              meanExpression=7,
                                              A=myA,
                                              randSdNoise=1,
                                              sdNoise=bias,
                                              sampleIndicesInteraction=1:int.nbias)
    
  } else if (!is.null(mixed.type) && any("interactionErdos" == mixed.type)) {
    attach.prob <- 0.1
    g <- igraph::erdos.renyi.game(num.int.vars, attach.prob)
    #   foo <- printIGraphStats(g)
    A <- igraph::get.adjacency(g)
    # degrees <- rowSums(A)
    myA <- as.matrix(A)
    interaction_dataset <- createInteractions(M=num.samples,                       ##### BD edit
                                              N=num.int.vars,                      ##### BD edit
                                              label=label,
                                              pct.imbalance = pct.imbalance,
                                              meanExpression=7,
                                              A=myA,
                                              randSdNoise=1,
                                              sdNoise=bias,
                                              sampleIndicesInteraction=1:int.nbias)
  } else {
    stop("createMixedSimulation: mixed.type vector is empty")
  }
  mainEffect.dataset <- mainEffect.dataset[c((pct.imbalance*num.samples + 1):num.samples, 1:(pct.imbalance*num.samples)), ]
  mixed_data <- cbind(mainEffect.dataset[, -(num.main.vars + 1)], interaction_dataset)
  # make numeric matrix into a data frame for splitting and subsequent ML algorithms
  dataset <- as.data.frame(mixed_data)
  # BEWARE - DO NOT BE TEMPTED TO USE COLNAMES WITH '.' IN THE NAME 'sim.var1',
  # Removed it altogether; put 'sim' together with 'var' for 'simvar1' instead
  # Not technically allowed but tolerated by R, see make.names() - bcw - 3/4/18
  # Exanple: vgboost does not allow for certain functions using these names
  main.signal.names  <- paste("mainsim", 1:main.nbias, sep = "")
  int.signal.names <- paste("intsim", 1:int.nbias, sep = "")
  int.background.names <- paste("var", 1:(num.int.vars - int.nbias), sep = "")
  main.background.names <- paste("var", 1:(num.main.vars - main.nbias), sep = "")
  var.names <- c(main.signal.names, main.background.names, int.signal.names, int.background.names, label)
  colnames(dataset) <- var.names
  split.data <- splitDataset(all.data = dataset,
                             pct.train = pct.train,
                             pct.holdout = pct.holdout,
                             pct.validation = pct.validation,
                             label = label)
  
  if (!is.null(save.file)) {
    if (verbose) cat("saving to data/", save.file, ".Rdata\n", sep = "")
    save(split.data, pct.signals, bias, mixed.type, file = save.file)
  }
  
  elapsed <- (proc.time() - ptm)[3]
  if (verbose) cat("createSimulation elapsed time:", elapsed, "\n")
  
  list(train = split.data$train,
       holdout = split.data$holdout,
       validation = split.data$validation,
       label = label,
       signal.names = c(main.signal.names, int.signal.names),
       elapsed = elapsed)
  
}
######################################################################################################


#=========================================================================#
#' convert.pec.sim.to.inbix
#'
#' Reads a pEC simulated csv file, converts to inbix format and writes to working directory
#'
#' @param pEC.inputFile pEC-simulated data csv file name with path if needed 
#' @param inbix.file.prefix prefix of inbix filename for .num and .pheno files
#' @return none
#'
#' @examples
#' # setwd(...) # for output
#' # create a simulated data set with createSimulation and write to file
#' infile <- "ARF_compare_1_multisurf_0.8_bias_No_k_0.1_pct.signals_1000_num.attr_100_num.samp.csv"
#' convert.pec.sim.to.inbix(infile,"simulated1")
#' # writes simulated1.num and simualted1.pheno
#'
#' @export
# convert.pec.sim.to.inbix function
######################################################################################################
convert.pec.sim.to.inbix <- function(pEC.inputFile,inbix.file.prefix){
  
  ### write pEC simulated csv format to .num and .pheno format for inbix
  
  simdat <- read.csv(file=pEC.inputFile)
  # first column is X: subject id's case1, case2,...
  # class columns is class/status: 1, 1, -1, -1, ...
  nc <- ncol(simdat)
  num.attr <- nc - 2
  subjIDs <- as.character(simdat[,1])
  pheno <- as.numeric(simdat[, nc])  # class is -1/1, change to 1/2
  pheno[pheno==1]<-2
  pheno[pheno==-1]<-1
  predictors.mat <- simdat[,-nc]        # drop class (last) col
  predictors.mat <- predictors.mat[,-1]  # drop subj id (first) col
  attr.names <- colnames(predictors.mat)
  
  # for inbix .pheno. note: .pheno does not have header
  # write inbix .pheno
  phenosTable <- cbind(subjIDs, subjIDs, pheno)
  datasimInbixPhenoFile <- paste(inbix.file.prefix, ".pheno", sep="")
  write.table(phenosTable, datasimInbixPhenoFile, quote=F, sep="\t", 
              col.names=F, row.names=F)
  
  # .num has a header
  # write inbix numeric (.num) file/data set
  dataTable <- cbind(subjIDs, subjIDs, predictors.mat)
  colnames(dataTable) <- c("FID", "IID", attr.names)
  datasimInbixNumFile <- paste(inbix.file.prefix, ".num", sep="")
  write.table(dataTable, datasimInbixNumFile, quote=F, sep="\t", 
              col.names=T, row.names=F)
}
######################################################################################################
insilico/privateEC documentation built on May 22, 2020, 5:12 p.m.