R/2_simulatedata.R

Defines functions SimulateData

#' Simulate Differentially Methylated Regions (DMRs) in Methylation Data
#'
#' @description Generate a methylation dataset where treatment effects are added 
#'    to beta values in one group of samples for some randomly selected
#'    co-methylated clusters. 
#'    
#' @param beta_mat A beta value matrix for methylation samples from a
#'    450k methylation array with CpG IDs as the row names and sample IDs as
#'    the column names. An example is given in the \code{betaVals_mat} data set.
#'    
#' @param Aclusters_df A data frame of beta values and CpG information for
#'    clusters of CpGs over a 450k methylation array. The rows correspond to the
#'    CPGs. The columns have information on the cluster number, chromosome,
#'    cluster start and end locations, and the beta values for each subject
#'    grouped by some clinical indicator (e.g. case v. control). An example is
#'    given in the \code{startEndCPG_df} data set. This data set can be
#'    generated by the file \code{/inst/1_Aclust_data_import.R}
#'    
#' @param delta_num The treatment size: a non-negative real number to add to
#'    the beta values within randomly-selected clusters for the first
#'    \code{numEx_int} samples. This artifically creates differentially-
#'    methylated regions (DMRs).
#'    
#' @param seed_int The seed value passed to the \code{\link[base]{Random}}
#'    function to enable reproducible results
#'    
#' @param betaCols_idx The column numbers of the \code{Aclusters_df} data frame
#'    in which beta values for each subject are stored. 
#'    
#' @param numEx_int The number of samples in the first group. This
#'    function assumes that samples in one group are contiguous columns of the
#'    \code{Aclusters_df} data frame.
#'    
#' @param numClusters_int The total number of randomly selected clusters, for
#'    which the treatment effect, \code{delta_num}, is then added
#'
#' @return A list with two elements:
#'    \itemize{
#'      \item{\code{simBetaVals_df} : }{A data frame of beta values after
#'        treatment effects were added, used for input for different DMR-finding
#'        methods. Note this dataset includes all the CpGs on the array.}
#'      \item{\code{simAclusters_df} : }{A data frame of the methylation values
#'        only for clusters identified by \code{Aclust} and indicator variable
#'        for whether treatment effects were added. Note this has only CpGs
#'        mapped to all the clusters found by the \code{Aclust} method.}
#'    }
#'
#'
#' @export
#'
#' @examples
#' \dontrun{
#'    data("startEndCPG_df")
#'    data("betaVals_mat")
#'
#'    SimulateData(beta_mat = betaVals_mat,
#'                 Aclusters_df = startEndCPG_df,
#'                 delta_num = 0.4,
#'                 seed_int = 12345)
#' }
SimulateData <- function(beta_mat, Aclusters_df,
                         delta_num, seed_int,
                         betaCols_idx = 9:22,
                         numEx_int = 7, numClusters_int = 500){

  ###  Setup  ###
  # set the seed value from the 'rp'-th element from the vector 'seed_values'
  set.seed(seed_int)

  # randomly pick numClusters_int clusters
  clusts_int <- Aclusters_df$Clusternumber
  randClusts_int <- sample(max(clusts_int), numClusters_int)

  # full data info of randomly picked clusters
  inflateClust_df <- Aclusters_df[which(clusts_int %in% randClusts_int), ]
  bval_mat <- as.matrix(inflateClust_df[, betaCols_idx])


  ### Check average b-val/m-val group-wise in each cpg, and add delta value ###
  if (delta_num > 0) {

    # for all cpgs of numClusters_int random picked clusters
    for (i in 1:nrow(bval_mat)) {

      avgbetagr1 <- mean(bval_mat[i, 1:numEx_int])
      avgbetagr2 <- mean(bval_mat[i, (numEx_int + 1):ncol(bval_mat)])

      # Switch statement to ensure that adding delta only increases the signal
      if (avgbetagr1 > avgbetagr2) {
        bval_mat[i, 1:numEx_int] <- bval_mat[i, 1:numEx_int] + delta_num
      } else {

        bval_mat[i, (numEx_int + 1):ncol(bval_mat)] <-
          bval_mat[i, (numEx_int + 1):ncol(bval_mat)] + delta_num

      }

    } # END for()

    # If any entries of the resultant bval_mat, after adding delta_num, are
    #   greater than 1, replace them by 1 (the max beta score)
    bval_mat[bval_mat >= 1] <- 0.999

  } # END if()
  bval_df <- as.data.frame(bval_mat)

  inflateClust_df$actual <- "positive"


  ###  The other clusters  ###
  # Full data info of remaining CPGs, i.e. except randomly picked clusters
  nonInflateClust_df <-
    Aclusters_df[!(rownames(Aclusters_df) %in% inflateClust_df$cpg), ]
  nonInflateClust_df$actual <- "negative"

  # Full data info (beta values) of remaining CPGs, i.e. except randomly picked
  #   clusters
  nonInflateBvals_mat <-
    beta_mat[!(rownames(beta_mat) %in% inflateClust_df$cpg), ]


  ###  Combine Treated and Untreated Clusters  ###
  # Combine the CPGs (belonging to selected clusters) and remianing cpgs
  #   (belonging to non-selected clusters)
  treatedAclusters_df <- rbind(inflateClust_df, nonInflateClust_df)
  # Combine the treated (delta-value added) CPGs (belonging to the clusters
  #   selected at random) and untreated (delta-value not added) CPGs (belonging
  #   to the non-selected clusters)
  betaResults_df <- rbind(bval_df, nonInflateBvals_mat)


  ###  Order Columns and Rows  ###
  # Reorder the 'actual' column to previous of beta values
  treatedAclusters_df <- as.data.frame(treatedAclusters_df)
  impVars_df <- treatedAclusters_df[c("Clusternumber", "cpg", "CHR", "MAPINFO",
                                      "start_position", "end_position",
                                      "coordinate_37", "chromosome", "actual")]
  otherVars_df <- treatedAclusters_df[setdiff(names(treatedAclusters_df),
                                              names(impVars_df))]
  treatedAclusters_df <- cbind(impVars_df, otherVars_df)

  # Reorder the rows (CPGs) according to clusternumber
  treatedAclustCPGordered_df <-
    treatedAclusters_df[order(treatedAclusters_df$Clusternumber), ]

  if (delta_num == 0) {
    treatedAclustCPGordered_df$actual <- "negative"
  }


  ###  Return  ###
  list(simBetaVals_df = betaResults_df,
       simAclusters_df = treatedAclustCPGordered_df)

}
gabrielodom/DMRcomparePaper documentation built on May 25, 2019, 2:52 a.m.