R/CreateRandomExpRandomSigs.R

Defines functions Create1CatRandomExpRandomSigs GenerateRandomExpRandomSigs200 Resummarize GenerateAllRandomSP ParametersSPLike GenerateAllRandomSA ParametersSALike

Documented in Create1CatRandomExpRandomSigs GenerateAllRandomSA GenerateAllRandomSP ParametersSALike ParametersSPLike

# CreateRanomExpRandomSigs.R
#
# Generate a suite of data with random signatures and exposures with
# specications requested by Ludmil, 2019 08 25


#' Parameters for generating data with exposures resembling SignatureAnalyzer exposures.
#' @keywords internal

ParametersSALike <- function() {
  retval <- data.frame(
    total.num.sigs          = c(3,   5,   10,  15,  25),

    # These are chosen to ramp up to the mean standard deviation in
    # PCAWG tumors (15.525 and 6.172).
    mean.num.sigs.per.tumor = c(2.9,  4.9, 8,   11,   16),
    sd.num.sigs.per.tumor   = c(1,    2,   4,    5,    6),
    mean.log10.exp.per.sig  = c(3.45, 3.0, 2.7,  2.8,  2.6),
    sd.log10.exp.per.sig    = c(0.4,  0.5, 0.5,  0.4,  0.4)
  )

  return(retval)
}


#' Generate a full set of random data with characteristics somewhat like SignatureAnalyzer signatures and attribution.
#' @keywords internal
#'
#' Base parameters on SignatureAnalyzer attributions (exposures).

GenerateAllRandomSA <-
  function(top.level.dir = "../SA.like.rand.exp.rand.sigs.2019.10.29",
           overwite = TRUE,
           verbose = TRUE,
           num.replicates = 10) {
    log <- testthat::capture_messages(
      GenerateRandomExpRandomSigs200(
        parm                   = ParametersSALike(),
        top.level.dir          = top.level.dir,
        seed                  = 811211,
        num.replicates        = num.replicates))
    cat(log, file = file.path(top.level.dir, "log.txt"))
    invisible(Resummarize(top.level.dir))
  }


#' Parameters for generating data with exposures resembling SigProfiler exposures.
#' @keywords internal

ParametersSPLike <- function() {
  retval <- data.frame(
    total.num.sigs          = c(3, 5, 10, 15, 25),

    # Except for the first element, these are based on the observed mean and
    # standard deviation in PCAWG tumors.
    mean.num.sigs.per.tumor = c(1.5, 3.9,  3.9,   3.9,  3.9),
    sd.num.sigs.per.tumor   = c(0.5, 1.1,  1.1,   1.1,  1.1),
    mean.log10.exp.per.sig  = c(3.8, 3.2,  3.3,   3.3,  3.3),
    sd.log10.exp.per.sig    = c(0.4, 0.35, 0.34,  0.34, 0.34)
  )
  return(retval)
}


#' Generate a suite of data with random signatures and exposures.
#'
#' @keywords internal
#'
#' Base parameters on SigProfiler attributions (exposures).

GenerateAllRandomSP <-
  function(top.level.dir  = "../SP.like.rand.exp.rand.sigs.2019.10.29",
           overwrite      = TRUE,
           verbose        = TRUE,
           num.replicates = 10,
           unlink         = FALSE) {
    log <- testthat::capture_messages(
      GenerateRandomExpRandomSigs200(
        parm                   = ParametersSPLike(),
        top.level.dir          = top.level.dir,
        seed                   = 1211,
        num.replicates         = num.replicates))
    cat(log, file = file.path(top.level.dir, "log.txt"))
    invisible(Resummarize(top.level.dir))
  }


Resummarize <- function(top.level.dir) {
  summary <- data.table::fread(file.path(top.level.dir, "summary.csv"))
  s1 <- as.data.frame(summary)
  fut.rownames <-
    paste("nsig", s1$total.num.sigs,
          "rep.num", s1$replicate.number, sep = ".")
  s1 <- s1[ , -ncol(s1)]
  rownames(s1) <- fut.rownames
  s2 <- split(s1, s1$total.num.sigs)
  add.means <- function(x) {
    xx <- colMeans(x)
    xxx <- rbind(x, as.list(xx))
    # xxxx <- as.character(signif(xxx, digits = 4))
    xxxx <-
      apply(xxx, MARGIN = 2, function(x) as.character(signif(x, digits = 3)))
    retv <- rbind(xxxx, "")
    return(retv)
  }

  s3 <- lapply(s2, add.means)
  s4 <- do.call("rbind", s3)
  s.file <- file.path(top.level.dir, "summary2.csv")
  write.csv(s4, s.file)

  cat("This table has data rounded to 3 significant digits;",
      "see summary.csv for unrounded data",
      file = s.file, append = TRUE)

  cat("Synthetic data generated by ",
      "SynSigGen::GenerateAllRandomSA or",
      "SynSigGen::GenerateAllRandomSP on", date(), "\n\n",
      file = s.file, append = TRUE)

  cat("In real data\n",
      "mean total # mutations,17366\n",
      "sd,84673\n",
      "median,5259\n",
      "mad,5239\n\n",
      file = s.file, append = TRUE)

  utils::capture.output(sessionInfo(), file = s.file, append = TRUE)
  return(s4)
}


GenerateRandomExpRandomSigs200 <- function(parm,
                                           top.level.dir,
                                           overwrite      = TRUE,
                                           verbose        = TRUE,
                                           num.replicates = 1,
                                           seed) {
  MustCreateDir(top.level.dir, overwrite)
  set.seed(seed, sample.kind = "Rejection")
  if (verbose) RNGMessages("GenerateRandomExpRandomSigs200")

  summary.file <- file.path(top.level.dir, "summary.csv")
  cat("total.num.sigs",
      "target.mean.num.sigs.per.tumor",
      "target.sd.num.sigs.per.tumor",
      "replicate.number",
      "actual.mean.num.sigs.per.tumor",
      "actual.sd.num.sigs.per.tumor",
      "mean.num.muts.per.tumor",
      "mean.log10.muts",
      "sd.num.muts.per.tumor",
      "sd.log10.muts",
      "median.num.muts.per.tumor",
      "mad.num.muts.per.tumor",
      "trail\n",
      sep = ",", file = summary.file)

  for (i in 1:nrow(parm)) {
    total.num.sigs          <- parm[i, "total.num.sigs"]
    mean.num.sigs.per.tumor <- parm[i, "mean.num.sigs.per.tumor"]
    sd.num.sigs.per.tumor   <- parm[i, "sd.num.sigs.per.tumor"]
    mean.log10.exp.per.sig  <- parm[i, "mean.log10.exp.per.sig"]
    sd.log10.exp.per.sig    <- parm[i, "sd.log10.exp.per.sig"]
    if (verbose) {
      message("\n", paste(rep("=", 40), collapse = ""))
      message("\nGenerateRandomExpRandomSigs200\n",
              "target distribution parameters:",
              "\ntotal.num.sigs          = ", total.num.sigs,
              "\nmean.num.sigs.per.tumor = ", mean.num.sigs.per.tumor,
              "\nsd.num.sigs.per.tumor   = ", sd.num.sigs.per.tumor)
    }


    for (replicate.number in 1:num.replicates) {

      dir.name <-
        file.path(top.level.dir,
                  paste0("syn.data.", total.num.sigs, ".sigs"))
      if (num.replicates > 1) {
        dir.name <- paste0(dir.name, ".repnum.",
                           formatC(replicate.number, width = 2, flag = "0"))
      }

      if (verbose) {
        message("\n", paste(rep("-", 40), collapse = ""))
        message("replicate number ", replicate.number)
        message("directory is ", dir.name)

      }

      stats.on.sim.data <-
        Create1CatRandomExpRandomSigs(
          num.syn.tumors          = 200,
          total.num.sigs          = total.num.sigs,
          mut.mean                = mean.log10.exp.per.sig,
          mut.sd                  = sd.log10.exp.per.sig,
          mean.num.sigs.per.tumor = mean.num.sigs.per.tumor,
          sd.num.sigs.per.tumor   = sd.num.sigs.per.tumor,
          dir.name                = dir.name,
          overwrite               = overwrite,
          verbose                 = verbose)

      cat(total.num.sigs, mean.num.sigs.per.tumor, sd.num.sigs.per.tumor,
          replicate.number, stats.on.sim.data,
          "\n", sep = ",", file = summary.file, append = TRUE)

    }
  }
}


#' Create a catalog of "random" synthetic spectra for 96-channel mutation types.
#'
#' @param num.syn.tumors Total number of synthetic tumors to create.
#'
#' @param total.num.sigs Total number of signatures in the universe.
#'
#' @param mut.mean Mean of the log10 of the
#' number of mutations due to each signature.
#'
#' @param mut.sd Standard deviation of the log10 of
#'  the number of mutations due to each signature.
#'
#' @param mean.num.sigs.per.tumor Mean number of signatures
#'        contributing to each tumor.
#'
#' @param sd.num.sigs.per.tumor Standard deviation the number of signatures
#' in each tumor.
#'
#' @param sig.name.prefix String to put in front of an integer (as
#' string) to form an identifier for a synthetic signature.
#'
#' @param sample.name.prefix String to put in front of an integer (as
#' string) to form an identifier for a synthetic sample (tumor).
#'
#' @param dir.name A string indicating the name of the 96-channel
#' subdirectory; probably one of \code{"sa.sa.COMPOSITE"} or
#' \code{"sp.sa.COMPOSITE"}.
#'
#' @param COMPOSITE.features Character vector containing
#' rownames for a COMPOSITE signature or catalog.
#'
#' @param overwrite If \code{TRUE} overwrite existing directories / files.
#'
#' @keywords internal

Create1CatRandomExpRandomSigs <-
  function(num.syn.tumors,
           total.num.sigs,
           mut.mean,
           mut.sd,
           mean.num.sigs.per.tumor,
           sd.num.sigs.per.tumor,
           dir.name,
           overwrite = FALSE,
           verbose   = TRUE) {

    sigs <-
      CreateRandomMutSigProfiles(
        ICAMS::catalog.row.order[["SBS96"]], total.num.sigs,
        sig.name.prefix = "RandSig")

    sig.info <- CreateMeanAndStdevForSigs(
      total.num.sigs, mut.mean, mut.sd, colnames(sigs))

    exp <-
      CreateRandomExposures(
        num.exposures           = num.syn.tumors,
        mean.num.sigs.per.tumor = mean.num.sigs.per.tumor,
        sd.num.sigs.per.tumor   = sd.num.sigs.per.tumor,
        total.num.sigs          = total.num.sigs,
        per.sig.mean.and.sd     = sig.info,
        sample.name.prefix      = "S",
        sigs                    = sigs,
        verbose                 = verbose)

    NewCreateAndWriteCatalog(
      sigs      = sigs,
      exp       = exp,
      dir       = dir.name,
      overwrite = overwrite)

    muts.per.tumor = colSums(exp)

    return(c(
      actual.sig.num.mean = attr(exp, "actual.sig.num.mean"),
      actual.sig.num.sd = attr(exp, "actual.sig.num.sd"),

      mean.num.muts.per.tumor = mean(muts.per.tumor),
      mean.log10.num = mean(log10(muts.per.tumor)),
      sd.num.muts.per.tumor = sd(muts.per.tumor),
      sd.log10.num = sd(log10(muts.per.tumor)),
      median.num.muts.per.tumor = stats::median(muts.per.tumor),
      mad.num.muts.per.tumor    = stats::mad(muts.per.tumor)

      ))
  }
steverozen/SynSigGen documentation built on April 1, 2022, 8:54 p.m.