R/CreateSynData.R

Defines functions AddNoise NewCreateAndWriteCatalog MustCreateDir CreateAndWriteCatalog GenerateSynFromReal GenerateSynAbstract OutDir MergeExposures Merge2Exposures CreateSynCatalogs GenerateSynExposureOneSample present.sigs AssignPresentAbsentOneSample GenerateSyntheticExposures WriteSynSigParams GetSynSigParamsFromExposuresOld GetSynSigParamsFromExposures GetMutationType SynSigParamsOneSignature

Documented in AddNoise AssignPresentAbsentOneSample CreateAndWriteCatalog CreateSynCatalogs GenerateSynAbstract GenerateSynExposureOneSample GenerateSynFromReal GenerateSyntheticExposures GetSynSigParamsFromExposures Merge2Exposures MergeExposures NewCreateAndWriteCatalog OutDir present.sigs SynSigParamsOneSignature WriteSynSigParams

#' @title Extract  parameters for one mutational signature profile
#'
#' @param counts  A vector of mutation counts attributed to one signature across
#'                length(counts) samples. TODO(Steve): rename to exposures
#'
#' @param target.size The length of genomic sequence from which the counts
#'                were derived, in megabases. Deprecated, set this to 1.
#'
#' @param distribution Probability distribution used to fit exposures due to one
#'   mutational signature. Can be \code{neg.binom} which stands for negative
#'   binomial distribution. If \code{NULL} (Default), then this function uses
#'   log normal distribution with base 10.
#'
#' @return
#' * For log normal distribution, a 3-element vector with names "prob", "mean",
#' and "stdev".
#'
#' * For negative binomial distribution, a 3-element vector with
#' names "prob", "size", and "mu".
#'
#' @importFrom stats sd
#'
#' @md
#'
#' @keywords internal

SynSigParamsOneSignature <- function(counts, target.size = 1, distribution = NULL) {
  prevalence <-  length(counts[counts >= 1 ]) / length(counts)

  if (is.null(distribution)) {
    counts.per.mb <- counts[counts >= 1 ] / target.size

    ## generate log10(mut/mb) values for mean and sd
    mean.per.mb <- mean(log10(counts.per.mb))
    sd.per.mb <- sd(log10(counts.per.mb))
    return(c(prob = prevalence, mean = mean.per.mb, stdev = sd.per.mb))
  } else if (distribution == "neg.binom") {
    counts.per.mb <- counts[counts >= 1 ] / target.size

    if (length(counts.per.mb) == 1) {
      # If there is only one data point, don't try to fit the data
      # But use the original mutation count be the value of parameter mu
      names(counts.per.mb) <- NULL
      return(c(prob = prevalence, size = NA, mu = counts.per.mb))
    } else {
      fit <- fitdistrplus::mledist(counts.per.mb, distr = "nbinom")

      names(fit$estimate) <- NULL
      return(c(prob = prevalence, size = fit$estimate[1], mu = fit$estimate[2]))
    }
  } else {
    stop("Only 'neg.binom' distribution is supported")
  }
}

#' @keywords internal
GetMutationType <- function(sig.name) {
  if (all(grepl(pattern = "SBS", x = sig.name))) {
    return("SBS96")
  } else if (all(grepl(pattern = "DBS", x = sig.name))) {
    return("DBS78")
  } else if (all(grepl(pattern = "ID", x = sig.name))) {
    return("ID")
  }
}

#' @title Empirical estimates of key parameters describing exposures due to signatures.
#'
#' @param exposures A matrix in which each column is a sample and each row is a mutation
#'         signature, with each element being the "exposure",
#'         i.e. mutation count attributed to a
#'         (sample, signature) pair.
#'
#' @param verbose If > 0 cat various messages.
#'
#' @param distribution Probability distribution used to fit exposures due to one
#'   mutational signature. Can be \code{neg.binom} which stands for negative
#'   binomial distribution. If \code{NULL} (Default), then this function uses
#'   log normal distribution with base 10.
#'
#' @param cancer.type Optional argument specifying the cancer type of the
#'   samples being analyzed.
#'
#' @param sig.params Empirical signature parameters generated using real
#'   exposures irrespective of their cancer types. If there
#'   is only one tumor having a signature in a cancer type in \code{exposures},
#'   we cannot fit the \code{distribution} to only one data point. Instead, we
#'   will use the empirical parameter \code{size} from \code{sig.params}.
#'   Users can use \code{SynSigGen:::GetSynSigParamsFromExposuresOld} to generate
#'   their own signature parameters. If \code{NULL}(default), this function uses the
#'   PCAWG7 empirical signature parameters. See \code{signature.params} for more details.
#'
#' @return
#' * For log normal distribution,
#' a data frame with one column for
#' each of a subset of the input signatures
#' and the following rows
#' \describe{
#' \item{prob}{The proportion of tumors with the signature.}
#' \item{mean}{The mean(log_10(number of mutations)).}
#' \item{stdev}{The stdev(log_10(number of mutations)).}
#' }
#' Signatures not present in
#'  \code{exposures} or present only in a single tumor in
#'  \code{exposures} are removed.
#'
#' * For negative binomial distribution,
#' a data frame with one column for
#' each of a subset of the input signatures
#' and the following rows
#' \describe{
#' \item{prob}{The proportion of tumors with the signature.}
#' \item{size}{Dispersion parameter.}
#' \item{mu}{Mean.}
#' }
#'
#' @md
#'
#' @export

GetSynSigParamsFromExposures <-
  function(exposures, verbose = 0, distribution = NULL, cancer.type = NULL,
           sig.params = NULL) {
    stopifnot(ncol(exposures) > 0)
    integer.counts <- round(exposures, digits = 0)
    integer.counts <- integer.counts[rowSums(integer.counts) > 0 , , drop = FALSE]
    ret1 <- apply(X            = integer.counts,
                  MARGIN       = 1,
                  FUN          = SynSigParamsOneSignature,
                  distribution = distribution)

    ret2 <- sapply(rownames(integer.counts), FUN = function(x) {
      sig.name <- x
      exposure.one.sig <- integer.counts[sig.name, ]
      retval <- SynSigParamsOneSignature(counts = exposure.one.sig,
                                         distribution = distribution)
      return(retval)
    })

    if (is.null(distribution)) {
      # Some standard deviations can be NA (if there is only one tumor
      # with mutations for that signature). We pretend we did not see
      # these signatures. TODO(Steve): impute from similar signatures.
      if (any(is.na(ret1['stdev', ]))) {
        if (verbose > 0) {
          cat("\nAnalyzing samples", cancer.type)
          cat("\nWarning, some signatures present in only one sample, dropping:\n")
          cat(colnames(ret1)[is.na(ret1['stdev', ])], "\n")
        }
      }
      retval <- ret1[,!is.na(ret1['stdev',]) , drop = FALSE]
    } else if (distribution == "neg.binom") {
      # Some parameter estimates can be NA (if there is only one tumor
      # with mutations for that signature). We pretend we did not see
      # these signatures.
      if (any(is.na(ret1['size', ]))) {
        rare.sig.names <- colnames(ret1)[is.na(ret1['size', ])]
        if (verbose > 0) {
          cat("\nAnalyzing samples", cancer.type)
          cat("\nWarning, some signatures present in only one sample:\n")
          cat(rare.sig.names, "\n")
          cat("Using the empirical signature parameters 'size' from all cancer types\n")
        }
        mutation.type <- GetMutationType(sig.name = rare.sig.names)
        retval <- ret1
        if (is.null(sig.params)) {
          sig.params <- SynSigGen::signature.params[[mutation.type]]
        }

        sig.with.no.params <-
          setdiff(rare.sig.names, colnames(sig.params))
        if (length(sig.with.no.params) > 0) {
          cat("\nWarning, some signatures present in only one sample across all the cancer types, dropping:\n")
          cat(sig.with.no.params, "\n")
          rare.sig.names <- setdiff(rare.sig.names, sig.with.no.params)
          retval <- retval[, !colnames(retval) %in% sig.with.no.params]
        }

        # Only use the size parameter from sig.params
        retval["size", rare.sig.names] <- sig.params["size", rare.sig.names]
        # retval["mu", rare.sig.names] <- sig.params["mu", rare.sig.names]
      } else {
        retval <- ret1
      }
    }

    if (ncol(retval) == 0) {
      stop("No signatures with usable parameters (> 1 sample with exposure) in samples ",
           cancer.type)
    }
    return(retval)
  }

#' @keywords internal
GetSynSigParamsFromExposuresOld <-
  function(exposures, verbose = 0, distribution = NULL, cancer.type = NULL) {
    stopifnot(ncol(exposures) > 0)
    integer.counts <- round(exposures, digits = 0)
    integer.counts <- integer.counts[rowSums(integer.counts) > 0 , , drop = FALSE]
    ret1 <- apply(X            = integer.counts,
                  MARGIN       = 1,
                  FUN          = SynSigParamsOneSignature,
                  distribution = distribution)

    ret2 <- sapply(rownames(integer.counts), FUN = function(x) {
      sig.name <- x
      exposure.one.sig <- integer.counts[sig.name, ]
      retval <- SynSigParamsOneSignature(counts = exposure.one.sig,
                                         distribution = distribution)
      return(retval)
    })

    if (is.null(distribution)) {
      # Some standard deviations can be NA (if there is only one tumor
      # with mutations for that signature). We pretend we did not see
      # these signatures. TODO(Steve): impute from similar signatures.
      if (any(is.na(ret1['stdev', ]))) {
        if (verbose > 0) {
          cat("\nAnalyzing samples", cancer.type)
          cat("\nWarning, some signatures present in only one sample, dropping:\n")
          cat(colnames(ret1)[is.na(ret1['stdev', ])], "\n")
        }
      }
      retval <- ret1[,!is.na(ret1['stdev',]) , drop = FALSE]
    } else if (distribution == "neg.binom") {
      # Some parameter estimates can be NA (if there is only one tumor
      # with mutations for that signature). We pretend we did not see
      # these signatures.
      if (any(is.na(ret1['size', ]))) {
        if (verbose > 0) {
          cat("\nAnalyzing samples", cancer.type)
          cat("\nWarning, some signatures present in only one sample, dropping:\n")
          cat(colnames(ret1)[is.na(ret1['size', ])], "\n")
        }
      }
      retval <- ret1[,!is.na(ret1['size',]) , drop = FALSE]
    }

    if (ncol(retval) == 0) {
      stop("No signatures with usable parameters (> 1 sample with exposure) in samples ",
           cancer.type)
    }
    return(retval)
  }

#' @title Write key parameters describing exposures due to a signature to a file.
#'
#' The parameters written are prevalence, mean(log(exposure)), and sd(log(exposure)).
#'
#' @param params The parameters to write.
#'
#' @param file   The path to the file to write.
#'
#' @param append Whether to append to or overwrite \code{file} if it already
#'        exists.
#'
#' @param col.names If NA, add column names.
#'
#' @importFrom utils write.table
#'
#' @export

# Needs to be exported for e.g. Create.pancreas.Rmd

WriteSynSigParams <-
  function(params, file, append = FALSE,
           col.names = ifelse(append, FALSE, NA)) {
    # Suppress warning about writing column names
    # on an append.
    suppressWarnings(
      write.table(x = as.data.frame(params),
                  file = file,
                  sep = ",",
                  col.names = col.names,
                  row.names = TRUE,
                  append = append)
    )
  }

#' @title Create synthetic exposures based given parameters
#'
#' @return A matrix with the rows being each signature and the columns being
#' generated samples. Each entry is the count of mutations due to one
#' signature in one sample.
#'
#' @inheritParams GenerateSynExposureOneSample
#'
#' @param num.samples Number of samples to generate
#'
#' @param name Prefix for sample identifiers in the simulated dataset
#'
#' @param sig.matrix Signature matrix to construct synthetic tumors
#'
#' @param distribution Probability distribution used to generate synthetic
#'   exposures due to active mutational signatures. Can be \code{neg.binom}
#'   which stands for negative binomial distribution. If \code{NULL} (Default),
#'   then this function uses log normal distribution with base 10.
#'
#' @param round.exposure Whether the exposures should be rounded to
#' an integer. Set as \code{FALSE} only when reproducing legacy data
#' sets.
#'
#' @export

GenerateSyntheticExposures <-
  function(sig.params,
           num.samples = 10,
           name = 'synthetic',
           sig.matrix = NULL,
           distribution = NULL,
           round.exposure = TRUE) {

    sigs <- colnames(sig.params)
    stopifnot(!is.null(sigs))
    prev.present <- unlist(sig.params['prob', ]) # Note, get a vector
    sig.present <- present.sigs(num.samples, prev.present)
    colnames(sig.present) <- paste(name, seq(1, num.samples), sep = '.')

    # Create a synthetic exposures for each column (sample)
    # in sig.present.
    if (is.null(distribution)) {
      sig.burden <- sig.params['mean', , drop = FALSE]
      sig.sd <- sig.params['stdev', , drop = FALSE]
      retval <-
        apply(sig.present,
              2,
              GenerateSynExposureOneSample,
              sigs,
              sig.burden, ## burden is in mutation per megabase
              sig.sd,
              sig.matrix,
              round.exposure = round.exposure)
    } else if (distribution == "neg.binom") {
      retval <-
        apply(sig.present,
              2,
              GenerateSynExposureOneSample,
              sigs,
              NULL,
              NULL,
              sig.matrix,
              distribution,
              sig.params,
              round.exposure = round.exposure)
    }

    # Remove signatures that have zero exposure
    retval1 <- retval[rowSums(retval) > 0, , drop = FALSE]

    return(retval1)
  }

#' Randomly assign present / absent to each of a set of signatures, and
#' keep trying until at least one is present
#'
#' @param prev.present  Vector of prevalences,
#'   each the prevalence of 1 mutational
#'   signature.
#'
#' @param verbose If > 0, cat some possibly informative messages
#'
#' @return a vector of 0s and 1s of length
#' \code{length(prev.present)}, and for which
#' \code{sum(prev.present) > 0}.
#'
#' @importFrom stats rbinom
#'
#' @keywords internal
AssignPresentAbsentOneSample <- function(prev.present, verbose = 0) {
  v <- numeric(length(prev.present))
  while (sum(v) < 1) {
    v <- rbinom(length(prev.present), 1, prev.present)
  }
  if (verbose > 0)
    cat("\nAssignPresentAbsentOneSample returning ", v, "\n\n")
  return(v)
}


#' @title Decide which signatures will be present in
#'  the catalogs of synthetic tumors.
#'
#' @param num.tumors Number of tumors to generate
#'
#' @param prev.present Vector of prevalences,
#'   each the prevalence of 1 mutational
#'   signature. The names are the names of the
#'   signatures.
#'
#' @return A matrix with one row
#' per signature and one column per tumor, with
#' 1 in a cell indicated the presence of a signature
#' and 0 indicating absence.
#'
#' @importFrom stats rbinom
#'
#' @keywords internal

present.sigs <-
  function(num.tumors, prev.present){

    num.process <- length(prev.present)

    present.list <- list()

    for (i in 1:num.process) {
      present.each <- rbinom(num.tumors,
                             1,
                             prob = prev.present[i])
      present.list[[i]] <-  present.each
    }

    present <- do.call(rbind,present.list)

    rownames(present) <- names(prev.present)

    # If the column for one tumor has only 0s,
    # re-sample until there is at least on non-0
    # signature.
    for (tumor in 1:ncol(present)) {
      if (all(present[,tumor] == rep(0, length(nrow(present))))) {
        # present['SBS1',tumor] = 1
        present[ , tumor] <-
          AssignPresentAbsentOneSample(prev.present)
      }
    }
    return(present)
  }


#' @title Using parameters given to generate exposures for synthetic tumors
#'
#' @param tumor Signature presence matrix or exposure matrix for a tumor.
#' It has only one row, and K (# of signatures) columns.
#' Value in each column is the presence flag for a mutational signature:
#' the value can be non-zero(signature is present) or 0(absent).
#' The name of each column should be the name of a signature.
#'
#' @param sig.interest Names of mutational signatures you want to use to
#' generate exposures. It can be all, or part of signatures in colnames(tumor).
#'
#' @param burden.per.sig Mean mutation burden a log10 of the
#' counts of mutations per megabase.
#' It has one row, and K columns. Each column name refers to a mutational signature.
#'
#' @param sd.per.sig standard deviation of mutation burden.
#' It has one row, and K columns. Each column name refers to a mutational signature.
#'
#' @param sig.matrix Signature matrix to construct synthetic tumors.
#'
#' @param distribution Probability distribution used to generate synthetic
#'   exposures due to active mutational signatures. Can be \code{neg.binom}
#'   which stands for negative binomial distribution. If \code{NULL} (Default),
#'   then this function uses log normal distribution with base 10.
#'
#' @param sig.params Parameters from \code{\link{GetSynSigParamsFromExposures}}
#'   or another source. Should be
#'   a matrix or data frame with one column for
#'   each signature and the following rows:
#'
#'   * For log normal distribution,
#' \describe{
#' \item{prob}{The proportion of tumors with the signature.}
#' \item{mean}{The mean(log_10(number of mutations)).}
#' \item{stdev}{The stdev(log_10(number of mutations)).}
#' }
#'
#'  * For negative binomial distribution,
#' \describe{
#' \item{prob}{The proportion of tumors with the signature.}
#' \item{size}{Dispersion parameter.}
#' \item{mu}{Mean.}
#' }
#'
#'   The rownames need to be the column names of a signature
#'   catalog.
#'
#' @param round.exposure Whether the exposures should be rounded to
#' an integer. Set as \code{FALSE} only when reproducing legacy data
#' sets.
#'
#' @details Determine the intensity of each
#' mutational signature in a tumor, returning the number of mutations
#' using the mean mutation burden per signature and the std dev
#'
#' @md
#'
#' @importFrom stats rnorm
#'
#' @keywords internal
GenerateSynExposureOneSample <-
  function(tumor,
           sig.interest,
           burden.per.sig,
           sd.per.sig,
           sig.matrix = NULL,
           distribution = NULL,
           sig.params = NULL,
           round.exposure = TRUE
  ) {

    ## starts with individual tumors, only generate exposures for signatures
    ## with a flag does not equal to 0.
    active.sigs <- base::which(tumor != 0)

    GenerateSynExposureFromDistribution <-
      function(tumor, sigs, burden.per.sig, sd.per.sig,
               distribution = NULL, sig.params = NULL) {
        if (is.null(distribution)) {
          stdev <- sd.per.sig[,sigs]
          burden <- burden.per.sig[,sigs]

          ## if std dev is too big, >= 3, max = 3
          ### consider handling this different. the worry is that the variation
          ##  is too large, the sampled mutation burden will be very high,
          ### which will have a mutation burden that is not biologically possible
          if (stdev >= 3) {
            cat("Very large stdev", stdev, "\n")
            stdev = 3
          }

          ## mutational intensity follows a log normal distribution
          ## use the normal distribution with log-ed values instead
          return(10^(rnorm(1, sd = stdev, mean = burden)))
        } else if (distribution == "neg.binom") {
          if (is.null(sig.params)) {
            stop("sig.params cannot be NULL when distribution is non-NULL")
          }

          param.not.available <- setdiff(c("size", "mu"), rownames(sig.params))
          if (length(param.not.available) > 0) {
            stop("Parameter ", paste(param.not.available, collapse = " "),
                 " not available in sig.params")
          }

          size <- sig.params["size", sigs]
          mu <- sig.params["mu", sigs]
          count <- stats::rnbinom(n = 1, size = size, mu = mu)

          # sigs is active mutational signature, so resample until count is
          # greater than 0
          while(count == 0) {
            count <- stats::rnbinom(n = 1, size = size, mu = mu)
          }
          return(count)
        }
      }

    for (sigs in active.sigs) {
      tumor[sigs] <-
        GenerateSynExposureFromDistribution(tumor = tumor,
                                            sigs = sigs,
                                            burden.per.sig = burden.per.sig,
                                            sd.per.sig = sd.per.sig,
                                            distribution = distribution,
                                            sig.params = sig.params)
    }

    # Round the mutations due to each signature as non integer values of
    # mutations is impossible in biology.
    #
    # However, rounding should be disabled to reproduce legacy data sets.
    if(round.exposure) {
      tumor <- round(tumor)
    }

    tumor <- as.matrix(tumor)
    names(tumor) <- sig.interest

    if (!is.null(sig.matrix)) {
      sigs.to.use <- sig.matrix[, names(tumor), drop = FALSE]
      catalog <- sigs.to.use %*% tumor
      i.cat <- round(catalog, digits = 0)

      # Resample the exposures until the mutation counts for one sample is not zero
      while (colSums(i.cat) == 0) {
        for (sigs in active.sigs) {
          tumor[sigs] <-
            GenerateSynExposureFromDistribution(tumor = tumor,
                                                sigs = sigs,
                                                burden.per.sig = burden.per.sig,
                                                sd.per.sig = sd.per.sig,
                                                distribution = distribution,
                                                sig.params = sig.params)
        }

        # Round the mutations due to each signature as non integer values of
        # mutations is impossible in biology
        tumor <- round(tumor)

        tumor <- as.matrix(tumor)
        names(tumor) <- sig.interest

        sigs.to.use <- sig.matrix[, names(tumor), drop = FALSE]
        catalog <- sigs.to.use %*% tumor
        i.cat <- round(catalog, digits = 0)
      }
    }

    return(tumor)
  }

#' @title Generate synthetic spectra catalogs given
#' signature profiles and synthetic exposures.
#'
#' @param signatures The signature profiles.
#'
#' @param exposures The synthetic exposures.
#'
#' @param sample.id.suffix A string for adding a suffix to
#'  sample ID. For example, if sample.id.suffix is "abc",
#'  then SomeCancerType::s1.33 is changed to
#'  SomeCancerType::s1-abc.33. Actually, this just replaces
#'  the first "." in the sample id with "-" concatenated
#'  to sample.id.suffix. TODO(Steve): probably drop this
#'
#' @return A list of three elements that comprise the
#' synthetic data: \enumerate{
#'  \item \code{ground.truth.catalog}: Spectra catalog for
#'  the software input.
#'  \item \code{ground.truth.signatures}: Signatures active
#'  in \code{ground.truth.catalog}.
#'  \item \code{ground.truth.exposures}: Exposures of \code{ground.truth.signatures}
#'  in \code{ground.truth.catalog}.
#' }
#'
#' @export

CreateSynCatalogs <-
  function(signatures, exposures, sample.id.suffix = NULL) {

    # if (any(colSums(exposures) < 1)) warning("Some exposures < 1")

    exposed.sigs <- rownames(exposures)

    # It is an error if there are signatures in exposures that are not
    # in signatures.
    stopifnot(setequal(setdiff(exposed.sigs, colnames(signatures)), c()))

    # VERY IMPORTANT, the next statement guarantees that
    # the order of signatures in rows of exposures is the same as
    # the order of columns in signatures. In addition,
    # it ensure that signatures contains only signatures
    # that are present in exposures.
    #
    signatures <- signatures[ , exposed.sigs]

    # TODO(Steve): in a future versions, for each
    # synthetic tumor, for each signature, accounting
    # for e mutations in that tumor, sample e mutations
    # from the signature profile.
    catalog <- signatures %*% exposures

    i.cat <- round(catalog, digits = 0)

    if (any(colSums(i.cat) == 0))
      warning("Some tumors with 0 mutations")

    if (!is.null(sample.id.suffix)) {
      newcolnames <-
        gsub(".", paste0("-", sample.id.suffix, "."),
             colnames(i.cat), fixed = TRUE)
      stopifnot(colnames(i.cat == colnames(exposures))) #NEW
      colnames(i.cat) <- newcolnames
      colnames(exposures) <- newcolnames
    }

    ## Convert ground.truth.catalog and ground.truth.signatures
    ## into ICAMS acceptable catalogs before outputting the list
    i.cat <- ICAMS::as.catalog(
      object = i.cat,
      # ref.genome = "hg19",
      # WARNING, there
      abundance = NULL,
      # ICAMS::all.abundance$BSgenome.Hsapiens.1000genomes.hs37d5$genome$`96`,
      region = "genome",
      catalog.type = "counts")

    signatures <- ICAMS::as.catalog(
      object = signatures,
      # ref.genome = "hg19",
      abundance = NULL,
      #  ICAMS::all.abundance$BSgenome.Hsapiens.1000genomes.hs37d5$genome$`96`,
      region = "genome",
      catalog.type = "counts.signature")

    ## Return a list with:
    ## $ground.truth.catalog: Spectra catalog for the software input
    ## $ground.truth.signatures: Signatures active in $ground.truth.catalog
    ## $ground.truth.exposures: Exposures of $ground.truth.signatures in
    ## $ground.truth.catalog.
    return(list(ground.truth.catalog = i.cat,
                ground.truth.signatures = signatures,
                ground.truth.exposures = exposures))
    # TODO(Steve) In future, add noise
  }


#' Merge 2 exposure matrices
#'
#' @param exp1 An exposure matrix
#'
#' @param exp2 An exposure matrix
#'
#' @return The column-wise merge of the two input matrices as
#' with all rownames from either matrix preserved and
#' corresponding entries filled with 0s.
#'
#' @keywords internal
Merge2Exposures <- function(exp1, exp2) {
  # Rows are signatures
  exp.m <- merge(exp1, exp2, by = 0, all = TRUE)
  rownames(exp.m) <- exp.m[ ,1]
  exp.m <- exp.m[ , -1]
  exp.m[is.na(exp.m)] <- 0
  return(as.matrix(exp.m))
}

#' Merge all exposure matrices in a list of matrices
#'
#' @param list.of.exposures A list of exposure matrices
#'
#' @return The column-wise merge of all the input matrices
#' with all rownames from all matrices preserved and
#' corresponding entries filled with 0s.
#'
#' @export

MergeExposures <- function(list.of.exposures) {
  stopifnot(length(list.of.exposures) > 0)
  if (length(list.of.exposures) == 1) {
    return(as.matrix(list.of.exposures[[1]]))
  }
  start <- list.of.exposures[[1]]
  for (i in 2:length(list.of.exposures)) {
    start <- Merge2Exposures(start, list.of.exposures[[i]])
  }
  start2 <- start[SortSigId(rownames(start)), , drop = FALSE]
  return(as.matrix(start2))
}

#' Create file names in a given directory
#'
#' The directory is provided by the global
#' variable \code{OutDir.dir},
#' which \strong{must} be set by the user. If
#' \code{OutDir.dir} is NULL then just return
#' \code{file.name}.
#'
#' @param file.name The name of the that will be
#' prefixed by \code{OutDir.dir}.
#'
#' @return \code{file.name} prefixed by \code{OutDir.dir}.
#'
#' @export
# Paste of OutDir.dir / file.name (or just file.name)
OutDir <- function(file.name) {
  warning("Use of function OutDir is deprecated")
  if (!exists("OutDir.dir")) return(file.name)
  if (is.null(OutDir.dir)) return(file.name)
  tmp <- OutDir.dir
  n <- nchar(tmp)
  if (substr(tmp, n, n) != "/")
    tmp <- paste0(tmp, "/")
  if (!dir.exists(tmp)) {
    dir.create(tmp)
  }
  return(paste0(tmp, file.name))
}


#' Generate synthetic exposures from abstract parameters.
#'
#' Checkpoints the parameters and the synthetic
#' exposures to files. It also checks that the parameters
#' inferred from the synthetic data approximate those
#' inferred from \code{real.exp}.
#'
#' @param parms The actual exposures upon which to base
#' the parameters and synthetic exposures.
#'
#' @param num.syn.tumors Generate this number of synthetic tumors.
#'
#' @param file.prefix Prepend this to output filenames
#'  to indicate the organization of the data.
#'
#' @param sample.id.prefix Prefix for sample identifiers for the
#' synthetic samples.
#'
#' @param round.exposure Whether the exposures should be rounded to
#' an integer. Set as \code{FALSE} only when reproducing legacy data
#' sets.
#'
#' @return A list with elements:
#' \enumerate{
#'  \item \code{parms} The parameters inferred from \code{real.exp}.
#'  \item \code{syn.exp} The synthetic exposures generated from \code{parms}.
#' }
#'
#' @keywords internal

GenerateSynAbstract <-
  function(parms,
           num.syn.tumors,
           file.prefix = NULL,
           sample.id.prefix,
           froot = NULL,
           round.exposure = FALSE) {
    stopifnot(!is.null(parms))

    if (is.null(froot)) {
      froot <- OutDir(file.prefix)
    }

    parm.file <- paste0(froot, ".parms.csv")
    cat("# Original paramaters\n", file = parm.file)
    suppressWarnings( # Suppress warning on column names on append
      WriteSynSigParams(parms, parm.file, append = TRUE,
                        col.names = NA))

    syn.exp <-
      GenerateSyntheticExposures(
        parms,
        num.syn.tumors,
        sample.id.prefix,
        round.exposure = round.exposure)

    WriteExposure(syn.exp, paste0(froot, ".generic.syn.exposure.csv"))

    # Sanity check; we regenerate the parameters from the synthetic exposures.
    check.params <- GetSynSigParamsFromExposures(syn.exp)

    # check.params should be similar to parms
    cat("# Parameters derived from synthetic exposures\n",
        file = parm.file, append = TRUE)
    suppressWarnings(
      WriteSynSigParams(check.params, parm.file, append = TRUE))

    missing.sig.names <- setdiff(colnames(parms), colnames(check.params))
    if (length(missing.sig.names) > 0) {
      cat("# Some signatures not represented in the synthetic data:\n",
          file = parm.file, append =  TRUE)
      cat("#", missing.sig.names, "\n", file = parm.file, append = TRUE)
      check.param2 <- matrix(NA, nrow = dim(parms)[1], ncol = dim(parms)[2])
      dimnames(check.param2) <- dimnames(parms)
      check.param2[ , colnames(check.params)] <- check.params
      check.params <- check.param2
    }

    cat("# Difference between original parameters and parameters",
        "derived from synthetic exposures\n",
        file = parm.file, append = TRUE)
    WriteSynSigParams(parms - check.params, parm.file,
                      append = TRUE)
    cat("# The difference should be small\n",
        file = parm.file, append = TRUE)

    return(list(parms = parms, syn.exp = syn.exp, check.parms = check.params))
  }

#' Generate synthetic exposures from real exposures.
#'
#' Checkpoints the parameters and the synthetic
#' exposures to files. It also checks that the parameters
#' inferred from the synthetic data approximate those
#' inferred from \code{real.exp}.
#'
#' @param real.exp The actual (real) exposures upon which to base
#' the parameters and synthetic exposures.
#'
#' @param num.syn.tumors Generate this number of synthetic tumors.
#'
#' @param file.prefix Prepend this to output filenames
#'  to indicate the organization of the data.
#'
#' @param sample.id.prefix Prefix for sample identifiers for the
#' synthetic samples.
#'
#' @param top.level.dir Directory in which to create several files.
#'   This directory must already exist.
#'
#' @return A list with elements:
#' \enumerate{
#'  \item \code{parms} The parameters inferred from \code{real.exp}.
#'  \item \code{syn.exp} The synthetic exposures generated from \code{parms}.
#' }
#'
#' @export
GenerateSynFromReal <- function(real.exp,
                                num.syn.tumors,
                                file.prefix,
                                sample.id.prefix,
                                top.level.dir = NULL) {

  stopifnot(ncol(real.exp) > 0)
  parms <- GetSynSigParamsFromExposures(real.exp)

  if (is.null(top.level.dir)) {
    new.file.prefix <- OutDir(file.prefix)
    warning("Calling GenerateSynFromReal witout top.level.dir is deprecated")
  } else {
    new.file.prefix <- file.path(top.level.dir, file.prefix)
  }

  WriteExposure(real.exp,
                paste0(new.file.prefix, ".real.input.exposure.csv"))

  return(
    GenerateSynAbstract(
      parms = parms,
      num.syn.tumors = num.syn.tumors,
      file.prefix = file.prefix,
      sample.id.prefix = sample.id.prefix,
      froot = file.path(top.level.dir, file.prefix)))

  #   function(parms, num.syn.tumors, file.prefix, sample.id.prefix, froot = NULL)
}

#' Create and write a mutational spectra catalog
#'
#' @export
#'
#' @param sigs Signatures to use.
#'
#' @param exp (Synthetic) exposures.
#'
#' @param dir Deprecated, maintained only to avoid
#' breaking old code. A subdirectory based on
#' the deprecated global variable \code{\link{OutDir}}.
#'
#' @param write.cat.fn Function to write catalogs \strong{or}
#' spectra to files.
#'
#' @param extra.file.suffix Extra string to put before ".csv".
#'
#' @param overwrite If TRUE, overwrite existing directory; useful for
#' debugging / testing.
#'
#' @param my.dir The directory in which to write the catalog
#'  and several additional files.
#'
#' @return Invisibly, the generated catalog.
#'
#' @details Create a file with the catalog \code{syn.data.csv}
#'  and writes \code{sigs} to \code{input.sigs.csv}.
#'
CreateAndWriteCatalog <-
  function(sigs,
           exp,
           dir               = NULL,
           write.cat.fn      = ICAMS::WriteCatalog,
           extra.file.suffix = "",
           overwrite         = FALSE,
           my.dir            = NULL) {
    info <- CreateSynCatalogs(sigs, exp)

    if (is.null(my.dir)) {
      warning("In CreateAndWriteCatalog my.dir is NULL, using deprecated work-around\n",
              "Do not use the argument dir, use out.dir")
      if (exists("OutDir") && exists("OutDir.dir")) {
        my.dir <-  OutDir(dir)
      } else {
        warning("In CreateAndWriteCatalog my.dir is NULL and OutDir() does not exist\n",
                "setting my.dir <- dir")
        my.dir <- dir
      }
    }

    if (dir.exists(my.dir)) {
      if (!overwrite) stop("\nDirectory ", my.dir, " exists\n")
    } else {
      dir.create(my.dir)
    }

    if (extra.file.suffix == "") {
      suffix <- ".csv"
    } else {
      suffix = paste0(".", extra.file.suffix, ".csv")
    }
    write.cat.fn(info$ground.truth.signatures,
                 paste0(my.dir, "/ground.truth.syn.sigs", suffix))

    zero.mutation <- base::which(colSums(info$ground.truth.catalog) == 0)

    if (length(zero.mutation) > 0) {
      warning("Tumors with no mutation:\n\n",
              paste(colnames(info$ground.truth.catalog)[zero.mutation], collapse = " "),
              " in", my.dir)
    }
    write.cat.fn(info$ground.truth.catalog,
                 paste0(my.dir, "/ground.truth.syn.catalog", suffix))
    WriteExposure(info$ground.truth.exposures,
                  paste0(my.dir, "/ground.truth.syn.exposures", suffix))
    invisible(info$ground.truth.catalog)
  }

#' @keywords internal
MustCreateDir <- function(dir, overwrite = FALSE) {
  if (dir.exists(dir)) {
    if (!overwrite) stop(dir, " exists and overwrite is FALSE")
    return(NULL)
  }
  if (!dir.create(dir, recursive = TRUE)) {
    stop("Unable to create dir ", dir )
  }
  return(NULL)
}

#' Create and write a mutational spectra catalog
#'
#' @export
#'
#' @param sigs Signatures to use.
#'
#' @param exp (Synthetic) exposures.
#'
#' @param dir Directory in which to put the signatures;
#' NOTE: this will be a subdirectory based on \code{\link{OutDir}}.
#'
#' @param extra.file.suffix Extra string to put before ".csv".
#'
#' @param overwrite If TRUE, overwrite existing directory; useful for
#' debugging / testing.
#'
#' @return Invisibly, the generated catalog.
#'
#' @details Create a file with the catalog \code{syn.data.csv}
#'  and writes \code{sigs} to \code{input.sigs.csv}.
#'
NewCreateAndWriteCatalog <-
  function(sigs, exp, dir, extra.file.suffix = "",
           overwrite = FALSE) {
    info <- CreateSynCatalogs(sigs, exp)

    if (dir.exists(dir)) {
      if (!overwrite) stop("\nDirectory ", dir, " exists\n")
    } else {
      MustCreateDir(dir)
    }

    if (extra.file.suffix == "") {
      suffix <- ".csv"
    } else {
      suffix = paste0(".", extra.file.suffix, ".csv")
    }
    ICAMS::WriteCatalog(info$ground.truth.signatures,
                        paste0(dir, "/ground.truth.syn.sigs", suffix))

    zero.mutation <- base::which(colSums(info$ground.truth.catalog) == 0)

    if (length(zero.mutation) > 0) {
      warning("Tumors with no mutation:\n\n",
              colnames(info$ground.truth.catalog)[zero.mutation],
              "in", dir)
    }
    ICAMS::WriteCatalog(info$ground.truth.catalog,
                        paste0(dir, "/ground.truth.syn.catalog", suffix))
    WriteExposure(info$ground.truth.exposures,
                  paste0(dir, "/ground.truth.syn.exposures", suffix))
    invisible(info$ground.truth.catalog)
  }


#' Exposures and spectra with Poisson or negative binomial noise in exposures.
#'
#' @param input.exposure The exposures to which to add noise; a numeric matrix
#'    or data frame in which the rows are signatures and the columns are
#'    samples. Each cell indicates the number of mutations due to a particular
#'    signature in a particular sample.
#'
#' @param signatures The signatures in the exposure; the column names
#'     of \code{signatures} have to include all row names in
#'     \code{input.exposure}; can be an \code{\link[ICAMS]{ICAMS}}
#'     catalog or a numerical matrix or data frame.
#'
#' @param n.binom.size If non \code{NULL}, use negative binomial noise
#'     with this size parameter; see \code{\link[stats]{NegBinomial}}.
#'     If \code{NULL}, use Poisson noise.
#'
#' @return A list with the elements \describe{
#' \item{expsoures}{The numbers of mutations due to each signature
#'    after adding noise}
#' \item{spectra}{The spectra based on the noisy signature exposures.}
#' }
#'
#' @importFrom stats rpois rnbinom
#'
#' @export

AddNoise <- function(input.exposure, signatures, n.binom.size = 100) {

  if (is.data.frame(input.exposure)) {
    input.exposure <- as.matrix(input.exposure)
  }
  stopifnot(is.numeric(input.exposure))
  exposures <- input.exposure
  exposures[ , ] <- NA

  spectra <- matrix(0, ncol = ncol(input.exposure), nrow = nrow(signatures))
  rownames(spectra) <- rownames(signatures)
  colnames(spectra) <- colnames(input.exposure)

  for (sig in rownames(input.exposure)) {

    partial.spec <- signatures[ , sig] %*% input.exposure[sig, , drop = FALSE]
    # Resample (add noise) to the "partial spectra" due to sig
    if (is.null(n.binom.size)) {
      noised.vec <-
        rpois(n = length(partial.spec), lambda = partial.spec)
    } else {
      noised.vec <-
        rnbinom(n = length(partial.spec), size = n.binom.size, mu = partial.spec)
    }
    # Turn the vector back into a matrix
    noisy.partial.spec <- matrix(noised.vec, nrow = nrow(partial.spec))
    exposures[sig, ] <- colSums(noisy.partial.spec)
    spectra <- spectra + noisy.partial.spec

  }

  return(list(exposures = exposures, spectra = spectra))

}
steverozen/SynSigGen documentation built on April 1, 2022, 8:54 p.m.