R/add_noise.R

Defines functions finish shift_noise sqrt_noise graded_log_noise log_noise add_noise

Documented in add_noise graded_log_noise log_noise shift_noise sqrt_noise

#' Add noise to count data
#' 
#' Gaussian noise can be added to the simulated count matrix in multiple ways which can
#' be combined.
#' 
#' - `add_noise` adds simple Gaussian noise to counts. This affects low
#' expressed genes and hardly affects highly expressed genes.
#' - With `log_noise`,
#' counts are converted using log2+1 and Gaussian noise added, followed by
#' conversion back to count scale. This affects all genes irrespective of
#' expression level.  
#' - With `graded_log_noise`,
#' counts are converted to log2+1. A scaling factor is calculated for gene
#' expression level ranging from 0 to 1, which maps to 0 to the maximum number
#' of counts. This scaling factor is inverted from 1 to 0 (i.e. noise affects
#' low counts more than high counts) and then passed through the function
#' specified by `transform` (this controls how much the middle counts are
#' affected). Then the Gaussian noise is multiplied by the scaling factor and
#' added to the counts.
#' - With `sqrt_noise`, counts
#' are square root transformed before Gaussian noise is added, and then
#' transformed back. This still has a stronger effect on low expressed genes,
#' but the effect is more graduated with a more gradual fall off in effect on
#' genes with increasing expression.
#' - With `shift_noise`, whole gene rows are selected at random then each row is
#' multiplied by a random amount varying according to 2^rnorm. This simulates
#' shifted expression up/down due to differences in chemistry through which some
#' genes are more or less detectable.
#' 
#' @param counts An integer count matrix with genes in rows and cell
#'   subclasses typically generated by [simulate_bulk()].
#' @param sd Standard deviation of noise to be added.
#' @param transform Function for controlling amount of noise by expression level
#'   in [graded_log_noise()].
#' @param p Proportion of genes affected by noise.
#' @returns A positive integer count matrix with genes in rows and cell
#'   subclasses in columns.
#' @export
add_noise <- function(counts, sd = 100) {
  scale_factor <- mean(colSums(counts)) / 1e9
  sd <- sd * scale_factor
  # simple Gaussian noise
  rn <- rnorm(prod(dim(counts)), sd = sd)
  rmat <- matrix(round(rn), nrow = nrow(counts))
  counts <- counts + rmat
  finish(counts)
}

#' @rdname add_noise
#' @export
log_noise <- function(counts, sd = 0.1) {
  # Gaussian noise on log scale
  rn <- rnorm(prod(dim(counts)), sd = sd)
  rmat <- matrix(rn, nrow = nrow(counts))
  log_sim <- log2(counts +1)
  log_sim <- log_sim + rmat
  counts <- 2^log_sim -1
  finish(counts)
}

#' @rdname add_noise
#' @export
graded_log_noise <- function(counts, sd = 0.1,
                             transform = function(x) x^3) {
  # Gaussian noise on log scale
  rn <- rnorm(prod(dim(counts)), sd = sd)
  rmat <- matrix(rn, nrow = nrow(counts))
  log_sim <- log2(counts +1)
  fac <- 1 - (log_sim / max(log_sim))
  fac <- transform(fac)
  log_sim <- log_sim + rmat * fac
  counts <- 2^log_sim -1
  finish(counts)
}

#' @rdname add_noise
#' @export
sqrt_noise <- function(counts, sd = 100) {
  # sqrt transform
  scale_factor <- mean(colSums(counts)) / 1e9
  sd <- sqrt(sd * scale_factor) /2  # loose equivalence to count_sd
  rn <- rnorm(prod(dim(counts)), sd = sd)
  rmat <- matrix(rn, nrow = nrow(counts))
  sx <- sqrt(counts)
  sx <- sx + rmat
  sx[sx < 0] <- 0
  counts <- sx^2
  finish(counts)
}

#' @rdname add_noise
#' @importFrom stats rbinom
#' @export
shift_noise <- function(counts, sd = 0.5, p = 0.5) {
  # shift transform
  rn <- rnorm(nrow(counts), sd = sd)
  w <- rbinom(nrow(counts), 1, p)
  rn[w == 0] <- 0
  counts <- counts * 2^rn
  out <- finish(counts)
  attr(out, "shift") <- rn
  out
}

finish <- function(counts) {
  counts[counts < 0] <- 0
  if (max(counts) <= .Machine$integer.max) {
    mode(counts) <- "integer"
  } else counts <- round(counts)
  counts
}

Try the cellGeometry package in your browser

Any scripts or data that you put into this service are public.

cellGeometry documentation built on April 20, 2026, 1:06 a.m.