# R/internals.R

#' Functions for processing raw profiles




# Estimate the standard deviation of a normal distribution with L2E, assuming
# the normal distribution has mean 0.


#' @export
#' @keywords internal
#' @name l2e.normal.sd
#' @docType methods
#' @rdname internals
l2e.normal.sd <- function(xs)
{
  # Need at least two values to get a standard deviation
  stopifnot(length(xs) >= 2)
  optim.result <- stats::optimize(
    # L2E loss function
    f=function(sd)
    # "Data part", the sample average of the likelihood
    -2 * mean(stats::dnorm(xs, sd=sd)) +
    # "Theta part", the integral of the squared density
      1/(2*sqrt(pi)*sd),
    # Parameter: standard deviation of the normal distribution fit
    interval = c(0, diff(range(xs))))
  return(optim.result$minimum)
}
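
# Illustrative sketch, not part of the package: because L2E down-weights
# points far from the fitted density, the estimate tracks the mean-zero bulk
# of the data rather than the contamination.
l2e.demo.xs <- c(stats::rnorm(900, sd = 2), stats::runif(100, min = 10, max = 20))
l2e.normal.sd(l2e.demo.xs)  # close to 2, while stats::sd(l2e.demo.xs) is inflated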


# A variance-stabilizing transform for data where the mean varies but the
# index of dispersion stays the same ("gat" stands for "generalized Anscombe
# transform").

#' @export
#' @keywords internal
#' @name gat
#' @docType methods
#' @rdname internals
gat <- function(x, iod)
{
  # When index of dispersion is above 4, transformation will be imaginary for
  # low inputs. So, you need to check if it's real-valued. If it's imaginary,
  # return NA.
  ifelse(x + (4-iod)/8 < 0,
    NA,
    # Due to vectorization, this will get executed even when the result is
    # imaginary, resulting in a warning. Therefore, suppress warnings.
    suppressWarnings((2 / sqrt(iod)) * sqrt(x + (4-iod)/8))
  )
}
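
# Quick illustrative sketch: with index of dispersion 1 (Poisson data), gat
# reduces to an Anscombe-style square-root transform, so the transformed
# variance is close to 1 at both mean levels.
gat.demo.low <- gat(stats::rpois(10000, lambda = 5), iod = 1)
gat.demo.high <- gat(stats::rpois(10000, lambda = 50), iod = 1)
c(stats::var(gat.demo.low), stats::var(gat.demo.high))  # both near 1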

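# Fit a piecewise-constant approximation to a profile by total-variation
# (fused-lasso) penalized least squares, solved by a dynamic-programming
# routine in compiled code; lam controls how strongly jumps are penalized.
# Vectorized over lam, so multiple penalty values give one fit per column.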
#' @useDynLib scquantum tf_dp_wrapper
#' @export
#' @keywords internal
#' @name tf.dp
#' @docType methods
#' @rdname internals
tf.dp <- Vectorize(function(y, lam)
{
  n <- length(y)
  stopifnot(length(lam)==1)
  .C("tf_dp_wrapper", n=as.integer(n), y=as.numeric(y),
     lam=as.double(lam)[1], beta=numeric(n))$beta
}, "lam")

# Segment a profile, and summarize the segments. Output has columns mean, se,
# length, start_index, and end_index. Each row in the output represents a
# segment, giving the estimated mean of the segment, the standard error of
# that mean, the number of elements in the segment, and the segment's start
# and end indices.
#' @export
#' @keywords internal
#' @name segment.summarize
#' @docType methods
#' @rdname internals
segment.summarize <- function(inprof, penalty, trans, seg, loc, se)
{
  # Transform to try to get data which are normally distributed around a
  # segment-specific mean, with the same variance for each segment
  transformed.profile <- trans(inprof)
  stopifnot(all(!is.nan(transformed.profile)) & all(!is.na(transformed.profile)))

  segmented.profile <- seg(transformed.profile, penalty)

  # Number the elements of the profile according to the segment they fall in
  segnums <- cumsum(c(TRUE, abs(diff(segmented.profile)) > 0.1))

  # Summarize the segments, recording three numbers: estimate the mean of each
  # segment, the standard error of the mean, and record the length of the
  # segment. Values are not transformed, since they are obtained from the
  # original profile, using only the segment numbering derived from the
  # segmented transformed profile.
  means <- tapply(inprof, segnums, loc)
  standard.errors <- tapply(inprof, segnums, se)
  lengths <- tapply(inprof, segnums, length)

  # From the lengths, get the start and end indices
  end.indices <- cumsum(lengths)
  start.indices <- c(0, end.indices[-length(end.indices)]) + 1

  data.frame(mean=means, se=standard.errors,
             length=lengths, start_index = start.indices,
             end_index = end.indices)
}
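
# Illustrative call with hypothetical inputs: identity transform, tf.dp as the
# segmenter, and plain mean / normal-theory standard error as summaries.
ss.demo.prof <- c(stats::rpois(100, lambda = 10), stats::rpois(100, lambda = 30))
segment.summarize(ss.demo.prof, penalty = 20, trans = identity, seg = tf.dp,
                  loc = mean, se = function(x) stats::sd(x) / sqrt(length(x)))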

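# Take a raw profile plus per-bin annotations, segment each chromosome
# separately, and return one annotated row per segment with its mean,
# standard error, and length.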
#' @export
#' @keywords internal
#' @name prof2invals
#' @docType methods
#' @rdname internals
prof2invals <- function(
  # The input required for segmentation: the profile to be segmented, and the
  # penalty value for the segmentation
  inprof, penalty,
  # The input required for annotation: the data frame containing the
  # annotations, and the names of the columns which are going to be used. This
  # also affects the segmentation, since chromosome boundaries are always
  # segment boundaries.
  annotations, chrom.colname, bin.start.colname, bin.end.colname)
{

  stopifnot(typeof(chrom.colname) == "character")
  stopifnot(typeof(bin.start.colname) == "character")
  stopifnot(typeof(bin.end.colname) == "character")
  stopifnot("data.frame" %in% class(annotations))

  # Estimate the index of dispersion, which will be used for estimating standard
  # errors of segment means, and for transformation
  iod.est <- timeseries.iod(inprof)

  # Split the annotations into two parts, one which will be accessed using the
  # segment start indices, and the other which will be accessed using the
  # segment end indices. The chromosome column can go in either one of these,
  # but of course the bin start positions need to go in the first, and the bin
  # end positions need to go in the second.
  left.annotations <- annotations[,c(chrom.colname, bin.start.colname),drop=FALSE]
  # If annotation with bin ends is not wanted, bin.end.colname can be NULL,
  # and the right annotations will be a data frame with no columns
  right.annotations <- if (is.null(bin.end.colname))
  {
    data.frame()[1:nrow(annotations),]
  } else
  {
      out <- annotations[,bin.end.colname,drop=FALSE]
      # When bin.start.colname overlaps with bin.end.colname, add a suffix to
      # the bin end column names to avoid duplicate column names
      while (any(colnames(out) %in% colnames(left.annotations)))
      {
        colnames(out)[colnames(out) %in% colnames(left.annotations)] <-
          sapply(colnames(out)[colnames(out) %in% colnames(left.annotations)],
                 paste, "end", sep=".")
      }
      out
  }
  # Segment the profile and annotate the results
  annotated.segmented.counts <-
    Reduce(rbind, mapply(function(segments, annotations1, annotations2)
    cbind(annotations1[segments$start_index,,drop=FALSE],
          annotations2[segments$end_index,,drop=FALSE],
          segments),
    # Split the profile by chromosome and segment each chromosome separately,
    # guaranteeing that chromosome boundaries are segment boundaries
    tapply(inprof, annotations[[chrom.colname]], segment.summarize,
           penalty=penalty,
      # The functions to be used to transform the data, to segment it, to estimate
      # the segment means, and to estimate the standard error of the means
      trans = function(x)
      {
        gat.result <- gat(x, iod=iod.est)
        ifelse(is.na(gat.result), 0, gat.result)
      },
      seg = tf.dp, loc = stats::median,
      se = function(x) sqrt(pi/2) * sqrt(iod.est * stats::median(x) / length(x))
    ),
    # Chromosome annotations and bin start annotations, split by chromosome
    split(left.annotations, annotations[[chrom.colname]]),
    # Bin end annotations, split by chromosome
    split(right.annotations, annotations[[chrom.colname]]),
    # Can be read either as a statement that the output is to be returned as a
    # list, or a sarcastic comment about how the code was written
    SIMPLIFY=FALSE))

  return(annotated.segmented.counts[,c(
    colnames(left.annotations), colnames(right.annotations),
    "mean", "se", "length"
  )])
}
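
# Hypothetical end-to-end sketch (run with the whole file loaded, since
# prof2invals uses timeseries.iod defined below): counts over two chromosomes
# with a matching bin annotation table.
demo.bins <- data.frame(chrom = rep(c("chr1", "chr2"), each = 100),
                        bin.start = rep(seq(1, 991, by = 10), 2),
                        bin.end = rep(seq(10, 1000, by = 10), 2))
demo.counts <- stats::rpois(200, lambda = rep(c(20, 40, 20, 40), each = 50))
prof2invals(demo.counts, penalty = 25, annotations = demo.bins,
            chrom.colname = "chrom", bin.start.colname = "bin.start",
            bin.end.colname = "bin.end")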

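# Given precomputed segment means and lengths plus an index of dispersion,
# attach a standard error for each segment mean to the per-segment annotations.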
#' @export
#' @keywords internal
#' @name seg2invals
#' @docType methods
#' @rdname internals
seg2invals <- function(seg_mean, seg_length, iod, annotations)
{
  se <- sqrt(iod * seg_mean / seg_length)
  return(cbind(annotations, data.frame(mean = seg_mean, se = se, length = seg_length)))
}
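
# Sketch with made-up segment summaries; iod would normally come from
# timeseries.iod on the underlying profile.
seg2invals(seg_mean = c(20, 40), seg_length = c(120, 80), iod = 1.2,
           annotations = data.frame(chrom = c("chr1", "chr1")))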


### Empirical characteristic functions and maxima

# Evaluate the weighted empirical characteristic function
#' @export
#' @keywords internal
#' @name weighted.ecf
#' @docType methods
#' @rdname internals
weighted.ecf <- Vectorize(function(y, sds, s)
{
  stopifnot(length(y) == length(sds))
  y <- y[sds > 0.00001]
  sds <- sds[sds > 0.00001]
  means <- exp(-2 * pi^2 * sds^2 * s^2)
  variances <- 1 - exp(-4 * pi^2 * sds^2 * s^2)
  unnormalized.weights <- means / variances
  weights <- unnormalized.weights / sum(unnormalized.weights)
  sum(weights * exp(1i * (2*pi) * s * y))
}, 's')
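
# Sketch: values clustered at multiples of 0.5 give a modulus near 1 at s = 2
# (the reciprocal of the spacing) and a much smaller modulus off-peak.
ecf.demo.y <- sample(1:10, 200, replace = TRUE) / 2 + stats::rnorm(200, sd = 0.02)
ecf.demo.sds <- rep(0.02, 200)
Mod(weighted.ecf(ecf.demo.y, ecf.demo.sds, s = c(2, 1.7)))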

# Global maximum of the weighted empirical characteristic function
#' @export
#' @keywords internal
#' @name ecf.global.max
#' @docType methods
#' @rdname internals
ecf.global.max <- function(y, sds, smin=1, smax = 8)
{
  stopifnot(length(y) == length(sds))
  svals <- seq(smin, smax, by=0.01)
  polar.quantogram <- weighted.ecf(y, sds, svals)
  max.index <- which.max(Mod(polar.quantogram))
  if (length(max.index) == 1)
  {
    return(c(
      peak_location = svals[max.index],
      peak_height = Mod(polar.quantogram)[max.index],
      peak_phase = Arg(polar.quantogram)[max.index]
    ))
  } else
  {
    return(c(
      peak_location = NA,
      peak_height = NA,
      peak_phase = NA
    ))
  }
}
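
# Continuing the sketch above: scanning s in [1, 8] places the global peak
# near s = 2, with phase near 0.
ecf.global.max(ecf.demo.y, ecf.demo.sds)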

# Takes in copy numbers, ratio standard errors, the scaling factor, and input
# values to the characteristic function. Outputs absolute value of the
# characteristic function
#' @export
#' @keywords internal
#' @name expected.peak.heights
#' @docType methods
#' @rdname internals
expected.peak.heights <- function(cn, ratio.se, scale, svals)
{
  nonzero.cn <- cn[cn > 0]
  nonzero.se <- (ratio.se * scale)[cn > 0]

  # Produce matrices, where each row is an s value, and each column is a segment
  # Characteristic function of the noise
  noise.cf <- t(sapply(svals, function(s)
    exp(-(1/2) * nonzero.se^2 * (2 * pi * s)^2)
  ))
  # Characteristic function of the copy number. I'm considering the copy number
  # to be a constant, but it still has a characteristic function, although a
  # trivial one
  # I'm leaving out the offset for now because I'm just calculating the modulus,
  # which is not affected by it; but I should put it in just so the definitions
  # don't look wrong
  cn.cf <- t(sapply(svals, function(s)
    exp(1i * (nonzero.cn / scale) * (2 * pi * s))
  ))
  # E[X] = Psi(2 pi s)
  expected.values <- cn.cf * noise.cf
  mean.norms <- noise.cf
  # Var(X) = 1 - | Psi(2 pi s) |^2
  variances <- 1 - mean.norms^2
  unnormalized.weights <- mean.norms / variances
  partition.function <- apply(unnormalized.weights, 1, sum)
  weights <- unnormalized.weights / partition.function
  weights[apply(weights, 1, function(x) any(is.nan(x)))] <- NA
  this.theoretical.quantogram <-
    apply(weights * expected.values, 1, sum)
  background <- apply(weights^2 * variances, 1, sum)
  expected.peak.height.function <- sqrt(Mod(this.theoretical.quantogram)^2 + background)
  return(expected.peak.height.function)
}
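
# Sketch with hypothetical segment values: integer copy numbers, small ratio
# standard errors, unit scale; one expected peak height per s value.
expected.peak.heights(cn = c(2, 3, 2, 4), ratio.se = rep(0.05, 4),
                      scale = 1, svals = 1:3)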

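# Estimate the noise level from the quantogram: take the highest peak of the
# empirical characteristic function of x/mu over s in [1, 8], and convert its
# height and location into a variance estimate, returned scaled by mu.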
#' @export
#' @keywords internal
#' @name quantum.sd
#' @docType methods
#' @rdname internals
quantum.sd <- function(x, mu)
{
  y <- x / mu
  svals <- seq(1, 8, length.out=700)
  polar.quantogram <- sapply(svals, function(s) mean(exp(1i * (2*pi) * s * y)))
  peak.height <- Mod(polar.quantogram)[which.max(Mod(polar.quantogram))[1]]
  peak.location <- svals[which.max(Mod(polar.quantogram))[1]]
  ratio.variance <- (log(peak.height) / (-2 * pi^2)) * peak.location^2
  ratio.variance * mu
}
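
# Sketch: values near integer multiples of a candidate unit mu; shown just to
# illustrate the call on simulated quantal data.
qsd.demo.x <- 35 * sample(1:6, 300, replace = TRUE) + stats::rnorm(300, sd = 2)
quantum.sd(qsd.demo.x, mu = 35)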

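# A discrete color palette keyed by copy number: one color each for 0 through
# 6, and a shared color for 7 and above.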
#' @export
#' @keywords internal
#' @name irises.pluspurple
#' @docType methods
#' @rdname internals
irises.pluspurple <-
  c(`0`="#25292E", `1`="#9F3D0C", `2`="#CF601F", `3`="#E1DE96",
    `4`="#89AD71", `5`="#89C9E0", `6`="#556bb7", `>=7`="#8b44bb")

# A function for estimating the index of dispersion, which is used when
# estimating standard errors for each segment mean
#' @export
#' @keywords internal
#' @name timeseries.iod
#' @docType methods
#' @rdname internals
timeseries.iod <- function(v)
{
  # 3 elements, 2 differences, can find a standard deviation
  stopifnot(length(v) >= 3)
  # Differences between pairs of values
  y <- v[-1]
  x <- v[-length(v)]
  # Normalize the differences using the sum. The result should be around zero,
  # plus or minus square root of the index of dispersion
  vals.unfiltered <- (y-x)/sqrt(y+x)
  # Remove divide by zero cases, and--considering this is supposed to be count
  # data--divide by almost-zero cases
  vals <- vals.unfiltered[y + x  >= 1]
  # Check that there's anything left
  stopifnot(length(vals) >= 2)
  # Assuming most of the normalized differences follow a normal distribution,
  # estimate the standard deviation
  val.sd <- l2e.normal.sd(vals)
  # Square this standard deviation to obtain an estimate of the index of
  # dispersion
  return(val.sd^2)
}
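
# Sketch: Poisson counts have index of dispersion 1; negative binomial counts
# with mu = 20 and size = 10 have index of dispersion 1 + mu/size = 3.
timeseries.iod(stats::rpois(1000, lambda = 20))          # near 1
timeseries.iod(stats::rnbinom(1000, mu = 20, size = 10)) # near 3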
