R/util_stats.R
In goat: Gene Set Analysis Using the Gene Set Ordinal Association Test

Documented in minlog10_fixzero padjust_genesets rankscore rankscore_fixed_order

#' compute rank^2 scores and rescale these between 0~1000 (with further precision captured by decimals)
#'
#' rank^2 can yield huge values, e.g. a large genelist of N=50000 genes would imply max gene score = 50000^2 = 2.5e+09.
#' Thus we rescale these scores between 0~1000 so downstream applications (e.g. sum of 10000 gene scores) don't explode to huge numbers.
#'
#' @param x data.frame to sort (i.e. the genelist)
#' @param sort1 numeric vector of length `nrow(x)` to sort by first/primarily (descending order, higher value gets better result score)
#' @param sort2 numeric vector of length `nrow(x)` to sort by for breaking ties (descending order, higher value gets better result score)
#' @param sort3 numeric vector of length `nrow(x)` to sort by for breaking ties (descending order, higher value gets better result score)
#' @param colname name for result column in `x`  (overwritten if already exists)
#' @examples
#'   x = data.frame(gene=c(1, 2, 3, 4, 5),
#'                  pvalue=c(0.01, 1, 1, 0.1, 1),
#'                  effectsize=c(-2, 0.25, 0.5, 1, 0.25))
#'   print(x, row.names = FALSE)
#'   print(rankscore(x, sort1 = -1*x$pvalue, sort2 = abs(x$effectsize),
#'         sort3 = x$gene, colname="score") |>
#'     arrange(desc(score)), row.names = FALSE)
#' @return data.frame `x` with added column `colname`, containing gene scores between 0 and 1000
#' @export
rankscore = function(x, sort1, sort2, sort3, colname) {
  i = NULL
  if(is.null(sort3)) {
    i = order(sort1, sort2, decreasing = TRUE)
  } else {
    i = order(sort1, sort2, sort3, decreasing = TRUE)
  }
  # add scores
  x[i,colname] = rev(rankscore_fixed_order(nrow(x)))
  return(x)
}



#' Gene score array, from low to high scores
#'
#' @param n genelist length
#' @return result of `(1:n)^2 / (n^2 / 1000)`
#' @export
rankscore_fixed_order = function(n) {
  (1:n)^2 / (n^2 / 1000) # compute scaling factor prior to application to rank^2 scores
}



#' -log10 transform a vector of p-values, replacing zeros with some limit/threshold
#'
#' @examples
#'   pval = c(0, 10^-6, 0.001, 0.01, 1, NA, -Inf, Inf, NaN)
#'   cbind(
#'     input = pval,
#'     # default; replace zeros with typical R machine precision for doubles
#'     minlog10_default = minlog10_fixzero(pval),
#'     # alternatively, replace zero with lowest non-zero pvalue in input
#'     minlog10_limit_from_data = minlog10_fixzero(pval, limit = NA)
#'   )
#' @param x p-value vector to transform to -log10
#' @param limit value to replace zero's in `x` with. Set NA to replace zero's in `x` with the smallest finite value in `x` (if there is none, defaults to 2.22e-16)
#' @return input parameter `x` transformed to -log10
#' @export
minlog10_fixzero = function(x, limit = 2.22e-16) {
  stopifnot(length(limit) == 1 && (is.na(limit) || (is.numeric(limit) & is.finite(limit) & limit > 0)))
  stopifnot(is.na(x) | is.numeric(x)) # accept numbers, NA, NaN, Inf, -Inf. But not strings, booleans, factors, etc.

  # zero length input; return empty numeric
  if(length(x) == 0) return(numeric())

  # limit; if NA, use lowest non-zero value in input. But if no non-zero in input
  x_nonzero = x[is.finite(x) & x > 0]
  limit = ifelse(!is.finite(limit) && length(x_nonzero) > 0, min(x_nonzero), limit)
  limit = ifelse(!is.finite(limit), 2.22e-16, limit) # fallback for `x=0;limit=NA`; no x_nonzero to estimate from

  x[is.finite(x) & x == 0] = limit
  x[!is.finite(x)] = NA # i.e. replace NA, NaN, Inf, -Inf to NA prior to log transformation
  -log10(x)
}



#' Adjust p-values for all genesets, grouped by 'source' then adjust for the number of 'sources'
#'
#' @param genesets tibble with genesets, must contain column 'pvalue'
#' @param method method for multiple testing correction, must be any of `stats::p.adjust.methods`, e.g. "BH" or "bonferroni"
#' @param cutoff numeric cutoff value for adjusted p-value, `signif` column is set to TRUE for all values lesser-equals
#' @param correct_sources apply Bonferroni adjustment to all p-values according to the number of geneset sources that were tested. Boolean parameter, set TRUE to enable (default) or FALSE to disable
#' @return updated `genesets` table
#' @export
padjust_genesets = function(genesets, method = "BH", cutoff = 0.01, correct_sources = TRUE) {
  pvalue = pvalue_adjust = NULL # fix invisible bindings R package NOTE
  stopifnot(length(genesets) > 0 && is.data.frame(genesets) && all(c("source", "pvalue") %in% colnames(genesets)))
  stopifnot(length(method) == 1 && method %in% stats::p.adjust.methods)
  stopifnot(length(cutoff) == 1 && is.finite(cutoff))
  stopifnot(length(correct_sources) == 1 && correct_sources %in% c(TRUE, FALSE))

  genesets = genesets |>
    group_by(source) |> # new group_by() will override preexisting groups (as here intended)
    mutate(pvalue_adjust = stats::p.adjust(pvalue, method = method)) |>
    ungroup()

  # correction for the number of sources that were evaluated
  if(correct_sources) {
    N = n_distinct(genesets$source)
    genesets = genesets |> mutate(pvalue_adjust = pvalue_adjust * N)
  }

  genesets |>
    mutate(
      pvalue_adjust = pmin(pvalue_adjust, 1), # cap p-value at max 1
      signif = is.finite(pvalue_adjust) & pvalue_adjust <= cutoff
    )
}



#' Density function for skew normal distribution
#'
#' @param x numeric vector of values
#' @param mean location parameter
#' @param sd scale parameter
#' @param xi skewness parameter
#' @noRd
dsnorm = function(x, mean = 0, sd = 1, xi = 1) {
  dsnorm__std = function(x, xi) {
    if(length(xi) == 1) {
      xi = rep(xi, length(x))
    }
    m1 = 2/sqrt(2 * pi)
    mu = m1 * (xi - 1/xi)
    sigma = sqrt((1 - m1^2) * (xi^2 + 1/xi^2) + 2 * m1^2 - 1)
    z = x * sigma + mu
    g = 2 / (xi + 1 / xi)
    flag_up = z >= 0 # indicates upper half of the distribution
    XI = xi # flip xi for lower half
    XI[!flag_up] = 1 / XI[!flag_up]
    g * stats::dnorm(x = z/XI) * sigma
  }

  return(dsnorm__std(x = (x - mean)/sd, xi = xi) / sd)
}



#' Probability function for skew normal distribution
#'
#' @param q numeric vector of quantiles
#' @param mean location parameter
#' @param sd scale parameter
#' @param xi skewness parameter
#' @noRd
psnorm_upper_tail = function(q, mean = 0, sd = 1, xi = 1) {
  # vectorized over xi
  psnorm_upper_tail__std = function(q, xi) {
    if(length(xi) == 1) {
      xi = rep(xi, length(q))
    }

    m1 = 2/sqrt(2 * pi)
    mu = m1 * (xi - 1/xi)
    sigma = sqrt((1 - m1^2) * (xi^2 + 1/xi^2) + 2 * m1^2 - 1)
    z = q * sigma + mu
    g = 2 / (xi + 1 / xi)
    flag_up = z >= 0 # indicates upper half of the distribution
    XI = xi # flip xi for lower half
    XI[!flag_up] = 1 / XI[!flag_up]
    p = z # init as numeric vector of expected length
    # high precision for small pvalues
    p[flag_up] = g[flag_up] * XI[flag_up] * stats::pnorm(q = z[flag_up]/XI[flag_up], lower.tail = FALSE)
    # precision for large p-values is capped near 1 (approx 1 - 10^-16), but doesn't matter
    p[!flag_up] = 1 - g[!flag_up] * XI[!flag_up] * stats::pnorm(q = z[!flag_up]/XI[!flag_up], lower.tail = TRUE)

    p[p > 1] = 1
    return(p)
  }

  return( psnorm_upper_tail__std(q = (q - mean)/sd, xi = xi) )

  ### QC code
  # # validate precision; compare to pnorm @ xi = 1
  # q = seq(-12, 12, by = 0.1)
  # stopifnot( all.equal(stats::pnorm(q, lower.tail = FALSE), psnorm_upper_tail__std(q, xi = 1), tolerance = 10^-14) )
  #
  # # validate correctness; compare to preexisting implementation
  # # here only using a limited range for q because reference lacks precision for large q (ours should be fine)
  # q = seq(-8, 8, by = 0.1)
  # for(xi in c(0.5, 0.8, 1, 1.2, 1.5)) {
  #   stopifnot( all.equal(1 - fGarch::.psnorm(q, xi), psnorm_upper_tail__std(q, xi), tolerance = 10^-12) )
  # }
}



#' MLE estimate of parameter Xi in a skew-normal distribution
#'
#' @param x numeric vector
#' @param mu mean value for x
#' @param sd sd value for x
#' @param init_xi initial value for xi (typically 1~1.25)
#' @param lower lower bound for Xi
#' @param upper Upper bound for Xi
#' @noRd
fit_snorm_xi = function(x, mu, sd, init_xi, lower = NA, upper = NA) {

  ###### for reference; pure R variant (much slower than our Rcpp variant)
  # dsnorm_loglik = function(par) {
  #   -sum( dsnorm_log(x = x_standardized, xi = par) - log(sd) )
  # }
  # dsnorm_log = function(x, xi) {
  #   m1 = 2/sqrt(2 * pi)
  #   mu = m1 * (xi - 1/xi)
  #   g = 2/(xi + 1/xi)
  #   sigma = sqrt((1 - m1^2) * (xi^2 + 1/xi^2) + 2 * m1^2 - 1)
  #   z = x * sigma + mu
  #   Xi = xi^sign(z)
  #   log(g) + dnorm(x = z/Xi, log = TRUE) + log(sigma)
  # }
  # x_standardized = (x - mu) / sd
  ######

  dsnorm_loglik = function(par) {
    rcpp_dsnorm_logsum(x_min_mean = x_min_mu, N = N, xi = par, sd = sd)
  }

  if(is.na(lower)) {
    lower = 1
  }
  if(is.na(upper)) {
    upper = 1.5
  }

  N = length(x)
  x_min_mu = x - mu

  stats::optim(
    par = init_xi, # initial parameters
    fn = dsnorm_loglik,
    method = "Brent",
    lower = lower,
    upper = upper,
    control = list(reltol = 1e-10)
  )
}

Any scripts or data that you put into this service are public.

goat documentation built on April 3, 2025, 6:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

goat
Gene Set Analysis Using the Gene Set Ordinal Association Test

R/util_stats.R
In goat: Gene Set Analysis Using the Gene Set Ordinal Association Test

Defines functions fit_snorm_xi psnorm_upper_tail dsnorm padjust_genesets minlog10_fixzero rankscore_fixed_order rankscore

Documented in minlog10_fixzero padjust_genesets rankscore rankscore_fixed_order

Try the goat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

goat Gene Set Analysis Using the Gene Set Ordinal Association Test

R/util_stats.R In goat: Gene Set Analysis Using the Gene Set Ordinal Association Test

Defines functions fit_snorm_xi psnorm_upper_tail dsnorm padjust_genesets minlog10_fixzero rankscore_fixed_order rankscore

Documented in minlog10_fixzero padjust_genesets rankscore rankscore_fixed_order

Try the goat package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

goat
Gene Set Analysis Using the Gene Set Ordinal Association Test

R/util_stats.R
In goat: Gene Set Analysis Using the Gene Set Ordinal Association Test