#' @name scmet_hvf
#' @rdname scmet_hvf_lvf
#' @aliases detect_hvf_lvf detect_hvf
#'
#' @title Detect highly (or lowly) variable features with scMET
#'
#' @description Function for calling features as highly (or lowly) variable
#' within a datasert or cell population. This can be thought as a feature
#' selection step, where the highly variable features (HVF) can be used for
#' diverse downstream tasks, such as clustering or visualisation. Two
#' approaches for identifying HVFs (or LVFs): (1) If we correct for
#' mean-dispersion relationship, then we work directly on residual dispersions
#' `epsilon`, and define a percentile threshold `delta_e`. This is the
#' preferred option since the residual overdispersion is not confounded by
#' mean methylation levels. (2) Work directly with the overdispersion
#' parameter `gamma` and define an overdispersion contribution threshold
#' `delta_g`, above (below) of which we call HVFs (LVFs).
#'
#' @param scmet_obj The scMET posterior object after performing inference, i.e.
#' after calling `scmet` function.
#' @param delta_e Percentile threshold for residual overdispersion to detect
#' variable features (between 0 and 1). Default: 0.9 for HVF and 0.1 for LVF
#' (top 10%). NOTE: This parameter should be used when correcting for
#' mean-dispersion relationship.
#' @param delta_g Overdispersion contribution threshold (between 0 and 1).
#' @param evidence_thresh Optional parameter. Posterior evidence probability
#' threshold parameter `alpha_{H}` (between 0.6 and 1).
#' @param efdr Target for expected false discovery rate related to HVF/LVF
#' detection (default = 0.1).
#'
#' @return The scMET posterior object with an additional element named `hvf` or
#' `lvf` according to the analysis performed. This is a list object containing
#' the following elements: \itemize{ \item{ \code{summary}: A data.frame
#' containing HVF or LVF analysis output information per feature, including
#' posterior medians for `mu`, `gamma`, and `epsilon`. The `tail_prob` column
#' contains the posterior tail probability of a feature being called as HVF or
#' LVF. The logical `is_variable` column informs whether the feature is called
#' as variable or not.} \item{ \code{evidence_thresh}: The optimal evidence
#' threshold. } \item{ \code{efdr}: The EFDR value.} \item{ \code{efnr}: The
#' EFNR value. } \item{\code{efdr_grid}: The EFDR values for the grid search.}
#' \item{\code{efnr_grid}: The EFNR values for the grid search.}
#' \item{\code{evidence_thresh_grid}: The grid where we searched for optimal
#' evidence threshold.} }
#'
#' @seealso \code{\link{scmet}}, \code{\link{scmet_differential}}
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Fit scMET
#' obj <- scmet(Y = scmet_dt$Y, X = scmet_dt$X, L = 4, iter = 100)
#'
#' # Run HVF analysis
#' obj <- scmet_hvf(scmet_obj = obj)
#'
#' # Run LVF analysis
#' obj <- scmet_lvf(scmet_obj = obj)
#'
#' @export
#'
scmet_hvf <- function(scmet_obj, delta_e = 0.9, delta_g = NULL,
evidence_thresh = 0.8, efdr = 0.1) {
# Perform checks
if (!(methods::is(scmet_obj, "scmet_mcmc") ||
methods::is(scmet_obj, "scmet_vb")) ) {
stop("Posterior object is not generated by scMET.")
}
if (!is.null(delta_e)) {
assertthat::assert_that(delta_e >= 0 & delta_e <= 1)
}
if (!is.null(delta_g)) {
assertthat::assert_that(delta_g >= 0 & delta_g <= 1)
}
if (!is.null(delta_g) & !is.null(delta_e) ) {
stop("At least one of 'delta_e' or 'delta_g' should be NULL!")
}
if (!is.null(evidence_thresh)) {
assertthat::assert_that(evidence_thresh >= 0.6 & evidence_thresh <= 1,
msg = "Posterior evidence threshold must be between (0.6, 1).\n")
} else {
message("Posterior evidence threshold hasn't been supplied.\n",
"Setting initial value to 0.8.\n")
evidence_thresh <- 0.8
}
assertthat::assert_that(efdr >= 0 & efdr <= 1)
# Extract predicted residual dispersion
epsilon <- matrixStats::colMedians(scmet_obj$posterior$epsilon)
# Extract dispersion parameter estimates
gamma <- matrixStats::colMedians(scmet_obj$posterior$gamma)
# If we want to compute HVF from residual dispersion
if (!is.null(delta_e)) {
eps_thresh <- stats::quantile(x = epsilon, probs = delta_e, na.rm = TRUE)
# HVF probability for a given epsilon threshold, \pi_{j}
prob <- matrixStats::colMeans2(scmet_obj$posterior$epsilon > eps_thresh)
} else {# Compute HVF from dispersion parameter
prob <- matrixStats::colMeans2(scmet_obj$posterior$gamma > delta_g)
}
# Threshold search and store optimal value
tmp <- .thresh_search(evidence_thresh = evidence_thresh,
prob = prob[!is.na(prob)], efdr = efdr,
task = "HVF detection", suffix = "")
optimal_evidence_thresh <- tmp$optimal_evidence_thresh
# Output preparation
mu <- matrixStats::colMedians(scmet_obj$posterior$mu)
hvf <- ifelse(prob > optimal_evidence_thresh[1], TRUE, FALSE)
feature_name <- scmet_obj$feature_names
# Create table with data
tbl <- cbind.data.frame(feature_name = feature_name,
mu = mu,
gamma = gamma,
epsilon = epsilon,
tail_prob = prob,
is_variable = hvf,
stringsAsFactors = FALSE)
# Re-order the table of results
tbl <- tbl[order(tbl$tail_prob, decreasing = TRUE, na.last = TRUE), ]
# Show summary of results
thresh_st <- ifelse(!is.null(delta_e),
paste("- The ", round(100 * delta_e, 2),
"percentile of variable genes \n"),
paste("- Overdispersion threshold =",
round(100 * delta_g, 2), "% \n"))
message(sum(hvf, na.rm = TRUE),
" Features classified as highly variable using: \n",
thresh_st,
"- Evidence threshold = ", optimal_evidence_thresh[1], "\n",
"- EFDR = ", round(100 * optimal_evidence_thresh[2], 2), "% \n",
"- EFNR = ", round(100 * optimal_evidence_thresh[3], 2), "% \n")
# Store results
scmet_obj$hvf <- list(summary = tbl,
evidence_thresh = optimal_evidence_thresh[1],
efdr = optimal_evidence_thresh[2],
efnr = optimal_evidence_thresh[3],
efdr_grid = tmp$efdr_grid,
efnr_grid = tmp$efnr_grid,
evidence_thresh_grid = tmp$evidence_thresh_grid,
target_efdr = efdr)
return(scmet_obj)
}
#' @name scmet_lvf
#' @aliases scmet_hvf_lvf detect_lvf
#' @rdname scmet_hvf_lvf
#'
#' @export
#'
scmet_lvf <- function(scmet_obj, delta_e = 0.1, delta_g = NULL,
evidence_thresh = 0.8, efdr = 0.1) {
# Perform checks
if (!(methods::is(scmet_obj, "scmet_mcmc") ||
methods::is(scmet_obj, "scmet_vb")) ) {
stop("Posterior object is not generated from the given models.")
}
if (!is.null(delta_e)) {
assertthat::assert_that(delta_e >= 0 & delta_e <= 1)
}
if (!is.null(delta_g)) {
assertthat::assert_that(delta_g >= 0 & delta_g <= 1)
}
if (!is.null(delta_g) & !is.null(delta_e) ) {
stop("At least one of 'delta_e' or 'delta_g' should be NULL!")
}
if (!is.null(evidence_thresh)) {
assertthat::assert_that(evidence_thresh >= 0.6 & evidence_thresh <= 1,
msg = "Posterior evidence threshold must be between (0.6, 1).\n.")
} else {
message("Posterior evidence threshold hasn't been supplied.\n",
"Setting initial value to 0.8.\n")
evidence_thresh <- 0.8
}
assertthat::assert_that(efdr >= 0 & efdr <= 1)
# Extract predicted residual dispersion
epsilon <- matrixStats::colMedians(scmet_obj$posterior$epsilon)
# Extract dispersion parameter estimates
gamma <- matrixStats::colMedians(scmet_obj$posterior$gamma)
# Initialize variables
epsilon = gamma <- rep(-1, length(scmet_obj$feature_names))
# If we want to compute HVF from residual dispersion
if (!is.null(delta_e)) {
eps_thresh <- stats::quantile(x = epsilon, probs = delta_e, na.rm = TRUE)
# HVF probability for a given epsilon threshold, \pi_{j}
prob <- matrixStats::colMeans2(scmet_obj$posterior$epsilon < eps_thresh)
} else {# Compute HVF from dispersion parameter
prob <- matrixStats::colMeans2(scmet_obj$posterior$gamma < delta_g)
}
# Threshold search and store optimal value
tmp <- .thresh_search(evidence_thresh = evidence_thresh,
prob = prob[!is.na(prob)], efdr = efdr,
task = "LVF detection", suffix = "")
optimal_evidence_thresh <- tmp$optimal_evidence_thresh
# Output preparation
mu <- matrixStats::colMedians(scmet_obj$posterior$mu)
lvf <- ifelse(prob > optimal_evidence_thresh[1], TRUE, FALSE)
feature_name <- scmet_obj$feature_names
# Create table with data
tbl <- cbind.data.frame(feature_name = feature_name,
mu = mu,
gamma = gamma,
epsilon = epsilon,
tail_prob = prob,
is_variable = lvf,
stringsAsFactors = FALSE)
# # Re-order the table of results
tbl <- tbl[order(tbl$tail_prob, decreasing = TRUE, na.last = TRUE), ]
# Show summary of results
thresh_st <- ifelse(!is.null(delta_e),
paste("- The ", round(100 * delta_e, 2),
"percentile of variable genes \n"),
paste("- Overdispersion threshold =",
round(100 * delta_g, 2), "% \n"))
message(sum(lvf, na.rm = TRUE),
" Features classified as lowly variable using: \n",
thresh_st,
"- Evidence threshold = ", optimal_evidence_thresh[1], "\n",
"- EFDR = ", round(100 * optimal_evidence_thresh[2], 2), "% \n",
"- EFNR = ", round(100 * optimal_evidence_thresh[3], 2), "% \n")
# Store results
scmet_obj$lvf <- list(summary = tbl,
evidence_thresh = optimal_evidence_thresh[1],
efdr = optimal_evidence_thresh[2],
efnr = optimal_evidence_thresh[3],
efdr_grid = tmp$efdr_grid,
efnr_grid = tmp$efnr_grid,
evidence_thresh_grid = tmp$evidence_thresh_grid,
target_efdr = efdr)
return(scmet_obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.