Nothing
#' Annotate Gene Expression Matrix and Remove Duplicated Genes
#'
#' @description
#' Annotates an expression matrix with gene symbols using provided annotation
#' data,
#' filters out missing or invalid symbols, handles duplicate gene entries, and
#' removes uninformative rows. The function supports multiple aggregation
#' methods
#' for resolving duplicate gene symbols.
#'
#' @details
#' The function performs the following operations:
#' \enumerate{
#' \item Filters probes with missing symbols or labeled as `"NA_NA"`
#' \item Matches probes between expression set and annotation data
#' \item Merges annotation with expression data
#' \item Handles duplicate gene symbols using specified aggregation method
#' \item Removes rows with all zeros, all NAs, or missing values in the first
#' column
#' }
#'
#' @param eset Expression matrix or ExpressionSet object containing gene
#' expression data.
#' @param annotation Data frame containing annotation information for probes.
#' Built-in options include `anno_hug133plus2`, `anno_rnaseq`, and
#' `anno_illumina`.
#' @param symbol Character string specifying the column name in `annotation`
#' that represents gene symbols. Default is `"symbol"`.
#' @param probe Character string specifying the column name in `annotation`
#' that represents probe identifiers. Default is `"probe_id"`.
#' @param method Character string specifying the aggregation method for
#' duplicate gene symbols. Options are `"mean"`, `"sum"`, or `"sd"`.
#' Default is `"mean"`.
#'
#' @return Annotated and cleaned gene expression matrix with gene symbols as
#' row names.
#'
#' @author Dongqiang Zeng
#' @export
#'
#' @examples
#' # Create a small example expression matrix
#' eset_mat <- matrix(runif(100), nrow = 10, ncol = 10)
#' rownames(eset_mat) <- paste0("Probe", 1:10)
#' colnames(eset_mat) <- paste0("Sample", 1:10)
#'
#' # Create a matching annotation data frame
#' anno_df <- data.frame(
#' probe_id = paste0("Probe", 1:10),
#' symbol = c("Gene1", "Gene1", "Gene2", "Gene3", "Gene4",
#' "Gene5", "Gene6", "Gene7", "Gene8", "Gene9")
#' )
#'
#' # Annotate
#' result <- anno_eset(eset = eset_mat, annotation = anno_df)
#' head(result)
anno_eset <- function(eset,
annotation,
symbol = "symbol",
probe = "probe_id",
method = "mean") {
# Input validation
if (is.null(eset)) return(NULL)
if (is.null(annotation)) return(eset)
if (is.null(eset)) {
cli::cli_abort("{.arg eset} cannot be NULL.")
}
if (!is.matrix(eset) && !is.data.frame(eset)) {
cli::cli_abort(c(
"Invalid type for {.arg eset}.",
"i" = "Expected a matrix or data frame, got {.cls {class(eset)}}."
))
}
if (nrow(eset) == 0) {
cli::cli_abort("{.arg eset} has no rows.")
}
if (is.null(annotation) || !is.data.frame(annotation)) {
cli::cli_abort(c(
"Invalid type for {.arg annotation}.",
"i" = "Expected a data frame, got {.cls {class(annotation)}}."
))
}
if (!symbol %in% colnames(annotation)) {
cli::cli_abort("Symbol column {.val {symbol}} not found in annotation.")
}
if (!probe %in% colnames(annotation)) {
cli::cli_abort("Probe column {.val {probe}} not found in annotation.")
}
method <- rlang::arg_match(method, c("mean", "sum", "sd"))
# Prepare annotation
annotation <- as.data.frame(annotation)
colnames(annotation)[colnames(annotation) == symbol] <- "symbol"
colnames(annotation)[colnames(annotation) == probe] <- "probe_id"
annotation <- annotation[, c("probe_id", "symbol")]
# Filter out NA and invalid symbols
valid_symbols <- !is.na(annotation$symbol) & annotation$symbol != "NA_NA"
annotation <- annotation[valid_symbols, , drop = FALSE]
cli::cli_alert_info("Row number of original eset: {nrow(eset)}")
# Match probes
matched_probes <- rownames(eset) %in% annotation$probe_id
anno_count <- sum(matched_probes) / length(matched_probes)
cli::cli_alert_success(paste0(
format(100 * anno_count, digits = 2),
"% of probes in expression set were annotated"
))
if (sum(matched_probes) == 0) {
cli::cli_abort(c(
"No probes matched between eset and annotation.",
"i" = paste(
"Check that probe ID formats match between eset rownames and",
"annotation."
),
"*" = "Example eset rowname: {.val {rownames(eset)[1]}}",
"*" = "Example annotation probe: {.val {annotation$probe_id[1]}}"
))
}
# Filter eset to matched probes
eset <- eset[matched_probes, , drop = FALSE]
annotation <- annotation[annotation$probe_id %in% rownames(eset), ,
drop = FALSE
]
# Merge
eset <- as.data.frame(eset)
eset <- tibble::rownames_to_column(eset, var = "id")
eset <- merge(annotation, eset, by.x = "probe_id", by.y = "id", all = FALSE)
eset$probe_id <- NULL
# Handle duplicates
n_dups <- nrow(eset) - length(unique(eset$symbol))
if (n_dups > 0) {
cli::cli_alert_info(
"Found {n_dups} duplicate symbol{?s}, using {.val {method}} method"
)
agg_col <- setdiff(colnames(eset), "symbol")
order_index <- switch(method,
"mean" = rowMeans(eset[, agg_col, drop = FALSE], na.rm = TRUE),
"sum" = rowSums(eset[, agg_col, drop = FALSE], na.rm = TRUE),
"sd" = apply(eset[, agg_col, drop = FALSE], 1, stats::sd, na.rm = TRUE)
)
eset <- eset[order(order_index, decreasing = TRUE), , drop = FALSE]
eset <- dplyr::distinct(eset, symbol, .keep_all = TRUE)
}
eset <- tibble::column_to_rownames(eset, var = "symbol")
# Remove uninformative rows
eset <- eset[rowSums(eset == 0) < ncol(eset), , drop = FALSE]
eset <- eset[rowSums(is.na(eset)) < ncol(eset), , drop = FALSE]
eset <- eset[!is.na(eset[, 1]), , drop = FALSE]
cli::cli_alert_info(
"Row number after filtering duplicated gene symbol: {nrow(eset)}"
)
as.matrix(eset)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.