Nothing
#' Identify Marker Features in Bulk Expression Data
#'
#' Identifies informative marker features across groups from bulk gene
#' expression or signature score matrices using Seurat workflows. Performs
#' feature selection, scaling, PCA, clustering, and marker discovery.
#'
#' @param pdata Data frame. Sample metadata.
#' @param eset Matrix. Gene expression or signature score matrix.
#' @param group Character. Column name in pdata specifying grouping variable.
#' @param id_pdata Character. Column name for sample IDs. Default is "ID".
#' @param nfeatures Integer. Number of top variable features to select. Default is 2000.
#' @param top_n Integer. Number of top markers to retain per cluster. Default is 20.
#' @param thresh.use Numeric. Threshold for marker selection. Default is 0.25.
#' @param only.pos Logical. Whether to retain only positive markers. Default is TRUE.
#' @param min.pct Numeric. Minimum expression percentage threshold. Default is 0.25.
#' @param npcs Integer. Number of principal components to use. Default is 30.
#'
#' @importFrom methods as
#' @importFrom tibble column_to_rownames
#' @return List with components: `sce` (Seurat object), `markers` (all markers), `top_markers` (top markers per group).
#' @export
#'
#' @examples
#' if (requireNamespace("Seurat", quietly = TRUE) && requireNamespace("Matrix", quietly = TRUE)) {
#' # Simulate data
#' set.seed(123)
#' sim_eset <- matrix(abs(rnorm(100 * 30)), 100, 30)
#' rownames(sim_eset) <- paste0("Gene", 1:100)
#' colnames(sim_eset) <- paste0("Sample", 1:30)
#'
#' sim_pdata <- data.frame(
#' ID = paste0("Sample", 1:30),
#' TMEcluster = rep(c("A", "B", "C"), each = 10)
#' )
#'
#' res <- find_markers_in_bulk(
#' pdata = sim_pdata, eset = sim_eset,
#' group = "TMEcluster", npcs = 5
#' )
#' if (!is.null(res)) head(res$top_markers)
#' }
find_markers_in_bulk <- function(pdata, eset, group, id_pdata = "ID", nfeatures = 2000, top_n = 20, thresh.use = 0.25, only.pos = TRUE, min.pct = 0.25, npcs = 30) {
if (is.null(pdata) || is.null(eset)) return(NULL)
# Check required packages
rlang::check_installed(c("Seurat", "Matrix"))
# 转换输入数据格式
if (!inherits(eset, "dgCMatrix")) {
eset <- methods::as(as.matrix(eset), "dgCMatrix")
}
# 处理元数据
if (ncol(pdata) == 2) {
pdata[, "newid"] <- pdata[, id_pdata]
}
pdata <- tibble::column_to_rownames(pdata, var = id_pdata)
pdata <- pdata[rownames(pdata) %in% colnames(eset), ]
# 创建Seurat对象
sce <- Seurat::CreateSeuratObject(
counts = eset,
meta.data = pdata,
min.cells = 0,
min.features = 0,
project = "sce"
)
# 获取版本号并进行规范比较
seurat_version <- packageVersion("Seurat")
v5_threshold <- package_version("5.0.0")
# 版本适配处理
if (seurat_version >= v5_threshold) {
# Seurat v5+ 的处理逻辑
message("Using Seurat v5+ workflow")
sce <- Seurat::NormalizeData(sce)
sce[["RNA"]]$data <- SeuratObject::LayerData(sce, assay = "RNA", layer = "counts") # 兼容v5的Layer访问
sce <- Seurat::ScaleData(sce, layer = "data")
} else {
# Seurat v4及以下版本的处理逻辑
message("Using Seurat v4 workflow")
sce <- Seurat::NormalizeData(sce)
sce <- Seurat::ScaleData(sce)
}
# 特征选择
tryCatch(
{
sce <- Seurat::FindVariableFeatures(
object = sce,
selection.method = "vst",
nfeatures = nfeatures,
mean.cutoff = c(0.1, 8),
dispersion.cutoff = c(1, Inf)
)
},
error = function(e) {
message("Error in FindVariableFeatures: ", e$message)
counts <- Seurat::GetAssayData(sce, assay = "RNA", slot = "counts")
gene_var <- apply(counts, 1, var)
top_genes <- names(sort(gene_var, decreasing = TRUE))[1:nfeatures]
Seurat::VariableFeatures(sce) <<- top_genes # Assign variable features
}
)
# 降维和标记物识别
if (ncol(sce) < npcs) {
warning(
"Number of samples (", ncol(sce), ") is less than requested PCs (", npcs, "). ",
"Setting npcs to ", max(1, ncol(sce) - 1), "."
)
npcs <- max(1, ncol(sce) - 1) # 确保至少为1
}
sce <- Seurat::RunPCA(
object = sce,
features = Seurat::VariableFeatures(sce),
npcs = npcs
)
Seurat::Idents(sce) <- as.character(sce[[group, drop = TRUE]])
# 查找差异表达基因
sce.markers <- Seurat::FindAllMarkers(
object = sce,
only.pos = only.pos,
min.pct = min.pct,
logfc.threshold = thresh.use,
test.use = "wilcox"
)
# 提取top标记物
topN <- sce.markers %>%
dplyr::group_by(cluster) %>%
dplyr::slice_max(order_by = avg_log2FC, n = top_n)
return(list(sce = sce, markers = sce.markers, top_markers = topN))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.