Nothing
# Prompt templates for mLLMCelltype
#
# This file contains all prompt template functions used in mLLMCelltype.
# These functions create various prompts for different stages of the cell type annotation process.
extract_cluster_item_genes <- function(item) {
if (is.list(item) && "genes" %in% names(item)) {
return(as.character(item$genes))
}
if (is.character(item)) {
return(item)
}
stop("When input is a list, each element must be a character vector of genes or a list containing a 'genes' field")
}
#' Normalize list input into a canonical cluster->genes mapping
#'
#' For list input, each element can be either:
#' 1) a list containing a `genes` field, or
#' 2) a character vector of genes.
#'
#' Naming rules:
#' - unnamed lists are assigned 0-based IDs ("0", "1", ...)
#' - numeric names are preserved as-is (e.g., "1", "2", "3" stays unchanged)
#' - non-numeric names are preserved as-is
#'
#' @param input List input for cluster annotation
#' @return Named list of character vectors (cluster_id -> genes)
#' @keywords internal
normalize_cluster_gene_list <- function(input) {
if (!is.list(input) || is.data.frame(input)) {
stop("normalize_cluster_gene_list expects a list input")
}
names_vec <- names(input)
gene_vectors <- lapply(input, extract_cluster_item_genes)
if (is.null(names_vec)) {
canonical_names <- as.character(seq_along(gene_vectors) - 1)
} else {
canonical_names <- names_vec
}
if (anyDuplicated(canonical_names)) {
stop("Duplicate cluster IDs detected after normalization")
}
names(gene_vectors) <- canonical_names
gene_vectors
}
#' Extract marker genes for a discussion prompt
#'
#' @param input Marker gene input in list or data.frame format
#' @param cluster_id Cluster ID to extract
#' @param top_gene_count Maximum number of genes to include
#' @return Comma-separated marker genes for the requested cluster
#' @keywords internal
#' @noRd
extract_cluster_genes_for_discussion <- function(input, cluster_id, top_gene_count) {
cluster_key <- as.character(cluster_id)
if (is.list(input) && !is.data.frame(input)) {
cluster_names <- names(input)
if (is.null(cluster_names)) {
cluster_names <- as.character(seq_along(input) - 1)
}
if (anyDuplicated(cluster_names)) {
stop("Duplicate cluster IDs detected after normalization")
}
cluster_index <- match(cluster_key, cluster_names)
if (is.na(cluster_index)) {
stop(sprintf("Cluster '%s' was not found in list input", cluster_key))
}
genes <- extract_cluster_item_genes(input[[cluster_index]])
if (length(genes) == 0) {
stop(sprintf("No genes found for cluster '%s'", cluster_key))
}
return(paste(head(genes, top_gene_count), collapse = ","))
}
if (is.data.frame(input)) {
required_columns <- c("cluster", "gene")
if (!all(required_columns %in% names(input))) {
stop("Data frame input must contain 'cluster' and 'gene' columns")
}
cluster_mask <- as.character(input$cluster) == cluster_key
numeric_cluster <- suppressWarnings(as.numeric(cluster_key))
if (!any(cluster_mask) && !is.na(numeric_cluster)) {
cluster_mask <- suppressWarnings(as.numeric(input$cluster)) == numeric_cluster
cluster_mask[is.na(cluster_mask)] <- FALSE
}
cluster_data <- input[cluster_mask, , drop = FALSE]
if ("avg_log2FC" %in% names(cluster_data)) {
cluster_data <- cluster_data[!is.na(cluster_data$avg_log2FC) & cluster_data$avg_log2FC > 0, , drop = FALSE]
cluster_data <- cluster_data[order(cluster_data$avg_log2FC, decreasing = TRUE), , drop = FALSE]
}
genes <- as.character(cluster_data$gene)
genes <- genes[!is.na(genes) & nzchar(genes)]
if (length(genes) == 0) {
stop(sprintf("No genes found for cluster '%s'", cluster_key))
}
return(paste(head(genes, top_gene_count), collapse = ","))
}
stop("Input must be either a data.frame or a list of gene lists")
}
#' Create prompt for cell type annotation
#'
#' @param input Either a data frame from Seurat's FindAllMarkers() or a list for each cluster
#' where each element is either a character vector of genes or a list containing a `genes` field
#' Cluster IDs in named inputs are preserved as-is; unnamed list input receives sequential IDs starting at "0".
#' @param tissue_name Tissue context for the annotation (e.g., 'human PBMC', 'mouse brain')
#' @param top_gene_count Number of top genes to use per cluster when input is from Seurat. Default: 10
#'
#' @return A list with `prompt` (formatted prompt text), `expected_count`
#' (number of clusters), and `gene_lists` (cluster ID to marker genes mapping).
#' @importFrom magrittr "%>%"
#' @export
create_annotation_prompt <- function(input, tissue_name, top_gene_count = 10) {
if (is.list(input) && !is.data.frame(input)) {
normalized_input <- normalize_cluster_gene_list(input)
gene_lists <- list()
for (cluster_id in names(normalized_input)) {
genes <- normalized_input[[cluster_id]]
gene_lists[[cluster_id]] <- paste(genes, collapse = ", ")
}
expected_count <- length(normalized_input)
} else if (is.data.frame(input)) {
# Process Seurat differential gene table
# Cluster IDs are preserved as-is from the input data
markers <- input %>%
group_by(cluster) %>%
slice_max(avg_log2FC, n = top_gene_count, with_ties = FALSE) %>%
group_split()
gene_lists <- list()
for (marker_group in markers) {
cluster_id <- marker_group$cluster[1]
gene_lists[[as.character(cluster_id)]] <- paste(marker_group$gene, collapse = ',')
}
expected_count <- length(unique(input$cluster))
} else {
stop("Input must be either a data.frame (from Seurat) or a list of gene lists")
}
# Create formatted lines from gene_lists using actual names/indices
cluster_names <- names(gene_lists)
formatted_lines <- vapply(cluster_names, function(name) {
paste0(name, ": ", gene_lists[[name]])
}, character(1), USE.NAMES = FALSE)
# Sort numerically if all names are numeric
if (all(!is.na(suppressWarnings(as.numeric(cluster_names))))) {
formatted_lines <- formatted_lines[order(as.numeric(cluster_names))]
}
prompt <- paste0("You are a cell type annotation expert. Below are marker genes for different cell clusters in ",
tissue_name, ".\n\n",
paste(formatted_lines, collapse = "\n"),
"\n\nFor each cluster ID, provide only the cell type name in a new line, without any explanation.")
return(list(
prompt = prompt,
expected_count = expected_count,
gene_lists = gene_lists
))
}
#' Create prompt for checking consensus among model predictions
#
#
#
#
#' @keywords internal
create_consensus_check_prompt <- function(round_responses, controversy_threshold = 2/3, entropy_threshold = 1.0) {
# Format the predictions for Claude
paste(
"You are a cell type annotation expert. Below are different models' predictions for the same cell cluster.",
"Your task is to analyze these predictions and calculate uncertainty metrics.",
"",
"PREDICTIONS:",
paste(paste("Model", seq_along(round_responses), ":", round_responses), collapse = "\n"),
"",
"IMPORTANT GUIDELINES:",
"1. Consider predictions as matching if they refer to the same cell type, ignoring differences in:",
" - Formatting (e.g., 'NK cells' vs 'Natural Killer cells')",
" - Capitalization",
" - Additional qualifiers (e.g., 'activated', 'mature', etc.)",
"2. Group predictions that refer to the same cell type",
"3. If any prediction is 'Unknown' or 'Unclear', treat it as a separate group",
"",
"CALCULATE THE FOLLOWING METRICS:",
"1. Consensus Proportion = Number of models supporting the majority prediction / Total number of models",
"2. Shannon Entropy = -sum(p_i * log2(p_i)) where p_i is the proportion of models predicting each unique cell type",
sprintf("3. Determine if consensus is reached (Consensus Proportion >= %s AND Entropy <= %s)",
format(controversy_threshold, nsmall=1), format(entropy_threshold, nsmall=1)),
"",
"RESPONSE FORMAT:",
"Line 1: 1 if consensus is reached, 0 if not",
"Line 2: Consensus Proportion (a decimal between 0 and 1)",
"Line 3: Shannon Entropy (a decimal number)",
"Line 4: The majority cell type prediction",
"",
"Example matches:",
"- 'NK cells' = 'Natural Killer cells'",
"- 'CD8+ T cells' = 'Cytotoxic T cells'",
"- 'B cells' = 'B lymphocytes'",
"",
"RESPOND WITH EXACTLY FOUR LINES AS SPECIFIED ABOVE.",
sep = "\n"
)
}
#' Create prompt for additional discussion rounds
#
#
#
#
#
#
#' @keywords internal
create_discussion_prompt <- function(cluster_id,
cluster_genes,
tissue_name,
previous_rounds,
round_number) {
# Compile previous discussion history
discussion_history <- sapply(previous_rounds, function(round) {
responses <- round$responses
paste(sprintf("Round %d:\n%s\n",
round$round_number,
paste(sprintf("%s:\n%s\n",
names(responses),
responses),
collapse = "\n")))
})
sprintf(
"We are continuing the discussion for cluster %s (marker genes: %s%s).
Previous discussion:
%s
This is round %d of the discussion.
Using the Toulmin argumentation model, please structure your response as follows:
1. CLAIM: State your clear cell type prediction (e.g., 'This is a T cell')
2. GROUNDS/DATA: Present specific observable evidence that supports your claim (e.g., marker genes expressed in this cluster)
3. WARRANT: Explain the logical reasoning that connects your grounds to your claim (e.g., why these genes imply this cell type)
4. BACKING: Provide additional support for your warrant (e.g., citations, references, established knowledge in the field)
5. QUALIFIER: Indicate any conditions or limitations affecting the certainty of your claim (e.g., 'probably', 'likely', 'almost certainly')
6. REBUTTAL: Acknowledge possible counter-arguments or exceptions (including addressing other models' predictions)
Based on previous discussion, also indicate:
- Whether you agree or disagree with any emerging consensus
- If you've revised your previous position, explain why
Format your response as:
CELL TYPE: [your current prediction]
GROUNDS: [specific marker genes and expression evidence supporting your claim]
WARRANT: [logical connection between your evidence and claim]
BACKING: [additional support for your reasoning]
QUALIFIER: [degree of certainty - definite, probable, possible, etc.]
REBUTTAL: [addressing counter-arguments or alternative interpretations]
CONSENSUS STATUS: [Agree/Disagree with emerging consensus]",
cluster_id,
cluster_genes,
if (!is.null(tissue_name)) sprintf(" from %s", tissue_name) else "",
paste(discussion_history, collapse = "\n\n"),
round_number
)
}
#' Create prompt for the initial round of discussion
#
#
#
#
#
#' @keywords internal
create_initial_discussion_prompt <- function(cluster_id,
cluster_genes,
tissue_name,
initial_predictions) {
sprintf(
"We are analyzing cluster %s with the following marker genes: %s%s
Different models have made different predictions:
%s
Please provide your cell type prediction using the Toulmin argumentation model to structure your response:
1. CLAIM: State your clear cell type prediction (e.g., 'This is a T cell')
2. GROUNDS/DATA: Present specific observable evidence that supports your claim (e.g., marker genes expressed in this cluster)
3. WARRANT: Explain the logical reasoning that connects your grounds to your claim (e.g., why these genes imply this cell type)
4. BACKING: Provide additional support for your warrant (e.g., citations, references, established knowledge in the field)
5. QUALIFIER: Indicate any conditions or limitations affecting the certainty of your claim (e.g., 'probably', 'likely', 'almost certainly')
6. REBUTTAL: Acknowledge possible counter-arguments or exceptions (including addressing other models' predictions)
Format your response as:
CELL TYPE: [your predicted cell type]
GROUNDS: [specific marker genes and expression evidence supporting your claim]
WARRANT: [logical connection between your evidence and claim]
BACKING: [additional support for your reasoning]
QUALIFIER: [degree of certainty - definite, probable, possible, etc.]
REBUTTAL: [addressing counter-arguments or alternative interpretations]",
cluster_id,
cluster_genes,
if (!is.null(tissue_name)) sprintf(" from %s", tissue_name) else "",
paste(sapply(names(initial_predictions), function(model_name) {
pred <- initial_predictions[[model_name]]
# Check if pred is a list with named elements
if (is.list(pred) && !is.null(names(pred))) {
# If it's a structured list, try to extract the prediction for this cluster
cell_type <- if (!is.null(pred[[as.character(cluster_id)]])) {
pred[[as.character(cluster_id)]]
} else {
"No prediction"
}
} else if (is.character(pred)) {
# If it's a character vector, try to find the line for this cluster
cell_type <- "No prediction"
# Check if there are predictions with cluster ID
has_cluster_id_format <- FALSE
for (line in pred) {
if (trimws(line) == "") next
parts <- strsplit(line, ":", fixed = TRUE)[[1]]
if (length(parts) >= 2 && trimws(parts[1]) == as.character(cluster_id)) {
cell_type <- trimws(paste(parts[-1], collapse = ":"))
has_cluster_id_format <- TRUE
break
}
}
} else {
cell_type <- "No prediction"
}
sprintf("%s: %s", model_name, cell_type)
}), collapse = "\n")
)
}
#' Create prompt for standardizing cell type names
#
#
#' @keywords internal
create_standardization_prompt <- function(all_cell_types) {
paste0(
"I need to standardize the following cell type names to ensure consistent nomenclature while preserving biological subtypes and specificity. \n\n",
"IMPORTANT GUIDELINES:\n",
"1. Preserve cell subtype information: Do NOT collapse specific subtypes into general categories (e.g., DO NOT convert 'Memory B cell' to just 'B cell').\n",
"2. Standardize expression variations: Only standardize different ways of expressing the same biological entity (e.g., 'CD4+ T cell' and 'Helper T cell' can be standardized to 'CD4+ T cell').\n",
"3. Maintain granularity: If a cell type has specific markers or functional designations, preserve that information.\n",
"4. Use widely accepted nomenclature: When standardizing, use the most scientifically accepted term.\n",
"5. Be consistent with surface markers: Use consistent formatting for surface markers (e.g., CD4+, CD8+).\n\n",
"Examples of CORRECT standardization:\n",
"- 'Helper T cell' -> 'CD4+ T cell' (same biological entity, standard nomenclature)\n",
"- 'CD14+ monocyte' -> 'Classical monocyte' (if they are biologically equivalent)\n",
"- 'B-lymphocyte' -> 'B cell' (expression variation)\n\n",
"Examples of INCORRECT standardization:\n",
"- 'Memory B cell' -> 'B cell' (loses subtype information)\n",
"- 'Regulatory T cell' -> 'CD4+ T cell' (loses functional subtype)\n",
"- 'Classical monocyte' -> 'Monocyte' (loses subtype information)\n\n",
"Please provide a standardized name for each cell type in the exact format: 'ORIGINAL: STANDARDIZED'.\n",
"Do not add any additional text, explanations, or formatting.\n\n",
"Here are the cell types to standardize:\n\n",
paste(all_cell_types, collapse = "\n")
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.