# Global knitr chunk options for this tutorial: code is echoed, messages and
# warnings are hidden, and chunks are not evaluated (eval = FALSE).
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  eval = FALSE
)
This tutorial provides detailed instructions for using mLLMCelltype for cell type annotation in single-cell RNA sequencing data. We'll cover various usage scenarios, parameter configurations, and integration with Seurat.
The main function for cell type annotation with a single model:
library(mLLMCelltype)

# Call template for single-model annotation. `input`, `tissue_name` and
# `model` are placeholders that the caller must supply.
results <- annotate_cell_types(
  # Marker gene data (data frame, list, or file path)
  input,
  # Tissue name (e.g., "human PBMC", "mouse brain")
  tissue_name,
  # LLM model to use
  model,
  # API key (if not set in environment, NA returns prompt only)
  api_key = NA,
  # Number of top genes per cluster to use
  top_gene_count = 10,
  # Whether to print debugging information
  debug = FALSE
)
Function for creating consensus annotations from multiple models through interactive discussion:
# Call template for multi-model consensus annotation. `input` and `api_keys`
# are placeholders that the caller must supply.
consensus_results <- interactive_consensus_annotation(
  # Original marker gene data (Seurat FindAllMarkers result or list of genes)
  input,
  # Optional tissue name
  tissue_name = NULL,
  # Models to use
  models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
  # Named list of API keys
  api_keys,
  # Number of top genes to use
  top_gene_count = 10,
  # Threshold for identifying controversial clusters
  controversy_threshold = 0.7,
  # Entropy threshold for controversial clusters
  entropy_threshold = 1.0,
  # Maximum discussion rounds
  max_discussion_rounds = 3,
  # Model to use for consensus checking (see the recommendations in the text)
  consensus_check_model = NULL,
  # Directory for logs
  log_dir = "logs",
  # NULL uses the default system cache directory
  cache_dir = NULL,
  # Whether to use cache
  use_cache = TRUE
)
Important Note on consensus_check_model: This parameter is used for evaluating semantic similarity, calculating consensus metrics, and moderating discussions. We recommend using capable models such as:
- claude-sonnet-4-6 (Anthropic)
- claude-opus-4-1-20250805 (Anthropic)
- o1 (OpenAI)
- gpt-5.5 (OpenAI)
- gemini-3.1-pro-preview (Google)
See the main README for detailed recommendations and examples.
For quick exploration or when API usage is a concern:
# Load the pbmc_small example data
library(Seurat)
data("pbmc_small")

# Positive marker genes for every cluster
pbmc_markers <- FindAllMarkers(
  pbmc_small,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

# Annotate the clusters with a single model
results <- annotate_cell_types(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  top_gene_count = 10
)

# Translate each cell's cluster identity into its annotation and store the
# result as a metadata column on the Seurat object
cluster_ids <- as.character(Idents(pbmc_small))
pbmc_small$cell_type_claude <- plyr::mapvalues(
  x = cluster_ids,
  from = names(results),
  to = results
)

# Plot the cells coloured by the new annotation
DimPlot(pbmc_small, group.by = "cell_type_claude", label = TRUE)
For publication-quality annotations with uncertainty quantification:
# Models to query, one per provider
models <- c(
  "claude-sonnet-4-6",      # Anthropic
  "gpt-5.5",                # OpenAI
  "gemini-3.1-pro-preview", # Google
  "grok-4.3"                # X.AI
)

# API keys, named by provider so they can be looked up via get_provider()
api_keys <- list(
  anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
  openai = Sys.getenv("OPENAI_API_KEY"),
  gemini = Sys.getenv("GEMINI_API_KEY"),
  grok = Sys.getenv("GROK_API_KEY")
)

# Run the annotation once per model, keeping each result under the model name
results <- list()
for (model in models) {
  provider <- get_provider(model)
  api_key <- api_keys[[provider]]

  results[[model]] <- annotate_cell_types(
    input = pbmc_markers,
    tissue_name = "human PBMC",
    model = model,
    api_key = api_key,
    top_gene_count = 10
  )
}

# Create the consensus across all models defined above
consensus_results <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  models = models,
  api_keys = api_keys,
  controversy_threshold = 0.7,
  entropy_threshold = 1.0,
  consensus_check_model = "claude-sonnet-4-6"
)

# The final annotations are available as consensus_results$final_annotations

# Add the consensus annotation to the Seurat object
pbmc_small$cell_type_consensus <- plyr::mapvalues(
  x = as.character(Idents(pbmc_small)),
  from = names(consensus_results$final_annotations),
  to = consensus_results$final_annotations
)

# Collect consensus proportion and entropy for every cluster
consensus_metrics <- lapply(
  names(consensus_results$initial_results$consensus_results),
  function(cluster_id) {
    metrics <- consensus_results$initial_results$consensus_results[[cluster_id]]
    list(
      cluster = cluster_id,
      consensus_proportion = metrics$consensus_proportion,
      entropy = metrics$entropy
    )
  }
)

# One row per cluster for easier handling
metrics_df <- do.call(rbind, lapply(consensus_metrics, data.frame))

# mapvalues() is applied to a character vector and therefore returns a
# character vector; convert back to numeric so the metrics are stored as
# numbers (matching the complete Seurat workflow example in this tutorial).
pbmc_small$consensus_proportion <- as.numeric(plyr::mapvalues(
  x = as.character(Idents(pbmc_small)),
  from = metrics_df$cluster,
  to = metrics_df$consensus_proportion
))

# Add entropy to the Seurat object, again as a numeric column
pbmc_small$shannon_entropy <- as.numeric(plyr::mapvalues(
  x = as.character(Idents(pbmc_small)),
  from = metrics_df$cluster,
  to = metrics_df$entropy
))
For users with limited API credits or budget constraints:
# Read the OpenRouter API key from the environment
openrouter_api_key <- Sys.getenv("OPENROUTER_API_KEY")

# Free OpenRouter models to use. Each model is listed exactly once: a repeated
# entry would be queried twice, overwrite its own slot in `free_results`, and
# be passed twice to the consensus step.
free_models <- c(
  "meta-llama/llama-4-maverick:free",       # Meta Llama 4 Maverick (free)
  "meta-llama/llama-3.3-70b-instruct:free", # Meta Llama 3.3 70B (free)
  "deepseek/deepseek-v4-pro:free"           # DeepSeek V4 Pro (free)
)

# Run the annotation once per free model
free_results <- list()
for (model in free_models) {
  free_results[[model]] <- annotate_cell_types(
    input = pbmc_markers,
    tissue_name = "human PBMC",
    # OpenRouter models are automatically detected by format:
    # 'provider/model-name:free'
    model = model,
    api_key = openrouter_api_key,
    top_gene_count = 10
  )
}

# Create the consensus across all free models defined above
free_consensus_results <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  models = free_models,
  api_keys = list("openrouter" = openrouter_api_key),
  controversy_threshold = 0.7,
  entropy_threshold = 1.0,
  # Use a free model for consensus checking
  consensus_check_model = "meta-llama/llama-4-maverick:free"
)

# The final annotations are available as
# free_consensus_results$final_annotations

# Add the free-model consensus annotation to the Seurat object
pbmc_small$free_model_consensus <- plyr::mapvalues(
  x = as.character(Idents(pbmc_small)),
  from = names(free_consensus_results$final_annotations),
  to = free_consensus_results$final_annotations
)

# Compare paid vs. free model results (requires `consensus_results` from the
# multi-model example)
comparison <- data.frame(
  cluster = names(consensus_results$final_annotations),
  paid_models = consensus_results$final_annotations,
  free_models = free_consensus_results$final_annotations,
  agreement = consensus_results$final_annotations ==
    free_consensus_results$final_annotations
)
print(comparison)
For users who prefer working with files:
# Write the marker table to disk (without row names)
marker_file <- "pbmc_markers.csv"
write.csv(pbmc_markers, marker_file, row.names = FALSE)

# Pass the CSV path, rather than the data frame, as the annotation input
results <- annotate_cell_types(
  input = marker_file,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)
For better control over caching behavior:
# Note: annotate_cell_types() does not have built-in caching. If you need
# caching, implement it separately around the call below.
results <- annotate_cell_types(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  top_gene_count = 10,
  debug = FALSE
)

# Conceptual sketch of a custom cache manager. This is NOT part of the
# package; substitute your own implementation.
# cache_manager <- YourCacheManager$new(cache_dir = "path/to/cache")
# cache_manager$clear_cache()
mLLMCelltype supports a wide range of LLM models. Here's a guide to help you choose:
For the most accurate annotations:
- `claude-opus-4-7`
- `claude-sonnet-4-6`
- `gpt-5.5`
- `gemini-3.1-pro-preview`

For good results with lower API costs:
- `grok-4.3`: Competitive performance at lower cost
- `deepseek-v4-flash`: Good performance for specialized tissues

For preliminary exploration or large datasets:
- `qwen3.6-flash`: Good performance for the cost
- `glm-5-turbo`: Economical option with decent performance
- `MiniMax-M2.5`: Cost-effective for initial exploration

For users with limited API credits or budget constraints:
- `meta-llama/llama-4-maverick:free`: Most reliable and fast, recommended for consensus checking
- `meta-llama/llama-3.3-70b-instruct:free`: Good performance with consistent formatting
- `deepseek/deepseek-v4-pro:free`: Reasoning model (free)

Based on our testing, we recommend the following free models:
- `meta-llama/llama-4-maverick:free`: Most reliable and fast, recommended for consensus checking
- `meta-llama/llama-3.3-70b-instruct:free`: Good performance with consistent formatting
- `deepseek/deepseek-v4-pro:free`: Reliable with good response time

Some models may have limitations:
- `deepseek/deepseek-v4-pro:free`: May occasionally return empty results
- `thudm/glm-z1-9b:free`: May return localized error messages ("non-character parameter") when used for consensus checking

These free models are accessed through OpenRouter and don't consume credits, but may have limitations compared to paid models. Use the `:free` suffix in the model name to access them.
# Example: using a free model via OpenRouter.
# Step 1: put your OpenRouter API key into the environment (replace the
# placeholder string with your real key).
Sys.setenv(OPENROUTER_API_KEY = "your-openrouter-api-key")

# Step 2: request a free model by adding the :free suffix to its name.
# No provider argument is passed; per this tutorial the provider is detected
# automatically from the model name format.
free_model_results <- annotate_cell_types(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  model = "meta-llama/llama-4-maverick:free", # note the :free suffix
  api_key = Sys.getenv("OPENROUTER_API_KEY")
)
Here's a complete example of integrating mLLMCelltype with a Seurat workflow:
library(Seurat)
library(mLLMCelltype)
library(ggplot2)

# Load data
data("pbmc_small")

# Standard Seurat preprocessing, clustering and UMAP embedding
pbmc_small <- NormalizeData(pbmc_small)
pbmc_small <- FindVariableFeatures(pbmc_small)
pbmc_small <- ScaleData(pbmc_small)
pbmc_small <- RunPCA(pbmc_small)
pbmc_small <- FindNeighbors(pbmc_small)
pbmc_small <- FindClusters(pbmc_small, resolution = 0.5)
pbmc_small <- RunUMAP(pbmc_small, dims = 1:10)

# Positive marker genes for each cluster
pbmc_markers <- FindAllMarkers(
  pbmc_small,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

# Models to use
models <- c(
  "claude-sonnet-4-6",
  "gpt-5.5",
  "gemini-3.1-pro-preview"
)

# API keys, named by provider
api_keys <- list(
  anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
  openai = Sys.getenv("OPENAI_API_KEY"),
  gemini = Sys.getenv("GEMINI_API_KEY")
)

# Annotate with every model and store each result both in `results` and as
# its own metadata column on the Seurat object
results <- list()
for (model in models) {
  provider <- get_provider(model)
  api_key <- api_keys[[provider]]

  results[[model]] <- annotate_cell_types(
    input = pbmc_markers,
    tissue_name = "human PBMC",
    model = model,
    api_key = api_key,
    top_gene_count = 10
  )

  # Column name: "cell_type_" plus the model name with every
  # non-alphanumeric character replaced by "_"
  column_name <- paste0("cell_type_", gsub("[^a-zA-Z0-9]", "_", model))
  pbmc_small[[column_name]] <- plyr::mapvalues(
    x = as.character(Idents(pbmc_small)),
    from = names(results[[model]]),
    to = results[[model]]
  )
}

# Consensus across all models defined above
consensus_results <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  models = models,
  api_keys = api_keys,
  controversy_threshold = 0.7,
  entropy_threshold = 1.0,
  consensus_check_model = "claude-sonnet-4-6"
)

# Per-cell cluster labels, reused for each metadata column added below
cell_clusters <- as.character(Idents(pbmc_small))

# Consensus annotation per cell
pbmc_small$cell_type_consensus <- plyr::mapvalues(
  x = cell_clusters,
  from = names(consensus_results$final_annotations),
  to = consensus_results$final_annotations
)

# Gather consensus proportion and entropy for every cluster
per_cluster <- consensus_results$initial_results$consensus_results
consensus_metrics <- lapply(names(per_cluster), function(id) {
  cluster_stats <- per_cluster[[id]]
  list(
    cluster = id,
    consensus_proportion = cluster_stats$consensus_proportion,
    entropy = cluster_stats$entropy
  )
})

# One row per cluster
metrics_df <- do.call(rbind, lapply(consensus_metrics, data.frame))

# Consensus proportion per cell, stored as numeric
pbmc_small$consensus_proportion <- as.numeric(plyr::mapvalues(
  x = cell_clusters,
  from = metrics_df$cluster,
  to = metrics_df$consensus_proportion
))

# Shannon entropy per cell, stored as numeric
pbmc_small$shannon_entropy <- as.numeric(plyr::mapvalues(
  x = cell_clusters,
  from = metrics_df$cluster,
  to = metrics_df$entropy
))

# Shared theme element that centres the plot title
centered_title <- theme(plot.title = element_text(hjust = 0.5))

# Visualize results
p1 <- DimPlot(
  pbmc_small,
  group.by = "cell_type_consensus",
  label = TRUE,
  repel = TRUE
) +
  ggtitle("Cell Type Annotations") +
  centered_title

p2 <- FeaturePlot(
  pbmc_small,
  features = "consensus_proportion",
  cols = c("yellow", "green", "blue")
) +
  ggtitle("Consensus Proportion") +
  centered_title

p3 <- FeaturePlot(
  pbmc_small,
  features = "shannon_entropy",
  cols = c("red", "orange")
) +
  ggtitle("Shannon Entropy") +
  centered_title

# Combine plots side by side
p1 | p2 | p3
The top_gene_count parameter controls how many top marker genes per cluster are used for annotation:
# Both runs use the same Anthropic key, read once from the environment
anthropic_key <- Sys.getenv("ANTHROPIC_API_KEY")

# More genes per cluster (better for well-characterized tissues)
results_more_genes <- annotate_cell_types(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = anthropic_key,
  top_gene_count = 20
)

# Fewer genes per cluster (better for noisy data)
results_fewer_genes <- annotate_cell_types(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = anthropic_key,
  top_gene_count = 5
)
The controversy_threshold parameter in the interactive_consensus_annotation function controls which clusters are considered controversial and require discussion:
# Running interactive_consensus_annotation() with two different
# controversy thresholds. Both runs share the same models and API keys.
threshold_models <- c("claude-sonnet-4-6", "gpt-5.5", "gemini-3-flash-preview")
threshold_api_keys <- list(
  "anthropic" = Sys.getenv("ANTHROPIC_API_KEY"),
  "openai" = Sys.getenv("OPENAI_API_KEY"),
  "gemini" = Sys.getenv("GEMINI_API_KEY")
)

# Lower threshold (0.3). This tutorial states that more clusters will be
# discussed with a lower threshold.
# NOTE(review): confirm the direction of the threshold against the
# interactive_consensus_annotation() reference before relying on it.
consensus_results_low_threshold <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  models = threshold_models,
  api_keys = threshold_api_keys,
  controversy_threshold = 0.3
)

# Higher threshold (0.7). This tutorial states that fewer clusters will be
# discussed with a higher threshold (same caveat as above).
consensus_results_high_threshold <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  models = threshold_models,
  api_keys = threshold_api_keys,
  controversy_threshold = 0.7
)
Different LLM providers have different rate limits and pricing:
To manage costs and rate limits:
Typical execution times:
To improve performance:
- Lower `top_gene_count` for faster execution
- Adjust `controversy_threshold` to reduce the number of clusters that require discussion

If you encounter "No auth credentials found" errors with OpenRouter:
- Check that the model name follows the OpenRouter format (`provider/model-name:free`)

If you see "non-character parameter" errors:
- Avoid `thudm/glm-z1-9b:free` for consensus checking
- Use `meta-llama/llama-4-maverick:free` for consensus checking

If a model returns empty results:
- `deepseek/deepseek-v4-pro:free` may occasionally return empty results

Now that you understand the detailed usage of mLLMCelltype, you can explore:
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.