# Global knitr chunk options for this article: show the code, suppress
# messages and warnings, and do not evaluate the chunks when rendering.
knitr::opts_chunk$set(
  eval = FALSE,
  echo = TRUE,
  warning = FALSE,
  message = FALSE
)
This article explores advanced features of mLLMCelltype and presents practical examples demonstrating its application in various research contexts.
Cell types often exist in hierarchical relationships. For example, T cells can be further classified into CD4+ T cells, CD8+ T cells, regulatory T cells, etc. mLLMCelltype can be used in a multi-step workflow to capture these hierarchical relationships.
Here's a practical approach to perform hierarchical annotation:
library(mLLMCelltype)
library(Seurat)
library(dplyr)

# --- Level 1: broad cell types for every cluster ---------------------------
high_level_results <- annotate_cell_types(
  input = marker_data,
  tissue_name = "human PBMC",
  top_gene_count = 10,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

# Translate each cell's cluster identity into its broad label and store the
# result as a metadata column on the Seurat object.
seurat_obj$high_level_celltype <- plyr::mapvalues(
  x = as.character(Idents(seurat_obj)),
  from = names(high_level_results),
  to = high_level_results
)

# --- Level 2: refine the T cell compartment --------------------------------
# Keep only the cells whose broad label is exactly "T cells".
t_cells <- subset(seurat_obj, high_level_celltype == "T cells")

# Marker genes computed within the T cell subset only.
t_cell_markers <- FindAllMarkers(
  t_cells,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

# Annotate the subset again, this time with a T-cell-specific tissue context.
t_cell_subtypes <- annotate_cell_types(
  input = t_cell_markers,
  tissue_name = "human PBMC T cells",
  top_gene_count = 10,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

# --- Merge: write the refined labels back onto the full object -------------
# Start from the broad labels, then overwrite the T cell entries with
# "T cells: <subtype>" so the parent type stays recoverable from the label.
t_cell_barcodes <- WhichCells(t_cells)
seurat_obj$detailed_celltype <- seurat_obj$high_level_celltype
seurat_obj$detailed_celltype[t_cell_barcodes] <- plyr::mapvalues(
  x = as.character(Idents(t_cells)),
  from = names(t_cell_subtypes),
  to = paste0("T cells: ", t_cell_subtypes)
)
After creating hierarchical annotations, it's important to validate the consistency between levels:
# Check parent-child consistency between two annotation levels.
#
# A detailed label is expected to have the form "<parent>: <subtype>"; a label
# without ": " is treated as its own parent. Each detailed label's parent is
# compared with the corresponding high-level label.
#
# Args:
#   high_level:     vector of high-level labels (character or factor).
#   detailed_level: vector of detailed labels (character or factor), aligned
#                   element-wise with `high_level`.
#
# Returns:
#   A data.frame with columns `high_level`, `detailed_level` and `consistent`.
#   `consistent` is always TRUE or FALSE: a missing (NA) label on either side
#   is reported as inconsistent instead of propagating NA.
validate_hierarchy <- function(high_level, detailed_level) {
  # Coerce to character so factor input does not make strsplit() fail.
  detailed_parts <- strsplit(as.character(detailed_level), ": ", fixed = TRUE)

  # Parent type is the text before the first ": ". vapply() guarantees a
  # character vector even when the input is empty.
  parent_from_detailed <- vapply(
    detailed_parts,
    function(parts) parts[1],
    character(1)
  )
  high_level_chr <- as.character(high_level)

  # Treat NA on either side as a mismatch so the row is surfaced below.
  consistent <- !is.na(parent_from_detailed) &
    !is.na(high_level_chr) &
    parent_from_detailed == high_level_chr

  # The original vectors are stored unchanged, so any names on `high_level`
  # still become the row names of the result.
  data.frame(
    high_level = high_level,
    detailed_level = detailed_level,
    consistent = consistent
  )
}

# Apply validation
hierarchy_validation <- validate_hierarchy(
  seurat_obj$high_level_celltype,
  seurat_obj$detailed_celltype
)

# Identify inconsistencies
inconsistencies <- hierarchy_validation[!hierarchy_validation$consistent, ]
print(inconsistencies)
Real-world scRNA-seq data often contains noise. Here are practical strategies for handling noisy input:
For noisy datasets, using fewer top genes can help focus on the strongest signals:
# Noisy data: restrict the prompt to the 5 top-ranked marker genes per
# cluster so that only the strongest signals are considered.
results_fewer_genes <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  top_gene_count = 5,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)
Pre-filtering marker genes with stricter thresholds can improve annotation quality:
# Keep only markers that pass both stricter cut-offs:
# adjusted p-value below 0.01 and average log2 fold change above 1.
filtered_markers <- filter(
  marker_data,
  p_val_adj < 0.01,
  avg_log2FC > 1.0
)

# Annotate using the filtered marker table
results_filtered <- annotate_cell_types(
  input = filtered_markers,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)
The consensus approach can help overcome noise by combining predictions from multiple models:
# One API key per provider, read from environment variables
api_keys <- list(
  anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
  openai = Sys.getenv("OPENAI_API_KEY"),
  gemini = Sys.getenv("GEMINI_API_KEY")
)

# Models whose predictions are combined
models <- c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview")

# Build the consensus annotation across all models
consensus_results <- interactive_consensus_annotation(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  models = models,
  api_keys = api_keys,
  consensus_check_model = "claude-sonnet-4-6",
  controversy_threshold = 0.7,
  entropy_threshold = 1.0
)
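When working with data affected by batch effects, you can lower the consensus thresholds so that more clusters are discussed, or describe the batch effects in the tissue context: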
# Batch-affected data: run the consensus workflow with lower thresholds.
# NOTE: `api_keys` must already be defined (see the consensus example).
batch_consensus <- interactive_consensus_annotation(
  input = marker_data, # Your marker gene data with batch effects
  tissue_name = "mouse brain",
  api_keys = api_keys,
  models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
  entropy_threshold = 0.8, # Lower entropy threshold
  controversy_threshold = 0.4 # Lower threshold to discuss more clusters
)
# Alternative: mention the batch effects directly in the tissue description
# that is passed to the model.
batch_aware_results <- annotate_cell_types(
  input = marker_data, # Your marker gene data with batch effects
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  tissue_name = "mouse brain with technical batch effects" # Batch context
)
One of the key features of mLLMCelltype is the ability to incorporate domain knowledge through the tissue_name parameter. This provides important context to the LLM:
# Both calls use the same markers and model; only `tissue_name` differs.

# (a) Generic tissue description
basic_results <- annotate_cell_types(
  input = marker_data,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  tissue_name = "human sample" # Generic context
)

# (b) Detailed tissue description
specific_results <- annotate_cell_types(
  input = marker_data,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  tissue_name = "human fetal liver at 20 weeks gestation" # Detailed context
)
For advanced use cases, you can create and modify the annotation prompt directly:
# Build the standard annotation prompt for the marker data
custom_prompt <- create_annotation_prompt(
  input = marker_data,
  tissue_name = "human PBMC",
  top_gene_count = 10
)

# Extra disease context to append after the generated prompt text
additional_context <- paste0(
  "\n\nAdditional context: This sample is from a patient with rheumatoid arthritis. ",
  "Previous studies have identified activated T cells, B cells, and CXCR4-high monocytes in this condition."
)
modified_prompt <- paste0(custom_prompt$prompt, additional_context)

# Send the modified prompt to the model directly
custom_results <- get_model_response(
  prompt = modified_prompt,
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)
You can enhance your annotation workflow by combining mLLMCelltype with other R packages and resources:
library(Seurat)
library(dplyr)

# Example: Using CellMarker database information to validate annotations
# This is a conceptual example - implementation would depend on your specific needs

# 1. Get annotations with mLLMCelltype
annotations <- annotate_cell_types(
  input = marker_data,
  tissue_name = "human PBMC",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

# 2. Compare with known marker genes (conceptual)
# In a real workflow, you would query a database or use a reference dataset
known_markers <- list(
  "T cells" = c("CD3D", "CD3E", "CD3G"),
  "B cells" = c("CD19", "MS4A1", "CD79A"),
  "Monocytes" = c("CD14", "LYZ", "CSF1R")
)

# 3. Validate annotations against known markers
#
# For each annotated cluster, take its 20 markers with the highest
# avg_log2FC and count how many of the known markers for the predicted cell
# type are among them.
#
# Args:
#   annotations:   predicted cell types, one per cluster. When the vector is
#                  named, the names are used as cluster IDs (this matters for
#                  clusterings whose IDs do not start at 1, e.g. "0", "1", ...);
#                  otherwise the positions 1..n are used.
#   marker_data:   data frame with columns `cluster`, `gene`, `avg_log2FC`.
#   known_markers: named list mapping a cell type to its expected marker genes.
#
# Returns:
#   A list with one entry per cluster, each a list with `cluster`,
#   `predicted_type`, `overlap_count`, `overlap_genes` and `confidence`
#   (fraction of expected markers found; 0 when the type is not in
#   `known_markers`).
validate_annotations <- function(annotations, marker_data, known_markers) {
  # Prefer the cluster IDs carried by the names over positional indices.
  cluster_ids <- names(annotations)
  if (is.null(cluster_ids)) {
    cluster_ids <- seq_along(annotations)
  }

  validation_results <- vector("list", length(annotations))

  # seq_along() makes the loop a no-op for empty input (1:0 would not be).
  for (i in seq_along(annotations)) {
    cluster_id <- cluster_ids[[i]]
    predicted_type <- annotations[[i]]

    # Top 20 markers for this cluster, strongest fold change first
    cluster_markers <- marker_data %>%
      filter(cluster == cluster_id) %>%
      arrange(desc(avg_log2FC)) %>%
      pull(gene) %>%
      head(20)

    if (predicted_type %in% names(known_markers)) {
      expected_markers <- known_markers[[predicted_type]]
      overlap <- intersect(cluster_markers, expected_markers)

      validation_results[[i]] <- list(
        cluster = cluster_id,
        predicted_type = predicted_type,
        overlap_count = length(overlap),
        overlap_genes = paste(overlap, collapse = ", "),
        confidence = length(overlap) / length(expected_markers)
      )
    } else {
      # No reference markers for this cell type: nothing to compare against
      validation_results[[i]] <- list(
        cluster = cluster_id,
        predicted_type = predicted_type,
        overlap_count = 0,
        overlap_genes = "",
        confidence = 0
      )
    }
  }

  validation_results
}

# This is a conceptual example of how you might validate annotations
# validation_results <- validate_annotations(annotations, marker_data, known_markers)
This example demonstrates a complete workflow for analyzing a PBMC dataset:
library(Seurat)
library(mLLMCelltype)
library(ggplot2)
library(dplyr)

# Small example dataset; replace with your own data in a real workflow
data("pbmc_small")

# Positive marker genes for every cluster
pbmc_markers <- FindAllMarkers(
  pbmc_small,
  only.pos = TRUE,
  min.pct = 0.25,
  logfc.threshold = 0.25
)

# Provider API keys, read from environment variables
api_keys <- list(
  anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
  openai = Sys.getenv("OPENAI_API_KEY"),
  gemini = Sys.getenv("GEMINI_API_KEY")
)

# Consensus annotation across three models
consensus_results <- interactive_consensus_annotation(
  input = pbmc_markers,
  tissue_name = "human PBMC",
  api_keys = api_keys,
  models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
  consensus_check_model = "claude-sonnet-4-6",
  controversy_threshold = 0.7,
  entropy_threshold = 1.0
)

# Map each cell's cluster identity to its final consensus label
final_labels <- consensus_results$final_annotations
pbmc_small$cell_type <- plyr::mapvalues(
  x = as.character(Idents(pbmc_small)),
  from = names(final_labels),
  to = final_labels
)

# Visualize results
# In a real workflow, you would create a UMAP or t-SNE plot
# DimPlot(pbmc_small, group.by = "cell_type", label = TRUE) +
#   ggtitle("PBMC Cell Types")
When working with datasets containing rare cell populations, you can adjust parameters to improve detection:
# Three strategies for datasets with rare cell populations.
# NOTE: `api_keys` must already be defined (see the consensus example).

# Strategy 1: pass more marker genes per cluster to the model
rare_cell_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human bone marrow",
  top_gene_count = 20, # Use more genes for rare cell types
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

# Strategy 2: consensus with lower thresholds so more clusters are discussed
rare_cell_consensus <- interactive_consensus_annotation(
  input = marker_data, # Your marker gene data
  tissue_name = "human bone marrow",
  api_keys = api_keys,
  models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
  consensus_check_model = "claude-sonnet-4-6",
  controversy_threshold = 0.4, # Lower threshold to discuss more clusters
  entropy_threshold = 0.8 # Lower entropy threshold
)

# Strategy 3: name the expected rare populations in the tissue context
specific_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  tissue_name = "human bone marrow with expected rare plasma cells and basophils"
)
mLLMCelltype can be used to compare cell types across different species:
# Cross-species comparison workflow (conceptual)

# 1. Annotate each species separately
# (assumes marker data is available for both species)
human_annotations <- annotate_cell_types(
  input = human_marker_data, # Your human marker data
  tissue_name = "human brain cortex",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

mouse_annotations <- annotate_cell_types(
  input = mouse_marker_data, # Your mouse marker data
  tissue_name = "mouse brain cortex",
  model = "claude-sonnet-4-6",
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)

# 2. Compare annotations
# In a real workflow you would additionally:
# - Map annotations to Seurat objects
# - Calculate proportions
# - Create comparison visualizations
# - Identify conserved and species-specific cell types

# Split the distinct labels of two annotation sets into shared and
# species-specific groups (labels are compared as exact values).
#
# Returns a list with `common_types`, `human_specific`, `mouse_specific`.
compare_species_annotations <- function(human_annotations, mouse_annotations) {
  human_labels <- unique(human_annotations)
  mouse_labels <- unique(mouse_annotations)

  list(
    common_types = intersect(human_labels, mouse_labels),
    human_specific = setdiff(human_labels, mouse_labels),
    mouse_specific = setdiff(mouse_labels, human_labels)
  )
}

# This is a conceptual example
# comparison <- compare_species_annotations(human_annotations, mouse_annotations)
When using mLLMCelltype, it's important to consider the costs associated with API calls to different LLM providers:
# Cost-aware model selection: pick the model to match your needs and budget.

# Initial exploration or smaller datasets: a more affordable model
affordable_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  model = "claude-haiku-4-5-20251001" # More affordable model
)

# Final analysis or challenging datasets: a larger model
premium_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  api_key = Sys.getenv("ANTHROPIC_API_KEY"),
  model = "claude-sonnet-4-6" # Larger model
)

# Free models are available through OpenRouter (uses its own API key)
openrouter_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  api_key = Sys.getenv("OPENROUTER_API_KEY"),
  model = "meta-llama/llama-3.3-70b-instruct:free" # Free model via OpenRouter
)
To optimize runtime when working with large datasets:
# 1. Use caching with interactive_consensus_annotation
# NOTE: `api_keys` must already be defined (see the consensus example).
consensus_with_cache <- interactive_consensus_annotation(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  models = c("claude-sonnet-4-6", "gpt-5.5"),
  api_keys = api_keys,
  use_cache = TRUE, # Enable caching
  cache_dir = NULL # Uses default system cache directory
)

# 2. Process clusters in batches
# This is a conceptual example - implementation would depend on your workflow
#
# Annotate clusters `batch_size` at a time and concatenate the results.
#
# Args:
#   marker_data: data frame of marker genes with a `cluster` column.
#   batch_size:  number of clusters sent per annotate_cell_types() call
#                (a single number >= 1).
#
# Returns:
#   A list holding the concatenated per-batch results, in batch order.
#   An empty list when `marker_data` contains no clusters.
process_in_batches <- function(marker_data, batch_size = 5) {
  stopifnot(
    is.numeric(batch_size),
    length(batch_size) == 1L,
    batch_size >= 1
  )

  clusters <- unique(marker_data$cluster)
  n_clusters <- length(clusters)

  # seq(1, 0, by = ...) would throw, so handle "no clusters" explicitly.
  if (n_clusters == 0L) {
    return(list())
  }

  batch_starts <- seq(1, n_clusters, by = batch_size)

  # One result per batch; collected first so nothing is grown inside a loop.
  batch_outputs <- lapply(batch_starts, function(first) {
    last <- min(first + batch_size - 1, n_clusters)
    batch_clusters <- clusters[first:last]

    # Marker rows belonging to the clusters of the current batch
    batch_data <- marker_data %>%
      filter(cluster %in% batch_clusters)

    annotate_cell_types(
      input = batch_data,
      tissue_name = "human PBMC",
      model = "claude-sonnet-4-6",
      api_key = Sys.getenv("ANTHROPIC_API_KEY")
    )
  })

  # Equivalent to c(list(), batch_1, batch_2, ...): the leading list() keeps
  # the return value a list, as before.
  do.call(c, c(list(list()), batch_outputs))
}

# 3. Use faster models for initial exploration
fast_annotation <- annotate_cell_types(
  input = marker_data, # Your marker gene data
  tissue_name = "human PBMC",
  model = "claude-haiku-4-5-20251001", # Faster model
  api_key = Sys.getenv("ANTHROPIC_API_KEY")
)
For advanced users, mLLMCelltype allows you to register custom providers and models:
# Processing function for a custom provider.
# It must accept `prompt`, `model` and `api_key`. This simplified version
# prints two status lines and returns a fixed answer instead of calling an API.
custom_process_fn <- function(prompt, model, api_key) {
  cat("Processing prompt with custom provider\n")
  cat("Model:", model, "\n")

  # A real implementation would call the provider's API here, for example:
  # response <- httr::POST(
  #   url = "https://api.custom-provider.com/v1/chat/completions",
  #   body = list(prompt = prompt, model = model),
  #   httr::add_headers(Authorization = paste("Bearer", api_key)),
  #   encode = "json"
  # )
  # result <- httr::content(response)$choices[[1]]$text

  # Fixed response used by this example
  "T cells"
}

# Register the provider under the name "custom_provider"
register_custom_provider(
  provider_name = "custom_provider",
  process_fn = custom_process_fn,
  description = "My custom LLM provider"
)

# Register a model that belongs to that provider
register_custom_model(
  model_name = "custom-model",
  provider_name = "custom_provider",
  model_config = list(
    temperature = 0.7,
    max_tokens = 2000
  )
)

# Use the custom model
# custom_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "custom-model",
#   api_key = "your-custom-api-key"
# )
mLLMCelltype provides a comprehensive unified logging system with structured output, performance monitoring, and multi-level logging:
# Recommended: configure the global logger once
configure_logger(
  level = "INFO",
  console_output = TRUE,
  json_format = TRUE
)

# Informational message with structured context
log_info(
  "Starting analysis of cluster 0",
  list(
    cluster_id = "0",
    tissue_name = "human PBMC",
    marker_genes = c("CD3D", "CD3E", "CD2", "IL7R", "LTB")
  )
)

# Record an API call together with its timing
log_info(
  "API call completed",
  list(
    provider = "anthropic",
    model = "claude-sonnet-4-6",
    duration_seconds = 2.34,
    success = TRUE
  )
)

# Warnings and errors use the same (message, context) shape
log_warn(
  "Model response had unusual format",
  list(model = "gpt-5.5", response_length = 50)
)

log_error(
  "API call failed",
  list(provider = "openai", error = "Rate limit exceeded")
)

# Alternative: a dedicated logger instance with its own settings
custom_logger <- UnifiedLogger$new(
  base_dir = "custom_logs",
  level = "DEBUG",
  console_output = TRUE,
  json_format = TRUE
)

custom_logger$info(
  "Custom log message",
  list(analysis_step = "preprocessing")
)
custom_logger$debug(
  "Detailed debugging info",
  list(variable_state = "initialized")
)

# Performance summary from the global logger
performance <- get_logger()$get_performance_summary()
print(performance)
The CacheManager class helps optimize performance by caching consensus results:
# Cache manager created with cache_dir = NULL
cache_manager <- CacheManager$new(cache_dir = NULL)

# Key derived from the input, the model set and the cluster ID
cache_key <- cache_manager$generate_key(
  input = marker_data,
  models = c("claude-sonnet-4-6", "gpt-5.5"),
  cluster_id = "0"
)

# Reuse cached results when the key is already present
is_cached <- cache_manager$has_cache(cache_key)
if (is_cached) {
  cached_results <- cache_manager$load_from_cache(cache_key)
} else {
  # Otherwise compute the results and store them:
  # results <- process_cluster(...)
  # cache_manager$save_to_cache(cache_key, results)
}

# Cache statistics
cache_stats <- cache_manager$get_cache_stats()

# Clear cache (with confirmation)
# cache_manager$clear_cache(confirm = TRUE)
mLLMCelltype provides convenient functions for managing cache directories:
# Show the current cache location
mllmcelltype_cache_dir()

# Switch to the "local" cache
mllmcelltype_cache_dir("local")

# Remove cached entries
mllmcelltype_clear_cache()
Now that you've explored the advanced features of mLLMCelltype, you can apply them to your own datasets.
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.