# advanced-features.R
# In mLLMCelltype: Cell Type Annotation Using Large Language Models

## ----setup, include=FALSE-----------------------------------------------------
# Vignette-wide chunk defaults: show each chunk's source (echo = TRUE) but do
# not evaluate it (eval = FALSE); messages and warnings are suppressed.
do.call(
  knitr::opts_chunk$set,
  list(eval = FALSE, echo = TRUE, warning = FALSE, message = FALSE)
)

## -----------------------------------------------------------------------------
# library(mLLMCelltype)
# library(Seurat)
# library(dplyr)
# 
# # Step 1: Perform initial high-level annotation
# high_level_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
# 
# # Step 2: Add high-level annotations to Seurat object
# seurat_obj$high_level_celltype <- plyr::mapvalues(
#   x = as.character(Idents(seurat_obj)),
#   from = names(high_level_results),
#   to = high_level_results
# )
# 
# # Step 3: Subset T cells for further annotation
# t_cells <- subset(seurat_obj, high_level_celltype == "T cells")
# 
# # Step 4: Find markers within T cells
# t_cell_markers <- FindAllMarkers(t_cells, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
# 
# # Step 5: Perform T cell subtype annotation
# t_cell_subtypes <- annotate_cell_types(
#   input = t_cell_markers,
#   tissue_name = "human PBMC T cells",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
# 
# # Step 6: Add T cell subtypes back to original object
# t_cell_barcodes <- WhichCells(t_cells)
# seurat_obj$detailed_celltype <- seurat_obj$high_level_celltype
# seurat_obj$detailed_celltype[t_cell_barcodes] <- plyr::mapvalues(
#   x = as.character(Idents(t_cells)),
#   from = names(t_cell_subtypes),
#   to = paste0("T cells: ", t_cell_subtypes)
# )

## -----------------------------------------------------------------------------
# # Create a simple function to check parent-child consistency
# validate_hierarchy <- function(high_level, detailed_level) {
#   # Extract parent type from detailed annotation (before the colon)
#   parent_from_detailed <- vapply(
#     strsplit(detailed_level, ": ", fixed = TRUE),
#     function(x) x[1],
#     character(1)
#   )
# 
#   # Check if parent matches high-level annotation
#   consistent <- parent_from_detailed == high_level
# 
#   # Return consistency check results
#   data.frame(
#     high_level = high_level,
#     detailed_level = detailed_level,
#     consistent = consistent
#   )
# }
# 
# # Apply validation
# hierarchy_validation <- validate_hierarchy(
#   seurat_obj$high_level_celltype,
#   seurat_obj$detailed_celltype
# )
# 
# # Identify inconsistencies
# inconsistencies <- hierarchy_validation[!hierarchy_validation$consistent, ]
# print(inconsistencies)

## -----------------------------------------------------------------------------
# # For noisy data, use fewer top genes
# results_fewer_genes <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 5  # Use fewer genes to focus on strongest signals
# )

## -----------------------------------------------------------------------------
# # Apply stricter filtering to marker genes
# filtered_markers <- marker_data %>%
#   filter(p_val_adj < 0.01, avg_log2FC > 1.0)  # Stricter thresholds
# 
# # Annotate with filtered markers
# results_filtered <- annotate_cell_types(
#   input = filtered_markers,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
# 
# # Define multiple models to use
# models <- c(
#   "claude-sonnet-4-6",
#   "gpt-5.5",
#   "gemini-3.1-pro-preview"
# )
# 
# # Create consensus using interactive_consensus_annotation
# consensus_results <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = models,
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-6"
# )

## -----------------------------------------------------------------------------
# # For data with batch effects, use consensus with lower threshold
# batch_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain",
#   models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8  # Lower entropy threshold
# )

## -----------------------------------------------------------------------------
# # Include batch information in the tissue context
# batch_aware_results <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain with technical batch effects",  # Include batch context
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Basic annotation without specific tissue context
# basic_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human sample",  # Generic context
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # Annotation with specific tissue context
# specific_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human fetal liver at 20 weeks gestation",  # Detailed context
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Create a custom annotation prompt
# custom_prompt <- create_annotation_prompt(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   top_gene_count = 10
# )
# 
# # Modify the prompt to include additional context
# modified_prompt <- paste0(
#   custom_prompt$prompt,
#   "\n\nAdditional context: This sample is from a patient with rheumatoid arthritis. ",
#   "Previous studies have identified activated T cells, B cells, and CXCR4-high monocytes in this condition."
# )
# 
# # Use the modified prompt directly
# custom_results <- get_model_response(
#   prompt = modified_prompt,
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# library(Seurat)
# library(dplyr)
# 
# # Example: Using CellMarker database information to validate annotations
# # This is a conceptual example - implementation would depend on your specific needs
# 
# # 1. Get annotations with mLLMCelltype
# annotations <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # 2. Compare with known marker genes (conceptual)
# # In a real workflow, you would query a database or use a reference dataset
# known_markers <- list(
#   "T cells" = c("CD3D", "CD3E", "CD3G"),
#   "B cells" = c("CD19", "MS4A1", "CD79A"),
#   "Monocytes" = c("CD14", "LYZ", "CSF1R")
# )
# 
# # 3. Validate annotations against known markers
# # This is a simplified example of how you might validate annotations
# validate_annotations <- function(annotations, marker_data, known_markers) {
#   validation_results <- list()
# 
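#   # seq_along() is safe when `annotations` is empty (1:0 would iterate twice)
#   for (i in seq_along(annotations)) {
#     # Cluster IDs are the names of the annotation result (Seurat clusters are
#     # typically 0-based), not the loop position
#     cluster_id <- names(annotations)[i]
#     predicted_type <- annotations[[i]]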
# 
#     # Get markers for this cluster
#     cluster_markers <- marker_data %>%
#       filter(cluster == cluster_id) %>%
#       arrange(desc(avg_log2FC)) %>%
#       pull(gene) %>%
#       head(20)
# 
#     # Check overlap with known markers for this cell type
#     if (predicted_type %in% names(known_markers)) {
#       expected_markers <- known_markers[[predicted_type]]
#       overlap <- intersect(cluster_markers, expected_markers)
# 
#       validation_results[[i]] <- list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = length(overlap),
#         overlap_genes = paste(overlap, collapse = ", "),
#         confidence = length(overlap) / length(expected_markers)
#       )
#     } else {
#       validation_results[[i]] <- list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = 0,
#         overlap_genes = "",
#         confidence = 0
#       )
#     }
#   }
# 
#   return(validation_results)
# }
# 
# # This is a conceptual example of how you might validate annotations
# # validation_results <- validate_annotations(annotations, marker_data, known_markers)

## -----------------------------------------------------------------------------
# library(Seurat)
# library(mLLMCelltype)
# library(ggplot2)
# library(dplyr)
# 
# # Load example PBMC data
# # In a real workflow, you would use your own data
# data("pbmc_small")  # Example dataset from Seurat
# 
# # Find marker genes
# pbmc_markers <- FindAllMarkers(pbmc_small,
#                               only.pos = TRUE,
#                               min.pct = 0.25,
#                               logfc.threshold = 0.25)
# 
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
# 
# # Use consensus annotation
# consensus_results <- interactive_consensus_annotation(
#   input = pbmc_markers,
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-6"
# )
# 
# # Add results to Seurat object
# pbmc_small$cell_type <- plyr::mapvalues(
#   x = as.character(Idents(pbmc_small)),
#   from = names(consensus_results$final_annotations),
#   to = consensus_results$final_annotations
# )
# 
# # Visualize results
# # In a real workflow, you would create a UMAP or t-SNE plot
# # DimPlot(pbmc_small, group.by = "cell_type", label = TRUE) +
# #   ggtitle("PBMC Cell Types")

## -----------------------------------------------------------------------------
# # For rare cell types, use these strategies:
# 
# # 1. Increase the number of marker genes considered
# rare_cell_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 20  # Use more genes for rare cell types
# )
# 
# # 2. Use consensus with lower thresholds to discuss more clusters
# rare_cell_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   models = c("claude-sonnet-4-6", "gpt-5.5", "gemini-3.1-pro-preview"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8,  # Lower entropy threshold
#   consensus_check_model = "claude-sonnet-4-6"
# )
# 
# # 3. Provide more specific tissue context
# specific_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow with expected rare plasma cells and basophils",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Example workflow for cross-species comparison
# 
# # 1. Annotate human and mouse datasets separately
# # (Assuming you have marker data for both species)
# human_annotations <- annotate_cell_types(
#   input = human_marker_data,  # Your human marker data
#   tissue_name = "human brain cortex",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# mouse_annotations <- annotate_cell_types(
#   input = mouse_marker_data,  # Your mouse marker data
#   tissue_name = "mouse brain cortex",
#   model = "claude-sonnet-4-6",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # 2. Compare annotations
# # This is a conceptual example - in a real workflow, you would:
# # - Map annotations to Seurat objects
# # - Calculate proportions
# # - Create comparison visualizations
# # - Identify conserved and species-specific cell types
# 
# # Example comparison function (conceptual)
# compare_species_annotations <- function(human_annotations, mouse_annotations) {
#   # Get unique cell types from both species
#   human_types <- unique(human_annotations)
#   mouse_types <- unique(mouse_annotations)
# 
#   # Find common cell types
#   common_types <- intersect(human_types, mouse_types)
# 
#   # Find species-specific cell types
#   human_specific <- setdiff(human_types, mouse_types)
#   mouse_specific <- setdiff(mouse_types, human_types)
# 
#   # Return comparison results
#   list(
#     common_types = common_types,
#     human_specific = human_specific,
#     mouse_specific = mouse_specific
#   )
# }
# 
# # This is a conceptual example
# # comparison <- compare_species_annotations(human_annotations, mouse_annotations)

## -----------------------------------------------------------------------------
# # Example of cost-efficient model selection
# # Choose models based on your specific needs and budget
# 
# # For initial exploration or smaller datasets
# # Use more affordable models
# affordable_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-5-20251001",  # More affordable model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # For final analysis or challenging datasets
# # Use larger models
# premium_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-6",  # Larger model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # Use OpenRouter for access to free models
# openrouter_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "meta-llama/llama-3.3-70b-instruct:free",  # Free model via OpenRouter
#   api_key = Sys.getenv("OPENROUTER_API_KEY")
# )

## -----------------------------------------------------------------------------
# # 1. Use caching with interactive_consensus_annotation
# consensus_with_cache <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-6", "gpt-5.5"),
#   api_keys = api_keys,
#   use_cache = TRUE,  # Enable caching
#   cache_dir = NULL  # Uses default system cache directory
# )
# 
# # 2. Process clusters in batches
# # This is a conceptual example - implementation would depend on your workflow
# process_in_batches <- function(marker_data, batch_size = 5) {
#   # Get unique clusters
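#   clusters <- unique(marker_data$cluster)
#
#   # Nothing to annotate: seq(1, 0, by = batch_size) below would raise an error
#   if (length(clusters) == 0) {
#     return(list())
#   }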
# 
#   # Process in batches
#   results <- list()
#   for (i in seq(1, length(clusters), by = batch_size)) {
#     # Get current batch of clusters
#     batch_clusters <- clusters[i:min(i + batch_size - 1, length(clusters))]
# 
#     # Filter marker data for current batch
#     batch_data <- marker_data %>% filter(cluster %in% batch_clusters)
# 
#     # Process batch
#     batch_results <- annotate_cell_types(
#       input = batch_data,
#       tissue_name = "human PBMC",
#       model = "claude-sonnet-4-6",
#       api_key = Sys.getenv("ANTHROPIC_API_KEY")
#     )
# 
#     # Store results
#     results <- c(results, batch_results)
#   }
# 
#   return(results)
# }
# 
# # 3. Use faster models for initial exploration
# fast_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-5-20251001",  # Faster model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Define a custom processing function
# # This function must accept prompt, model, and api_key parameters
# custom_process_fn <- function(prompt, model, api_key) {
#   # Custom implementation to process prompts and get responses
#   # This is a simplified example
#   cat("Processing prompt with custom provider\n")
#   cat("Model:", model, "\n")
# 
#   # In a real implementation, you would make API calls here
#   # For example:
#   # response <- httr::POST(
#   #   url = "https://api.custom-provider.com/v1/chat/completions",
#   #   body = list(prompt = prompt, model = model),
#   #   httr::add_headers(Authorization = paste("Bearer", api_key)),
#   #   encode = "json"
#   # )
#   # result <- httr::content(response)$choices[[1]]$text
# 
#   # For this example, just return a fixed response
#   result <- "T cells"
#   return(result)
# }
# 
# # Register the custom provider
# register_custom_provider(
#   provider_name = "custom_provider",
#   process_fn = custom_process_fn,
#   description = "My custom LLM provider"
# )
# 
# # Register a custom model
# register_custom_model(
#   model_name = "custom-model",
#   provider_name = "custom_provider",
#   model_config = list(
#     temperature = 0.7,
#     max_tokens = 2000
#   )
# )
# 
# # Use the custom model
# # custom_results <- annotate_cell_types(
# #   input = marker_data,
# #   tissue_name = "human PBMC",
# #   model = "custom-model",
# #   api_key = "your-custom-api-key"
# # )

## -----------------------------------------------------------------------------
# # Configure the global logger (recommended approach)
# configure_logger(level = "INFO", console_output = TRUE, json_format = TRUE)
# 
# # Use simple logging functions
# log_info("Starting analysis of cluster 0", list(
#   cluster_id = "0",
#   tissue_name = "human PBMC",
#   marker_genes = c("CD3D", "CD3E", "CD2", "IL7R", "LTB")
# ))
# 
# # Log API calls with performance tracking
# log_info("API call completed", list(
#   provider = "anthropic",
#   model = "claude-sonnet-4-6",
#   duration_seconds = 2.34,
#   success = TRUE
# ))
# 
# # Log warnings and errors
# log_warn("Model response had unusual format", list(
#   model = "gpt-5.5",
#   response_length = 50
# ))
# 
# log_error("API call failed", list(
#   provider = "openai",
#   error = "Rate limit exceeded"
# ))
# 
# # Alternatively, create a custom logger instance
# custom_logger <- UnifiedLogger$new(
#   base_dir = "custom_logs",
#   level = "DEBUG",
#   console_output = TRUE,
#   json_format = TRUE
# )
# 
# # Use the custom logger
# custom_logger$info("Custom log message", list(analysis_step = "preprocessing"))
# custom_logger$debug("Detailed debugging info", list(variable_state = "initialized"))
# 
# # Get performance summary
# performance <- get_logger()$get_performance_summary()
# print(performance)

## -----------------------------------------------------------------------------
# # Create a cache manager
# cache_manager <- CacheManager$new(cache_dir = NULL)
# 
# # Generate a cache key
# cache_key <- cache_manager$generate_key(
#   input = marker_data,
#   models = c("claude-sonnet-4-6", "gpt-5.5"),
#   cluster_id = "0"
# )
# 
# # Check if results exist in cache
# if (cache_manager$has_cache(cache_key)) {
#   # Load from cache
#   cached_results <- cache_manager$load_from_cache(cache_key)
# } else {
#   # Process and save to cache
#   # results <- process_cluster(...)
#   # cache_manager$save_to_cache(cache_key, results)
# }
# 
# # Get cache statistics
# cache_stats <- cache_manager$get_cache_stats()
# 
# # Clear cache (with confirmation)
# # cache_manager$clear_cache(confirm = TRUE)

## -----------------------------------------------------------------------------
# # Check cache location
# mllmcelltype_cache_dir()
# 
# # Use local cache
# mllmcelltype_cache_dir("local")
# 
# # Clear cache
# mllmcelltype_clear_cache()
# Any scripts or data that you put into this service are public.
# mLLMCelltype documentation built on May 11, 2026, 9:06 a.m.
# rdrr.io home R language documentation Run R code online
# CRAN packages Bioconductor packages R-Forge packages GitHub packages
# Note that we can't provide technical support on individual packages. You should contact the package authors for that.
# mLLMCelltype
# Cell Type Annotation Using Large Language Models

# inst/doc/advanced-features.R
# In mLLMCelltype: Cell Type Annotation Using Large Language Models

# Try the mLLMCelltype package in your browser

# R Package Documentation

# Browse R Packages

# We want your feedback!

# mLLMCelltype Cell Type Annotation Using Large Language Models

# inst/doc/advanced-features.R In mLLMCelltype: Cell Type Annotation Using Large Language Models

# Try the mLLMCelltype package in your browser

# R Package Documentation

# Browse R Packages

# We want your feedback!

# mLLMCelltype
# Cell Type Annotation Using Large Language Models

# inst/doc/advanced-features.R
# In mLLMCelltype: Cell Type Annotation Using Large Language Models