--- title: "Advanced Features & Case Studies" author: "Chen Yang" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Advanced Features & Case Studies} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE ) ``` This article explores advanced features of mLLMCelltype and presents practical examples demonstrating its application in various research contexts. ## Hierarchical Cell Type Annotation ### Understanding Hierarchical Annotation Cell types often exist in hierarchical relationships. For example, T cells can be further classified into CD4+ T cells, CD8+ T cells, regulatory T cells, etc. mLLMCelltype can be used in a multi-step workflow to capture these hierarchical relationships. ### Implementing Hierarchical Annotation Here's a practical approach to perform hierarchical annotation: ```{r} library(mLLMCelltype) library(Seurat) library(dplyr) # Step 1: Perform initial high-level annotation high_level_results <- annotate_cell_types( input = marker_data, tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY"), top_gene_count = 10 ) # Step 2: Add high-level annotations to Seurat object seurat_obj$high_level_celltype <- plyr::mapvalues( x = as.character(Idents(seurat_obj)), from = names(high_level_results), to = high_level_results ) # Step 3: Subset T cells for further annotation t_cells <- subset(seurat_obj, high_level_celltype == "T cells") # Step 4: Find markers within T cells t_cell_markers <- FindAllMarkers(t_cells, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) # Step 5: Perform T cell subtype annotation t_cell_subtypes <- annotate_cell_types( input = t_cell_markers, tissue_name = "human PBMC T cells", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY"), top_gene_count = 10 ) # Step 6: Add T cell subtypes back to original object t_cell_barcodes <- WhichCells(t_cells) seurat_obj$detailed_celltype <- seurat_obj$high_level_celltype seurat_obj$detailed_celltype[t_cell_barcodes] <- plyr::mapvalues( x = as.character(Idents(t_cells)), from = names(t_cell_subtypes), to = paste0("T cells: ", t_cell_subtypes) ) ``` ### Validating Hierarchical Annotations After creating hierarchical annotations, it's important to validate the consistency between levels: ```{r} # Create a simple function to check parent-child consistency validate_hierarchy <- function(high_level, detailed_level) { # Extract parent type from detailed annotation (before the colon) parent_from_detailed <- sapply(strsplit(detailed_level, ": "), function(x) x[1]) # Check if parent matches high-level annotation consistent <- parent_from_detailed == high_level # Return consistency check results data.frame( high_level = high_level, detailed_level = detailed_level, consistent = consistent ) } # Apply validation hierarchy_validation <- validate_hierarchy( seurat_obj$high_level_celltype, seurat_obj$detailed_celltype ) # Identify inconsistencies inconsistencies <- hierarchy_validation[!hierarchy_validation$consistent, ] print(inconsistencies) ``` ## Handling Noisy Input Data ### Strategies for Noisy Marker Genes Real-world scRNA-seq data often contains noise. Here are practical strategies for handling noisy input: #### 1. Adjust the top_gene_count parameter For noisy datasets, using fewer top genes can help focus on the strongest signals: ```{r} # For noisy data, use fewer top genes results_fewer_genes <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY"), top_gene_count = 5 # Use fewer genes to focus on strongest signals ) ``` #### 2. Apply stricter filtering for marker genes Pre-filtering marker genes with stricter thresholds can improve annotation quality: ```{r} # Apply stricter filtering to marker genes filtered_markers <- marker_data %>% filter(p_val_adj < 0.01, avg_log2FC > 1.0) # Stricter thresholds # Annotate with filtered markers results_filtered <- annotate_cell_types( input = filtered_markers, tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` #### 3. Use multi-model consensus The consensus approach can help overcome noise by combining predictions from multiple models: ```{r} # Set up API keys api_keys <- list( anthropic = Sys.getenv("ANTHROPIC_API_KEY"), openai = Sys.getenv("OPENAI_API_KEY"), gemini = Sys.getenv("GEMINI_API_KEY") ) # Define multiple models to use models <- c( "claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro" ) # Create consensus using interactive_consensus_annotation consensus_results <- interactive_consensus_annotation( input = marker_data, # Your marker gene data tissue_name = "human PBMC", models = models, api_keys = api_keys, controversy_threshold = 0.7, entropy_threshold = 1.0, consensus_check_model = "claude-sonnet-4-5-20250929" ) ``` ### Handling Data with Batch Effects When working with data affected by batch effects, you can: #### 1. Use the consensus approach with a lower controversy threshold ```{r} # For data with batch effects, use consensus with lower threshold batch_consensus <- interactive_consensus_annotation( input = marker_data, # Your marker gene data with batch effects tissue_name = "mouse brain", models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"), api_keys = api_keys, controversy_threshold = 0.4, # Lower threshold to discuss more clusters entropy_threshold = 0.8 # Lower entropy threshold ) ``` #### 2. Include batch information in the tissue context ```{r} # Include batch information in the tissue context batch_aware_results <- annotate_cell_types( input = marker_data, # Your marker gene data with batch effects tissue_name = "mouse brain with technical batch effects", # Include batch context model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` ## Incorporating Domain Knowledge ### Using Tissue Context One of the key features of mLLMCelltype is the ability to incorporate domain knowledge through the `tissue_name` parameter. This provides important context to the LLM: ```{r} # Basic annotation without specific tissue context basic_results <- annotate_cell_types( input = marker_data, tissue_name = "human sample", # Generic context model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # Annotation with specific tissue context specific_results <- annotate_cell_types( input = marker_data, tissue_name = "human fetal liver at 20 weeks gestation", # Detailed context model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` ### Creating Custom Prompts For advanced use cases, you can create and modify the annotation prompt directly: ```{r} # Create a custom annotation prompt custom_prompt <- create_annotation_prompt( input = marker_data, tissue_name = "human PBMC", top_gene_count = 10 ) # Modify the prompt to include additional context modified_prompt <- paste0( custom_prompt$prompt, "\n\nAdditional context: This sample is from a patient with rheumatoid arthritis. ", "Previous studies have identified activated T cells, B cells, and CXCR4-high monocytes in this condition." ) # Use the modified prompt directly custom_results <- get_model_response( prompt = modified_prompt, model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` ### Combining with External Resources You can enhance your annotation workflow by combining mLLMCelltype with other R packages and resources: ```{r} library(Seurat) library(dplyr) # Example: Using CellMarker database information to validate annotations # This is a conceptual example - implementation would depend on your specific needs # 1. Get annotations with mLLMCelltype annotations <- annotate_cell_types( input = marker_data, tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # 2. Compare with known marker genes (conceptual) # In a real workflow, you would query a database or use a reference dataset known_markers <- list( "T cells" = c("CD3D", "CD3E", "CD3G"), "B cells" = c("CD19", "MS4A1", "CD79A"), "Monocytes" = c("CD14", "LYZ", "CSF1R") ) # 3. Validate annotations against known markers # This is a simplified example of how you might validate annotations validate_annotations <- function(annotations, marker_data, known_markers) { validation_results <- list() for (i in 1:length(annotations)) { cluster_id <- i predicted_type <- annotations[i] # Get markers for this cluster cluster_markers <- marker_data %>% filter(cluster == cluster_id) %>% arrange(desc(avg_log2FC)) %>% pull(gene) %>% head(20) # Check overlap with known markers for this cell type if (predicted_type %in% names(known_markers)) { expected_markers <- known_markers[[predicted_type]] overlap <- intersect(cluster_markers, expected_markers) validation_results[[i]] <- list( cluster = cluster_id, predicted_type = predicted_type, overlap_count = length(overlap), overlap_genes = paste(overlap, collapse = ", "), confidence = length(overlap) / length(expected_markers) ) } else { validation_results[[i]] <- list( cluster = cluster_id, predicted_type = predicted_type, overlap_count = 0, overlap_genes = "", confidence = 0 ) } } return(validation_results) } # This is a conceptual example of how you might validate annotations # validation_results <- validate_annotations(annotations, marker_data, known_markers) ``` ## Practical Case Studies ### Case Study 1: PBMC Dataset Analysis This example demonstrates a complete workflow for analyzing a PBMC dataset: ```{r} library(Seurat) library(mLLMCelltype) library(ggplot2) library(dplyr) # Load example PBMC data # In a real workflow, you would use your own data data("pbmc_small") # Example dataset from Seurat # Find marker genes pbmc_markers <- FindAllMarkers(pbmc_small, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) # Set up API keys api_keys <- list( anthropic = Sys.getenv("ANTHROPIC_API_KEY"), openai = Sys.getenv("OPENAI_API_KEY"), gemini = Sys.getenv("GEMINI_API_KEY") ) # Use consensus annotation consensus_results <- interactive_consensus_annotation( input = pbmc_markers, tissue_name = "human PBMC", models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"), api_keys = api_keys, controversy_threshold = 0.7, entropy_threshold = 1.0, consensus_check_model = "claude-sonnet-4-5-20250929" ) # Add results to Seurat object pbmc_small$cell_type <- plyr::mapvalues( x = as.character(Idents(pbmc_small)), from = names(consensus_results$final_annotations), to = consensus_results$final_annotations ) # Visualize results # In a real workflow, you would create a UMAP or t-SNE plot # DimPlot(pbmc_small, group.by = "cell_type", label = TRUE) + # ggtitle("PBMC Cell Types") ``` ### Case Study 2: Identifying Rare Cell Types When working with datasets containing rare cell populations, you can adjust parameters to improve detection: ```{r} # For rare cell types, use these strategies: # 1. Increase the number of marker genes considered rare_cell_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human bone marrow", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY"), top_gene_count = 20 # Use more genes for rare cell types ) # 2. Use consensus with lower thresholds to discuss more clusters rare_cell_consensus <- interactive_consensus_annotation( input = marker_data, # Your marker gene data tissue_name = "human bone marrow", models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"), api_keys = api_keys, controversy_threshold = 0.4, # Lower threshold to discuss more clusters entropy_threshold = 0.8, # Lower entropy threshold consensus_check_model = "claude-sonnet-4-5-20250929" ) # 3. Provide more specific tissue context specific_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human bone marrow with expected rare plasma cells and basophils", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` ### Case Study 3: Cross-Species Comparison mLLMCelltype can be used to compare cell types across different species: ```{r} # Example workflow for cross-species comparison # 1. Annotate human and mouse datasets separately # (Assuming you have marker data for both species) human_annotations <- annotate_cell_types( input = human_marker_data, # Your human marker data tissue_name = "human brain cortex", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) mouse_annotations <- annotate_cell_types( input = mouse_marker_data, # Your mouse marker data tissue_name = "mouse brain cortex", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # 2. Compare annotations # This is a conceptual example - in a real workflow, you would: # - Map annotations to Seurat objects # - Calculate proportions # - Create comparison visualizations # - Identify conserved and species-specific cell types # Example comparison function (conceptual) compare_species_annotations <- function(human_annotations, mouse_annotations) { # Get unique cell types from both species human_types <- unique(human_annotations) mouse_types <- unique(mouse_annotations) # Find common cell types common_types <- intersect(human_types, mouse_types) # Find species-specific cell types human_specific <- setdiff(human_types, mouse_types) mouse_specific <- setdiff(mouse_types, human_types) # Return comparison results list( common_types = common_types, human_specific = human_specific, mouse_specific = mouse_specific ) } # This is a conceptual example # comparison <- compare_species_annotations(human_annotations, mouse_annotations) ``` ## Performance Considerations ### API Cost Management When using mLLMCelltype, it's important to consider the costs associated with API calls to different LLM providers: ```{r} # Example of cost-efficient model selection # Choose models based on your specific needs and budget # For initial exploration or smaller datasets # Use more affordable models affordable_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human PBMC", model = "claude-haiku-4-20250514", # More affordable model api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # For final analysis or challenging datasets # Use larger models premium_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", # Larger model api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # Use OpenRouter for access to free models openrouter_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human PBMC", model = "meta-llama/llama-3.3-70b-instruct:free", # Free model via OpenRouter api_key = Sys.getenv("OPENROUTER_API_KEY") ) ``` ### Optimizing Runtime To optimize runtime when working with large datasets: ```{r} # 1. Use caching with interactive_consensus_annotation consensus_with_cache <- interactive_consensus_annotation( input = marker_data, # Your marker gene data tissue_name = "human PBMC", models = c("claude-sonnet-4-5-20250929", "gpt-5"), api_keys = api_keys, use_cache = TRUE, # Enable caching cache_dir = NULL # Uses default system cache directory ) # 2. Process clusters in batches # This is a conceptual example - implementation would depend on your workflow process_in_batches <- function(marker_data, batch_size = 5) { # Get unique clusters clusters <- unique(marker_data$cluster) # Process in batches results <- list() for (i in seq(1, length(clusters), by = batch_size)) { # Get current batch of clusters batch_clusters <- clusters[i:min(i + batch_size - 1, length(clusters))] # Filter marker data for current batch batch_data <- marker_data %>% filter(cluster %in% batch_clusters) # Process batch batch_results <- annotate_cell_types( input = batch_data, tissue_name = "human PBMC", model = "claude-sonnet-4-5-20250929", api_key = Sys.getenv("ANTHROPIC_API_KEY") ) # Store results results <- c(results, batch_results) } return(results) } # 3. Use faster models for initial exploration fast_annotation <- annotate_cell_types( input = marker_data, # Your marker gene data tissue_name = "human PBMC", model = "claude-haiku-4-20250514", # Faster model api_key = Sys.getenv("ANTHROPIC_API_KEY") ) ``` ## Advanced Customization ### Custom Processing Functions For advanced users, mLLMCelltype allows you to register custom providers and models: ```{r} # Define a custom processing function # This function must accept prompt, model, and api_key parameters custom_process_fn <- function(prompt, model, api_key) { # Custom implementation to process prompts and get responses # This is a simplified example cat("Processing prompt with custom provider\n") cat("Model:", model, "\n") # In a real implementation, you would make API calls here # For example: # response <- httr::POST( # url = "https://api.custom-provider.com/v1/chat/completions", # body = list(prompt = prompt, model = model), # httr::add_headers(Authorization = paste("Bearer", api_key)), # encode = "json" # ) # result <- httr::content(response)$choices[[1]]$text # For this example, just return a fixed response result <- "T cells" return(result) } # Register the custom provider register_custom_provider( provider_name = "custom_provider", process_fn = custom_process_fn, description = "My custom LLM provider" ) # Register a custom model register_custom_model( model_name = "custom-model", provider_name = "custom_provider", model_config = list( temperature = 0.7, max_tokens = 2000 ) ) # Use the custom model # custom_results <- annotate_cell_types( # input = marker_data, # tissue_name = "human PBMC", # model = "custom-model", # api_key = "your-custom-api-key" # ) ``` ### Using the Unified Logging System mLLMCelltype provides a comprehensive unified logging system with structured output, performance monitoring, and multi-level logging: ```{r} # Configure the global logger (recommended approach) configure_logger(level = "INFO", console_output = TRUE, json_format = TRUE) # Use simple logging functions log_info("Starting analysis of cluster 0", list( cluster_id = "0", tissue_name = "human PBMC", marker_genes = c("CD3D", "CD3E", "CD2", "IL7R", "LTB") )) # Log API calls with performance tracking log_info("API call completed", list( provider = "anthropic", model = "claude-3.5-sonnet", duration_seconds = 2.34, success = TRUE )) # Log warnings and errors log_warn("Model response had unusual format", list( model = "gpt-5", response_length = 50 )) log_error("API call failed", list( provider = "openai", error = "Rate limit exceeded" )) # Alternatively, create a custom logger instance custom_logger <- UnifiedLogger$new( base_dir = "custom_logs", level = "DEBUG", console_output = TRUE, json_format = TRUE ) # Use the custom logger custom_logger$info("Custom log message", list(analysis_step = "preprocessing")) custom_logger$debug("Detailed debugging info", list(variable_state = "initialized")) # Get performance summary performance <- get_logger()$get_performance_summary() print(performance) ``` ### Using the CacheManager The `CacheManager` class helps optimize performance by caching consensus results: ```{r} # Create a cache manager cache_manager <- CacheManager$new(cache_dir = NULL) # Generate a cache key cache_key <- cache_manager$generate_key( input = marker_data, models = c("claude-sonnet-4-5-20250929", "gpt-5"), cluster_id = "0" ) # Check if results exist in cache if (cache_manager$has_cache(cache_key)) { # Load from cache cached_results <- cache_manager$load_from_cache(cache_key) } else { # Process and save to cache # results <- process_cluster(...) # cache_manager$save_to_cache(cache_key, results) } # Get cache statistics cache_stats <- cache_manager$get_cache_stats() # Clear cache (with confirmation) # cache_manager$clear_cache(confirm = TRUE) ``` ### Cache Management mLLMCelltype provides convenient functions for managing cache directories: ```{r} # Check cache location mllmcelltype_cache_dir() # Use local cache mllmcelltype_cache_dir("local") # Clear cache mllmcelltype_clear_cache() ``` ## Next Steps Now that you've explored the advanced features of mLLMCelltype, you can: - [Contribute to the project](https://cafferyang.com/mLLMCelltype/articles/contributing-guide.html): Learn how to contribute to mLLMCelltype - [Review the version history](https://cafferyang.com/mLLMCelltype/news/index.html): Explore the development history of the package - [Return to the introduction](https://cafferyang.com/mLLMCelltype/articles/introduction.html): Review the basic concepts