## ----setup, include=FALSE-----------------------------------------------------
# Chunk defaults for the document: show code, hide messages and warnings, and
# do not evaluate chunks. Every example chunk below is kept commented out, so
# sourcing this file performs no API calls.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  eval = FALSE
)

## -----------------------------------------------------------------------------
# library(mLLMCelltype)
# library(Seurat)
# library(dplyr)
#
# # Step 1: Perform initial high-level annotation
# high_level_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
#
# # Step 2: Add high-level annotations to Seurat object
# seurat_obj$high_level_celltype <- plyr::mapvalues(
#   x = as.character(Idents(seurat_obj)),
#   from = names(high_level_results),
#   to = high_level_results
# )
#
# # Step 3: Subset T cells for further annotation
# t_cells <- subset(seurat_obj, high_level_celltype == "T cells")
#
# # Step 4: Find markers within T cells
# t_cell_markers <- FindAllMarkers(
#   t_cells,
#   only.pos = TRUE,
#   min.pct = 0.25,
#   logfc.threshold = 0.25
# )
#
# # Step 5: Perform T cell subtype annotation
# t_cell_subtypes <- annotate_cell_types(
#   input = t_cell_markers,
#   tissue_name = "human PBMC T cells",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
#
# # Step 6: Add T cell subtypes back to original object
# # Non-T cells keep their high-level label; T cells get "T cells: <subtype>".
# t_cell_barcodes <- WhichCells(t_cells)
# seurat_obj$detailed_celltype <- seurat_obj$high_level_celltype
# seurat_obj$detailed_celltype[t_cell_barcodes] <- plyr::mapvalues(
#   x = as.character(Idents(t_cells)),
#   from = names(t_cell_subtypes),
#   to = paste0("T cells: ", t_cell_subtypes)
# )

## -----------------------------------------------------------------------------
# # Check parent-child consistency between two annotation levels.
# #
# # Detailed labels are expected to look like "<parent>: <subtype>"; a label
# # without ": " is compared to the high-level label unchanged.
# #
# # high_level:     character vector of high-level labels.
# # detailed_level: character vector of detailed labels, parallel to high_level.
# # Returns a data.frame with columns high_level, detailed_level, consistent.
# validate_hierarchy <- function(high_level, detailed_level) {
#   # Extract parent type from detailed annotation (before the colon).
#   # vapply() always yields a character vector, including for empty input.
#   parent_from_detailed <- vapply(
#     strsplit(detailed_level, ": ", fixed = TRUE),
#     function(x) x[1],
#     character(1)
#   )
#
#   # Check if parent matches high-level annotation
#   consistent <- parent_from_detailed == high_level
#
#   # Return consistency check results
#   data.frame(
#     high_level = high_level,
#     detailed_level = detailed_level,
#     consistent = consistent
#   )
# }
#
# # Apply validation
# hierarchy_validation <- validate_hierarchy(
#   seurat_obj$high_level_celltype,
#   seurat_obj$detailed_celltype
# )
#
# # Identify inconsistencies
# inconsistencies <- hierarchy_validation[!hierarchy_validation$consistent, ]
# print(inconsistencies)

## -----------------------------------------------------------------------------
# # For noisy data, use fewer top genes
# results_fewer_genes <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 5  # Use fewer genes to focus on strongest signals
# )

## -----------------------------------------------------------------------------
# # Apply stricter filtering to marker genes
# filtered_markers <- marker_data %>%
#   filter(p_val_adj < 0.01, avg_log2FC > 1.0)  # Stricter thresholds
#
# # Annotate with filtered markers
# results_filtered <- annotate_cell_types(
#   input = filtered_markers,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
#
# # Define multiple models to use
# models <- c(
#   "claude-sonnet-4-5-20250929",
#   "gpt-5",
#   "gemini-1.5-pro"
# )
#
# # Create consensus using interactive_consensus_annotation
# consensus_results <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = models,
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )

## -----------------------------------------------------------------------------
# # For data with batch effects, use consensus with lower threshold
# batch_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8  # Lower entropy threshold
# )

## -----------------------------------------------------------------------------
# # Include batch information in the tissue context
# batch_aware_results <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain with technical batch effects",  # Include batch context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Basic annotation without specific tissue context
# basic_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human sample",  # Generic context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# # Annotation with specific tissue context
# specific_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human fetal liver at 20 weeks gestation",  # Detailed context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Create a custom annotation prompt
# custom_prompt <- create_annotation_prompt(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   top_gene_count = 10
# )
#
# # Modify the prompt to include additional context
# modified_prompt <- paste0(
#   custom_prompt$prompt,
#   "\n\nAdditional context: This sample is from a patient with rheumatoid arthritis. ",
#   "Previous studies have identified activated T cells, B cells, and CXCR4-high monocytes in this condition."
# )
#
# # Use the modified prompt directly
# custom_results <- get_model_response(
#   prompt = modified_prompt,
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# library(Seurat)
# library(dplyr)
#
# # Example: Using CellMarker database information to validate annotations
# # This is a conceptual example - implementation would depend on your specific needs
#
# # 1. Get annotations with mLLMCelltype
# annotations <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# # 2. Compare with known marker genes (conceptual)
# # In a real workflow, you would query a database or use a reference dataset
# known_markers <- list(
#   "T cells" = c("CD3D", "CD3E", "CD3G"),
#   "B cells" = c("CD19", "MS4A1", "CD79A"),
#   "Monocytes" = c("CD14", "LYZ", "CSF1R")
# )
#
# # 3. Validate annotations against known markers
# # This is a simplified example of how you might validate annotations
# #
# # annotations:   predicted cell types, one per cluster. When the object is
# #                named, the names are used as cluster IDs (the other examples
# #                map results via names(...)); otherwise positions 1..n are used.
# # marker_data:   data frame with columns cluster, gene and avg_log2FC.
# # known_markers: named list of marker gene vectors, keyed by cell type.
# # Returns a list with one entry per annotation holding cluster,
# # predicted_type, overlap_count, overlap_genes and confidence (fraction of
# # the expected markers found among the cluster's top 20 genes).
# validate_annotations <- function(annotations, marker_data, known_markers) {
#   # The loop index is not a reliable cluster ID (Seurat clusters are usually
#   # numbered from 0), so prefer the IDs carried in names(annotations).
#   cluster_ids <- names(annotations)
#   if (is.null(cluster_ids)) {
#     cluster_ids <- seq_along(annotations)
#   }
#
#   # seq_along() makes empty input yield an empty list instead of iterating
#   # over c(1, 0).
#   lapply(seq_along(annotations), function(i) {
#     cluster_id <- cluster_ids[[i]]
#     # [[ ]] extracts the label whether annotations is a list or a vector
#     predicted_type <- as.character(annotations[[i]])
#
#     # Get markers for this cluster (compare IDs as strings so factor,
#     # numeric and character cluster columns all match)
#     cluster_markers <- marker_data %>%
#       filter(as.character(cluster) == as.character(cluster_id)) %>%
#       arrange(desc(avg_log2FC)) %>%
#       pull(gene) %>%
#       head(20)
#
#     # Check overlap with known markers for this cell type
#     if (predicted_type %in% names(known_markers)) {
#       expected_markers <- known_markers[[predicted_type]]
#       overlap <- intersect(cluster_markers, expected_markers)
#
#       list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = length(overlap),
#         overlap_genes = paste(overlap, collapse = ", "),
#         confidence = length(overlap) / length(expected_markers)
#       )
#     } else {
#       # No reference markers for this label: report zero overlap
#       list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = 0,
#         overlap_genes = "",
#         confidence = 0
#       )
#     }
#   })
# }
#
# # This is a conceptual example of how you might validate annotations
# # validation_results <- validate_annotations(annotations, marker_data, known_markers)

## -----------------------------------------------------------------------------
# library(Seurat)
# library(mLLMCelltype)
# library(ggplot2)
# library(dplyr)
#
# # Load example PBMC data
# # In a real workflow, you would use your own data
# data("pbmc_small")  # Example dataset from Seurat
#
# # Find marker genes
# pbmc_markers <- FindAllMarkers(
#   pbmc_small,
#   only.pos = TRUE,
#   min.pct = 0.25,
#   logfc.threshold = 0.25
# )
#
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
#
# # Use consensus annotation
# consensus_results <- interactive_consensus_annotation(
#   input = pbmc_markers,
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )
#
# # Add results to Seurat object
# pbmc_small$cell_type <- plyr::mapvalues(
#   x = as.character(Idents(pbmc_small)),
#   from = names(consensus_results$final_annotations),
#   to = consensus_results$final_annotations
# )
#
# # Visualize results
# # In a real workflow, you would create a UMAP or t-SNE plot
# # DimPlot(pbmc_small, group.by = "cell_type", label = TRUE) +
# #   ggtitle("PBMC Cell Types")

## -----------------------------------------------------------------------------
# # For rare cell types, use these strategies:
#
# # 1. Increase the number of marker genes considered
# rare_cell_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 20  # Use more genes for rare cell types
# )
#
# # 2. Use consensus with lower thresholds to discuss more clusters
# rare_cell_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8,  # Lower entropy threshold
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )
#
# # 3. Provide more specific tissue context
# specific_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow with expected rare plasma cells and basophils",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Example workflow for cross-species comparison
#
# # 1. Annotate human and mouse datasets separately
# # (Assuming you have marker data for both species)
# human_annotations <- annotate_cell_types(
#   input = human_marker_data,  # Your human marker data
#   tissue_name = "human brain cortex",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# mouse_annotations <- annotate_cell_types(
#   input = mouse_marker_data,  # Your mouse marker data
#   tissue_name = "mouse brain cortex",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# # 2. Compare annotations
# # This is a conceptual example - in a real workflow, you would:
# # - Map annotations to Seurat objects
# # - Calculate proportions
# # - Create comparison visualizations
# # - Identify conserved and species-specific cell types
#
# # Example comparison function (conceptual)
# # Compares the sets of distinct labels found in each species and returns a
# # list with common_types, human_specific and mouse_specific. Labels are
# # matched exactly as given.
# compare_species_annotations <- function(human_annotations, mouse_annotations) {
#   # Get unique cell types from both species
#   human_types <- unique(human_annotations)
#   mouse_types <- unique(mouse_annotations)
#
#   # Find common cell types
#   common_types <- intersect(human_types, mouse_types)
#
#   # Find species-specific cell types
#   human_specific <- setdiff(human_types, mouse_types)
#   mouse_specific <- setdiff(mouse_types, human_types)
#
#   # Return comparison results
#   list(
#     common_types = common_types,
#     human_specific = human_specific,
#     mouse_specific = mouse_specific
#   )
# }
#
# # This is a conceptual example
# # comparison <- compare_species_annotations(human_annotations, mouse_annotations)

## -----------------------------------------------------------------------------
# # Example of cost-efficient model selection
# # Choose models based on your specific needs and budget
#
# # For initial exploration or smaller datasets
# # Use more affordable models
# affordable_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-20250514",  # More affordable model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# # For final analysis or challenging datasets
# # Use larger models
# premium_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",  # Larger model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
#
# # Use OpenRouter for access to free models
# openrouter_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "meta-llama/llama-3.3-70b-instruct:free",  # Free model via OpenRouter
#   api_key = Sys.getenv("OPENROUTER_API_KEY")
# )

## -----------------------------------------------------------------------------
# # 1. Use caching with interactive_consensus_annotation
# consensus_with_cache <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5"),
#   api_keys = api_keys,
#   use_cache = TRUE,  # Enable caching
#   cache_dir = NULL  # Uses default system cache directory
# )
#
# # 2. Process clusters in batches
# # This is a conceptual example - implementation would depend on your workflow
# #
# # marker_data: data frame with a cluster column.
# # batch_size:  number of clusters sent per annotate_cell_types() call.
# # tissue_name, model: forwarded to annotate_cell_types(); the defaults
# #                     reproduce the values this example used previously.
# # Returns the per-batch results concatenated into one list, in batch order.
# process_in_batches <- function(marker_data, batch_size = 5,
#                                tissue_name = "human PBMC",
#                                model = "claude-sonnet-4-5-20250929") {
#   # Get unique clusters
#   clusters <- unique(marker_data$cluster)
#
#   # seq(1, 0, by = batch_size) would error, so handle "no clusters" up front
#   if (length(clusters) == 0) {
#     return(list())
#   }
#
#   # Process in batches
#   batch_starts <- seq(1, length(clusters), by = batch_size)
#   batch_results <- lapply(batch_starts, function(start) {
#     # Get current batch of clusters
#     last <- min(start + batch_size - 1, length(clusters))
#     batch_clusters <- clusters[start:last]
#
#     # Filter marker data for current batch
#     batch_data <- marker_data %>% filter(cluster %in% batch_clusters)
#
#     # Process batch
#     annotate_cell_types(
#       input = batch_data,
#       tissue_name = tissue_name,
#       model = model,
#       api_key = Sys.getenv("ANTHROPIC_API_KEY")
#     )
#   })
#
#   # Combine once at the end; starting from an empty list gives the same
#   # result as appending each batch to an initially empty list.
#   do.call(c, c(list(list()), batch_results))
# }
#
# # 3. Use faster models for initial exploration
# fast_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-20250514",  # Faster model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Define a custom processing function
# # This function must accept prompt, model, and api_key parameters
# custom_process_fn <- function(prompt, model, api_key) {
#   # Custom implementation to process prompts and get responses
#   # This is a simplified example
#   cat("Processing prompt with custom provider\n")
#   cat("Model:", model, "\n")
#
#   # In a real implementation, you would make API calls here
#   # For example:
#   # response <- httr::POST(
#   #   url = "https://api.custom-provider.com/v1/chat/completions",
#   #   body = list(prompt = prompt, model = model),
#   #   httr::add_headers(Authorization = paste("Bearer", api_key)),
#   #   encode = "json"
#   # )
#   # result <- httr::content(response)$choices[[1]]$text
#
#   # For this example, just return a fixed response
#   "T cells"
# }
#
# # Register the custom provider
# register_custom_provider(
#   provider_name = "custom_provider",
#   process_fn = custom_process_fn,
#   description = "My custom LLM provider"
# )
#
# # Register a custom model
# register_custom_model(
#   model_name = "custom-model",
#   provider_name = "custom_provider",
#   model_config = list(
#     temperature = 0.7,
#     max_tokens = 2000
#   )
# )
#
# # Use the custom model
# # custom_results <- annotate_cell_types(
# #   input = marker_data,
# #   tissue_name = "human PBMC",
# #   model = "custom-model",
# #   api_key = "your-custom-api-key"
# # )

## -----------------------------------------------------------------------------
# # Configure the global logger (recommended approach)
# configure_logger(level = "INFO", console_output = TRUE, json_format = TRUE)
#
# # Use simple logging functions
# log_info("Starting analysis of cluster 0", list(
#   cluster_id = "0",
#   tissue_name = "human PBMC",
#   marker_genes = c("CD3D", "CD3E", "CD2", "IL7R", "LTB")
# ))
#
# # Log API calls with performance tracking
# # (model id matches the one used in the other examples)
# log_info("API call completed", list(
#   provider = "anthropic",
#   model = "claude-sonnet-4-5-20250929",
#   duration_seconds = 2.34,
#   success = TRUE
# ))
#
# # Log warnings and errors
# log_warn("Model response had unusual format", list(
#   model = "gpt-5",
#   response_length = 50
# ))
#
# log_error("API call failed", list(
#   provider = "openai",
#   error = "Rate limit exceeded"
# ))
#
# # Alternatively, create a custom logger instance
# custom_logger <- UnifiedLogger$new(
#   base_dir = "custom_logs",
#   level = "DEBUG",
#   console_output = TRUE,
#   json_format = TRUE
# )
#
# # Use the custom logger
# custom_logger$info("Custom log message", list(analysis_step = "preprocessing"))
# custom_logger$debug("Detailed debugging info", list(variable_state = "initialized"))
#
# # Get performance summary
# performance <- get_logger()$get_performance_summary()
# print(performance)

## -----------------------------------------------------------------------------
# # Create a cache manager
# cache_manager <- CacheManager$new(cache_dir = NULL)
#
# # Generate a cache key
# cache_key <- cache_manager$generate_key(
#   input = marker_data,
#   models = c("claude-sonnet-4-5-20250929", "gpt-5"),
#   cluster_id = "0"
# )
#
# # Check if results exist in cache
# if (cache_manager$has_cache(cache_key)) {
#   # Load from cache
#   cached_results <- cache_manager$load_from_cache(cache_key)
# } else {
#   # Process and save to cache
#   # results <- process_cluster(...)
#   # cache_manager$save_to_cache(cache_key, results)
# }
#
# # Get cache statistics
# cache_stats <- cache_manager$get_cache_stats()
#
# # Clear cache (with confirmation)
# # cache_manager$clear_cache(confirm = TRUE)

## -----------------------------------------------------------------------------
# # Check cache location
# mllmcelltype_cache_dir()
#
# # Use local cache
# mllmcelltype_cache_dir("local")
#
# # Clear cache
# mllmcelltype_clear_cache()