## ----setup, include=FALSE-----------------------------------------------------
# Vignette-wide chunk defaults: echo the code, suppress messages and warnings,
# and leave chunks unevaluated unless a chunk overrides these options.
do.call(
  knitr::opts_chunk$set,
  list(echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE)
)

## -----------------------------------------------------------------------------
# library(mLLMCelltype)
# library(Seurat)
# library(dplyr)
# 
# # Step 1: Perform initial high-level annotation
# high_level_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
# 
# # Step 2: Add high-level annotations to Seurat object
# seurat_obj$high_level_celltype <- plyr::mapvalues(
#   x = as.character(Idents(seurat_obj)),
#   from = names(high_level_results),
#   to = high_level_results
# )
# 
# # Step 3: Subset T cells for further annotation
# t_cells <- subset(seurat_obj, high_level_celltype == "T cells")
# 
# # Step 4: Find markers within T cells
# t_cell_markers <- FindAllMarkers(t_cells, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
# 
# # Step 5: Perform T cell subtype annotation
# t_cell_subtypes <- annotate_cell_types(
#   input = t_cell_markers,
#   tissue_name = "human PBMC T cells",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 10
# )
# 
# # Step 6: Add T cell subtypes back to original object
# t_cell_barcodes <- WhichCells(t_cells)
# seurat_obj$detailed_celltype <- seurat_obj$high_level_celltype
# seurat_obj$detailed_celltype[t_cell_barcodes] <- plyr::mapvalues(
#   x = as.character(Idents(t_cells)),
#   from = names(t_cell_subtypes),
#   to = paste0("T cells: ", t_cell_subtypes)
# )

## -----------------------------------------------------------------------------
# # Create a simple function to check parent-child consistency
# validate_hierarchy <- function(high_level, detailed_level) {
#   # Extract parent type from detailed annotation (before the colon)
#   parent_from_detailed <- vapply(
#     strsplit(detailed_level, ": "),
#     function(x) x[1],
#     character(1)
#   )
# 
#   # Check if parent matches high-level annotation
#   consistent <- parent_from_detailed == high_level
# 
#   # Return consistency check results
#   data.frame(
#     high_level = high_level,
#     detailed_level = detailed_level,
#     consistent = consistent
#   )
# }
# 
# # Apply validation
# hierarchy_validation <- validate_hierarchy(
#   seurat_obj$high_level_celltype,
#   seurat_obj$detailed_celltype
# )
# 
# # Identify inconsistencies
# inconsistencies <- hierarchy_validation[!hierarchy_validation$consistent, ]
# print(inconsistencies)

## -----------------------------------------------------------------------------
# # For noisy data, use fewer top genes
# results_fewer_genes <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 5  # Use fewer genes to focus on strongest signals
# )

## -----------------------------------------------------------------------------
# # Apply stricter filtering to marker genes
# filtered_markers <- marker_data %>%
#   filter(p_val_adj < 0.01, avg_log2FC > 1.0)  # Stricter thresholds
# 
# # Annotate with filtered markers
# results_filtered <- annotate_cell_types(
#   input = filtered_markers,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
# 
# # Define multiple models to use
# models <- c(
#   "claude-sonnet-4-5-20250929",
#   "gpt-5",
#   "gemini-1.5-pro"
# )
# 
# # Create consensus using interactive_consensus_annotation
# consensus_results <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = models,
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )

## -----------------------------------------------------------------------------
# # For data with batch effects, use consensus with lower controversy and entropy thresholds
# batch_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8  # Lower entropy threshold
# )

## -----------------------------------------------------------------------------
# # Include batch information in the tissue context
# batch_aware_results <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data with batch effects
#   tissue_name = "mouse brain with technical batch effects",  # Include batch context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Basic annotation without specific tissue context
# basic_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human sample",  # Generic context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # Annotation with specific tissue context
# specific_results <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human fetal liver at 20 weeks gestation",  # Detailed context
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Create a custom annotation prompt
# custom_prompt <- create_annotation_prompt(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   top_gene_count = 10
# )
# 
# # Modify the prompt to include additional context
# modified_prompt <- paste0(
#   custom_prompt$prompt,
#   "\n\nAdditional context: This sample is from a patient with rheumatoid arthritis. ",
#   "Previous studies have identified activated T cells, B cells, and CXCR4-high monocytes in this condition."
# )
# 
# # Use the modified prompt directly
# custom_results <- get_model_response(
#   prompt = modified_prompt,
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# library(Seurat)
# library(dplyr)
# 
# # Example: Using CellMarker database information to validate annotations
# # This is a conceptual example - implementation would depend on your specific needs
# 
# # 1. Get annotations with mLLMCelltype
# annotations <- annotate_cell_types(
#   input = marker_data,
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # 2. Compare with known marker genes (conceptual)
# # In a real workflow, you would query a database or use a reference dataset
# known_markers <- list(
#   "T cells" = c("CD3D", "CD3E", "CD3G"),
#   "B cells" = c("CD19", "MS4A1", "CD79A"),
#   "Monocytes" = c("CD14", "LYZ", "CSF1R")
# )
# 
# # 3. Validate annotations against known markers
# # This is a simplified example of how you might validate annotations
# validate_annotations <- function(annotations, marker_data, known_markers) {
#   validation_results <- list()
# 
#   for (i in seq_along(annotations)) {
#     cluster_id <- i
#     predicted_type <- annotations[i]
# 
#     # Get markers for this cluster
#     cluster_markers <- marker_data %>%
#       filter(cluster == cluster_id) %>%
#       arrange(desc(avg_log2FC)) %>%
#       pull(gene) %>%
#       head(20)
# 
#     # Check overlap with known markers for this cell type
#     if (predicted_type %in% names(known_markers)) {
#       expected_markers <- known_markers[[predicted_type]]
#       overlap <- intersect(cluster_markers, expected_markers)
# 
#       validation_results[[i]] <- list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = length(overlap),
#         overlap_genes = paste(overlap, collapse = ", "),
#         confidence = length(overlap) / length(expected_markers)
#       )
#     } else {
#       validation_results[[i]] <- list(
#         cluster = cluster_id,
#         predicted_type = predicted_type,
#         overlap_count = 0,
#         overlap_genes = "",
#         confidence = 0
#       )
#     }
#   }
# 
#   return(validation_results)
# }
# 
# # This is a conceptual example of how you might validate annotations
# # validation_results <- validate_annotations(annotations, marker_data, known_markers)

## -----------------------------------------------------------------------------
# library(Seurat)
# library(mLLMCelltype)
# library(ggplot2)
# library(dplyr)
# 
# # Load example PBMC data
# # In a real workflow, you would use your own data
# data("pbmc_small")  # Example dataset from Seurat
# 
# # Find marker genes
# pbmc_markers <- FindAllMarkers(pbmc_small,
#                               only.pos = TRUE,
#                               min.pct = 0.25,
#                               logfc.threshold = 0.25)
# 
# # Set up API keys
# api_keys <- list(
#   anthropic = Sys.getenv("ANTHROPIC_API_KEY"),
#   openai = Sys.getenv("OPENAI_API_KEY"),
#   gemini = Sys.getenv("GEMINI_API_KEY")
# )
# 
# # Use consensus annotation
# consensus_results <- interactive_consensus_annotation(
#   input = pbmc_markers,
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.7,
#   entropy_threshold = 1.0,
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )
# 
# # Add results to Seurat object
# pbmc_small$cell_type <- plyr::mapvalues(
#   x = as.character(Idents(pbmc_small)),
#   from = names(consensus_results$final_annotations),
#   to = consensus_results$final_annotations
# )
# 
# # Visualize results
# # In a real workflow, you would create a UMAP or t-SNE plot
# # DimPlot(pbmc_small, group.by = "cell_type", label = TRUE) +
# #   ggtitle("PBMC Cell Types")

## -----------------------------------------------------------------------------
# # For rare cell types, use these strategies:
# 
# # 1. Increase the number of marker genes considered
# rare_cell_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY"),
#   top_gene_count = 20  # Use more genes for rare cell types
# )
# 
# # 2. Use consensus with lower thresholds to discuss more clusters
# rare_cell_consensus <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5", "gemini-1.5-pro"),
#   api_keys = api_keys,
#   controversy_threshold = 0.4,  # Lower threshold to discuss more clusters
#   entropy_threshold = 0.8,  # Lower entropy threshold
#   consensus_check_model = "claude-sonnet-4-5-20250929"
# )
# 
# # 3. Provide more specific tissue context
# specific_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human bone marrow with expected rare plasma cells and basophils",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Example workflow for cross-species comparison
# 
# # 1. Annotate human and mouse datasets separately
# # (Assuming you have marker data for both species)
# human_annotations <- annotate_cell_types(
#   input = human_marker_data,  # Your human marker data
#   tissue_name = "human brain cortex",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# mouse_annotations <- annotate_cell_types(
#   input = mouse_marker_data,  # Your mouse marker data
#   tissue_name = "mouse brain cortex",
#   model = "claude-sonnet-4-5-20250929",
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # 2. Compare annotations
# # This is a conceptual example - in a real workflow, you would:
# # - Map annotations to Seurat objects
# # - Calculate proportions
# # - Create comparison visualizations
# # - Identify conserved and species-specific cell types
# 
# # Example comparison function (conceptual)
# compare_species_annotations <- function(human_annotations, mouse_annotations) {
#   # Get unique cell types from both species
#   human_types <- unique(human_annotations)
#   mouse_types <- unique(mouse_annotations)
# 
#   # Find common cell types
#   common_types <- intersect(human_types, mouse_types)
# 
#   # Find species-specific cell types
#   human_specific <- setdiff(human_types, mouse_types)
#   mouse_specific <- setdiff(mouse_types, human_types)
# 
#   # Return comparison results
#   list(
#     common_types = common_types,
#     human_specific = human_specific,
#     mouse_specific = mouse_specific
#   )
# }
# 
# # This is a conceptual example
# # comparison <- compare_species_annotations(human_annotations, mouse_annotations)

## -----------------------------------------------------------------------------
# # Example of cost-efficient model selection
# # Choose models based on your specific needs and budget
# 
# # For initial exploration or smaller datasets
# # Use more affordable models
# affordable_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-20250514",  # More affordable model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # For final analysis or challenging datasets
# # Use larger models
# premium_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-sonnet-4-5-20250929",  # Larger model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )
# 
# # Use OpenRouter for access to free models
# openrouter_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "meta-llama/llama-3.3-70b-instruct:free",  # Free model via OpenRouter
#   api_key = Sys.getenv("OPENROUTER_API_KEY")
# )

## -----------------------------------------------------------------------------
# # 1. Use caching with interactive_consensus_annotation
# consensus_with_cache <- interactive_consensus_annotation(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   models = c("claude-sonnet-4-5-20250929", "gpt-5"),
#   api_keys = api_keys,
#   use_cache = TRUE,  # Enable caching
#   cache_dir = NULL  # Uses default system cache directory
# )
# 
# # 2. Process clusters in batches
# # This is a conceptual example - implementation would depend on your workflow
# process_in_batches <- function(marker_data, batch_size = 5) {
#   # Get unique clusters
#   clusters <- unique(marker_data$cluster)
# 
#   # Process in batches
#   results <- list()
#   for (i in seq(1, length(clusters), by = batch_size)) {
#     # Get current batch of clusters
#     batch_clusters <- clusters[i:min(i + batch_size - 1, length(clusters))]
# 
#     # Filter marker data for current batch
#     batch_data <- marker_data %>% filter(cluster %in% batch_clusters)
# 
#     # Process batch
#     batch_results <- annotate_cell_types(
#       input = batch_data,
#       tissue_name = "human PBMC",
#       model = "claude-sonnet-4-5-20250929",
#       api_key = Sys.getenv("ANTHROPIC_API_KEY")
#     )
# 
#     # Store results
#     results <- c(results, batch_results)
#   }
# 
#   return(results)
# }
# 
# # 3. Use faster models for initial exploration
# fast_annotation <- annotate_cell_types(
#   input = marker_data,  # Your marker gene data
#   tissue_name = "human PBMC",
#   model = "claude-haiku-4-20250514",  # Faster model
#   api_key = Sys.getenv("ANTHROPIC_API_KEY")
# )

## -----------------------------------------------------------------------------
# # Define a custom processing function
# # This function must accept prompt, model, and api_key parameters
# custom_process_fn <- function(prompt, model, api_key) {
#   # Custom implementation to process prompts and get responses
#   # This is a simplified example
#   cat("Processing prompt with custom provider\n")
#   cat("Model:", model, "\n")
# 
#   # In a real implementation, you would make API calls here
#   # For example:
#   # response <- httr::POST(
#   #   url = "https://api.custom-provider.com/v1/chat/completions",
#   #   body = list(prompt = prompt, model = model),
#   #   httr::add_headers(Authorization = paste("Bearer", api_key)),
#   #   encode = "json"
#   # )
#   # result <- httr::content(response)$choices[[1]]$text
# 
#   # For this example, just return a fixed response
#   result <- "T cells"
#   return(result)
# }
# 
# # Register the custom provider
# register_custom_provider(
#   provider_name = "custom_provider",
#   process_fn = custom_process_fn,
#   description = "My custom LLM provider"
# )
# 
# # Register a custom model
# register_custom_model(
#   model_name = "custom-model",
#   provider_name = "custom_provider",
#   model_config = list(
#     temperature = 0.7,
#     max_tokens = 2000
#   )
# )
# 
# # Use the custom model
# # custom_results <- annotate_cell_types(
# #   input = marker_data,
# #   tissue_name = "human PBMC",
# #   model = "custom-model",
# #   api_key = "your-custom-api-key"
# # )

## -----------------------------------------------------------------------------
# # Configure the global logger (recommended approach)
# configure_logger(level = "INFO", console_output = TRUE, json_format = TRUE)
# 
# # Use simple logging functions
# log_info("Starting analysis of cluster 0", list(
#   cluster_id = "0",
#   tissue_name = "human PBMC",
#   marker_genes = c("CD3D", "CD3E", "CD2", "IL7R", "LTB")
# ))
# 
# # Log API calls with performance tracking
# log_info("API call completed", list(
#   provider = "anthropic",
#   model = "claude-sonnet-4-5-20250929",
#   duration_seconds = 2.34,
#   success = TRUE
# ))
# 
# # Log warnings and errors
# log_warn("Model response had unusual format", list(
#   model = "gpt-5",
#   response_length = 50
# ))
# 
# log_error("API call failed", list(
#   provider = "openai",
#   error = "Rate limit exceeded"
# ))
# 
# # Alternatively, create a custom logger instance
# custom_logger <- UnifiedLogger$new(
#   base_dir = "custom_logs",
#   level = "DEBUG",
#   console_output = TRUE,
#   json_format = TRUE
# )
# 
# # Use the custom logger
# custom_logger$info("Custom log message", list(analysis_step = "preprocessing"))
# custom_logger$debug("Detailed debugging info", list(variable_state = "initialized"))
# 
# # Get performance summary
# performance <- get_logger()$get_performance_summary()
# print(performance)

## -----------------------------------------------------------------------------
# # Create a cache manager
# cache_manager <- CacheManager$new(cache_dir = NULL)
# 
# # Generate a cache key
# cache_key <- cache_manager$generate_key(
#   input = marker_data,
#   models = c("claude-sonnet-4-5-20250929", "gpt-5"),
#   cluster_id = "0"
# )
# 
# # Check if results exist in cache
# if (cache_manager$has_cache(cache_key)) {
#   # Load from cache
#   cached_results <- cache_manager$load_from_cache(cache_key)
# } else {
#   # Process and save to cache
#   # results <- process_cluster(...)
#   # cache_manager$save_to_cache(cache_key, results)
# }
# 
# # Get cache statistics
# cache_stats <- cache_manager$get_cache_stats()
# 
# # Clear cache (with confirmation)
# # cache_manager$clear_cache(confirm = TRUE)

## -----------------------------------------------------------------------------
# # Check cache location
# mllmcelltype_cache_dir()
# 
# # Use local cache
# mllmcelltype_cache_dir("local")
# 
# # Clear cache
# mllmcelltype_clear_cache()