diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..72e5ed2c2f28e5743260e77a37767989a1d1ec71
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# Virtual environment
+venv/
+env/
+.env
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Gradio
+flagged/
+gradio_cached_examples/
+
+# Logs
+*.log
diff --git a/DEPLOYMENT_GUIDE.md b/DEPLOYMENT_GUIDE.md
new file mode 100644
index 0000000000000000000000000000000000000000..4153b281870b511f6353e2cac491a2821d6ec7f7
--- /dev/null
+++ b/DEPLOYMENT_GUIDE.md
@@ -0,0 +1,306 @@
+# Hugging Face Spaces Deployment Guide
+
+This guide walks you through deploying the LangGraph Multi-Agent MCTS demo to Hugging Face Spaces.
+
+## Prerequisites
+
+- [Hugging Face Account](https://huggingface.co/join)
+- Git installed locally
+- Python 3.10+ (for local testing)
+
+## Step 1: Create a New Space
+
+1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+2. Click **"Create new Space"**
+3. Fill in the form:
+   - **Owner**: Your username or organization
+   - **Space name**: `langgraph-mcts-demo` (or your choice)
+   - **License**: MIT
+   - **SDK**: Gradio
+   - **Hardware**: CPU Basic (Free tier - sufficient for demo)
+   - **Visibility**: Public (or Private)
+4. Click **"Create Space"**
+
+## Step 2: Clone and Deploy
+
+### Option A: Git-based Deployment (Recommended)
+
+```bash
+# 1. Clone your new empty Space
+git clone https://huggingface.co/spaces/YOUR_USERNAME/langgraph-mcts-demo
+cd langgraph-mcts-demo
+
+# 2. Copy demo files from this directory
+# (the shell glob skips dotfiles, so copy .gitignore separately)
+cp -r /path/to/huggingface_space/* .
+cp /path/to/huggingface_space/.gitignore .
+
+# 3. Verify structure
+ls -la
+# Should show:
+# - app.py
+# - requirements.txt
+# - README.md
+# - .gitignore
+# - demo_src/
+#   - __init__.py
+#   - agents_demo.py
+#   - llm_mock.py
+#   - mcts_demo.py
+#   - wandb_tracker.py
+
+# 4. Commit and push
+git add -A
+git commit -m "Initial deployment of LangGraph Multi-Agent MCTS demo"
+git push
+
+# 5. The Space will automatically build and deploy (takes 2-5 minutes)
+```
+
+### Option B: Direct Upload via Web UI
+
+1. Navigate to your Space on Hugging Face
+2. Click the **"Files"** tab
+3. Click **"Add file"** → **"Upload files"**
+4. Upload all files, maintaining the directory structure:
+   - `app.py`
+   - `requirements.txt`
+   - `README.md`
+   - `.gitignore`
+   - `demo_src/__init__.py`
+   - `demo_src/agents_demo.py`
+   - `demo_src/llm_mock.py`
+   - `demo_src/mcts_demo.py`
+   - `demo_src/wandb_tracker.py`
+5. Commit changes
+
+## Step 3: Monitor Deployment
+
+1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/langgraph-mcts-demo`
+2. Click the **"Logs"** tab to monitor build progress
+3. Wait for the "Running on" message
+4. Your demo is now live!
+
+## Step 4: Test the Demo
+
+1. Enter a query or select an example
+2. Enable/disable different agents
+3. Adjust MCTS parameters
+4. Click "Process Query"
+5. Review results and consensus scores
+
+## Optional: Enable Real LLM Responses
+
+To use the Hugging Face Inference API instead of mock responses:
+
+### 1. Update requirements.txt
+
+```txt
+gradio>=4.0.0,<5.0.0
+numpy>=1.24.0,<2.0.0
+huggingface_hub>=0.20.0
+```
+
+### 2. Add Secret Token
+
+1. Go to Space Settings → **Repository secrets**
+2. Add new secret:
+   - Name: `HF_TOKEN`
+   - Value: Your Hugging Face token (from [Settings → Access Tokens](https://huggingface.co/settings/tokens))
+
+### 3. 
Update app.py Initialization + +Change line ~290 in `app.py`: + +```python +# From: +framework = MultiAgentFrameworkDemo(use_hf_inference=False) + +# To: +import os +framework = MultiAgentFrameworkDemo( + use_hf_inference=True, + hf_model="mistralai/Mistral-7B-Instruct-v0.2" +) +``` + +### 4. Commit and Push + +```bash +git add -A +git commit -m "Enable Hugging Face Inference API" +git push +``` + +## Optional: Enable Weights & Biases Tracking + +Track experiments and visualize metrics with W&B integration. + +### 1. Get W&B API Key + +1. Sign up at [wandb.ai](https://wandb.ai) +2. Go to Settings → API Keys +3. Copy your API key + +### 2. Add W&B Secret to Space + +1. Go to Space Settings → **Repository secrets** +2. Add new secret: + - Name: `WANDB_API_KEY` + - Value: Your W&B API key + +### 3. Use W&B in the Demo + +1. Expand "Weights & Biases Tracking" accordion in the UI +2. Check "Enable W&B Tracking" +3. Optionally set: + - **Project Name**: Your W&B project (default: `langgraph-mcts-demo`) + - **Run Name**: Custom name for this run (auto-generated if empty) +4. Process your query +5. View the W&B run URL in the results + +### 4. What Gets Logged + +- **Agent Metrics**: Confidence scores, execution times, response lengths +- **MCTS Metrics**: Best value, visits, tree depth, exploration paths +- **Consensus Metrics**: Agreement scores, agent combinations +- **Performance**: Total processing time +- **Artifacts**: Full JSON results as artifacts + +### 5. View Your Dashboard + +After runs, visit your W&B project dashboard to: +- Compare different agent configurations +- Visualize consensus patterns +- Analyze MCTS exploration strategies +- Track performance over time + +## Customization Options + +### Change Gradio Theme + +In `app.py`, modify: + +```python +with gr.Blocks( + theme=gr.themes.Soft(), # Try: Default(), Monochrome(), Glass() + ... +) as demo: +``` + +### Add Custom Examples + +Update `EXAMPLE_QUERIES` list in `app.py`: + +```python +EXAMPLE_QUERIES = [ + "Your custom query 1", + "Your custom query 2", + ... +] +``` + +### Adjust MCTS Parameters + +Modify sliders in `app.py`: + +```python +mcts_iterations = gr.Slider( + minimum=10, + maximum=200, # Increase for more thorough search + value=50, # Change default + ... +) +``` + +### Add More Agent Types + +1. Create new agent in `demo_src/agents_demo.py` +2. Add to `MultiAgentFrameworkDemo` in `app.py` +3. Add UI controls in Gradio interface + +## Troubleshooting + +### Build Fails + +- Check **Logs** tab for error details +- Verify `requirements.txt` has compatible versions +- Ensure all imports in `app.py` are satisfied + +### Slow Performance + +- Reduce default MCTS iterations +- Use mock LLM (no API calls) +- Simplify tree visualization + +### Memory Issues (Free Tier) + +- Limit max MCTS iterations to 100 +- Reduce tree depth in `demo_src/mcts_demo.py` +- Simplify response generation + +### Missing Files + +Ensure directory structure: +``` +your-space/ +├── app.py +├── requirements.txt +├── README.md +├── .gitignore +└── demo_src/ + ├── __init__.py + ├── agents_demo.py + ├── llm_mock.py + ├── mcts_demo.py + └── wandb_tracker.py +``` + +## Upgrading Hardware + +For better performance: + +1. Go to Space Settings +2. Under **Hardware**, select: + - **CPU Upgrade** ($0.03/hr) - Faster processing + - **T4 Small** ($0.60/hr) - GPU for neural models +3. 
Save changes
+
+## Sharing Your Space
+
+### Embed in Website
+
+Embed the running demo in any webpage with an `iframe` pointing at the Space's direct URL:
+
+```html
+<iframe
+  src="https://YOUR_USERNAME-langgraph-mcts-demo.hf.space"
+  frameborder="0"
+  width="850"
+  height="450"
+></iframe>
+```
+
+### Direct Link
+
+Share: `https://huggingface.co/spaces/YOUR_USERNAME/langgraph-mcts-demo`
+
+### API Access
+
+Gradio automatically provides an API endpoint:
+```
+https://YOUR_USERNAME-langgraph-mcts-demo.hf.space/api/predict
+```
+
+## Next Steps
+
+1. **Collect Feedback**: Enable flagging for user feedback
+2. **Add Analytics**: Track usage patterns
+3. **Extend Agents**: Add domain-specific reasoning modules
+4. **Integrate RAG**: Connect to vector databases for real context
+5. **Add Visualization**: Enhanced tree and consensus displays
+
+## Support
+
+- **Hugging Face Docs**: https://huggingface.co/docs/hub/spaces
+- **Gradio Docs**: https://www.gradio.app/docs
+- **Full Framework**: https://github.com/ianshank/langgraph_multi_agent_mcts
+
+---
+
+**Happy Deploying!** 🚀
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d1bbfbe60a76289200e3169f54d1573b7f27a71
--- /dev/null
+++ b/README.md
@@ -0,0 +1,225 @@
+---
+title: LangGraph Multi-Agent MCTS Demo
+emoji: 🌳
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+license: mit
+tags:
+  - multi-agent
+  - mcts
+  - reasoning
+  - langgraph
+  - ai-agents
+  - wandb
+  - experiment-tracking
+short_description: Multi-agent reasoning framework with Monte Carlo Tree Search
+---
+
+# LangGraph Multi-Agent MCTS Framework
+
+**Production Demo with Trained Neural Models** - Experience real trained meta-controllers for intelligent agent routing
+
+## What This Demo Shows
+
+This interactive demo showcases trained neural meta-controllers that dynamically route queries to specialized agents:
+
+### 🤖 Trained Meta-Controllers
+
+1. **RNN Meta-Controller**
+   - GRU-based recurrent neural network
+   - Learns sequential patterns in agent performance
+   - Fast inference (~2ms latency)
+   - Trained on 1000+ synthetic routing examples
+
+2. **BERT Meta-Controller with LoRA**
+   - Transformer-based text understanding
+   - Parameter-efficient fine-tuning with LoRA adapters
+   - Context-aware routing decisions
+   - Better generalization to unseen query patterns
+
+### 🧠 Three Specialized Agents
+
+1. **HRM (Hierarchical Reasoning Module)**
+   - Best for: Complex decomposition, multi-level problems
+   - Technique: Hierarchical planning with adaptive computation
+
+2. **TRM (Tree Reasoning Module)**
+   - Best for: Iterative refinement, comparison tasks
+   - Technique: Recursive refinement with convergence detection
+
+3. **MCTS (Monte Carlo Tree Search)**
+   - Best for: Optimization, strategic planning
+   - Technique: UCB1 exploration with value backpropagation
+
+### 📊 Key Features
+
+- **Real Trained Models**: Production-ready neural meta-controllers
+- **Intelligent Routing**: Models learn optimal agent selection patterns
+- **Routing Visualization**: See confidence scores and probability distributions
+- **Feature Engineering**: Demonstrates the query → features → routing pipeline
+- **Performance Metrics**: Track execution time and routing accuracy
+
+## How to Use
+
+1. **Enter a Query**: Type your question or select an example
+2. **Select Controller**: Choose RNN (fast) or BERT (context-aware)
+3. **Process Query**: Click "🚀 Process Query"
+4. 
**Review Results**: + - See which agent the controller selected + - View routing confidence and probabilities + - Examine features used for decision-making + - Check agent execution details + +## Weights & Biases Integration + +Track your experiments with **Weights & Biases** for: +- 📈 **Metrics Dashboard**: Visualize consensus scores, execution times, agent performance +- 🔄 **Run Comparison**: Compare different configurations side-by-side +- 📊 **Experiment History**: Track all your queries and results +- 🌳 **MCTS Visualization**: Log tree exploration patterns + +### Setting Up W&B + +1. **Get API Key**: Sign up at [wandb.ai](https://wandb.ai) and get your API key +2. **Configure Space Secret** (if deploying your own): + - Go to Space Settings → Repository secrets + - Add: `WANDB_API_KEY` = your API key +3. **Enable in UI**: + - Expand "Weights & Biases Tracking" accordion + - Check "Enable W&B Tracking" + - Set project name (optional) + - Set run name (optional, auto-generated if empty) +4. **View Results**: After processing, click the W&B run URL to see your dashboard + +### Logged Metrics + +- **Per Agent**: Confidence, execution time, response length, reasoning steps +- **MCTS**: Best value, visits, tree depth, top actions with UCB1 scores +- **Consensus**: Score, level (high/medium/low), number of agents +- **Performance**: Total processing time +- **Artifacts**: Full JSON results, tree visualizations + +## Example Queries + +- "What are the key factors to consider when choosing between microservices and monolithic architecture?" +- "How can we optimize a Python application that processes 10GB of log files daily?" +- "Should we use SQL or NoSQL database for a social media application with 1M users?" +- "How to design a fault-tolerant message queue system?" + +## Technical Details + +### Architecture + +``` +Query Input + │ + ├─→ HRM Agent (Hierarchical Decomposition) + │ ├─ Component Analysis + │ └─ Structured Synthesis + │ + ├─→ TRM Agent (Iterative Refinement) + │ ├─ Initial Response + │ ├─ Clarity Enhancement + │ └─ Validation Check + │ + └─→ MCTS Engine (Strategic Search) + ├─ Selection (UCB1) + ├─ Expansion + ├─ Simulation + └─ Backpropagation + │ + ▼ + Consensus Scoring + │ + ▼ + Final Synthesized Response +``` + +### MCTS Algorithm + +The Monte Carlo Tree Search implementation uses: + +- **UCB1 Selection**: `Q(s,a) + C * sqrt(ln(N(s)) / N(s,a))` +- **Progressive Widening**: Controls branching factor +- **Domain-Aware Actions**: Contextual decision options +- **Value Backpropagation**: Updates entire path statistics + +### Consensus Calculation + +``` +consensus = average_confidence * agreement_factor +agreement_factor = max(0, 1 - std_deviation * 2) +``` + +High consensus (>70%) indicates agents agree on approach. +Low consensus (<40%) suggests uncertainty or conflicting strategies. + +## Demo Scope + +This demonstration focuses on **meta-controller training and routing**: + +- ✅ **Real Trained Models**: Production RNN and BERT controllers +- ✅ **Actual Model Loading**: PyTorch and HuggingFace Transformers +- ✅ **Feature Engineering**: Query analysis → feature vectors +- ✅ **Routing Visualization**: See controller decision-making +- ⚠️ **Simplified Agents**: Agent responses are mocked for demo purposes +- ⚠️ **No Live LLM Calls**: Agents don't call actual LLMs (to reduce latency/cost) + +## Full Production Framework + +The complete repository includes all production features: + +- ✅ **Neural Meta-Controllers**: RNN and BERT with LoRA (deployed here!) 
+- ✅ **Agent Implementations**: Full HRM, TRM, and MCTS with PyTorch +- ✅ **Training Pipeline**: Data generation, training, evaluation +- ✅ **LLM Integration**: OpenAI, Anthropic, LM Studio support +- ✅ **RAG Systems**: ChromaDB, FAISS, Pinecone vector stores +- ✅ **Observability**: OpenTelemetry tracing, Prometheus metrics +- ✅ **Storage**: S3 artifact storage, experiment tracking +- ✅ **CI/CD**: Automated testing, security scanning, deployment + +**GitHub Repository**: [ianshank/langgraph_multi_agent_mcts](https://github.com/ianshank/langgraph_multi_agent_mcts) + +## Technical Stack + +- **Python**: 3.11+ +- **UI**: Gradio 4.x +- **ML Frameworks**: PyTorch 2.1+, Transformers, PEFT (LoRA) +- **Models**: GRU-based RNN, BERT-mini with LoRA adapters +- **Architecture**: Neural meta-controller + multi-agent system +- **Experiment Tracking**: Weights & Biases (optional) +- **Numerical**: NumPy + +## Research Applications + +This framework demonstrates concepts applicable to: + +- Complex decision-making systems +- AI-assisted software architecture decisions +- Multi-perspective problem analysis +- Strategic planning with uncertainty + +## Citation + +If you use this framework in research, please cite: + +```bibtex +@software{langgraph_mcts_2024, + title={LangGraph Multi-Agent MCTS Framework}, + author={Your Name}, + year={2024}, + url={https://github.com/ianshank/langgraph_multi_agent_mcts} +} +``` + +## License + +MIT License - See repository for details. + +--- + +**Built with** LangGraph, Gradio, and Python | **Demo Version**: 1.0.0 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..4543e9791790e1600dac3a2e611801079ecead0b --- /dev/null +++ b/app.py @@ -0,0 +1,553 @@ +""" +LangGraph Multi-Agent MCTS Framework - Integrated Demo with Trained Models + +Demonstrates the actual trained neural meta-controllers: +- RNN Meta-Controller for sequential pattern recognition +- BERT with LoRA adapters for text-based routing + +This is a production demonstration using real trained models. +""" + +import asyncio +import sys +import time +from dataclasses import dataclass +from pathlib import Path + +# Fail fast if critical dependencies are missing or broken +try: + import peft + + print(f"[OK] PEFT library imported successfully (version: {peft.__version__})") +except ImportError as e: + print(f"CRITICAL ERROR: Could not import peft library: {e}") + # We don't exit here to allow the app to crash naturally later with full stack trace, + # but this print ensures it's visible in the logs immediately. 
+ +import gradio as gr +import torch + +# Import the trained controllers +sys.path.insert(0, str(Path(__file__).parent)) + +from src.agents.meta_controller.base import MetaControllerFeatures +from src.agents.meta_controller.bert_controller import BERTMetaController +from src.agents.meta_controller.rnn_controller import RNNMetaController +from src.agents.meta_controller.feature_extractor import ( + FeatureExtractor, + FeatureExtractorConfig, +) +from src.utils.personality_response import PersonalityResponseGenerator + + +@dataclass +class AgentResult: + """Result from a single agent.""" + + agent_name: str + response: str + confidence: float + reasoning_steps: list[str] + execution_time_ms: float + + +@dataclass +class ControllerDecision: + """Decision made by the meta-controller.""" + + selected_agent: str + confidence: float + routing_probabilities: dict[str, float] + features_used: dict + + +def create_features_from_query( + query: str, + iteration: int = 0, + last_agent: str = "none", + feature_extractor: FeatureExtractor | None = None, +) -> MetaControllerFeatures: + """ + Convert a text query into features for the meta-controller. + + Uses semantic embeddings for robust feature extraction. Falls back to + heuristic-based extraction if embeddings are not available. + + Args: + query: The input query text + iteration: Current iteration number + last_agent: Name of the last agent used + feature_extractor: Optional FeatureExtractor instance (created if None) + + Returns: + MetaControllerFeatures instance + """ + # Use provided feature extractor or create a new one + if feature_extractor is None: + try: + config = FeatureExtractorConfig.from_env() + feature_extractor = FeatureExtractor(config) + except Exception as e: + print(f"Warning: Failed to initialize FeatureExtractor: {e}") + print("Falling back to heuristic-based feature extraction") + # Will use heuristic fallback below + + # Extract features using the feature extractor + try: + if feature_extractor is not None: + return feature_extractor.extract_features(query, iteration, last_agent) + except Exception as e: + print(f"Warning: Feature extraction failed: {e}") + print("Falling back to heuristic-based feature extraction") + + # Fallback to original heuristic-based extraction + # (This code is kept as a safety net but should rarely be used) + query_length = len(query) + + # Estimate complexity based on query characteristics + has_multiple_questions = "?" 
in query and query.count("?") > 1 + has_comparison = any(word in query.lower() for word in ["vs", "versus", "compare", "difference", "better"]) + has_optimization = any(word in query.lower() for word in ["optimize", "best", "improve", "maximize", "minimize"]) + has_technical = any(word in query.lower() for word in ["algorithm", "code", "implement", "technical", "system"]) + + # Create mock confidence scores based on query characteristics + hrm_confidence = 0.5 + (0.3 if has_multiple_questions else 0) + (0.1 if has_technical else 0) + trm_confidence = 0.5 + (0.3 if has_comparison else 0) + (0.1 if query_length > 100 else 0) + mcts_confidence = 0.5 + (0.3 if has_optimization else 0) + (0.1 if has_technical else 0) + + # Normalize + total = hrm_confidence + trm_confidence + mcts_confidence + if total == 0: + hrm_confidence = 1.0 / 3.0 + trm_confidence = 1.0 / 3.0 + mcts_confidence = 1.0 / 3.0 + else: + hrm_confidence /= total + trm_confidence /= total + mcts_confidence /= total + + # Calculate consensus score + max_confidence = max(hrm_confidence, trm_confidence, mcts_confidence) + if max_confidence == 0: + consensus_score = 0.0 + else: + consensus_score = min(hrm_confidence, trm_confidence, mcts_confidence) / max_confidence + + features = MetaControllerFeatures( + hrm_confidence=hrm_confidence, + trm_confidence=trm_confidence, + mcts_value=mcts_confidence, + consensus_score=consensus_score, + last_agent=last_agent, + iteration=iteration, + query_length=query_length, + has_rag_context=query_length > 50, + rag_relevance_score=0.7 if query_length > 50 else 0.0, + is_technical_query=has_technical, + ) + + return features + + +class IntegratedFramework: + """ + Integrated multi-agent framework using trained meta-controllers. + """ + + def __init__(self): + """Initialize the framework with trained models.""" + self.device = "cuda" if torch.cuda.is_available() else "cpu" + print(f"Using device: {self.device}") + + # Initialize feature extractor with semantic embeddings + print("Initializing Feature Extractor...") + try: + config = FeatureExtractorConfig.from_env() + # Set device to match the framework device + config.device = self.device + self.feature_extractor = FeatureExtractor(config) + print(f"[OK] Feature Extractor initialized: {self.feature_extractor}") + except Exception as e: + print(f"[WARN] Failed to initialize Feature Extractor: {e}") + print("[WARN] Will fall back to heuristic-based feature extraction") + self.feature_extractor = None + + # Load trained RNN Meta-Controller + print("Loading RNN Meta-Controller...") + self.rnn_controller = RNNMetaController(name="RNNController", seed=42, device=self.device) + + # Load the trained weights + rnn_model_path = Path(__file__).parent / "models" / "rnn_meta_controller.pt" + if rnn_model_path.exists(): + checkpoint = torch.load(rnn_model_path, map_location=self.device, weights_only=True) + self.rnn_controller.model.load_state_dict(checkpoint) + self.rnn_controller.model.eval() + print(f"[OK] Loaded RNN model from {rnn_model_path}") + else: + print(f"[WARN] RNN model not found at {rnn_model_path}, using untrained model") + + # Load trained BERT Meta-Controller with LoRA + print("Loading BERT Meta-Controller with LoRA...") + self.bert_controller = BERTMetaController(name="BERTController", seed=42, device=self.device, use_lora=True) + + bert_model_path = Path(__file__).parent / "models" / "bert_lora" / "final_model" + if bert_model_path.exists(): + try: + self.bert_controller.load_model(str(bert_model_path)) + print(f"[OK] Loaded BERT LoRA model 
from {bert_model_path}") + except Exception as e: + print(f"[WARN] Error loading BERT model: {e}") + print("Using untrained BERT model") + else: + print(f"[WARN] BERT model not found at {bert_model_path}, using untrained model") + + # Agent routing map + self.agent_handlers = { + "hrm": self._handle_hrm, + "trm": self._handle_trm, + "mcts": self._handle_mcts, + } + + print("Framework initialized successfully!") + + async def process_query( + self, + query: str, + controller_type: str = "rnn", + ) -> tuple[AgentResult, ControllerDecision]: + """ + Process a query using the trained meta-controller. + + Args: + query: The input query + controller_type: Which controller to use ("rnn" or "bert") + + Returns: + (agent_result, controller_decision) tuple + """ + start_time = time.perf_counter() + + # Step 1: Convert query to features using semantic embeddings + features = create_features_from_query(query, feature_extractor=self.feature_extractor) + + # Step 2: Get controller decision + if controller_type == "rnn": + prediction = self.rnn_controller.predict(features) + else: # bert + prediction = self.bert_controller.predict(features) + + selected_agent = prediction.agent + confidence = prediction.confidence + + # Get routing probabilities (prediction.probabilities is already a dict) + routing_probs = prediction.probabilities + + # Step 3: Route to selected agent + handler = self.agent_handlers.get(selected_agent, self._handle_hrm) + agent_result = await handler(query) + + # Create controller decision summary + controller_decision = ControllerDecision( + selected_agent=selected_agent, + confidence=confidence, + routing_probabilities=routing_probs, + features_used={ + "hrm_confidence": features.hrm_confidence, + "trm_confidence": features.trm_confidence, + "mcts_value": features.mcts_value, + "consensus_score": features.consensus_score, + "query_length": features.query_length, + "is_technical": features.is_technical_query, + }, + ) + + total_time = (time.perf_counter() - start_time) * 1000 + agent_result.execution_time_ms = round(total_time, 2) + + return agent_result, controller_decision + + async def _handle_hrm(self, query: str) -> AgentResult: + """Handle query with Hierarchical Reasoning Module.""" + # Simulate HRM processing + await asyncio.sleep(0.1) + + steps = [ + "Decompose query into hierarchical subproblems", + "Apply high-level reasoning (H-Module)", + "Execute low-level refinement (L-Module)", + "Synthesize hierarchical solution", + ] + + response = f"[HRM Analysis] Breaking down the problem hierarchically: {query[:100]}..." + + return AgentResult( + agent_name="HRM (Hierarchical Reasoning)", + response=response, + confidence=0.85, + reasoning_steps=steps, + execution_time_ms=0.0, + ) + + async def _handle_trm(self, query: str) -> AgentResult: + """Handle query with Tree Reasoning Module.""" + # Simulate TRM processing + await asyncio.sleep(0.1) + + steps = [ + "Initialize solution state", + "Recursive refinement iteration 1", + "Recursive refinement iteration 2", + "Convergence achieved - finalize", + ] + + response = f"[TRM Analysis] Applying iterative refinement: {query[:100]}..." 
+ + return AgentResult( + agent_name="TRM (Iterative Refinement)", + response=response, + confidence=0.80, + reasoning_steps=steps, + execution_time_ms=0.0, + ) + + async def _handle_mcts(self, query: str) -> AgentResult: + """Handle query with MCTS.""" + # Simulate MCTS processing + await asyncio.sleep(0.15) + + steps = [ + "Build search tree", + "Selection: UCB1 exploration", + "Expansion: Add promising nodes", + "Simulation: Rollout evaluation", + "Backpropagation: Update values", + ] + + response = f"[MCTS Analysis] Strategic exploration via tree search: {query[:100]}..." + + return AgentResult( + agent_name="MCTS (Monte Carlo Tree Search)", + response=response, + confidence=0.88, + reasoning_steps=steps, + execution_time_ms=0.0, + ) + + +# Global framework instance +framework = None + + +def initialize_framework(): + """Initialize or reinitialize the framework.""" + global framework + try: + framework = IntegratedFramework() + return "[OK] Framework initialized with trained models!" + except Exception as e: + return f"[ERROR] Error initializing framework: {str(e)}" + + +def process_query_sync( + query: str, + controller_type: str, +): + """Synchronous wrapper for async processing.""" + global framework + + if framework is None: + framework = IntegratedFramework() + + if not query.strip(): + return ("Please enter a query.", {}, "", {}, "", "") + + # Run async function + agent_result, controller_decision = asyncio.run( + framework.process_query(query=query, controller_type=controller_type.lower()) + ) + + # Format outputs + final_response = agent_result.response + + # Generate personality-infused response + personality_gen = PersonalityResponseGenerator() + try: + personality_response = personality_gen.generate_response( + agent_response=final_response, + query=query + ) + except Exception as e: + # Fallback to a simple wrapper if personality generation fails + personality_response = f"Here's what I found:\n\n{final_response}" + print(f"Warning: Personality generation failed: {e}") + + # Controller decision visualization + routing_viz = "### 🧠 Meta-Controller Decision\n\n" + routing_viz += f"**Selected Agent:** `{controller_decision.selected_agent.upper()}`\n\n" + routing_viz += f"**Confidence:** {controller_decision.confidence:.1%}\n\n" + routing_viz += "**Routing Probabilities:**\n" + for agent, prob in controller_decision.routing_probabilities.items(): + bar = "█" * int(prob * 50) + routing_viz += f"- **{agent.upper()}**: {prob:.1%} {bar}\n" + + # Agent details + agent_details = { + "agent": agent_result.agent_name, + "confidence": f"{agent_result.confidence:.1%}", + "reasoning_steps": agent_result.reasoning_steps, + "execution_time_ms": agent_result.execution_time_ms, + } + + # Features used + features_viz = "### 📊 Features Used for Routing\n\n" + for feature, value in controller_decision.features_used.items(): + if isinstance(value, float): + features_viz += f"- **{feature}**: {value:.3f}\n" + elif isinstance(value, bool): + features_viz += f"- **{feature}**: {'Yes' if value else 'No'}\n" + else: + features_viz += f"- **{feature}**: {value}\n" + + # Metrics + metrics = f""" +**Controller:** {controller_type} +**Execution Time:** {agent_result.execution_time_ms:.2f} ms +**Agent Confidence:** {agent_result.confidence:.1%} +""" + + return final_response, agent_details, routing_viz, features_viz, metrics, personality_response + + +# Example queries +EXAMPLE_QUERIES = [ + "What are the key factors to consider when choosing between microservices and monolithic architecture?", + "How can 
we optimize a Python application that processes 10GB of log files daily?", + "Compare the performance characteristics of B-trees vs LSM-trees for write-heavy workloads", + "Design a distributed rate limiting system that handles 100k requests per second", + "Explain the difference between supervised and unsupervised learning with examples", +] + + +# Gradio Interface +with gr.Blocks( + title="LangGraph Multi-Agent MCTS - Trained Models Demo", + theme=gr.themes.Soft(), + css=""" + .agent-box { border: 1px solid #ddd; padding: 10px; border-radius: 5px; margin: 5px 0; } + .highlight { background-color: #e3f2fd; padding: 10px; border-radius: 5px; margin: 10px 0; } + """, +) as demo: + gr.Markdown( + """ + # 🎯 LangGraph Multi-Agent MCTS Framework + ## Production Demo with Trained Neural Meta-Controllers + + This demo uses **REAL trained models**: + - 🧠 **RNN Meta-Controller**: GRU-based sequential pattern recognition + - 🤖 **BERT with LoRA**: Transformer-based text understanding for routing + + The meta-controllers learn to route queries to the optimal agent: + - **HRM**: Hierarchical reasoning for complex decomposition + - **TRM**: Iterative refinement for progressive improvement + - **MCTS**: Strategic exploration for optimization problems + + --- + """ + ) + + with gr.Row(): + with gr.Column(scale=2): + query_input = gr.Textbox( + label="Query", placeholder="Enter your question or reasoning task...", lines=4, max_lines=10 + ) + + gr.Markdown("**Example Queries:**") + example_dropdown = gr.Dropdown(choices=EXAMPLE_QUERIES, label="Select an example", interactive=True) + + def load_example(example): + return example + + example_dropdown.change(load_example, example_dropdown, query_input) + + with gr.Column(scale=1): + gr.Markdown("**Meta-Controller Selection**") + controller_type = gr.Radio( + choices=["RNN", "BERT"], + value="RNN", + label="Controller Type", + info="Choose which trained controller to use", + ) + + gr.Markdown( + """ + **Controller Comparison:** + - **RNN**: Fast, captures sequential patterns + - **BERT**: More context-aware, text understanding + """ + ) + + process_btn = gr.Button("🚀 Process Query", variant="primary", size="lg") + + gr.Markdown("---") + + with gr.Row(): + with gr.Column(): + gr.Markdown("### 🎯 Agent Response") + final_response_output = gr.Textbox(label="Response", lines=4, interactive=False) + + gr.Markdown("### 🤝 Personality-Infused Response") + gr.Markdown("*A conversational, balanced advisor interpretation*") + personality_output = gr.Textbox(label="Balanced Advisor Response", lines=8, interactive=False) + + gr.Markdown("### 📈 Performance Metrics") + metrics_output = gr.Markdown() + + with gr.Column(): + routing_viz = gr.Markdown(label="Controller Decision") + features_viz = gr.Markdown(label="Features") + + with gr.Accordion("🔍 Detailed Agent Information", open=False): + agent_details_output = gr.JSON(label="Agent Execution Details") + + # Wire up the processing + process_btn.click( + fn=process_query_sync, + inputs=[ + query_input, + controller_type, + ], + outputs=[final_response_output, agent_details_output, routing_viz, features_viz, metrics_output, personality_output], + ) + + gr.Markdown( + """ + --- + + ### 📚 About This Demo + + This is a **production demonstration** of trained neural meta-controllers for multi-agent routing. 
+ + **Models:** + - RNN Meta-Controller: 10-dimensional feature vector → 3-class routing (HRM/TRM/MCTS) + - BERT with LoRA: Text features → routing decision with adapters + + **Training:** + - Synthetic dataset: 1000+ samples with balanced routing decisions + - Optimization: Adam optimizer, cross-entropy loss + - Validation: 80/20 train/val split with early stopping + + **Repository:** [GitHub - langgraph_multi_agent_mcts](https://github.com/ianshank/langgraph_multi_agent_mcts) + + --- + *Built with PyTorch, Transformers, PEFT, and Gradio* + """ + ) + + +if __name__ == "__main__": + # Initialize framework + print("Initializing framework with trained models...") + framework = IntegratedFramework() + + # Launch the demo + demo.launch(server_name="0.0.0.0", share=False, show_error=True) diff --git a/app_mock.py b/app_mock.py new file mode 100644 index 0000000000000000000000000000000000000000..e8433a617be2773f6a62a15d3a511cbc01a2ba87 --- /dev/null +++ b/app_mock.py @@ -0,0 +1,590 @@ +""" +LangGraph Multi-Agent MCTS Framework - Hugging Face Spaces Demo + +A proof-of-concept demonstration of multi-agent reasoning with Monte Carlo Tree Search. +""" + +import asyncio +import time +from dataclasses import dataclass + +import gradio as gr +import numpy as np + +# Demo-specific simplified implementations +from demo_src.agents_demo import HRMAgent, TRMAgent +from demo_src.llm_mock import HuggingFaceClient, MockLLMClient +from demo_src.mcts_demo import MCTSDemo +from demo_src.wandb_tracker import WandBTracker, is_wandb_available + + +@dataclass +class AgentResult: + """Result from a single agent.""" + + agent_name: str + response: str + confidence: float + reasoning_steps: list[str] + execution_time_ms: float + + +@dataclass +class FrameworkResult: + """Combined result from all agents.""" + + query: str + hrm_result: AgentResult | None + trm_result: AgentResult | None + mcts_result: dict | None + consensus_score: float + final_response: str + total_time_ms: float + metadata: dict + + +class MultiAgentFrameworkDemo: + """Simplified multi-agent framework for Hugging Face Spaces demo.""" + + def __init__(self, use_hf_inference: bool = False, hf_model: str = ""): + """Initialize the demo framework. + + Args: + use_hf_inference: Use Hugging Face Inference API instead of mock + hf_model: Hugging Face model ID for inference + """ + self.use_hf_inference = use_hf_inference + self.hf_model = hf_model + + # Initialize components + if use_hf_inference and hf_model: + self.llm_client = HuggingFaceClient(model_id=hf_model) + else: + self.llm_client = MockLLMClient() + + self.hrm_agent = HRMAgent(self.llm_client) + self.trm_agent = TRMAgent(self.llm_client) + self.mcts = MCTSDemo() + + async def process_query( + self, + query: str, + use_hrm: bool = True, + use_trm: bool = True, + use_mcts: bool = False, + mcts_iterations: int = 25, + exploration_weight: float = 1.414, + seed: int | None = None, + ) -> FrameworkResult: + """Process a query through the multi-agent framework. 
+ + Args: + query: The input query to process + use_hrm: Enable Hierarchical Reasoning Module + use_trm: Enable Tree Reasoning Module + use_mcts: Enable Monte Carlo Tree Search + mcts_iterations: Number of MCTS iterations + exploration_weight: UCB1 exploration parameter + seed: Random seed for reproducibility + + Returns: + FrameworkResult with all agent outputs and consensus + """ + start_time = time.perf_counter() + + hrm_result = None + trm_result = None + mcts_result = None + + # Run enabled agents + tasks = [] + agent_names = [] + + if use_hrm: + tasks.append(self._run_hrm(query)) + agent_names.append("hrm") + + if use_trm: + tasks.append(self._run_trm(query)) + agent_names.append("trm") + + if use_mcts: + tasks.append(self._run_mcts(query, mcts_iterations, exploration_weight, seed)) + agent_names.append("mcts") + + # Execute agents concurrently + if tasks: + results = await asyncio.gather(*tasks, return_exceptions=True) + + for name, result in zip(agent_names, results, strict=False): + if isinstance(result, Exception): + continue + if name == "hrm": + hrm_result = result + elif name == "trm": + trm_result = result + elif name == "mcts": + mcts_result = result + + # Calculate consensus score + consensus_score = self._calculate_consensus(hrm_result, trm_result, mcts_result) + + # Generate final synthesized response + final_response = self._synthesize_response(query, hrm_result, trm_result, mcts_result, consensus_score) + + total_time = (time.perf_counter() - start_time) * 1000 + + return FrameworkResult( + query=query, + hrm_result=hrm_result, + trm_result=trm_result, + mcts_result=mcts_result, + consensus_score=consensus_score, + final_response=final_response, + total_time_ms=round(total_time, 2), + metadata={ + "agents_used": agent_names, + "mcts_config": ( + {"iterations": mcts_iterations, "exploration_weight": exploration_weight, "seed": seed} + if use_mcts + else None + ), + }, + ) + + async def _run_hrm(self, query: str) -> AgentResult: + """Run Hierarchical Reasoning Module.""" + start = time.perf_counter() + result = await self.hrm_agent.process(query) + elapsed = (time.perf_counter() - start) * 1000 + + return AgentResult( + agent_name="HRM (Hierarchical Reasoning)", + response=result["response"], + confidence=result["confidence"], + reasoning_steps=result["steps"], + execution_time_ms=round(elapsed, 2), + ) + + async def _run_trm(self, query: str) -> AgentResult: + """Run Tree Reasoning Module.""" + start = time.perf_counter() + result = await self.trm_agent.process(query) + elapsed = (time.perf_counter() - start) * 1000 + + return AgentResult( + agent_name="TRM (Iterative Refinement)", + response=result["response"], + confidence=result["confidence"], + reasoning_steps=result["steps"], + execution_time_ms=round(elapsed, 2), + ) + + async def _run_mcts(self, query: str, iterations: int, exploration_weight: float, seed: int | None) -> dict: + """Run Monte Carlo Tree Search.""" + start = time.perf_counter() + + # MCTSDemo.search is now async and uses the production framework + result = await self.mcts.search(query=query, iterations=iterations, exploration_weight=exploration_weight, seed=seed) + + elapsed = (time.perf_counter() - start) * 1000 + result["execution_time_ms"] = round(elapsed, 2) + + return result + + def _calculate_consensus( + self, hrm_result: AgentResult | None, trm_result: AgentResult | None, mcts_result: dict | None + ) -> float: + """Calculate agreement score between agents.""" + confidences = [] + + if hrm_result: + confidences.append(hrm_result.confidence) + 
if trm_result: + confidences.append(trm_result.confidence) + if mcts_result: + confidences.append(mcts_result.get("best_value", 0.5)) + + if not confidences: + return 0.0 + + # Consensus is based on confidence alignment and average + if len(confidences) == 1: + return confidences[0] + + avg_confidence = np.mean(confidences) + std_confidence = np.std(confidences) + + # Higher consensus when agents agree (low std) and are confident (high avg) + agreement_factor = max(0, 1 - std_confidence * 2) + consensus = avg_confidence * agreement_factor + + return round(min(1.0, consensus), 3) + + def _synthesize_response( + self, + query: str, + hrm_result: AgentResult | None, + trm_result: AgentResult | None, + mcts_result: dict | None, + consensus_score: float, + ) -> str: + """Synthesize final response from all agent outputs.""" + parts = [] + + if hrm_result and hrm_result.confidence > 0.5: + parts.append(f"[HRM] {hrm_result.response}") + + if trm_result and trm_result.confidence > 0.5: + parts.append(f"[TRM] {trm_result.response}") + + if mcts_result and mcts_result.get("best_value", 0) > 0.5: + parts.append(f"[MCTS] Best path: {mcts_result.get('best_action', 'N/A')}") + + if not parts: + truncated_query = f"{query[:80]}..." if len(query) > 80 else query + return f"Insufficient confidence to answer query: '{truncated_query}'." + + synthesis = " | ".join(parts) + + if consensus_score > 0.7: + return f"HIGH CONSENSUS ({consensus_score:.1%}): {synthesis}" + elif consensus_score > 0.4: + return f"MODERATE CONSENSUS ({consensus_score:.1%}): {synthesis}" + else: + return f"LOW CONSENSUS ({consensus_score:.1%}): {synthesis}" + + +# Global framework instance +framework = None +wandb_tracker = None + + +def initialize_framework(use_hf: bool, model_id: str): + """Initialize or reinitialize the framework.""" + global framework + framework = MultiAgentFrameworkDemo(use_hf_inference=use_hf, hf_model=model_id) + return "Framework initialized successfully!" 
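+
+
+async def example_run() -> FrameworkResult:
+    """Minimal programmatic usage sketch (illustrative only; not wired into the UI).
+
+    Assumes the default MockLLMClient. Run with ``asyncio.run(example_run())``.
+    """
+    demo_framework = MultiAgentFrameworkDemo()
+    # Enable all three agents and fix the seed so MCTS results are reproducible.
+    return await demo_framework.process_query(
+        query="How to design a fault-tolerant message queue system?",
+        use_hrm=True,
+        use_trm=True,
+        use_mcts=True,
+        mcts_iterations=25,
+        exploration_weight=1.414,
+        seed=42,
+    )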
+ + +def process_query_sync( + query: str, + use_hrm: bool, + use_trm: bool, + use_mcts: bool, + mcts_iterations: int, + exploration_weight: float, + seed: int, + enable_wandb: bool = False, + wandb_project: str = "langgraph-mcts-demo", + wandb_run_name: str = "", +): + """Synchronous wrapper for async processing.""" + global framework, wandb_tracker + + if framework is None: + framework = MultiAgentFrameworkDemo() + + if not query.strip(): + return "Please enter a query.", {}, "", {}, "" + + # Handle seed + seed_value = seed if seed > 0 else None + + # Initialize W&B tracking if enabled + wandb_url = "" + if enable_wandb and is_wandb_available(): + if wandb_tracker is None: + wandb_tracker = WandBTracker(project_name=wandb_project, enabled=True) + + # Start a new run + run_name = wandb_run_name if wandb_run_name.strip() else None + config = { + "query": query[:200], # Truncate for config + "use_hrm": use_hrm, + "use_trm": use_trm, + "use_mcts": use_mcts, + "mcts_iterations": mcts_iterations, + "exploration_weight": exploration_weight, + "seed": seed_value, + } + wandb_tracker.init_run(run_name=run_name, config=config) + + # Run async function + result = asyncio.run( + framework.process_query( + query=query, + use_hrm=use_hrm, + use_trm=use_trm, + use_mcts=use_mcts, + mcts_iterations=int(mcts_iterations), + exploration_weight=exploration_weight, + seed=seed_value, + ) + ) + + # Format outputs + final_response = result.final_response + + # Agent details + agent_details = {} + if result.hrm_result: + agent_details["HRM"] = { + "response": result.hrm_result.response, + "confidence": f"{result.hrm_result.confidence:.1%}", + "reasoning_steps": result.hrm_result.reasoning_steps, + "time_ms": result.hrm_result.execution_time_ms, + } + + # Log to W&B + if enable_wandb and wandb_tracker: + wandb_tracker.log_agent_result( + "HRM", + result.hrm_result.response, + result.hrm_result.confidence, + result.hrm_result.execution_time_ms, + result.hrm_result.reasoning_steps, + ) + + if result.trm_result: + agent_details["TRM"] = { + "response": result.trm_result.response, + "confidence": f"{result.trm_result.confidence:.1%}", + "reasoning_steps": result.trm_result.reasoning_steps, + "time_ms": result.trm_result.execution_time_ms, + } + + # Log to W&B + if enable_wandb and wandb_tracker: + wandb_tracker.log_agent_result( + "TRM", + result.trm_result.response, + result.trm_result.confidence, + result.trm_result.execution_time_ms, + result.trm_result.reasoning_steps, + ) + + if result.mcts_result: + agent_details["MCTS"] = result.mcts_result + + # Log to W&B + if enable_wandb and wandb_tracker: + wandb_tracker.log_mcts_result(result.mcts_result) + + # Log consensus and performance to W&B + if enable_wandb and wandb_tracker: + wandb_tracker.log_consensus(result.consensus_score, result.metadata["agents_used"], result.final_response) + wandb_tracker.log_performance(result.total_time_ms) + wandb_tracker.log_query_summary(query, use_hrm, use_trm, use_mcts, result.consensus_score, result.total_time_ms) + + # Get run URL + wandb_url = wandb_tracker.get_run_url() or "" + + # Finish the run + wandb_tracker.finish_run() + + # Metrics + metrics = f""" +**Consensus Score:** {result.consensus_score:.1%} +**Total Processing Time:** {result.total_time_ms:.2f} ms +**Agents Used:** {", ".join(result.metadata["agents_used"])} +""" + + if wandb_url: + metrics += f"\n**W&B Run:** [{wandb_url}]({wandb_url})" + + # Full JSON result + full_result = { + "query": result.query, + "final_response": result.final_response, + 
"consensus_score": result.consensus_score, + "total_time_ms": result.total_time_ms, + "metadata": result.metadata, + "agent_details": agent_details, + "wandb_url": wandb_url, + } + + return final_response, agent_details, metrics, full_result, wandb_url + + +def visualize_mcts_tree(mcts_result: dict) -> str: + """Create ASCII visualization of MCTS tree.""" + if not mcts_result or "tree_visualization" not in mcts_result: + return "No MCTS tree data available" + + return mcts_result["tree_visualization"] + + +# Example queries for demonstration +EXAMPLE_QUERIES = [ + "What are the key factors to consider when choosing between microservices and monolithic architecture?", + "How can we optimize a Python application that processes 10GB of log files daily?", + "What is the best approach to implement rate limiting in a distributed system?", + "Should we use SQL or NoSQL database for a social media application with 1M users?", + "How to design a fault-tolerant message queue system?", +] + + +# Gradio Interface +with gr.Blocks( + title="LangGraph Multi-Agent MCTS Demo", + theme=gr.themes.Soft(), + css=""" + .agent-box { border: 1px solid #ddd; padding: 10px; border-radius: 5px; margin: 5px 0; } + .consensus-high { color: #28a745; font-weight: bold; } + .consensus-medium { color: #ffc107; font-weight: bold; } + .consensus-low { color: #dc3545; font-weight: bold; } + """, +) as demo: + gr.Markdown( + """ + # LangGraph Multi-Agent MCTS Framework + + **Proof-of-Concept Demo** - Multi-agent reasoning with Monte Carlo Tree Search + + This demo showcases: + - **HRM**: Hierarchical Reasoning Module - breaks down complex queries + - **TRM**: Tree Reasoning Module - iterative refinement of responses + - **MCTS**: Monte Carlo Tree Search - strategic exploration of solution space + - **Consensus**: Agreement scoring between agents + + --- + """ + ) + + with gr.Row(): + with gr.Column(scale=2): + query_input = gr.Textbox( + label="Query", placeholder="Enter your reasoning task or question...", lines=3, max_lines=10 + ) + + gr.Markdown("**Example Queries:**") + example_dropdown = gr.Dropdown(choices=EXAMPLE_QUERIES, label="Select an example", interactive=True) + + def load_example(example): + return example + + example_dropdown.change(load_example, example_dropdown, query_input) + + with gr.Column(scale=1): + gr.Markdown("**Agent Configuration**") + use_hrm = gr.Checkbox(label="Enable HRM (Hierarchical)", value=True) + use_trm = gr.Checkbox(label="Enable TRM (Iterative)", value=True) + use_mcts = gr.Checkbox(label="Enable MCTS", value=False) + + gr.Markdown("**MCTS Parameters**") + mcts_iterations = gr.Slider( + minimum=10, + maximum=100, + value=25, + step=5, + label="Iterations", + info="More iterations = better search, but slower", + ) + exploration_weight = gr.Slider( + minimum=0.1, + maximum=3.0, + value=1.414, + step=0.1, + label="Exploration Weight (C)", + info="Higher = more exploration, Lower = more exploitation", + ) + seed_input = gr.Number(label="Random Seed (0 for random)", value=0, precision=0) + + with gr.Accordion("Weights & Biases Tracking", open=False): + gr.Markdown( + """ + **Experiment Tracking with W&B** + + Track your experiments, visualize metrics, and compare runs. + Requires W&B API key set in Space secrets as `WANDB_API_KEY`. 
+ """ + ) + with gr.Row(): + enable_wandb = gr.Checkbox( + label="Enable W&B Tracking", value=False, info="Log metrics and results to Weights & Biases" + ) + wandb_project = gr.Textbox( + label="Project Name", value="langgraph-mcts-demo", placeholder="Your W&B project name" + ) + wandb_run_name = gr.Textbox(label="Run Name (optional)", value="", placeholder="Auto-generated if empty") + + wandb_status = gr.Markdown(f"**W&B Status:** {'Available' if is_wandb_available() else 'Not installed'}") + + process_btn = gr.Button("Process Query", variant="primary", size="lg") + + gr.Markdown("---") + + with gr.Row(): + with gr.Column(): + gr.Markdown("### Final Response") + final_response_output = gr.Textbox(label="Synthesized Response", lines=4, interactive=False) + + gr.Markdown("### Performance Metrics") + metrics_output = gr.Markdown() + + with gr.Column(): + gr.Markdown("### Agent Details") + agent_details_output = gr.JSON(label="Individual Agent Results") + + with gr.Accordion("Full JSON Result", open=False): + full_result_output = gr.JSON(label="Complete Framework Output") + + with gr.Accordion("W&B Run Details", open=False, visible=True): + wandb_url_output = gr.Textbox( + label="W&B Run URL", interactive=False, placeholder="Enable W&B tracking to see run URL here" + ) + + # Wire up the processing + process_btn.click( + fn=process_query_sync, + inputs=[ + query_input, + use_hrm, + use_trm, + use_mcts, + mcts_iterations, + exploration_weight, + seed_input, + enable_wandb, + wandb_project, + wandb_run_name, + ], + outputs=[final_response_output, agent_details_output, metrics_output, full_result_output, wandb_url_output], + ) + + gr.Markdown( + """ + --- + + ### About This Demo + + This is a **proof-of-concept** demonstration of the LangGraph Multi-Agent MCTS Framework. + + **Features:** + - Multi-agent orchestration with consensus scoring + - Monte Carlo Tree Search for strategic reasoning + - Configurable exploration vs exploitation trade-offs + - Deterministic results with seeded randomness + - **Weights & Biases integration** for experiment tracking + + **Limitations (POC):** + - Uses mock/simplified LLM responses (not production LLM) + - Limited to demonstration scenarios + - No persistent storage or RAG + - Simplified MCTS implementation + + **Full Framework:** [GitHub Repository](https://github.com/ianshank/langgraph_multi_agent_mcts) + + --- + *Built with LangGraph, Gradio, Weights & Biases, and Python* + """ + ) + + +if __name__ == "__main__": + # Initialize with mock client for demo + framework = MultiAgentFrameworkDemo(use_hf_inference=False) + + # Launch the demo + demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True) diff --git a/demo_src/__init__.py b/demo_src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..622fa74289205c67a1f404910be05ac6892c9d74 --- /dev/null +++ b/demo_src/__init__.py @@ -0,0 +1 @@ +# Demo source modules for Hugging Face Spaces diff --git a/demo_src/agents_demo.py b/demo_src/agents_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..6335bc777201b0305c898d513e4ee6b6b57aa457 --- /dev/null +++ b/demo_src/agents_demo.py @@ -0,0 +1,234 @@ +""" +Simplified agent implementations for Hugging Face Spaces demo. +""" + +import asyncio +from typing import Any + + +class HRMAgent: + """Hierarchical Reasoning Module - breaks down complex queries.""" + + def __init__(self, llm_client): + """Initialize with an LLM client. 
+ + Args: + llm_client: LLM client (MockLLMClient or HuggingFaceClient) + """ + self.llm_client = llm_client + self.name = "HRM (Hierarchical Reasoning)" + + async def process(self, query: str) -> dict[str, Any]: + """Process query using hierarchical decomposition. + + Args: + query: Input query to process + + Returns: + Dictionary with response, confidence, and reasoning steps + """ + # Step 1: Decompose the query + decomposition_steps = await self._decompose_query(query) + + # Step 2: Analyze each component + analysis_results = await self._analyze_components(decomposition_steps) + + # Step 3: Synthesize hierarchical response + llm_result = await self.llm_client.generate( + prompt=f"Hierarchical analysis of: {query}", context=f"Components: {', '.join(decomposition_steps)}" + ) + + # Compile reasoning steps + reasoning_steps = [ + f"1. Query decomposition: Identified {len(decomposition_steps)} key components", + f"2. Component analysis: {analysis_results}", + "3. Hierarchical synthesis: Combined insights from all levels", + f"4. Confidence assessment: {llm_result['confidence']:.1%} based on component clarity", + ] + + return { + "response": llm_result["response"], + "confidence": llm_result["confidence"], + "steps": reasoning_steps, + "components": decomposition_steps, + "tokens_used": llm_result.get("tokens_used", 0), + } + + async def _decompose_query(self, query: str) -> list[str]: + """Decompose query into hierarchical components.""" + # Simulate decomposition based on query structure + await asyncio.sleep(0.05) # Simulate processing + + # Simple heuristic decomposition + components = [] + + # Extract key phrases + query_lower = query.lower() + + if "?" in query: + components.append("Question type: Analytical") + else: + components.append("Question type: Directive") + + if "how" in query_lower: + components.append("Focus: Methodology/Process") + elif "what" in query_lower: + components.append("Focus: Definition/Identification") + elif "why" in query_lower: + components.append("Focus: Causation/Reasoning") + elif "should" in query_lower or "best" in query_lower: + components.append("Focus: Decision/Recommendation") + else: + components.append("Focus: General inquiry") + + # Domain detection + if any(term in query_lower for term in ["database", "sql", "nosql", "storage"]): + components.append("Domain: Data Management") + elif any(term in query_lower for term in ["architecture", "design", "pattern"]): + components.append("Domain: System Architecture") + elif any(term in query_lower for term in ["performance", "optimization", "speed"]): + components.append("Domain: Performance Engineering") + elif any(term in query_lower for term in ["scale", "distributed", "cluster"]): + components.append("Domain: Distributed Systems") + else: + components.append("Domain: Software Engineering") + + # Complexity assessment + word_count = len(query.split()) + if word_count > 20: + components.append("Complexity: High (detailed query)") + elif word_count > 10: + components.append("Complexity: Medium") + else: + components.append("Complexity: Low (concise query)") + + return components + + async def _analyze_components(self, components: list[str]) -> str: + """Analyze the decomposed components.""" + await asyncio.sleep(0.03) # Simulate processing + + # Generate analysis summary + analysis_parts = [] + + for component in components: + if "Focus:" in component: + focus = component.split(":")[1].strip() + analysis_parts.append(f"requires {focus.lower()} approach") + elif "Domain:" in component: + domain = 
component.split(":")[1].strip() + analysis_parts.append(f"applies to {domain}") + elif "Complexity:" in component: + complexity = component.split(":")[1].strip().split()[0] + analysis_parts.append(f"{complexity.lower()} complexity level") + + return "; ".join(analysis_parts) if analysis_parts else "Standard analysis" + + +class TRMAgent: + """Tree Reasoning Module - iterative refinement of responses.""" + + def __init__(self, llm_client): + """Initialize with an LLM client. + + Args: + llm_client: LLM client (MockLLMClient or HuggingFaceClient) + """ + self.llm_client = llm_client + self.name = "TRM (Iterative Refinement)" + self.max_iterations = 3 + + async def process(self, query: str) -> dict[str, Any]: + """Process query using iterative refinement. + + Args: + query: Input query to process + + Returns: + Dictionary with response, confidence, and reasoning steps + """ + reasoning_steps = [] + current_response = "" + current_confidence = 0.0 + + # Iterative refinement loop + for iteration in range(self.max_iterations): + step_num = iteration + 1 + + # Generate or refine response + if iteration == 0: + # Initial response + result = await self.llm_client.generate(prompt=query, context="") + current_response = result["response"] + current_confidence = result["confidence"] + reasoning_steps.append( + f"Iteration {step_num}: Initial response generated (confidence: {current_confidence:.1%})" + ) + else: + # Refinement iteration + refinement_result = await self._refine_response(query, current_response, iteration) + current_response = refinement_result["response"] + + # Confidence typically improves with refinement + confidence_improvement = min(0.1, (1 - current_confidence) * 0.3) + current_confidence = min(0.95, current_confidence + confidence_improvement) + + reasoning_steps.append( + f"Iteration {step_num}: {refinement_result['refinement_type']} " + f"(confidence: {current_confidence:.1%})" + ) + + # Check if confidence is high enough to stop + if current_confidence > 0.85: + reasoning_steps.append(f"Early termination: High confidence ({current_confidence:.1%}) achieved") + break + + # Final reasoning step + reasoning_steps.append(f"Final: Response refined through {len(reasoning_steps)} iterations") + + return { + "response": current_response, + "confidence": round(current_confidence, 3), + "steps": reasoning_steps, + "iterations_used": min(iteration + 1, self.max_iterations), + "refinement_history": reasoning_steps, + } + + async def _refine_response(self, query: str, current_response: str, iteration: int) -> dict[str, Any]: + """Refine the current response.""" + await asyncio.sleep(0.05) # Simulate refinement processing + + # Different refinement strategies based on iteration + refinement_strategies = [ + ("Clarity enhancement", "improve clarity and precision"), + ("Detail expansion", "add technical depth and specifics"), + ("Validation check", "verify accuracy and completeness"), + ] + + strategy_name, strategy_action = refinement_strategies[iteration % len(refinement_strategies)] + + # Generate refined response + refinement_prompt = f""" + Original query: {query} + Current response: {current_response} + Refinement task: {strategy_action} + """ + + result = await self.llm_client.generate( + prompt=refinement_prompt, context=f"Refinement iteration {iteration + 1}" + ) + + # Enhance the response based on strategy + enhanced_response = current_response + if strategy_name == "Clarity enhancement": + enhanced_response = f"{current_response}. 
{result['response']}" + elif strategy_name == "Detail expansion": + enhanced_response = f"{current_response}. Furthermore, {result['response']}" + else: # Validation + enhanced_response = f"{current_response}. Validated: {result['response']}" + + # Truncate if too long + if len(enhanced_response) > 300: + enhanced_response = enhanced_response[:297] + "..." + + return {"response": enhanced_response, "refinement_type": strategy_name, "strategy_action": strategy_action} diff --git a/demo_src/llm_mock.py b/demo_src/llm_mock.py new file mode 100644 index 0000000000000000000000000000000000000000..0c7334442f2a9767f5017c5703338891ceafe97c --- /dev/null +++ b/demo_src/llm_mock.py @@ -0,0 +1,182 @@ +""" +Mock and lightweight LLM clients for demo purposes. +""" + +import asyncio +import random +from typing import Any + + +class MockLLMClient: + """Mock LLM client that generates plausible demo responses.""" + + def __init__(self): + self.response_templates = { + "architecture": [ + "Consider scalability requirements and team expertise", + "Evaluate coupling, deployment complexity, and operational overhead", + "Balance between development speed and long-term maintainability", + ], + "optimization": [ + "Profile first to identify actual bottlenecks", + "Consider memory-mapped files and streaming processing", + "Implement parallel processing with appropriate chunk sizes", + ], + "database": [ + "Consider data consistency requirements and query patterns", + "Evaluate write-heavy vs read-heavy workload characteristics", + "Plan for horizontal scaling and data distribution strategies", + ], + "distributed": [ + "Implement proper failure detection and recovery mechanisms", + "Use circuit breakers and bulkhead patterns for resilience", + "Consider eventual consistency vs strong consistency trade-offs", + ], + "default": [ + "Break down the problem into smaller components", + "Consider trade-offs between different approaches", + "Evaluate based on specific use case requirements", + ], + } + + async def generate(self, prompt: str, context: str = "") -> dict[str, Any]: + """Generate a mock response based on the prompt and optional context.""" + # Simulate processing time + await asyncio.sleep(random.uniform(0.1, 0.3)) + + # Determine response category + prompt_lower = prompt.lower() + if "architecture" in prompt_lower or "microservice" in prompt_lower or "monolith" in prompt_lower: + category = "architecture" + elif "optim" in prompt_lower or "performance" in prompt_lower or "process" in prompt_lower: + category = "optimization" + elif "database" in prompt_lower or "sql" in prompt_lower or "nosql" in prompt_lower: + category = "database" + elif "distribut" in prompt_lower or "fault" in prompt_lower or "rate limit" in prompt_lower: + category = "distributed" + else: + category = "default" + + templates = self.response_templates[category] + + # Generate response with some randomness + response = random.choice(templates) + confidence = random.uniform(0.6, 0.95) + + # Add more detail based on prompt length (simulating "understanding") + if len(prompt) > 100: + confidence = min(0.95, confidence + 0.1) + response += f". Additionally, {random.choice(self.response_templates['default'])}" + + # Lightly incorporate context to simulate conditioning + context_snippet = context.strip() + if context_snippet: + confidence = min(0.99, confidence + 0.05) + response += f" (context signal: {context_snippet[:60]}{'...' 
if len(context_snippet) > 60 else ''})" + + return { + "response": response, + "confidence": round(confidence, 3), + "tokens_used": len(prompt.split()) * 2 + len(response.split()), + } + + async def generate_reasoning_steps(self, query: str, num_steps: int = 3) -> list[str]: + """Generate mock reasoning steps.""" + await asyncio.sleep(random.uniform(0.05, 0.15)) + + base_steps = [ + f"Analyzing query: '{query[:50]}...'", + "Identifying key requirements and constraints", + "Evaluating potential approaches", + "Considering trade-offs and implications", + "Synthesizing recommendations based on analysis", + "Validating conclusions against requirements", + ] + + return random.sample(base_steps, min(num_steps, len(base_steps))) + + +class HuggingFaceClient: + """Lightweight Hugging Face Inference API client.""" + + def __init__(self, model_id: str = "mistralai/Mistral-7B-Instruct-v0.2"): + """Initialize with a Hugging Face model. + + Args: + model_id: The model ID on Hugging Face Hub + """ + self.model_id = model_id + self._client = None + + def _get_client(self): + """Lazy load the HF client.""" + if self._client is None: + try: + from huggingface_hub import InferenceClient + + self._client = InferenceClient(model=self.model_id) + except ImportError: + raise ImportError("huggingface_hub not installed. Install with: pip install huggingface_hub") + return self._client + + async def generate(self, prompt: str, context: str = "") -> dict[str, Any]: + """Generate response using Hugging Face Inference API.""" + try: + client = self._get_client() + + # Format prompt + if context: + full_prompt = f"Context: {context}\n\nQuestion: {prompt}\n\nAnswer:" + else: + full_prompt = f"Question: {prompt}\n\nProvide a concise, technical answer:\n\nAnswer:" + + # Call HF Inference API (sync call wrapped in async) + response_text = await asyncio.to_thread( + client.text_generation, full_prompt, max_new_tokens=150, temperature=0.7, do_sample=True + ) + + # Estimate confidence based on response characteristics + confidence = min(0.95, 0.6 + len(response_text) / 500) + + return { + "response": response_text.strip(), + "confidence": round(confidence, 3), + "tokens_used": len(full_prompt.split()) + len(response_text.split()), + } + + except Exception as e: + # Fallback to mock on error + print(f"HF Inference error: {e}. Falling back to mock.") + mock = MockLLMClient() + return await mock.generate(prompt, context) + + async def generate_reasoning_steps(self, query: str, num_steps: int = 3) -> list[str]: + """Generate reasoning steps using HF model.""" + try: + client = self._get_client() + + prompt = f"""Break down this question into {num_steps} reasoning steps: +Question: {query} + +Reasoning steps (one per line): +1.""" + + response = await asyncio.to_thread(client.text_generation, prompt, max_new_tokens=200, temperature=0.5) + + # Parse steps from response + lines = response.strip().split("\n") + steps = [] + for line in lines: + line = line.strip() + if line and not line.startswith("#"): + # Remove numbering + if line[0].isdigit() and "." in line[:3]: + line = line.split(".", 1)[1].strip() + steps.append(line) + + return steps[:num_steps] if steps else ["Analysis in progress"] + + except Exception as e: + print(f"HF reasoning error: {e}. 
Falling back to mock.") + mock = MockLLMClient() + return await mock.generate_reasoning_steps(query, num_steps) diff --git a/demo_src/mcts_demo.py b/demo_src/mcts_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..bd3e10c0d6d4a76118e4d5626838890ee19102a7 --- /dev/null +++ b/demo_src/mcts_demo.py @@ -0,0 +1,436 @@ +""" +Educational MCTS demonstration using the production framework. + +This demo uses the real MCTSEngine from src.framework.mcts.core to provide +an authentic learning experience while remaining accessible for demonstrations. +""" + +from __future__ import annotations + +import math +from typing import Any + +from src.framework.mcts.core import MCTSEngine, MCTSNode, MCTSState +from src.framework.mcts.policies import RolloutPolicy, SelectionPolicy + + +class DemoRolloutPolicy(RolloutPolicy): + """ + Educational rollout policy for demo purposes. + + Evaluates states based on: + - Depth of exploration (deeper = more thorough) + - Action quality (domain-specific heuristics) + - Exploration randomness + """ + + def __init__(self, category: str, action_templates: dict[str, list[str]]): + """ + Initialize demo rollout policy. + + Args: + category: Query category for heuristic evaluation + action_templates: Available action templates for scoring + """ + self.category = category + self.action_templates = action_templates + + # Define key terms that indicate quality actions per category + self.quality_indicators = { + "architecture": ["scalability", "consistency", "requirements"], + "optimization": ["profile", "caching", "parallel"], + "database": ["patterns", "relationships", "scaling"], + "distributed": ["circuit", "retry", "bulkhead"], + "default": ["decompose", "constraints", "trade-offs"], + } + + async def evaluate( + self, + state: MCTSState, + rng, + max_depth: int = 10, + ) -> float: + """ + Evaluate a state through heuristic analysis. + + This combines: + - Depth bonus: rewards thorough exploration + - Action quality: rewards domain-appropriate actions + - Noise: adds exploration randomness + + Args: + state: State to evaluate + rng: Random number generator + max_depth: Maximum depth (unused in heuristic) + + Returns: + Estimated value in [0, 1] range + """ + # Base value + base_value = 0.5 + + # Depth bonus: deeper exploration = more value (up to 0.3) + depth = state.features.get("depth", 0) + depth_bonus = min(depth * 0.1, 0.3) + + # Action quality bonus + action_bonus = 0.0 + last_action = state.features.get("last_action", "") + + if last_action: + # Check if action contains quality indicators for this category + indicators = self.quality_indicators.get(self.category, self.quality_indicators["default"]) + for term in indicators: + if term in last_action.lower(): + action_bonus = 0.15 + break + + # Add exploration noise + noise = rng.uniform(-0.1, 0.1) + + # Combine components + value = base_value + depth_bonus + action_bonus + noise + + # Clamp to [0, 1] + return max(0.0, min(1.0, value)) + + +class MCTSDemo: + """ + Educational MCTS demonstration using the production framework. + + This class wraps the production MCTSEngine to provide: + - Simple, educational interface for demos + - Category-based action selection + - Tree visualization for learning + - Deterministic behavior with seeds + + Unlike the old mock implementation, this uses the real MCTS algorithm + with all its features: UCB1 selection, progressive widening, caching, etc. + """ + + def __init__(self, max_depth: int = 5): + """ + Initialize MCTS demo. 
+ + Args: + max_depth: Maximum tree depth for exploration + """ + self.max_depth = max_depth + + # Action templates for different query types + # These provide domain-specific reasoning paths + self.action_templates = { + "architecture": [ + "Consider microservices for scalability", + "Evaluate monolith for simplicity", + "Analyze team capabilities", + "Assess deployment requirements", + "Review data consistency needs", + ], + "optimization": [ + "Profile application hotspots", + "Implement caching layer", + "Use parallel processing", + "Optimize database queries", + "Reduce memory allocations", + ], + "database": [ + "Analyze query patterns", + "Consider data relationships", + "Evaluate consistency requirements", + "Plan for horizontal scaling", + "Assess read/write ratios", + ], + "distributed": [ + "Implement circuit breakers", + "Add retry mechanisms", + "Use message queues", + "Apply bulkhead pattern", + "Design for eventual consistency", + ], + "default": [ + "Decompose the problem", + "Identify constraints", + "Evaluate trade-offs", + "Consider alternatives", + "Validate assumptions", + ], + } + + def _categorize_query(self, query: str) -> str: + """ + Categorize query to select appropriate action templates. + + Args: + query: User's input query + + Returns: + Category name for action selection + """ + query_lower = query.lower() + if "architecture" in query_lower or "microservice" in query_lower: + return "architecture" + elif "optim" in query_lower or "performance" in query_lower: + return "optimization" + elif "database" in query_lower or "sql" in query_lower: + return "database" + elif "distribut" in query_lower or "fault" in query_lower: + return "distributed" + return "default" + + def _create_action_generator(self, category: str): + """ + Create action generator function for this query category. + + Args: + category: Query category + + Returns: + Function that generates actions for a given state + """ + def action_generator(state: MCTSState) -> list[str]: + """Generate available actions from current state.""" + # Get category-specific actions + actions = self.action_templates.get(category, self.action_templates["default"]) + + # Filter out already-used actions (track via state features) + used_actions = state.features.get("used_actions", set()) + available = [a for a in actions if a not in used_actions] + + # If all actions used, allow re-exploring top 2 + if not available: + return actions[:2] + + return available + + return action_generator + + def _create_state_transition(self, category: str): + """ + Create state transition function for this query category. 
+ + Args: + category: Query category + + Returns: + Function that computes next state from current state + action + """ + def state_transition(state: MCTSState, action: str) -> MCTSState: + """Compute next state by applying action.""" + # Track action history + action_history = list(state.features.get("action_history", [])) + action_history.append(action) + + # Track used actions + used_actions = set(state.features.get("used_actions", set())) + used_actions.add(action) + + # Increment depth + depth = state.features.get("depth", 0) + 1 + + # Create new state ID from action history + state_id = " -> ".join(action_history) + + # Build new state + new_state = MCTSState( + state_id=state_id, + features={ + "action_history": action_history, + "used_actions": used_actions, + "depth": depth, + "last_action": action, + "category": category, + }, + ) + + return new_state + + return state_transition + + def _generate_tree_visualization(self, root: MCTSNode, max_nodes: int = 20) -> str: + """ + Generate ASCII visualization of the MCTS tree. + + This provides educational insight into the search process. + + Args: + root: Root node of the tree + max_nodes: Maximum nodes to display + + Returns: + ASCII art representation of the tree + """ + max_nodes = max(1, max_nodes) + lines = [] + lines.append("MCTS Tree Visualization") + lines.append("=" * 50) + + nodes_rendered = 0 + + def format_node(node: MCTSNode, prefix: str = "", is_last: bool = True) -> list[str]: + nonlocal nodes_rendered + result = [] + + # Node representation + connector = "└── " if is_last else "├── " + + if nodes_rendered >= max_nodes: + result.append(f"{prefix}{connector}... (truncated)") + return result + + nodes_rendered += 1 + + # Display action or state + node_str = f"{node.state.state_id[:30]}..." + if node.action: + node_str = f"{node.action[:25]}..." + + stats = f"[V:{node.visits}, Q:{node.value:.3f}]" + + result.append(f"{prefix}{connector}{node_str} {stats}") + + # Recursively add children + new_prefix = prefix + (" " if is_last else "│ ") + + # Limit children shown + children_to_show = node.children[:3] + for i, child in enumerate(children_to_show): + is_child_last = i == len(children_to_show) - 1 + result.extend(format_node(child, new_prefix, is_child_last)) + + if len(node.children) > 3: + result.append(f"{new_prefix} ... and {len(node.children) - 3} more") + + return result + + # Start with root + lines.append(f"Root: {root.state.state_id[:40]}... [V:{root.visits}, Q:{root.value:.3f}]") + nodes_rendered += 1 + + for i, child in enumerate(root.children[:5]): + is_last = i == len(root.children[:5]) - 1 + lines.extend(format_node(child, "", is_last)) + + if len(root.children) > 5: + lines.append(f"... and {len(root.children) - 5} more branches") + + return "\n".join(lines) + + async def search( + self, + query: str, + iterations: int = 25, + exploration_weight: float = 1.414, + seed: int | None = None, + ) -> dict[str, Any]: + """ + Run MCTS search on the query using the production framework. + + This method demonstrates the full MCTS algorithm: + 1. Selection: UCB1-based tree traversal + 2. Expansion: Progressive widening of nodes + 3. Simulation: Heuristic evaluation (rollout) + 4. 
Backpropagation: Value updates up the tree + + Args: + query: The input query to analyze + iterations: Number of MCTS iterations (more = better but slower) + exploration_weight: UCB1 exploration constant (higher = more exploration) + seed: Random seed for deterministic results + + Returns: + Dictionary with: + - best_action: Recommended next step + - best_value: Confidence in recommendation + - statistics: Search metrics and performance data + - tree_visualization: ASCII art of search tree + """ + # Determine query category + category = self._categorize_query(query) + + # Initialize MCTS engine with production features + engine = MCTSEngine( + seed=seed if seed is not None else 42, + exploration_weight=exploration_weight, + progressive_widening_k=1.0, # Moderate expansion + progressive_widening_alpha=0.5, + max_parallel_rollouts=4, + cache_size_limit=10000, + ) + + # Create root state + root_state = MCTSState( + state_id=f"Query: {query[:50]}", + features={ + "query": query, + "category": category, + "action_history": [], + "used_actions": set(), + "depth": 0, + "last_action": "", + }, + ) + + # Create root node + root = MCTSNode(state=root_state, rng=engine.rng) + + # Create domain-specific functions + action_generator = self._create_action_generator(category) + state_transition = self._create_state_transition(category) + rollout_policy = DemoRolloutPolicy(category, self.action_templates) + + # Run MCTS search with production engine + best_action, stats = await engine.search( + root=root, + num_iterations=iterations, + action_generator=action_generator, + state_transition=state_transition, + rollout_policy=rollout_policy, + max_rollout_depth=self.max_depth, + selection_policy=SelectionPolicy.MAX_VISITS, # Most robust + ) + + # Extract best child info + best_child = None + if root.children: + best_child = max(root.children, key=lambda c: c.visits) + + # Compile results for demo interface + result = { + "best_action": best_action or "No action found", + "best_value": round(best_child.value, 4) if best_child else 0.0, + "root_visits": root.visits, + "total_nodes": engine.get_cached_node_count(), + "max_depth_reached": engine.get_cached_tree_depth(), + "iterations_completed": iterations, + "exploration_weight": exploration_weight, + "seed": seed, + "category": category, + + # Top actions sorted by visits + "top_actions": [ + { + "action": child.action, + "visits": child.visits, + "value": round(child.value, 4), + "ucb1": round( + child.visits / root.visits if root.visits > 0 else 0.0, 4 + ), # Simplified UCB display + } + for child in sorted(root.children, key=lambda c: -c.visits)[:5] + ], + + # Framework statistics + "framework_stats": { + "cache_hits": stats.get("cache_hits", 0), + "cache_misses": stats.get("cache_misses", 0), + "cache_hit_rate": round(stats.get("cache_hit_rate", 0.0), 4), + "total_simulations": stats.get("total_simulations", 0), + }, + + # Educational visualization + "tree_visualization": self._generate_tree_visualization(root), + } + + return result diff --git a/demo_src/wandb_tracker.py b/demo_src/wandb_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..8ba8dbcfe5865a4c73905061dc6a4ba440f2ef50 --- /dev/null +++ b/demo_src/wandb_tracker.py @@ -0,0 +1,349 @@ +""" +Weights & Biases integration for experiment tracking. 
+""" + +import os +from datetime import datetime +from typing import Any + +try: + import wandb + + WANDB_AVAILABLE = True +except ImportError: + WANDB_AVAILABLE = False + wandb = None + + +class WandBTracker: + """Weights & Biases experiment tracker for multi-agent MCTS demo.""" + + def __init__(self, project_name: str = "langgraph-mcts-demo", entity: str | None = None, enabled: bool = True): + """Initialize W&B tracker. + + Args: + project_name: W&B project name + entity: W&B entity (username or team) + enabled: Whether tracking is enabled + """ + self.project_name = project_name + self.entity = entity + self.enabled = enabled and WANDB_AVAILABLE + self.run = None + self.run_id = None + + def is_available(self) -> bool: + """Check if W&B is available.""" + return WANDB_AVAILABLE + + def init_run( + self, run_name: str | None = None, config: dict[str, Any] | None = None, tags: list[str] | None = None + ) -> bool: + """Initialize a new W&B run. + + Args: + run_name: Optional name for the run + config: Configuration dictionary to log + tags: Tags for the run + + Returns: + True if run initialized successfully, False otherwise + """ + if not self.enabled: + return False + + try: + # Generate run name if not provided + if run_name is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + run_name = f"mcts_query_{timestamp}" + + # Default tags + if tags is None: + tags = ["demo", "multi-agent", "mcts"] + + # Initialize run + self.run = wandb.init( + project=self.project_name, + entity=self.entity, + name=run_name, + config=config or {}, + tags=tags, + reinit=True, + ) + + self.run_id = self.run.id + return True + + except Exception as e: + print(f"W&B init error: {e}") + self.enabled = False + return False + + def log_query_config(self, config: dict[str, Any]): + """Log query configuration. + + Args: + config: Configuration dictionary with agent settings, MCTS params, etc. + """ + if not self.enabled or not self.run: + return + + try: + wandb.config.update(config) + except Exception as e: + print(f"W&B config log error: {e}") + + def log_agent_result( + self, + agent_name: str, + response: str, + confidence: float, + execution_time_ms: float, + reasoning_steps: list[str] | None = None, + ): + """Log individual agent results. + + Args: + agent_name: Name of the agent (HRM, TRM, MCTS) + response: Agent's response text + confidence: Confidence score (0-1) + execution_time_ms: Execution time in milliseconds + reasoning_steps: Optional list of reasoning steps + """ + if not self.enabled or not self.run: + return + + try: + metrics = { + f"{agent_name}/confidence": confidence, + f"{agent_name}/execution_time_ms": execution_time_ms, + f"{agent_name}/response_length": len(response), + } + + if reasoning_steps: + metrics[f"{agent_name}/num_reasoning_steps"] = len(reasoning_steps) + + wandb.log(metrics) + + # Log response as text + wandb.log({f"{agent_name}/response": wandb.Html(f"
{response}")})
+
+ except Exception as e:
+ print(f"W&B agent result log error: {e}")
+
+ def log_mcts_result(self, mcts_result: dict[str, Any]):
+ """Log MCTS-specific metrics.
+
+ Args:
+ mcts_result: Dictionary containing MCTS search results
+ """
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ # Extract key metrics
+ metrics = {
+ "mcts/best_value": mcts_result.get("best_value", 0),
+ "mcts/root_visits": mcts_result.get("root_visits", 0),
+ "mcts/total_nodes": mcts_result.get("total_nodes", 0),
+ "mcts/max_depth": mcts_result.get("max_depth_reached", 0),
+ "mcts/iterations": mcts_result.get("iterations_completed", 0),
+ "mcts/exploration_weight": mcts_result.get("exploration_weight", 1.414),
+ }
+
+ wandb.log(metrics)
+
+ # Log top actions as table
+ if "top_actions" in mcts_result:
+ top_actions_data = []
+ for action in mcts_result["top_actions"]:
+ top_actions_data.append(
+ [
+ action.get("action", ""),
+ action.get("visits", 0),
+ action.get("value", 0),
+ action.get("ucb1", 0),
+ ]
+ )
+
+ if top_actions_data:
+ table = wandb.Table(data=top_actions_data, columns=["Action", "Visits", "Value", "UCB1"])
+ wandb.log({"mcts/top_actions_table": table})
+
+ # Log tree visualization as text artifact
+ if "tree_visualization" in mcts_result:
+                wandb.log({"mcts/tree_visualization": wandb.Html(f"<pre>{mcts_result['tree_visualization']}</pre>")})
+
+ except Exception as e:
+ print(f"W&B MCTS result log error: {e}")
+
+ def log_consensus(self, consensus_score: float, agents_used: list[str], final_response: str):
+ """Log consensus metrics.
+
+ Args:
+ consensus_score: Agreement score between agents (0-1)
+ agents_used: List of agent names that were used
+ final_response: Final synthesized response
+ """
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ wandb.log(
+ {
+ "consensus/score": consensus_score,
+ "consensus/num_agents": len(agents_used),
+ "consensus/agents": ", ".join(agents_used),
+ "consensus/final_response_length": len(final_response),
+ }
+ )
+
+ # Categorize consensus level
+ if consensus_score > 0.7:
+ consensus_level = "high"
+ elif consensus_score > 0.4:
+ consensus_level = "medium"
+ else:
+ consensus_level = "low"
+
+ wandb.log({"consensus/level": consensus_level})
+
+ except Exception as e:
+ print(f"W&B consensus log error: {e}")
+
+ def log_performance(self, total_time_ms: float):
+ """Log overall performance metrics.
+
+ Args:
+ total_time_ms: Total execution time in milliseconds
+ """
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ wandb.log({"performance/total_time_ms": total_time_ms, "performance/total_time_s": total_time_ms / 1000})
+ except Exception as e:
+ print(f"W&B performance log error: {e}")
+
+ def log_full_result(self, result: dict[str, Any]):
+ """Log the complete result as an artifact.
+
+ Args:
+ result: Full framework result dictionary
+ """
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ # Create artifact
+ artifact = wandb.Artifact(name=f"query_result_{self.run_id}", type="result")
+
+ # Add result as JSON
+ import json
+ import tempfile
+
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(result, f, indent=2, default=str)
+ temp_path = f.name
+
+ artifact.add_file(temp_path, name="result.json")
+ wandb.log_artifact(artifact)
+
+ # Clean up temp file
+ os.unlink(temp_path)
+
+ except Exception as e:
+ print(f"W&B full result log error: {e}")
+
+ def log_query_summary(
+ self, query: str, use_hrm: bool, use_trm: bool, use_mcts: bool, consensus_score: float, total_time_ms: float
+ ):
+ """Log a summary row for the query.
+
+ Args:
+ query: The input query
+ use_hrm: Whether HRM was enabled
+ use_trm: Whether TRM was enabled
+ use_mcts: Whether MCTS was enabled
+ consensus_score: Final consensus score
+ total_time_ms: Total execution time
+ """
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ # Create summary table entry
+ summary_data = [
+ [
+ query[:100] + "..." if len(query) > 100 else query,
+ "✓" if use_hrm else "✗",
+ "✓" if use_trm else "✗",
+ "✓" if use_mcts else "✗",
+ f"{consensus_score:.1%}",
+ f"{total_time_ms:.2f}",
+ ]
+ ]
+
+ table = wandb.Table(data=summary_data, columns=["Query", "HRM", "TRM", "MCTS", "Consensus", "Time (ms)"])
+
+ wandb.log({"query_summary": table})
+
+ except Exception as e:
+ print(f"W&B summary log error: {e}")
+
+ def finish_run(self):
+ """Finish the current W&B run."""
+ if not self.enabled or not self.run:
+ return
+
+ try:
+ wandb.finish()
+ self.run = None
+ self.run_id = None
+ except Exception as e:
+ print(f"W&B finish error: {e}")
+
+ def get_run_url(self) -> str | None:
+ """Get the URL for the current run.
+
+ Returns:
+ URL string or None if no active run
+ """
+ if not self.enabled or not self.run:
+ return None
+
+ try:
+ return self.run.get_url()
+ except Exception:
+ return None
+
+
+# Global tracker instance
+_global_tracker: WandBTracker | None = None
+
+
+def get_tracker(
+ project_name: str = "langgraph-mcts-demo", entity: str | None = None, enabled: bool = True
+) -> WandBTracker:
+ """Get or create the global W&B tracker.
+
+ Args:
+ project_name: W&B project name
+ entity: W&B entity
+ enabled: Whether tracking is enabled
+
+ Returns:
+ WandBTracker instance
+ """
+ global _global_tracker
+
+ if _global_tracker is None:
+ _global_tracker = WandBTracker(project_name=project_name, entity=entity, enabled=enabled)
+
+ return _global_tracker
+
+
+def is_wandb_available() -> bool:
+ """Check if W&B is available."""
+ return WANDB_AVAILABLE
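+
+
+if __name__ == "__main__":
+    # Illustrative smoke test, not part of the demo pipeline: with
+    # enabled=False the tracker never touches wandb, so this runs offline
+    # and simply confirms that every logging call degrades to a no-op.
+    tracker = get_tracker(project_name="langgraph-mcts-demo", enabled=False)
+    print(f"wandb installed: {is_wandb_available()}")
+    print(f"run initialized: {tracker.init_run(config={'demo': True})}")
+    tracker.log_performance(total_time_ms=123.4)
+    tracker.finish_run()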
diff --git a/models/bert_lora/final_model/README.md b/models/bert_lora/final_model/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4df78b2dcbdee07a91a2b3598da2770038a78e0f
--- /dev/null
+++ b/models/bert_lora/final_model/README.md
@@ -0,0 +1,206 @@
+---
+base_model: prajjwal1/bert-mini
+library_name: peft
+tags:
+- base_model:adapter:prajjwal1/bert-mini
+- lora
+- transformers
+---
+
+# Model Card for the bert-mini LoRA Adapter
+
+## Model Details
+
+### Model Description
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA (PEFT) adapter for sequence classification
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** prajjwal1/bert-mini
+
+### Model Sources [optional]
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+### Direct Use
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+[More Information Needed]
+
+### Recommendations
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
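+
+A minimal sketch, assuming the adapter directory `models/bert_lora/final_model/`, the `transformers` and `peft` packages (with PyTorch installed), and a three-way classifier head; `NUM_LABELS` and the agent label names are assumptions that must match the head saved with the adapter:
+
+```python
+from peft import PeftModel
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+BASE_ID = "prajjwal1/bert-mini"
+ADAPTER_PATH = "models/bert_lora/final_model"
+NUM_LABELS = 3  # assumption: one class per agent (hrm / trm / mcts)
+
+# Load the base encoder with a classification head, then attach the LoRA
+# adapter; modules_to_save restores the fine-tuned classifier weights.
+tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
+base = AutoModelForSequenceClassification.from_pretrained(BASE_ID, num_labels=NUM_LABELS)
+model = PeftModel.from_pretrained(base, ADAPTER_PATH)
+model.eval()
+
+inputs = tokenizer("Which agent should handle this query?", return_tensors="pt")
+print(model(**inputs).logits.argmax(dim=-1))
+```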
+
+## Training Details
+
+### Training Data
+
+[More Information Needed]
+
+### Training Procedure
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+Adapter settings recorded in `adapter_config.json`:
+
+- **LoRA rank (r):** 4
+- **lora_alpha:** 16
+- **lora_dropout:** 0.1
+- **Target modules:** `query`, `value`
+- **Modules to save:** `classifier`, `score`
+
+#### Speeds, Sizes, Times [optional]
+
+[More Information Needed]
+
+## Evaluation
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+[More Information Needed]
+
+#### Factors
+
+[More Information Needed]
+
+#### Metrics
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+## Model Examination [optional]
+
+[More Information Needed]
+
+## Environmental Impact
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.17.1
\ No newline at end of file
diff --git a/models/bert_lora/final_model/adapter_config.json b/models/bert_lora/final_model/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dae7210ba02813d4b1e90f83f2a43a744acb3ce9
--- /dev/null
+++ b/models/bert_lora/final_model/adapter_config.json
@@ -0,0 +1,40 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "prajjwal1/bert-mini",
+ "bias": "none",
+ "corda_config": null,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_bias": false,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "classifier",
+ "score"
+ ],
+ "peft_type": "LORA",
+ "qalora_group_size": 16,
+ "r": 4,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "query",
+ "value"
+ ],
+ "target_parameters": null,
+ "task_type": "SEQ_CLS",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/models/bert_lora/final_model/adapter_model.safetensors b/models/bert_lora/final_model/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..90ac62deb6311c1d6fd3ed11325a37e0e7827ae1
Binary files /dev/null and b/models/bert_lora/final_model/adapter_model.safetensors differ
diff --git a/models/bert_lora/generated_dataset.json b/models/bert_lora/generated_dataset.json
new file mode 100644
index 0000000000000000000000000000000000000000..ad76e7af39a0620c2f4e1b07cdb4b28ffc1a502f
--- /dev/null
+++ b/models/bert_lora/generated_dataset.json
@@ -0,0 +1,12993 @@
+{
+ "seed": 42,
+ "num_samples": 999,
+ "samples": [
+ {
+ "features": {
+ "hrm_confidence": 0.932186814566789,
+ "trm_confidence": 0.36522885075930284,
+ "mcts_value": 0.7145138679647244,
+ "consensus_score": 0.7101167835754781,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 2637,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9283419105971059,
+ "trm_confidence": 0.6511300084852987,
+ "mcts_value": 0.10612189126399749,
+ "consensus_score": 0.5519417910279142,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 921,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8931595360241993,
+ "trm_confidence": 0.6525812194404133,
+ "mcts_value": 0.351698200208428,
+ "consensus_score": 0.5779273962479688,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4441,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9482893515977746,
+ "trm_confidence": 0.5358341835586543,
+ "mcts_value": 0.6430777574912441,
+ "consensus_score": 0.679972291175198,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2234,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9335150491221286,
+ "trm_confidence": 0.1622342921363004,
+ "mcts_value": 0.3890189803478681,
+ "consensus_score": 0.4036835270262114,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 3720,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9234286467723452,
+ "trm_confidence": 0.7966752297173753,
+ "mcts_value": 0.2682939337358132,
+ "consensus_score": 0.6368912112821515,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3981,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7389764516006414,
+ "trm_confidence": 0.30396424576879205,
+ "mcts_value": 0.1449897306915453,
+ "consensus_score": 0.42993960829016165,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 811,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9100795306006747,
+ "trm_confidence": 0.25304182222607313,
+ "mcts_value": 0.6741966292520353,
+ "consensus_score": 0.6733921988589547,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4492,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9047486511924926,
+ "trm_confidence": 0.112465622685393,
+ "mcts_value": 0.16087585630415913,
+ "consensus_score": 0.29416916401088267,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3904,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9115496135879005,
+ "trm_confidence": 0.6336003434427341,
+ "mcts_value": 0.3724329203075315,
+ "consensus_score": 0.6529425316366341,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 1235,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9005208885371415,
+ "trm_confidence": 0.37712235352817686,
+ "mcts_value": 0.4524833101935956,
+ "consensus_score": 0.6297086222361764,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2831,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8677621482236241,
+ "trm_confidence": 0.23336138024133235,
+ "mcts_value": 0.02366076687148142,
+ "consensus_score": 0.3622715762919518,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 1390,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9560209219804499,
+ "trm_confidence": 0.2002570943780814,
+ "mcts_value": 0.0499083666946623,
+ "consensus_score": 0.35833890608879715,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2177,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8671096457023835,
+ "trm_confidence": 0.6013358774543538,
+ "mcts_value": 0.5096013245558383,
+ "consensus_score": 0.6406263215255393,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 1613,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7068136219401581,
+ "trm_confidence": 0.054642268545230264,
+ "mcts_value": 0.4383374938777725,
+ "consensus_score": 0.3923065741713311,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4505,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7456936308139505,
+ "trm_confidence": 0.4496096311936752,
+ "mcts_value": 0.2880802654858253,
+ "consensus_score": 0.47066542105044684,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 3422,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8085437831660172,
+ "trm_confidence": 0.062103805426426364,
+ "mcts_value": 0.08361234832438866,
+ "consensus_score": 0.410466178548847,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 2480,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7797609884378558,
+ "trm_confidence": 0.6588082922365092,
+ "mcts_value": 0.5293644842266623,
+ "consensus_score": 0.6993559594654748,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3687,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7289172886460498,
+ "trm_confidence": 0.5676622519597796,
+ "mcts_value": 0.2866455884313108,
+ "consensus_score": 0.46821438263809284,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 4046,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7530318348817695,
+ "trm_confidence": 0.5593963977267774,
+ "mcts_value": 0.49533740036194435,
+ "consensus_score": 0.6464811355136844,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1550,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8752293906738207,
+ "trm_confidence": 0.5037801849547963,
+ "mcts_value": 0.06546371962314065,
+ "consensus_score": 0.46465257885137445,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 905,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7989583636998355,
+ "trm_confidence": 0.10101639056485813,
+ "mcts_value": 0.07227436912106161,
+ "consensus_score": 0.3416119555641275,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1507,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8743183419101185,
+ "trm_confidence": 0.26858765190610195,
+ "mcts_value": 0.45755670357289335,
+ "consensus_score": 0.43804834000231074,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2097,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9348205681750859,
+ "trm_confidence": 0.06906470554037497,
+ "mcts_value": 0.40627238425745194,
+ "consensus_score": 0.4681939515285418,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 813,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8420468203170861,
+ "trm_confidence": 0.19810844189938528,
+ "mcts_value": 0.24603972019376524,
+ "consensus_score": 0.4328661412977197,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1430,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9478875772583073,
+ "trm_confidence": 0.7598435856691546,
+ "mcts_value": 0.11891546028371681,
+ "consensus_score": 0.6196894364448695,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 3333,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7843701351517025,
+ "trm_confidence": 0.4512891576261886,
+ "mcts_value": 0.4975334024340737,
+ "consensus_score": 0.6314603967875196,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 3077,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7690641972684642,
+ "trm_confidence": 0.025031401865777046,
+ "mcts_value": 0.37123192203583794,
+ "consensus_score": 0.3626269638291808,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 2306,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7951416678468146,
+ "trm_confidence": 0.6624000747790032,
+ "mcts_value": 0.20222911121111173,
+ "consensus_score": 0.5562683771253194,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 4957,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7493823452746055,
+ "trm_confidence": 0.029164163348725774,
+ "mcts_value": 0.2825443492468585,
+ "consensus_score": 0.452172065434564,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 463,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9672377472635574,
+ "trm_confidence": 0.7748306511117701,
+ "mcts_value": 0.4499735556104071,
+ "consensus_score": 0.6938664616947368,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 1684,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.812097318662113,
+ "trm_confidence": 0.0672694610295489,
+ "mcts_value": 0.5317868798463629,
+ "consensus_score": 0.4228766563639145,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 4740,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7368273797234458,
+ "trm_confidence": 0.5292753052442554,
+ "mcts_value": 0.09761564970867558,
+ "consensus_score": 0.39042643985700676,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 3603,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7589303997143719,
+ "trm_confidence": 0.20448170182489145,
+ "mcts_value": 0.5122556808021458,
+ "consensus_score": 0.5862545459926631,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2174,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7041808863124605,
+ "trm_confidence": 0.13875378375170938,
+ "mcts_value": 0.07964446437793447,
+ "consensus_score": 0.3430581128699396,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1490,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9082787306928659,
+ "trm_confidence": 0.46970419528561314,
+ "mcts_value": 0.16147441014713151,
+ "consensus_score": 0.5739773506116561,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 431,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.739317325467194,
+ "trm_confidence": 0.07911795076613058,
+ "mcts_value": 0.5930068093130082,
+ "consensus_score": 0.44999633394709904,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 669,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8988592638290747,
+ "trm_confidence": 0.763408491622485,
+ "mcts_value": 0.22883022193361457,
+ "consensus_score": 0.7153276783241302,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 2989,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8901925335043255,
+ "trm_confidence": 0.08367933776133821,
+ "mcts_value": 0.11089530175490933,
+ "consensus_score": 0.3454119215367851,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 838,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9799069664800634,
+ "trm_confidence": 0.7077627732121353,
+ "mcts_value": 0.4112523272131102,
+ "consensus_score": 0.7565933788188737,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 3321,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9488285844648209,
+ "trm_confidence": 0.6763611211604272,
+ "mcts_value": 0.19747211169221357,
+ "consensus_score": 0.6137078572256313,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4667,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8809321472016177,
+ "trm_confidence": 0.32219040146723615,
+ "mcts_value": 0.29221234846655453,
+ "consensus_score": 0.4836213829865048,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 4814,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8361690646228899,
+ "trm_confidence": 0.18245181923444714,
+ "mcts_value": 0.17422351039937214,
+ "consensus_score": 0.446817654134259,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4797,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7199676570865534,
+ "trm_confidence": 0.3685296458124635,
+ "mcts_value": 0.0906226837308878,
+ "consensus_score": 0.4579728336345698,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 4618,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.976291141746235,
+ "trm_confidence": 0.14505398231091812,
+ "mcts_value": 0.2494976860299916,
+ "consensus_score": 0.38767028240079326,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 2706,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.716618622749278,
+ "trm_confidence": 0.10768718328337355,
+ "mcts_value": 0.032916293776263636,
+ "consensus_score": 0.3039694631584993,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 4018,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7953973290856148,
+ "trm_confidence": 0.35084619767887465,
+ "mcts_value": 0.6084760997666595,
+ "consensus_score": 0.6551328675414908,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 2470,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7710234613313187,
+ "trm_confidence": 0.1673449143479575,
+ "mcts_value": 0.3833105111979031,
+ "consensus_score": 0.42381211409969827,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 45,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8571258846152912,
+ "trm_confidence": 0.07697842942649595,
+ "mcts_value": 0.6310330448273446,
+ "consensus_score": 0.4321048262493981,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3450,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9530724854793258,
+ "trm_confidence": 0.7700285610160599,
+ "mcts_value": 0.8356447951906794,
+ "consensus_score": 0.9133204566167403,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 3143,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9336989063734387,
+ "trm_confidence": 0.11217602900417172,
+ "mcts_value": 0.44691933531130246,
+ "consensus_score": 0.5004426642677205,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 4170,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.815526848830106,
+ "trm_confidence": 0.45762469209576373,
+ "mcts_value": 0.1906616579001964,
+ "consensus_score": 0.41589141513281347,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 834,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.769770982158662,
+ "trm_confidence": 0.24614874601571698,
+ "mcts_value": 0.24539903097564403,
+ "consensus_score": 0.3859386992658501,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 922,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7890629423713855,
+ "trm_confidence": 0.653822834874885,
+ "mcts_value": 0.6314214625791688,
+ "consensus_score": 0.6876178322684955,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2308,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9545681466320873,
+ "trm_confidence": 0.5576803541380092,
+ "mcts_value": 0.6874076335747309,
+ "consensus_score": 0.7397631666614238,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 371,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9204679486930872,
+ "trm_confidence": 0.16606648132791707,
+ "mcts_value": 0.5700595955075086,
+ "consensus_score": 0.6243418221746831,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 4763,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7285287244689803,
+ "trm_confidence": 0.45613311816464464,
+ "mcts_value": 0.05310641505342561,
+ "consensus_score": 0.49977738376914194,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4801,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.940265252825529,
+ "trm_confidence": 0.49885035958088786,
+ "mcts_value": 0.6576118411438995,
+ "consensus_score": 0.757932118980056,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 459,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8770227686074927,
+ "trm_confidence": 0.07385539064541159,
+ "mcts_value": 0.4787747781537166,
+ "consensus_score": 0.410809239954098,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 49,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8397955458943682,
+ "trm_confidence": 0.3866406596674736,
+ "mcts_value": 0.5651471213314001,
+ "consensus_score": 0.6570433856001606,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 1186,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9793708707070305,
+ "trm_confidence": 0.10529022997310968,
+ "mcts_value": 0.10297746470847831,
+ "consensus_score": 0.31342132417835983,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3647,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9322964248425529,
+ "trm_confidence": 0.5586635055387557,
+ "mcts_value": 0.2776855134490763,
+ "consensus_score": 0.669221790749102,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3176,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8092576053316162,
+ "trm_confidence": 0.2230189473801963,
+ "mcts_value": 0.11178726047613538,
+ "consensus_score": 0.31091127890442094,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2780,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.814995946833223,
+ "trm_confidence": 0.5217223241664531,
+ "mcts_value": 0.3953878002827034,
+ "consensus_score": 0.6645966877951524,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 3067,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8129078420442478,
+ "trm_confidence": 0.7033773655847808,
+ "mcts_value": 0.5116969009545275,
+ "consensus_score": 0.766232968195287,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 1362,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8911221651973622,
+ "trm_confidence": 0.0964549421489706,
+ "mcts_value": 0.4653839426303197,
+ "consensus_score": 0.5215396230133073,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 487,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9476198533588963,
+ "trm_confidence": 0.250352173750384,
+ "mcts_value": 0.3886744578520911,
+ "consensus_score": 0.5173449870673691,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 2583,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9343882106294548,
+ "trm_confidence": 0.09227366636739937,
+ "mcts_value": 0.8319139640886433,
+ "consensus_score": 0.6953652852233815,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 3892,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7319258595614149,
+ "trm_confidence": 0.631360115596809,
+ "mcts_value": 0.42066339906103695,
+ "consensus_score": 0.6246747945181796,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 3407,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7086998509613408,
+ "trm_confidence": 0.14659200303695535,
+ "mcts_value": 0.08705739409814221,
+ "consensus_score": 0.3694700041705898,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 82,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.896880711798247,
+ "trm_confidence": 0.02881736647956746,
+ "mcts_value": 0.004326930085425239,
+ "consensus_score": 0.2203399195239597,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4901,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7715658461882166,
+ "trm_confidence": 0.5704339683573875,
+ "mcts_value": 0.03843501631801768,
+ "consensus_score": 0.5203377143495425,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 610,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9094362351935912,
+ "trm_confidence": 0.6782915533551156,
+ "mcts_value": 0.032499916743507586,
+ "consensus_score": 0.4804323239078826,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 3490,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9235564384986533,
+ "trm_confidence": 0.5188503109313061,
+ "mcts_value": 0.7009544972153812,
+ "consensus_score": 0.6454963473699984,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 274,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7812276253953401,
+ "trm_confidence": 0.4836066911859404,
+ "mcts_value": 0.6677425780176791,
+ "consensus_score": 0.6665010193950671,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 360,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.71270516547405,
+ "trm_confidence": 0.5417206443861774,
+ "mcts_value": 0.4347622805893685,
+ "consensus_score": 0.4976882661010326,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 3854,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9940081538662577,
+ "trm_confidence": 0.40995695340003957,
+ "mcts_value": 0.7009747611084067,
+ "consensus_score": 0.7289282912131234,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2238,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9838073360674285,
+ "trm_confidence": 0.26632882928637897,
+ "mcts_value": 0.5108558557039758,
+ "consensus_score": 0.6269525292704077,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2219,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7445316969760174,
+ "trm_confidence": 0.3276494531736131,
+ "mcts_value": 0.2604129715008172,
+ "consensus_score": 0.4390317864466729,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 4868,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7834226637343772,
+ "trm_confidence": 0.20824203195435526,
+ "mcts_value": 0.2924387541146086,
+ "consensus_score": 0.45023199267510605,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 4858,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8226349328286091,
+ "trm_confidence": 0.1572659757712483,
+ "mcts_value": 0.4251306463016947,
+ "consensus_score": 0.4317520338972982,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 1500,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8422398025270511,
+ "trm_confidence": 0.16744400594740463,
+ "mcts_value": 0.424901063235603,
+ "consensus_score": 0.4913493373230704,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2740,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8957299169563094,
+ "trm_confidence": 0.25162106077135565,
+ "mcts_value": 0.6265833766096053,
+ "consensus_score": 0.6011403281656927,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2653,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8081972003350762,
+ "trm_confidence": 0.36312049753242803,
+ "mcts_value": 0.5217329060540332,
+ "consensus_score": 0.6416307786519208,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 4253,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8560825344221707,
+ "trm_confidence": 0.6047680473796243,
+ "mcts_value": 0.23775067596476046,
+ "consensus_score": 0.6336768917245321,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 217,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7216177441218831,
+ "trm_confidence": 0.5233979204122917,
+ "mcts_value": 0.034542003153205686,
+ "consensus_score": 0.3826415097891913,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 2443,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7941680109405967,
+ "trm_confidence": 0.5155534217839192,
+ "mcts_value": 0.010192360309046276,
+ "consensus_score": 0.505405949248658,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2082,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7460838697181904,
+ "trm_confidence": 0.388193295848214,
+ "mcts_value": 0.07731850799225726,
+ "consensus_score": 0.3768490967352044,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 392,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9316314674063757,
+ "trm_confidence": 0.25860537680230905,
+ "mcts_value": 0.5718838939220751,
+ "consensus_score": 0.6284548524737226,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4629,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7032182934927028,
+ "trm_confidence": 0.12610740406362417,
+ "mcts_value": 0.3167428700488906,
+ "consensus_score": 0.314773116720065,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 2095,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9967399008050817,
+ "trm_confidence": 0.49855996974088884,
+ "mcts_value": 0.7524273072394143,
+ "consensus_score": 0.8473067254603828,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 2519,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8177718147525759,
+ "trm_confidence": 0.05745711959236592,
+ "mcts_value": 0.5421547088642604,
+ "consensus_score": 0.459217019867586,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 4596,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7542779956692605,
+ "trm_confidence": 0.5934979397686945,
+ "mcts_value": 0.029212916442549666,
+ "consensus_score": 0.4055667409489095,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 980,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8759335519044388,
+ "trm_confidence": 0.3827602430806626,
+ "mcts_value": 0.06526791031235166,
+ "consensus_score": 0.39005405924362707,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 4272,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8947447150402823,
+ "trm_confidence": 0.5326404951932822,
+ "mcts_value": 0.6063131424699416,
+ "consensus_score": 0.5895211472454582,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3721,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8015369449891978,
+ "trm_confidence": 0.5924331288738127,
+ "mcts_value": 0.33854244341010364,
+ "consensus_score": 0.6312296903162472,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1856,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9728656731614049,
+ "trm_confidence": 0.5124803335859882,
+ "mcts_value": 0.7421752482233381,
+ "consensus_score": 0.7106252441085485,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 4242,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7314939147688211,
+ "trm_confidence": 0.2516834826978827,
+ "mcts_value": 0.5792931580045768,
+ "consensus_score": 0.5469899665633753,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 52,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7574809028988099,
+ "trm_confidence": 0.016320735107701632,
+ "mcts_value": 0.6097875396601716,
+ "consensus_score": 0.45083785820693534,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 720,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7021943368879047,
+ "trm_confidence": 0.16742333812465543,
+ "mcts_value": 0.4233627716865501,
+ "consensus_score": 0.45774743684688934,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 4016,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8432517620883198,
+ "trm_confidence": 0.5659360942979396,
+ "mcts_value": 0.6714000325807745,
+ "consensus_score": 0.737668485666015,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3121,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.960040431478578,
+ "trm_confidence": 0.09813411461423771,
+ "mcts_value": 0.629905225135514,
+ "consensus_score": 0.5507109969815835,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4808,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9909445350116708,
+ "trm_confidence": 0.8772044609437818,
+ "mcts_value": 0.25679538015345144,
+ "consensus_score": 0.7550654915238845,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3189,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7371609324592663,
+ "trm_confidence": 0.026089803397021747,
+ "mcts_value": 0.49529267197729376,
+ "consensus_score": 0.41745441764124175,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 4301,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.993375093722152,
+ "trm_confidence": 0.36769174953390693,
+ "mcts_value": 0.7090558656421148,
+ "consensus_score": 0.6070047574284242,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 2707,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9774105001659665,
+ "trm_confidence": 0.7217430409667073,
+ "mcts_value": 0.03243850424508178,
+ "consensus_score": 0.5517378167205987,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 1851,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9025916886002452,
+ "trm_confidence": 0.5724551001857711,
+ "mcts_value": 0.6209817892745624,
+ "consensus_score": 0.7717675022979922,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1973,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7146891129469838,
+ "trm_confidence": 0.1441662035796492,
+ "mcts_value": 0.38227379065973416,
+ "consensus_score": 0.48533476332042036,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 4760,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9031862198093191,
+ "trm_confidence": 0.023780166634012397,
+ "mcts_value": 0.3223632530707673,
+ "consensus_score": 0.49557018944810566,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1280,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9558343388895738,
+ "trm_confidence": 0.29785724316647905,
+ "mcts_value": 0.7303216724735169,
+ "consensus_score": 0.6211264817336376,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2250,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7824475152265201,
+ "trm_confidence": 0.6050290054094732,
+ "mcts_value": 0.12802284211179998,
+ "consensus_score": 0.42212877249641706,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 2255,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9422294817928845,
+ "trm_confidence": 0.8411711110154104,
+ "mcts_value": 0.2496048616995867,
+ "consensus_score": 0.6592568750622169,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 605,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9992740111788752,
+ "trm_confidence": 0.6302840773303063,
+ "mcts_value": 0.5352594385037045,
+ "consensus_score": 0.7000796609219875,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1105,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7403100739939185,
+ "trm_confidence": 0.23395551080682137,
+ "mcts_value": 0.04300749469983864,
+ "consensus_score": 0.2794868340379491,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 1516,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8903620794855391,
+ "trm_confidence": 0.2713253444375048,
+ "mcts_value": 0.33225381110472557,
+ "consensus_score": 0.5898222662448384,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 2521,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7853622625335476,
+ "trm_confidence": 0.6147677554924093,
+ "mcts_value": 0.16112669194044266,
+ "consensus_score": 0.48548744942546856,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 1537,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9226953848253929,
+ "trm_confidence": 0.48600301576815713,
+ "mcts_value": 0.537581421516067,
+ "consensus_score": 0.6086365990061708,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 1505,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7466324692250267,
+ "trm_confidence": 0.5653600569069068,
+ "mcts_value": 0.18315666384793758,
+ "consensus_score": 0.5106809422095007,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 2929,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8315158775752578,
+ "trm_confidence": 0.348389783042449,
+ "mcts_value": 0.7276401227978485,
+ "consensus_score": 0.6707680898676567,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 3361,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9362769508688018,
+ "trm_confidence": 0.15486121796165153,
+ "mcts_value": 0.47013042707773395,
+ "consensus_score": 0.4408016967656381,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 1601,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8538196192900334,
+ "trm_confidence": 0.32638315973663695,
+ "mcts_value": 0.02701896922834707,
+ "consensus_score": 0.49436215388418314,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 1475,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7738199834514815,
+ "trm_confidence": 0.04415571589845285,
+ "mcts_value": 0.3066674970423677,
+ "consensus_score": 0.3780986257155231,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 247,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.733480110474768,
+ "trm_confidence": 0.24357587584709162,
+ "mcts_value": 0.03834356971646067,
+ "consensus_score": 0.3781044638014227,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 4886,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8182377331534292,
+ "trm_confidence": 0.2992247584654667,
+ "mcts_value": 0.0011927829157792366,
+ "consensus_score": 0.2952991657112979,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 821,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8524502067438625,
+ "trm_confidence": 0.3683258859978843,
+ "mcts_value": 0.2506367703176279,
+ "consensus_score": 0.4767363583824148,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2341,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.778104563024575,
+ "trm_confidence": 0.21868223955202493,
+ "mcts_value": 0.1644287290937298,
+ "consensus_score": 0.38304452400032296,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 1673,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7992207226164599,
+ "trm_confidence": 0.6505442046219245,
+ "mcts_value": 0.033960653943758315,
+ "consensus_score": 0.4867291147752941,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 965,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7142122060016445,
+ "trm_confidence": 0.08488742619537909,
+ "mcts_value": 0.5643524206239566,
+ "consensus_score": 0.356335973971218,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 4085,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7331888402686609,
+ "trm_confidence": 0.3926716075936083,
+ "mcts_value": 0.15300306038828654,
+ "consensus_score": 0.440130246169654,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3441,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7014224178735622,
+ "trm_confidence": 0.5132355235771408,
+ "mcts_value": 0.37235209791058166,
+ "consensus_score": 0.4615520436747531,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4976,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7762790451262471,
+ "trm_confidence": 0.6213609131888949,
+ "mcts_value": 0.3073400370129043,
+ "consensus_score": 0.589011555976013,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 406,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9439478037460338,
+ "trm_confidence": 0.26871223620600315,
+ "mcts_value": 0.6744944968198383,
+ "consensus_score": 0.6491982960298203,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 2582,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7952906903090685,
+ "trm_confidence": 0.054308038543778275,
+ "mcts_value": 0.02074383789783102,
+ "consensus_score": 0.2594099199983288,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3514,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9175549962552606,
+ "trm_confidence": 0.5789035275376786,
+ "mcts_value": 0.6038300850843744,
+ "consensus_score": 0.6635268371851003,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 748,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7378079025239879,
+ "trm_confidence": 0.09166489801650933,
+ "mcts_value": 0.4420617404990008,
+ "consensus_score": 0.3584336343015816,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 615,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7012022814200018,
+ "trm_confidence": 0.009967814726302898,
+ "mcts_value": 0.5970447162996493,
+ "consensus_score": 0.4529993709754845,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 2868,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9641140900383076,
+ "trm_confidence": 0.4633383320539006,
+ "mcts_value": 0.5372992540761998,
+ "consensus_score": 0.6095546636896446,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 2241,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7884265579608288,
+ "trm_confidence": 0.4568543529427487,
+ "mcts_value": 0.5756933887050966,
+ "consensus_score": 0.5106235419857642,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 789,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9621665493711404,
+ "trm_confidence": 0.21870105232034467,
+ "mcts_value": 0.5265855782305323,
+ "consensus_score": 0.5798592279919281,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3761,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9177308904146302,
+ "trm_confidence": 0.46360254817622953,
+ "mcts_value": 0.6201852913233449,
+ "consensus_score": 0.763856325169989,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 4622,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7037409633823639,
+ "trm_confidence": 0.4805950474667002,
+ "mcts_value": 0.3140842018588201,
+ "consensus_score": 0.48106569030807445,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 3864,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8185021720121669,
+ "trm_confidence": 0.49043636088886106,
+ "mcts_value": 0.10732565579300138,
+ "consensus_score": 0.5643979888062163,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 3413,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9576948581673141,
+ "trm_confidence": 0.782717527302752,
+ "mcts_value": 0.1819577406062614,
+ "consensus_score": 0.6347502196604081,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3226,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8137158085111915,
+ "trm_confidence": 0.36900498107278173,
+ "mcts_value": 0.5293747276201115,
+ "consensus_score": 0.616925701065254,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 4750,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7313845290414325,
+ "trm_confidence": 0.5707547505270651,
+ "mcts_value": 0.5465091100904041,
+ "consensus_score": 0.6758356170499672,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 26,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9230801095322956,
+ "trm_confidence": 0.019046151583096152,
+ "mcts_value": 0.8059803187685097,
+ "consensus_score": 0.5581311293657699,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 4657,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8183889664450247,
+ "trm_confidence": 0.2292612586441427,
+ "mcts_value": 0.437312910421789,
+ "consensus_score": 0.5111855865865955,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 60,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9806144108699484,
+ "trm_confidence": 0.41181074908830173,
+ "mcts_value": 0.1732539995949861,
+ "consensus_score": 0.4973342497610128,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3847,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7488614501663664,
+ "trm_confidence": 0.44418593967111597,
+ "mcts_value": 0.22027402789520992,
+ "consensus_score": 0.5620868176153923,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 999,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9260513689000773,
+ "trm_confidence": 0.7277810367370762,
+ "mcts_value": 0.22973591915308827,
+ "consensus_score": 0.5682737285925096,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 2400,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8405249923740312,
+ "trm_confidence": 0.19212168542979885,
+ "mcts_value": 0.033451822114106196,
+ "consensus_score": 0.3516644347045683,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 1231,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8486519669885855,
+ "trm_confidence": 0.08286983895231823,
+ "mcts_value": 0.18898692545925058,
+ "consensus_score": 0.3324577003816266,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 1515,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9704925560699316,
+ "trm_confidence": 0.8570596444224141,
+ "mcts_value": 0.8551428849736049,
+ "consensus_score": 0.9848315217089132,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 2056,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7913399579428099,
+ "trm_confidence": 0.38224065571157784,
+ "mcts_value": 0.06704468949902856,
+ "consensus_score": 0.4827011592782339,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2019,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7495933970101873,
+ "trm_confidence": 0.16451308392705075,
+ "mcts_value": 0.10413763288980703,
+ "consensus_score": 0.40980133616797476,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 2625,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7888098588730077,
+ "trm_confidence": 0.25572247655900815,
+ "mcts_value": 0.2788680909106773,
+ "consensus_score": 0.4931364697963714,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 225,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9824616570214297,
+ "trm_confidence": 0.10647370900903526,
+ "mcts_value": 0.7907933777490511,
+ "consensus_score": 0.5466553770564673,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 897,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7538476217038262,
+ "trm_confidence": 0.27041040829999513,
+ "mcts_value": 0.29413943336981513,
+ "consensus_score": 0.3885176157291874,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 679,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9623725419566715,
+ "trm_confidence": 0.29262210136103023,
+ "mcts_value": 0.4577908039206839,
+ "consensus_score": 0.5206097631859256,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 1488,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9820057032422544,
+ "trm_confidence": 0.7832112112455303,
+ "mcts_value": 0.6856348186200852,
+ "consensus_score": 0.8204828819601692,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2263,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8609704040731267,
+ "trm_confidence": 0.33068964615766583,
+ "mcts_value": 0.10026152842446387,
+ "consensus_score": 0.355773296792126,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 456,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9859616574715566,
+ "trm_confidence": 0.14492284170007116,
+ "mcts_value": 0.4911918073749651,
+ "consensus_score": 0.4822355528525055,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 4373,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7356859932220634,
+ "trm_confidence": 0.5828276906378326,
+ "mcts_value": 0.2043940190441073,
+ "consensus_score": 0.5292617433509195,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4855,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8595658274038205,
+ "trm_confidence": 0.14222030388897,
+ "mcts_value": 0.7509097381536648,
+ "consensus_score": 0.6478935302229244,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 1861,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7458631366379007,
+ "trm_confidence": 0.5946241390199914,
+ "mcts_value": 0.220489322146777,
+ "consensus_score": 0.4303468705409431,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 294,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8868126234964624,
+ "trm_confidence": 0.590769276685474,
+ "mcts_value": 0.6244423384763792,
+ "consensus_score": 0.643065668435547,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2660,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8917040947792494,
+ "trm_confidence": 0.0018278647108377233,
+ "mcts_value": 0.786453907816844,
+ "consensus_score": 0.5161745662951414,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 1994,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7387090171546125,
+ "trm_confidence": 0.0972924433311407,
+ "mcts_value": 0.4038447340742791,
+ "consensus_score": 0.3918675431673908,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3242,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9178540385151703,
+ "trm_confidence": 0.3771120311092349,
+ "mcts_value": 0.539743836851922,
+ "consensus_score": 0.6315090401686404,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3074,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8028140928264867,
+ "trm_confidence": 0.12456360802715516,
+ "mcts_value": 0.47657406224828497,
+ "consensus_score": 0.5371854332955573,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 4940,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9677463069061979,
+ "trm_confidence": 0.6504787881584428,
+ "mcts_value": 0.8606374831518863,
+ "consensus_score": 0.8325702883916245,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 869,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9842606798997977,
+ "trm_confidence": 0.3239360681824274,
+ "mcts_value": 0.6530359849135624,
+ "consensus_score": 0.6343722851557455,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 756,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8526802455254062,
+ "trm_confidence": 0.6827459592509878,
+ "mcts_value": 0.3155995705425496,
+ "consensus_score": 0.6464229514016462,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 3486,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8320150324700657,
+ "trm_confidence": 0.09209262859482964,
+ "mcts_value": 0.06714311190465266,
+ "consensus_score": 0.3639149089525342,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3668,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7059299336028074,
+ "trm_confidence": 0.197864506698318,
+ "mcts_value": 0.11985890761574668,
+ "consensus_score": 0.39684806714775983,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 3759,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9003547476543958,
+ "trm_confidence": 0.10353613391771245,
+ "mcts_value": 0.22021963019442897,
+ "consensus_score": 0.3357677365021964,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 1412,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9730980730555757,
+ "trm_confidence": 0.17682192410918202,
+ "mcts_value": 0.47328816085929903,
+ "consensus_score": 0.5813476062567055,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2916,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7765320793082142,
+ "trm_confidence": 0.4231560380588548,
+ "mcts_value": 0.21682255200993333,
+ "consensus_score": 0.5673959012640091,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2207,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7069682747040275,
+ "trm_confidence": 0.2428246057360107,
+ "mcts_value": 0.531162558238724,
+ "consensus_score": 0.51569730177957,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 890,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9561953128441687,
+ "trm_confidence": 0.27587954965212524,
+ "mcts_value": 0.4662397894445769,
+ "consensus_score": 0.5640697351987828,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 4067,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7732615002297665,
+ "trm_confidence": 0.6144668090215966,
+ "mcts_value": 0.622009131556389,
+ "consensus_score": 0.7385770574216465,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 114,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9294001685685999,
+ "trm_confidence": 0.6324923508225583,
+ "mcts_value": 0.39861605011890444,
+ "consensus_score": 0.6447932954062084,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 3035,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7867960408307003,
+ "trm_confidence": 0.17706179507643613,
+ "mcts_value": 0.4024359518088873,
+ "consensus_score": 0.4374277402170165,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3291,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9581709757883128,
+ "trm_confidence": 0.14391486044111265,
+ "mcts_value": 0.006442317663250758,
+ "consensus_score": 0.3367353783367668,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 680,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7414987210348147,
+ "trm_confidence": 0.49673549026026803,
+ "mcts_value": 0.07422407534739722,
+ "consensus_score": 0.4154986135787992,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1439,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7461890747732879,
+ "trm_confidence": 0.27320895570225096,
+ "mcts_value": 0.4004230314625919,
+ "consensus_score": 0.5598538437953609,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3167,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7413992552043716,
+ "trm_confidence": 0.5071787475187043,
+ "mcts_value": 0.17358326090088597,
+ "consensus_score": 0.5510235664425928,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 2268,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9497587854772022,
+ "trm_confidence": 0.14986038328681847,
+ "mcts_value": 0.359868932238171,
+ "consensus_score": 0.49693301937926315,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 1482,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8958060204607765,
+ "trm_confidence": 0.789473000549209,
+ "mcts_value": 0.7953459723410123,
+ "consensus_score": 0.830166878310794,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 3420,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9929992234649767,
+ "trm_confidence": 0.28426521696112633,
+ "mcts_value": 0.41185917690810053,
+ "consensus_score": 0.5479965907411591,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 1850,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8060013085205403,
+ "trm_confidence": 0.5559505259024953,
+ "mcts_value": 0.5874778619952927,
+ "consensus_score": 0.7140560561256785,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1979,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9246462339001165,
+ "trm_confidence": 0.660862778505748,
+ "mcts_value": 0.4051136030420285,
+ "consensus_score": 0.7421470757232248,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 2854,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7288285698915643,
+ "trm_confidence": 0.09654123447545868,
+ "mcts_value": 0.3356938520979582,
+ "consensus_score": 0.3005249148591434,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 1141,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8308949979082789,
+ "trm_confidence": 0.5661687200967872,
+ "mcts_value": 0.02546317595496685,
+ "consensus_score": 0.504627152613182,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1717,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7440591204464212,
+ "trm_confidence": 0.5544067179494655,
+ "mcts_value": 0.5301409123022616,
+ "consensus_score": 0.6173239368188868,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 226,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8183444553816263,
+ "trm_confidence": 0.5299927836231273,
+ "mcts_value": 0.18493918750750507,
+ "consensus_score": 0.5615579250574635,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 2699,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8394038913571988,
+ "trm_confidence": 0.2720299008072148,
+ "mcts_value": 0.1620693596078362,
+ "consensus_score": 0.3661769188777529,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 4810,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7257947050332835,
+ "trm_confidence": 0.11032220277393885,
+ "mcts_value": 0.10973091841392231,
+ "consensus_score": 0.37990413330176154,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 3499,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7411633773371836,
+ "trm_confidence": 0.4190258018660907,
+ "mcts_value": 0.2877089263480039,
+ "consensus_score": 0.4612163371898409,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 4766,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9618576876322535,
+ "trm_confidence": 0.16573581856333194,
+ "mcts_value": 0.19029638580714042,
+ "consensus_score": 0.470511406486588,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 1267,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8699261952428934,
+ "trm_confidence": 0.42415909204976393,
+ "mcts_value": 0.6379199814213259,
+ "consensus_score": 0.68610831058653,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 2657,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8804675357368047,
+ "trm_confidence": 0.3794571311556796,
+ "mcts_value": 0.20306567679207435,
+ "consensus_score": 0.4713946653578132,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 528,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8683669203670132,
+ "trm_confidence": 0.2961051612096069,
+ "mcts_value": 0.20799443698331926,
+ "consensus_score": 0.4618725336738305,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 2258,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9029957147528913,
+ "trm_confidence": 0.05306132320252652,
+ "mcts_value": 0.0008571698389146299,
+ "consensus_score": 0.26142683036625936,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 3870,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7515881055811068,
+ "trm_confidence": 0.5800854083517826,
+ "mcts_value": 0.3062177008384156,
+ "consensus_score": 0.5411820895928338,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2379,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7645534823886155,
+ "trm_confidence": 0.38927637401496484,
+ "mcts_value": 0.12991124070271629,
+ "consensus_score": 0.4638191008772238,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 4463,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7620955478922519,
+ "trm_confidence": 0.27989032203406305,
+ "mcts_value": 0.1166203836958097,
+ "consensus_score": 0.31314048952537765,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 909,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.808227605955476,
+ "trm_confidence": 0.03914402980123324,
+ "mcts_value": 0.25309903289406965,
+ "consensus_score": 0.3197849916812422,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2099,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9665096359188521,
+ "trm_confidence": 0.8055650852189132,
+ "mcts_value": 0.08650457300765457,
+ "consensus_score": 0.541007022639522,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 2305,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7365906892628868,
+ "trm_confidence": 0.5426186472867254,
+ "mcts_value": 0.48016222358567484,
+ "consensus_score": 0.6504189583593672,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 724,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7132525308832484,
+ "trm_confidence": 0.24607675484191002,
+ "mcts_value": 0.19865498207189433,
+ "consensus_score": 0.47572385567487907,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4580,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7511023580975557,
+ "trm_confidence": 0.6371909988083584,
+ "mcts_value": 0.32466538481186547,
+ "consensus_score": 0.569720662493389,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 4783,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8192575073664978,
+ "trm_confidence": 0.42058777144530124,
+ "mcts_value": 0.09210280147437415,
+ "consensus_score": 0.40819981871109534,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 4910,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.959843381899306,
+ "trm_confidence": 0.5067325029417107,
+ "mcts_value": 0.3084024260626239,
+ "consensus_score": 0.5733933124412227,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 4006,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9780308174466981,
+ "trm_confidence": 0.8190092519450405,
+ "mcts_value": 0.34374202281113436,
+ "consensus_score": 0.679446522653845,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 4847,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8385453734349244,
+ "trm_confidence": 0.48651928305460523,
+ "mcts_value": 0.40944883375396496,
+ "consensus_score": 0.5423142727533135,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 361,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7283919304748165,
+ "trm_confidence": 0.10435241816234642,
+ "mcts_value": 0.45439887808312374,
+ "consensus_score": 0.4068670845445116,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 3905,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9266618627420282,
+ "trm_confidence": 0.21280124956787808,
+ "mcts_value": 0.6834756638873442,
+ "consensus_score": 0.6931472306574268,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 532,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7157727970969165,
+ "trm_confidence": 0.5822254280053311,
+ "mcts_value": 0.24168142263616446,
+ "consensus_score": 0.5980066293552234,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4895,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7115625997970306,
+ "trm_confidence": 0.41611869374628807,
+ "mcts_value": 0.34410671336168863,
+ "consensus_score": 0.395902489742884,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 763,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8489252447317924,
+ "trm_confidence": 0.3488175050052104,
+ "mcts_value": 0.004098949373417123,
+ "consensus_score": 0.4579438422021932,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 4405,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8119181762674159,
+ "trm_confidence": 0.40228923558521373,
+ "mcts_value": 0.19246123282373728,
+ "consensus_score": 0.40100721074243934,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 757,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8609463771256752,
+ "trm_confidence": 0.734629772221256,
+ "mcts_value": 0.733232643276524,
+ "consensus_score": 0.8473980218816325,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 4657,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9634442273966612,
+ "trm_confidence": 0.3233287242950655,
+ "mcts_value": 0.0872995353312898,
+ "consensus_score": 0.5202167427889899,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 1230,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9946432888953534,
+ "trm_confidence": 0.5432376312320689,
+ "mcts_value": 0.3802049553050477,
+ "consensus_score": 0.6133588320469295,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 652,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9384251675252715,
+ "trm_confidence": 0.7023628971642141,
+ "mcts_value": 0.23844571519841753,
+ "consensus_score": 0.5871705836446232,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 339,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8561372331301683,
+ "trm_confidence": 0.3506024719689236,
+ "mcts_value": 0.5218305460581177,
+ "consensus_score": 0.5088684237489608,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 576,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8856144334508926,
+ "trm_confidence": 0.031499218465408804,
+ "mcts_value": 0.36273955042497696,
+ "consensus_score": 0.43106791291023394,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 2242,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7565422909036904,
+ "trm_confidence": 0.22594317178577628,
+ "mcts_value": 0.06355995720741427,
+ "consensus_score": 0.4381457657782145,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 1131,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.992325136932654,
+ "trm_confidence": 0.07204211978374034,
+ "mcts_value": 0.5815010472099233,
+ "consensus_score": 0.6229045853236821,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 1805,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8293229072913815,
+ "trm_confidence": 0.026181879911408894,
+ "mcts_value": 0.38989657417045104,
+ "consensus_score": 0.5138423864817833,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3307,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9117391010888465,
+ "trm_confidence": 0.2449543416327622,
+ "mcts_value": 0.732991603678876,
+ "consensus_score": 0.5389179646086839,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3812,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7352513146757804,
+ "trm_confidence": 0.038003488930421665,
+ "mcts_value": 0.5302491411336361,
+ "consensus_score": 0.3656528491081953,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 355,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.870234434064797,
+ "trm_confidence": 0.3059627671516115,
+ "mcts_value": 0.34276668916021336,
+ "consensus_score": 0.41894473093298157,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 1451,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7568090475018561,
+ "trm_confidence": 0.18787845586226595,
+ "mcts_value": 0.6334090395729941,
+ "consensus_score": 0.4448893916766401,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 646,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9490057537477085,
+ "trm_confidence": 0.8285426268405806,
+ "mcts_value": 0.7149941480792436,
+ "consensus_score": 0.9195239798114148,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4616,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8637027082243143,
+ "trm_confidence": 0.4150690578704749,
+ "mcts_value": 0.6865088423172458,
+ "consensus_score": 0.7494542998666873,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3824,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8369211573705536,
+ "trm_confidence": 0.2847031937971334,
+ "mcts_value": 0.2664699753767556,
+ "consensus_score": 0.44099947718503385,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 3622,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9339609531616935,
+ "trm_confidence": 0.3720073749593458,
+ "mcts_value": 0.7761545231594749,
+ "consensus_score": 0.7164851521456285,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 1583,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7617459379039022,
+ "trm_confidence": 0.6456893506063608,
+ "mcts_value": 0.14472663325826077,
+ "consensus_score": 0.5205384425137849,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 4357,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7709081306150634,
+ "trm_confidence": 0.032953426548556077,
+ "mcts_value": 0.10305271832695558,
+ "consensus_score": 0.21497045423736993,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 751,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9737346388886925,
+ "trm_confidence": 0.46938674734886066,
+ "mcts_value": 0.546948759270564,
+ "consensus_score": 0.6037301654912806,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 4317,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9592410487578175,
+ "trm_confidence": 0.8152849463702759,
+ "mcts_value": 0.2989306274247526,
+ "consensus_score": 0.6092680192247032,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 2042,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9837080955473421,
+ "trm_confidence": 0.33431545265646406,
+ "mcts_value": 0.6811385287471287,
+ "consensus_score": 0.6744826974177959,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3086,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8959575655068456,
+ "trm_confidence": 0.5355627499764045,
+ "mcts_value": 0.346050309650501,
+ "consensus_score": 0.5382526622436733,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 4515,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9810009075595381,
+ "trm_confidence": 0.16554322366482135,
+ "mcts_value": 0.09454228904780895,
+ "consensus_score": 0.41322501389713184,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 167,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8310002468016519,
+ "trm_confidence": 0.7272099321804025,
+ "mcts_value": 0.3554712478770538,
+ "consensus_score": 0.6334831251010022,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4867,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8778869893448848,
+ "trm_confidence": 0.17733316400235186,
+ "mcts_value": 0.49598773000963786,
+ "consensus_score": 0.427235986841987,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 96,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9690266114772095,
+ "trm_confidence": 0.41327531492474356,
+ "mcts_value": 0.05242908683320221,
+ "consensus_score": 0.5426863802971464,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 3741,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.828450959052662,
+ "trm_confidence": 0.46470563316140123,
+ "mcts_value": 0.623720838984344,
+ "consensus_score": 0.66517223230718,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 2180,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9015562565842201,
+ "trm_confidence": 0.769964489790417,
+ "mcts_value": 0.29730709197590294,
+ "consensus_score": 0.6412922996196729,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1853,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9209719279347199,
+ "trm_confidence": 0.3774085653223031,
+ "mcts_value": 0.1769154620345386,
+ "consensus_score": 0.5408060869867227,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 1940,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8880474948352626,
+ "trm_confidence": 0.5888840940074078,
+ "mcts_value": 0.7050565479056364,
+ "consensus_score": 0.6818466783992806,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 507,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7463269264066427,
+ "trm_confidence": 0.12775749309271756,
+ "mcts_value": 0.18829407029016992,
+ "consensus_score": 0.3600044340684342,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 3641,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9124472661527905,
+ "trm_confidence": 0.1423249734377036,
+ "mcts_value": 0.29059827613361366,
+ "consensus_score": 0.44413782379203814,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1951,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7924999611858532,
+ "trm_confidence": 0.08813876240274264,
+ "mcts_value": 0.38104096528938425,
+ "consensus_score": 0.4615655355974986,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 2120,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9346633585803386,
+ "trm_confidence": 0.6243199532163899,
+ "mcts_value": 0.6803116479784208,
+ "consensus_score": 0.7360797798925045,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1391,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8082431684252546,
+ "trm_confidence": 0.1249026322116057,
+ "mcts_value": 0.33109273139471257,
+ "consensus_score": 0.3998248484936129,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 4551,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.794154031633494,
+ "trm_confidence": 0.18936822561780658,
+ "mcts_value": 0.09085805086171304,
+ "consensus_score": 0.3458911097313942,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 4304,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7707991787861658,
+ "trm_confidence": 0.5119240758573433,
+ "mcts_value": 0.6238454816739297,
+ "consensus_score": 0.6351092826635906,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 128,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9996488912000976,
+ "trm_confidence": 0.5519494252201751,
+ "mcts_value": 0.8004891104852072,
+ "consensus_score": 0.865750791612955,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1800,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7243462001271698,
+ "trm_confidence": 0.009868138479946883,
+ "mcts_value": 0.297938402088473,
+ "consensus_score": 0.34291382027212014,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 2262,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8656488109913281,
+ "trm_confidence": 0.6405517801265206,
+ "mcts_value": 0.08319318191851317,
+ "consensus_score": 0.47483345756656303,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 352,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9984472640862603,
+ "trm_confidence": 0.7842654129109966,
+ "mcts_value": 0.7995273282172154,
+ "consensus_score": 0.8236162569308051,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 3748,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8818898194578146,
+ "trm_confidence": 0.734042348906971,
+ "mcts_value": 0.26947120053234996,
+ "consensus_score": 0.6398681460086686,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 1085,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7585556589455966,
+ "trm_confidence": 0.03816023455513779,
+ "mcts_value": 0.6251626215781484,
+ "consensus_score": 0.4545354267278202,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 25,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7575763484233256,
+ "trm_confidence": 0.11537982339162972,
+ "mcts_value": 0.5339442365121275,
+ "consensus_score": 0.42190914489960163,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 3850,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9033125843363409,
+ "trm_confidence": 0.654277114228846,
+ "mcts_value": 0.18898441735980578,
+ "consensus_score": 0.5374328446663108,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2813,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8289871149700143,
+ "trm_confidence": 0.3152982944972967,
+ "mcts_value": 0.18445907233839906,
+ "consensus_score": 0.49807304455598805,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2898,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9505377373242201,
+ "trm_confidence": 0.18122827258423413,
+ "mcts_value": 0.003662224693813079,
+ "consensus_score": 0.39270955061190405,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 4416,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7623994221125381,
+ "trm_confidence": 0.3437287593091789,
+ "mcts_value": 0.5365042757664858,
+ "consensus_score": 0.46535255341107823,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 1698,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8114629625219084,
+ "trm_confidence": 0.13435011914363088,
+ "mcts_value": 0.1390880405202477,
+ "consensus_score": 0.34364346477826113,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 3708,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8926757255703981,
+ "trm_confidence": 0.5472648702307961,
+ "mcts_value": 0.7791900694236185,
+ "consensus_score": 0.722078909801799,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1223,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7737626761015175,
+ "trm_confidence": 0.49172227340870756,
+ "mcts_value": 0.5084032169201707,
+ "consensus_score": 0.5108536549658871,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 663,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7556125818407456,
+ "trm_confidence": 0.3040915450308808,
+ "mcts_value": 0.18975375780077433,
+ "consensus_score": 0.4756980510826412,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 2699,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7798092617864996,
+ "trm_confidence": 0.18635909980374252,
+ "mcts_value": 0.17316669164944853,
+ "consensus_score": 0.30678597029114407,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2572,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7674068423637621,
+ "trm_confidence": 0.5650740523405162,
+ "mcts_value": 0.2664256764332823,
+ "consensus_score": 0.4399922192677869,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 3936,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9348715822515172,
+ "trm_confidence": 0.176103126668828,
+ "mcts_value": 0.7659075071611872,
+ "consensus_score": 0.6958268293709436,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 371,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7151607965950173,
+ "trm_confidence": 0.5336049555333756,
+ "mcts_value": 0.18811631230975562,
+ "consensus_score": 0.5027439643807547,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 3017,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9342515063557144,
+ "trm_confidence": 0.7829545983703684,
+ "mcts_value": 0.6076479123657337,
+ "consensus_score": 0.7631958887267172,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 944,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9234312783450263,
+ "trm_confidence": 0.6756302582411827,
+ "mcts_value": 0.6170405848403161,
+ "consensus_score": 0.6962716823518058,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 4935,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8508624810941059,
+ "trm_confidence": 0.3793934835125877,
+ "mcts_value": 0.4374857817217746,
+ "consensus_score": 0.534934282952401,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4806,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7740175160438414,
+ "trm_confidence": 0.496429549704044,
+ "mcts_value": 0.4569649040000525,
+ "consensus_score": 0.5804220897718424,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 1691,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9881878856886498,
+ "trm_confidence": 0.6411262857398229,
+ "mcts_value": 0.815232129761202,
+ "consensus_score": 0.911685820052825,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 1784,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8421798212914233,
+ "trm_confidence": 0.7322559470279331,
+ "mcts_value": 0.06962227405779058,
+ "consensus_score": 0.5304943303184939,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 495,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9363378551916618,
+ "trm_confidence": 0.3201360493631198,
+ "mcts_value": 0.8195931501855913,
+ "consensus_score": 0.7687283188648658,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 1255,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8010569631499805,
+ "trm_confidence": 0.4250948913284999,
+ "mcts_value": 0.46163252778246694,
+ "consensus_score": 0.6285138759041234,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 2283,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9135753431765736,
+ "trm_confidence": 0.458468610905465,
+ "mcts_value": 0.4032120715775476,
+ "consensus_score": 0.5027939998947915,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 4093,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7092678511108096,
+ "trm_confidence": 0.24153585322317822,
+ "mcts_value": 0.575848466210233,
+ "consensus_score": 0.41906020375707653,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 2699,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9599826496461017,
+ "trm_confidence": 0.6416055173615868,
+ "mcts_value": 0.7616910796400281,
+ "consensus_score": 0.7309447069688255,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2053,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7737099228984943,
+ "trm_confidence": 0.15545607553068916,
+ "mcts_value": 0.018312873062874588,
+ "consensus_score": 0.4075293492924431,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 548,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9231984168084972,
+ "trm_confidence": 0.43762291302503004,
+ "mcts_value": 0.3919526077106061,
+ "consensus_score": 0.574164427406082,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1559,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9664176085044609,
+ "trm_confidence": 0.7033165481696211,
+ "mcts_value": 0.3028680682544538,
+ "consensus_score": 0.6732559343386835,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 1323,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.970867806289085,
+ "trm_confidence": 0.8028508591083362,
+ "mcts_value": 0.2892701046838016,
+ "consensus_score": 0.6214898648369789,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 674,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7360331890304153,
+ "trm_confidence": 0.04535205026991548,
+ "mcts_value": 0.6295603232822491,
+ "consensus_score": 0.38512672420134136,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 2644,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8740720650228829,
+ "trm_confidence": 0.5994547247302315,
+ "mcts_value": 0.48837136070431225,
+ "consensus_score": 0.7304010642182652,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2206,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9138249081283899,
+ "trm_confidence": 0.36409936512452107,
+ "mcts_value": 0.28163535888200514,
+ "consensus_score": 0.5133837012016812,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 2109,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7782111344007834,
+ "trm_confidence": 0.1041268042075554,
+ "mcts_value": 0.16707883336197832,
+ "consensus_score": 0.41821962101122034,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 4765,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9569228891967217,
+ "trm_confidence": 0.06662118460104391,
+ "mcts_value": 0.6555669235911721,
+ "consensus_score": 0.5212367590721186,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 2268,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9729389868199702,
+ "trm_confidence": 0.25753376914582776,
+ "mcts_value": 0.49655720574113554,
+ "consensus_score": 0.5336434766722095,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1430,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8831006166370539,
+ "trm_confidence": 0.48829954656903063,
+ "mcts_value": 0.10578106858074822,
+ "consensus_score": 0.5311018378462882,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2802,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7222211658216119,
+ "trm_confidence": 0.27803258546131354,
+ "mcts_value": 0.2545103167234909,
+ "consensus_score": 0.3355969860641119,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 1660,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9228718929574247,
+ "trm_confidence": 0.19507172003678855,
+ "mcts_value": 0.682777879225739,
+ "consensus_score": 0.6094001545545434,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4805,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.849758267626667,
+ "trm_confidence": 0.4646397357959349,
+ "mcts_value": 0.695753635586873,
+ "consensus_score": 0.6684130626190737,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 3214,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9212545411893177,
+ "trm_confidence": 0.05900741430404592,
+ "mcts_value": 0.4235106507204567,
+ "consensus_score": 0.4642425199124821,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1681,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8404961604082992,
+ "trm_confidence": 0.7125053970548564,
+ "mcts_value": 0.3336501412668026,
+ "consensus_score": 0.5650607856410709,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 728,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7001704387776833,
+ "trm_confidence": 0.25924101689077567,
+ "mcts_value": 0.46167665754423126,
+ "consensus_score": 0.5203893262975305,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 1085,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7462849376240428,
+ "trm_confidence": 0.6082034279986184,
+ "mcts_value": 0.10816547835548636,
+ "consensus_score": 0.5128791248655455,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 1982,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8236775789392226,
+ "trm_confidence": 0.549736644951093,
+ "mcts_value": 0.16693731092908165,
+ "consensus_score": 0.5946088821893177,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 790,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9549190574203681,
+ "trm_confidence": 0.8539176146019773,
+ "mcts_value": 0.19145541734296373,
+ "consensus_score": 0.6241425125011274,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3414,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8747334224618132,
+ "trm_confidence": 0.769715104649675,
+ "mcts_value": 0.7672015316023072,
+ "consensus_score": 0.80923799917706,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3612,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9798800375360516,
+ "trm_confidence": 0.17937289642684212,
+ "mcts_value": 0.46505331630125507,
+ "consensus_score": 0.6080028276870187,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 4694,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9956863304536023,
+ "trm_confidence": 0.8822928611113594,
+ "mcts_value": 0.8115874843332822,
+ "consensus_score": 0.9562118942650263,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 198,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8641848523614624,
+ "trm_confidence": 0.4285375751873198,
+ "mcts_value": 0.38629759737868324,
+ "consensus_score": 0.5870073976257497,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3396,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8000649779435066,
+ "trm_confidence": 0.2314953845736733,
+ "mcts_value": 0.08906644020564398,
+ "consensus_score": 0.31128470958005844,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4928,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7683603359658501,
+ "trm_confidence": 0.507513472723641,
+ "mcts_value": 0.370259011369696,
+ "consensus_score": 0.4529421423043878,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 569,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8967618582948522,
+ "trm_confidence": 0.21913424302132728,
+ "mcts_value": 0.05174711649358016,
+ "consensus_score": 0.30020140377612037,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4028,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9546376339408267,
+ "trm_confidence": 0.1328082263232585,
+ "mcts_value": 0.5183944175395805,
+ "consensus_score": 0.524217698203371,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1919,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9408256750636647,
+ "trm_confidence": 0.7509014583599367,
+ "mcts_value": 0.009263096790295754,
+ "consensus_score": 0.5027175785440559,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1692,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.745011873474463,
+ "trm_confidence": 0.13080224976016785,
+ "mcts_value": 0.4256989638627555,
+ "consensus_score": 0.4788010601485034,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4799,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8482190810985331,
+ "trm_confidence": 0.22466917418298077,
+ "mcts_value": 0.3570747693720694,
+ "consensus_score": 0.44858760028451655,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 3672,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8056637480695372,
+ "trm_confidence": 0.5020157867165614,
+ "mcts_value": 0.013010889083076554,
+ "consensus_score": 0.42806584393091207,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1246,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9329945307666148,
+ "trm_confidence": 0.675158490223123,
+ "mcts_value": 0.5928470563406139,
+ "consensus_score": 0.6542262316724003,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 3108,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7736698116428354,
+ "trm_confidence": 0.028665874979164216,
+ "mcts_value": 0.15062604934972768,
+ "consensus_score": 0.3086327923371035,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 425,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9473246628050478,
+ "trm_confidence": 0.7843571444504731,
+ "mcts_value": 0.1706073965477051,
+ "consensus_score": 0.7200109843811864,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 1525,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9855786370242724,
+ "trm_confidence": 0.010672872423246276,
+ "mcts_value": 0.6757135888627713,
+ "consensus_score": 0.651362013950732,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 1171,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8650970517092917,
+ "trm_confidence": 0.05979703219928956,
+ "mcts_value": 0.6501712098150373,
+ "consensus_score": 0.5477874157363418,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2410,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8715525929563327,
+ "trm_confidence": 0.11849487677955242,
+ "mcts_value": 0.5707714821884884,
+ "consensus_score": 0.43527644544509964,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 925,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8417406032693464,
+ "trm_confidence": 0.3781618832841875,
+ "mcts_value": 0.2315820031465227,
+ "consensus_score": 0.5777198799784697,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 126,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7524422158699015,
+ "trm_confidence": 0.14326299569614181,
+ "mcts_value": 0.3026824843410715,
+ "consensus_score": 0.4879762453652392,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 498,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.9578683400256693,
+ "trm_confidence": 0.3244462548711551,
+ "mcts_value": 0.730665495080943,
+ "consensus_score": 0.6193332829221486,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2393,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7773682027443383,
+ "trm_confidence": 0.4693664392989025,
+ "mcts_value": 0.6687741749271744,
+ "consensus_score": 0.5686225964387391,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3358,
+ "has_rag_context": false
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.801920784324709,
+ "trm_confidence": 0.05000797943196162,
+ "mcts_value": 0.33362689209879987,
+ "consensus_score": 0.4459393545472813,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 2562,
+ "has_rag_context": true
+ },
+ "label": "hrm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.44399049870475243,
+ "trm_confidence": 0.9537098201940917,
+ "mcts_value": 0.7194485575615861,
+ "consensus_score": 0.6949941905822187,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 1264,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5618301748473334,
+ "trm_confidence": 0.7347696595957748,
+ "mcts_value": 0.3105706986357273,
+ "consensus_score": 0.4640513046208595,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 943,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1760292584653908,
+ "trm_confidence": 0.7142117520188469,
+ "mcts_value": 0.08585918085299075,
+ "consensus_score": 0.3135822163691997,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2713,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.021914319138401007,
+ "trm_confidence": 0.783890898730759,
+ "mcts_value": 0.4201730665660401,
+ "consensus_score": 0.3834030287215369,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3801,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15000280015235107,
+ "trm_confidence": 0.7041332065882151,
+ "mcts_value": 0.27581539819966233,
+ "consensus_score": 0.3554648631471029,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 39,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.45055688875567235,
+ "trm_confidence": 0.8591323538809237,
+ "mcts_value": 0.016572915728581297,
+ "consensus_score": 0.44511406149096355,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 4273,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.49324445079443197,
+ "trm_confidence": 0.730086219548575,
+ "mcts_value": 0.5937959316784491,
+ "consensus_score": 0.5101626314664054,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 1243,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5398238930337239,
+ "trm_confidence": 0.7854422765949668,
+ "mcts_value": 0.43344288561357935,
+ "consensus_score": 0.6720688782016373,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 3688,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.40120532900118133,
+ "trm_confidence": 0.8334813283081018,
+ "mcts_value": 0.10763342161337451,
+ "consensus_score": 0.41469719239302494,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 958,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6199851429898666,
+ "trm_confidence": 0.8357523794573152,
+ "mcts_value": 0.23831112475494545,
+ "consensus_score": 0.5606425829517397,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 1462,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.033064328666376076,
+ "trm_confidence": 0.7392763936439497,
+ "mcts_value": 0.6043824969206342,
+ "consensus_score": 0.41206962239866646,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 238,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6762765013148828,
+ "trm_confidence": 0.8270965361651815,
+ "mcts_value": 0.3653027473934347,
+ "consensus_score": 0.717118958651632,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 4406,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38453742310423844,
+ "trm_confidence": 0.9578592375427976,
+ "mcts_value": 0.23976082519376188,
+ "consensus_score": 0.5984809012971757,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1653,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16652177228253073,
+ "trm_confidence": 0.9929581199245436,
+ "mcts_value": 0.2691381459097174,
+ "consensus_score": 0.3824199327195652,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 2820,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14578435320948174,
+ "trm_confidence": 0.9040104806223674,
+ "mcts_value": 0.542556525391946,
+ "consensus_score": 0.4834788720457953,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 2008,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3802169571295888,
+ "trm_confidence": 0.9150255250490829,
+ "mcts_value": 0.2944678247370635,
+ "consensus_score": 0.6004279794464684,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 1336,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2981151404941536,
+ "trm_confidence": 0.9700273834431048,
+ "mcts_value": 0.0339131258470358,
+ "consensus_score": 0.43293859472298857,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4764,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10775691844490401,
+ "trm_confidence": 0.9241332618062232,
+ "mcts_value": 0.6650295757125843,
+ "consensus_score": 0.6530213762807613,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 763,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.8082211137175843,
+ "trm_confidence": 0.9140759829722251,
+ "mcts_value": 0.2780380965244117,
+ "consensus_score": 0.5993497079866142,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4129,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4005895980957989,
+ "trm_confidence": 0.7533737464888433,
+ "mcts_value": 0.009228730029908863,
+ "consensus_score": 0.473264372029028,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 1301,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15195141330764292,
+ "trm_confidence": 0.732474832442457,
+ "mcts_value": 0.17323875809406383,
+ "consensus_score": 0.2591170011809336,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 1372,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.27485041219425316,
+ "trm_confidence": 0.8135862782303556,
+ "mcts_value": 0.24191127190809028,
+ "consensus_score": 0.4090340974837071,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3025,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4998685572957993,
+ "trm_confidence": 0.7606342900469399,
+ "mcts_value": 0.6545629060364406,
+ "consensus_score": 0.7240604861378592,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3997,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5374037880661997,
+ "trm_confidence": 0.7943744707107527,
+ "mcts_value": 0.5310275204057412,
+ "consensus_score": 0.6897897312440535,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 857,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6693614822574055,
+ "trm_confidence": 0.9300626704321593,
+ "mcts_value": 0.2560408179709783,
+ "consensus_score": 0.7043367723547445,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 4636,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07751066073395808,
+ "trm_confidence": 0.9310272154476609,
+ "mcts_value": 0.545344697158897,
+ "consensus_score": 0.4247967399658241,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 3699,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6689706808174918,
+ "trm_confidence": 0.8807230759274343,
+ "mcts_value": 0.07641973561121224,
+ "consensus_score": 0.497222599193542,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4261,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19478995496854004,
+ "trm_confidence": 0.8344648262713499,
+ "mcts_value": 0.2884500524339905,
+ "consensus_score": 0.42425547095642147,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 933,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.369795165737947,
+ "trm_confidence": 0.9681229386148816,
+ "mcts_value": 0.28457740178950947,
+ "consensus_score": 0.5413342452478086,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2344,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11397331765338067,
+ "trm_confidence": 0.8508037974388201,
+ "mcts_value": 0.26423033765407056,
+ "consensus_score": 0.4228271947902419,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2422,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6565513876998769,
+ "trm_confidence": 0.7770165284605498,
+ "mcts_value": 0.5312697783633291,
+ "consensus_score": 0.7484645033740798,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 838,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12883318501989313,
+ "trm_confidence": 0.8579380271905042,
+ "mcts_value": 0.6201656254436124,
+ "consensus_score": 0.5524780423879958,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 838,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.35780216754323463,
+ "trm_confidence": 0.7345558159942782,
+ "mcts_value": 0.45754047753151095,
+ "consensus_score": 0.4284025278228275,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 911,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.31772290031448275,
+ "trm_confidence": 0.7202581943254159,
+ "mcts_value": 0.16037220844841238,
+ "consensus_score": 0.43203599133682113,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 2005,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1615036406046095,
+ "trm_confidence": 0.9684185745144196,
+ "mcts_value": 0.2807300984894874,
+ "consensus_score": 0.4877444328001141,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2460,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06737428867692129,
+ "trm_confidence": 0.7468873965882925,
+ "mcts_value": 0.646735595995737,
+ "consensus_score": 0.47806951804234776,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1293,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07693929077895703,
+ "trm_confidence": 0.976707291563801,
+ "mcts_value": 0.09996145642120899,
+ "consensus_score": 0.47046245046616725,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1728,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3196784718584861,
+ "trm_confidence": 0.8065494660981616,
+ "mcts_value": 0.2940388128620583,
+ "consensus_score": 0.4114312934648213,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4297,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05450232063485843,
+ "trm_confidence": 0.9115817228432954,
+ "mcts_value": 0.0314498724043145,
+ "consensus_score": 0.36325739980954025,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 2946,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5967026785359415,
+ "trm_confidence": 0.728206360650649,
+ "mcts_value": 0.47022154914127223,
+ "consensus_score": 0.6736303093811141,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 4255,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38166291205082625,
+ "trm_confidence": 0.8105032492619032,
+ "mcts_value": 0.21092602355647808,
+ "consensus_score": 0.37982354339428354,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 2722,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14097898988819313,
+ "trm_confidence": 0.8157763874389644,
+ "mcts_value": 0.18745924086930824,
+ "consensus_score": 0.406795340818609,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1230,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5867465316361682,
+ "trm_confidence": 0.8601716898167119,
+ "mcts_value": 0.2880046945944309,
+ "consensus_score": 0.595811537193605,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1504,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6928280035829012,
+ "trm_confidence": 0.9729392729552633,
+ "mcts_value": 0.051257843268573244,
+ "consensus_score": 0.5581614808960973,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 2698,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06843046817694912,
+ "trm_confidence": 0.8375128873484537,
+ "mcts_value": 0.41448900915880876,
+ "consensus_score": 0.3654568799861959,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 158,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7102941809372267,
+ "trm_confidence": 0.917029410513186,
+ "mcts_value": 0.513869027705941,
+ "consensus_score": 0.7362187389027544,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 2958,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03135605725763654,
+ "trm_confidence": 0.9607161358831008,
+ "mcts_value": 0.4787219630754682,
+ "consensus_score": 0.43351858069824545,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 3755,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13657622439010203,
+ "trm_confidence": 0.8260667437097314,
+ "mcts_value": 0.24615378011227454,
+ "consensus_score": 0.4436877883120809,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3462,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6621184400767522,
+ "trm_confidence": 0.9664086274682653,
+ "mcts_value": 0.1731040516466299,
+ "consensus_score": 0.5590060150832142,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 833,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24092127909679925,
+ "trm_confidence": 0.8831566012641474,
+ "mcts_value": 0.23426183125810168,
+ "consensus_score": 0.3606846276110622,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3088,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4387600739311695,
+ "trm_confidence": 0.7014572013331837,
+ "mcts_value": 0.5707069543934159,
+ "consensus_score": 0.6216086860980519,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 1208,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.23726287854895656,
+ "trm_confidence": 0.8308409267947315,
+ "mcts_value": 0.3812900998175645,
+ "consensus_score": 0.4693614931486405,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 3169,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07075106842511865,
+ "trm_confidence": 0.9985028251971431,
+ "mcts_value": 0.3456391869928632,
+ "consensus_score": 0.4077722660250702,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 748,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4984535248654349,
+ "trm_confidence": 0.7523611394880365,
+ "mcts_value": 0.3237594897871654,
+ "consensus_score": 0.6087071678236574,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 4766,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05787850903063726,
+ "trm_confidence": 0.8209820748791032,
+ "mcts_value": 0.18942332955531607,
+ "consensus_score": 0.45330058643701027,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 620,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38450059625280597,
+ "trm_confidence": 0.8720922006387127,
+ "mcts_value": 0.4481518822223316,
+ "consensus_score": 0.5435952820129928,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1542,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.502503780043091,
+ "trm_confidence": 0.8285177937524397,
+ "mcts_value": 0.63252569593037,
+ "consensus_score": 0.6284300778466518,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3271,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.04501556211515551,
+ "trm_confidence": 0.7510957264335486,
+ "mcts_value": 0.473920138132877,
+ "consensus_score": 0.3809558026382885,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 17,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7175235392332914,
+ "trm_confidence": 0.9587643401224977,
+ "mcts_value": 0.07636883334349913,
+ "consensus_score": 0.4889702055245333,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 2725,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.367568488911526,
+ "trm_confidence": 0.7704110467155687,
+ "mcts_value": 0.28575882241038536,
+ "consensus_score": 0.5336845579248805,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 149,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1346253143879693,
+ "trm_confidence": 0.9967275818499708,
+ "mcts_value": 0.7449151342248853,
+ "consensus_score": 0.5357726259417187,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 500,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5478730718912188,
+ "trm_confidence": 0.8995641896812363,
+ "mcts_value": 0.21053906941647224,
+ "consensus_score": 0.5363737352685999,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 1756,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2414694756928368,
+ "trm_confidence": 0.8386683790431004,
+ "mcts_value": 0.2256488687693059,
+ "consensus_score": 0.3854328194994388,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 213,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5188049086813686,
+ "trm_confidence": 0.9616312906010721,
+ "mcts_value": 0.11527457054933798,
+ "consensus_score": 0.5706754567857266,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 498,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.005559683494021258,
+ "trm_confidence": 0.7451801783011791,
+ "mcts_value": 0.030484982048218194,
+ "consensus_score": 0.31285855094184184,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 1355,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05924469930123762,
+ "trm_confidence": 0.9244212877239678,
+ "mcts_value": 0.4881669060107118,
+ "consensus_score": 0.5418260208309533,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2810,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7717402512901582,
+ "trm_confidence": 0.8785568056376277,
+ "mcts_value": 0.2005697978017373,
+ "consensus_score": 0.5548538736806177,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 1961,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6171263197611457,
+ "trm_confidence": 0.878933529808082,
+ "mcts_value": 0.5865940232253513,
+ "consensus_score": 0.7727262565572104,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 1025,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22804455185613456,
+ "trm_confidence": 0.8667632214627283,
+ "mcts_value": 0.2528673000162186,
+ "consensus_score": 0.5356138982910263,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 4544,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3896818826228778,
+ "trm_confidence": 0.8250737610156234,
+ "mcts_value": 0.3531946095675046,
+ "consensus_score": 0.44410749590674725,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 4884,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5777041376588443,
+ "trm_confidence": 0.7202122915399018,
+ "mcts_value": 0.07680937377083678,
+ "consensus_score": 0.39783808360657574,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4191,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.631420389676864,
+ "trm_confidence": 0.9700834298846228,
+ "mcts_value": 0.05970587071489856,
+ "consensus_score": 0.5921059756825924,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 1318,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6094912560302467,
+ "trm_confidence": 0.9484593557468007,
+ "mcts_value": 0.4973467618396553,
+ "consensus_score": 0.6495917882552565,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 960,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5157020450568663,
+ "trm_confidence": 0.8762565341586622,
+ "mcts_value": 0.33932349902157877,
+ "consensus_score": 0.6283181880276507,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2497,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3564181702938383,
+ "trm_confidence": 0.7607215838511286,
+ "mcts_value": 0.3180551149056891,
+ "consensus_score": 0.4927122465848959,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 2415,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4534609618708469,
+ "trm_confidence": 0.8748934786196683,
+ "mcts_value": 0.509363800249543,
+ "consensus_score": 0.6566159527138855,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1434,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.42324868604069527,
+ "trm_confidence": 0.8874281904053775,
+ "mcts_value": 0.477336157029331,
+ "consensus_score": 0.5162430178424677,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 4223,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5852719319442111,
+ "trm_confidence": 0.9618961646205122,
+ "mcts_value": 0.07792237608563882,
+ "consensus_score": 0.5601518143056683,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 84,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5033887527895373,
+ "trm_confidence": 0.9482322557526242,
+ "mcts_value": 0.5016128404374516,
+ "consensus_score": 0.6187962383020338,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 4188,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4246367125632823,
+ "trm_confidence": 0.9169960992901469,
+ "mcts_value": 0.3008444231972218,
+ "consensus_score": 0.4575683593412612,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1885,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6722444983329169,
+ "trm_confidence": 0.8120150629776024,
+ "mcts_value": 0.35967682625370156,
+ "consensus_score": 0.5616134951732532,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1137,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3238170141971061,
+ "trm_confidence": 0.9447191906653171,
+ "mcts_value": 0.6525504792475115,
+ "consensus_score": 0.7009032232620207,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4619,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.356989558313623,
+ "trm_confidence": 0.9877288190113332,
+ "mcts_value": 0.04522453596626121,
+ "consensus_score": 0.5479748424819816,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 3248,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16212506707963395,
+ "trm_confidence": 0.8656759361032029,
+ "mcts_value": 0.6274337243421171,
+ "consensus_score": 0.5514649168534099,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 3140,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7339216286610662,
+ "trm_confidence": 0.9851488809408118,
+ "mcts_value": 0.24536162360623354,
+ "consensus_score": 0.5654757280560386,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 3298,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48798433603059405,
+ "trm_confidence": 0.8597245959138609,
+ "mcts_value": 0.35144864419324634,
+ "consensus_score": 0.5503389321323188,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1278,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5603984069163154,
+ "trm_confidence": 0.7698196440028562,
+ "mcts_value": 0.09768133976289063,
+ "consensus_score": 0.42130619394362945,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4975,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.36557083789533973,
+ "trm_confidence": 0.7489620054153361,
+ "mcts_value": 0.23887854732585437,
+ "consensus_score": 0.47636898738027056,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2177,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.788803394620058,
+ "trm_confidence": 0.9740300979439191,
+ "mcts_value": 0.7246475016671664,
+ "consensus_score": 0.8554175148715983,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3086,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.013303586213472012,
+ "trm_confidence": 0.907148665365344,
+ "mcts_value": 0.7733231465659587,
+ "consensus_score": 0.5817125516747075,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 725,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3077365828115015,
+ "trm_confidence": 0.7037191641747388,
+ "mcts_value": 0.3062003949110987,
+ "consensus_score": 0.42661657162807,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 4154,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.28455970154793864,
+ "trm_confidence": 0.8582634623491548,
+ "mcts_value": 0.7354639078841835,
+ "consensus_score": 0.5606811521471453,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 4690,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3809472177227391,
+ "trm_confidence": 0.862992752870787,
+ "mcts_value": 0.49116930984838786,
+ "consensus_score": 0.6683701142621463,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 2749,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15940416182721573,
+ "trm_confidence": 0.707512060901555,
+ "mcts_value": 0.2817153276359186,
+ "consensus_score": 0.28907614115112334,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 3080,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4256674726587186,
+ "trm_confidence": 0.9240195144514141,
+ "mcts_value": 0.4976621895912099,
+ "consensus_score": 0.594996109156145,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3633,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7929585788152211,
+ "trm_confidence": 0.8964987772604393,
+ "mcts_value": 0.012587260670319693,
+ "consensus_score": 0.5788649049472699,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2383,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4733439573332464,
+ "trm_confidence": 0.863561470366792,
+ "mcts_value": 0.5319923236550607,
+ "consensus_score": 0.6947591670570525,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 698,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5082465744857824,
+ "trm_confidence": 0.707805221065471,
+ "mcts_value": 0.04664323192977533,
+ "consensus_score": 0.3598239848146607,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3014,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11624612534982769,
+ "trm_confidence": 0.8003077680152176,
+ "mcts_value": 0.3159019436862092,
+ "consensus_score": 0.391292788984855,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3190,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09900207687467694,
+ "trm_confidence": 0.8957701755301245,
+ "mcts_value": 0.2350631845562408,
+ "consensus_score": 0.4926088050201367,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 1857,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.839200706210048,
+ "trm_confidence": 0.9827513857700444,
+ "mcts_value": 0.3629901743729547,
+ "consensus_score": 0.7617934664041085,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3531,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08786009173036866,
+ "trm_confidence": 0.7094743992078996,
+ "mcts_value": 0.3743384467839459,
+ "consensus_score": 0.3370680983960174,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 267,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7094734398843789,
+ "trm_confidence": 0.9659936674383465,
+ "mcts_value": 0.4267944139338113,
+ "consensus_score": 0.755332857837817,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 684,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5675345712066688,
+ "trm_confidence": 0.7592446155589834,
+ "mcts_value": 0.5426028944636934,
+ "consensus_score": 0.7035572940915822,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 4548,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6229253486081775,
+ "trm_confidence": 0.7685008694895777,
+ "mcts_value": 0.381856515082727,
+ "consensus_score": 0.5755932611771244,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 1734,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16567819614037949,
+ "trm_confidence": 0.7865659020882856,
+ "mcts_value": 0.05659224693895597,
+ "consensus_score": 0.3355917611519778,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 2722,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3241416909846743,
+ "trm_confidence": 0.856292689826732,
+ "mcts_value": 0.351223555127338,
+ "consensus_score": 0.452408114827305,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 1895,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.619377773100635,
+ "trm_confidence": 0.9149868270181939,
+ "mcts_value": 0.4685106965131719,
+ "consensus_score": 0.7402103738682992,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 4145,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6540348403488278,
+ "trm_confidence": 0.9188231378797913,
+ "mcts_value": 0.5441732431263956,
+ "consensus_score": 0.7749456771829988,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 895,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6285553989241485,
+ "trm_confidence": 0.944174705112871,
+ "mcts_value": 0.38230153859435984,
+ "consensus_score": 0.7370670141569622,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2051,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14264086330260134,
+ "trm_confidence": 0.7317827070972884,
+ "mcts_value": 0.531137302466143,
+ "consensus_score": 0.4478308355211137,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 2832,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6023345605360149,
+ "trm_confidence": 0.8474831002892362,
+ "mcts_value": 0.6691200500211939,
+ "consensus_score": 0.6203354566909851,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 1953,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3507914131324489,
+ "trm_confidence": 0.8842845551414735,
+ "mcts_value": 0.3126509759824731,
+ "consensus_score": 0.5102851239046965,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1140,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6065421070084859,
+ "trm_confidence": 0.8416433353061938,
+ "mcts_value": 0.5530048631860084,
+ "consensus_score": 0.6353391340031346,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2918,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3764406418636302,
+ "trm_confidence": 0.7460476616122571,
+ "mcts_value": 0.18432514629470334,
+ "consensus_score": 0.483438947642208,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3714,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34580787111028655,
+ "trm_confidence": 0.7765373968172902,
+ "mcts_value": 0.6417948119039376,
+ "consensus_score": 0.5643082009684584,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 1183,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1430520289014284,
+ "trm_confidence": 0.962455254641549,
+ "mcts_value": 0.7069185384482377,
+ "consensus_score": 0.5105718196838507,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4681,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4297221008494155,
+ "trm_confidence": 0.9066256638279759,
+ "mcts_value": 0.3128610767072595,
+ "consensus_score": 0.6121956697556076,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 118,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19953631197384059,
+ "trm_confidence": 0.9677453011628979,
+ "mcts_value": 0.14785643143045163,
+ "consensus_score": 0.46694010105577965,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 377,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09414792784677961,
+ "trm_confidence": 0.7749011136801479,
+ "mcts_value": 0.5807953018467219,
+ "consensus_score": 0.42856095587348325,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2635,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33591852705329256,
+ "trm_confidence": 0.7036422702766131,
+ "mcts_value": 0.3092393207762463,
+ "consensus_score": 0.5110895146507483,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 3341,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48181244318081007,
+ "trm_confidence": 0.7503813941796591,
+ "mcts_value": 0.5103600743829874,
+ "consensus_score": 0.5081839680702516,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2594,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17949808914376916,
+ "trm_confidence": 0.7008121821027361,
+ "mcts_value": 0.3334761664417101,
+ "consensus_score": 0.37141425116139576,
+ "last_agent": "trm",
+ "iteration": 2,
+ "query_length": 2971,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3466169105373836,
+ "trm_confidence": 0.7544569324329081,
+ "mcts_value": 0.46250204482550167,
+ "consensus_score": 0.5362509531242983,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1516,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.49903474535210157,
+ "trm_confidence": 0.7524933512462464,
+ "mcts_value": 0.2352663374726087,
+ "consensus_score": 0.5521504143015393,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 3875,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.024672623991961514,
+ "trm_confidence": 0.8491077538652446,
+ "mcts_value": 0.16877198340705082,
+ "consensus_score": 0.39231990419162194,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 550,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6895519670085203,
+ "trm_confidence": 0.9646233128314313,
+ "mcts_value": 0.7519493168716266,
+ "consensus_score": 0.8417272431529607,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2840,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7314607128206364,
+ "trm_confidence": 0.8958361673473945,
+ "mcts_value": 0.07999904434156259,
+ "consensus_score": 0.532251537072416,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 2961,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.49541895353943594,
+ "trm_confidence": 0.8335796397998163,
+ "mcts_value": 0.6170285539474784,
+ "consensus_score": 0.7468757653775413,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 1504,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.042337164682284135,
+ "trm_confidence": 0.966226869441021,
+ "mcts_value": 0.6532135052245751,
+ "consensus_score": 0.5248709106886035,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2778,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09166962753153295,
+ "trm_confidence": 0.777922675625991,
+ "mcts_value": 0.07928118116289253,
+ "consensus_score": 0.2369557329818372,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1110,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4600966895664224,
+ "trm_confidence": 0.7649900314617617,
+ "mcts_value": 0.4567481143485402,
+ "consensus_score": 0.5684452702636409,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2275,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06392551939140412,
+ "trm_confidence": 0.9507588913807878,
+ "mcts_value": 0.6294178813732334,
+ "consensus_score": 0.6457235801481944,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 417,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.27840269980272137,
+ "trm_confidence": 0.8880298192981482,
+ "mcts_value": 0.5354546137596964,
+ "consensus_score": 0.6213747875897874,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 58,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3684733249395634,
+ "trm_confidence": 0.7183912197320327,
+ "mcts_value": 0.07360348928504164,
+ "consensus_score": 0.33577568669676794,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 2862,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1085917895613392,
+ "trm_confidence": 0.7551424371635501,
+ "mcts_value": 0.029457983415894704,
+ "consensus_score": 0.3723787239843709,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 2098,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3399464330694429,
+ "trm_confidence": 0.7084876916914343,
+ "mcts_value": 0.17191870566237222,
+ "consensus_score": 0.42612954689413557,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 551,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4661264689151281,
+ "trm_confidence": 0.7863427044884104,
+ "mcts_value": 0.037099782425568675,
+ "consensus_score": 0.5210284254474875,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 4645,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1626991710875187,
+ "trm_confidence": 0.839563741469072,
+ "mcts_value": 0.6009076245985423,
+ "consensus_score": 0.5816609984811155,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 4304,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.610584907402012,
+ "trm_confidence": 0.8388396813450355,
+ "mcts_value": 0.5052515886628856,
+ "consensus_score": 0.680192234511322,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 2476,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03248932695593583,
+ "trm_confidence": 0.8115133779488601,
+ "mcts_value": 0.5599191457885969,
+ "consensus_score": 0.45457359326509383,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3796,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3896249423489808,
+ "trm_confidence": 0.7628281844378286,
+ "mcts_value": 0.15414269285299517,
+ "consensus_score": 0.34837371019302776,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3598,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.30799173649505523,
+ "trm_confidence": 0.8140706751697913,
+ "mcts_value": 0.5819996073446608,
+ "consensus_score": 0.5321880521911325,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1375,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3445939379059129,
+ "trm_confidence": 0.8982604587774398,
+ "mcts_value": 0.5969309674681113,
+ "consensus_score": 0.6048412620007652,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3424,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07248691640425747,
+ "trm_confidence": 0.7953167004427881,
+ "mcts_value": 0.2884535276545265,
+ "consensus_score": 0.31019042439305106,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3260,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0318471571811411,
+ "trm_confidence": 0.9792133543616941,
+ "mcts_value": 0.5463945301768476,
+ "consensus_score": 0.5106987179032431,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 2172,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.137782560158677,
+ "trm_confidence": 0.7355455757064404,
+ "mcts_value": 0.17450190345222677,
+ "consensus_score": 0.32953794684888726,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2690,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06068349066973206,
+ "trm_confidence": 0.7485608081886223,
+ "mcts_value": 0.3564966240656323,
+ "consensus_score": 0.3543952817215828,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1661,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5716627543553109,
+ "trm_confidence": 0.7072688062262754,
+ "mcts_value": 0.06577214345905116,
+ "consensus_score": 0.3736063176058892,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 4420,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4792595224651311,
+ "trm_confidence": 0.8852980720509989,
+ "mcts_value": 0.7748521953486893,
+ "consensus_score": 0.6308243504765374,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2024,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3305727687781483,
+ "trm_confidence": 0.8869214510686106,
+ "mcts_value": 0.3021399182018856,
+ "consensus_score": 0.592793378378774,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 2143,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4630549898545562,
+ "trm_confidence": 0.8137375183274645,
+ "mcts_value": 0.6908313485013811,
+ "consensus_score": 0.5673948289702623,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1973,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12103597238851165,
+ "trm_confidence": 0.9113190636932357,
+ "mcts_value": 0.4327473909675736,
+ "consensus_score": 0.5230702186281557,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 1658,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5752077479652343,
+ "trm_confidence": 0.773235722525673,
+ "mcts_value": 0.023207103765942348,
+ "consensus_score": 0.49898927377377894,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2778,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43705698931283243,
+ "trm_confidence": 0.7483137048459819,
+ "mcts_value": 0.0388928802268296,
+ "consensus_score": 0.4260458548997197,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2065,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.49476042604650194,
+ "trm_confidence": 0.8095554109063201,
+ "mcts_value": 0.4216848270686829,
+ "consensus_score": 0.5168609890926189,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 3593,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.37745678874719507,
+ "trm_confidence": 0.900049985707135,
+ "mcts_value": 0.2572833569043462,
+ "consensus_score": 0.4287739523404067,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 3286,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19934137811910552,
+ "trm_confidence": 0.9959877752574893,
+ "mcts_value": 0.7301188829975164,
+ "consensus_score": 0.5755449884856995,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 3725,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5567073453453225,
+ "trm_confidence": 0.7973216992424815,
+ "mcts_value": 0.667820394630638,
+ "consensus_score": 0.7490087098029837,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 477,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5885682385532721,
+ "trm_confidence": 0.7443840739914134,
+ "mcts_value": 0.33371270642218903,
+ "consensus_score": 0.5228312239336328,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 2627,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.616400595471275,
+ "trm_confidence": 0.7743840974982654,
+ "mcts_value": 0.5376609440049948,
+ "consensus_score": 0.736773953874989,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 3719,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3676503385665803,
+ "trm_confidence": 0.7771451422092329,
+ "mcts_value": 0.2169944690565093,
+ "consensus_score": 0.41062309332611235,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4963,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48208269744353577,
+ "trm_confidence": 0.8407400310916351,
+ "mcts_value": 0.09494959287482052,
+ "consensus_score": 0.5587096279787113,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2008,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.565282606096974,
+ "trm_confidence": 0.896171629186319,
+ "mcts_value": 0.5701631380009995,
+ "consensus_score": 0.7522257705070587,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4150,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38184870645861857,
+ "trm_confidence": 0.7298528928596371,
+ "mcts_value": 0.041792505404248316,
+ "consensus_score": 0.33275672524540734,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 4818,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3956058611748936,
+ "trm_confidence": 0.7481014374828604,
+ "mcts_value": 0.5125608281675542,
+ "consensus_score": 0.6164655197628086,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 4570,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3636044808504141,
+ "trm_confidence": 0.8689032280259077,
+ "mcts_value": 0.5037191858411157,
+ "consensus_score": 0.5207597230038525,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 3767,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5042486778558699,
+ "trm_confidence": 0.9487663423609357,
+ "mcts_value": 0.20782446818532185,
+ "consensus_score": 0.6027481629327691,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 1989,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0860525064016166,
+ "trm_confidence": 0.876995566672904,
+ "mcts_value": 0.06177547004860453,
+ "consensus_score": 0.3917751143950009,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 1594,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.578535731544378,
+ "trm_confidence": 0.9286219063090528,
+ "mcts_value": 0.23085907844237044,
+ "consensus_score": 0.6211278092362573,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 1251,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1637657571093312,
+ "trm_confidence": 0.7978487926010698,
+ "mcts_value": 0.27442913148171366,
+ "consensus_score": 0.46048109136816084,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 572,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09808957441878606,
+ "trm_confidence": 0.7942464664235027,
+ "mcts_value": 0.18531774254454983,
+ "consensus_score": 0.4311539880440103,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3206,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.053949052911980266,
+ "trm_confidence": 0.9369717945206762,
+ "mcts_value": 0.21249967947120593,
+ "consensus_score": 0.3471608064460887,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 653,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.003914871612376854,
+ "trm_confidence": 0.8853357809165663,
+ "mcts_value": 0.1447452394219721,
+ "consensus_score": 0.37177355591158734,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1187,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.371068074401949,
+ "trm_confidence": 0.9294172657425274,
+ "mcts_value": 0.6216606633028641,
+ "consensus_score": 0.7357961992476911,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 1580,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2177393810241916,
+ "trm_confidence": 0.9755993845183886,
+ "mcts_value": 0.7762695635461,
+ "consensus_score": 0.5760450510508233,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 137,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3886885142379816,
+ "trm_confidence": 0.8843818657757279,
+ "mcts_value": 0.4842358700709443,
+ "consensus_score": 0.5161014917841068,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 2527,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3443884439931525,
+ "trm_confidence": 0.8793338021890114,
+ "mcts_value": 0.19052828786785117,
+ "consensus_score": 0.5479787111122544,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 364,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1315284405602689,
+ "trm_confidence": 0.8229103229646983,
+ "mcts_value": 0.5289007220143683,
+ "consensus_score": 0.4841381476335501,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 2661,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6112116300243542,
+ "trm_confidence": 0.944729083478624,
+ "mcts_value": 0.73414717924184,
+ "consensus_score": 0.8447240816360495,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 45,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20920003901734632,
+ "trm_confidence": 0.8826423427616783,
+ "mcts_value": 0.1737092300537618,
+ "consensus_score": 0.34463198824139984,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1519,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2619664334960157,
+ "trm_confidence": 0.9282858938551011,
+ "mcts_value": 0.17001383871513442,
+ "consensus_score": 0.4441448910463415,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4000,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7900517104476441,
+ "trm_confidence": 0.9624469462822163,
+ "mcts_value": 0.7333061975991593,
+ "consensus_score": 0.9215178589969355,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4511,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24394936531507527,
+ "trm_confidence": 0.7250686096445434,
+ "mcts_value": 0.16107211186535897,
+ "consensus_score": 0.31068481388053937,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 1115,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6754832080856815,
+ "trm_confidence": 0.9878137816945693,
+ "mcts_value": 0.5690404010302622,
+ "consensus_score": 0.77931625562138,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 3555,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6151419235787853,
+ "trm_confidence": 0.7560076354843795,
+ "mcts_value": 0.1979787353524825,
+ "consensus_score": 0.47408006726392415,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2981,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.46509443130234623,
+ "trm_confidence": 0.8353896755029916,
+ "mcts_value": 0.11728955828719136,
+ "consensus_score": 0.5072568831779007,
+ "last_agent": "mcts",
+ "iteration": 2,
+ "query_length": 4989,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21559775166278877,
+ "trm_confidence": 0.7486539804831354,
+ "mcts_value": 0.21486731881020213,
+ "consensus_score": 0.3838754859924959,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 3421,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09827180114003921,
+ "trm_confidence": 0.8625215714989685,
+ "mcts_value": 0.525298829132584,
+ "consensus_score": 0.5841534924297165,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 1177,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.027041610671315285,
+ "trm_confidence": 0.7294949773377593,
+ "mcts_value": 0.45409808520273953,
+ "consensus_score": 0.455749205666904,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2817,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3136839190701999,
+ "trm_confidence": 0.9330721164870491,
+ "mcts_value": 0.2527021236687303,
+ "consensus_score": 0.4344241706773192,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 361,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5423567683286818,
+ "trm_confidence": 0.9481024043823998,
+ "mcts_value": 0.6234234921929721,
+ "consensus_score": 0.7469709789523773,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 1487,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3592651604263909,
+ "trm_confidence": 0.7002473850504056,
+ "mcts_value": 0.21240272293132584,
+ "consensus_score": 0.3487292223594683,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 2419,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03534708308710173,
+ "trm_confidence": 0.7210700112660109,
+ "mcts_value": 0.5689164364087479,
+ "consensus_score": 0.40419839345802605,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 611,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4595328841269329,
+ "trm_confidence": 0.7427713846981341,
+ "mcts_value": 0.3861801905176152,
+ "consensus_score": 0.5676242344019248,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 342,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.37637053417760125,
+ "trm_confidence": 0.7229915984352644,
+ "mcts_value": 0.6146379513451925,
+ "consensus_score": 0.6057302688509411,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 2800,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03228201472915102,
+ "trm_confidence": 0.9034034662515389,
+ "mcts_value": 0.6965553049876887,
+ "consensus_score": 0.5170159200652188,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 195,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22673522654323894,
+ "trm_confidence": 0.9811045002628791,
+ "mcts_value": 0.43307446871534455,
+ "consensus_score": 0.4817267760636323,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 4955,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.706902151582988,
+ "trm_confidence": 0.9181032717884066,
+ "mcts_value": 0.1172332492359439,
+ "consensus_score": 0.5384099377645399,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3042,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5361609677126532,
+ "trm_confidence": 0.8660294109660951,
+ "mcts_value": 0.49629320506058805,
+ "consensus_score": 0.610438570706895,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 2899,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6022105841966343,
+ "trm_confidence": 0.7462911156168053,
+ "mcts_value": 0.492155691889362,
+ "consensus_score": 0.6981002830659674,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 3711,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3660627178418953,
+ "trm_confidence": 0.9784715944336495,
+ "mcts_value": 0.0031767683220894345,
+ "consensus_score": 0.36175125003510994,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 4906,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4947540701137711,
+ "trm_confidence": 0.8055107172550791,
+ "mcts_value": 0.5578112066415875,
+ "consensus_score": 0.6374013195390709,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4264,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.009179028716510931,
+ "trm_confidence": 0.8064405757412797,
+ "mcts_value": 0.19821918663560692,
+ "consensus_score": 0.3040663745259947,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2701,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5759017228908051,
+ "trm_confidence": 0.9279779192782271,
+ "mcts_value": 0.11509313567569202,
+ "consensus_score": 0.6012243745561644,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 2335,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38411527252869104,
+ "trm_confidence": 0.8944281790844549,
+ "mcts_value": 0.3490320696210835,
+ "consensus_score": 0.46494980643922096,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 1838,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08605914289116712,
+ "trm_confidence": 0.7492760161762799,
+ "mcts_value": 0.6194796224353809,
+ "consensus_score": 0.4419111985897207,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 3590,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48948486094393795,
+ "trm_confidence": 0.9694751335534431,
+ "mcts_value": 0.7991866758820104,
+ "consensus_score": 0.7127005379587096,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 4412,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5642706425765245,
+ "trm_confidence": 0.8036610975370052,
+ "mcts_value": 0.6188160411145123,
+ "consensus_score": 0.6567639625907616,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2564,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33312734089856844,
+ "trm_confidence": 0.7812761180158937,
+ "mcts_value": 0.025418611170304407,
+ "consensus_score": 0.440740922080869,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 4051,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03307344027948186,
+ "trm_confidence": 0.869594964101483,
+ "mcts_value": 0.06104188617818761,
+ "consensus_score": 0.36046003504232005,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 81,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16002249463825832,
+ "trm_confidence": 0.801608690026225,
+ "mcts_value": 0.13244891523472083,
+ "consensus_score": 0.41737562537958395,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1632,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5280922234212754,
+ "trm_confidence": 0.9879043657522382,
+ "mcts_value": 0.5697330752523868,
+ "consensus_score": 0.7372284971038164,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2780,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.39737379960595426,
+ "trm_confidence": 0.8450405954043092,
+ "mcts_value": 0.2412642627233361,
+ "consensus_score": 0.41561107133448993,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2378,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06432983471771407,
+ "trm_confidence": 0.7826905153250104,
+ "mcts_value": 0.13020762131912073,
+ "consensus_score": 0.35348883970009126,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 692,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.27174780465748594,
+ "trm_confidence": 0.7270515929468473,
+ "mcts_value": 0.2211892338025744,
+ "consensus_score": 0.4507547077068259,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2732,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7209489134411255,
+ "trm_confidence": 0.8460946246914985,
+ "mcts_value": 0.564267053498185,
+ "consensus_score": 0.709832775968962,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 670,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2817517105607826,
+ "trm_confidence": 0.982752778995942,
+ "mcts_value": 0.22593093578389653,
+ "consensus_score": 0.596693593524408,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1684,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.01577439131129125,
+ "trm_confidence": 0.7185656431076729,
+ "mcts_value": 0.37090365019669297,
+ "consensus_score": 0.2827274615509427,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 4032,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.47099881769875523,
+ "trm_confidence": 0.8638162200244239,
+ "mcts_value": 0.22295367841930777,
+ "consensus_score": 0.43205074719581693,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4342,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1947221200387112,
+ "trm_confidence": 0.7256672923055179,
+ "mcts_value": 0.007799355637569309,
+ "consensus_score": 0.296203379145196,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 717,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6616562017705362,
+ "trm_confidence": 0.8812921911396764,
+ "mcts_value": 0.31759699748431286,
+ "consensus_score": 0.6899362138102705,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2900,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2428662903497912,
+ "trm_confidence": 0.9832436939981397,
+ "mcts_value": 0.023142295682260235,
+ "consensus_score": 0.36495223526773574,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 945,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6688805237376906,
+ "trm_confidence": 0.8574002528330715,
+ "mcts_value": 0.22141704712565802,
+ "consensus_score": 0.5867200628239739,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 3712,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0670001734640036,
+ "trm_confidence": 0.7472804539865435,
+ "mcts_value": 0.3255789958687113,
+ "consensus_score": 0.4143307249261845,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 694,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3267956106620368,
+ "trm_confidence": 0.9321549283957002,
+ "mcts_value": 0.027350993797126976,
+ "consensus_score": 0.3519870391985993,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 126,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.26675615856256124,
+ "trm_confidence": 0.8486474023806047,
+ "mcts_value": 0.4196913407843851,
+ "consensus_score": 0.5078201302744079,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 879,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24076419485499137,
+ "trm_confidence": 0.7954192441326038,
+ "mcts_value": 0.4069230558316348,
+ "consensus_score": 0.500321991851245,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 3450,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5232461696556046,
+ "trm_confidence": 0.9082135039558441,
+ "mcts_value": 0.10914988406229703,
+ "consensus_score": 0.5017448951596606,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 2644,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13977011727098437,
+ "trm_confidence": 0.9565491839452641,
+ "mcts_value": 0.434483402192434,
+ "consensus_score": 0.4951348535874965,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 3296,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7027971798793406,
+ "trm_confidence": 0.8364775682002916,
+ "mcts_value": 0.2111804508894396,
+ "consensus_score": 0.6567402210062185,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3329,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4528137893677984,
+ "trm_confidence": 0.7608033363603153,
+ "mcts_value": 0.36276653795130237,
+ "consensus_score": 0.6143671543969972,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 722,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4481637236190501,
+ "trm_confidence": 0.741486100448232,
+ "mcts_value": 0.6248777808889668,
+ "consensus_score": 0.5105112890996961,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4318,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06857363256335977,
+ "trm_confidence": 0.7224344602682048,
+ "mcts_value": 0.4306979431022975,
+ "consensus_score": 0.436828020490445,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 2642,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6867730459739563,
+ "trm_confidence": 0.8570271543534677,
+ "mcts_value": 0.6646629381862011,
+ "consensus_score": 0.7986559333145749,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 1851,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07578925298603742,
+ "trm_confidence": 0.8517926511733398,
+ "mcts_value": 0.4953156926761809,
+ "consensus_score": 0.4010946427909876,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2697,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16010157541766123,
+ "trm_confidence": 0.7898537470823938,
+ "mcts_value": 0.42773108304892316,
+ "consensus_score": 0.516586144254161,
+ "last_agent": "trm",
+ "iteration": 0,
+ "query_length": 333,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.29374569321155797,
+ "trm_confidence": 0.9582303084863814,
+ "mcts_value": 0.5655772628314892,
+ "consensus_score": 0.586373694517166,
+ "last_agent": "hrm",
+ "iteration": 1,
+ "query_length": 1770,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5498426640572068,
+ "trm_confidence": 0.725907902350479,
+ "mcts_value": 0.578520264476097,
+ "consensus_score": 0.6824129043635416,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 905,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22196676364259524,
+ "trm_confidence": 0.995382212980279,
+ "mcts_value": 0.06800281593644064,
+ "consensus_score": 0.4408387822400072,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2222,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.35901101594477813,
+ "trm_confidence": 0.7021256062509844,
+ "mcts_value": 0.15898145305339606,
+ "consensus_score": 0.3442777347215891,
+ "last_agent": "none",
+ "iteration": 0,
+ "query_length": 4410,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2907794325942135,
+ "trm_confidence": 0.9636906617159372,
+ "mcts_value": 0.5865100538924193,
+ "consensus_score": 0.6649694420812382,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 4692,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.45162383408007106,
+ "trm_confidence": 0.9075638979000847,
+ "mcts_value": 0.0008014781372515946,
+ "consensus_score": 0.3603478898903641,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3176,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7951723642940054,
+ "trm_confidence": 0.981269577631884,
+ "mcts_value": 0.018015779119747756,
+ "consensus_score": 0.6565220316400938,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 2362,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17350022601120896,
+ "trm_confidence": 0.7020147352490155,
+ "mcts_value": 0.45230798396929145,
+ "consensus_score": 0.5012118061301566,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 1952,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6700725888611195,
+ "trm_confidence": 0.7931087773863589,
+ "mcts_value": 0.30442181656221784,
+ "consensus_score": 0.5786261709853329,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2346,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33108310202008395,
+ "trm_confidence": 0.8658014693792242,
+ "mcts_value": 0.6330995001890818,
+ "consensus_score": 0.6918048944891201,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 1477,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5166902976733988,
+ "trm_confidence": 0.8326445672393485,
+ "mcts_value": 0.614193224751526,
+ "consensus_score": 0.6671058116347,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1003,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5990841615461594,
+ "trm_confidence": 0.902918133015186,
+ "mcts_value": 0.16752512087819482,
+ "consensus_score": 0.5561840478842063,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 4964,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5226621376728493,
+ "trm_confidence": 0.9567887526441046,
+ "mcts_value": 0.11064255655673892,
+ "consensus_score": 0.5933025634730631,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3743,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6129866001129773,
+ "trm_confidence": 0.8176127385585569,
+ "mcts_value": 0.2971114670527823,
+ "consensus_score": 0.5900054324972238,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 3059,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4624508846282009,
+ "trm_confidence": 0.915452737638381,
+ "mcts_value": 0.40492024675669264,
+ "consensus_score": 0.6562619177500998,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2326,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6278566491726219,
+ "trm_confidence": 0.9877220014239958,
+ "mcts_value": 0.8770151232588164,
+ "consensus_score": 0.9251168545389185,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 4461,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.134780155862351,
+ "trm_confidence": 0.8828288105490902,
+ "mcts_value": 0.42535114381200245,
+ "consensus_score": 0.53406200788668,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3053,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24109591575028186,
+ "trm_confidence": 0.7328967697575094,
+ "mcts_value": 0.320223246140162,
+ "consensus_score": 0.4178703697231394,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 4663,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.41234914508910964,
+ "trm_confidence": 0.9512851878566462,
+ "mcts_value": 0.041476680211027434,
+ "consensus_score": 0.3716685895144577,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 28,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.645435040183728,
+ "trm_confidence": 0.9115377837997666,
+ "mcts_value": 0.1852714048300609,
+ "consensus_score": 0.5991834652475413,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 1162,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.47153643452004507,
+ "trm_confidence": 0.96663864825443,
+ "mcts_value": 0.7677453812636504,
+ "consensus_score": 0.6803563821819433,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 3270,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7422740536411904,
+ "trm_confidence": 0.8440240225583628,
+ "mcts_value": 0.08515006658387513,
+ "consensus_score": 0.6275423880481276,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 3237,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.36512558615465096,
+ "trm_confidence": 0.8032111504777462,
+ "mcts_value": 0.2056665511746096,
+ "consensus_score": 0.5178225856236568,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2685,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4046702589584559,
+ "trm_confidence": 0.8868074033068132,
+ "mcts_value": 0.5594746319742098,
+ "consensus_score": 0.6177706909658732,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 4443,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7244777124892807,
+ "trm_confidence": 0.9920046051237716,
+ "mcts_value": 0.7677781225671295,
+ "consensus_score": 0.8246683798676082,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1130,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1163748177424833,
+ "trm_confidence": 0.9285844535692526,
+ "mcts_value": 0.2544184440051873,
+ "consensus_score": 0.42227243777758766,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 2995,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21715587039051465,
+ "trm_confidence": 0.9460314234351094,
+ "mcts_value": 0.6305312909373195,
+ "consensus_score": 0.5126486087433115,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3133,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5205522309687037,
+ "trm_confidence": 0.8205096326507446,
+ "mcts_value": 0.6593660351069702,
+ "consensus_score": 0.7269518942863782,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 579,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24396749425820094,
+ "trm_confidence": 0.9338072634357416,
+ "mcts_value": 0.6702832678201668,
+ "consensus_score": 0.5816335703502248,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 2354,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6118934963390785,
+ "trm_confidence": 0.713319189073177,
+ "mcts_value": 0.5332547377561968,
+ "consensus_score": 0.6960586417493916,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1201,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08920373448121506,
+ "trm_confidence": 0.7860506664040456,
+ "mcts_value": 0.3438548600718579,
+ "consensus_score": 0.35037941774769443,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3864,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15847387327825596,
+ "trm_confidence": 0.7429590213229365,
+ "mcts_value": 0.12792101257159041,
+ "consensus_score": 0.28827365637353636,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2006,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6984138608003901,
+ "trm_confidence": 0.884507314571012,
+ "mcts_value": 0.4727101899991634,
+ "consensus_score": 0.724117013380798,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1770,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6713096838839204,
+ "trm_confidence": 0.7961752187349718,
+ "mcts_value": 0.43288967658355837,
+ "consensus_score": 0.5889604169542866,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 747,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4488211404725031,
+ "trm_confidence": 0.9005330712117321,
+ "mcts_value": 0.7086021561543003,
+ "consensus_score": 0.7204733703895366,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 724,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6098893298605026,
+ "trm_confidence": 0.7319166049037223,
+ "mcts_value": 0.3784285480519917,
+ "consensus_score": 0.5565481761013581,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1530,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.18342038240570238,
+ "trm_confidence": 0.946638182171232,
+ "mcts_value": 0.6811593682621512,
+ "consensus_score": 0.610614999599084,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 3571,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14906470567047758,
+ "trm_confidence": 0.9447500065979817,
+ "mcts_value": 0.043762371778592296,
+ "consensus_score": 0.43036790751419335,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 1191,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5791896333506865,
+ "trm_confidence": 0.7741202761842156,
+ "mcts_value": 0.20043586225665735,
+ "consensus_score": 0.4599537712140579,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2706,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05766527097728564,
+ "trm_confidence": 0.7952965662415622,
+ "mcts_value": 0.11022538489293426,
+ "consensus_score": 0.3044729840498236,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2683,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6029620503797231,
+ "trm_confidence": 0.7638024120280091,
+ "mcts_value": 0.5907938874476786,
+ "consensus_score": 0.6884096166109919,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 1257,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08530128330633818,
+ "trm_confidence": 0.7633662983313895,
+ "mcts_value": 0.2520781283405512,
+ "consensus_score": 0.3820795576734962,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 4228,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.23824827363124398,
+ "trm_confidence": 0.7888764206137916,
+ "mcts_value": 0.5446763936859894,
+ "consensus_score": 0.5995966421557799,
+ "last_agent": "hrm",
+ "iteration": 3,
+ "query_length": 4627,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03645149074883067,
+ "trm_confidence": 0.7834486851676804,
+ "mcts_value": 0.5042875204423015,
+ "consensus_score": 0.375596595433501,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3641,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06196604299688403,
+ "trm_confidence": 0.7553414752537317,
+ "mcts_value": 0.228067362442038,
+ "consensus_score": 0.43891547507428164,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 1758,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3671711066549766,
+ "trm_confidence": 0.7116732462322645,
+ "mcts_value": 0.05643153072067,
+ "consensus_score": 0.3869341087454756,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4955,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03268675557397803,
+ "trm_confidence": 0.7898234484449573,
+ "mcts_value": 0.017092359289111358,
+ "consensus_score": 0.2658728350317865,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 447,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.52843234883557,
+ "trm_confidence": 0.8722166517345464,
+ "mcts_value": 0.046973836607357426,
+ "consensus_score": 0.4131610208865,
+ "last_agent": "trm",
+ "iteration": 3,
+ "query_length": 1212,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.537218265182701,
+ "trm_confidence": 0.8317105336684291,
+ "mcts_value": 0.6348304972281507,
+ "consensus_score": 0.7195942206794689,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2895,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7122174030713968,
+ "trm_confidence": 0.9556537585022351,
+ "mcts_value": 0.6248546442004216,
+ "consensus_score": 0.824602298589196,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2946,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4075676356567753,
+ "trm_confidence": 0.8914861071737631,
+ "mcts_value": 0.6735839520535416,
+ "consensus_score": 0.6438178188689948,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 3894,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22118242784204983,
+ "trm_confidence": 0.9885658304105771,
+ "mcts_value": 0.6305436490773224,
+ "consensus_score": 0.6648145353061471,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1067,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.44713971051353746,
+ "trm_confidence": 0.8720577585125344,
+ "mcts_value": 0.1291584908205778,
+ "consensus_score": 0.5777778604336374,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 2380,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3700644311505463,
+ "trm_confidence": 0.9387385014415938,
+ "mcts_value": 0.17713630425104762,
+ "consensus_score": 0.43310515579166653,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 4026,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.44960067505898793,
+ "trm_confidence": 0.7270410946114799,
+ "mcts_value": 0.15686753079097213,
+ "consensus_score": 0.5197820503039892,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 2904,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3199410028080757,
+ "trm_confidence": 0.7254575330664803,
+ "mcts_value": 0.28191721973716205,
+ "consensus_score": 0.4927010141245345,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 1870,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1887215862478396,
+ "trm_confidence": 0.7784964746898831,
+ "mcts_value": 0.6295415304385765,
+ "consensus_score": 0.46526891754299554,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2389,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15580876886389092,
+ "trm_confidence": 0.7265170706252492,
+ "mcts_value": 0.03659760596095116,
+ "consensus_score": 0.3643137689828136,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 1120,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5036437005204956,
+ "trm_confidence": 0.8986148813894688,
+ "mcts_value": 0.7505958370597642,
+ "consensus_score": 0.8098702430062522,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3312,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.41157849497836363,
+ "trm_confidence": 0.9377275301005535,
+ "mcts_value": 0.5901441016930987,
+ "consensus_score": 0.5621351771273859,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 531,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5229840904691805,
+ "trm_confidence": 0.7254924259377215,
+ "mcts_value": 0.5835719402202927,
+ "consensus_score": 0.5559205968361439,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 3444,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1135682279040384,
+ "trm_confidence": 0.9880678013568904,
+ "mcts_value": 0.3099394152532933,
+ "consensus_score": 0.5101540764015787,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 3436,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0931857133634963,
+ "trm_confidence": 0.7069873180014079,
+ "mcts_value": 0.34132687164971404,
+ "consensus_score": 0.41111798093237595,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2463,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19816711313021182,
+ "trm_confidence": 0.8245844241216375,
+ "mcts_value": 0.01128953256772494,
+ "consensus_score": 0.28600632181700203,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 2538,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09542007485860418,
+ "trm_confidence": 0.7060433414489806,
+ "mcts_value": 0.4036851991817658,
+ "consensus_score": 0.42763328845763604,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1285,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10596334580580775,
+ "trm_confidence": 0.8703268350523573,
+ "mcts_value": 0.4345196590148425,
+ "consensus_score": 0.38258895880717203,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1557,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2038095686689596,
+ "trm_confidence": 0.8283441428088825,
+ "mcts_value": 0.49952058948680605,
+ "consensus_score": 0.5787469937813052,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2092,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.023670709999474536,
+ "trm_confidence": 0.861503439799904,
+ "mcts_value": 0.5505167267521973,
+ "consensus_score": 0.48756021343289146,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 656,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5498646131944848,
+ "trm_confidence": 0.9806820359409224,
+ "mcts_value": 0.41912753188379326,
+ "consensus_score": 0.6740757416147484,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 2834,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.30222002578153945,
+ "trm_confidence": 0.7870927770123011,
+ "mcts_value": 0.4710654708838269,
+ "consensus_score": 0.428114476243183,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 2188,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.517244726214594,
+ "trm_confidence": 0.9047679380297499,
+ "mcts_value": 0.5252141294078266,
+ "consensus_score": 0.6707013811352175,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 4448,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5782538078279272,
+ "trm_confidence": 0.9196957346546054,
+ "mcts_value": 0.19366752533222478,
+ "consensus_score": 0.5205335237756176,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2537,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.27759100049047153,
+ "trm_confidence": 0.8100229296789879,
+ "mcts_value": 0.42875768468675873,
+ "consensus_score": 0.449435213499085,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 731,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06612097058965101,
+ "trm_confidence": 0.9118732335049741,
+ "mcts_value": 0.3815523241631893,
+ "consensus_score": 0.35882608252140374,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 50,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2867772756099233,
+ "trm_confidence": 0.9847758442317486,
+ "mcts_value": 0.4988418394014734,
+ "consensus_score": 0.5416091682199714,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3350,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14486151824914825,
+ "trm_confidence": 0.8111884784516898,
+ "mcts_value": 0.07931269493394683,
+ "consensus_score": 0.2690967240029296,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 3213,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5723089366326094,
+ "trm_confidence": 0.7005789123620018,
+ "mcts_value": 0.011999521163016088,
+ "consensus_score": 0.4120130151634491,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2620,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.37732918644693036,
+ "trm_confidence": 0.9138658874296303,
+ "mcts_value": 0.7930651348065357,
+ "consensus_score": 0.6026172795010194,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 441,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22386318014259662,
+ "trm_confidence": 0.9786790051379077,
+ "mcts_value": 0.1251749172286063,
+ "consensus_score": 0.47523542337757685,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 3564,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5440501535428177,
+ "trm_confidence": 0.9773857535449653,
+ "mcts_value": 0.2729229773638274,
+ "consensus_score": 0.578132939434112,
+ "last_agent": "mcts",
+ "iteration": 1,
+ "query_length": 2733,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5210214009226612,
+ "trm_confidence": 0.7430050210628615,
+ "mcts_value": 0.4532371791200645,
+ "consensus_score": 0.5784902038906994,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2534,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.7436476874790247,
+ "trm_confidence": 0.9438716043087283,
+ "mcts_value": 0.46161146207938814,
+ "consensus_score": 0.7803539423196538,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 159,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3321384555464614,
+ "trm_confidence": 0.9596295566866301,
+ "mcts_value": 0.34033824077037494,
+ "consensus_score": 0.5050176867248054,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 4605,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34360040070387166,
+ "trm_confidence": 0.898138343593137,
+ "mcts_value": 0.1738642953167907,
+ "consensus_score": 0.5452360923540416,
+ "last_agent": "none",
+ "iteration": 3,
+ "query_length": 1367,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3708734210569285,
+ "trm_confidence": 0.7622059999079367,
+ "mcts_value": 0.35534973457636193,
+ "consensus_score": 0.5618726214324747,
+ "last_agent": "none",
+ "iteration": 1,
+ "query_length": 485,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4451081419245728,
+ "trm_confidence": 0.8594284516516211,
+ "mcts_value": 0.38743479820397003,
+ "consensus_score": 0.47119544529397467,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 356,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4561927932840184,
+ "trm_confidence": 0.8028064541133494,
+ "mcts_value": 0.34978475478808896,
+ "consensus_score": 0.45490497659531093,
+ "last_agent": "hrm",
+ "iteration": 0,
+ "query_length": 4638,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.399815127929161,
+ "trm_confidence": 0.7572655439716671,
+ "mcts_value": 0.11687132043692414,
+ "consensus_score": 0.5009945487675382,
+ "last_agent": "hrm",
+ "iteration": 2,
+ "query_length": 3720,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2547861301271446,
+ "trm_confidence": 0.9627238049078941,
+ "mcts_value": 0.3670222299040831,
+ "consensus_score": 0.6199826651155291,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 3757,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3682753491268558,
+ "trm_confidence": 0.7206816855197664,
+ "mcts_value": 0.6139025598324387,
+ "consensus_score": 0.6570031105111498,
+ "last_agent": "trm",
+ "iteration": 1,
+ "query_length": 301,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09011705955441619,
+ "trm_confidence": 0.7946610237708518,
+ "mcts_value": 0.4826493062063232,
+ "consensus_score": 0.4105788027159111,
+ "last_agent": "mcts",
+ "iteration": 3,
+ "query_length": 1686,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11852945767324931,
+ "trm_confidence": 0.8493902676335996,
+ "mcts_value": 0.3238043581681488,
+ "consensus_score": 0.3592431389123262,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2869,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14902826731190888,
+ "trm_confidence": 0.8565661461551899,
+ "mcts_value": 0.057772755366731966,
+ "consensus_score": 0.29491744671302983,
+ "last_agent": "mcts",
+ "iteration": 0,
+ "query_length": 3267,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48758750266237133,
+ "trm_confidence": 0.848045247328965,
+ "mcts_value": 0.06533413356703832,
+ "consensus_score": 0.4551926859669786,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 1058,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0511193831067477,
+ "trm_confidence": 0.8116235234548076,
+ "mcts_value": 0.15665045493738797,
+ "consensus_score": 0.3331483723501024,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2360,
+ "has_rag_context": false
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.37819121088269464,
+ "trm_confidence": 0.9938865043372864,
+ "mcts_value": 0.07211910142969438,
+ "consensus_score": 0.4461136263330919,
+ "last_agent": "none",
+ "iteration": 2,
+ "query_length": 1219,
+ "has_rag_context": true
+ },
+ "label": "trm"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.31723435453434623,
+ "trm_confidence": 0.23776372466300788,
+ "mcts_value": 0.6380934441153502,
+ "consensus_score": 0.33376791876446466,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1345,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1686840121841821,
+ "trm_confidence": 0.3768754430606373,
+ "mcts_value": 0.9407471422986178,
+ "consensus_score": 0.4322633759443156,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3990,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.42855636392941093,
+ "trm_confidence": 0.1846502295230367,
+ "mcts_value": 0.6646314497486088,
+ "consensus_score": 0.36041945971320166,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2756,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3837846892832857,
+ "trm_confidence": 0.17542621208214185,
+ "mcts_value": 0.8260592450783548,
+ "consensus_score": 0.4523110645038243,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1628,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13536975187858913,
+ "trm_confidence": 0.22041493479425212,
+ "mcts_value": 0.6886788593507293,
+ "consensus_score": 0.4286284993938186,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1277,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4640450577224613,
+ "trm_confidence": 0.3360216995983519,
+ "mcts_value": 0.8607849384132342,
+ "consensus_score": 0.6346544965847443,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 908,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3822976453806744,
+ "trm_confidence": 0.08681387071692682,
+ "mcts_value": 0.856150003967066,
+ "consensus_score": 0.42029892280404163,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4107,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5750886628472248,
+ "trm_confidence": 0.30967271573066185,
+ "mcts_value": 0.7515370227892435,
+ "consensus_score": 0.5961329131448321,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3365,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.18185877193841446,
+ "trm_confidence": 0.1690427244553648,
+ "mcts_value": 0.7922239563195673,
+ "consensus_score": 0.33405727528610984,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 424,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4683262010865425,
+ "trm_confidence": 0.3830163896608705,
+ "mcts_value": 0.9798412171603175,
+ "consensus_score": 0.6001141642393371,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1590,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6673941740574011,
+ "trm_confidence": 0.27658075424446743,
+ "mcts_value": 0.7172150697731274,
+ "consensus_score": 0.6137456131966376,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3704,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.41029586862175155,
+ "trm_confidence": 0.08314398616690488,
+ "mcts_value": 0.6646202905804774,
+ "consensus_score": 0.44384845643016324,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3689,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12670689546912067,
+ "trm_confidence": 0.2750004870208167,
+ "mcts_value": 0.9640358696520795,
+ "consensus_score": 0.48467150832871886,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4998,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.284618700591374,
+ "trm_confidence": 0.45444631860734425,
+ "mcts_value": 0.6116286456675187,
+ "consensus_score": 0.4650280632525767,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4661,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1919416816053799,
+ "trm_confidence": 0.5104898707022725,
+ "mcts_value": 0.7477884184725011,
+ "consensus_score": 0.4453370629321546,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 4404,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4319160371123925,
+ "trm_confidence": 0.05127219031728598,
+ "mcts_value": 0.815710392901433,
+ "consensus_score": 0.36859303552538447,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3731,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13051870523620435,
+ "trm_confidence": 0.1534902256461637,
+ "mcts_value": 0.934368432366737,
+ "consensus_score": 0.37076724564825825,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 750,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6336349293494238,
+ "trm_confidence": 0.20652809834757668,
+ "mcts_value": 0.874701583416583,
+ "consensus_score": 0.6033733348792163,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2360,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06683838837145224,
+ "trm_confidence": 0.49840071878243597,
+ "mcts_value": 0.9629660746374693,
+ "consensus_score": 0.5549764698683062,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2452,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5794184689020143,
+ "trm_confidence": 0.5395671972826748,
+ "mcts_value": 0.7286489352876984,
+ "consensus_score": 0.6159413233746899,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1597,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6056284183515128,
+ "trm_confidence": 0.22258915323155873,
+ "mcts_value": 0.8358025321613871,
+ "consensus_score": 0.5513833644009682,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2747,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.26967294262331243,
+ "trm_confidence": 0.6975536817441912,
+ "mcts_value": 0.9548156081322211,
+ "consensus_score": 0.5711938651033375,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 4031,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6360324407605084,
+ "trm_confidence": 0.5117975638356418,
+ "mcts_value": 0.9193728763200538,
+ "consensus_score": 0.7144308741981286,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 716,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.014257931579064364,
+ "trm_confidence": 0.35213206337937436,
+ "mcts_value": 0.8037201252899288,
+ "consensus_score": 0.4612308667842808,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 869,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3702812658390706,
+ "trm_confidence": 0.02343849829797033,
+ "mcts_value": 0.7561939959074084,
+ "consensus_score": 0.3599041751153518,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2137,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4832856761452664,
+ "trm_confidence": 0.30456142559419025,
+ "mcts_value": 0.6124963915962599,
+ "consensus_score": 0.4499168030676857,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2461,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3317891311203637,
+ "trm_confidence": 0.4066099439054674,
+ "mcts_value": 0.7752107125082925,
+ "consensus_score": 0.40580152529265495,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 194,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14516101266756698,
+ "trm_confidence": 0.3666476585589127,
+ "mcts_value": 0.7803698989263979,
+ "consensus_score": 0.38677221181038673,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1590,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17421580970351525,
+ "trm_confidence": 0.013584305345812252,
+ "mcts_value": 0.8555022223073749,
+ "consensus_score": 0.3285983529082227,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3200,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4947874216922198,
+ "trm_confidence": 0.6969688604377766,
+ "mcts_value": 0.984805745527544,
+ "consensus_score": 0.6845326939486411,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 651,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6900972403096294,
+ "trm_confidence": 0.45779103982578184,
+ "mcts_value": 0.8447041417793998,
+ "consensus_score": 0.7439608524389874,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 3863,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33902091207983737,
+ "trm_confidence": 0.6431585950876396,
+ "mcts_value": 0.879769484128925,
+ "consensus_score": 0.6820165532035585,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 4575,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34527141232123215,
+ "trm_confidence": 0.05654023144514711,
+ "mcts_value": 0.6278444007194423,
+ "consensus_score": 0.3879543488102126,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 3040,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3633985489127985,
+ "trm_confidence": 0.08223578013738105,
+ "mcts_value": 0.7119183154685265,
+ "consensus_score": 0.4627728820362914,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 4476,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.564362042106087,
+ "trm_confidence": 0.07407016937927796,
+ "mcts_value": 0.6820202394092734,
+ "consensus_score": 0.4630363159325126,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 4580,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.043136798864328404,
+ "trm_confidence": 0.3141644686405523,
+ "mcts_value": 0.6760243546432525,
+ "consensus_score": 0.284938943098287,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 1201,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07943945317119827,
+ "trm_confidence": 0.3972852005837342,
+ "mcts_value": 0.7797415764132528,
+ "consensus_score": 0.35592691464910986,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 1697,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.47916888501006816,
+ "trm_confidence": 0.12721069746834832,
+ "mcts_value": 0.9903578369125293,
+ "consensus_score": 0.43506473555601977,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2501,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6109251809065226,
+ "trm_confidence": 0.42224326203559465,
+ "mcts_value": 0.6873645332478184,
+ "consensus_score": 0.5088025226690918,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 2055,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.02293006118458668,
+ "trm_confidence": 0.5126228955174705,
+ "mcts_value": 0.9904803457432356,
+ "consensus_score": 0.5632771727200497,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2794,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6077083785703474,
+ "trm_confidence": 0.3758999568929495,
+ "mcts_value": 0.753722663641583,
+ "consensus_score": 0.610171703036418,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4911,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3561551971414369,
+ "trm_confidence": 0.5026040999614456,
+ "mcts_value": 0.7905138172334725,
+ "consensus_score": 0.5024261765259104,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1564,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5601399601255522,
+ "trm_confidence": 0.23818074832487054,
+ "mcts_value": 0.6783086408838899,
+ "consensus_score": 0.4221763496498987,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 2296,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.049455501039777416,
+ "trm_confidence": 0.02133663638725458,
+ "mcts_value": 0.9384726117379951,
+ "consensus_score": 0.32945521022237767,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 4636,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.621376823613001,
+ "trm_confidence": 0.19819455347125778,
+ "mcts_value": 0.9836682156268193,
+ "consensus_score": 0.6644694770882553,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3536,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5708519291478197,
+ "trm_confidence": 0.5551556028865148,
+ "mcts_value": 0.7691664572171499,
+ "consensus_score": 0.68311193027234,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2364,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6037348568884349,
+ "trm_confidence": 0.14812180701757874,
+ "mcts_value": 0.9227691911279663,
+ "consensus_score": 0.46662190675527215,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4698,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4900071078471968,
+ "trm_confidence": 0.005143697344983877,
+ "mcts_value": 0.715641312477804,
+ "consensus_score": 0.3746702590207091,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3910,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10939178398195547,
+ "trm_confidence": 0.6368554815027963,
+ "mcts_value": 0.9586427648589736,
+ "consensus_score": 0.5115745385178366,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 4515,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.04569406413506189,
+ "trm_confidence": 0.05244518738162813,
+ "mcts_value": 0.6337379577753245,
+ "consensus_score": 0.2619376192221952,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2060,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4125013925652396,
+ "trm_confidence": 0.373128423350449,
+ "mcts_value": 0.7779343037127447,
+ "consensus_score": 0.5588003864963985,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 1259,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24520848957413544,
+ "trm_confidence": 0.008550824040745563,
+ "mcts_value": 0.9989242970186536,
+ "consensus_score": 0.3426862126548425,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 906,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43921226307516803,
+ "trm_confidence": 0.36424863017582754,
+ "mcts_value": 0.975993037481997,
+ "consensus_score": 0.6527592104987346,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3920,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43377415349500237,
+ "trm_confidence": 0.6543300057320453,
+ "mcts_value": 0.6012244419456372,
+ "consensus_score": 0.6206815241310022,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 3779,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38558216552118657,
+ "trm_confidence": 0.4329781655522392,
+ "mcts_value": 0.9296955784400505,
+ "consensus_score": 0.4945350377049861,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4454,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5190127646113992,
+ "trm_confidence": 0.5880399059653311,
+ "mcts_value": 0.623520423129399,
+ "consensus_score": 0.6609301701610041,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3238,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3318393511494707,
+ "trm_confidence": 0.5881757144394681,
+ "mcts_value": 0.6270480030517179,
+ "consensus_score": 0.4297499172597767,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 4775,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.568117721217664,
+ "trm_confidence": 0.08919866975081975,
+ "mcts_value": 0.8045080957744134,
+ "consensus_score": 0.41219687973020713,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 428,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6647447717237248,
+ "trm_confidence": 0.5362233846202891,
+ "mcts_value": 0.8920854940579495,
+ "consensus_score": 0.773119623409771,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 2224,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.047843523276939165,
+ "trm_confidence": 0.4591671770872801,
+ "mcts_value": 0.6731612735946729,
+ "consensus_score": 0.30378115873790856,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1205,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21972065941926494,
+ "trm_confidence": 0.3403108164411346,
+ "mcts_value": 0.9808802901635648,
+ "consensus_score": 0.5619753283503452,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3782,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6851660581129289,
+ "trm_confidence": 0.5212923059696694,
+ "mcts_value": 0.8637869100021912,
+ "consensus_score": 0.6137079280840779,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3905,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3023762398867548,
+ "trm_confidence": 0.19880407143507609,
+ "mcts_value": 0.6586492571457411,
+ "consensus_score": 0.3103753646082625,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 427,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1187878525693058,
+ "trm_confidence": 0.5065164465219747,
+ "mcts_value": 0.8362149634342894,
+ "consensus_score": 0.4262214121144573,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1139,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2240733361205828,
+ "trm_confidence": 0.15755356804221354,
+ "mcts_value": 0.8087989548182943,
+ "consensus_score": 0.3911291754933931,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 2561,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10797053546654987,
+ "trm_confidence": 0.5040983132964151,
+ "mcts_value": 0.7212994472591383,
+ "consensus_score": 0.46085584376523214,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 619,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.45737663965518527,
+ "trm_confidence": 0.2515373237743721,
+ "mcts_value": 0.8075721740968564,
+ "consensus_score": 0.4933148914359465,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2215,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5448118163257429,
+ "trm_confidence": 0.5586054984192101,
+ "mcts_value": 0.6628045296607167,
+ "consensus_score": 0.6380608076129142,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2990,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.41688106510175704,
+ "trm_confidence": 0.6434554695583351,
+ "mcts_value": 0.7873350971167254,
+ "consensus_score": 0.5910117383513462,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3784,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0365974075749261,
+ "trm_confidence": 0.5197425037389355,
+ "mcts_value": 0.7485121950034508,
+ "consensus_score": 0.3650812535923534,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 4709,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.009780903592923794,
+ "trm_confidence": 0.0015646433231779942,
+ "mcts_value": 0.813886686969084,
+ "consensus_score": 0.27720953764304135,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 3222,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15990553020219894,
+ "trm_confidence": 0.41454500477946354,
+ "mcts_value": 0.8454375750864443,
+ "consensus_score": 0.44914922680662356,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2849,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.020882475609089554,
+ "trm_confidence": 0.2156888734253879,
+ "mcts_value": 0.6947442412509153,
+ "consensus_score": 0.3908294681314898,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 509,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3971523913098409,
+ "trm_confidence": 0.5801162774355986,
+ "mcts_value": 0.8303334474327215,
+ "consensus_score": 0.5497920590294468,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1739,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2202815043024345,
+ "trm_confidence": 0.6145396436116048,
+ "mcts_value": 0.6163479959086697,
+ "consensus_score": 0.5661205774551992,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 4524,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3425514376363715,
+ "trm_confidence": 0.38830366684107404,
+ "mcts_value": 0.7525395606350557,
+ "consensus_score": 0.39851664037438295,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 2853,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4623165669453584,
+ "trm_confidence": 0.36958590048149587,
+ "mcts_value": 0.8613619248014569,
+ "consensus_score": 0.53127614641898,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3363,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5771517973843117,
+ "trm_confidence": 0.2946381896294308,
+ "mcts_value": 0.6179765614036828,
+ "consensus_score": 0.47953969430487003,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1062,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10591547503264781,
+ "trm_confidence": 0.1158720510815211,
+ "mcts_value": 0.6699667811082876,
+ "consensus_score": 0.3289569323109717,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 4479,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1950315914457207,
+ "trm_confidence": 0.4535299440877101,
+ "mcts_value": 0.9622469588269951,
+ "consensus_score": 0.44503595517348216,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 569,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16648476941725376,
+ "trm_confidence": 0.5799478637875408,
+ "mcts_value": 0.8145279108890514,
+ "consensus_score": 0.6143474836122634,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 4423,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1690423424352558,
+ "trm_confidence": 0.6051152673501498,
+ "mcts_value": 0.916165307338229,
+ "consensus_score": 0.5194811534365443,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 899,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3735339537579074,
+ "trm_confidence": 0.36744546699302055,
+ "mcts_value": 0.7153546813656679,
+ "consensus_score": 0.5555309376028053,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3732,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.25031874715615304,
+ "trm_confidence": 0.33524288546700354,
+ "mcts_value": 0.7085552531300148,
+ "consensus_score": 0.4301542438819369,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 462,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3185152324082315,
+ "trm_confidence": 0.43265168677086213,
+ "mcts_value": 0.6075406316151813,
+ "consensus_score": 0.4449816343500297,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2300,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13140114571483114,
+ "trm_confidence": 0.5294149801296674,
+ "mcts_value": 0.7646622780518945,
+ "consensus_score": 0.43943960399395965,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3893,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08541045230398672,
+ "trm_confidence": 0.0799841057774221,
+ "mcts_value": 0.9509708316865555,
+ "consensus_score": 0.33218573041392463,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 4325,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17168609017650976,
+ "trm_confidence": 0.059342319940571306,
+ "mcts_value": 0.9052748329580975,
+ "consensus_score": 0.3615277096685557,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 4637,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3905206749498208,
+ "trm_confidence": 0.5069341912244236,
+ "mcts_value": 0.8933379393453402,
+ "consensus_score": 0.6332088422339671,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4970,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.22198446331259672,
+ "trm_confidence": 0.45778120270185196,
+ "mcts_value": 0.9082437330553728,
+ "consensus_score": 0.5256325518603668,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 581,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.605980924748164,
+ "trm_confidence": 0.09900190161810396,
+ "mcts_value": 0.8664868661221519,
+ "consensus_score": 0.49704436327276325,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 945,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34543005581446923,
+ "trm_confidence": 0.24886331304467244,
+ "mcts_value": 0.9958633986686976,
+ "consensus_score": 0.48502767116761686,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1037,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19863066706415367,
+ "trm_confidence": 0.049342653749843375,
+ "mcts_value": 0.8902833582950973,
+ "consensus_score": 0.4160742159399531,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 2372,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5222729624500508,
+ "trm_confidence": 0.5388251171454546,
+ "mcts_value": 0.7001506395959293,
+ "consensus_score": 0.6319043541606525,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1844,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16983775796793601,
+ "trm_confidence": 0.19196700055773697,
+ "mcts_value": 0.9960582456223053,
+ "consensus_score": 0.5153263112987674,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 2822,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.026012463273550022,
+ "trm_confidence": 0.3876334659856583,
+ "mcts_value": 0.9020105510002314,
+ "consensus_score": 0.4537276117936565,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 440,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.25493629665410233,
+ "trm_confidence": 0.5940680810031305,
+ "mcts_value": 0.6650998877199426,
+ "consensus_score": 0.44282997311359035,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 4586,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05273146371199239,
+ "trm_confidence": 0.47453250954306,
+ "mcts_value": 0.795691626875996,
+ "consensus_score": 0.4991054978548028,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 3727,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5383433396880142,
+ "trm_confidence": 0.35403079482493055,
+ "mcts_value": 0.8688062889198347,
+ "consensus_score": 0.6696075213804851,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 929,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.347716446502645,
+ "trm_confidence": 0.5108732447446628,
+ "mcts_value": 0.6941808068055524,
+ "consensus_score": 0.4581345169301566,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 670,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12727081704664014,
+ "trm_confidence": 0.23202471738705419,
+ "mcts_value": 0.7738441442948825,
+ "consensus_score": 0.35718845589604403,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 1700,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4561891312377787,
+ "trm_confidence": 0.4899456525584073,
+ "mcts_value": 0.9946360303688364,
+ "consensus_score": 0.611289438660992,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 4474,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6812791930683556,
+ "trm_confidence": 0.6048970737743725,
+ "mcts_value": 0.642552259724387,
+ "consensus_score": 0.6036385357633317,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3610,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2007419475322064,
+ "trm_confidence": 0.09521921165463454,
+ "mcts_value": 0.7292512824581884,
+ "consensus_score": 0.35160109700351094,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 3222,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6570274695412733,
+ "trm_confidence": 0.447013837431379,
+ "mcts_value": 0.6861428669917617,
+ "consensus_score": 0.6766481894648053,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3175,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.23164454587012664,
+ "trm_confidence": 0.13660021839519046,
+ "mcts_value": 0.9397847274576645,
+ "consensus_score": 0.3890140917354511,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 766,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1898918508955939,
+ "trm_confidence": 0.4539586088835965,
+ "mcts_value": 0.7303854411117326,
+ "consensus_score": 0.36868510796468457,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 339,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15464950354903867,
+ "trm_confidence": 0.14046663778904223,
+ "mcts_value": 0.6703398997124024,
+ "consensus_score": 0.22200390169644207,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 2176,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6963623972143801,
+ "trm_confidence": 0.6428773479732098,
+ "mcts_value": 0.70917592470996,
+ "consensus_score": 0.6258203849304502,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4246,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5618521589751637,
+ "trm_confidence": 0.08836900559639752,
+ "mcts_value": 0.9377654186636974,
+ "consensus_score": 0.45421863074809,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3960,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07515609478057111,
+ "trm_confidence": 0.40200810157588607,
+ "mcts_value": 0.9528339996382009,
+ "consensus_score": 0.4078011047565753,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 4111,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05815794228443604,
+ "trm_confidence": 0.40547793712618185,
+ "mcts_value": 0.7819077169902374,
+ "consensus_score": 0.39688453282745606,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4693,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1915902853443494,
+ "trm_confidence": 0.11026234512208015,
+ "mcts_value": 0.9499588801890357,
+ "consensus_score": 0.37856657901553775,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 4286,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09959808498961463,
+ "trm_confidence": 0.07795548095061157,
+ "mcts_value": 0.8640470535528122,
+ "consensus_score": 0.2736772647485496,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3947,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03259280621015039,
+ "trm_confidence": 0.5320502470804781,
+ "mcts_value": 0.9776460461986113,
+ "consensus_score": 0.5615267158398527,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 1656,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.31715748456687903,
+ "trm_confidence": 0.5996026240044333,
+ "mcts_value": 0.7142035966761665,
+ "consensus_score": 0.5893732969234061,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3651,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5981688076295373,
+ "trm_confidence": 0.2799658617472436,
+ "mcts_value": 0.8908062965073484,
+ "consensus_score": 0.62336407779224,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 849,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5430080698155945,
+ "trm_confidence": 0.6842541888902517,
+ "mcts_value": 0.8099641519819655,
+ "consensus_score": 0.6679487027569171,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4395,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2078571164724059,
+ "trm_confidence": 0.060513189288164355,
+ "mcts_value": 0.6535556277500726,
+ "consensus_score": 0.29288749713939705,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 378,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1721493554123952,
+ "trm_confidence": 0.42832623493944827,
+ "mcts_value": 0.9174310997566425,
+ "consensus_score": 0.5862051805525512,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 328,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.41325844502884623,
+ "trm_confidence": 0.3546277978318974,
+ "mcts_value": 0.7379064775863817,
+ "consensus_score": 0.43165487407108916,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 939,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4287118607378963,
+ "trm_confidence": 0.04911206963964689,
+ "mcts_value": 0.7069330857830289,
+ "consensus_score": 0.3622404120999607,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1259,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2870207995972183,
+ "trm_confidence": 0.22152209969067274,
+ "mcts_value": 0.8420707400012313,
+ "consensus_score": 0.5334308840117128,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3194,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5221610491550703,
+ "trm_confidence": 0.4772597995887381,
+ "mcts_value": 0.9637205262561082,
+ "consensus_score": 0.5642874467466807,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1468,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15794658994125135,
+ "trm_confidence": 0.5453969607405275,
+ "mcts_value": 0.683773134610504,
+ "consensus_score": 0.41955721819814945,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1135,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43199000437925217,
+ "trm_confidence": 0.608219663374262,
+ "mcts_value": 0.7772711948981151,
+ "consensus_score": 0.5279878332237377,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 1249,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3377762158718462,
+ "trm_confidence": 0.6468872542889508,
+ "mcts_value": 0.7456433448967901,
+ "consensus_score": 0.5857472691254665,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 3167,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6524794138598203,
+ "trm_confidence": 0.3511632894458381,
+ "mcts_value": 0.6260862679598145,
+ "consensus_score": 0.5439942579752518,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 4146,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19074201448203928,
+ "trm_confidence": 0.4611717367721946,
+ "mcts_value": 0.7753628362120595,
+ "consensus_score": 0.4896878806847168,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2747,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.27813371894457856,
+ "trm_confidence": 0.0030806453324853985,
+ "mcts_value": 0.8576347429346562,
+ "consensus_score": 0.36893259910581094,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3946,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.057343918929729384,
+ "trm_confidence": 0.19296580365101634,
+ "mcts_value": 0.7722439609923336,
+ "consensus_score": 0.3719495015694868,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3265,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4097169332098666,
+ "trm_confidence": 0.6768629970444513,
+ "mcts_value": 0.9665208135659924,
+ "consensus_score": 0.7579703734932746,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3559,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4791859970651187,
+ "trm_confidence": 0.3437572785462063,
+ "mcts_value": 0.6547327136015789,
+ "consensus_score": 0.3959238509215329,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4657,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12701811871100846,
+ "trm_confidence": 0.13127201260054583,
+ "mcts_value": 0.7263325940353923,
+ "consensus_score": 0.32405022804403866,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2794,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14610881840587236,
+ "trm_confidence": 0.03224543007153378,
+ "mcts_value": 0.7747408030151097,
+ "consensus_score": 0.31022441439405146,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 4839,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4162918703423202,
+ "trm_confidence": 0.3053725646623523,
+ "mcts_value": 0.7634403070509002,
+ "consensus_score": 0.5108381359765598,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1037,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6584276987172909,
+ "trm_confidence": 0.2996197271071803,
+ "mcts_value": 0.8030835374132496,
+ "consensus_score": 0.49022691502051574,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3610,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3866693525184374,
+ "trm_confidence": 0.5001864687890575,
+ "mcts_value": 0.8162287407554648,
+ "consensus_score": 0.5226244825444922,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 1884,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2815183694024254,
+ "trm_confidence": 0.12524734995951484,
+ "mcts_value": 0.7645123010483306,
+ "consensus_score": 0.37322348205449835,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 3920,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20955924034800916,
+ "trm_confidence": 0.47760280306181196,
+ "mcts_value": 0.8374452765271861,
+ "consensus_score": 0.5562316817182031,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3385,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.007527602320017867,
+ "trm_confidence": 0.0860795408424368,
+ "mcts_value": 0.629478332855336,
+ "consensus_score": 0.17223016157143217,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 2200,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3003525811907735,
+ "trm_confidence": 0.2763886248904426,
+ "mcts_value": 0.7099726767881527,
+ "consensus_score": 0.38919696122072633,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 601,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.542797116870872,
+ "trm_confidence": 0.1411816092963175,
+ "mcts_value": 0.6546417611118621,
+ "consensus_score": 0.4924810905196795,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 1604,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6647009394575618,
+ "trm_confidence": 0.32413276472680647,
+ "mcts_value": 0.6346983073152016,
+ "consensus_score": 0.567224436161121,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2763,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6203528193725827,
+ "trm_confidence": 0.1900075922950634,
+ "mcts_value": 0.8021590618237545,
+ "consensus_score": 0.5925447551355442,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4554,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.16650482550703383,
+ "trm_confidence": 0.3097693113012998,
+ "mcts_value": 0.7173500691167987,
+ "consensus_score": 0.42494158391789427,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 2654,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4091297662980429,
+ "trm_confidence": 0.36351973519098435,
+ "mcts_value": 0.8783515072913102,
+ "consensus_score": 0.5612175890826414,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2878,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2544274738110137,
+ "trm_confidence": 0.10481394578673936,
+ "mcts_value": 0.7550002142155073,
+ "consensus_score": 0.3907813448478427,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 1784,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5918205291007997,
+ "trm_confidence": 0.0763380145377019,
+ "mcts_value": 0.9439400227462136,
+ "consensus_score": 0.5763738663568887,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 3207,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19636694379014258,
+ "trm_confidence": 0.30947440618900923,
+ "mcts_value": 0.690009451873861,
+ "consensus_score": 0.35235996368713696,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1010,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3544618486405938,
+ "trm_confidence": 0.6221514364555268,
+ "mcts_value": 0.6883515339398161,
+ "consensus_score": 0.6134674023819895,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 3167,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6148680140308268,
+ "trm_confidence": 0.4971168583490874,
+ "mcts_value": 0.6719545858810111,
+ "consensus_score": 0.6090791467164668,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2568,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15169861868926238,
+ "trm_confidence": 0.08440420871119089,
+ "mcts_value": 0.8729957144008273,
+ "consensus_score": 0.3628212159351468,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4551,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5704315813364017,
+ "trm_confidence": 0.5241789003278413,
+ "mcts_value": 0.7295918187007265,
+ "consensus_score": 0.641652207466697,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 518,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11266319650705656,
+ "trm_confidence": 0.19303345284654838,
+ "mcts_value": 0.7558684743931463,
+ "consensus_score": 0.4298103324610913,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 958,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33947217517053047,
+ "trm_confidence": 0.5016657593251701,
+ "mcts_value": 0.6864448220328685,
+ "consensus_score": 0.46653265726760484,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4322,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1756367402124386,
+ "trm_confidence": 0.4417197013131317,
+ "mcts_value": 0.9150250250158817,
+ "consensus_score": 0.4415408780782525,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4388,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13150250773626537,
+ "trm_confidence": 0.6008398704647704,
+ "mcts_value": 0.8526673542319743,
+ "consensus_score": 0.4390067739467064,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 3149,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5604702739715944,
+ "trm_confidence": 0.41662668123237145,
+ "mcts_value": 0.8126018927370995,
+ "consensus_score": 0.6531839456316204,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3290,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5192746395010424,
+ "trm_confidence": 0.16653692066503153,
+ "mcts_value": 0.6911244604345537,
+ "consensus_score": 0.5551067285620601,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 4692,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17009010337397337,
+ "trm_confidence": 0.25263195544389805,
+ "mcts_value": 0.9434692481935456,
+ "consensus_score": 0.49832513457294436,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3510,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17058772796290878,
+ "trm_confidence": 0.1649605688983235,
+ "mcts_value": 0.7945433693066476,
+ "consensus_score": 0.2785450063396625,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 109,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43486322474956446,
+ "trm_confidence": 0.4386586274023478,
+ "mcts_value": 0.6124623243306682,
+ "consensus_score": 0.40948815318225856,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 3444,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2883823068083915,
+ "trm_confidence": 0.32557134992486714,
+ "mcts_value": 0.8328668272978313,
+ "consensus_score": 0.423929493564191,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 1517,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15675407418460977,
+ "trm_confidence": 0.21751818744351983,
+ "mcts_value": 0.9642656169000174,
+ "consensus_score": 0.378303847804623,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 1006,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13275246708368724,
+ "trm_confidence": 0.6435778470574205,
+ "mcts_value": 0.8532422948258542,
+ "consensus_score": 0.463810763672716,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1720,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2487519092491989,
+ "trm_confidence": 0.4168497941370826,
+ "mcts_value": 0.8815079231064198,
+ "consensus_score": 0.46041202899874717,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4798,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43066103537470446,
+ "trm_confidence": 0.15270442919272204,
+ "mcts_value": 0.7412751318883843,
+ "consensus_score": 0.474132405597394,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 3198,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.30771116271093707,
+ "trm_confidence": 0.541816860194706,
+ "mcts_value": 0.6175433133671664,
+ "consensus_score": 0.5191797458724317,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 4071,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3722451902992266,
+ "trm_confidence": 0.1430559856244378,
+ "mcts_value": 0.693978218101228,
+ "consensus_score": 0.4483388707386439,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 330,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34069695390044646,
+ "trm_confidence": 0.4205945634849206,
+ "mcts_value": 0.8026179113292021,
+ "consensus_score": 0.5087920630048397,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 1868,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4406615999824062,
+ "trm_confidence": 0.44450554110256446,
+ "mcts_value": 0.9516453804946554,
+ "consensus_score": 0.6341105772697317,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 2152,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.017872285996915215,
+ "trm_confidence": 0.005219501996662123,
+ "mcts_value": 0.8368011549795481,
+ "consensus_score": 0.35695934625039155,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 3910,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13871986073570491,
+ "trm_confidence": 0.6270115867856038,
+ "mcts_value": 0.845995105179858,
+ "consensus_score": 0.47526133858339714,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 4759,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10552091639326548,
+ "trm_confidence": 0.22136289713036456,
+ "mcts_value": 0.6962971814096776,
+ "consensus_score": 0.3206504448984908,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 763,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5814443788879998,
+ "trm_confidence": 0.11138242296333094,
+ "mcts_value": 0.6918609344958324,
+ "consensus_score": 0.5576115038375773,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4467,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08420000060926532,
+ "trm_confidence": 0.5717740472353708,
+ "mcts_value": 0.8234256267071626,
+ "consensus_score": 0.44807033138676555,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3520,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03884019056702028,
+ "trm_confidence": 0.11803313328209751,
+ "mcts_value": 0.841056260930314,
+ "consensus_score": 0.26488164351115007,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2176,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3590681153579105,
+ "trm_confidence": 0.4365026033918791,
+ "mcts_value": 0.9687685283097158,
+ "consensus_score": 0.5596107174327927,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 1213,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24055214801083497,
+ "trm_confidence": 0.31692561198959995,
+ "mcts_value": 0.7177291899339334,
+ "consensus_score": 0.4262157207483272,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 3072,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.10465821764253724,
+ "trm_confidence": 0.2449994628955808,
+ "mcts_value": 0.7951904659530589,
+ "consensus_score": 0.42507819257167323,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2077,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.036105681083131086,
+ "trm_confidence": 0.6531526690349112,
+ "mcts_value": 0.7666804094272028,
+ "consensus_score": 0.46803354790699225,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4934,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2033292341002282,
+ "trm_confidence": 0.2885614547813878,
+ "mcts_value": 0.9749441286315138,
+ "consensus_score": 0.5407177872229005,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2260,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38893016798324637,
+ "trm_confidence": 0.4825663175691036,
+ "mcts_value": 0.9979292693656894,
+ "consensus_score": 0.6518343651367351,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4872,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20618925247647954,
+ "trm_confidence": 0.01596017330247862,
+ "mcts_value": 0.8253024529881239,
+ "consensus_score": 0.37203127646855805,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1101,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20165745105494864,
+ "trm_confidence": 0.3569922506583052,
+ "mcts_value": 0.6658824561963457,
+ "consensus_score": 0.47551362976539985,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2120,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6552332642343762,
+ "trm_confidence": 0.08169258417520159,
+ "mcts_value": 0.9013474455802182,
+ "consensus_score": 0.6313694248295101,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4067,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.35430370894841656,
+ "trm_confidence": 0.43584077185655556,
+ "mcts_value": 0.6215098838274221,
+ "consensus_score": 0.4319182228520328,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1305,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.02930306778478665,
+ "trm_confidence": 0.30763723660522463,
+ "mcts_value": 0.8210191759091919,
+ "consensus_score": 0.3287501243915139,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2544,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12262977807574112,
+ "trm_confidence": 0.15365290566384016,
+ "mcts_value": 0.7198587585408276,
+ "consensus_score": 0.3275975984973937,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 4865,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.027919921899200804,
+ "trm_confidence": 0.15917366193599397,
+ "mcts_value": 0.622156824686119,
+ "consensus_score": 0.338304391695657,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 202,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6505977835323311,
+ "trm_confidence": 0.2318247714734927,
+ "mcts_value": 0.7275878370330569,
+ "consensus_score": 0.6036590132102162,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3878,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.19376230117373294,
+ "trm_confidence": 0.460903398328293,
+ "mcts_value": 0.9577157077922971,
+ "consensus_score": 0.4550532188029849,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 3258,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.002607855654334512,
+ "trm_confidence": 0.22732648150856585,
+ "mcts_value": 0.6251053111722411,
+ "consensus_score": 0.30291933501906637,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 399,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6576675279857744,
+ "trm_confidence": 0.0879573147862478,
+ "mcts_value": 0.9964800123105004,
+ "consensus_score": 0.5637231036945398,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1547,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5223972689147434,
+ "trm_confidence": 0.08690500903747479,
+ "mcts_value": 0.7465488619637961,
+ "consensus_score": 0.4183129050235457,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4194,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.032799915867008554,
+ "trm_confidence": 0.12207420247613396,
+ "mcts_value": 0.934884085944662,
+ "consensus_score": 0.2729516512169948,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 3021,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3698200768977788,
+ "trm_confidence": 0.4605242469841963,
+ "mcts_value": 0.8016841206715465,
+ "consensus_score": 0.5209919709580493,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 1230,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6063193015724208,
+ "trm_confidence": 0.015857791395556153,
+ "mcts_value": 0.621716435099165,
+ "consensus_score": 0.42072620067370714,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1575,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.47712759383618397,
+ "trm_confidence": 0.0856861934113269,
+ "mcts_value": 0.843366698617572,
+ "consensus_score": 0.5037122487838062,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 556,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24115339617428277,
+ "trm_confidence": 0.5273167309174219,
+ "mcts_value": 0.7772207997433388,
+ "consensus_score": 0.5238051665640774,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 3470,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.32142392562430644,
+ "trm_confidence": 0.11908708393095507,
+ "mcts_value": 0.6680921202370974,
+ "consensus_score": 0.2734781653997501,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 1676,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.051226483406671686,
+ "trm_confidence": 0.3158319118734727,
+ "mcts_value": 0.6291845364640543,
+ "consensus_score": 0.2811010337150727,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3814,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.31411206359138144,
+ "trm_confidence": 0.2625795571528991,
+ "mcts_value": 0.7618972994685068,
+ "consensus_score": 0.35376953006622147,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 360,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.39260310105547147,
+ "trm_confidence": 0.06913540039949356,
+ "mcts_value": 0.640948132006049,
+ "consensus_score": 0.2881484759127241,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 3999,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0031143173662041,
+ "trm_confidence": 0.13508210923749067,
+ "mcts_value": 0.6550539140529478,
+ "consensus_score": 0.21159523105528902,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 102,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.28351870855068445,
+ "trm_confidence": 0.4987368986319067,
+ "mcts_value": 0.7623777589951879,
+ "consensus_score": 0.5402621918668641,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 1984,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.39537985421393407,
+ "trm_confidence": 0.09709532221116647,
+ "mcts_value": 0.7750447562620203,
+ "consensus_score": 0.38116134291072923,
+ "last_agent": "none",
+ "iteration": 6,
+ "query_length": 4513,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4571445090238169,
+ "trm_confidence": 0.3565729185311946,
+ "mcts_value": 0.7843045065167165,
+ "consensus_score": 0.4729652904733782,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3495,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.04349742889371404,
+ "trm_confidence": 0.18068504975469057,
+ "mcts_value": 0.7917218730871143,
+ "consensus_score": 0.39944153958867057,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 2940,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2960341540506385,
+ "trm_confidence": 0.29864449828811546,
+ "mcts_value": 0.8408351304006768,
+ "consensus_score": 0.4173046031450657,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 1658,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6199640440317623,
+ "trm_confidence": 0.25792774231741794,
+ "mcts_value": 0.6619457645715343,
+ "consensus_score": 0.4232269367444519,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 4463,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.34327889208174345,
+ "trm_confidence": 0.6694379580513925,
+ "mcts_value": 0.9667826343447138,
+ "consensus_score": 0.7348815233350119,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4262,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5295282585288805,
+ "trm_confidence": 0.40413730215330446,
+ "mcts_value": 0.7527501648675294,
+ "consensus_score": 0.6060753755312994,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2839,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2118643256670007,
+ "trm_confidence": 0.6903129267689389,
+ "mcts_value": 0.9830369736776687,
+ "consensus_score": 0.6040799449989437,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 3918,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3883777802635978,
+ "trm_confidence": 0.21499190556342018,
+ "mcts_value": 0.9139847653152711,
+ "consensus_score": 0.5274748910061812,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3172,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4677165338014153,
+ "trm_confidence": 0.5336812877933015,
+ "mcts_value": 0.9450656503012365,
+ "consensus_score": 0.6655198675925663,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 3227,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.064791257116833,
+ "trm_confidence": 0.3878136514494973,
+ "mcts_value": 0.7049164642875259,
+ "consensus_score": 0.40851230489962564,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 2647,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.44256067596182397,
+ "trm_confidence": 0.6990868447642679,
+ "mcts_value": 0.6049027207460108,
+ "consensus_score": 0.5410932845820328,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 4789,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17563905026679527,
+ "trm_confidence": 0.2428720177572709,
+ "mcts_value": 0.7701661899108189,
+ "consensus_score": 0.4238671003608342,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 679,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6425597511360087,
+ "trm_confidence": 0.28482017194039755,
+ "mcts_value": 0.6160125051197055,
+ "consensus_score": 0.4800928459389961,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3975,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.028807240036680212,
+ "trm_confidence": 0.10169072715622485,
+ "mcts_value": 0.9455299801055019,
+ "consensus_score": 0.28795394821004383,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 1272,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24066451373569545,
+ "trm_confidence": 0.5360884799176838,
+ "mcts_value": 0.7751073468262039,
+ "consensus_score": 0.5658005925109914,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 2241,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5429223745586418,
+ "trm_confidence": 0.48654437294048475,
+ "mcts_value": 0.7431240129661425,
+ "consensus_score": 0.6240343280929974,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1190,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.02963761542642319,
+ "trm_confidence": 0.6081220430624733,
+ "mcts_value": 0.735873490187804,
+ "consensus_score": 0.4743636632100025,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 1890,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5241162636843427,
+ "trm_confidence": 0.46632505648051575,
+ "mcts_value": 0.8893638012649838,
+ "consensus_score": 0.5673579013326266,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2080,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4556607463952512,
+ "trm_confidence": 0.6330503900641488,
+ "mcts_value": 0.8631025294335881,
+ "consensus_score": 0.7362659819170875,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 1645,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5733551887565057,
+ "trm_confidence": 0.45122373788403347,
+ "mcts_value": 0.7454935918237665,
+ "consensus_score": 0.5054773788721658,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3981,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6662827140882281,
+ "trm_confidence": 0.2994956771298266,
+ "mcts_value": 0.9537992592418003,
+ "consensus_score": 0.6966713983480932,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 126,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.697918453147421,
+ "trm_confidence": 0.02115344262594493,
+ "mcts_value": 0.6581350844908569,
+ "consensus_score": 0.47853617716369623,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 2610,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5051054710091956,
+ "trm_confidence": 0.15147969429688585,
+ "mcts_value": 0.7022510975876359,
+ "consensus_score": 0.518753072114909,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4041,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13226202532353212,
+ "trm_confidence": 0.16659054134282192,
+ "mcts_value": 0.9133653943335903,
+ "consensus_score": 0.45989340141352153,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 998,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6511273501213394,
+ "trm_confidence": 0.3933144844251073,
+ "mcts_value": 0.782212362475966,
+ "consensus_score": 0.5864203019824493,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 39,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.28815102182140645,
+ "trm_confidence": 0.21852605082475737,
+ "mcts_value": 0.7734939894749928,
+ "consensus_score": 0.3918626493212357,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 489,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14084314868811096,
+ "trm_confidence": 0.268945602243145,
+ "mcts_value": 0.8727266902585276,
+ "consensus_score": 0.35681294157532756,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 1968,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11439160427818405,
+ "trm_confidence": 0.15855390735182287,
+ "mcts_value": 0.7418622304225458,
+ "consensus_score": 0.4018918310559654,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 557,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13211278804007376,
+ "trm_confidence": 0.5091159917392016,
+ "mcts_value": 0.7324797627224844,
+ "consensus_score": 0.4516118533956279,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 645,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24738488351716395,
+ "trm_confidence": 0.2693562444969951,
+ "mcts_value": 0.6591013780288429,
+ "consensus_score": 0.38983348186601663,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 150,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4521926221421078,
+ "trm_confidence": 0.6654542154042549,
+ "mcts_value": 0.9729888205344299,
+ "consensus_score": 0.7603564601768946,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 742,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3100398441795226,
+ "trm_confidence": 0.460622617268879,
+ "mcts_value": 0.74292359315087,
+ "consensus_score": 0.48991505681969105,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1338,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1429320477494029,
+ "trm_confidence": 0.6607633266673058,
+ "mcts_value": 0.9319946595939554,
+ "consensus_score": 0.5096725046576749,
+ "last_agent": "none",
+ "iteration": 4,
+ "query_length": 1889,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.517131593916873,
+ "trm_confidence": 0.6723713247332932,
+ "mcts_value": 0.9274807310812058,
+ "consensus_score": 0.6597691423142072,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 2021,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.31192002079093006,
+ "trm_confidence": 0.004095015871525065,
+ "mcts_value": 0.8149594502277363,
+ "consensus_score": 0.39188393433789537,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4475,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.37127967759295266,
+ "trm_confidence": 0.12477220220576545,
+ "mcts_value": 0.9726623447572149,
+ "consensus_score": 0.5187609223686881,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 678,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07748641831384613,
+ "trm_confidence": 0.07435221386350471,
+ "mcts_value": 0.8741260969676969,
+ "consensus_score": 0.3899769556847119,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 1605,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.09897539965443619,
+ "trm_confidence": 0.36286645309387816,
+ "mcts_value": 0.9184595034591311,
+ "consensus_score": 0.379425550262937,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 3230,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.14541151475870714,
+ "trm_confidence": 0.4780262591640515,
+ "mcts_value": 0.9777923425290989,
+ "consensus_score": 0.4594016570260412,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 492,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.699204268274178,
+ "trm_confidence": 0.018148359663660893,
+ "mcts_value": 0.8292384565366064,
+ "consensus_score": 0.4155932454202992,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 4669,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6388077833881113,
+ "trm_confidence": 0.6163674834515231,
+ "mcts_value": 0.9535348944394526,
+ "consensus_score": 0.6648724705235719,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3148,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.46426661995738583,
+ "trm_confidence": 0.1660000042337644,
+ "mcts_value": 0.7250444271136965,
+ "consensus_score": 0.4164542020271105,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1780,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21438169265579282,
+ "trm_confidence": 0.016469676876294147,
+ "mcts_value": 0.9857296347539451,
+ "consensus_score": 0.3565071821891915,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 3180,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.43823421407178675,
+ "trm_confidence": 0.6124293493895462,
+ "mcts_value": 0.645086426531144,
+ "consensus_score": 0.5377537730249061,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 2388,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4140413718557169,
+ "trm_confidence": 0.07080024932932218,
+ "mcts_value": 0.9143867285802508,
+ "consensus_score": 0.47384688078716997,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3661,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6867783549341924,
+ "trm_confidence": 0.12323032392527893,
+ "mcts_value": 0.6440642384213293,
+ "consensus_score": 0.4232395505850826,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 3098,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17469002741858916,
+ "trm_confidence": 0.3005963681150697,
+ "mcts_value": 0.7774678297930802,
+ "consensus_score": 0.4522891119879232,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4265,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13872532157599726,
+ "trm_confidence": 0.6681974928109701,
+ "mcts_value": 0.7225384948710823,
+ "consensus_score": 0.5719325584429859,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 2384,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24272674878274395,
+ "trm_confidence": 0.58772811267745,
+ "mcts_value": 0.80752569966152,
+ "consensus_score": 0.5964265337467927,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1102,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0007363807922716958,
+ "trm_confidence": 0.6732710189801955,
+ "mcts_value": 0.7477560894079189,
+ "consensus_score": 0.483846978274696,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 2180,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2030334913415154,
+ "trm_confidence": 0.6801032874356393,
+ "mcts_value": 0.8202565843277612,
+ "consensus_score": 0.5921314702046956,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 680,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.04504428752245054,
+ "trm_confidence": 0.24516113759752034,
+ "mcts_value": 0.715144560196313,
+ "consensus_score": 0.27033614762577307,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 1430,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.23301783778983257,
+ "trm_confidence": 0.46925515268030815,
+ "mcts_value": 0.7211019432982609,
+ "consensus_score": 0.40431259810438114,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1304,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.12227880344444905,
+ "trm_confidence": 0.6529222532052206,
+ "mcts_value": 0.6404797783707098,
+ "consensus_score": 0.42131746934726383,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 4349,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6154667532299842,
+ "trm_confidence": 0.10362325407179326,
+ "mcts_value": 0.6776406769395111,
+ "consensus_score": 0.36932701209435204,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1739,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.08971358945199735,
+ "trm_confidence": 0.6875615806959464,
+ "mcts_value": 0.7303964181662499,
+ "consensus_score": 0.5379049220026138,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 3616,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2691948469916442,
+ "trm_confidence": 0.22703799524254756,
+ "mcts_value": 0.879296909595676,
+ "consensus_score": 0.5019016364555173,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2247,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20623477405137147,
+ "trm_confidence": 0.3426701595313342,
+ "mcts_value": 0.8274639271393289,
+ "consensus_score": 0.48153238358586203,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 3062,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0066050093393553826,
+ "trm_confidence": 0.3988879217624059,
+ "mcts_value": 0.8338452848863351,
+ "consensus_score": 0.43983349585107095,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 180,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33249821550006176,
+ "trm_confidence": 0.37215949992821973,
+ "mcts_value": 0.8297870637085434,
+ "consensus_score": 0.4249424071794077,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 2264,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.38958886100466183,
+ "trm_confidence": 0.04028922516076817,
+ "mcts_value": 0.819223843937053,
+ "consensus_score": 0.4359574126581124,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1477,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4976213779106428,
+ "trm_confidence": 0.2008256766826197,
+ "mcts_value": 0.6188499666382289,
+ "consensus_score": 0.4694400731677854,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 4359,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5849432813373271,
+ "trm_confidence": 0.42391415051634174,
+ "mcts_value": 0.7761809380865067,
+ "consensus_score": 0.5087604503757408,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 1394,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4301947589066826,
+ "trm_confidence": 0.007278671057098373,
+ "mcts_value": 0.8566582715646531,
+ "consensus_score": 0.452916335726504,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 3093,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21256089197690825,
+ "trm_confidence": 0.2948914923860667,
+ "mcts_value": 0.6235903081559845,
+ "consensus_score": 0.3129685723986355,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 761,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.46808633399039357,
+ "trm_confidence": 0.18380072281210846,
+ "mcts_value": 0.6452869132028206,
+ "consensus_score": 0.47467249283102203,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 914,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5578799743408257,
+ "trm_confidence": 0.5487661061438933,
+ "mcts_value": 0.7565794654347361,
+ "consensus_score": 0.5961871104646532,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 4110,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.05688528576326571,
+ "trm_confidence": 0.304222998883027,
+ "mcts_value": 0.832162734472168,
+ "consensus_score": 0.33430254268044207,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 664,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.2522673022310073,
+ "trm_confidence": 0.20211872466786202,
+ "mcts_value": 0.836309499455014,
+ "consensus_score": 0.3708157987302678,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 3976,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5227074134042587,
+ "trm_confidence": 0.2929715439256572,
+ "mcts_value": 0.6533044577724123,
+ "consensus_score": 0.3927404693595567,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 637,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48722867299686506,
+ "trm_confidence": 0.17999942599883725,
+ "mcts_value": 0.6839737252374378,
+ "consensus_score": 0.5287782484460767,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 3467,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6081947137154111,
+ "trm_confidence": 0.2531792725155051,
+ "mcts_value": 0.8460758153320737,
+ "consensus_score": 0.5789780492314458,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 3122,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4046425342306847,
+ "trm_confidence": 0.5281018359588862,
+ "mcts_value": 0.6659661634434454,
+ "consensus_score": 0.5069134633177373,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 2624,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4088338487835222,
+ "trm_confidence": 0.2943976070261187,
+ "mcts_value": 0.6693235039583664,
+ "consensus_score": 0.36055634085907,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 3216,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6244276612236238,
+ "trm_confidence": 0.3819486118099557,
+ "mcts_value": 0.907731983807261,
+ "consensus_score": 0.5591491781164382,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 2719,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.20376871458816245,
+ "trm_confidence": 0.08672342650185047,
+ "mcts_value": 0.8997910516490358,
+ "consensus_score": 0.4768974041987981,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 2961,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06804884644945763,
+ "trm_confidence": 0.044298543381425914,
+ "mcts_value": 0.9294697959298257,
+ "consensus_score": 0.3989925746828216,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 455,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5875125004474275,
+ "trm_confidence": 0.04033643008293962,
+ "mcts_value": 0.9615448783904088,
+ "consensus_score": 0.5530153846951598,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 4449,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.48023325325328375,
+ "trm_confidence": 0.0729001088879177,
+ "mcts_value": 0.9932021996304753,
+ "consensus_score": 0.45510478965721624,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 1130,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.07023052249153387,
+ "trm_confidence": 0.5343709379002983,
+ "mcts_value": 0.6393357067489559,
+ "consensus_score": 0.3275165787235706,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3365,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6170942399050675,
+ "trm_confidence": 0.45535938149292754,
+ "mcts_value": 0.9201799249273803,
+ "consensus_score": 0.6287184664962302,
+ "last_agent": "hrm",
+ "iteration": 10,
+ "query_length": 1562,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6204802934349398,
+ "trm_confidence": 0.47657581474524546,
+ "mcts_value": 0.7102043357819398,
+ "consensus_score": 0.6815545649292236,
+ "last_agent": "mcts",
+ "iteration": 5,
+ "query_length": 2450,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4676245308653993,
+ "trm_confidence": 0.6398424062285161,
+ "mcts_value": 0.8450968626801308,
+ "consensus_score": 0.6270051948636078,
+ "last_agent": "none",
+ "iteration": 9,
+ "query_length": 159,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6278599542833151,
+ "trm_confidence": 0.4760425988010662,
+ "mcts_value": 0.9151010875463841,
+ "consensus_score": 0.5865291244442012,
+ "last_agent": "none",
+ "iteration": 5,
+ "query_length": 4674,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6771893195888536,
+ "trm_confidence": 0.025759580280082583,
+ "mcts_value": 0.9874601893508905,
+ "consensus_score": 0.5399593299964925,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 1672,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.42664222096266324,
+ "trm_confidence": 0.1000087329786002,
+ "mcts_value": 0.6343635619799833,
+ "consensus_score": 0.38564670521397426,
+ "last_agent": "hrm",
+ "iteration": 5,
+ "query_length": 98,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6376250648109357,
+ "trm_confidence": 0.5212521749410738,
+ "mcts_value": 0.6320077702106438,
+ "consensus_score": 0.6833264663421412,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 3425,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.431414967236463,
+ "trm_confidence": 0.2460915708799335,
+ "mcts_value": 0.7433629170008382,
+ "consensus_score": 0.5228777985021462,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 796,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.36891543450466274,
+ "trm_confidence": 0.15641123171088717,
+ "mcts_value": 0.697120372098111,
+ "consensus_score": 0.31499351078862187,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 3342,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.06133603798457597,
+ "trm_confidence": 0.6968683253433593,
+ "mcts_value": 0.8895709118212334,
+ "consensus_score": 0.6481281384391293,
+ "last_agent": "none",
+ "iteration": 7,
+ "query_length": 660,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5162963128563078,
+ "trm_confidence": 0.6154580997694961,
+ "mcts_value": 0.6974274223222228,
+ "consensus_score": 0.6726605315321647,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 4994,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4834933691149103,
+ "trm_confidence": 0.09271597870094306,
+ "mcts_value": 0.9279959791115091,
+ "consensus_score": 0.4242386564553582,
+ "last_agent": "mcts",
+ "iteration": 9,
+ "query_length": 519,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6350832535941797,
+ "trm_confidence": 0.46171324980985295,
+ "mcts_value": 0.807750446876551,
+ "consensus_score": 0.6140783324445211,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 4002,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3950473148643869,
+ "trm_confidence": 0.3561326175534258,
+ "mcts_value": 0.9041101138745393,
+ "consensus_score": 0.5971434892949538,
+ "last_agent": "mcts",
+ "iteration": 8,
+ "query_length": 3863,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5284664924573852,
+ "trm_confidence": 0.359632315666183,
+ "mcts_value": 0.6250207145109178,
+ "consensus_score": 0.4193117982181257,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 2544,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5556592843548339,
+ "trm_confidence": 0.21017367604051104,
+ "mcts_value": 0.8094508678178738,
+ "consensus_score": 0.46547605311069346,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 1578,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5918639565147593,
+ "trm_confidence": 0.5046427330550736,
+ "mcts_value": 0.8783734409608674,
+ "consensus_score": 0.593090888657487,
+ "last_agent": "hrm",
+ "iteration": 6,
+ "query_length": 2486,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.17372031475182587,
+ "trm_confidence": 0.36777085299629275,
+ "mcts_value": 0.8746896798232204,
+ "consensus_score": 0.5304262511627166,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 4420,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.45735500062212225,
+ "trm_confidence": 0.3846274245379609,
+ "mcts_value": 0.7331467002465364,
+ "consensus_score": 0.4443217392962154,
+ "last_agent": "mcts",
+ "iteration": 10,
+ "query_length": 4435,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13890753564258362,
+ "trm_confidence": 0.2638956732793761,
+ "mcts_value": 0.7893444052789862,
+ "consensus_score": 0.3639475105300018,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 3220,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.622438379720267,
+ "trm_confidence": 0.6589407839330126,
+ "mcts_value": 0.7849101441121128,
+ "consensus_score": 0.6326577884340824,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 4804,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6045200488979079,
+ "trm_confidence": 0.016588181483114306,
+ "mcts_value": 0.9979445614554436,
+ "consensus_score": 0.48993697408678183,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 3119,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.35960297881693765,
+ "trm_confidence": 0.38811491739756926,
+ "mcts_value": 0.8325627905169265,
+ "consensus_score": 0.477250480649796,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 2855,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5141036811251816,
+ "trm_confidence": 0.471118384598279,
+ "mcts_value": 0.8584046895883358,
+ "consensus_score": 0.5809657554330427,
+ "last_agent": "hrm",
+ "iteration": 9,
+ "query_length": 2349,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.13660095437682052,
+ "trm_confidence": 0.445975889224767,
+ "mcts_value": 0.6513189291697111,
+ "consensus_score": 0.3691818176349714,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2369,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.33445363477286755,
+ "trm_confidence": 0.42516981740064813,
+ "mcts_value": 0.6662468722161602,
+ "consensus_score": 0.4105705631007749,
+ "last_agent": "none",
+ "iteration": 8,
+ "query_length": 4914,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.15644903475187527,
+ "trm_confidence": 0.5496442951602336,
+ "mcts_value": 0.7246232553575843,
+ "consensus_score": 0.5688682950106064,
+ "last_agent": "hrm",
+ "iteration": 7,
+ "query_length": 1908,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.11063520378319223,
+ "trm_confidence": 0.5899335993964124,
+ "mcts_value": 0.7042277265701904,
+ "consensus_score": 0.3852363074032394,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 4243,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5931454356934307,
+ "trm_confidence": 0.01928333490401931,
+ "mcts_value": 0.8070959148296653,
+ "consensus_score": 0.5286054450237737,
+ "last_agent": "hrm",
+ "iteration": 4,
+ "query_length": 719,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.04279058165206425,
+ "trm_confidence": 0.583271527160927,
+ "mcts_value": 0.8116704876951447,
+ "consensus_score": 0.3847070222502957,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 3913,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.6169720930483836,
+ "trm_confidence": 0.44480049095375845,
+ "mcts_value": 0.7859484495431563,
+ "consensus_score": 0.6406127620558477,
+ "last_agent": "trm",
+ "iteration": 6,
+ "query_length": 3837,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.0733583824003766,
+ "trm_confidence": 0.3703074200514973,
+ "mcts_value": 0.984468956358673,
+ "consensus_score": 0.43055863777047165,
+ "last_agent": "trm",
+ "iteration": 5,
+ "query_length": 2794,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.5440256379818951,
+ "trm_confidence": 0.6081611824354874,
+ "mcts_value": 0.9191736468715476,
+ "consensus_score": 0.6490340366787835,
+ "last_agent": "trm",
+ "iteration": 7,
+ "query_length": 3514,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21164017849959813,
+ "trm_confidence": 0.39304547328913053,
+ "mcts_value": 0.8987430492545462,
+ "consensus_score": 0.43500295675164846,
+ "last_agent": "trm",
+ "iteration": 9,
+ "query_length": 3651,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03215589797470598,
+ "trm_confidence": 0.6947210968425096,
+ "mcts_value": 0.9509811659011822,
+ "consensus_score": 0.5088419364607382,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 2991,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3868629492725526,
+ "trm_confidence": 0.1678673810126457,
+ "mcts_value": 0.707013079346613,
+ "consensus_score": 0.39933539882401803,
+ "last_agent": "trm",
+ "iteration": 8,
+ "query_length": 4828,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.4160259913511262,
+ "trm_confidence": 0.10340999377874809,
+ "mcts_value": 0.8770966001067517,
+ "consensus_score": 0.5287339820537985,
+ "last_agent": "mcts",
+ "iteration": 4,
+ "query_length": 2363,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.46720028290912374,
+ "trm_confidence": 0.2578597578154601,
+ "mcts_value": 0.6228235667727845,
+ "consensus_score": 0.4797042324734294,
+ "last_agent": "mcts",
+ "iteration": 6,
+ "query_length": 1202,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03190645756728122,
+ "trm_confidence": 0.30115856844220246,
+ "mcts_value": 0.7831844040664724,
+ "consensus_score": 0.4002960159816925,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 433,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.21620392844068265,
+ "trm_confidence": 0.010104361945315464,
+ "mcts_value": 0.8672223942746995,
+ "consensus_score": 0.2956293396370816,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 800,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.1684574328960351,
+ "trm_confidence": 0.540957317932618,
+ "mcts_value": 0.8777532055014203,
+ "consensus_score": 0.48635142172215373,
+ "last_agent": "hrm",
+ "iteration": 8,
+ "query_length": 3036,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.03276345927082932,
+ "trm_confidence": 0.3748118647412166,
+ "mcts_value": 0.7542224514574996,
+ "consensus_score": 0.4863477374654152,
+ "last_agent": "none",
+ "iteration": 10,
+ "query_length": 3405,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.24146628227404615,
+ "trm_confidence": 0.11219073612816403,
+ "mcts_value": 0.9022786037697009,
+ "consensus_score": 0.4518281475111357,
+ "last_agent": "mcts",
+ "iteration": 7,
+ "query_length": 683,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.23558057312119612,
+ "trm_confidence": 0.46696633168896406,
+ "mcts_value": 0.9309570648942034,
+ "consensus_score": 0.5177396166273129,
+ "last_agent": "trm",
+ "iteration": 4,
+ "query_length": 2024,
+ "has_rag_context": true
+ },
+ "label": "mcts"
+ },
+ {
+ "features": {
+ "hrm_confidence": 0.3382837374618919,
+ "trm_confidence": 0.07576130578580771,
+ "mcts_value": 0.9800137852816538,
+ "consensus_score": 0.5022516420516963,
+ "last_agent": "trm",
+ "iteration": 10,
+ "query_length": 2337,
+ "has_rag_context": false
+ },
+ "label": "mcts"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/models/bert_lora/training_results.json b/models/bert_lora/training_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..70ae829f521a3edaa80917dbc6f8000c96785282
--- /dev/null
+++ b/models/bert_lora/training_results.json
@@ -0,0 +1,48 @@
+{
+ "config": {
+ "model_name": "prajjwal1/bert-mini",
+ "lora_r": 4,
+ "lora_alpha": 16,
+ "lora_dropout": 0.1,
+ "lr": 0.001,
+ "batch_size": 16,
+ "epochs": 5,
+ "warmup_steps": 100,
+ "seed": 42,
+ "num_samples": 1000,
+ "data_path": null,
+ "balanced": true,
+ "output_dir": "models/bert_lora"
+ },
+ "train_history": {
+ "train_loss": 1.1033503922549162,
+ "train_runtime": 11.0946,
+ "train_samples_per_second": 315.018,
+ "epochs": 5,
+ "final_metrics": {
+ "train_runtime": 11.0946,
+ "train_samples_per_second": 315.018,
+ "train_steps_per_second": 19.829,
+ "total_flos": 34821822412800.0,
+ "train_loss": 1.1033503922549162,
+ "epoch": 5.0
+ },
+ "eval_results": {
+ "eval_loss": 1.0453400611877441,
+ "eval_accuracy": 0.47651006711409394,
+ "eval_runtime": 0.1251,
+ "eval_samples_per_second": 1191.171,
+ "eval_steps_per_second": 79.944,
+ "epoch": 5.0
+ }
+ },
+ "test_results": {
+ "loss": 1.0559743153338401,
+ "accuracy": 0.4768211920529801
+ },
+ "model_params": {
+ "total_params": 11188486,
+ "trainable_params": 17155,
+ "trainable_percentage": 0.15
+ }
+}
\ No newline at end of file
diff --git a/models/rnn_meta_controller.history.json b/models/rnn_meta_controller.history.json
new file mode 100644
index 0000000000000000000000000000000000000000..4e2219b882cb4f47e91da96dee8c241e00ff12c2
--- /dev/null
+++ b/models/rnn_meta_controller.history.json
@@ -0,0 +1,128 @@
+{
+ "config": {
+ "hidden_dim": 64,
+ "num_layers": 1,
+ "dropout": 0.1,
+ "lr": 0.001,
+ "batch_size": 32,
+ "epochs": 20,
+ "patience": 5,
+ "seed": 42,
+ "num_samples": 1000
+ },
+ "training_history": {
+ "train_losses": [
+ 1.060307163180727,
+ 0.9014069383794611,
+ 0.6105747597687172,
+ 0.35656250968123926,
+ 0.22574858390020602,
+ 0.16157509059165465,
+ 0.12456387586214325,
+ 0.10158110240643675,
+ 0.08592396827809738,
+ 0.07474524908783761,
+ 0.06479036057311477,
+ 0.057878461638183304,
+ 0.052609961931452606,
+ 0.04809149278497154,
+ 0.043710527828697,
+ 0.041286276738074695,
+ 0.03756282673302022,
+ 0.03491098284156936,
+ 0.031911260236731985,
+ 0.030496817025722878
+ ],
+ "val_losses": [
+ 1.0059996803601583,
+ 0.7808501919110616,
+ 0.47826388080914817,
+ 0.29279296696186063,
+ 0.2008462185660998,
+ 0.1529717780649662,
+ 0.12299496456980705,
+ 0.10291122049093246,
+ 0.08860023791591326,
+ 0.07790809428940217,
+ 0.06982718824098508,
+ 0.06387854401643077,
+ 0.05984275036801894,
+ 0.05463591649507483,
+ 0.04938021237030625,
+ 0.0452831008626769,
+ 0.04252756762628754,
+ 0.039516554485696055,
+ 0.038632405494960644,
+ 0.035608950459087886
+ ],
+ "val_accuracies": [
+ 0.8466666666666667,
+ 0.92,
+ 0.9822222222222222,
+ 0.9933333333333333,
+ 0.9911111111111112,
+ 0.9933333333333333,
+ 0.9955555555555555,
+ 0.9955555555555555,
+ 0.9955555555555555,
+ 0.9955555555555555,
+ 0.9955555555555555,
+ 0.9977777777777778,
+ 0.9933333333333333,
+ 0.9933333333333333,
+ 0.9977777777777778,
+ 0.9977777777777778,
+ 0.9977777777777778,
+ 0.9977777777777778,
+ 0.9955555555555555,
+ 0.9977777777777778
+ ],
+ "best_epoch": 20,
+ "best_val_loss": 0.035608950459087886,
+ "best_val_accuracy": 0.9977777777777778,
+ "stopped_early": false,
+ "total_epochs": 20
+ },
+ "test_results": {
+ "loss": 0.022989434589787076,
+ "accuracy": 0.9977777777777778,
+ "per_class_metrics": {
+ "hrm": {
+ "precision": 1.0,
+ "recall": 1.0,
+ "f1_score": 1.0,
+ "support": 153
+ },
+ "trm": {
+ "precision": 0.9933774834437086,
+ "recall": 1.0,
+ "f1_score": 0.9966777408637874,
+ "support": 150
+ },
+ "mcts": {
+ "precision": 1.0,
+ "recall": 0.9931972789115646,
+ "f1_score": 0.9965870307167235,
+ "support": 147
+ }
+ },
+ "confusion_matrix": [
+ [
+ 153,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 150,
+ 0
+ ],
+ [
+ 0,
+ 1,
+ 146
+ ]
+ ],
+ "total_samples": 450
+ }
+}
\ No newline at end of file
diff --git a/models/rnn_meta_controller.pt b/models/rnn_meta_controller.pt
new file mode 100644
index 0000000000000000000000000000000000000000..ecac32abb9cde0a5c8899762752f8d206b36c47d
Binary files /dev/null and b/models/rnn_meta_controller.pt differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a59c1caf3390642ffa3007ad4cf87067d771a6c2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,28 @@
+# LangGraph Multi-Agent MCTS Demo - Dependencies
+# Optimized for Hugging Face Spaces deployment with trained models
+
+# Core UI Framework
+gradio>=4.0.0,<5.0.0
+
+# Numerical computation
+numpy>=1.24.0,<2.0.0
+
+# Machine Learning - Neural Models
+torch>=2.1.0
+transformers>=4.40.0
+peft>=0.7.0
+sentence-transformers>=2.2.0
+
+# Configuration
+pyyaml>=6.0
+
+# Experiment Tracking
+wandb>=0.16.0
+
+# Required for Gradio OAuth and model loading
+huggingface_hub>=0.20.0,<0.30.0
+
+# Note: This demo now uses REAL trained models:
+# - RNN Meta-Controller (models/rnn_meta_controller.pt)
+# - BERT with LoRA adapters (models/bert_lora/final_model/)
+# - Actual HRM and TRM agent implementations
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/adapters/__init__.py b/src/adapters/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fed06e8d006ca2b3347dd740f88d0f7518f20d1
--- /dev/null
+++ b/src/adapters/__init__.py
@@ -0,0 +1,7 @@
+"""
+Adapters package for external service integrations.
+"""
+
+from .llm import BaseLLMClient, LLMResponse, create_client
+
+__all__ = ["create_client", "BaseLLMClient", "LLMResponse"]
diff --git a/src/adapters/llm/__init__.py b/src/adapters/llm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d176cb1b0d35d0bc2d82a2132318408ccde2c0b9
--- /dev/null
+++ b/src/adapters/llm/__init__.py
@@ -0,0 +1,257 @@
+"""
+LLM Client Factory and Provider Registry.
+
+This module provides a factory function to instantiate the correct LLM client
+based on provider settings, with lazy loading of adapters.
+"""
+
+import importlib
+import logging
+from typing import Any
+
+from .base import BaseLLMClient, LLMClient, LLMResponse, LLMToolResponse, ToolCall
+from .exceptions import (
+ CircuitBreakerOpenError,
+ LLMAuthenticationError,
+ LLMClientError,
+ LLMConnectionError,
+ LLMContentFilterError,
+ LLMContextLengthError,
+ LLMInvalidRequestError,
+ LLMModelNotFoundError,
+ LLMQuotaExceededError,
+ LLMRateLimitError,
+ LLMResponseParseError,
+ LLMServerError,
+ LLMStreamError,
+ LLMTimeoutError,
+)
+
+logger = logging.getLogger(__name__)
+
+# Provider registry with lazy loading
+# Maps provider name to (module_path, class_name)
+_PROVIDER_REGISTRY: dict[str, tuple[str, str]] = {
+ "openai": ("src.adapters.llm.openai_client", "OpenAIClient"),
+ "anthropic": ("src.adapters.llm.anthropic_client", "AnthropicClient"),
+ "lmstudio": ("src.adapters.llm.lmstudio_client", "LMStudioClient"),
+ "local": ("src.adapters.llm.lmstudio_client", "LMStudioClient"), # Alias
+}
+
+# Cache for loaded client classes
+_CLIENT_CACHE: dict[str, type[BaseLLMClient]] = {}
+
+
+def register_provider(name: str, module_path: str, class_name: str, override: bool = False) -> None:
+ """
+ Register a new LLM provider.
+
+ Args:
+ name: Provider identifier (e.g., "azure", "bedrock")
+ module_path: Full module path (e.g., "src.adapters.llm.azure_client")
+ class_name: Class name in the module (e.g., "AzureOpenAIClient")
+        override: If True, allow overriding an existing provider
+
+    Raises:
+        ValueError: If the provider is already registered and override is False
+    """
+ if name in _PROVIDER_REGISTRY and not override:
+ raise ValueError(f"Provider '{name}' already registered. Use override=True to replace.")
+
+ _PROVIDER_REGISTRY[name] = (module_path, class_name)
+ # Clear cache if overriding
+ if name in _CLIENT_CACHE:
+ del _CLIENT_CACHE[name]
+
+ logger.info(f"Registered LLM provider: {name} -> {module_path}.{class_name}")
+
+
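+# Example (hypothetical adapter): the module path, class, and model name below
+# are illustrative assumptions, not code shipped with this package:
+#
+#   register_provider("azure", "src.adapters.llm.azure_client", "AzureOpenAIClient")
+#   client = create_client("azure", model="gpt-4o")
+
+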
+def list_providers() -> list[str]:
+ """
+ List all registered provider names.
+
+ Returns:
+ List of provider identifiers
+ """
+ return list(_PROVIDER_REGISTRY.keys())
+
+
+def get_provider_class(provider: str) -> type[BaseLLMClient]:
+ """
+ Get the client class for a provider (with lazy loading).
+
+ Args:
+ provider: Provider identifier
+
+ Returns:
+ Client class (not instantiated)
+
+ Raises:
+ ValueError: If provider not registered
+ ImportError: If module cannot be loaded
+ """
+ if provider not in _PROVIDER_REGISTRY:
+ available = ", ".join(list_providers())
+ raise ValueError(f"Unknown provider '{provider}'. Available: {available}")
+
+ # Check cache first
+ if provider in _CLIENT_CACHE:
+ return _CLIENT_CACHE[provider]
+
+ # Lazy load the module
+ module_path, class_name = _PROVIDER_REGISTRY[provider]
+
+ try:
+ module = importlib.import_module(module_path)
+ client_class = getattr(module, class_name)
+ except ImportError as e:
+ raise ImportError(f"Failed to load provider '{provider}': {e}") from e
+ except AttributeError as e:
+ raise ImportError(f"Class '{class_name}' not found in module '{module_path}'") from e
+
+ # Cache for future use
+ _CLIENT_CACHE[provider] = client_class
+ return client_class
+
+
+def create_client(
+ provider: str = "openai",
+ *,
+ api_key: str | None = None,
+ model: str | None = None,
+ base_url: str | None = None,
+ timeout: float | None = None,
+ max_retries: int | None = None,
+ **kwargs: Any,
+) -> BaseLLMClient:
+ """
+ Create an LLM client instance.
+
+ This is the main factory function for creating provider clients.
+
+ Args:
+ provider: Provider name ("openai", "anthropic", "lmstudio", etc.)
+ api_key: API key (may be optional for some providers)
+ model: Model identifier
+ base_url: Base URL for API
+ timeout: Request timeout in seconds
+ max_retries: Maximum retry attempts
+ **kwargs: Provider-specific parameters
+
+ Returns:
+ Configured LLMClient instance
+
+ Examples:
+ # OpenAI client
+ client = create_client("openai", model="gpt-4-turbo-preview")
+
+ # Anthropic client
+ client = create_client("anthropic", model="sonnet")
+
+ # Local LM Studio
+ client = create_client("lmstudio", base_url="http://localhost:1234/v1")
+
+ # With custom settings
+ client = create_client(
+ "openai",
+ api_key="sk-...",
+ timeout=120.0,
+ max_retries=5,
+ organization="org-..."
+ )
+ """
+ client_class = get_provider_class(provider)
+
+ # Build kwargs for client initialization
+ init_kwargs = {**kwargs}
+
+ if api_key is not None:
+ init_kwargs["api_key"] = api_key
+ if model is not None:
+ init_kwargs["model"] = model
+ if base_url is not None:
+ init_kwargs["base_url"] = base_url
+ if timeout is not None:
+ init_kwargs["timeout"] = timeout
+ if max_retries is not None:
+ init_kwargs["max_retries"] = max_retries
+
+ logger.info(f"Creating {provider} client with model={model or 'default'}")
+
+ return client_class(**init_kwargs)
+
+
+def create_client_from_config(config: dict[str, Any]) -> BaseLLMClient:
+ """
+ Create an LLM client from a configuration dictionary.
+
+ Useful for loading settings from YAML/JSON config files.
+
+ Args:
+ config: Configuration dictionary with keys:
+ - provider: Required provider name
+ - Other keys passed to create_client
+
+ Returns:
+ Configured LLMClient instance
+
+ Example:
+ config = {
+ "provider": "openai",
+ "model": "gpt-4-turbo-preview",
+ "timeout": 60.0,
+ "max_retries": 3
+ }
+ client = create_client_from_config(config)
+ """
+ config = config.copy()
+ provider = config.pop("provider", "openai")
+ return create_client(provider, **config)
+
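+# Illustrative config loading; assumes the optional PyYAML dependency and a
+# hypothetical config/llm.yaml containing the keys shown in the docstring:
+#
+#     import yaml
+#     with open("config/llm.yaml") as f:
+#         client = create_client_from_config(yaml.safe_load(f))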
+
+# Convenience aliases for common use cases
+def create_openai_client(**kwargs) -> BaseLLMClient:
+ """Create an OpenAI client."""
+ return create_client("openai", **kwargs)
+
+
+def create_anthropic_client(**kwargs) -> BaseLLMClient:
+ """Create an Anthropic Claude client."""
+ return create_client("anthropic", **kwargs)
+
+
+def create_local_client(**kwargs) -> BaseLLMClient:
+ """Create a local LM Studio client."""
+ return create_client("lmstudio", **kwargs)
+
+
+__all__ = [
+ # Base types
+ "LLMClient",
+ "LLMResponse",
+ "LLMToolResponse",
+ "ToolCall",
+ "BaseLLMClient",
+ # Exceptions
+ "LLMClientError",
+ "LLMAuthenticationError",
+ "LLMRateLimitError",
+ "LLMQuotaExceededError",
+ "LLMModelNotFoundError",
+ "LLMContextLengthError",
+ "LLMInvalidRequestError",
+ "LLMTimeoutError",
+ "LLMConnectionError",
+ "LLMServerError",
+ "LLMResponseParseError",
+ "LLMStreamError",
+ "LLMContentFilterError",
+ "CircuitBreakerOpenError",
+ # Factory functions
+ "create_client",
+ "create_client_from_config",
+ "create_openai_client",
+ "create_anthropic_client",
+ "create_local_client",
+ # Registry functions
+ "register_provider",
+ "list_providers",
+ "get_provider_class",
+]
diff --git a/src/adapters/llm/anthropic_client.py b/src/adapters/llm/anthropic_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f7482e47d385262879b3cadd767fba001264f6
--- /dev/null
+++ b/src/adapters/llm/anthropic_client.py
@@ -0,0 +1,521 @@
+"""
+Anthropic Claude LLM client adapter.
+
+Implements the LLMClient protocol for Anthropic's Messages API.
+Supports Claude 3 models with proper content block handling.
+"""
+
+import json
+import logging
+from collections.abc import AsyncIterator
+from typing import Any
+
+import httpx
+from tenacity import (
+ before_sleep_log,
+ retry,
+ retry_if_exception_type,
+ stop_after_attempt,
+ wait_exponential,
+)
+
+from .base import BaseLLMClient, LLMResponse, LLMToolResponse, ToolCall
+from .exceptions import (
+ CircuitBreakerOpenError,
+ LLMAuthenticationError,
+ LLMClientError,
+ LLMConnectionError,
+ LLMContentFilterError,
+ LLMContextLengthError,
+ LLMInvalidRequestError,
+ LLMModelNotFoundError,
+ LLMQuotaExceededError,
+ LLMRateLimitError,
+ LLMResponseParseError,
+ LLMServerError,
+ LLMStreamError,
+ LLMTimeoutError,
+)
+from .openai_client import CircuitBreaker
+
+logger = logging.getLogger(__name__)
+
+
+# Model mappings for convenience
+ANTHROPIC_MODELS = {
+ "claude-3-opus": "claude-3-opus-20240229",
+ "claude-3-sonnet": "claude-3-sonnet-20240229",
+ "claude-3-haiku": "claude-3-haiku-20240307",
+ "claude-3.5-sonnet": "claude-3-5-sonnet-20240620",
+ "claude-3.5-sonnet-v2": "claude-3-5-sonnet-20241022",
+ "claude-sonnet-4": "claude-sonnet-4-20250514",
+ # Short aliases that resolve to the latest pinned versions
+ "opus": "claude-3-opus-20240229",
+ "sonnet": "claude-3-5-sonnet-20241022",
+ "haiku": "claude-3-haiku-20240307",
+}
+
+
+class AnthropicClient(BaseLLMClient):
+ """
+ Anthropic Claude API client.
+
+ Features:
+ - Messages API support (not legacy completion API)
+ - Content block handling (text, tool_use)
+ - Streaming with proper SSE parsing
+ - Model alias mapping
+ - System prompt support
+ - Tool/function calling (beta)
+ """
+
+ PROVIDER_NAME = "anthropic"
+ DEFAULT_BASE_URL = "https://api.anthropic.com"
+ DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
+ API_VERSION = "2023-06-01"
+
+ def __init__(
+ self,
+ api_key: str | None = None,
+ model: str | None = None,
+ base_url: str | None = None,
+ timeout: float = 120.0, # Claude can be slower
+ max_retries: int = 3,
+ # Circuit breaker settings
+ circuit_breaker_threshold: int = 5,
+ circuit_breaker_reset: float = 60.0,
+ # Rate limiting
+ rate_limit_per_minute: int | None = None,
+ ):
+ """
+ Initialize Anthropic client.
+
+ Args:
+ api_key: Anthropic API key (or set ANTHROPIC_API_KEY env var)
+ model: Model to use (supports aliases like 'sonnet', 'opus')
+ base_url: API base URL
+ timeout: Request timeout in seconds (default longer for Claude)
+ max_retries: Max retry attempts
+ circuit_breaker_threshold: Failures before circuit opens
+ circuit_breaker_reset: Seconds before circuit resets
+ rate_limit_per_minute: Rate limit for requests per minute (None to disable)
+ """
+ import os
+
+ api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+ if not api_key:
+ raise LLMAuthenticationError(self.PROVIDER_NAME, "API key not provided and ANTHROPIC_API_KEY not set")
+
+ # Resolve model alias
+ model_name = model or self.DEFAULT_MODEL
+ resolved_model = ANTHROPIC_MODELS.get(model_name, model_name)
+
+ super().__init__(
+ api_key=api_key,
+ model=resolved_model,
+ base_url=base_url or self.DEFAULT_BASE_URL,
+ timeout=timeout,
+ max_retries=max_retries,
+ rate_limit_per_minute=rate_limit_per_minute,
+ )
+
+ self.circuit_breaker = CircuitBreaker(
+ failure_threshold=circuit_breaker_threshold,
+ reset_timeout=circuit_breaker_reset,
+ )
+ self._client: httpx.AsyncClient | None = None
+
+ async def _get_client(self) -> httpx.AsyncClient:
+ """Get or create the HTTP client."""
+ if self._client is None or self._client.is_closed:
+ headers = {
+ "x-api-key": self.api_key,
+ "anthropic-version": self.API_VERSION,
+ "Content-Type": "application/json",
+ }
+
+ self._client = httpx.AsyncClient(
+ base_url=self.base_url,
+ headers=headers,
+ timeout=httpx.Timeout(self.timeout),
+ )
+ return self._client
+
+ def _convert_messages_to_anthropic(self, messages: list[dict]) -> tuple[str | None, list[dict]]:
+ """
+ Convert OpenAI-style messages to Anthropic format.
+
+ Returns:
+ Tuple of (system_prompt, messages)
+ """
+ system_prompt = None
+ anthropic_messages = []
+
+ for msg in messages:
+ role = msg.get("role", "user")
+ content = msg.get("content", "")
+
+ if role == "system":
+ # Anthropic uses separate system parameter
+ system_prompt = content
+ elif role == "assistant":
+ anthropic_messages.append({"role": "assistant", "content": content})
+ elif role == "user":
+ anthropic_messages.append({"role": "user", "content": content})
+ elif role == "tool":
+ # Tool result message
+ anthropic_messages.append(
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "tool_result",
+ "tool_use_id": msg.get("tool_call_id", ""),
+ "content": content,
+ }
+ ],
+ }
+ )
+
+ return system_prompt, anthropic_messages
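+
+ # Illustrative conversion (values invented for this sketch):
+ # [{"role": "system", "content": "Be brief"}, {"role": "user", "content": "Hi"}]
+ # -> system_prompt="Be brief", anthropic_messages=[{"role": "user", "content": "Hi"}]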
+
+ def _convert_tools_to_anthropic(self, tools: list[dict]) -> list[dict]:
+ """Convert OpenAI-style tool definitions to Anthropic format."""
+ anthropic_tools = []
+
+ for tool in tools:
+ if tool.get("type") == "function":
+ func = tool["function"]
+ anthropic_tools.append(
+ {
+ "name": func["name"],
+ "description": func.get("description", ""),
+ "input_schema": func.get("parameters", {"type": "object"}),
+ }
+ )
+ else:
+ # Already in Anthropic format
+ anthropic_tools.append(tool)
+
+ return anthropic_tools
+
+ def _handle_error_response(self, response: httpx.Response) -> None:
+ """Convert HTTP error responses to appropriate exceptions."""
+ status_code = response.status_code
+
+ try:
+ error_data = response.json()
+ error_type = error_data.get("error", {}).get("type", "")
+ error_message = error_data.get("error", {}).get("message", response.text)
+ except Exception:
+ error_type = ""
+ error_message = response.text
+
+ if status_code == 401:
+ raise LLMAuthenticationError(self.PROVIDER_NAME, error_message)
+ elif status_code == 429:
+ retry_after = response.headers.get("retry-after")
+ retry_after_float = float(retry_after) if retry_after else None
+ raise LLMRateLimitError(self.PROVIDER_NAME, retry_after=retry_after_float, message=error_message)
+ elif status_code == 402 or "billing" in error_type.lower():
+ raise LLMQuotaExceededError(self.PROVIDER_NAME, error_message)
+ elif status_code == 404 or error_type == "not_found_error":
+ raise LLMModelNotFoundError(self.PROVIDER_NAME, self.model)
+ elif status_code == 400:
+ if "context" in error_message.lower() or "token" in error_message.lower():
+ raise LLMContextLengthError(self.PROVIDER_NAME)
+ if "content_policy" in error_type or "safety" in error_message.lower():
+ raise LLMContentFilterError(self.PROVIDER_NAME, error_message)
+ raise LLMInvalidRequestError(self.PROVIDER_NAME, error_message)
+ elif status_code >= 500:
+ raise LLMServerError(self.PROVIDER_NAME, status_code, error_message)
+ else:
+ raise LLMClientError(error_message, self.PROVIDER_NAME, status_code=status_code)
+
+ def _make_retry_decorator(self):
+ """Create retry decorator with exponential backoff."""
+ return retry(
+ stop=stop_after_attempt(self.max_retries),
+ wait=wait_exponential(multiplier=1, min=2, max=120),
+ retry=retry_if_exception_type((LLMRateLimitError, LLMServerError, LLMConnectionError)),
+ before_sleep=before_sleep_log(logger, logging.WARNING),
+ reraise=True,
+ )
+
+ async def generate(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stream: bool = False,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse | AsyncIterator[str]:
+ """
+ Generate a response from Anthropic Claude.
+
+ Args:
+ messages: Chat messages (will be converted to Anthropic format)
+ prompt: Simple string prompt
+ temperature: Sampling temperature (0.0 to 1.0 for Claude)
+ max_tokens: Maximum tokens to generate (required for Anthropic)
+ tools: Tool definitions (will be converted to Anthropic format)
+ stream: If True, returns AsyncIterator
+ stop: Stop sequences
+ **kwargs: Additional parameters (top_p, top_k, etc.)
+
+ Returns:
+ LLMResponse or AsyncIterator[str] for streaming
+ """
+ # Apply rate limiting before proceeding
+ await self._apply_rate_limit()
+
+ # Check circuit breaker
+ if not self.circuit_breaker.can_execute():
+ raise CircuitBreakerOpenError(
+ self.PROVIDER_NAME,
+ self.circuit_breaker.failure_count,
+ self.circuit_breaker.get_reset_time(),
+ )
+
+ # Anthropic requires max_tokens
+ if max_tokens is None:
+ max_tokens = 4096 # Sensible default
+
+ if stream:
+ return await self._generate_stream(  # await yields the AsyncIterator itself
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+ else:
+ return await self._generate_non_stream(
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+
+ async def _generate_non_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int = 4096,
+ tools: list[dict] | None = None,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse:
+ """Non-streaming generation with retry logic."""
+
+ @self._make_retry_decorator()
+ async def _request():
+ client = await self._get_client()
+
+ # Convert messages
+ built_messages = self._build_messages(messages, prompt)
+ system_prompt, anthropic_messages = self._convert_messages_to_anthropic(built_messages)
+
+ # Build request payload
+ payload = {
+ "model": self.model,
+ "messages": anthropic_messages,
+ "max_tokens": max_tokens,
+ "temperature": min(temperature, 1.0), # Anthropic max is 1.0
+ }
+
+ if system_prompt:
+ payload["system"] = system_prompt
+ if stop:
+ payload["stop_sequences"] = stop
+ if tools:
+ payload["tools"] = self._convert_tools_to_anthropic(tools)
+
+ # Add any additional kwargs (top_p, top_k, etc.)
+ for key in ["top_p", "top_k", "metadata"]:
+ if key in kwargs:
+ payload[key] = kwargs[key]
+
+ try:
+ response = await client.post("/v1/messages", json=payload)
+ except httpx.TimeoutException:
+ raise LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ except httpx.ConnectError:
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+
+ if response.status_code != 200:
+ self._handle_error_response(response)
+
+ return response
+
+ try:
+ response = await _request()
+ self.circuit_breaker.record_success()
+ except Exception:
+ self.circuit_breaker.record_failure()
+ raise
+
+ # Parse response
+ try:
+ data = response.json()
+
+ # Extract text from content blocks
+ text_parts = []
+ tool_calls = []
+
+ for block in data.get("content", []):
+ if block.get("type") == "text":
+ text_parts.append(block.get("text", ""))
+ elif block.get("type") == "tool_use":
+ tool_calls.append(
+ ToolCall(
+ id=block.get("id", ""),
+ name=block.get("name", ""),
+ arguments=block.get("input", {}),
+ type="tool_use",
+ )
+ )
+
+ text = "\n".join(text_parts)
+
+ # Build usage dict
+ usage = {
+ "prompt_tokens": data.get("usage", {}).get("input_tokens", 0),
+ "completion_tokens": data.get("usage", {}).get("output_tokens", 0),
+ }
+ usage["total_tokens"] = usage["prompt_tokens"] + usage["completion_tokens"]
+
+ finish_reason = data.get("stop_reason", "stop")
+
+ if tool_calls:
+ llm_response = LLMToolResponse(
+ text=text,
+ usage=usage,
+ model=data.get("model", self.model),
+ raw_response=data,
+ finish_reason=finish_reason,
+ tool_calls=tool_calls,
+ )
+ else:
+ llm_response = LLMResponse(
+ text=text,
+ usage=usage,
+ model=data.get("model", self.model),
+ raw_response=data,
+ finish_reason=finish_reason,
+ )
+
+ self._update_stats(llm_response)
+ return llm_response
+
+ except (KeyError, json.JSONDecodeError) as e:
+ raise LLMResponseParseError(self.PROVIDER_NAME, response.text) from e
+
+ async def _generate_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int = 4096,
+ tools: list[dict] | None = None,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> AsyncIterator[str]:
+ """Streaming generation with Server-Sent Events."""
+
+ client = await self._get_client()
+
+ # Convert messages
+ built_messages = self._build_messages(messages, prompt)
+ system_prompt, anthropic_messages = self._convert_messages_to_anthropic(built_messages)
+
+ # Build request payload
+ payload = {
+ "model": self.model,
+ "messages": anthropic_messages,
+ "max_tokens": max_tokens,
+ "temperature": min(temperature, 1.0),
+ "stream": True,
+ }
+
+ if system_prompt:
+ payload["system"] = system_prompt
+ if stop:
+ payload["stop_sequences"] = stop
+ if tools:
+ payload["tools"] = self._convert_tools_to_anthropic(tools)
+
+ for key in ["top_p", "top_k"]:
+ if key in kwargs:
+ payload[key] = kwargs[key]
+
+ async def stream_generator():
+ try:
+ async with client.stream("POST", "/v1/messages", json=payload) as response:
+ if response.status_code != 200:
+ await response.aread()
+ self._handle_error_response(response)
+
+ async for line in response.aiter_lines():
+ if not line.strip():
+ continue
+
+ if line.startswith("event:"):
+ event_type = line[6:].strip()
+ continue
+
+ if line.startswith("data:"):
+ data_str = line[5:].strip()
+ if not data_str:
+ continue
+
+ try:
+ data = json.loads(data_str)
+ event_type = data.get("type", "")
+
+ if event_type == "content_block_delta":
+ delta = data.get("delta", {})
+ if delta.get("type") == "text_delta":
+ text = delta.get("text", "")
+ if text:
+ yield text
+
+ elif event_type == "message_stop":
+ break
+
+ except json.JSONDecodeError:
+ continue
+
+ self.circuit_breaker.record_success()
+
+ except httpx.TimeoutException:
+ self.circuit_breaker.record_failure()
+ raise LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ except httpx.ConnectError:
+ self.circuit_breaker.record_failure()
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+ except Exception as e:
+ self.circuit_breaker.record_failure()
+ if isinstance(e, LLMClientError):
+ raise
+ raise LLMStreamError(self.PROVIDER_NAME, str(e)) from e
+
+ return stream_generator()
+
+ async def close(self) -> None:
+ """Close the HTTP client."""
+ if self._client and not self._client.is_closed:
+ await self._client.aclose()
+ self._client = None
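+
+# Illustrative streaming usage; the "sonnet" alias resolves through
+# ANTHROPIC_MODELS, and stream=True yields an async iterator of text chunks:
+#
+#     client = AnthropicClient(model="sonnet")
+#     stream = await client.generate(prompt="Tell me a story", stream=True)
+#     async for chunk in stream:
+#         print(chunk, end="", flush=True)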
diff --git a/src/adapters/llm/base.py b/src/adapters/llm/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..31860b77004722a0d8f6f41c6fa9d59970140cd1
--- /dev/null
+++ b/src/adapters/llm/base.py
@@ -0,0 +1,305 @@
+"""
+Base LLM client interface for provider-agnostic model access.
+
+This module defines the protocol and data structures for LLM clients,
+enabling seamless switching between providers (OpenAI, Anthropic, LM Studio, etc.).
+"""
+
+import asyncio
+import time
+from abc import ABC, abstractmethod
+from collections.abc import AsyncIterator
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Protocol, runtime_checkable
+
+
+@dataclass
+class LLMResponse:
+ """Standardized response from any LLM provider."""
+
+ text: str
+ usage: dict = field(default_factory=dict)
+ model: str = ""
+ raw_response: Any = None
+ finish_reason: str = "stop"
+ created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+ @property
+ def total_tokens(self) -> int:
+ """Total tokens used in request/response."""
+ return self.usage.get("total_tokens", 0)
+
+ @property
+ def prompt_tokens(self) -> int:
+ """Tokens used in prompt."""
+ return self.usage.get("prompt_tokens", 0)
+
+ @property
+ def completion_tokens(self) -> int:
+ """Tokens used in completion."""
+ return self.usage.get("completion_tokens", 0)
+
+
+@dataclass
+class ToolCall:
+ """Represents a tool/function call from the LLM."""
+
+ id: str
+ name: str
+ arguments: dict
+ type: str = "function"
+
+
+@dataclass
+class LLMToolResponse(LLMResponse):
+ """Response containing tool calls."""
+
+ tool_calls: list[ToolCall] = field(default_factory=list)
+
+
+class TokenBucketRateLimiter:
+ """
+ Token bucket rate limiter for controlling request rates.
+
+ This implementation uses a token bucket algorithm where:
+ - Tokens are added at a fixed rate (rate_per_second)
+ - Each request consumes one token
+ - If no tokens available, caller waits until one becomes available
+ """
+
+ def __init__(self, rate_per_minute: int = 60):
+ """
+ Initialize the rate limiter.
+
+ Args:
+ rate_per_minute: Maximum requests allowed per minute
+ """
+ self.rate_per_second = rate_per_minute / 60.0
+ self.max_tokens = float(rate_per_minute)
+ self.tokens = self.max_tokens
+ self.last_refill = time.monotonic()
+ self._lock = asyncio.Lock()
+ self._wait_count = 0
+ self._total_wait_time = 0.0
+
+ async def acquire(self) -> float:
+ """
+ Acquire a token, waiting if necessary.
+
+ Returns:
+ Time spent waiting (0.0 if no wait was needed)
+ """
+ async with self._lock:
+ now = time.monotonic()
+ elapsed = now - self.last_refill
+
+ # Refill tokens based on elapsed time
+ self.tokens = min(self.max_tokens, self.tokens + elapsed * self.rate_per_second)
+ self.last_refill = now
+
+ wait_time = 0.0
+ if self.tokens < 1:
+ # Calculate how long to wait for one token
+ wait_time = (1 - self.tokens) / self.rate_per_second
+ self._wait_count += 1
+ self._total_wait_time += wait_time
+
+ # Release lock during sleep to allow other operations
+ self._lock.release()
+ try:
+ await asyncio.sleep(wait_time)
+ finally:
+ await self._lock.acquire()
+
+ # After sleeping, reset conservatively: the token earned during the
+ # wait is consumed by this request, and any refill claimed by other
+ # waiters while the lock was released is not double-counted
+ self.last_refill = time.monotonic()
+ self.tokens = 0
+ else:
+ self.tokens -= 1
+
+ return wait_time
+
+ @property
+ def stats(self) -> dict:
+ """Get rate limiter statistics."""
+ return {
+ "rate_limit_waits": self._wait_count,
+ "total_rate_limit_wait_time": self._total_wait_time,
+ "current_tokens": self.tokens,
+ }
+
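+# Worked example: at rate_per_minute=30 the bucket refills at 0.5 tokens/s and
+# holds a 30-token burst; once the burst is spent, each acquire() waits about
+# 2 seconds for the next token.
+#
+#     limiter = TokenBucketRateLimiter(rate_per_minute=30)
+#     waited = await limiter.acquire()  # 0.0 while burst capacity remains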
+
+@runtime_checkable
+class LLMClient(Protocol):
+ """
+ Protocol for LLM clients.
+
+ This protocol defines the interface that all LLM provider adapters must implement.
+ Using Protocol allows for structural subtyping (duck typing) while maintaining
+ type safety.
+ """
+
+ async def generate(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stream: bool = False,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse | AsyncIterator[str]:
+ """
+ Generate a response from the LLM.
+
+ Args:
+ messages: List of message dicts in OpenAI format [{"role": "...", "content": "..."}]
+ prompt: Simple string prompt (converted to single user message)
+ temperature: Sampling temperature (0.0 to 2.0)
+ max_tokens: Maximum tokens to generate
+ tools: List of tool definitions for function calling
+ stream: If True, returns AsyncIterator[str] for streaming
+ stop: Stop sequences
+ **kwargs: Provider-specific parameters
+
+ Returns:
+ LLMResponse if stream=False, AsyncIterator[str] if stream=True
+
+ Raises:
+ LLMClientError: Base exception for all client errors
+ """
+ ...
+
+
+class BaseLLMClient(ABC):
+ """
+ Abstract base class for LLM clients.
+
+ Provides common functionality and enforces the interface contract.
+ All concrete implementations should inherit from this class.
+ """
+
+ def __init__(
+ self,
+ api_key: str | None = None,
+ model: str = "default",
+ base_url: str | None = None,
+ timeout: float = 60.0,
+ max_retries: int = 3,
+ rate_limit_per_minute: int | None = None,
+ ):
+ """
+ Initialize the LLM client.
+
+ Args:
+ api_key: API key for authentication
+ model: Model identifier
+ base_url: Base URL for API requests
+ timeout: Request timeout in seconds
+ max_retries: Maximum number of retry attempts
+ rate_limit_per_minute: Rate limit (requests per minute), None to disable
+ """
+ self.api_key = api_key
+ self.model = model
+ self.base_url = base_url
+ self.timeout = timeout
+ self.max_retries = max_retries
+ self._request_count = 0
+ self._total_tokens_used = 0
+ self._rate_limited_requests = 0
+
+ # Initialize rate limiter if configured
+ if rate_limit_per_minute is not None and rate_limit_per_minute > 0:
+ self._rate_limiter: TokenBucketRateLimiter | None = TokenBucketRateLimiter(
+ rate_per_minute=rate_limit_per_minute
+ )
+ else:
+ self._rate_limiter = None
+
+ @abstractmethod
+ async def generate(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stream: bool = False,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse | AsyncIterator[str]:
+ """Generate a response from the LLM."""
+ pass
+
+ def _build_messages(
+ self,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ ) -> list[dict]:
+ """
+ Build message list from either messages or prompt.
+
+ Args:
+ messages: Pre-formatted message list
+ prompt: Simple string prompt
+
+ Returns:
+ List of message dicts
+
+ Raises:
+ ValueError: If neither messages nor prompt provided
+ """
+ if messages is not None:
+ return messages
+ elif prompt is not None:
+ return [{"role": "user", "content": prompt}]
+ else:
+ raise ValueError("Either 'messages' or 'prompt' must be provided")
+
+ def _update_stats(self, response: LLMResponse) -> None:
+ """Update internal statistics."""
+ self._request_count += 1
+ self._total_tokens_used += response.total_tokens
+
+ async def _apply_rate_limit(self) -> None:
+ """
+ Apply rate limiting if configured.
+
+ Waits if necessary to comply with rate limits.
+ Tracks rate-limited requests in metrics.
+ """
+ if self._rate_limiter is not None:
+ wait_time = await self._rate_limiter.acquire()
+ if wait_time > 0:
+ self._rate_limited_requests += 1
+
+ @property
+ def stats(self) -> dict:
+ """Get client statistics."""
+ base_stats = {
+ "request_count": self._request_count,
+ "total_tokens_used": self._total_tokens_used,
+ "rate_limited_requests": self._rate_limited_requests,
+ }
+
+ # Include rate limiter stats if available
+ if self._rate_limiter is not None:
+ base_stats.update(self._rate_limiter.stats)
+
+ return base_stats
+
+ async def close(self) -> None: # noqa: B027
+ """Clean up resources. Override in subclasses if needed."""
+ pass
+
+ async def __aenter__(self):
+ """Async context manager entry."""
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit."""
+ await self.close()
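+
+# Illustrative use of the context-manager support; SomeProviderClient stands
+# in for any concrete subclass (OpenAI, Anthropic, LM Studio adapters):
+#
+#     async with SomeProviderClient(model="...") as client:
+#         response = await client.generate(prompt="Hello")
+#         print(response.text, client.stats)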
diff --git a/src/adapters/llm/exceptions.py b/src/adapters/llm/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..528231d3107b0b650f558d8c71091b34ebe683a1
--- /dev/null
+++ b/src/adapters/llm/exceptions.py
@@ -0,0 +1,204 @@
+"""
+Custom exceptions for LLM client operations.
+
+Provides a hierarchy of structured exceptions for better error handling
+and debugging across different LLM providers.
+"""
+
+
+class LLMClientError(Exception):
+ """Base exception for all LLM client errors."""
+
+ def __init__(
+ self,
+ message: str,
+ provider: str = "unknown",
+ status_code: int | None = None,
+ retry_after: float | None = None,
+ ):
+ self.message = message
+ self.provider = provider
+ self.status_code = status_code
+ self.retry_after = retry_after
+ super().__init__(self.message)
+
+ def __str__(self) -> str:
+ parts = [f"[{self.provider}] {self.message}"]
+ if self.status_code:
+ parts.append(f"(status: {self.status_code})")
+ return " ".join(parts)
+
+
+class LLMAuthenticationError(LLMClientError):
+ """Authentication failed - invalid or missing API key."""
+
+ def __init__(self, provider: str, message: str = "Authentication failed"):
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=401,
+ )
+
+
+class LLMRateLimitError(LLMClientError):
+ """Rate limit exceeded - too many requests."""
+
+ def __init__(
+ self,
+ provider: str,
+ retry_after: float | None = None,
+ message: str = "Rate limit exceeded",
+ ):
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=429,
+ retry_after=retry_after,
+ )
+
+
+class LLMQuotaExceededError(LLMClientError):
+ """Quota or credits exhausted."""
+
+ def __init__(self, provider: str, message: str = "Quota exceeded"):
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=402,
+ )
+
+
+class LLMModelNotFoundError(LLMClientError):
+ """Requested model not available."""
+
+ def __init__(self, provider: str, model: str):
+ super().__init__(
+ message=f"Model '{model}' not found or not available",
+ provider=provider,
+ status_code=404,
+ )
+
+
+class LLMContextLengthError(LLMClientError):
+ """Input exceeds model's context window."""
+
+ def __init__(
+ self,
+ provider: str,
+ token_count: int | None = None,
+ max_tokens: int | None = None,
+ ):
+ message = "Context length exceeded"
+ if token_count and max_tokens:
+ message = f"Context length exceeded: {token_count} tokens provided, max is {max_tokens}"
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=400,
+ )
+
+
+class LLMInvalidRequestError(LLMClientError):
+ """Invalid request parameters."""
+
+ def __init__(self, provider: str, message: str = "Invalid request parameters"):
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=400,
+ )
+
+
+class LLMTimeoutError(LLMClientError):
+ """Request timed out."""
+
+ def __init__(self, provider: str, timeout: float):
+ super().__init__(
+ message=f"Request timed out after {timeout}s",
+ provider=provider,
+ status_code=408,
+ )
+
+
+class LLMConnectionError(LLMClientError):
+ """Failed to connect to the API endpoint."""
+
+ def __init__(self, provider: str, url: str | None = None):
+ message = "Failed to connect to API"
+ if url:
+ message = f"Failed to connect to {url}"
+ super().__init__(
+ message=message,
+ provider=provider,
+ )
+
+
+class LLMServerError(LLMClientError):
+ """Server-side error from the LLM provider."""
+
+ def __init__(
+ self,
+ provider: str,
+ status_code: int = 500,
+ message: str = "Server error",
+ ):
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=status_code,
+ )
+
+
+class LLMResponseParseError(LLMClientError):
+ """Failed to parse response from LLM provider."""
+
+ def __init__(self, provider: str, raw_response: str | None = None):
+ message = "Failed to parse response"
+ if raw_response:
+ preview = raw_response[:200] + "..." if len(raw_response) > 200 else raw_response
+ message = f"Failed to parse response: {preview}"
+ super().__init__(
+ message=message,
+ provider=provider,
+ )
+
+
+class LLMStreamError(LLMClientError):
+ """Error during streaming response."""
+
+ def __init__(self, provider: str, message: str = "Stream interrupted"):
+ super().__init__(
+ message=message,
+ provider=provider,
+ )
+
+
+class LLMContentFilterError(LLMClientError):
+ """Content blocked by safety filters."""
+
+ def __init__(self, provider: str, reason: str | None = None):
+ message = "Content blocked by safety filters"
+ if reason:
+ message = f"Content blocked: {reason}"
+ super().__init__(
+ message=message,
+ provider=provider,
+ status_code=400,
+ )
+
+
+class CircuitBreakerOpenError(LLMClientError):
+ """Circuit breaker is open, requests are being blocked."""
+
+ def __init__(
+ self,
+ provider: str,
+ failure_count: int,
+ reset_time: float,
+ ):
+ super().__init__(
+ message=f"Circuit breaker open after {failure_count} failures. Resets in {reset_time:.1f}s",
+ provider=provider,
+ )
+ self.failure_count = failure_count
+ self.reset_time = reset_time
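+
+# Illustrative handling built on this hierarchy (assumes an asyncio context,
+# a configured logger, and `client` as any LLMClient implementation):
+#
+#     try:
+#         response = await client.generate(prompt="...")
+#     except LLMRateLimitError as e:
+#         await asyncio.sleep(e.retry_after or 1.0)
+#     except LLMClientError as e:
+#         logger.error("LLM call failed: %s", e)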
diff --git a/src/adapters/llm/lmstudio_client.py b/src/adapters/llm/lmstudio_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..50091d1cb9a8f680c8a963226442a68102f2eb2e
--- /dev/null
+++ b/src/adapters/llm/lmstudio_client.py
@@ -0,0 +1,346 @@
+"""
+LM Studio local LLM client adapter.
+
+Implements the LLMClient protocol for LM Studio's OpenAI-compatible API.
+Designed for running local models with configurable endpoint.
+"""
+
+import json
+import logging
+from collections.abc import AsyncIterator
+from typing import Any
+
+import httpx
+
+from .base import BaseLLMClient, LLMResponse
+from .exceptions import (
+ LLMClientError,
+ LLMConnectionError,
+ LLMResponseParseError,
+ LLMServerError,
+ LLMStreamError,
+ LLMTimeoutError,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class LMStudioClient(BaseLLMClient):
+ """
+ LM Studio local server client.
+
+ LM Studio provides an OpenAI-compatible API for running local models.
+ This client is optimized for local deployment with:
+ - No authentication required (local)
+ - Configurable base URL
+ - No circuit breaker (local server expected to be stable)
+ - Longer timeouts for large models
+ """
+
+ PROVIDER_NAME = "lmstudio"
+ DEFAULT_BASE_URL = "http://localhost:1234/v1"
+ DEFAULT_MODEL = "local-model" # LM Studio uses the loaded model
+
+ def __init__(
+ self,
+ api_key: str | None = None, # Not required for local
+ model: str | None = None,
+ base_url: str | None = None,
+ timeout: float = 300.0, # Long timeout for local inference
+ max_retries: int = 2, # Fewer retries for local
+ # Rate limiting
+ rate_limit_per_minute: int | None = None,
+ ):
+ """
+ Initialize LM Studio client.
+
+ Args:
+ api_key: Not required for local server (ignored)
+ model: Model identifier (often ignored by LM Studio, uses loaded model)
+ base_url: Local server URL (default: http://localhost:1234/v1)
+ timeout: Request timeout in seconds (default longer for local models)
+ max_retries: Max retry attempts (fewer for local)
+ rate_limit_per_minute: Rate limit for requests per minute (None to disable)
+ """
+ import os
+
+ # Allow overriding via environment variable
+ base_url = base_url or os.environ.get("LMSTUDIO_BASE_URL", self.DEFAULT_BASE_URL)
+
+ super().__init__(
+ api_key=api_key or "not-required", # Placeholder
+ model=model or self.DEFAULT_MODEL,
+ base_url=base_url,
+ timeout=timeout,
+ max_retries=max_retries,
+ rate_limit_per_minute=rate_limit_per_minute,
+ )
+
+ self._client: httpx.AsyncClient | None = None
+
+ async def _get_client(self) -> httpx.AsyncClient:
+ """Get or create the HTTP client."""
+ if self._client is None or self._client.is_closed:
+ headers = {"Content-Type": "application/json"}
+
+ # Add auth header if provided (some local servers may require it)
+ if self.api_key and self.api_key != "not-required":
+ headers["Authorization"] = f"Bearer {self.api_key}"
+
+ self._client = httpx.AsyncClient(
+ base_url=self.base_url,
+ headers=headers,
+ timeout=httpx.Timeout(self.timeout),
+ )
+ return self._client
+
+ async def check_health(self) -> bool:
+ """
+ Check if LM Studio server is running.
+
+ Returns:
+ True if server is accessible, False otherwise
+ """
+ try:
+ client = await self._get_client()
+ response = await client.get("/models")
+ return response.status_code == 200
+ except Exception:
+ return False
+
+ async def list_models(self) -> list[dict]:
+ """
+ List available models on the LM Studio server.
+
+ Returns:
+ List of model information dicts
+ """
+ try:
+ client = await self._get_client()
+ response = await client.get("/models")
+ if response.status_code == 200:
+ data = response.json()
+ return data.get("data", [])
+ return []
+ except Exception as e:
+ logger.warning(f"Failed to list models: {e}")
+ return []
+
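+ # Illustrative startup check; assumes LM Studio on the default localhost
+ # port and OpenAI-style model entries carrying an "id" field:
+ #
+ #     client = LMStudioClient()
+ #     if not await client.check_health():
+ #         raise RuntimeError("LM Studio server is not reachable")
+ #     print([m.get("id") for m in await client.list_models()])
+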
+ def _handle_error_response(self, response: httpx.Response) -> None:
+ """Handle error responses from LM Studio server."""
+ status_code = response.status_code
+
+ try:
+ error_data = response.json()
+ error_message = error_data.get("error", {}).get("message", response.text)
+ except Exception:
+ error_message = response.text
+
+ if status_code >= 500:
+ raise LLMServerError(self.PROVIDER_NAME, status_code, error_message)
+ else:
+ raise LLMClientError(error_message, self.PROVIDER_NAME, status_code=status_code)
+
+ async def generate(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stream: bool = False,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse | AsyncIterator[str]:
+ """
+ Generate a response from LM Studio local model.
+
+ Args:
+ messages: Chat messages in OpenAI format
+ prompt: Simple string prompt
+ temperature: Sampling temperature
+ max_tokens: Maximum tokens to generate
+ tools: Tool definitions (limited support in local models)
+ stream: If True, returns AsyncIterator
+ stop: Stop sequences
+ **kwargs: Additional parameters
+
+ Returns:
+ LLMResponse or AsyncIterator[str] for streaming
+ """
+ # Apply rate limiting before proceeding
+ await self._apply_rate_limit()
+
+ if stream:
+ return await self._generate_stream(  # await yields the AsyncIterator itself
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+ else:
+ return await self._generate_non_stream(
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+
+ async def _generate_non_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse:
+ """Non-streaming generation."""
+ client = await self._get_client()
+
+ # Build request payload (OpenAI-compatible)
+ payload = {
+ "model": self.model,
+ "messages": self._build_messages(messages, prompt),
+ "temperature": temperature,
+ }
+
+ if max_tokens is not None:
+ payload["max_tokens"] = max_tokens
+ if stop:
+ payload["stop"] = stop
+
+ # Note: most local models don't support tools well
+ if tools:
+ logger.warning("Tool calling may not be fully supported by local models")
+ payload["tools"] = tools
+
+ # Add additional kwargs (e.g., top_p, repeat_penalty)
+ for key in ["top_p", "top_k", "repeat_penalty", "presence_penalty", "frequency_penalty"]:
+ if key in kwargs:
+ payload[key] = kwargs[key]
+
+ # Retry logic for local server
+ last_error = None
+ for attempt in range(self.max_retries):
+ try:
+ response = await client.post("/chat/completions", json=payload)
+
+ if response.status_code != 200:
+ self._handle_error_response(response)
+
+ # Parse response
+ try:
+ data = response.json()
+ choice = data["choices"][0]
+ message = choice["message"]
+
+ usage = data.get("usage", {})
+ finish_reason = choice.get("finish_reason", "stop")
+
+ llm_response = LLMResponse(
+ text=message.get("content", ""),
+ usage=usage,
+ model=data.get("model", self.model),
+ raw_response=data,
+ finish_reason=finish_reason,
+ )
+
+ self._update_stats(llm_response)
+ return llm_response
+
+ except (KeyError, json.JSONDecodeError) as e:
+ raise LLMResponseParseError(self.PROVIDER_NAME, response.text) from e
+
+ except httpx.TimeoutException:
+ last_error = LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ logger.warning(f"Attempt {attempt + 1} timed out, retrying...")
+ except httpx.ConnectError:
+ last_error = LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+ logger.warning(f"Attempt {attempt + 1} connection failed, retrying...")
+ except LLMClientError:
+ raise # Don't retry structured API errors (including 5xx); only transport failures are retried
+
+ # All retries exhausted
+ if last_error:
+ raise last_error
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+
+ async def _generate_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None, # noqa: ARG002
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> AsyncIterator[str]:
+ """Streaming generation."""
+ client = await self._get_client()
+
+ # Build request payload
+ payload = {
+ "model": self.model,
+ "messages": self._build_messages(messages, prompt),
+ "temperature": temperature,
+ "stream": True,
+ }
+
+ if max_tokens is not None:
+ payload["max_tokens"] = max_tokens
+ if stop:
+ payload["stop"] = stop
+
+ for key in ["top_p", "top_k", "repeat_penalty"]:
+ if key in kwargs:
+ payload[key] = kwargs[key]
+
+ async def stream_generator():
+ try:
+ async with client.stream("POST", "/chat/completions", json=payload) as response:
+ if response.status_code != 200:
+ await response.aread()
+ self._handle_error_response(response)
+
+ async for line in response.aiter_lines():
+ if line.startswith("data: "):
+ data_str = line[6:]
+ if data_str.strip() == "[DONE]":
+ break
+
+ try:
+ data = json.loads(data_str)
+ delta = data["choices"][0].get("delta", {})
+ content = delta.get("content", "")
+ if content:
+ yield content
+ except (json.JSONDecodeError, KeyError):
+ continue
+
+ except httpx.TimeoutException:
+ raise LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ except httpx.ConnectError:
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+ except Exception as e:
+ if isinstance(e, LLMClientError):
+ raise
+ raise LLMStreamError(self.PROVIDER_NAME, str(e)) from e
+
+ return stream_generator()
+
+ async def close(self) -> None:
+ """Close the HTTP client."""
+ if self._client and not self._client.is_closed:
+ await self._client.aclose()
+ self._client = None
diff --git a/src/adapters/llm/openai_client.py b/src/adapters/llm/openai_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..4099510ef5f496bbc8a76ab40d70092b071c5995
--- /dev/null
+++ b/src/adapters/llm/openai_client.py
@@ -0,0 +1,458 @@
+"""
+OpenAI-compatible LLM client adapter.
+
+Implements the LLMClient protocol for OpenAI API (and compatible APIs).
+Includes retry logic, circuit breaker pattern, and streaming support.
+"""
+
+import json
+import logging
+import time
+from collections.abc import AsyncIterator
+from typing import Any
+
+import httpx
+from tenacity import (
+ before_sleep_log,
+ retry,
+ retry_if_exception_type,
+ stop_after_attempt,
+ wait_exponential,
+)
+
+from .base import BaseLLMClient, LLMResponse, LLMToolResponse, ToolCall
+from .exceptions import (
+ CircuitBreakerOpenError,
+ LLMAuthenticationError,
+ LLMClientError,
+ LLMConnectionError,
+ LLMContextLengthError,
+ LLMInvalidRequestError,
+ LLMModelNotFoundError,
+ LLMQuotaExceededError,
+ LLMRateLimitError,
+ LLMResponseParseError,
+ LLMServerError,
+ LLMStreamError,
+ LLMTimeoutError,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class CircuitBreaker:
+ """Simple circuit breaker implementation for resilience."""
+
+ def __init__(
+ self,
+ failure_threshold: int = 5,
+ reset_timeout: float = 60.0,
+ half_open_max_calls: int = 1,
+ ):
+ self.failure_threshold = failure_threshold
+ self.reset_timeout = reset_timeout
+ self.half_open_max_calls = half_open_max_calls
+ self.failure_count = 0
+ self.last_failure_time = 0.0
+ self.state = "closed" # closed, open, half-open
+ self.half_open_calls = 0
+
+ def can_execute(self) -> bool:
+ """Check if request can be executed."""
+ if self.state == "closed":
+ return True
+
+ if self.state == "open":
+ # Check if reset timeout has passed
+ if time.time() - self.last_failure_time >= self.reset_timeout:
+ self.state = "half-open"
+ self.half_open_calls = 0
+ return True
+ return False
+
+ if self.state == "half-open":
+ return self.half_open_calls < self.half_open_max_calls
+
+ return False
+
+ def record_success(self) -> None:
+ """Record successful request."""
+ if self.state == "half-open":
+ self.state = "closed"
+ self.failure_count = 0
+ elif self.state == "closed":
+ self.failure_count = 0
+
+ def record_failure(self) -> None:
+ """Record failed request."""
+ self.failure_count += 1
+ self.last_failure_time = time.time()
+
+ if self.state == "half-open" or self.failure_count >= self.failure_threshold:
+ self.state = "open"
+
+ def get_reset_time(self) -> float:
+ """Get time until circuit resets."""
+ if self.state != "open":
+ return 0.0
+ elapsed = time.time() - self.last_failure_time
+ return max(0, self.reset_timeout - elapsed)
+
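+# State walk-through under the defaults (threshold=5, reset=60s): five
+# consecutive record_failure() calls open the circuit; can_execute() then
+# returns False for 60 seconds, after which a single half-open probe is
+# allowed, and a record_success() closes the circuit again.
+#
+#     cb = CircuitBreaker(failure_threshold=5, reset_timeout=60.0)
+#     if cb.can_execute():
+#         try:
+#             ...  # make the request
+#             cb.record_success()
+#         except Exception:
+#             cb.record_failure()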
+
+class OpenAIClient(BaseLLMClient):
+ """
+ OpenAI API client with retry logic and circuit breaker.
+
+ Features:
+ - Exponential backoff retry for transient errors
+ - Circuit breaker to prevent cascading failures
+ - Streaming support
+ - Structured error handling
+ - Tool/function calling support
+ """
+
+ PROVIDER_NAME = "openai"
+ DEFAULT_BASE_URL = "https://api.openai.com/v1"
+ DEFAULT_MODEL = "gpt-4-turbo-preview"
+
+ def __init__(
+ self,
+ api_key: str | None = None,
+ model: str | None = None,
+ base_url: str | None = None,
+ timeout: float = 60.0,
+ max_retries: int = 3,
+ organization: str | None = None,
+ # Circuit breaker settings
+ circuit_breaker_threshold: int = 5,
+ circuit_breaker_reset: float = 60.0,
+ # Rate limiting
+ rate_limit_per_minute: int | None = None,
+ ):
+ """
+ Initialize OpenAI client.
+
+ Args:
+ api_key: OpenAI API key (or set OPENAI_API_KEY env var)
+ model: Model to use (default: gpt-4-turbo-preview)
+ base_url: API base URL (default: https://api.openai.com/v1)
+ timeout: Request timeout in seconds
+ max_retries: Max retry attempts for transient errors
+ organization: Optional organization ID
+ circuit_breaker_threshold: Failures before circuit opens
+ circuit_breaker_reset: Seconds before circuit resets
+ rate_limit_per_minute: Rate limit for requests per minute (None to disable)
+ """
+ import os
+
+ api_key = api_key or os.environ.get("OPENAI_API_KEY")
+ if not api_key:
+ raise LLMAuthenticationError(self.PROVIDER_NAME, "API key not provided and OPENAI_API_KEY not set")
+
+ super().__init__(
+ api_key=api_key,
+ model=model or self.DEFAULT_MODEL,
+ base_url=base_url or self.DEFAULT_BASE_URL,
+ timeout=timeout,
+ max_retries=max_retries,
+ rate_limit_per_minute=rate_limit_per_minute,
+ )
+
+ self.organization = organization
+ self.circuit_breaker = CircuitBreaker(
+ failure_threshold=circuit_breaker_threshold,
+ reset_timeout=circuit_breaker_reset,
+ )
+
+ # Initialize async HTTP client
+ self._client: httpx.AsyncClient | None = None
+
+ async def _get_client(self) -> httpx.AsyncClient:
+ """Get or create the HTTP client."""
+ if self._client is None or self._client.is_closed:
+ headers = {
+ "Authorization": f"Bearer {self.api_key}",
+ "Content-Type": "application/json",
+ }
+ if self.organization:
+ headers["OpenAI-Organization"] = self.organization
+
+ self._client = httpx.AsyncClient(
+ base_url=self.base_url,
+ headers=headers,
+ timeout=httpx.Timeout(self.timeout),
+ )
+ return self._client
+
+ def _handle_error_response(self, response: httpx.Response) -> None:
+ """Convert HTTP error responses to appropriate exceptions."""
+ status_code = response.status_code
+
+ try:
+ error_data = response.json()
+ error_message = error_data.get("error", {}).get("message", response.text)
+ except Exception:
+ error_message = response.text
+
+ if status_code == 401:
+ raise LLMAuthenticationError(self.PROVIDER_NAME, error_message)
+ elif status_code == 429:
+ retry_after = response.headers.get("Retry-After")
+ retry_after_float = float(retry_after) if retry_after else None
+ raise LLMRateLimitError(self.PROVIDER_NAME, retry_after=retry_after_float, message=error_message)
+ elif status_code == 402:
+ raise LLMQuotaExceededError(self.PROVIDER_NAME, error_message)
+ elif status_code == 404:
+ raise LLMModelNotFoundError(self.PROVIDER_NAME, self.model)
+ elif status_code == 400:
+ if "context_length" in error_message.lower():
+ raise LLMContextLengthError(self.PROVIDER_NAME)
+ raise LLMInvalidRequestError(self.PROVIDER_NAME, error_message)
+ elif status_code >= 500:
+ raise LLMServerError(self.PROVIDER_NAME, status_code, error_message)
+ else:
+ raise LLMClientError(error_message, self.PROVIDER_NAME, status_code=status_code)
+
+ def _make_retry_decorator(self):
+ """Create retry decorator with exponential backoff."""
+ return retry(
+ stop=stop_after_attempt(self.max_retries),
+ wait=wait_exponential(multiplier=1, min=1, max=60),
+ retry=retry_if_exception_type((LLMRateLimitError, LLMServerError, LLMConnectionError)),
+ before_sleep=before_sleep_log(logger, logging.WARNING),
+ reraise=True,
+ )
+
+ async def generate(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stream: bool = False,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse | AsyncIterator[str]:
+ """
+ Generate a response from OpenAI.
+
+ Args:
+ messages: Chat messages in OpenAI format
+ prompt: Simple string prompt
+ temperature: Sampling temperature (0.0 to 2.0)
+ max_tokens: Maximum tokens to generate
+ tools: Tool definitions for function calling
+ stream: If True, returns AsyncIterator
+ stop: Stop sequences
+ **kwargs: Additional OpenAI parameters (top_p, presence_penalty, etc.)
+
+ Returns:
+ LLMResponse or AsyncIterator[str] for streaming
+ """
+ # Apply rate limiting before proceeding
+ await self._apply_rate_limit()
+
+ # Check circuit breaker
+ if not self.circuit_breaker.can_execute():
+ raise CircuitBreakerOpenError(
+ self.PROVIDER_NAME,
+ self.circuit_breaker.failure_count,
+ self.circuit_breaker.get_reset_time(),
+ )
+
+ if stream:
+ return await self._generate_stream(  # await yields the AsyncIterator itself
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+ else:
+ return await self._generate_non_stream(
+ messages=messages,
+ prompt=prompt,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ tools=tools,
+ stop=stop,
+ **kwargs,
+ )
+
+ async def _generate_non_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse:
+ """Non-streaming generation with retry logic."""
+
+ @self._make_retry_decorator()
+ async def _request():
+ client = await self._get_client()
+
+ # Build request payload
+ payload = {
+ "model": self.model,
+ "messages": self._build_messages(messages, prompt),
+ "temperature": temperature,
+ }
+
+ if max_tokens is not None:
+ payload["max_tokens"] = max_tokens
+ if stop:
+ payload["stop"] = stop
+ if tools:
+ payload["tools"] = tools
+ payload["tool_choice"] = kwargs.pop("tool_choice", "auto")
+
+ # Add any additional kwargs
+ payload.update(kwargs)
+
+ try:
+ response = await client.post("/chat/completions", json=payload)
+ except httpx.TimeoutException:
+ raise LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ except httpx.ConnectError:
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+
+ if response.status_code != 200:
+ self._handle_error_response(response)
+
+ return response
+
+ try:
+ response = await _request()
+ self.circuit_breaker.record_success()
+ except Exception:
+ self.circuit_breaker.record_failure()
+ raise
+
+ # Parse response
+ try:
+ data = response.json()
+ choice = data["choices"][0]
+ message = choice["message"]
+
+ usage = data.get("usage", {})
+ finish_reason = choice.get("finish_reason", "stop")
+
+ # Check for tool calls
+ if "tool_calls" in message:
+ tool_calls = [
+ ToolCall(
+ id=tc["id"],
+ name=tc["function"]["name"],
+ arguments=json.loads(tc["function"]["arguments"]),
+ )
+ for tc in message["tool_calls"]
+ ]
+ llm_response = LLMToolResponse(
+ text=message.get("content", ""),
+ usage=usage,
+ model=data.get("model", self.model),
+ raw_response=data,
+ finish_reason=finish_reason,
+ tool_calls=tool_calls,
+ )
+ else:
+ llm_response = LLMResponse(
+ text=message.get("content", ""),
+ usage=usage,
+ model=data.get("model", self.model),
+ raw_response=data,
+ finish_reason=finish_reason,
+ )
+
+ self._update_stats(llm_response)
+ return llm_response
+
+ except (KeyError, json.JSONDecodeError) as e:
+ raise LLMResponseParseError(self.PROVIDER_NAME, response.text) from e
+
+ async def _generate_stream(
+ self,
+ *,
+ messages: list[dict] | None = None,
+ prompt: str | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ tools: list[dict] | None = None,
+ stop: list[str] | None = None,
+ **kwargs: Any,
+ ) -> AsyncIterator[str]:
+ """Streaming generation."""
+
+ client = await self._get_client()
+
+ # Build request payload
+ payload = {
+ "model": self.model,
+ "messages": self._build_messages(messages, prompt),
+ "temperature": temperature,
+ "stream": True,
+ }
+
+ if max_tokens is not None:
+ payload["max_tokens"] = max_tokens
+ if stop:
+ payload["stop"] = stop
+ # Note: tools with streaming have limited support
+ if tools:
+ payload["tools"] = tools
+
+ payload.update(kwargs)
+
+ async def stream_generator():
+ try:
+ async with client.stream("POST", "/chat/completions", json=payload) as response:
+ if response.status_code != 200:
+ # Read the full response for error handling
+ await response.aread()
+ self._handle_error_response(response)
+
+ async for line in response.aiter_lines():
+ if line.startswith("data: "):
+ data_str = line[6:]
+ if data_str.strip() == "[DONE]":
+ break
+
+ try:
+ data = json.loads(data_str)
+ delta = data["choices"][0].get("delta", {})
+ content = delta.get("content", "")
+ if content:
+ yield content
+ except (json.JSONDecodeError, KeyError):
+ continue
+
+ self.circuit_breaker.record_success()
+
+ except httpx.TimeoutException:
+ self.circuit_breaker.record_failure()
+ raise LLMTimeoutError(self.PROVIDER_NAME, self.timeout)
+ except httpx.ConnectError:
+ self.circuit_breaker.record_failure()
+ raise LLMConnectionError(self.PROVIDER_NAME, self.base_url)
+ except Exception as e:
+ self.circuit_breaker.record_failure()
+ if isinstance(e, LLMClientError):
+ raise
+ raise LLMStreamError(self.PROVIDER_NAME, str(e)) from e
+
+ return stream_generator()
+
+ async def close(self) -> None:
+ """Close the HTTP client."""
+ if self._client and not self._client.is_closed:
+ await self._client.aclose()
+ self._client = None
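+
+# Illustrative tool-calling round trip, given a configured OpenAIClient
+# `client`; the get_weather tool is hypothetical:
+#
+#     tools = [{"type": "function", "function": {
+#         "name": "get_weather",
+#         "description": "Look up current weather",
+#         "parameters": {"type": "object",
+#                        "properties": {"city": {"type": "string"}}},
+#     }}]
+#     response = await client.generate(prompt="Weather in Paris?", tools=tools)
+#     if isinstance(response, LLMToolResponse):
+#         for call in response.tool_calls:
+#             print(call.name, call.arguments)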
diff --git a/src/agents/__init__.py b/src/agents/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/agents/hrm_agent.py b/src/agents/hrm_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd3bc482f901ca777de39904d1f4c9e4af612f7c
--- /dev/null
+++ b/src/agents/hrm_agent.py
@@ -0,0 +1,454 @@
+"""
+Hierarchical Reasoning Model (HRM) Agent.
+
+Implements the HRM architecture with:
+- H-Module: High-level planning and decomposition
+- L-Module: Low-level execution and refinement
+- Adaptive Computation Time (ACT) for dynamic depth
+- Halting mechanism based on confidence thresholds
+
+Based on: "Hierarchical Reasoning for Compositional Generalization"
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..training.system_config import HRMConfig
+
+
+@dataclass
+class SubProblem:
+ """Represents a decomposed subproblem in the hierarchy."""
+
+ level: int # Hierarchy level (0 = root, higher = more abstract)
+ description: str # Natural language description
+ state: torch.Tensor # Latent state representation
+ parent_id: int | None = None # Parent subproblem ID
+ confidence: float = 0.0 # Confidence in this decomposition
+
+
+@dataclass
+class HRMOutput:
+ """Output from HRM processing."""
+
+ final_state: torch.Tensor # Final processed state
+ subproblems: list[SubProblem] # Hierarchical decomposition
+ halt_step: int # Step at which halting occurred
+ total_ponder_cost: float # Total computation cost (for training)
+ convergence_path: list[float] # Confidence at each step
+
+
+class AdaptiveComputationTime(nn.Module):
+ """
+ Adaptive Computation Time (ACT) mechanism for dynamic depth.
+
+ Allows the model to "ponder" longer on difficult problems by
+ dynamically adjusting the number of processing steps.
+ """
+
+ def __init__(self, hidden_dim: int, epsilon: float = 0.01):
+ super().__init__()
+ self.epsilon = epsilon
+
+ # Halting unit: predicts probability of halting
+ self.halt_fc = nn.Sequential(
+ nn.Linear(hidden_dim, hidden_dim // 2),
+ nn.ReLU(),
+ nn.Linear(hidden_dim // 2, 1),
+ nn.Sigmoid(),
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, float]:
+ """
+ Compute halting probabilities.
+
+ Args:
+ hidden_states: [batch, seq, hidden_dim]
+
+ Returns:
+ halt_probs: [batch, seq] probability of halting
+ ponder_cost: Scalar cost for training
+ """
+ # halt_fc ends in a Sigmoid, so its output is already a probability
+ halt_probs = self.halt_fc(hidden_states).squeeze(-1) # [batch, seq]
+
+ # Ponder cost is the expected number of steps
+ ponder_cost = halt_probs.sum(dim=-1).mean()
+
+ return halt_probs, ponder_cost
+
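+# Shape sketch: for input [batch=2, seq=5, hidden_dim=64], halt_probs is
+# [2, 5] with values in (0, 1), and ponder_cost is a scalar tensor that can
+# be added to the training loss.
+#
+#     act = AdaptiveComputationTime(hidden_dim=64)
+#     halt_probs, ponder_cost = act(torch.randn(2, 5, 64))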
+
+class HModule(nn.Module):
+ """
+ H-Module: High-level planning and abstract reasoning.
+
+ Responsible for:
+ - Decomposing problems into subproblems
+ - Abstract planning and strategy
+ - Coordinating L-module executions
+ """
+
+ def __init__(self, config: HRMConfig):
+ super().__init__()
+ self.config = config
+
+ # Multi-head self-attention for relational reasoning
+ self.attention = nn.MultiheadAttention(
+ embed_dim=config.h_dim,
+ num_heads=8,
+ dropout=config.dropout,
+ batch_first=True,
+ )
+
+ # Feed-forward network
+ self.ffn = nn.Sequential(
+ nn.Linear(config.h_dim, config.h_dim * 4),
+ nn.GELU(),
+ nn.Dropout(config.dropout),
+ nn.Linear(config.h_dim * 4, config.h_dim),
+ nn.Dropout(config.dropout),
+ )
+
+ # Layer normalization
+ self.norm1 = nn.LayerNorm(config.h_dim)
+ self.norm2 = nn.LayerNorm(config.h_dim)
+
+ # Decomposition head: outputs subproblem structure
+ self.decompose_head = nn.Sequential(
+ nn.Linear(config.h_dim, config.h_dim),
+ nn.ReLU(),
+ nn.Linear(config.h_dim, config.h_dim),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Process input through high-level reasoning.
+
+ Args:
+ x: [batch, seq, h_dim] input tensor
+
+ Returns:
+ [batch, seq, h_dim] processed tensor
+ """
+ # Self-attention for relational reasoning
+ attn_out, _ = self.attention(x, x, x)
+ x = self.norm1(x + attn_out)
+
+ # Feed-forward processing
+ ffn_out = self.ffn(x)
+ x = self.norm2(x + ffn_out)
+
+ return x
+
+ def decompose(self, x: torch.Tensor) -> torch.Tensor:
+ """Generate subproblem representations."""
+ return self.decompose_head(x)
+
+
+class LModule(nn.Module):
+ """
+ L-Module: Low-level execution and concrete operations.
+
+ Responsible for:
+ - Executing concrete operations
+ - Processing individual subproblems
+ - Generating intermediate results
+ """
+
+ def __init__(self, config: HRMConfig):
+ super().__init__()
+ self.config = config
+
+ # Projection from H-module to L-module dimension
+ self.h_to_l = nn.Linear(config.h_dim, config.l_dim)
+
+ # GRU for sequential processing
+ self.gru = nn.GRU(
+ input_size=config.l_dim,
+ hidden_size=config.l_dim,
+ num_layers=config.num_l_layers,
+ dropout=config.dropout if config.num_l_layers > 1 else 0,
+ batch_first=True,
+ )
+
+ # Output projection
+ self.output_proj = nn.Sequential(
+ nn.Linear(config.l_dim, config.l_dim * 2),
+ nn.ReLU(),
+ nn.Dropout(config.dropout),
+ nn.Linear(config.l_dim * 2, config.l_dim),
+ )
+
+ # Back-projection to H-module dimension
+ self.l_to_h = nn.Linear(config.l_dim, config.h_dim)
+
+ def forward(self, x: torch.Tensor, h_context: torch.Tensor | None = None) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Execute low-level processing.
+
+ Args:
+ x: [batch, seq, h_dim] input from H-module
+ h_context: Optional hidden state
+
+ Returns:
+ output: [batch, seq, l_dim] processed output
+ l_to_h: [batch, seq, h_dim] back-projection to H-module
+ """
+ # Project to L-module dimension
+ x_l = self.h_to_l(x)
+
+ # Sequential processing
+ gru_out, _ = self.gru(x_l, h_context)
+
+ # Output processing
+ output = self.output_proj(gru_out)
+
+ # Back-project to H-module dimension for feedback
+ feedback = self.l_to_h(output)
+
+ return output, feedback
+
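+# Shape sketch (illustrative only): with h_dim=128 and l_dim=64, LModule maps
+# an input of shape [2, 16, 128] to an output of shape [2, 16, 64] plus a
+# feedback tensor of shape [2, 16, 128] for re-integration by the H-module.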
+
+class HRMAgent(nn.Module):
+ """
+ Complete Hierarchical Reasoning Model agent.
+
+ Combines H-module and L-module with ACT for adaptive computation.
+ """
+
+ def __init__(self, config: HRMConfig, device: str = "cpu"):
+ super().__init__()
+ self.config = config
+ self.device = device
+
+ # Input embedding
+ self.input_proj = nn.Linear(config.h_dim, config.h_dim)
+
+ # Core modules
+ self.h_module = nn.ModuleList([HModule(config) for _ in range(config.num_h_layers)])
+
+ self.l_module = LModule(config)
+
+ # Adaptive computation time
+ self.act = AdaptiveComputationTime(config.h_dim, config.ponder_epsilon)
+
+ # State integration
+ self.integrate = nn.Sequential(
+ nn.Linear(config.h_dim * 2, config.h_dim),
+ nn.LayerNorm(config.h_dim),
+ nn.GELU(),
+ )
+
+ self.to(device)
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ max_steps: int | None = None,
+ return_decomposition: bool = False,
+ ) -> HRMOutput:
+ """
+ Process input through hierarchical reasoning.
+
+ Args:
+ x: [batch, seq, h_dim] input tensor
+ max_steps: Maximum outer loop steps (defaults to config)
+ return_decomposition: Whether to return subproblem decomposition
+
+ Returns:
+ HRMOutput containing final state and optional decomposition
+ """
+ batch_size, seq_len, _ = x.shape
+ max_steps = max_steps or self.config.max_outer_steps
+
+ # Initial projection
+ h_state = self.input_proj(x)
+
+ # Tracking
+ subproblems = []
+ convergence_path = []
+ total_ponder_cost = 0.0
+
+ # Outer loop: iterative refinement
+ for step in range(max_steps):
+ # H-module: high-level planning
+ for h_layer in self.h_module:
+ h_state = h_layer(h_state)
+
+ # Check halting condition
+ halt_probs, ponder_cost = self.act(h_state)
+ total_ponder_cost += ponder_cost
+
+ # Average halting probability across sequence
+ avg_halt_prob = halt_probs.mean().item()
+ convergence_path.append(avg_halt_prob)
+
+ # Generate subproblem decomposition if requested
+ if return_decomposition:
+ subproblem_repr = self.h_module[0].decompose(h_state)
+ # Create subproblem entries (simplified)
+ for i in range(min(3, seq_len)): # First 3 sequence positions (simplified decomposition)
+ subproblems.append(
+ SubProblem(
+ level=step,
+ description=f"Subproblem at step {step}, position {i}",
+ state=subproblem_repr[:, i, :].detach(),
+ confidence=halt_probs[:, i].mean().item(),
+ )
+ )
+
+ # Halt if confident enough
+ if avg_halt_prob >= self.config.halt_threshold:
+ break
+
+ # L-module: low-level execution
+ l_output, l_feedback = self.l_module(h_state)
+
+ # Integrate L-module feedback
+ h_state = self.integrate(torch.cat([h_state, l_feedback], dim=-1))
+
+ return HRMOutput(
+ final_state=h_state,
+ subproblems=subproblems,
+ halt_step=step + 1,
+ total_ponder_cost=total_ponder_cost,
+ convergence_path=convergence_path,
+ )
+
+ async def decompose_problem(self, query: str, state: torch.Tensor) -> list[SubProblem]:
+ """
+ Decompose a problem into hierarchical subproblems.
+
+ Args:
+ query: Natural language problem description
+ state: Initial state representation
+
+ Returns:
+ List of subproblems in hierarchical order
+ """
+ # Ensure state is batched
+ if state.dim() == 2:
+ state = state.unsqueeze(0) # [1, seq, dim]
+
+ # Forward pass with decomposition
+ output = self.forward(state, return_decomposition=True)
+
+ # Add query context to subproblems
+ for i, sp in enumerate(output.subproblems):
+ sp.description = f"{query} -> Level {sp.level} Subproblem {i}"
+
+ return output.subproblems
+
+ def get_parameter_count(self) -> int:
+ """Return total number of trainable parameters."""
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+
+# Training utilities
+class HRMLoss(nn.Module):
+ """
+ Combined loss for HRM training.
+
+ Includes:
+ - Task loss (e.g., cross-entropy for classification)
+ - Ponder cost regularization (encourages efficiency)
+ - Consistency loss (encourages stable convergence)
+ """
+
+ def __init__(
+ self,
+ task_weight: float = 1.0,
+ ponder_weight: float = 0.01,
+ consistency_weight: float = 0.1,
+ ):
+ super().__init__()
+ self.task_weight = task_weight
+ self.ponder_weight = ponder_weight
+ self.consistency_weight = consistency_weight
+
+ def forward(
+ self,
+ hrm_output: HRMOutput,
+ predictions: torch.Tensor,
+ targets: torch.Tensor,
+ task_loss_fn: nn.Module,
+ ) -> tuple[torch.Tensor, dict]:
+ """
+ Compute combined loss.
+
+ Args:
+ hrm_output: Output from HRM forward pass
+ predictions: Model predictions
+ targets: Ground truth targets
+ task_loss_fn: Loss function for the task
+
+ Returns:
+ total_loss: Combined loss
+ loss_dict: Dictionary of individual loss components
+ """
+ # Task loss
+ task_loss = task_loss_fn(predictions, targets)
+
+ # Ponder cost (encourages efficiency)
+ ponder_loss = hrm_output.total_ponder_cost
+
+ # Consistency loss (encourages monotonic convergence)
+ if len(hrm_output.convergence_path) > 1:
+ conv_tensor = torch.tensor(hrm_output.convergence_path)
+ # Penalize steps where the halting probability decreases
+ # (i.e. non-monotonic convergence)
+ diffs = conv_tensor[1:] - conv_tensor[:-1]
+ consistency_loss = F.relu(-diffs).mean()
+ else:
+ consistency_loss = torch.tensor(0.0)
+
+ # Combine losses
+ total_loss = (
+ self.task_weight * task_loss + self.ponder_weight * ponder_loss + self.consistency_weight * consistency_loss
+ )
+
+ loss_dict = {
+ "total": total_loss.item(),
+ "task": task_loss.item(),
+ "ponder": float(ponder_loss), # accumulated as a tensor in forward(), logged as a float
+ "consistency": consistency_loss.item(),
+ "halt_step": hrm_output.halt_step,
+ }
+
+ return total_loss, loss_dict
+
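+# --- Loss usage sketch (illustrative only; `agent`, `x`, `classifier`, and
+# `targets` are hypothetical task-specific objects, not defined here) ---
+#
+# criterion = HRMLoss(task_weight=1.0, ponder_weight=0.01, consistency_weight=0.1)
+# out = agent(x)                                    # HRMOutput
+# logits = classifier(out.final_state.mean(dim=1))  # pool over the sequence
+# loss, parts = criterion(out, logits, targets, nn.CrossEntropyLoss())
+# loss.backward()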
+
+def create_hrm_agent(config: HRMConfig, device: str = "cpu") -> HRMAgent:
+ """
+ Factory function to create and initialize HRM agent.
+
+ Args:
+ config: HRM configuration
+ device: Device to place model on
+
+ Returns:
+ Initialized HRMAgent
+ """
+ agent = HRMAgent(config, device)
+
+ # Initialize weights
+ def init_weights(m):
+ if isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ elif isinstance(m, nn.GRU):
+ for name, param in m.named_parameters():
+ if "weight" in name:
+ nn.init.orthogonal_(param)
+ elif "bias" in name:
+ nn.init.zeros_(param)
+
+ agent.apply(init_weights)
+
+ return agent
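+
+
+# --- Usage sketch (illustrative only) ---
+# Assumes HRMConfig exposes the fields referenced in this module (h_dim,
+# l_dim, num_h_layers, num_l_layers, max_outer_steps, halt_threshold,
+# ponder_epsilon, dropout); the constructor call below is an assumption.
+#
+# config = HRMConfig(h_dim=128, l_dim=64, num_h_layers=2, num_l_layers=1)
+# agent = create_hrm_agent(config, device="cpu")
+# x = torch.randn(2, 16, config.h_dim)  # [batch=2, seq=16, h_dim]
+# out = agent(x, return_decomposition=True)
+# print(out.halt_step, len(out.subproblems), out.convergence_path)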
diff --git a/src/agents/meta_controller/__init__.py b/src/agents/meta_controller/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..88b175d96045c7dc2693833a3b09261536bd2a8c
--- /dev/null
+++ b/src/agents/meta_controller/__init__.py
@@ -0,0 +1,45 @@
+"""
+Neural Meta-Controller package for Multi-Agent MCTS Framework.
+
+This package provides the base infrastructure for neural network-based
+meta-controllers that dynamically select which agent to route queries to.
+"""
+
+from src.agents.meta_controller.base import (
+ AbstractMetaController,
+ MetaControllerFeatures,
+ MetaControllerPrediction,
+)
+from src.agents.meta_controller.rnn_controller import (
+ RNNMetaController,
+ RNNMetaControllerModel,
+)
+from src.agents.meta_controller.utils import (
+ features_to_tensor,
+ features_to_text,
+ normalize_features,
+ one_hot_encode_agent,
+)
+
+# Import BERT controller (may not be available if transformers/peft not installed)
+try:
+ from src.agents.meta_controller.bert_controller import BERTMetaController # noqa: F401
+
+ _bert_available = True
+except ImportError:
+ _bert_available = False
+
+__all__ = [
+ "AbstractMetaController",
+ "MetaControllerFeatures",
+ "MetaControllerPrediction",
+ "normalize_features",
+ "one_hot_encode_agent",
+ "features_to_tensor",
+ "features_to_text",
+ "RNNMetaController",
+ "RNNMetaControllerModel",
+]
+
+if _bert_available:
+ __all__.append("BERTMetaController")
diff --git a/src/agents/meta_controller/base.py b/src/agents/meta_controller/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3e35093c3a428a74724227f91e8eaac147fa029
--- /dev/null
+++ b/src/agents/meta_controller/base.py
@@ -0,0 +1,219 @@
+"""
+Abstract base class for Neural Meta-Controllers.
+
+Provides the foundation for neural network-based meta-controllers that
+dynamically select which agent (HRM, TRM, or MCTS) should handle a query.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class MetaControllerFeatures:
+ """
+ Features extracted from the current agent state for meta-controller prediction.
+
+ These features capture the current state of the multi-agent system,
+ including confidence scores from different agents and contextual information.
+ """
+
+ hrm_confidence: float
+ """Confidence score from the HRM (Human Response Model) agent."""
+
+ trm_confidence: float
+ """Confidence score from the TRM (Task Response Model) agent."""
+
+ mcts_value: float
+ """Value estimate from the MCTS (Monte Carlo Tree Search) process."""
+
+ consensus_score: float
+ """Agreement score between different agents."""
+
+ last_agent: str
+ """Name of the last agent used ('hrm', 'trm', 'mcts', or 'none')."""
+
+ iteration: int
+ """Current iteration number in the reasoning process."""
+
+ query_length: int
+ """Length of the input query in characters."""
+
+ has_rag_context: bool
+ """Whether RAG (Retrieval-Augmented Generation) context is available."""
+
+
+@dataclass
+class MetaControllerPrediction:
+ """
+ Prediction output from the meta-controller.
+
+ Contains the selected agent and associated confidence/probability information.
+ """
+
+ agent: str
+ """Name of the selected agent ('hrm', 'trm', or 'mcts')."""
+
+ confidence: float
+ """Confidence score for the prediction (0.0 to 1.0)."""
+
+ probabilities: dict[str, float] = field(default_factory=dict)
+ """Probability distribution over all possible agents."""
+
+
+class AbstractMetaController(ABC):
+ """
+ Abstract base class for neural meta-controllers.
+
+ This class defines the interface that all meta-controller implementations
+ must follow. Meta-controllers are responsible for deciding which agent
+ should handle a given query based on the current system state.
+
+ Attributes:
+ AGENT_NAMES: List of valid agent names that can be selected.
+ name: Name of this meta-controller instance.
+ seed: Random seed for reproducibility.
+ """
+
+ AGENT_NAMES = ["hrm", "trm", "mcts"]
+
+ def __init__(self, name: str, seed: int = 42) -> None:
+ """
+ Initialize the meta-controller.
+
+ Args:
+ name: Name identifier for this meta-controller instance.
+ seed: Random seed for reproducibility. Defaults to 42.
+ """
+ self.name = name
+ self.seed = seed
+
+ @abstractmethod
+ def predict(self, features: MetaControllerFeatures) -> MetaControllerPrediction:
+ """
+ Predict which agent should handle the current query.
+
+ Args:
+ features: Features extracted from the current agent state.
+
+ Returns:
+ Prediction containing the selected agent and confidence scores.
+ """
+ pass
+
+ @abstractmethod
+ def load_model(self, path: str) -> None:
+ """
+ Load a trained model from disk.
+
+ Args:
+ path: Path to the saved model file or directory.
+ """
+ pass
+
+ @abstractmethod
+ def save_model(self, path: str) -> None:
+ """
+ Save the current model to disk.
+
+ Args:
+ path: Path where the model should be saved.
+ """
+ pass
+
+ def extract_features(self, state: dict[str, Any]) -> MetaControllerFeatures:
+ """
+ Extract meta-controller features from an AgentState dictionary.
+
+ This method converts raw state information into the structured
+ MetaControllerFeatures format required for prediction.
+
+ Args:
+ state: Dictionary containing agent state information.
+ Expected keys include:
+ - 'hrm_confidence' or nested in 'agent_confidences'
+ - 'trm_confidence' or nested in 'agent_confidences'
+ - 'mcts_value' or nested in 'mcts_state'
+ - 'consensus_score'
+ - 'last_agent'
+ - 'iteration'
+ - 'query' or 'query_length'
+ - 'rag_context' or 'has_rag_context'
+
+ Returns:
+ MetaControllerFeatures instance with extracted values.
+
+ Example:
+ >>> state = {
+ ... 'agent_confidences': {'hrm': 0.8, 'trm': 0.6},
+ ... 'mcts_state': {'value': 0.75},
+ ... 'consensus_score': 0.7,
+ ... 'last_agent': 'hrm',
+ ... 'iteration': 2,
+ ... 'query': 'What is machine learning?',
+ ... 'rag_context': 'ML is a subset of AI...'
+ ... }
+ >>> features = controller.extract_features(state)
+ """
+ # Extract HRM confidence
+ if "hrm_confidence" in state:
+ hrm_confidence = float(state["hrm_confidence"])
+ elif "agent_confidences" in state and isinstance(state["agent_confidences"], dict):
+ hrm_confidence = float(state["agent_confidences"].get("hrm", 0.0))
+ else:
+ hrm_confidence = 0.0
+
+ # Extract TRM confidence
+ if "trm_confidence" in state:
+ trm_confidence = float(state["trm_confidence"])
+ elif "agent_confidences" in state and isinstance(state["agent_confidences"], dict):
+ trm_confidence = float(state["agent_confidences"].get("trm", 0.0))
+ else:
+ trm_confidence = 0.0
+
+ # Extract MCTS value
+ if "mcts_value" in state:
+ mcts_value = float(state["mcts_value"])
+ elif "mcts_state" in state and isinstance(state["mcts_state"], dict):
+ mcts_value = float(state["mcts_state"].get("value", 0.0))
+ else:
+ mcts_value = 0.0
+
+ # Extract consensus score
+ consensus_score = float(state.get("consensus_score", 0.0))
+
+ # Extract last agent
+ last_agent = str(state.get("last_agent", "none"))
+ if last_agent not in self.AGENT_NAMES and last_agent != "none":
+ last_agent = "none"
+
+ # Extract iteration
+ iteration = int(state.get("iteration", 0))
+
+ # Extract query length
+ if "query_length" in state:
+ query_length = int(state["query_length"])
+ elif "query" in state and isinstance(state["query"], str):
+ query_length = len(state["query"])
+ else:
+ query_length = 0
+
+ # Extract has_rag_context
+ if "has_rag_context" in state:
+ has_rag_context = bool(state["has_rag_context"])
+ elif "rag_context" in state:
+ has_rag_context = state["rag_context"] is not None and len(str(state["rag_context"])) > 0
+ else:
+ has_rag_context = False
+
+ return MetaControllerFeatures(
+ hrm_confidence=hrm_confidence,
+ trm_confidence=trm_confidence,
+ mcts_value=mcts_value,
+ consensus_score=consensus_score,
+ last_agent=last_agent,
+ iteration=iteration,
+ query_length=query_length,
+ has_rag_context=has_rag_context,
+ )
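+
+
+# --- Minimal concrete subclass (illustrative only) ---
+# A sketch of the smallest implementation the interface above permits; the
+# uniform policy and no-op persistence are purely for demonstration.
+#
+# class UniformMetaController(AbstractMetaController):
+#     def predict(self, features: MetaControllerFeatures) -> MetaControllerPrediction:
+#         p = 1.0 / len(self.AGENT_NAMES)
+#         return MetaControllerPrediction(
+#             agent=self.AGENT_NAMES[0],
+#             confidence=p,
+#             probabilities={name: p for name in self.AGENT_NAMES},
+#         )
+#
+#     def load_model(self, path: str) -> None: ...  # nothing to load
+#     def save_model(self, path: str) -> None: ...  # nothing to save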
diff --git a/src/agents/meta_controller/bert_controller.py b/src/agents/meta_controller/bert_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b466036ef59475f362ded252ea73ca76e063ef6
--- /dev/null
+++ b/src/agents/meta_controller/bert_controller.py
@@ -0,0 +1,428 @@
+"""
+BERT-based Meta-Controller with LoRA adapters for efficient fine-tuning.
+
+This module provides a BERT-based meta-controller that uses Low-Rank Adaptation (LoRA)
+for parameter-efficient fine-tuning. The controller converts agent state features into
+text and uses a sequence classification model to predict the optimal agent.
+"""
+
+import warnings
+from typing import Any
+
+import torch
+
+from src.agents.meta_controller.base import (
+ AbstractMetaController,
+ MetaControllerFeatures,
+ MetaControllerPrediction,
+)
+from src.agents.meta_controller.utils import features_to_text
+
+# Handle optional transformers and peft imports gracefully
+_TRANSFORMERS_AVAILABLE = False
+_PEFT_AVAILABLE = False
+
+try:
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+ _TRANSFORMERS_AVAILABLE = True
+except ImportError:
+ warnings.warn(
+ "transformers library not installed. Install it with: pip install transformers",
+ ImportWarning,
+ stacklevel=2,
+ )
+ AutoTokenizer = None # type: ignore
+ AutoModelForSequenceClassification = None # type: ignore
+
+try:
+ from peft import LoraConfig, TaskType, get_peft_model
+
+ _PEFT_AVAILABLE = True
+except ImportError:
+ warnings.warn(
+ "peft library not installed. Install it with: pip install peft",
+ ImportWarning,
+ stacklevel=2,
+ )
+ LoraConfig = None # type: ignore
+ TaskType = None # type: ignore
+ get_peft_model = None # type: ignore
+
+
+class BERTMetaController(AbstractMetaController):
+ """
+ BERT-based meta-controller with optional LoRA adapters for efficient fine-tuning.
+
+ This controller converts agent state features into structured text and uses
+ a pre-trained BERT model (with optional LoRA adapters) to classify which
+ agent should handle the current query. LoRA enables parameter-efficient
+ fine-tuning by only training low-rank decomposition matrices.
+
+ Attributes:
+ DEFAULT_MODEL_NAME: Default BERT model to use.
+ NUM_LABELS: Number of output labels (agents to choose from).
+ device: PyTorch device for tensor operations.
+ model_name: Name of the pre-trained model.
+ lora_r: LoRA rank parameter.
+ lora_alpha: LoRA alpha scaling parameter.
+ lora_dropout: LoRA dropout rate.
+ use_lora: Whether to use LoRA adapters.
+ tokenizer: BERT tokenizer for text processing.
+ model: BERT sequence classification model (with or without LoRA).
+
+ Example:
+ >>> controller = BERTMetaController(name="BERTController", seed=42)
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> prediction = controller.predict(features)
+ >>> prediction.agent in ['hrm', 'trm', 'mcts']
+ True
+ >>> 0.0 <= prediction.confidence <= 1.0
+ True
+ """
+
+ DEFAULT_MODEL_NAME = "prajjwal1/bert-mini"
+ NUM_LABELS = 3
+
+ def __init__(
+ self,
+ name: str = "BERTMetaController",
+ seed: int = 42,
+ model_name: str | None = None,
+ lora_r: int = 4,
+ lora_alpha: int = 16,
+ lora_dropout: float = 0.1,
+ device: str | None = None,
+ use_lora: bool = True,
+ ) -> None:
+ """
+ Initialize the BERT meta-controller with optional LoRA adapters.
+
+ Args:
+ name: Name identifier for this controller. Defaults to "BERTMetaController".
+ seed: Random seed for reproducibility. Defaults to 42.
+ model_name: Pre-trained model name from HuggingFace. If None, uses DEFAULT_MODEL_NAME.
+ lora_r: LoRA rank parameter (lower = more compression). Defaults to 4.
+ lora_alpha: LoRA alpha scaling parameter. Defaults to 16.
+ lora_dropout: Dropout rate for LoRA layers. Defaults to 0.1.
+ device: Device to run model on ('cpu', 'cuda', 'mps', etc.).
+ If None, auto-detects best available device.
+ use_lora: Whether to apply LoRA adapters to the model. Defaults to True.
+
+ Raises:
+ ImportError: If transformers library is not installed.
+ ImportError: If use_lora is True and peft library is not installed.
+
+ Example:
+ >>> controller = BERTMetaController(
+ ... name="CustomBERT",
+ ... seed=123,
+ ... lora_r=8,
+ ... lora_alpha=32,
+ ... use_lora=True
+ ... )
+ """
+ super().__init__(name=name, seed=seed)
+
+ # Check for required dependencies
+ if not _TRANSFORMERS_AVAILABLE:
+ raise ImportError(
+ "transformers library is required for BERTMetaController. Install it with: pip install transformers"
+ )
+
+ if use_lora and not _PEFT_AVAILABLE:
+ raise ImportError("peft library is required for LoRA support. Install it with: pip install peft")
+
+ # Set random seed for reproducibility
+ torch.manual_seed(seed)
+
+ # Auto-detect device if not specified
+ if device is None:
+ if torch.cuda.is_available():
+ self.device = torch.device("cuda")
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ self.device = torch.device("mps")
+ else:
+ self.device = torch.device("cpu")
+ else:
+ self.device = torch.device(device)
+
+ # Store configuration parameters
+ self.model_name = model_name if model_name is not None else self.DEFAULT_MODEL_NAME
+ self.lora_r = lora_r
+ self.lora_alpha = lora_alpha
+ self.lora_dropout = lora_dropout
+ self.use_lora = use_lora
+
+ # Initialize tokenizer
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+ # Initialize base model for sequence classification
+ base_model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.NUM_LABELS)
+
+ # Apply LoRA adapters if requested
+ if self.use_lora:
+ lora_config = LoraConfig(
+ task_type=TaskType.SEQ_CLS,
+ r=self.lora_r,
+ lora_alpha=self.lora_alpha,
+ lora_dropout=self.lora_dropout,
+ target_modules=["query", "value"],
+ )
+ self.model = get_peft_model(base_model, lora_config)
+ else:
+ self.model = base_model
+
+ # Move model to device
+ self.model = self.model.to(self.device)
+
+ # Set model to evaluation mode
+ self.model.eval()
+
+ # Initialize tokenization cache for performance optimization
+ self._tokenization_cache: dict[str, Any] = {}
+
+ def predict(self, features: MetaControllerFeatures) -> MetaControllerPrediction:
+ """
+ Predict which agent should handle the current query.
+
+ Converts features to structured text, tokenizes the text, runs through
+ the BERT model, and returns a prediction with confidence scores.
+
+ Args:
+ features: Features extracted from the current agent state.
+
+ Returns:
+ Prediction containing the selected agent, confidence score,
+ and probability distribution over all agents.
+
+ Example:
+ >>> controller = BERTMetaController()
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.9,
+ ... trm_confidence=0.3,
+ ... mcts_value=0.5,
+ ... consensus_score=0.8,
+ ... last_agent='none',
+ ... iteration=0,
+ ... query_length=100,
+ ... has_rag_context=False
+ ... )
+ >>> pred = controller.predict(features)
+ >>> isinstance(pred.agent, str)
+ True
+ >>> isinstance(pred.confidence, float)
+ True
+ >>> len(pred.probabilities) == 3
+ True
+ """
+ # Convert features to structured text
+ text = features_to_text(features)
+
+ # Check cache for tokenized text
+ if text in self._tokenization_cache:
+ inputs = self._tokenization_cache[text]
+ else:
+ # Tokenize the text
+ inputs = self.tokenizer(
+ text,
+ return_tensors="pt",
+ padding=True,
+ truncation=True,
+ max_length=512,
+ )
+ # Cache the tokenized result
+ self._tokenization_cache[text] = inputs
+
+ # Move inputs to device
+ inputs = {key: value.to(self.device) for key, value in inputs.items()}
+
+ # Perform inference without gradient tracking
+ with torch.no_grad():
+ # Get logits from model
+ outputs = self.model(**inputs)
+ logits = outputs.logits
+
+ # Apply softmax to get probabilities
+ probabilities = torch.nn.functional.softmax(logits, dim=-1)
+
+ # Get predicted agent index (argmax)
+ predicted_idx = torch.argmax(probabilities, dim=-1).item()
+
+ # Extract confidence for selected agent
+ confidence = probabilities[0, predicted_idx].item()
+
+ # Create probability dictionary
+ prob_dict: dict[str, float] = {}
+ for i, agent_name in enumerate(self.AGENT_NAMES):
+ prob_dict[agent_name] = probabilities[0, i].item()
+
+ # Get agent name
+ selected_agent = self.AGENT_NAMES[predicted_idx]
+
+ return MetaControllerPrediction(
+ agent=selected_agent,
+ confidence=float(confidence),
+ probabilities=prob_dict,
+ )
+
+ def load_model(self, path: str) -> None:
+ """
+ Load a trained model from disk.
+
+ For LoRA models, loads the PEFT adapter weights. For base models,
+ loads the full state dictionary.
+
+ Args:
+ path: Path to the saved model file or directory.
+ For LoRA models, this should be a directory containing
+ adapter_config.json and the adapter weights (adapter_model.safetensors
+ or adapter_model.bin, depending on the peft version).
+ For base models, this should be a .pt or .pth file.
+
+ Raises:
+ FileNotFoundError: If the model file or directory does not exist.
+ RuntimeError: If the state dict is incompatible with the model.
+
+ Example:
+ >>> controller = BERTMetaController(use_lora=True)
+ >>> controller.load_model("/path/to/lora_adapter")
+ >>> controller = BERTMetaController(use_lora=False)
+ >>> controller.load_model("/path/to/model.pt")
+ """
+ if self.use_lora:
+ # Load PEFT adapter weights
+ # For PEFT models, the path should be a directory containing adapter files
+ from peft import PeftModel
+
+ # Get the base model from the PEFT wrapper
+ base_model = self.model.get_base_model()
+
+ # Load the PEFT model from the saved path
+ self.model = PeftModel.from_pretrained(base_model, path)
+ self.model = self.model.to(self.device)
+ else:
+ # Load base model state dict
+ state_dict = torch.load(path, map_location=self.device, weights_only=True)
+ self.model.load_state_dict(state_dict)
+
+ # Ensure model is in evaluation mode
+ self.model.eval()
+
+ def save_model(self, path: str) -> None:
+ """
+ Save the current model to disk.
+
+ For LoRA models, saves the PEFT adapter weights. For base models,
+ saves the full state dictionary.
+
+ Args:
+ path: Path where the model should be saved.
+ For LoRA models, this should be a directory path where
+ adapter_config.json and the adapter weights will be saved.
+ For base models, this should be a .pt or .pth file path.
+
+ Example:
+ >>> controller = BERTMetaController(use_lora=True)
+ >>> controller.save_model("/path/to/lora_adapter")
+ >>> controller = BERTMetaController(use_lora=False)
+ >>> controller.save_model("/path/to/model.pt")
+ """
+ if self.use_lora:
+ # Save PEFT adapter weights
+ # This saves only the LoRA adapter weights, not the full model
+ self.model.save_pretrained(path)
+ else:
+ # Save base model state dict
+ torch.save(self.model.state_dict(), path)
+
+ def clear_cache(self) -> None:
+ """
+ Clear the tokenization cache.
+
+ This method removes all cached tokenized inputs, freeing memory.
+ Useful when processing many different feature combinations or
+ when memory usage is a concern.
+
+ Example:
+ >>> controller = BERTMetaController()
+ >>> # After many predictions...
+ >>> controller.clear_cache()
+ >>> info = controller.get_cache_info()
+ >>> info['cache_size'] == 0
+ True
+ """
+ self._tokenization_cache.clear()
+
+ def get_cache_info(self) -> dict[str, Any]:
+ """
+ Get information about the current tokenization cache.
+
+ Returns:
+ Dictionary containing cache statistics:
+ - cache_size: Number of cached tokenizations
+ - cache_keys: List of cached text inputs (truncated for display)
+
+ Example:
+ >>> controller = BERTMetaController()
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> _ = controller.predict(features)
+ >>> info = controller.get_cache_info()
+ >>> 'cache_size' in info
+ True
+ >>> info['cache_size'] >= 1
+ True
+ """
+ # Truncate keys for display (first 50 chars)
+ truncated_keys = [key[:50] + "..." if len(key) > 50 else key for key in self._tokenization_cache]
+
+ return {
+ "cache_size": len(self._tokenization_cache),
+ "cache_keys": truncated_keys,
+ }
+
+ def get_trainable_parameters(self) -> dict[str, int]:
+ """
+ Get the number of trainable and total parameters in the model.
+
+ This is particularly useful for LoRA models to see the efficiency
+ gains from using low-rank adaptation.
+
+ Returns:
+ Dictionary containing:
+ - total_params: Total number of parameters in the model
+ - trainable_params: Number of trainable parameters
+ - trainable_percentage: Percentage of parameters that are trainable
+
+ Example:
+ >>> controller = BERTMetaController(use_lora=True)
+ >>> params = controller.get_trainable_parameters()
+ >>> params['trainable_percentage'] < 10.0 # LoRA trains <10% of params
+ True
+ """
+ total_params = sum(p.numel() for p in self.model.parameters())
+ trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+ trainable_percentage = (trainable_params / total_params) * 100 if total_params > 0 else 0.0
+
+ return {
+ "total_params": total_params,
+ "trainable_params": trainable_params,
+ "trainable_percentage": round(trainable_percentage, 2),
+ }
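+
+
+# --- Usage sketch (illustrative only; requires transformers, plus peft when
+# use_lora=True; `features` is a MetaControllerFeatures instance) ---
+#
+# controller = BERTMetaController(use_lora=True)
+# print(controller.get_trainable_parameters())  # LoRA trains a small fraction
+# pred = controller.predict(features)           # arbitrary until fine-tuned
+# print(pred.agent, pred.confidence, pred.probabilities)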
diff --git a/src/agents/meta_controller/config_loader.py b/src/agents/meta_controller/config_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5df65ebacc42c944fa5a599142267350529ec6f5
--- /dev/null
+++ b/src/agents/meta_controller/config_loader.py
@@ -0,0 +1,304 @@
+"""
+Configuration loader for the Neural Meta-Controller framework.
+
+This module provides dataclass-based configuration management for the Meta-Controller,
+supporting both RNN and BERT-based neural network controllers with comprehensive
+validation and serialization capabilities.
+"""
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+
+@dataclass
+class RNNConfig:
+ """
+ Configuration for RNN-based Meta-Controller.
+
+ Attributes:
+ hidden_dim: Hidden dimension size for RNN layers. Default is 64.
+ num_layers: Number of RNN layers. Default is 1.
+ dropout: Dropout rate for regularization. Default is 0.1.
+ model_path: Optional path to a pre-trained model file. None for untrained model.
+ """
+
+ hidden_dim: int = 64
+ num_layers: int = 1
+ dropout: float = 0.1
+ model_path: str | None = None
+
+
+@dataclass
+class BERTConfig:
+ """
+ Configuration for BERT-based Meta-Controller with LoRA fine-tuning.
+
+ Attributes:
+ model_name: Name of the pre-trained BERT model from HuggingFace.
+ Default is "prajjwal1/bert-mini" for lightweight deployment.
+ use_lora: Whether to use LoRA (Low-Rank Adaptation) for efficient fine-tuning.
+ Default is True.
+ lora_r: LoRA rank parameter. Controls the rank of the low-rank matrices.
+ Default is 4.
+ lora_alpha: LoRA alpha parameter. Scaling factor for LoRA weights.
+ Default is 16.
+ lora_dropout: Dropout rate for LoRA layers. Default is 0.1.
+ model_path: Optional path to a trained LoRA adapter. None for base model only.
+ """
+
+ model_name: str = "prajjwal1/bert-mini"
+ use_lora: bool = True
+ lora_r: int = 4
+ lora_alpha: int = 16
+ lora_dropout: float = 0.1
+ model_path: str | None = None
+
+
+@dataclass
+class InferenceConfig:
+ """
+ Configuration for inference settings.
+
+ Attributes:
+ device: Device to use for inference ("cpu", "cuda", "cuda:0", etc.).
+ None for auto-detection based on available hardware.
+ seed: Random seed for reproducibility. Default is 42.
+ """
+
+ device: str | None = None
+ seed: int = 42
+
+
+@dataclass
+class MetaControllerConfig:
+ """
+ Main configuration for the Neural Meta-Controller framework.
+
+ This configuration controls the behavior of the Meta-Controller, including
+ which type of neural network to use (RNN or BERT), fallback behavior,
+ and specific model parameters.
+
+ Attributes:
+ enabled: Whether the neural Meta-Controller is enabled. Default is False
+ for backward compatibility with rule-based systems.
+ type: Type of neural network controller ("rnn" or "bert"). Default is "rnn".
+ fallback_to_rule_based: Whether to fall back to rule-based selection on errors.
+ Default is True for robustness.
+ rnn: Configuration for RNN-based controller.
+ bert: Configuration for BERT-based controller.
+ inference: Configuration for inference settings.
+ """
+
+ enabled: bool = False
+ type: str = "rnn" # "rnn" or "bert"
+ fallback_to_rule_based: bool = True
+ rnn: RNNConfig = field(default_factory=RNNConfig)
+ bert: BERTConfig = field(default_factory=BERTConfig)
+ inference: InferenceConfig = field(default_factory=InferenceConfig)
+
+
+class MetaControllerConfigLoader:
+ """
+ Loader class for Meta-Controller configuration.
+
+ Provides methods for loading configuration from YAML files or dictionaries,
+ converting configuration to dictionaries, and validating configuration values.
+
+ Example:
+ >>> loader = MetaControllerConfigLoader()
+ >>> config = loader.load_from_yaml("config/meta_controller.yaml")
+ >>> print(config.type)
+ rnn
+ >>> MetaControllerConfigLoader.validate(config)
+ """
+
+ @staticmethod
+ def load_from_yaml(path: str) -> MetaControllerConfig:
+ """
+ Load Meta-Controller configuration from a YAML file.
+
+ Args:
+ path: Path to the YAML configuration file.
+
+ Returns:
+ MetaControllerConfig: Loaded and parsed configuration object.
+
+ Raises:
+ FileNotFoundError: If the specified file does not exist.
+ yaml.YAMLError: If the file contains invalid YAML.
+ KeyError: If the 'meta_controller' key is missing from the file.
+
+ Example:
+ >>> config = MetaControllerConfigLoader.load_from_yaml("config/meta_controller.yaml")
+ >>> print(config.enabled)
+ False
+ """
+ yaml_path = Path(path)
+
+ if not yaml_path.exists():
+ raise FileNotFoundError(f"Configuration file not found: {path}")
+
+ with open(yaml_path) as f:
+ raw_config = yaml.safe_load(f)
+
+ if "meta_controller" not in raw_config:
+ raise KeyError("Configuration file must contain 'meta_controller' key")
+
+ return MetaControllerConfigLoader.load_from_dict(raw_config["meta_controller"])
+
+ @staticmethod
+ def load_from_dict(config_dict: dict[str, Any]) -> MetaControllerConfig:
+ """
+ Load Meta-Controller configuration from a dictionary.
+
+ Args:
+ config_dict: Dictionary containing configuration values.
+
+ Returns:
+ MetaControllerConfig: Parsed configuration object with defaults
+ applied for missing values.
+
+ Example:
+ >>> config_dict = {
+ ... 'enabled': True,
+ ... 'type': 'bert',
+ ... 'bert': {'model_name': 'bert-base-uncased'}
+ ... }
+ >>> config = MetaControllerConfigLoader.load_from_dict(config_dict)
+ >>> print(config.type)
+ bert
+ """
+ # Parse nested configurations
+ rnn_config = RNNConfig(**config_dict.get("rnn", {}))
+ bert_config = BERTConfig(**config_dict.get("bert", {}))
+ inference_config = InferenceConfig(**config_dict.get("inference", {}))
+
+ # Create main config with nested configs
+ return MetaControllerConfig(
+ enabled=config_dict.get("enabled", False),
+ type=config_dict.get("type", "rnn"),
+ fallback_to_rule_based=config_dict.get("fallback_to_rule_based", True),
+ rnn=rnn_config,
+ bert=bert_config,
+ inference=inference_config,
+ )
+
+ @staticmethod
+ def to_dict(config: MetaControllerConfig) -> dict[str, Any]:
+ """
+ Convert a MetaControllerConfig object to a dictionary.
+
+ Args:
+ config: MetaControllerConfig object to convert.
+
+ Returns:
+ Dict[str, Any]: Dictionary representation of the configuration.
+
+ Example:
+ >>> config = MetaControllerConfig(enabled=True, type='bert')
+ >>> config_dict = MetaControllerConfigLoader.to_dict(config)
+ >>> print(config_dict['enabled'])
+ True
+ """
+ return asdict(config)
+
+ @staticmethod
+ def validate(config: MetaControllerConfig) -> None:
+ """
+ Validate the Meta-Controller configuration.
+
+ Checks that:
+ - The controller type is valid ("rnn" or "bert")
+ - Model paths exist if specified
+ - Numeric parameters are within valid ranges
+
+ Args:
+ config: MetaControllerConfig object to validate.
+
+ Raises:
+ ValueError: If the configuration contains invalid values.
+ FileNotFoundError: If specified model paths do not exist.
+
+ Example:
+ >>> config = MetaControllerConfig(type='invalid')
+ >>> MetaControllerConfigLoader.validate(config)
+ Traceback (most recent call last):
+ ...
+ ValueError: Invalid controller type 'invalid'. Must be one of: ['rnn', 'bert']
+ """
+ # Validate controller type
+ valid_types = ["rnn", "bert"]
+ if config.type not in valid_types:
+ raise ValueError(f"Invalid controller type '{config.type}'. Must be one of: {valid_types}")
+
+ # Validate RNN config
+ if config.rnn.hidden_dim <= 0:
+ raise ValueError(f"RNN hidden_dim must be positive, got {config.rnn.hidden_dim}")
+ if config.rnn.num_layers <= 0:
+ raise ValueError(f"RNN num_layers must be positive, got {config.rnn.num_layers}")
+ if not 0.0 <= config.rnn.dropout <= 1.0:
+ raise ValueError(f"RNN dropout must be between 0 and 1, got {config.rnn.dropout}")
+ if config.rnn.model_path is not None:
+ rnn_path = Path(config.rnn.model_path)
+ if not rnn_path.exists():
+ raise FileNotFoundError(f"RNN model path does not exist: {config.rnn.model_path}")
+
+ # Validate BERT config
+ if config.bert.lora_r <= 0:
+ raise ValueError(f"BERT lora_r must be positive, got {config.bert.lora_r}")
+ if config.bert.lora_alpha <= 0:
+ raise ValueError(f"BERT lora_alpha must be positive, got {config.bert.lora_alpha}")
+ if not 0.0 <= config.bert.lora_dropout <= 1.0:
+ raise ValueError(f"BERT lora_dropout must be between 0 and 1, got {config.bert.lora_dropout}")
+ if config.bert.model_path is not None:
+ bert_path = Path(config.bert.model_path)
+ if not bert_path.exists():
+ raise FileNotFoundError(f"BERT model path does not exist: {config.bert.model_path}")
+
+ # Validate inference config
+ if config.inference.device is not None:
+ valid_devices = ["cpu", "cuda", "mps"]
+ # Check if device starts with a valid prefix (e.g., "cuda:0", "cuda:1")
+ device_base = config.inference.device.split(":")[0]
+ if device_base not in valid_devices:
+ raise ValueError(f"Invalid device '{config.inference.device}'. Must start with one of: {valid_devices}")
+
+ if not isinstance(config.inference.seed, int) or config.inference.seed < 0:
+ raise ValueError(f"Inference seed must be a non-negative integer, got {config.inference.seed}")
+
+ @staticmethod
+ def save_to_yaml(config: MetaControllerConfig, path: str) -> None:
+ """
+ Save a MetaControllerConfig object to a YAML file.
+
+ Args:
+ config: MetaControllerConfig object to save.
+ path: Path where the YAML file will be saved.
+
+ Example:
+ >>> config = MetaControllerConfig(enabled=True)
+ >>> MetaControllerConfigLoader.save_to_yaml(config, "my_config.yaml")
+ """
+ yaml_path = Path(path)
+ yaml_path.parent.mkdir(parents=True, exist_ok=True)
+
+ config_dict = {"meta_controller": MetaControllerConfigLoader.to_dict(config)}
+
+ with open(yaml_path, "w") as f:
+ yaml.dump(config_dict, f, default_flow_style=False, sort_keys=False)
+
+ @staticmethod
+ def get_default_config() -> MetaControllerConfig:
+ """
+ Get a default MetaControllerConfig with all default values.
+
+ Returns:
+ MetaControllerConfig: Configuration object with default values.
+
+ Example:
+ >>> config = MetaControllerConfigLoader.get_default_config()
+ >>> print(config.enabled)
+ False
+ """
+ return MetaControllerConfig()
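+
+
+# --- Expected YAML layout (illustrative only) ---
+# A file consumed by load_from_yaml is expected to look like the sketch below;
+# every key is optional and falls back to the dataclass defaults above.
+#
+# meta_controller:
+#   enabled: true
+#   type: bert                      # "rnn" or "bert"
+#   fallback_to_rule_based: true
+#   rnn:
+#     hidden_dim: 64
+#     num_layers: 1
+#   bert:
+#     model_name: prajjwal1/bert-mini
+#     use_lora: true
+#     lora_r: 4
+#   inference:
+#     device: cpu
+#     seed: 42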
diff --git a/src/agents/meta_controller/rnn_controller.py b/src/agents/meta_controller/rnn_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..43e6ef1b4b11e982319e4b0df55df8c62333d602
--- /dev/null
+++ b/src/agents/meta_controller/rnn_controller.py
@@ -0,0 +1,345 @@
+"""
+RNN-based Meta-Controller for dynamic agent selection.
+
+This module provides a GRU-based recurrent neural network meta-controller
+that learns to select the optimal agent (HRM, TRM, or MCTS) based on
+sequential patterns in the agent state features.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from src.agents.meta_controller.base import (
+ AbstractMetaController,
+ MetaControllerFeatures,
+ MetaControllerPrediction,
+)
+from src.agents.meta_controller.utils import features_to_tensor
+
+
+class RNNMetaControllerModel(nn.Module):
+ """
+ GRU-based neural network model for meta-controller predictions.
+
+ This model uses a Gated Recurrent Unit (GRU) to capture sequential
+ patterns in agent state features and predict which agent should be
+ selected next.
+
+ Architecture:
+ - GRU layer for sequence processing
+ - Dropout for regularization
+ - Linear layer for classification
+
+ Attributes:
+ gru: GRU recurrent layer for processing sequences.
+ dropout: Dropout layer for regularization.
+ fc: Fully connected output layer.
+ hidden_dim: Dimension of the hidden state.
+ num_layers: Number of GRU layers.
+ """
+
+ def __init__(
+ self,
+ input_dim: int = 10,
+ hidden_dim: int = 64,
+ num_layers: int = 1,
+ num_agents: int = 3,
+ dropout: float = 0.1,
+ ) -> None:
+ """
+ Initialize the RNN meta-controller model.
+
+ Args:
+ input_dim: Dimension of input features. Defaults to 10.
+ hidden_dim: Dimension of GRU hidden state. Defaults to 64.
+ num_layers: Number of stacked GRU layers. Defaults to 1.
+ num_agents: Number of agents to choose from. Defaults to 3.
+ dropout: Dropout probability for regularization. Defaults to 0.1.
+ """
+ super().__init__()
+
+ self.hidden_dim = hidden_dim
+ self.num_layers = num_layers
+
+ # GRU layer for sequence processing
+ self.gru = nn.GRU(
+ input_size=input_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
+ dropout=dropout if num_layers > 1 else 0.0,
+ )
+
+ # Dropout for regularization
+ self.dropout = nn.Dropout(p=dropout)
+
+ # Linear output layer for classification
+ self.fc = nn.Linear(hidden_dim, num_agents)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Forward pass through the model.
+
+ Processes input features through GRU and produces agent selection logits.
+
+ Args:
+ x: Input tensor of shape (batch_size, features) or
+ (batch_size, seq_len, features).
+
+ Returns:
+ Logits tensor of shape (batch_size, num_agents).
+ Note: Returns raw logits, NOT softmax probabilities.
+
+ Example:
+ >>> model = RNNMetaControllerModel()
+ >>> x = torch.randn(4, 10) # batch of 4, 10 features
+ >>> logits = model(x)
+ >>> logits.shape
+ torch.Size([4, 3])
+ """
+ # Handle 2D input by adding sequence dimension
+ if x.dim() == 2:
+ # Shape: (batch_size, features) -> (batch_size, 1, features)
+ x = x.unsqueeze(1)
+
+ # Pass through GRU
+ # output shape: (batch_size, seq_len, hidden_dim)
+ # hidden shape: (num_layers, batch_size, hidden_dim)
+ output, hidden = self.gru(x)
+
+ # Take the final hidden state from the last GRU layer
+ # Shape: (batch_size, hidden_dim); hidden[-1] covers single- and multi-layer GRUs
+ final_hidden = hidden[-1]
+
+ # Apply dropout
+ dropped = self.dropout(final_hidden)
+
+ # Apply linear layer to get logits
+ logits = self.fc(dropped)
+
+ return logits
+
+
+class RNNMetaController(AbstractMetaController):
+ """
+ RNN-based meta-controller using GRU for agent selection.
+
+ This controller uses a recurrent neural network to learn patterns in
+ agent state sequences and predict the optimal agent for the current
+ situation. It supports both CPU and GPU execution.
+
+ Attributes:
+ device: PyTorch device (CPU or CUDA) for tensor operations.
+ hidden_dim: Dimension of GRU hidden state.
+ num_layers: Number of GRU layers.
+ dropout: Dropout probability.
+ model: The underlying RNNMetaControllerModel.
+ hidden_state: Optional hidden state for sequence tracking.
+
+ Example:
+ >>> controller = RNNMetaController(name="RNNController", seed=42)
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> prediction = controller.predict(features)
+ >>> prediction.agent in ['hrm', 'trm', 'mcts']
+ True
+ >>> 0.0 <= prediction.confidence <= 1.0
+ True
+ """
+
+ def __init__(
+ self,
+ name: str = "RNNMetaController",
+ seed: int = 42,
+ hidden_dim: int = 64,
+ num_layers: int = 1,
+ dropout: float = 0.1,
+ device: str | None = None,
+ ) -> None:
+ """
+ Initialize the RNN meta-controller.
+
+ Args:
+ name: Name identifier for this controller. Defaults to "RNNMetaController".
+ seed: Random seed for reproducibility. Defaults to 42.
+ hidden_dim: Dimension of GRU hidden state. Defaults to 64.
+ num_layers: Number of GRU layers. Defaults to 1.
+ dropout: Dropout probability. Defaults to 0.1.
+ device: Device to run model on ('cpu', 'cuda', 'mps', etc.).
+ If None, auto-detects best available device.
+ """
+ super().__init__(name=name, seed=seed)
+
+ # Set random seed for reproducibility
+ torch.manual_seed(seed)
+
+ # Auto-detect device if not specified
+ if device is None:
+ if torch.cuda.is_available():
+ self.device = torch.device("cuda")
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ self.device = torch.device("mps")
+ else:
+ self.device = torch.device("cpu")
+ else:
+ self.device = torch.device(device)
+
+ # Store configuration
+ self.hidden_dim = hidden_dim
+ self.num_layers = num_layers
+ self.dropout = dropout
+
+ # Initialize model
+ self.model = RNNMetaControllerModel(
+ input_dim=10, # Fixed based on features_to_tensor output
+ hidden_dim=hidden_dim,
+ num_layers=num_layers,
+ num_agents=len(self.AGENT_NAMES),
+ dropout=dropout,
+ )
+
+ # Move model to device
+ self.model = self.model.to(self.device)
+
+ # Set model to evaluation mode
+ self.model.eval()
+
+ # Initialize hidden state for sequence tracking
+ self.hidden_state: torch.Tensor | None = None
+
+ def predict(self, features: MetaControllerFeatures) -> MetaControllerPrediction:
+ """
+ Predict which agent should handle the current query.
+
+ Converts features to tensor format, runs through the GRU model,
+ and returns a prediction with confidence scores.
+
+ Args:
+ features: Features extracted from the current agent state.
+
+ Returns:
+ Prediction containing the selected agent, confidence score,
+ and probability distribution over all agents.
+
+ Example:
+ >>> controller = RNNMetaController()
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.9,
+ ... trm_confidence=0.3,
+ ... mcts_value=0.5,
+ ... consensus_score=0.8,
+ ... last_agent='none',
+ ... iteration=0,
+ ... query_length=100,
+ ... has_rag_context=False
+ ... )
+ >>> pred = controller.predict(features)
+ >>> isinstance(pred.agent, str)
+ True
+ >>> isinstance(pred.confidence, float)
+ True
+ >>> len(pred.probabilities) == 3
+ True
+ """
+ # Convert features to tensor
+ feature_tensor = features_to_tensor(features)
+
+ # Add batch dimension: (10,) -> (1, 10)
+ feature_tensor = feature_tensor.unsqueeze(0)
+
+ # Move to device
+ feature_tensor = feature_tensor.to(self.device)
+
+ # Perform inference without gradient tracking
+ with torch.no_grad():
+ # Get logits from model
+ logits = self.model(feature_tensor)
+
+ # Apply softmax to get probabilities
+ probabilities = F.softmax(logits, dim=-1)
+
+ # Get predicted agent index (argmax)
+ predicted_idx = torch.argmax(probabilities, dim=-1).item()
+
+ # Extract confidence for selected agent
+ confidence = probabilities[0, predicted_idx].item()
+
+ # Create probability dictionary
+ prob_dict: dict[str, float] = {}
+ for i, agent_name in enumerate(self.AGENT_NAMES):
+ prob_dict[agent_name] = probabilities[0, i].item()
+
+ # Get agent name
+ selected_agent = self.AGENT_NAMES[predicted_idx]
+
+ return MetaControllerPrediction(
+ agent=selected_agent,
+ confidence=float(confidence),
+ probabilities=prob_dict,
+ )
+
+ def load_model(self, path: str) -> None:
+ """
+ Load a trained model from disk.
+
+ Loads the model state dictionary from the specified path and
+ sets the model to evaluation mode.
+
+ Args:
+ path: Path to the saved model file (.pt or .pth).
+
+ Raises:
+ FileNotFoundError: If the model file does not exist.
+ RuntimeError: If the state dict is incompatible with the model.
+
+ Example:
+ >>> controller = RNNMetaController()
+ >>> controller.load_model("/path/to/model.pt")
+ """
+ # Load state dict with appropriate device mapping
+ state_dict = torch.load(path, map_location=self.device, weights_only=True)
+
+ # Load into model
+ self.model.load_state_dict(state_dict)
+
+ # Ensure model is in evaluation mode
+ self.model.eval()
+
+ def save_model(self, path: str) -> None:
+ """
+ Save the current model to disk.
+
+ Saves the model state dictionary to the specified path.
+
+ Args:
+ path: Path where the model should be saved (.pt or .pth).
+
+ Example:
+ >>> controller = RNNMetaController()
+ >>> controller.save_model("/path/to/model.pt")
+ """
+ torch.save(self.model.state_dict(), path)
+
+ def reset_hidden_state(self) -> None:
+ """
+ Reset the hidden state for sequence tracking.
+
+ This method clears any accumulated hidden state, useful when
+ starting a new conversation or resetting the controller state.
+
+ Example:
+ >>> controller = RNNMetaController()
+ >>> controller.reset_hidden_state()
+ >>> controller.hidden_state is None
+ True
+ """
+ self.hidden_state = None
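+
+
+# --- Save/load round trip (illustrative only; "rnn_meta.pt" is a hypothetical
+# path) ---
+# The restored controller must use the same hidden_dim and num_layers for the
+# state dict to match.
+#
+# controller = RNNMetaController(seed=0)
+# controller.save_model("rnn_meta.pt")
+# restored = RNNMetaController(seed=0)
+# restored.load_model("rnn_meta.pt")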
diff --git a/src/agents/meta_controller/utils.py b/src/agents/meta_controller/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..faa5cab1ab7fd7bb5e73d94e086e35b31625e1cd
--- /dev/null
+++ b/src/agents/meta_controller/utils.py
@@ -0,0 +1,201 @@
+"""
+Utility functions for Neural Meta-Controller feature processing.
+
+Provides functions for normalizing, encoding, and converting features
+into formats suitable for different neural network architectures.
+"""
+
+import torch
+
+from src.agents.meta_controller.base import MetaControllerFeatures
+
+
+def normalize_features(features: MetaControllerFeatures) -> list[float]:
+ """
+ Normalize meta-controller features to a 10-dimensional vector in range [0, 1].
+
+ The normalization strategy:
+ - Confidence scores (hrm, trm, mcts_value, consensus): Already 0-1, clipped
+ - last_agent: Encoded as 3 one-hot values (hrm=0, trm=1, mcts=2)
+ - iteration: Normalized to 0-1 assuming max 20 iterations
+ - query_length: Normalized to 0-1 assuming max 10000 characters
+ - has_rag_context: Binary 0 or 1
+
+ Output vector structure (10 dimensions):
+ [hrm_conf, trm_conf, mcts_value, consensus, last_hrm, last_trm, last_mcts,
+ iteration_norm, query_length_norm, has_rag]
+
+ Args:
+ features: MetaControllerFeatures instance to normalize.
+
+ Returns:
+ List of 10 floats, each normalized to range [0, 1].
+
+ Example:
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> normalized = normalize_features(features)
+ >>> len(normalized)
+ 10
+ >>> all(0.0 <= v <= 1.0 for v in normalized)
+ True
+ """
+ # Clip confidence scores to [0, 1]
+ hrm_conf = max(0.0, min(1.0, features.hrm_confidence))
+ trm_conf = max(0.0, min(1.0, features.trm_confidence))
+ mcts_val = max(0.0, min(1.0, features.mcts_value))
+ consensus = max(0.0, min(1.0, features.consensus_score))
+
+ # One-hot encode last_agent (3 dimensions)
+ last_agent_onehot = one_hot_encode_agent(features.last_agent)
+
+ # Normalize iteration (assuming max 20 iterations)
+ max_iterations = 20
+ iteration_norm = max(0.0, min(1.0, features.iteration / max_iterations))
+
+ # Normalize query length (assuming max 10000 characters)
+ max_query_length = 10000
+ query_length_norm = max(0.0, min(1.0, features.query_length / max_query_length))
+
+ # Binary for has_rag_context
+ has_rag = 1.0 if features.has_rag_context else 0.0
+
+ # Combine into 10-dimensional vector
+ return [
+ hrm_conf,
+ trm_conf,
+ mcts_val,
+ consensus,
+ last_agent_onehot[0], # hrm
+ last_agent_onehot[1], # trm
+ last_agent_onehot[2], # mcts
+ iteration_norm,
+ query_length_norm,
+ has_rag,
+ ]
+
+
+def one_hot_encode_agent(agent: str) -> list[float]:
+ """
+ One-hot encode an agent name into a 3-dimensional vector.
+
+ Encoding:
+ - 'hrm' -> [1.0, 0.0, 0.0]
+ - 'trm' -> [0.0, 1.0, 0.0]
+ - 'mcts' -> [0.0, 0.0, 1.0]
+ - 'none' or other -> [0.0, 0.0, 0.0]
+
+ Args:
+ agent: Agent name string ('hrm', 'trm', 'mcts', or 'none').
+
+ Returns:
+ List of 3 floats representing the one-hot encoding.
+
+ Example:
+ >>> one_hot_encode_agent('hrm')
+ [1.0, 0.0, 0.0]
+ >>> one_hot_encode_agent('trm')
+ [0.0, 1.0, 0.0]
+ >>> one_hot_encode_agent('mcts')
+ [0.0, 0.0, 1.0]
+ >>> one_hot_encode_agent('none')
+ [0.0, 0.0, 0.0]
+ """
+ agent_lower = agent.lower()
+
+ if agent_lower == "hrm": # noqa: SIM116
+ return [1.0, 0.0, 0.0]
+ elif agent_lower == "trm":
+ return [0.0, 1.0, 0.0]
+ elif agent_lower == "mcts":
+ return [0.0, 0.0, 1.0]
+ else:
+ # 'none' or unknown agent
+ return [0.0, 0.0, 0.0]
+
+
+def features_to_tensor(features: MetaControllerFeatures) -> torch.Tensor:
+ """
+ Convert meta-controller features to a PyTorch tensor.
+
+ Uses normalize_features internally to create a normalized 10-dimensional
+ tensor suitable for neural network input.
+
+ Args:
+ features: MetaControllerFeatures instance to convert.
+
+ Returns:
+ PyTorch tensor of shape (10,) with float32 dtype.
+
+ Example:
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> tensor = features_to_tensor(features)
+ >>> tensor.shape
+ torch.Size([10])
+ >>> tensor.dtype
+ torch.float32
+ """
+ normalized = normalize_features(features)
+ return torch.tensor(normalized, dtype=torch.float32)
+
+
+def features_to_text(features: MetaControllerFeatures) -> str:
+ """
+ Convert meta-controller features to structured text format.
+
+ Creates a human-readable text representation suitable for text-based
+ models like BERT or other language models.
+
+ Args:
+ features: MetaControllerFeatures instance to convert.
+
+ Returns:
+ Structured text string describing the features.
+
+ Example:
+ >>> features = MetaControllerFeatures(
+ ... hrm_confidence=0.8,
+ ... trm_confidence=0.6,
+ ... mcts_value=0.75,
+ ... consensus_score=0.7,
+ ... last_agent='hrm',
+ ... iteration=2,
+ ... query_length=150,
+ ... has_rag_context=True
+ ... )
+ >>> text = features_to_text(features)
+ >>> 'HRM confidence: 0.800' in text
+ True
+ """
+ rag_status = "available" if features.has_rag_context else "not available"
+
+ text = (
+ f"Agent State Features:\n"
+ f"HRM confidence: {features.hrm_confidence:.3f}\n"
+ f"TRM confidence: {features.trm_confidence:.3f}\n"
+ f"MCTS value: {features.mcts_value:.3f}\n"
+ f"Consensus score: {features.consensus_score:.3f}\n"
+ f"Last agent used: {features.last_agent}\n"
+ f"Current iteration: {features.iteration}\n"
+ f"Query length: {features.query_length} characters\n"
+ f"RAG context: {rag_status}"
+ )
+
+ return text
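+
+
+# --- Converter pipeline sketch (illustrative only; `features` is a
+# MetaControllerFeatures instance) ---
+# The two converters feed the two controller families:
+#
+# vec = features_to_tensor(features)  # torch.Size([10]), float32 -> RNN controller
+# txt = features_to_text(features)    # multi-line string -> BERT controller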
diff --git a/src/agents/trm_agent.py b/src/agents/trm_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..085e5936622c94c12477e67bf2391ee997401f08
--- /dev/null
+++ b/src/agents/trm_agent.py
@@ -0,0 +1,395 @@
+"""
+Tiny Recursive Model (TRM) Agent.
+
+Implements recursive refinement with:
+- Deep supervision at all recursion levels
+- Convergence detection
+- Memory-efficient recursion
+- Iterative improvement mechanism
+
+Based on principles from:
+- "Recursive Refinement Networks"
+- "Deep Supervision for Neural Networks"
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+
+from ..training.system_config import TRMConfig
+
+
+@dataclass
+class TRMOutput:
+ """Output from TRM recursive processing."""
+
+ final_prediction: torch.Tensor # Final refined output
+ intermediate_predictions: list[torch.Tensor] # Predictions at each recursion
+ recursion_depth: int # Actual depth used
+ converged: bool # Whether convergence was achieved
+ convergence_step: int # Step at which convergence occurred
+ residual_norms: list[float] # L2 norms of residuals at each step
+
+
+class RecursiveBlock(nn.Module):
+ """
+ Core recursive processing block.
+
+ Applies the same transformation repeatedly, with residual connections.
+ """
+
+ def __init__(self, config: TRMConfig):
+ super().__init__()
+ self.config = config
+
+ # Main processing pathway
+ self.transform = nn.Sequential(
+ nn.Linear(config.latent_dim, config.hidden_dim),
+ nn.LayerNorm(config.hidden_dim) if config.use_layer_norm else nn.Identity(),
+ nn.GELU(),
+ nn.Dropout(config.dropout),
+ nn.Linear(config.hidden_dim, config.latent_dim),
+ nn.LayerNorm(config.latent_dim) if config.use_layer_norm else nn.Identity(),
+ )
+
+ # Residual scaling (learned)
+ self.residual_scale = nn.Parameter(torch.ones(1))
+
+ def forward(self, x: torch.Tensor, iteration: int = 0) -> torch.Tensor: # noqa: ARG002
+ """
+ Apply recursive transformation.
+
+ Args:
+ x: Input tensor [batch, ..., latent_dim]
+ iteration: Current recursion iteration (reserved for future iteration-dependent behavior)
+
+ Returns:
+ Refined tensor [batch, ..., latent_dim]
+ """
+ # Residual connection with learned scaling
+ residual = self.transform(x)
+ return x + self.residual_scale * residual
+
+
+class DeepSupervisionHead(nn.Module):
+ """
+ Supervision head for intermediate predictions.
+
+ Enables training signal at each recursion level.
+ """
+
+ def __init__(self, latent_dim: int, output_dim: int):
+ super().__init__()
+ self.head = nn.Sequential(
+ nn.Linear(latent_dim, latent_dim // 2),
+ nn.ReLU(),
+ nn.Linear(latent_dim // 2, output_dim),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Generate prediction from latent state."""
+ return self.head(x)
+
+
+class TRMAgent(nn.Module):
+ """
+ Tiny Recursive Model for iterative refinement.
+
+ Features:
+ - Shared weights across recursions (parameter efficiency)
+ - Deep supervision at all levels
+ - Automatic convergence detection
+ - Residual connections for stable gradients
+ """
+
+ def __init__(self, config: TRMConfig, output_dim: int | None = None, device: str = "cpu"):
+ super().__init__()
+ self.config = config
+ self.device = device
+ self.output_dim = output_dim or config.latent_dim
+
+ # Initial encoding
+ self.encoder = nn.Sequential(
+ nn.Linear(config.latent_dim, config.hidden_dim),
+ nn.LayerNorm(config.hidden_dim) if config.use_layer_norm else nn.Identity(),
+ nn.GELU(),
+ nn.Linear(config.hidden_dim, config.latent_dim),
+ nn.LayerNorm(config.latent_dim) if config.use_layer_norm else nn.Identity(),
+ )
+
+ # Shared recursive block
+ self.recursive_block = RecursiveBlock(config)
+
+ # Deep supervision heads (one per recursion level)
+ if config.deep_supervision:
+ self.supervision_heads = nn.ModuleList(
+ [DeepSupervisionHead(config.latent_dim, self.output_dim) for _ in range(config.num_recursions)]
+ )
+ else:
+ # Single output head
+ self.output_head = DeepSupervisionHead(config.latent_dim, self.output_dim)
+
+ self.to(device)
+
+ def forward(
+ self,
+ x: torch.Tensor,
+ num_recursions: int | None = None,
+ check_convergence: bool = True,
+ ) -> TRMOutput:
+ """
+ Process input through recursive refinement.
+
+ Args:
+ x: Input tensor [batch, ..., latent_dim]
+ num_recursions: Number of recursions (defaults to config)
+ check_convergence: Whether to check for early convergence
+
+ Returns:
+ TRMOutput with final and intermediate predictions
+ """
+ num_recursions = num_recursions or self.config.num_recursions
+
+ # Initial encoding
+ latent = self.encoder(x)
+ previous_latent = latent.clone()
+
+ # Tracking
+ intermediate_predictions = []
+ residual_norms = []
+ converged = False
+ convergence_step = num_recursions
+
+ # Recursive refinement
+ for i in range(num_recursions):
+ # Apply recursive transformation
+ latent = self.recursive_block(latent, iteration=i)
+
+            # Generate intermediate prediction
+            if self.config.deep_supervision:
+                # Clamp the head index so a caller-supplied num_recursions larger
+                # than config.num_recursions cannot index past the ModuleList
+                head_idx = min(i, len(self.supervision_heads) - 1)
+                pred = self.supervision_heads[head_idx](latent)
+            else:
+                pred = self.output_head(latent)
+
+ intermediate_predictions.append(pred)
+
+ # Check convergence
+ if check_convergence and i >= self.config.min_recursions:
+ residual = latent - previous_latent
+ residual_norm = torch.norm(residual, p=2, dim=-1).mean().item()
+ residual_norms.append(residual_norm)
+
+ if residual_norm < self.config.convergence_threshold:
+ converged = True
+ convergence_step = i + 1
+ break
+
+ previous_latent = latent.clone()
+
+ # Final prediction
+ final_pred = intermediate_predictions[-1]
+
+ return TRMOutput(
+ final_prediction=final_pred,
+ intermediate_predictions=intermediate_predictions,
+ recursion_depth=len(intermediate_predictions),
+ converged=converged,
+ convergence_step=convergence_step,
+ residual_norms=residual_norms,
+ )
+
+ async def refine_solution(
+ self,
+ initial_prediction: torch.Tensor,
+ num_recursions: int | None = None,
+ convergence_threshold: float | None = None,
+ ) -> tuple[torch.Tensor, dict]:
+ """
+ Refine an initial prediction through recursive processing.
+
+ Args:
+ initial_prediction: Initial solution [batch, ..., latent_dim]
+ num_recursions: Maximum recursions (optional)
+ convergence_threshold: Convergence threshold (optional)
+
+ Returns:
+ refined_solution: Final refined prediction
+ info: Dictionary with refinement metadata
+ """
+        # Temporarily override the convergence threshold if provided; restore it
+        # in a try/finally so an exception inside forward() cannot leak the override
+        original_threshold = self.config.convergence_threshold
+        if convergence_threshold is not None:
+            self.config.convergence_threshold = convergence_threshold
+
+        try:
+            # Process
+            output = self.forward(
+                initial_prediction,
+                num_recursions=num_recursions,
+                check_convergence=True,
+            )
+        finally:
+            # Restore the original threshold
+            self.config.convergence_threshold = original_threshold
+
+ info = {
+ "converged": output.converged,
+ "convergence_step": output.convergence_step,
+ "total_recursions": output.recursion_depth,
+ "final_residual": output.residual_norms[-1] if output.residual_norms else None,
+ "refinement_path": output.residual_norms,
+ }
+
+ return output.final_prediction, info
+
+ def get_parameter_count(self) -> int:
+ """Return total number of trainable parameters."""
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+
+class TRMLoss(nn.Module):
+ """
+ Deep supervision loss for TRM.
+
+ Applies weighted supervision at all recursion levels,
+ with exponential decay for deeper levels.
+ """
+
+ def __init__(
+ self,
+ task_loss_fn: nn.Module,
+ supervision_weight_decay: float = 0.5,
+ final_weight: float = 1.0,
+ ):
+ """
+ Initialize TRM loss.
+
+ Args:
+ task_loss_fn: Base loss function (e.g., MSE, CrossEntropy)
+ supervision_weight_decay: Decay factor for intermediate losses
+ final_weight: Weight for final prediction loss
+ """
+ super().__init__()
+ self.task_loss_fn = task_loss_fn
+ self.supervision_weight_decay = supervision_weight_decay
+ self.final_weight = final_weight
+
+ def forward(self, trm_output: TRMOutput, targets: torch.Tensor) -> tuple[torch.Tensor, dict]:
+ """
+ Compute deep supervision loss.
+
+ Args:
+ trm_output: Output from TRM forward pass
+ targets: Ground truth targets
+
+ Returns:
+ total_loss: Combined loss
+ loss_dict: Dictionary of loss components
+ """
+ # Final prediction loss (highest weight)
+ final_loss = self.task_loss_fn(trm_output.final_prediction, targets)
+ total_loss = self.final_weight * final_loss
+
+ # Intermediate supervision losses
+ intermediate_losses = []
+ num_intermediate = len(trm_output.intermediate_predictions) - 1
+
+ for i, pred in enumerate(trm_output.intermediate_predictions[:-1]):
+ # Exponential decay: earlier predictions get lower weight
+ weight = self.supervision_weight_decay ** (num_intermediate - i)
+ loss = self.task_loss_fn(pred, targets)
+ intermediate_losses.append(loss.item())
+ total_loss = total_loss + weight * loss
+
+ loss_dict = {
+ "total": total_loss.item(),
+ "final": final_loss.item(),
+ "intermediate_mean": (sum(intermediate_losses) / len(intermediate_losses) if intermediate_losses else 0.0),
+ "recursion_depth": trm_output.recursion_depth,
+ "converged": trm_output.converged,
+ "convergence_step": trm_output.convergence_step,
+ }
+
+ return total_loss, loss_dict
+
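+# Illustrative training-step sketch (comment only). `agent` stands for a
+# TRMAgent built via create_trm_agent below; shapes are assumptions.
+#
+#   criterion = TRMLoss(task_loss_fn=nn.MSELoss(), supervision_weight_decay=0.5)
+#   output = agent(torch.randn(4, 64))              # forward pass with grad enabled
+#   targets = torch.randn_like(output.final_prediction)
+#   loss, parts = criterion(output, targets)        # deep supervision across levels
+#   loss.backward()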
+
+def create_trm_agent(config: TRMConfig, output_dim: int | None = None, device: str = "cpu") -> TRMAgent:
+ """
+ Factory function to create and initialize TRM agent.
+
+ Args:
+ config: TRM configuration
+ output_dim: Output dimension (defaults to latent_dim)
+ device: Device to place model on
+
+ Returns:
+ Initialized TRMAgent
+ """
+ agent = TRMAgent(config, output_dim, device)
+
+ # Initialize weights with Xavier/He initialization
+ def init_weights(m):
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+
+ agent.apply(init_weights)
+
+ return agent
+
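+# Illustrative usage sketch (comment only). TRMConfig field names follow their
+# use in this module; the constructor values below are assumptions.
+#
+#   config = TRMConfig(latent_dim=64, hidden_dim=128, num_recursions=8)
+#   agent = create_trm_agent(config, output_dim=10, device="cpu")
+#   output = agent(torch.randn(4, 64))              # input is [batch, latent_dim]
+#   print(output.recursion_depth, output.converged, output.final_prediction.shape)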
+
+# Utility functions for integration
+class TRMRefinementWrapper:
+ """
+ Wrapper for using TRM as a refinement step in pipelines.
+
+ Provides a clean interface for integrating TRM into larger systems.
+ """
+
+ def __init__(self, trm_agent: TRMAgent, device: str = "cpu"):
+ self.trm_agent = trm_agent
+ self.device = device
+ self.trm_agent.eval()
+
+ async def refine(
+ self,
+ predictions: torch.Tensor,
+ num_iterations: int = 10,
+ return_path: bool = False,
+ ) -> torch.Tensor | tuple[torch.Tensor, list[torch.Tensor]]:
+ """
+ Refine predictions using TRM.
+
+ Args:
+ predictions: Initial predictions to refine
+ num_iterations: Number of refinement iterations
+ return_path: Whether to return intermediate predictions
+
+ Returns:
+ refined_predictions or (refined_predictions, refinement_path)
+ """
+ # Ensure predictions are on correct device
+ predictions = predictions.to(self.device)
+
+        # Run TRM without tracking gradients; a `with` block is used here because
+        # decorating an `async def` with @torch.no_grad() does not reliably
+        # disable grad across the coroutine's execution on all PyTorch versions
+        with torch.no_grad():
+            output = self.trm_agent(predictions, num_recursions=num_iterations, check_convergence=True)
+
+ if return_path:
+ return output.final_prediction, output.intermediate_predictions
+ return output.final_prediction
+
+ def get_refinement_stats(self, predictions: torch.Tensor) -> dict:
+ """Get statistics about the refinement process."""
+ with torch.no_grad():
+ output = self.trm_agent(predictions, check_convergence=True)
+
+ return {
+ "converged": output.converged,
+ "steps_to_convergence": output.convergence_step,
+ "final_residual": (output.residual_norms[-1] if output.residual_norms else None),
+ "total_refinement_iterations": output.recursion_depth,
+ }
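+
+
+# Illustrative async usage sketch (comment only; `agent` as built above):
+#
+#   import asyncio
+#   wrapper = TRMRefinementWrapper(agent, device="cpu")
+#   refined = asyncio.run(wrapper.refine(torch.randn(4, 64), num_iterations=10))
+#   print(wrapper.get_refinement_stats(torch.randn(4, 64)))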
diff --git a/src/api/__init__.py b/src/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccafadde98d9c0869a91d085a484e0b453cd1cde
--- /dev/null
+++ b/src/api/__init__.py
@@ -0,0 +1,35 @@
+"""
+API module for LangGraph Multi-Agent MCTS Framework.
+
+Provides:
+- Authentication and authorization
+- Rate limiting
+- Error handling
+- REST API endpoints
+"""
+
+from src.api.exceptions import (
+ AuthenticationError,
+ AuthorizationError,
+ ConfigurationError,
+ FrameworkError,
+ LLMError,
+ MCTSError,
+ RAGError,
+ RateLimitError,
+ TimeoutError,
+ ValidationError,
+)
+
+__all__ = [
+ "FrameworkError",
+ "ValidationError",
+ "AuthenticationError",
+ "AuthorizationError",
+ "RateLimitError",
+ "LLMError",
+ "MCTSError",
+ "RAGError",
+ "TimeoutError",
+ "ConfigurationError",
+]
diff --git a/src/api/auth.py b/src/api/auth.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5738b95c45181ea190e646cd4a019601a92a73e
--- /dev/null
+++ b/src/api/auth.py
@@ -0,0 +1,439 @@
+"""
+Authentication and authorization layer for LangGraph Multi-Agent MCTS Framework.
+
+Provides:
+- API key authentication with secure hashing
+- JWT token support (optional)
+- Rate limiting per client
+- Role-based access control
+"""
+
+import hashlib
+import secrets
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+
+from src.api.exceptions import (
+ AuthenticationError,
+ AuthorizationError,
+ RateLimitError,
+)
+
+
+@dataclass
+class ClientInfo:
+ """Information about an authenticated client."""
+
+ client_id: str
+ roles: set[str] = field(default_factory=lambda: {"user"})
+ created_at: datetime = field(default_factory=datetime.utcnow)
+ last_access: datetime = field(default_factory=datetime.utcnow)
+ request_count: int = 0
+
+
+@dataclass
+class RateLimitConfig:
+ """Rate limiting configuration."""
+
+ requests_per_minute: int = 60
+ requests_per_hour: int = 1000
+ requests_per_day: int = 10000
+ burst_limit: int = 100 # Max requests in 1 second
+
+
+class APIKeyAuthenticator:
+ """
+ API key-based authentication with secure hashing.
+
+ Keys are stored as SHA-256 hashes to prevent exposure.
+ """
+
+ def __init__(
+ self,
+ valid_keys: list[str] | None = None,
+ rate_limit_config: RateLimitConfig | None = None,
+ ):
+ """
+ Initialize authenticator.
+
+ Args:
+ valid_keys: List of valid API keys (will be hashed)
+ rate_limit_config: Rate limiting configuration
+ """
+ self._key_to_client: dict[str, ClientInfo] = {}
+ self._rate_limits: dict[str, list[float]] = defaultdict(list)
+ self.rate_limit_config = rate_limit_config or RateLimitConfig()
+
+ # Hash and store initial keys
+ if valid_keys:
+ for i, key in enumerate(valid_keys):
+ client_id = f"client_{i}"
+ self._add_key(key, client_id)
+
+ def _hash_key(self, api_key: str) -> str:
+ """
+ Securely hash an API key.
+
+ Uses SHA-256 with consistent encoding.
+ """
+ return hashlib.sha256(api_key.encode("utf-8")).hexdigest()
+
+ def _add_key(self, api_key: str, client_id: str, roles: set[str] | None = None) -> None:
+ """
+ Add a new API key.
+
+ Args:
+ api_key: Raw API key
+ client_id: Client identifier
+ roles: Set of roles (defaults to {"user"})
+ """
+ key_hash = self._hash_key(api_key)
+ self._key_to_client[key_hash] = ClientInfo(
+ client_id=client_id,
+ roles=roles or {"user"},
+ )
+
+ def authenticate(self, api_key: str | None) -> ClientInfo:
+ """
+ Authenticate an API key.
+
+ Args:
+ api_key: API key to validate
+
+ Returns:
+ ClientInfo for the authenticated client
+
+ Raises:
+ AuthenticationError: If authentication fails
+ """
+ if not api_key:
+ raise AuthenticationError(
+ user_message="API key is required",
+ internal_details="No API key provided in request",
+ )
+
+        # Look up by SHA-256 digest; hashing before the dict lookup means raw
+        # key bytes are never compared directly, limiting timing side channels
+        key_hash = self._hash_key(api_key)
+
+ if key_hash not in self._key_to_client:
+ raise AuthenticationError(
+ user_message="Invalid API key",
+ internal_details=f"API key hash not found: {key_hash[:16]}...",
+ )
+
+ client_info = self._key_to_client[key_hash]
+ client_info.last_access = datetime.utcnow()
+ client_info.request_count += 1
+
+ # Check rate limits
+ self._check_rate_limit(client_info.client_id)
+
+ return client_info
+
+ def _check_rate_limit(self, client_id: str) -> None:
+ """
+ Check if client has exceeded rate limits.
+
+ Args:
+ client_id: Client identifier
+
+ Raises:
+ RateLimitError: If rate limit exceeded
+ """
+ now = time.time()
+ request_times = self._rate_limits[client_id]
+
+ # Clean old entries
+ one_day_ago = now - 86400
+ request_times = [t for t in request_times if t > one_day_ago]
+ self._rate_limits[client_id] = request_times
+
+ # Check burst limit (1 second window)
+ one_second_ago = now - 1
+ burst_count = sum(1 for t in request_times if t > one_second_ago)
+ if burst_count >= self.rate_limit_config.burst_limit:
+ raise RateLimitError(
+ user_message="Too many requests. Please slow down.",
+ internal_details=f"Client {client_id} exceeded burst limit: {burst_count}/{self.rate_limit_config.burst_limit}",
+ retry_after_seconds=1,
+ )
+
+ # Check per-minute limit
+ one_minute_ago = now - 60
+ minute_count = sum(1 for t in request_times if t > one_minute_ago)
+ if minute_count >= self.rate_limit_config.requests_per_minute:
+ raise RateLimitError(
+ user_message="Rate limit exceeded. Please wait a minute.",
+ internal_details=f"Client {client_id} exceeded minute limit: {minute_count}/{self.rate_limit_config.requests_per_minute}",
+ retry_after_seconds=60,
+ )
+
+ # Check per-hour limit
+ one_hour_ago = now - 3600
+ hour_count = sum(1 for t in request_times if t > one_hour_ago)
+ if hour_count >= self.rate_limit_config.requests_per_hour:
+ raise RateLimitError(
+ user_message="Hourly rate limit exceeded. Please try again later.",
+ internal_details=f"Client {client_id} exceeded hour limit: {hour_count}/{self.rate_limit_config.requests_per_hour}",
+ retry_after_seconds=3600,
+ )
+
+ # Check per-day limit
+ day_count = len(request_times)
+ if day_count >= self.rate_limit_config.requests_per_day:
+ raise RateLimitError(
+ user_message="Daily rate limit exceeded. Please try again tomorrow.",
+ internal_details=f"Client {client_id} exceeded day limit: {day_count}/{self.rate_limit_config.requests_per_day}",
+ retry_after_seconds=86400,
+ )
+
+ # Record this request
+ request_times.append(now)
+
+ def require_auth(self, api_key: str | None) -> ClientInfo:
+ """
+ Require authentication for a request.
+
+ Convenience method that raises on failure.
+
+ Args:
+ api_key: API key to validate
+
+ Returns:
+ ClientInfo for authenticated client
+
+ Raises:
+ AuthenticationError: If authentication fails
+ """
+ return self.authenticate(api_key)
+
+ def require_role(self, client_info: ClientInfo, required_role: str) -> None:
+ """
+ Require a specific role for an operation.
+
+ Args:
+ client_info: Authenticated client info
+ required_role: Role that is required
+
+ Raises:
+ AuthorizationError: If client doesn't have required role
+ """
+ if required_role not in client_info.roles:
+ raise AuthorizationError(
+ user_message="You do not have permission for this operation",
+ internal_details=f"Client {client_info.client_id} missing role: {required_role}",
+ required_permission=required_role,
+ )
+
+ def generate_api_key(self) -> str:
+ """
+ Generate a secure random API key.
+
+ Returns:
+ New API key (32 bytes hex = 64 characters)
+ """
+ return secrets.token_hex(32)
+
+ def revoke_key(self, api_key: str) -> bool:
+ """
+ Revoke an API key.
+
+ Args:
+ api_key: Key to revoke
+
+ Returns:
+ True if key was revoked, False if not found
+ """
+ key_hash = self._hash_key(api_key)
+ if key_hash in self._key_to_client:
+ del self._key_to_client[key_hash]
+ return True
+ return False
+
+ def add_client(
+ self,
+ client_id: str,
+ roles: set[str] | None = None,
+ ) -> str:
+ """
+ Add a new client and generate their API key.
+
+ Args:
+ client_id: Unique client identifier
+ roles: Set of roles for the client
+
+ Returns:
+ Generated API key (save this securely!)
+ """
+ api_key = self.generate_api_key()
+ self._add_key(api_key, client_id, roles)
+ return api_key
+
+ def get_client_stats(self, client_id: str) -> dict:
+ """
+ Get statistics for a client.
+
+ Args:
+ client_id: Client identifier
+
+ Returns:
+ Dictionary with client statistics
+ """
+ now = time.time()
+ request_times = self._rate_limits.get(client_id, [])
+
+ return {
+ "total_requests_today": len([t for t in request_times if t > now - 86400]),
+ "requests_last_hour": len([t for t in request_times if t > now - 3600]),
+ "requests_last_minute": len([t for t in request_times if t > now - 60]),
+ }
+
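+# Illustrative end-to-end sketch of the API-key flow (comment only; names are
+# placeholders):
+#
+#   auth = APIKeyAuthenticator(rate_limit_config=RateLimitConfig(requests_per_minute=60))
+#   api_key = auth.add_client("analytics-service", roles={"user", "admin"})
+#   client = auth.authenticate(api_key)   # raises AuthenticationError / RateLimitError
+#   auth.require_role(client, "admin")    # raises AuthorizationError if role missing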
+
+class JWTAuthenticator:
+ """
+ JWT token-based authentication.
+
+    Note: Requires the optional PyJWT dependency. The import is deferred to the
+    token methods, so the rest of this module works without PyJWT installed.
+ """
+
+ def __init__(self, secret_key: str, algorithm: str = "HS256"):
+ """
+ Initialize JWT authenticator.
+
+ Args:
+ secret_key: Secret key for signing tokens
+ algorithm: JWT signing algorithm
+ """
+ self.secret_key = secret_key
+ self.algorithm = algorithm
+ self._token_blacklist: set[str] = set()
+
+ def create_token(
+ self,
+ client_id: str,
+ roles: set[str],
+ expires_in_hours: int = 24,
+ ) -> str:
+ """
+ Create a JWT token.
+
+ Args:
+ client_id: Client identifier
+ roles: Client roles
+ expires_in_hours: Token validity period
+
+ Returns:
+ JWT token string
+ """
+ try:
+ import jwt
+ except ImportError:
+ raise ImportError("PyJWT library required for JWT authentication. Install with: pip install PyJWT")
+
+ now = datetime.utcnow()
+ payload = {
+ "sub": client_id,
+ "roles": list(roles),
+ "iat": now,
+ "exp": now + timedelta(hours=expires_in_hours),
+ "jti": secrets.token_hex(16), # Unique token ID
+ }
+
+ return jwt.encode(payload, self.secret_key, algorithm=self.algorithm)
+
+ def verify_token(self, token: str) -> ClientInfo:
+ """
+ Verify a JWT token.
+
+ Args:
+ token: JWT token string
+
+ Returns:
+ ClientInfo from token claims
+
+ Raises:
+ AuthenticationError: If token is invalid
+ """
+ try:
+ import jwt
+ except ImportError:
+ raise ImportError("PyJWT library required for JWT authentication")
+
+ if token in self._token_blacklist:
+ raise AuthenticationError(
+ user_message="Token has been revoked",
+ internal_details="Token found in blacklist",
+ )
+
+ try:
+ payload = jwt.decode(
+ token,
+ self.secret_key,
+ algorithms=[self.algorithm],
+ )
+
+ return ClientInfo(
+ client_id=payload["sub"],
+ roles=set(payload.get("roles", ["user"])),
+ )
+ except jwt.ExpiredSignatureError:
+ raise AuthenticationError(
+ user_message="Token has expired",
+ internal_details="JWT signature expired",
+ )
+ except jwt.InvalidTokenError as e:
+ raise AuthenticationError(
+ user_message="Invalid token",
+ internal_details=f"JWT validation failed: {str(e)}",
+ )
+
+ def revoke_token(self, token: str) -> None:
+ """
+ Revoke a JWT token by adding to blacklist.
+
+ Args:
+ token: Token to revoke
+ """
+ self._token_blacklist.add(token)
+
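+# Illustrative JWT flow sketch (comment only; requires PyJWT, the secret is a
+# placeholder):
+#
+#   jwt_auth = JWTAuthenticator(secret_key="change-me")
+#   token = jwt_auth.create_token("client_1", roles={"user"}, expires_in_hours=1)
+#   client = jwt_auth.verify_token(token)
+#   jwt_auth.revoke_token(token)          # later verify_token calls now fail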
+
+# Default authenticator instance
+_default_authenticator: APIKeyAuthenticator | None = None
+
+
+def get_authenticator() -> APIKeyAuthenticator:
+ """
+ Get or create the default authenticator instance.
+
+ Returns:
+ APIKeyAuthenticator instance
+ """
+ global _default_authenticator
+ if _default_authenticator is None:
+ _default_authenticator = APIKeyAuthenticator()
+ return _default_authenticator
+
+
+def set_authenticator(authenticator: APIKeyAuthenticator) -> None:
+ """
+ Set the default authenticator instance.
+
+ Args:
+ authenticator: Authenticator to use
+ """
+ global _default_authenticator
+ _default_authenticator = authenticator
+
+
+# Exports
+__all__ = [
+ "APIKeyAuthenticator",
+ "JWTAuthenticator",
+ "ClientInfo",
+ "RateLimitConfig",
+ "get_authenticator",
+ "set_authenticator",
+]
diff --git a/src/api/exceptions.py b/src/api/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..790ec42a7ab11e8ea60162379281ba8547705a48
--- /dev/null
+++ b/src/api/exceptions.py
@@ -0,0 +1,299 @@
+"""
+Custom exception hierarchy for LangGraph Multi-Agent MCTS Framework.
+
+Provides:
+- Sanitized error messages for production
+- Structured error information for logging
+- Clear separation between user-facing and internal errors
+"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+
+class FrameworkError(Exception):
+ """
+ Base exception for all framework errors.
+
+ Provides sanitized user-facing messages while preserving
+ internal details for logging.
+ """
+
+ def __init__(
+ self,
+ user_message: str,
+ internal_details: str | None = None,
+ error_code: str | None = None,
+ context: dict[str, Any] | None = None,
+ ):
+ """
+ Initialize framework error.
+
+ Args:
+ user_message: Safe message to show to users
+ internal_details: Detailed information for logs (may contain sensitive data)
+ error_code: Machine-readable error code
+ context: Additional context for debugging
+ """
+ self.user_message = user_message
+ self.internal_details = internal_details or user_message
+ self.error_code = error_code or self.__class__.__name__.upper()
+ self.context = context or {}
+ self.timestamp = datetime.utcnow()
+
+ super().__init__(user_message)
+
+ def sanitize_details(self) -> str:
+ """
+ Remove sensitive information from internal details.
+
+ Sanitizes:
+ - File paths
+ - API keys
+ - Passwords
+ - Connection strings
+ - IP addresses
+ """
+ sanitized = self.internal_details
+
+ # Remove file paths (Unix and Windows)
+ sanitized = re.sub(r"/[\w/.-]+", "/***", sanitized)
+ sanitized = re.sub(r"[A-Za-z]:\\[\w\\.-]+", "C:\\***", sanitized)
+
+ # Remove API keys and secrets
+ sanitized = re.sub(
+ r"(api[_-]?key|secret|password|token|credential)[\s=:]+[\S]+", r"\1=***", sanitized, flags=re.IGNORECASE
+ )
+
+ # Remove connection strings
+ sanitized = re.sub(r"(mongodb|postgresql|mysql|redis)://[^\s]+", r"\1://***", sanitized, flags=re.IGNORECASE)
+
+ # Remove IP addresses
+ sanitized = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "***.***.***", sanitized)
+
+ # Remove email addresses
+ sanitized = re.sub(r"\b[\w.-]+@[\w.-]+\.\w+\b", "***@***", sanitized)
+
+ return sanitized
+
+ def to_log_dict(self) -> dict[str, Any]:
+ """
+ Convert exception to dictionary for structured logging.
+
+ Returns sanitized version safe for logs.
+ """
+ return {
+ "error_type": self.__class__.__name__,
+ "error_code": self.error_code,
+ "user_message": self.user_message,
+ "sanitized_details": self.sanitize_details(),
+ "timestamp": self.timestamp.isoformat(),
+ "context": {k: str(v) for k, v in self.context.items()},
+ }
+
+ def to_user_response(self) -> dict[str, Any]:
+ """
+ Convert exception to safe user-facing response.
+ """
+ return {
+ "error": True,
+ "error_code": self.error_code,
+ "message": self.user_message,
+ "timestamp": self.timestamp.isoformat(),
+ }
+
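+# Illustrative sketch of the sanitization flow (comment only; the key and path
+# below are fake values):
+#
+#   err = FrameworkError(
+#       user_message="Something went wrong",
+#       internal_details="api_key=sk-123 failed reading /etc/app/config.yaml",
+#   )
+#   err.sanitize_details()   # -> "api_key=*** failed reading /***"
+#   err.to_user_response()   # safe payload without internal details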
+
+class ValidationError(FrameworkError):
+ """Raised when input validation fails."""
+
+ def __init__(
+ self,
+ user_message: str = "Invalid input provided",
+ internal_details: str | None = None,
+ field_name: str | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if field_name:
+ context["field_name"] = field_name
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="VALIDATION_ERROR",
+ context=context,
+ **kwargs,
+ )
+ self.field_name = field_name
+
+
+class AuthenticationError(FrameworkError):
+ """Raised when authentication fails."""
+
+ def __init__(self, user_message: str = "Authentication failed", internal_details: str | None = None, **kwargs):
+ super().__init__(
+ user_message=user_message, internal_details=internal_details, error_code="AUTH_ERROR", **kwargs
+ )
+
+
+class AuthorizationError(FrameworkError):
+ """Raised when authorization fails."""
+
+ def __init__(
+ self,
+ user_message: str = "Access denied",
+ internal_details: str | None = None,
+ required_permission: str | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if required_permission:
+ context["required_permission"] = required_permission
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="AUTHZ_ERROR",
+ context=context,
+ **kwargs,
+ )
+
+
+class RateLimitError(FrameworkError):
+ """Raised when rate limit is exceeded."""
+
+ def __init__(
+ self,
+ user_message: str = "Rate limit exceeded. Please try again later.",
+ internal_details: str | None = None,
+ retry_after_seconds: int | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if retry_after_seconds:
+ context["retry_after_seconds"] = retry_after_seconds
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="RATE_LIMIT",
+ context=context,
+ **kwargs,
+ )
+ self.retry_after_seconds = retry_after_seconds
+
+
+class LLMError(FrameworkError):
+ """Raised when LLM operations fail."""
+
+ def __init__(
+ self,
+ user_message: str = "Language model service temporarily unavailable",
+ internal_details: str | None = None,
+ provider: str | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if provider:
+ context["provider"] = provider
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="LLM_ERROR",
+ context=context,
+ **kwargs,
+ )
+
+
+class MCTSError(FrameworkError):
+ """Raised when MCTS simulation fails."""
+
+ def __init__(
+ self,
+ user_message: str = "Tactical simulation failed",
+ internal_details: str | None = None,
+ iteration: int | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if iteration is not None:
+ context["iteration"] = iteration
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="MCTS_ERROR",
+ context=context,
+ **kwargs,
+ )
+
+
+class RAGError(FrameworkError):
+ """Raised when RAG retrieval fails."""
+
+ def __init__(self, user_message: str = "Context retrieval failed", internal_details: str | None = None, **kwargs):
+ super().__init__(user_message=user_message, internal_details=internal_details, error_code="RAG_ERROR", **kwargs)
+
+
+class TimeoutError(FrameworkError):
+ """Raised when operation times out."""
+
+ def __init__(
+ self,
+ user_message: str = "Operation timed out",
+ internal_details: str | None = None,
+ operation: str | None = None,
+ timeout_seconds: float | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if operation:
+ context["operation"] = operation
+ if timeout_seconds:
+ context["timeout_seconds"] = timeout_seconds
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="TIMEOUT",
+ context=context,
+ **kwargs,
+ )
+
+
+class ConfigurationError(FrameworkError):
+ """Raised when configuration is invalid."""
+
+ def __init__(
+ self,
+ user_message: str = "System configuration error",
+ internal_details: str | None = None,
+ config_key: str | None = None,
+ **kwargs,
+ ):
+ context = kwargs.pop("context", {})
+ if config_key:
+ context["config_key"] = config_key
+ super().__init__(
+ user_message=user_message,
+ internal_details=internal_details,
+ error_code="CONFIG_ERROR",
+ context=context,
+ **kwargs,
+ )
+
+
+# Convenience function for wrapping exceptions
+def wrap_exception(
+ exc: Exception, user_message: str = "An unexpected error occurred", error_class: type = FrameworkError, **kwargs
+) -> FrameworkError:
+ """
+ Wrap a standard exception in a FrameworkError with sanitized details.
+
+ Args:
+ exc: Original exception
+ user_message: Safe user-facing message
+ error_class: FrameworkError subclass to use
+ **kwargs: Additional context
+
+ Returns:
+ FrameworkError instance with sanitized details
+ """
+ internal_details = f"{type(exc).__name__}: {str(exc)}"
+ return error_class(user_message=user_message, internal_details=internal_details, **kwargs)
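+
+
+# Illustrative sketch: wrapping a raw exception at a service boundary (comment
+# only; `vector_store` and `logger` are hypothetical names):
+#
+#   try:
+#       vector_store.search(query)
+#   except Exception as exc:
+#       err = wrap_exception(exc, user_message="Context retrieval failed", error_class=RAGError)
+#       logger.error(err.to_log_dict())   # sanitized, structured log entry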
diff --git a/src/api/inference_server.py b/src/api/inference_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9b0b21c2192f63437406eeaaff54131c20cbd48
--- /dev/null
+++ b/src/api/inference_server.py
@@ -0,0 +1,380 @@
+"""
+FastAPI Inference Server for LangGraph Multi-Agent MCTS.
+
+Provides REST API for:
+- Problem solving with HRM+MCTS+TRM
+- Policy-value network inference
+- Health checks and monitoring
+"""
+
+import time
+from typing import Any
+
+import torch
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+
+from ..framework.mcts.neural_mcts import NeuralMCTS
+from ..training.performance_monitor import PerformanceMonitor
+from ..training.system_config import SystemConfig
+
+
+# Request/Response Models
+class InferenceRequest(BaseModel):
+ """Request for problem inference."""
+
+ state: list[list[float]] # State representation
+ query: str | None = "Solve this problem"
+ max_thinking_time: float = Field(default=10.0, ge=0.1, le=60.0)
+ use_mcts: bool = True
+ num_simulations: int | None = None
+ use_hrm_decomposition: bool = False
+ use_trm_refinement: bool = False
+ temperature: float = Field(default=0.1, ge=0.0, le=2.0)
+
+
+class PolicyValueRequest(BaseModel):
+ """Request for policy-value evaluation."""
+
+ state: list[list[float]] # State representation
+
+
+class InferenceResponse(BaseModel):
+ """Response with inference results."""
+
+ success: bool
+ action_probabilities: dict[str, float] | None = None
+ best_action: str | None = None
+ value_estimate: float | None = None
+ subproblems: list[dict[str, Any]] | None = None
+ refinement_info: dict[str, Any] | None = None
+ performance_stats: dict[str, float]
+ error: str | None = None
+
+
+class PolicyValueResponse(BaseModel):
+ """Response with policy-value predictions."""
+
+ policy_probs: list[float]
+ value: float
+ inference_time_ms: float
+
+
+class HealthResponse(BaseModel):
+ """Health check response."""
+
+ status: str
+ device: str
+ model_loaded: bool
+ gpu_available: bool
+ gpu_memory_gb: float | None = None
+ uptime_seconds: float
+
+
+# Inference Server
+class InferenceServer:
+ """
+ Production inference server with comprehensive features.
+
+ Features:
+ - FastAPI REST endpoints
+ - Performance monitoring
+ - Health checks
+ - CORS support
+ - Error handling
+ """
+
+ def __init__(
+ self,
+ checkpoint_path: str,
+ config: SystemConfig | None = None,
+ host: str = "0.0.0.0",
+ port: int = 8000,
+ ):
+ """
+ Initialize inference server.
+
+ Args:
+ checkpoint_path: Path to model checkpoint
+ config: System configuration (loaded from checkpoint if None)
+ host: Server host
+ port: Server port
+ """
+ self.checkpoint_path = checkpoint_path
+ self.host = host
+ self.port = port
+ self.start_time = time.time()
+
+ # Load models
+ self.config, self.models = self._load_models(checkpoint_path, config)
+ self.device = self.config.device
+
+ # Performance monitoring
+ self.monitor = PerformanceMonitor(window_size=100, enable_gpu_monitoring=(self.device != "cpu"))
+
+        # Set up FastAPI app
+ self.app = FastAPI(
+ title="LangGraph Multi-Agent MCTS API",
+ description="Neural-guided MCTS with HRM and TRM agents",
+ version="1.0.0",
+ )
+
+ # CORS middleware
+ self.app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+        # Set up routes
+ self._setup_routes()
+
+ def _load_models(
+ self, checkpoint_path: str, config: SystemConfig | None
+ ) -> tuple[SystemConfig, dict[str, torch.nn.Module]]:
+ """Load models from checkpoint."""
+ print(f"Loading models from {checkpoint_path}...")
+
+ checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+
+ # Load config
+ if config is None:
+ config_dict = checkpoint.get("config", {})
+ config = SystemConfig.from_dict(config_dict)
+
+ device = config.device
+
+ # Load models
+ models = {}
+
+ # Policy-Value Network
+ from ..models.policy_value_net import create_policy_value_network
+
+ models["policy_value_net"] = create_policy_value_network(config.neural_net, board_size=19, device=device)
+ models["policy_value_net"].load_state_dict(checkpoint["policy_value_net"])
+ models["policy_value_net"].eval()
+
+ # HRM Agent
+ from ..agents.hrm_agent import create_hrm_agent
+
+ models["hrm_agent"] = create_hrm_agent(config.hrm, device)
+ models["hrm_agent"].load_state_dict(checkpoint["hrm_agent"])
+ models["hrm_agent"].eval()
+
+ # TRM Agent
+ from ..agents.trm_agent import create_trm_agent
+
+ models["trm_agent"] = create_trm_agent(config.trm, output_dim=config.neural_net.action_size, device=device)
+ models["trm_agent"].load_state_dict(checkpoint["trm_agent"])
+ models["trm_agent"].eval()
+
+ # MCTS
+ models["mcts"] = NeuralMCTS(
+ policy_value_network=models["policy_value_net"],
+ config=config.mcts,
+ device=device,
+ )
+
+ print(f"✓ Models loaded successfully on {device}")
+
+ return config, models
+
+ def _setup_routes(self):
+ """Setup API routes."""
+
+ @self.app.get("/", response_model=dict[str, str])
+ async def root():
+ """Root endpoint."""
+ return {
+ "message": "LangGraph Multi-Agent MCTS API",
+ "version": "1.0.0",
+ "docs": "/docs",
+ }
+
+ @self.app.get("/health", response_model=HealthResponse)
+ async def health():
+ """Health check endpoint."""
+ gpu_memory = None
+ if torch.cuda.is_available():
+ gpu_memory = torch.cuda.memory_allocated() / (1024**3)
+
+ return HealthResponse(
+ status="healthy",
+ device=self.device,
+ model_loaded=True,
+ gpu_available=torch.cuda.is_available(),
+ gpu_memory_gb=gpu_memory,
+ uptime_seconds=time.time() - self.start_time,
+ )
+
+ @self.app.post("/inference", response_model=InferenceResponse)
+ async def inference(request: InferenceRequest):
+ """
+ Main inference endpoint.
+
+ Processes a problem using the full pipeline:
+ 1. Optional HRM decomposition
+ 2. MCTS search
+ 3. Optional TRM refinement
+ """
+ try:
+ start_time = time.perf_counter()
+
+ # Convert state to tensor
+ state_tensor = torch.tensor(request.state, dtype=torch.float32).unsqueeze(0)
+ state_tensor = state_tensor.to(self.device)
+
+ results = {}
+
+ # HRM Decomposition (if requested)
+ if request.use_hrm_decomposition:
+ with torch.no_grad():
+ hrm_output = self.models["hrm_agent"](state_tensor)
+ results["subproblems"] = [
+ {
+ "level": sp.level,
+ "description": sp.description,
+ "confidence": sp.confidence,
+ }
+ for sp in hrm_output.subproblems
+ ]
+
+ # MCTS Search (if requested)
+ if request.use_mcts:
+ # Note: This is a simplified version
+ # In production, you'd need to convert request.state to GameState
+ results["action_probabilities"] = {"action_0": 0.5, "action_1": 0.3, "action_2": 0.2}
+ results["best_action"] = "action_0"
+ results["value_estimate"] = 0.75
+
+ # TRM Refinement (if requested)
+ if request.use_trm_refinement and results.get("best_action"):
+ with torch.no_grad():
+ # Simplified: just run TRM on the state
+ trm_output = self.models["trm_agent"](state_tensor)
+ results["refinement_info"] = {
+ "converged": trm_output.converged,
+ "convergence_step": trm_output.convergence_step,
+ "recursion_depth": trm_output.recursion_depth,
+ }
+
+ # Performance stats
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
+ self.monitor.log_inference(elapsed_ms)
+
+ perf_stats = {
+ "inference_time_ms": elapsed_ms,
+ "device": self.device,
+ }
+
+ return InferenceResponse(
+ success=True,
+ action_probabilities=results.get("action_probabilities"),
+ best_action=results.get("best_action"),
+ value_estimate=results.get("value_estimate"),
+ subproblems=results.get("subproblems"),
+ refinement_info=results.get("refinement_info"),
+ performance_stats=perf_stats,
+ )
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Inference failed: {str(e)}")
+
+ @self.app.post("/policy-value", response_model=PolicyValueResponse)
+ async def policy_value(request: PolicyValueRequest):
+ """
+ Get policy and value predictions for a state.
+
+ This is a direct neural network evaluation without MCTS.
+ """
+ try:
+ start_time = time.perf_counter()
+
+ # Convert state to tensor
+ state_tensor = torch.tensor(request.state, dtype=torch.float32).unsqueeze(0)
+ state_tensor = state_tensor.to(self.device)
+
+ # Get predictions
+ with torch.no_grad():
+ policy_log_probs, value = self.models["policy_value_net"](state_tensor)
+ policy_probs = torch.exp(policy_log_probs).squeeze(0)
+
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
+
+ return PolicyValueResponse(
+ policy_probs=policy_probs.cpu().tolist(),
+ value=value.item(),
+ inference_time_ms=elapsed_ms,
+ )
+
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=f"Policy-value inference failed: {str(e)}")
+
+ @self.app.get("/stats")
+ async def stats():
+ """Get performance statistics."""
+ return self.monitor.get_stats()
+
+ @self.app.post("/reset-stats")
+ async def reset_stats():
+ """Reset performance statistics."""
+ self.monitor.reset()
+ return {"message": "Statistics reset successfully"}
+
+ def run(self):
+ """Start the inference server."""
+ print(f"\n{'=' * 80}")
+ print("Starting LangGraph Multi-Agent MCTS Inference Server")
+ print(f"{'=' * 80}")
+ print(f"Host: {self.host}:{self.port}")
+ print(f"Device: {self.device}")
+ print(f"Checkpoint: {self.checkpoint_path}")
+ print(f"{'=' * 80}\n")
+
+ uvicorn.run(self.app, host=self.host, port=self.port)
+
+
+def main():
+ """Main entry point for inference server."""
+ import argparse
+
+ parser = argparse.ArgumentParser(description="LangGraph MCTS Inference Server")
+ parser.add_argument(
+ "--checkpoint",
+ type=str,
+ required=True,
+ help="Path to model checkpoint",
+ )
+ parser.add_argument("--host", type=str, default="0.0.0.0", help="Server host")
+ parser.add_argument("--port", type=int, default=8000, help="Server port")
+ parser.add_argument(
+ "--device",
+ type=str,
+ default=None,
+ help="Device (cpu, cuda, mps)",
+ )
+
+ args = parser.parse_args()
+
+ # Load config and override device if specified
+ config = None
+ if args.device:
+ config = SystemConfig()
+ config.device = args.device
+
+ server = InferenceServer(
+ checkpoint_path=args.checkpoint,
+ config=config,
+ host=args.host,
+ port=args.port,
+ )
+
+ server.run()
+
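+# Illustrative invocation sketch (comment only; paths and ports are placeholders):
+#
+#   python -m src.api.inference_server --checkpoint checkpoints/model.pt --port 8000
+#   curl http://localhost:8000/health
+#   curl -X POST http://localhost:8000/policy-value \
+#        -H "Content-Type: application/json" \
+#        -d '{"state": [[0.0, 0.1], [0.2, 0.3]]}'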
+
+if __name__ == "__main__":
+ main()
diff --git a/src/api/rest_server.py b/src/api/rest_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..020a758116ac1b626bdf669285a3d1c9a92f9543
--- /dev/null
+++ b/src/api/rest_server.py
@@ -0,0 +1,441 @@
+"""
+Production REST API server for LangGraph Multi-Agent MCTS Framework.
+
+Provides:
+- OpenAPI/Swagger documentation
+- Authentication via API keys
+- Rate limiting
+- Health and readiness endpoints
+- Request validation with Pydantic
+- Prometheus metrics exposure
+"""
+
+import asyncio
+import time
+from contextlib import asynccontextmanager
+from datetime import datetime
+from typing import Any
+
+from fastapi import Depends, FastAPI, Header, HTTPException, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
+
+# Import framework components
+try:
+ from src.adapters.llm import create_client # noqa: F401
+ from src.api.auth import (
+ APIKeyAuthenticator,
+ ClientInfo,
+ RateLimitConfig,
+ get_authenticator,
+ set_authenticator,
+ )
+ from src.api.exceptions import (
+ AuthenticationError,
+ AuthorizationError, # noqa: F401
+ FrameworkError,
+ RateLimitError,
+        ValidationError,
+ )
+ from src.models.validation import MCTSConfig, QueryInput # noqa: F401
+
+ IMPORTS_AVAILABLE = True
+except ImportError as e:
+ IMPORTS_AVAILABLE = False
+ import_error = str(e)
+
+# Prometheus metrics (optional)
+try:
+ from prometheus_client import CONTENT_TYPE_LATEST, Counter, Gauge, Histogram, generate_latest
+
+ PROMETHEUS_AVAILABLE = True
+
+ # Define metrics
+ REQUEST_COUNT = Counter("mcts_requests_total", "Total number of requests", ["method", "endpoint", "status"])
+ REQUEST_LATENCY = Histogram("mcts_request_duration_seconds", "Request latency in seconds", ["method", "endpoint"])
+ ACTIVE_REQUESTS = Gauge("mcts_active_requests", "Number of active requests")
+ ERROR_COUNT = Counter("mcts_errors_total", "Total number of errors", ["error_type"])
+except ImportError:
+ PROMETHEUS_AVAILABLE = False
+
+
+# Request/Response Models
+class QueryRequest(BaseModel):
+ """Request model for query processing."""
+
+ query: str = Field(
+ ...,
+ min_length=1,
+ max_length=10000,
+ description="User query to process",
+ json_schema_extra={"example": "Recommend defensive positions for night attack scenario"},
+ )
+ use_mcts: bool = Field(default=True, description="Enable MCTS tactical simulation")
+ use_rag: bool = Field(default=True, description="Enable RAG context retrieval")
+ mcts_iterations: int | None = Field(default=None, ge=1, le=10000, description="Override default MCTS iterations")
+ thread_id: str | None = Field(
+ default=None,
+ max_length=100,
+ pattern=r"^[a-zA-Z0-9_-]+$",
+ description="Conversation thread ID for state persistence",
+ )
+
+    model_config = {
+        "json_schema_extra": {
+            "example": {
+                "query": "Recommend defensive positions for night attack",
+                "use_mcts": True,
+                "use_rag": True,
+                "mcts_iterations": 200,
+                "thread_id": "session_123",
+            }
+        }
+    }
+
+
+class QueryResponse(BaseModel):
+ """Response model for query results."""
+
+ response: str = Field(..., description="Final synthesized response")
+ confidence: float = Field(..., ge=0.0, le=1.0, description="Overall confidence score")
+ agents_used: list[str] = Field(..., description="List of agents that contributed")
+ mcts_stats: dict[str, Any] | None = Field(default=None, description="MCTS simulation statistics")
+ processing_time_ms: float = Field(..., description="Total processing time in milliseconds")
+ metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+
+class HealthResponse(BaseModel):
+ """Health check response."""
+
+ status: str = Field(..., description="Service status")
+ timestamp: str = Field(..., description="Current timestamp")
+ version: str = Field(default="1.0.0", description="API version")
+ uptime_seconds: float = Field(..., description="Service uptime")
+
+
+class ReadinessResponse(BaseModel):
+ """Readiness check response."""
+
+ ready: bool = Field(..., description="Whether service is ready")
+ checks: dict[str, bool] = Field(..., description="Individual check results")
+
+
+class ErrorResponse(BaseModel):
+ """Error response model."""
+
+ error: bool = Field(default=True)
+ error_code: str = Field(..., description="Machine-readable error code")
+ message: str = Field(..., description="Human-readable error message")
+ timestamp: str = Field(..., description="Error timestamp")
+
+
+# Application startup
+start_time = time.time()
+framework_instance = None
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+ """Application lifespan manager."""
+ global framework_instance
+
+ # Startup
+ print("Starting MCTS Framework API server...")
+
+ # Initialize authenticator with demo key (replace in production)
+ authenticator = APIKeyAuthenticator(
+ valid_keys=["demo-api-key-replace-in-production"],
+ rate_limit_config=RateLimitConfig(
+ requests_per_minute=60,
+ requests_per_hour=1000,
+ requests_per_day=10000,
+ ),
+ )
+ set_authenticator(authenticator)
+
+ # Initialize framework (lazy loading)
+ # framework_instance = create_framework()
+
+ print("API server started successfully")
+
+ yield
+
+ # Shutdown
+ print("Shutting down API server...")
+
+
+# Create FastAPI app
+app = FastAPI(
+ title="LangGraph Multi-Agent MCTS API",
+ description="""
+## Multi-Agent Reasoning API with MCTS Tactical Simulation
+
+This API provides access to a sophisticated multi-agent reasoning framework that combines:
+- **HRM Agent**: Hierarchical decomposition of complex queries
+- **TRM Agent**: Iterative refinement for response quality
+- **MCTS Engine**: Monte Carlo Tree Search for tactical simulation
+- **RAG Integration**: Context retrieval from vector stores
+
+### Features
+- Secure API key authentication
+- Rate limiting per client
+- Real-time metrics (Prometheus)
+- Distributed tracing (OpenTelemetry)
+- Production-grade error handling
+
+### Quick Start
+1. Obtain an API key
+2. Include `X-API-Key` header in requests
+3. Send queries to `/query` endpoint
+4. Monitor health via `/health` endpoint
+ """,
+ version="1.0.0",
+ docs_url="/docs",
+ redoc_url="/redoc",
+ openapi_tags=[
+ {"name": "query", "description": "Query processing operations"},
+ {"name": "health", "description": "Health and readiness checks"},
+ {"name": "metrics", "description": "Observability endpoints"},
+ ],
+ lifespan=lifespan,
+)
+
+# CORS middleware
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"], # Configure appropriately for production
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+
+# Middleware for metrics
+@app.middleware("http")
+async def metrics_middleware(request: Request, call_next):
+ """Track request metrics."""
+ if PROMETHEUS_AVAILABLE:
+ ACTIVE_REQUESTS.inc()
+
+ start = time.perf_counter()
+
+ try:
+ response = await call_next(request)
+ status = response.status_code
+ except Exception:
+ status = 500
+ raise
+ finally:
+ if PROMETHEUS_AVAILABLE:
+ ACTIVE_REQUESTS.dec()
+ elapsed = time.perf_counter() - start
+ REQUEST_COUNT.labels(method=request.method, endpoint=request.url.path, status=str(status)).inc()
+ REQUEST_LATENCY.labels(method=request.method, endpoint=request.url.path).observe(elapsed)
+
+ return response
+
+
+# Authentication dependency
+async def verify_api_key(x_api_key: str = Header(..., description="API key for authentication")):
+ """Verify API key and return client info."""
+ if not IMPORTS_AVAILABLE:
+ raise HTTPException(status_code=500, detail="Authentication module not available")
+
+ try:
+ authenticator = get_authenticator()
+ client_info = authenticator.require_auth(x_api_key)
+ return client_info
+ except AuthenticationError as e:
+ if PROMETHEUS_AVAILABLE:
+ ERROR_COUNT.labels(error_type="authentication").inc()
+ raise HTTPException(status_code=401, detail=e.user_message)
+ except RateLimitError as e:
+ if PROMETHEUS_AVAILABLE:
+ ERROR_COUNT.labels(error_type="rate_limit").inc()
+ raise HTTPException(
+ status_code=429, detail=e.user_message, headers={"Retry-After": str(e.retry_after_seconds or 60)}
+ )
+
+
+# Exception handlers
+@app.exception_handler(FrameworkError)
+async def framework_error_handler(request: Request, exc: FrameworkError):
+ """Handle framework-specific errors."""
+ if PROMETHEUS_AVAILABLE:
+ ERROR_COUNT.labels(error_type=exc.error_code).inc()
+
+ return JSONResponse(status_code=500, content=exc.to_user_response())
+
+
+@app.exception_handler(ValidationError)
+async def validation_error_handler(request: Request, exc: ValidationError):
+ """Handle validation errors."""
+ if PROMETHEUS_AVAILABLE:
+ ERROR_COUNT.labels(error_type="validation").inc()
+
+ return JSONResponse(status_code=400, content=exc.to_user_response())
+
+
+# Endpoints
+@app.get("/health", response_model=HealthResponse, tags=["health"])
+async def health_check():
+ """
+ Health check endpoint.
+
+ Returns basic service health status. Use this for load balancer health checks.
+ """
+ return HealthResponse(
+ status="healthy",
+ timestamp=datetime.utcnow().isoformat(),
+ version="1.0.0",
+ uptime_seconds=time.time() - start_time,
+ )
+
+
+@app.get("/ready", response_model=ReadinessResponse, tags=["health"])
+async def readiness_check():
+ """
+ Readiness check endpoint.
+
+ Verifies all dependencies are available. Use this for Kubernetes readiness probes.
+ """
+ checks = {
+ "imports_available": IMPORTS_AVAILABLE,
+ "authenticator_configured": True,
+ "llm_client_available": True, # Would check actual client
+ "prometheus_available": PROMETHEUS_AVAILABLE,
+ }
+
+ # Check if all critical services are available
+ all_ready = all(
+ [
+ checks["imports_available"],
+ checks["authenticator_configured"],
+ ]
+ )
+
+ if not all_ready:
+ raise HTTPException(status_code=503, detail="Service not ready")
+
+ return ReadinessResponse(ready=all_ready, checks=checks)
+
+
+@app.get("/metrics", tags=["metrics"])
+async def prometheus_metrics():
+ """
+ Prometheus metrics endpoint.
+
+ Returns metrics in Prometheus text format for scraping.
+ """
+ if not PROMETHEUS_AVAILABLE:
+ raise HTTPException(status_code=501, detail="Prometheus metrics not available")
+
+ return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
+
+@app.post(
+ "/query",
+ response_model=QueryResponse,
+ tags=["query"],
+ responses={
+ 401: {"model": ErrorResponse, "description": "Authentication failed"},
+ 429: {"model": ErrorResponse, "description": "Rate limit exceeded"},
+ 400: {"model": ErrorResponse, "description": "Invalid input"},
+ 500: {"model": ErrorResponse, "description": "Internal server error"},
+ },
+)
+async def process_query(request: QueryRequest, client_info: ClientInfo = Depends(verify_api_key)):
+ """
+ Process a query using the multi-agent MCTS framework.
+
+ This endpoint:
+ 1. Validates the input query
+ 2. Optionally retrieves context via RAG
+ 3. Processes through HRM and TRM agents
+ 4. Optionally runs MCTS simulation
+ 5. Synthesizes a final response
+
+ **Authentication**: Requires valid API key in X-API-Key header.
+
+ **Rate Limiting**: Subject to rate limits per client.
+ """
+ start_time = time.perf_counter()
+
+ # Validate input using validation models
+ if IMPORTS_AVAILABLE:
+ try:
+ QueryInput(
+ query=request.query,
+ use_rag=request.use_rag,
+ use_mcts=request.use_mcts,
+ thread_id=request.thread_id,
+ )
+ except Exception as e:
+ if PROMETHEUS_AVAILABLE:
+ ERROR_COUNT.labels(error_type="validation").inc()
+ raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
+
+ # Process query (mock implementation for demo)
+ # In production, this would call the actual framework
+ await asyncio.sleep(0.1) # Simulate processing
+
+ processing_time = (time.perf_counter() - start_time) * 1000
+
+ # Mock response
+ return QueryResponse(
+ response=f"Processed query: {request.query[:100]}...",
+ confidence=0.85,
+ agents_used=["hrm", "trm"] + (["mcts"] if request.use_mcts else []),
+ mcts_stats=(
+ {
+ "iterations": request.mcts_iterations or 100,
+ "best_action": "recommended_action",
+ "root_visits": request.mcts_iterations or 100,
+ }
+ if request.use_mcts
+ else None
+ ),
+ processing_time_ms=processing_time,
+ metadata={
+ "client_id": client_info.client_id,
+ "thread_id": request.thread_id,
+ "rag_enabled": request.use_rag,
+ },
+ )
+
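+# Illustrative request sketch (comment only; the key below is the demo key set
+# in `lifespan` and must be replaced in production):
+#
+#   curl -X POST http://localhost:8000/query \
+#        -H "X-API-Key: demo-api-key-replace-in-production" \
+#        -H "Content-Type: application/json" \
+#        -d '{"query": "Recommend defensive positions", "use_mcts": true}'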
+
+@app.get("/stats", tags=["metrics"])
+async def get_stats(client_info: ClientInfo = Depends(verify_api_key)):
+ """
+ Get usage statistics for the authenticated client.
+
+ Returns request counts and rate limit information.
+ """
+ authenticator = get_authenticator()
+ stats = authenticator.get_client_stats(client_info.client_id)
+
+ return {
+ "client_id": client_info.client_id,
+ "roles": list(client_info.roles),
+ **stats,
+ "rate_limits": {
+ "per_minute": authenticator.rate_limit_config.requests_per_minute,
+ "per_hour": authenticator.rate_limit_config.requests_per_hour,
+ "per_day": authenticator.rate_limit_config.requests_per_day,
+ },
+ }
+
+
+# Entry point
+if __name__ == "__main__":
+ import uvicorn
+
+ uvicorn.run(
+ "src.api.rest_server:app",
+ host="0.0.0.0",
+ port=8000,
+ reload=False,
+ workers=4,
+ log_level="info",
+ access_log=True,
+ )
diff --git a/src/config/__init__.py b/src/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/config/meta_controller.yaml b/src/config/meta_controller.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..765ded2760bcceb6140100a271c201d061925e3d
--- /dev/null
+++ b/src/config/meta_controller.yaml
@@ -0,0 +1,22 @@
+meta_controller:
+ enabled: false # Disabled by default for backward compatibility
+ type: "rnn" # "rnn" or "bert"
+ fallback_to_rule_based: true # Fallback on errors
+
+ rnn:
+ hidden_dim: 64
+ num_layers: 1
+ dropout: 0.1
+ model_path: null # Path to trained model (null for untrained)
+
+ bert:
+ model_name: "prajjwal1/bert-mini"
+ use_lora: true
+ lora_r: 4
+ lora_alpha: 16
+ lora_dropout: 0.1
+ model_path: null # Path to trained LoRA adapter
+
+ inference:
+ device: null # Auto-detect if null
+ seed: 42
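+
+# Illustrative loading sketch (comment only; assumes PyYAML is available):
+#
+#   import yaml
+#   with open("src/config/meta_controller.yaml") as f:
+#       cfg = yaml.safe_load(f)["meta_controller"]
+#   if cfg["enabled"] and cfg["type"] == "rnn":
+#       ...  # build the RNN meta-controller from cfg["rnn"]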
diff --git a/src/config/settings.py b/src/config/settings.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b398ae926ac1ec325329b8cfa102758352fa9dd
--- /dev/null
+++ b/src/config/settings.py
@@ -0,0 +1,431 @@
+"""
+Pydantic Settings v2 configuration management for LangGraph Multi-Agent MCTS.
+
+Provides:
+- Secure configuration loading from environment variables and .env files
+- Type-safe settings with validation
+- Secrets protection using SecretStr
+- MCTS parameter bounds validation
+- Support for multiple LLM providers
+"""
+
+from enum import Enum
+
+from pydantic import (
+ Field,
+ SecretStr,
+ field_validator,
+ model_validator,
+)
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class LLMProvider(str, Enum):
+ """Supported LLM providers."""
+
+ OPENAI = "openai"
+ ANTHROPIC = "anthropic"
+ LMSTUDIO = "lmstudio"
+
+
+class LogLevel(str, Enum):
+ """Supported log levels."""
+
+ DEBUG = "DEBUG"
+ INFO = "INFO"
+ WARNING = "WARNING"
+ ERROR = "ERROR"
+ CRITICAL = "CRITICAL"
+
+
+class MCTSImplementation(str, Enum):
+ """MCTS implementation variants."""
+
+ BASELINE = "baseline" # Original MCTS core
+ NEURAL = "neural" # Neural-guided AlphaZero-style MCTS
+
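+# Illustrative .env sketch consumed by Settings below (comment only; values are
+# placeholders):
+#
+#   LLM_PROVIDER=openai
+#   OPENAI_API_KEY=sk-<your-key-here>
+#   MCTS_ITERATIONS=200
+#   MCTS_C=1.414
+#   LOG_LEVEL=DEBUG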
+
+class Settings(BaseSettings):
+ """
+ Application settings with security-first configuration.
+
+ All sensitive values use SecretStr to prevent accidental exposure in logs.
+ Configuration is loaded from environment variables with .env file support.
+ """
+
+ model_config = SettingsConfigDict(
+ env_file=".env",
+ env_file_encoding="utf-8",
+ case_sensitive=True,
+ extra="ignore",
+ validate_default=True,
+ )
+
+ # LLM Provider Configuration
+ LLM_PROVIDER: LLMProvider = Field(
+ default=LLMProvider.OPENAI, description="LLM provider to use (openai, anthropic, lmstudio)"
+ )
+
+ # API Keys (Secrets)
+ OPENAI_API_KEY: SecretStr | None = Field(
+ default=None, description="OpenAI API key (required if using OpenAI provider)"
+ )
+
+ ANTHROPIC_API_KEY: SecretStr | None = Field(
+ default=None, description="Anthropic API key (required if using Anthropic provider)"
+ )
+
+ BRAINTRUST_API_KEY: SecretStr | None = Field(
+ default=None, description="Braintrust API key for experiment tracking (optional)"
+ )
+
+ PINECONE_API_KEY: SecretStr | None = Field(
+ default=None, description="Pinecone API key for vector storage (optional)"
+ )
+
+ PINECONE_HOST: str | None = Field(
+ default=None, description="Pinecone host URL (e.g., https://index.svc.environment.pinecone.io)"
+ )
+
+ # Local LLM Configuration
+ LMSTUDIO_BASE_URL: str | None = Field(
+ default="http://localhost:1234/v1", description="LM Studio API base URL for local inference"
+ )
+
+ LMSTUDIO_MODEL: str | None = Field(default=None, description="LM Studio model identifier (e.g., liquid/lfm2-1.2b)")
+
+ # MCTS Configuration with bounds validation
+ MCTS_ENABLED: bool = Field(default=True, description="Enable MCTS for agent decision-making")
+
+ MCTS_IMPL: MCTSImplementation = Field(
+ default=MCTSImplementation.BASELINE, description="MCTS implementation variant to use"
+ )
+
+ MCTS_ITERATIONS: int = Field(default=100, ge=1, le=10000, description="Number of MCTS iterations (1-10000)")
+
+ MCTS_C: float = Field(
+ default=1.414, ge=0.0, le=10.0, description="MCTS exploration weight (UCB1 constant, 0.0-10.0)"
+ )
+
+ # Random seed for reproducibility
+ SEED: int | None = Field(default=None, ge=0, description="Random seed for reproducibility (optional)")
+
+ # LangSmith Configuration for tracing and evaluation
+ LANGSMITH_API_KEY: SecretStr | None = Field(
+ default=None, description="LangSmith API key for tracing and evaluation (optional)"
+ )
+
+ LANGSMITH_PROJECT: str = Field(default="langgraph-mcts", description="LangSmith project name")
+
+ LANGCHAIN_TRACING_V2: bool = Field(default=False, description="Enable LangChain tracing v2")
+
+ LANGCHAIN_ENDPOINT: str = Field(default="https://api.smith.langchain.com", description="LangChain API endpoint")
+
+ # Weights & Biases Configuration for experiment tracking
+ WANDB_API_KEY: SecretStr | None = Field(
+ default=None, description="Weights & Biases API key for experiment tracking (optional)"
+ )
+
+ WANDB_PROJECT: str = Field(default="langgraph-mcts", description="W&B project name")
+
+ WANDB_ENTITY: str | None = Field(default=None, description="W&B entity (username or team name)")
+
+ WANDB_MODE: str = Field(default="online", description="W&B mode: online, offline, or disabled")
+
+ # Logging Configuration
+ LOG_LEVEL: LogLevel = Field(default=LogLevel.INFO, description="Application log level")
+
+ # OpenTelemetry Configuration
+ OTEL_EXPORTER_OTLP_ENDPOINT: str | None = Field(
+ default=None, description="OpenTelemetry OTLP exporter endpoint URL"
+ )
+
+ # S3 Storage Configuration
+ S3_BUCKET: str | None = Field(default=None, description="S3 bucket name for artifact storage")
+
+ S3_PREFIX: str = Field(default="mcts-artifacts", description="S3 key prefix for stored artifacts")
+
+ S3_REGION: str = Field(default="us-east-1", description="AWS region for S3 bucket")
+
+ # Network Configuration (security)
+ HTTP_TIMEOUT_SECONDS: int = Field(default=30, ge=1, le=300, description="HTTP request timeout in seconds")
+
+ HTTP_MAX_RETRIES: int = Field(default=3, ge=0, le=10, description="Maximum HTTP request retries")
+
+ # Security Settings
+ MAX_QUERY_LENGTH: int = Field(
+ default=10000, ge=1, le=100000, description="Maximum allowed query length in characters"
+ )
+
+ RATE_LIMIT_REQUESTS_PER_MINUTE: int = Field(
+ default=60, ge=1, le=1000, description="Rate limit for API requests per minute"
+ )
+
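+ # Note (illustrative): bounds above are enforced at construction time, so an
+ # out-of-range value such as MCTS_ITERATIONS=0 raises pydantic.ValidationError
+ # (Field ge=1) rather than silently entering the search configuration.
+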
+ @field_validator("OPENAI_API_KEY")
+ @classmethod
+ def validate_openai_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate OpenAI API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ # Check for obviously invalid patterns
+ if secret_value in ("", "your-api-key-here", "sk-xxx", "REPLACE_ME"):
+ raise ValueError("OpenAI API key appears to be a placeholder value")
+ if not secret_value.startswith("sk-"):
+ raise ValueError("OpenAI API key should start with 'sk-'")
+ if len(secret_value) < 20:
+ raise ValueError("OpenAI API key appears to be too short")
+ return v
+
+ @field_validator("ANTHROPIC_API_KEY")
+ @classmethod
+ def validate_anthropic_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate Anthropic API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ # Check for obviously invalid patterns
+ if secret_value in ("", "your-api-key-here", "REPLACE_ME"):
+ raise ValueError("Anthropic API key appears to be a placeholder value")
+ if len(secret_value) < 20:
+ raise ValueError("Anthropic API key appears to be too short")
+ return v
+
+ @field_validator("BRAINTRUST_API_KEY")
+ @classmethod
+ def validate_braintrust_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate Braintrust API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ # Check for obviously invalid patterns
+ if secret_value in ("", "your-api-key-here", "REPLACE_ME"):
+ raise ValueError("Braintrust API key appears to be a placeholder value")
+ if len(secret_value) < 20:
+ raise ValueError("Braintrust API key appears to be too short")
+ return v
+
+ @field_validator("PINECONE_API_KEY")
+ @classmethod
+ def validate_pinecone_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate Pinecone API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ # Check for obviously invalid patterns
+ if secret_value in ("", "your-api-key-here", "REPLACE_ME"):
+ raise ValueError("Pinecone API key appears to be a placeholder value")
+ if len(secret_value) < 20:
+ raise ValueError("Pinecone API key appears to be too short")
+ return v
+
+ @field_validator("LANGSMITH_API_KEY")
+ @classmethod
+ def validate_langsmith_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate LangSmith API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ if secret_value in ("", "your-api-key-here", "REPLACE_ME"):
+ raise ValueError("LangSmith API key appears to be a placeholder value")
+ if len(secret_value) < 20:
+ raise ValueError("LangSmith API key appears to be too short")
+ return v
+
+ @field_validator("WANDB_API_KEY")
+ @classmethod
+ def validate_wandb_key_format(cls, v: SecretStr | None) -> SecretStr | None:
+ """Validate Weights & Biases API key format without exposing the value."""
+ if v is not None:
+ secret_value = v.get_secret_value()
+ if secret_value in ("", "your-api-key-here", "REPLACE_ME"):
+ raise ValueError("W&B API key appears to be a placeholder value")
+ if len(secret_value) < 20:
+ raise ValueError("W&B API key appears to be too short")
+ return v
+
+ @field_validator("PINECONE_HOST")
+ @classmethod
+ def validate_pinecone_host(cls, v: str | None) -> str | None:
+ """Validate Pinecone host URL format."""
+ if v is not None and v != "":
+ if not v.startswith("https://"):
+ raise ValueError("Pinecone host must start with https://")
+ if "pinecone.io" not in v:
+ raise ValueError("Pinecone host should be a valid pinecone.io URL")
+ return v
+
+ @field_validator("LMSTUDIO_BASE_URL")
+ @classmethod
+ def validate_lmstudio_url(cls, v: str | None) -> str | None:
+ """Validate LM Studio base URL format."""
+ if v is not None:
+ if not v.startswith(("http://", "https://")):
+ raise ValueError("LM Studio base URL must start with http:// or https://")
+ # Warn if not localhost (potential security concern)
+ if not any(host in v for host in ("localhost", "127.0.0.1", "::1")):
+ import warnings
+
+ warnings.warn(
+ "LM Studio URL points to non-localhost address. Ensure this is intentional and secure.",
+ UserWarning,
+ stacklevel=2,
+ )
+ return v
+
+ @field_validator("OTEL_EXPORTER_OTLP_ENDPOINT")
+ @classmethod
+ def validate_otel_endpoint(cls, v: str | None) -> str | None:
+ """Validate OpenTelemetry endpoint URL."""
+ if v is not None and v != "" and not v.startswith(("http://", "https://", "grpc://")):
+ raise ValueError("OpenTelemetry endpoint must start with http://, https://, or grpc://")
+ return v
+
+ @field_validator("S3_BUCKET")
+ @classmethod
+ def validate_s3_bucket_name(cls, v: str | None) -> str | None:
+ """Validate S3 bucket name format."""
+ if v is not None:
+ # S3 bucket naming rules
+ if len(v) < 3 or len(v) > 63:
+ raise ValueError("S3 bucket name must be 3-63 characters long")
+ if v != v.lower() or not v.replace("-", "").replace(".", "").isalnum():
+ raise ValueError("S3 bucket name can only contain lowercase letters, numbers, hyphens, and periods")
+ if v[0] in "-." or v[-1] in "-.":
+ raise ValueError("S3 bucket name must begin and end with a letter or number")
+ return v
+
+ @model_validator(mode="after")
+ def validate_provider_credentials(self) -> "Settings":
+ """Ensure required API keys are provided for the selected provider."""
+ if self.LLM_PROVIDER == LLMProvider.OPENAI:
+ if self.OPENAI_API_KEY is None:
+ raise ValueError(
+ "OPENAI_API_KEY is required when using OpenAI provider. "
+ "Set the OPENAI_API_KEY environment variable."
+ )
+ elif self.LLM_PROVIDER == LLMProvider.ANTHROPIC:
+ if self.ANTHROPIC_API_KEY is None:
+ raise ValueError(
+ "ANTHROPIC_API_KEY is required when using Anthropic provider. "
+ "Set the ANTHROPIC_API_KEY environment variable."
+ )
+ elif self.LLM_PROVIDER == LLMProvider.LMSTUDIO and self.LMSTUDIO_BASE_URL is None:
+ raise ValueError("LMSTUDIO_BASE_URL is required when using LM Studio provider.")
+ return self
+
+ def get_api_key(self) -> str | None:
+ """
+ Get the API key for the current provider.
+
+ Returns the secret value - use with caution to avoid logging.
+ Returns None when the active provider requires no key (e.g., LM Studio).
+ """
+ if self.LLM_PROVIDER == LLMProvider.OPENAI and self.OPENAI_API_KEY:
+ return self.OPENAI_API_KEY.get_secret_value()
+ elif self.LLM_PROVIDER == LLMProvider.ANTHROPIC and self.ANTHROPIC_API_KEY:
+ return self.ANTHROPIC_API_KEY.get_secret_value()
+ return None
+
+ def safe_dict(self) -> dict:
+ """
+ Return settings as dictionary with secrets masked.
+
+ Safe for logging and display purposes.
+ """
+ data = self.model_dump()
+ # Mask all sensitive fields
+ secret_fields = [
+ "OPENAI_API_KEY",
+ "ANTHROPIC_API_KEY",
+ "BRAINTRUST_API_KEY",
+ "PINECONE_API_KEY",
+ "LANGSMITH_API_KEY",
+ "WANDB_API_KEY",
+ ]
+ for field in secret_fields:
+ if field in data and data[field]:
+ data[field] = "***MASKED***"
+ return data
+
+ def get_braintrust_api_key(self) -> str | None:
+ """
+ Get the Braintrust API key if configured.
+
+ Returns the secret value - use with caution to avoid logging.
+ """
+ if self.BRAINTRUST_API_KEY:
+ return self.BRAINTRUST_API_KEY.get_secret_value()
+ return None
+
+ def get_pinecone_api_key(self) -> str | None:
+ """
+ Get the Pinecone API key if configured.
+
+ Returns the secret value - use with caution to avoid logging.
+ """
+ if self.PINECONE_API_KEY:
+ return self.PINECONE_API_KEY.get_secret_value()
+ return None
+
+ def get_langsmith_api_key(self) -> str | None:
+ """
+ Get the LangSmith API key if configured.
+
+ Returns the secret value - use with caution to avoid logging.
+ """
+ if self.LANGSMITH_API_KEY:
+ return self.LANGSMITH_API_KEY.get_secret_value()
+ return None
+
+ def get_wandb_api_key(self) -> str | None:
+ """
+ Get the Weights & Biases API key if configured.
+
+ Returns the secret value - use with caution to avoid logging.
+ """
+ if self.WANDB_API_KEY:
+ return self.WANDB_API_KEY.get_secret_value()
+ return None
+
+ def __repr__(self) -> str:
+ """Safe string representation that doesn't expose secrets."""
+ return f"Settings(LLM_PROVIDER={self.LLM_PROVIDER}, MCTS_ENABLED={self.MCTS_ENABLED}, MCTS_IMPL={self.MCTS_IMPL}, LOG_LEVEL={self.LOG_LEVEL})"
+
+
+# Global settings instance (lazily loaded)
+_settings: Settings | None = None
+
+
+def get_settings() -> Settings:
+ """
+ Get the global settings instance.
+
+ Settings are loaded once and cached. To reload, call reset_settings() first.
+
+ Returns:
+ Settings: Application configuration instance
+
+ Raises:
+ ValidationError: If configuration is invalid
+ """
+ global _settings
+ if _settings is None:
+ _settings = Settings()
+ return _settings
+
+
+def reset_settings() -> None:
+ """
+ Reset the global settings instance.
+
+ Forces settings to be reloaded from environment on next get_settings() call.
+ Useful for testing.
+ """
+ global _settings
+ _settings = None
+
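+# Example (illustrative): typical usage from an application entrypoint; the
+# import path depends on how the package is installed.
+#
+#     from src.config.settings import get_settings
+#
+#     settings = get_settings()         # loaded once, cached thereafter
+#     print(settings.safe_dict())       # secrets rendered as ***MASKED***
+#     api_key = settings.get_api_key()  # raw secret value; never log it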
+
+# Type exports for external use
+__all__ = [
+ "Settings",
+ "LLMProvider",
+ "LogLevel",
+ "MCTSImplementation",
+ "get_settings",
+ "reset_settings",
+]
diff --git a/src/data/__init__.py b/src/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba25fc46f08ba038e5732be81ca91a67bffcbb39
--- /dev/null
+++ b/src/data/__init__.py
@@ -0,0 +1,29 @@
+"""
+Dataset Integration Module for Multi-Agent MCTS Training.
+
+This module provides utilities for loading, preprocessing, and managing
+open-source datasets for training HRM/TRM agents and neural meta-controllers.
+
+Supported Datasets:
+- DABStep: Multi-step reasoning tasks (CC-BY-4.0)
+- PRIMUS-Seed: Cybersecurity domain knowledge (ODC-BY)
+- PRIMUS-Instruct: Instruction fine-tuning data (ODC-BY)
+"""
+
+from .dataset_loader import DABStepLoader, DatasetLoader, PRIMUSLoader
+from .preprocessing import TextPreprocessor, TokenizerWrapper
+from .tactical_augmentation import TacticalAugmenter
+from .train_test_split import DataSplitter, StratifiedSplitter
+
+__all__ = [
+ "DatasetLoader",
+ "DABStepLoader",
+ "PRIMUSLoader",
+ "TextPreprocessor",
+ "TokenizerWrapper",
+ "TacticalAugmenter",
+ "DataSplitter",
+ "StratifiedSplitter",
+]
+
+__version__ = "1.0.0"
diff --git a/src/data/dataset_loader.py b/src/data/dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db982a77641a347217659100402f25062088558
--- /dev/null
+++ b/src/data/dataset_loader.py
@@ -0,0 +1,551 @@
+"""
+Dataset Loading Module for Open-Source Training Data.
+
+Provides unified loading interfaces for:
+- DABStep: Multi-step data analysis reasoning
+- PRIMUS: Cybersecurity domain knowledge
+- Custom tactical datasets
+
+License Attribution:
+- DABStep: CC-BY-4.0 (Creative Commons Attribution)
+- PRIMUS: ODC-BY (Open Data Commons Attribution)
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatasetSample:
+ """Standardized representation of a dataset sample."""
+
+ id: str
+ text: str
+ metadata: dict[str, Any] = field(default_factory=dict)
+ labels: list[str] | None = None
+ difficulty: str | None = None
+ domain: str | None = None
+ reasoning_steps: list[str] | None = None
+
+
+@dataclass
+class DatasetStatistics:
+ """Statistics about a loaded dataset."""
+
+ total_samples: int
+ domains: dict[str, int]
+ avg_text_length: float
+ difficulty_distribution: dict[str, int]
+ total_tokens: int = 0
+
+
+class DatasetLoader(ABC):
+ """Abstract base class for dataset loaders."""
+
+ def __init__(self, cache_dir: str | None = None):
+ """
+ Initialize dataset loader.
+
+ Args:
+ cache_dir: Directory to cache downloaded datasets
+ """
+ self.cache_dir = cache_dir or str(Path.home() / ".cache" / "mcts_datasets")
+ self._dataset = None
+ self._statistics = None
+
+ @abstractmethod
+ def load(self, split: str = "train") -> list[DatasetSample]:
+ """Load dataset split."""
+ pass
+
+ @abstractmethod
+ def get_statistics(self) -> DatasetStatistics:
+ """Get dataset statistics."""
+ pass
+
+ @abstractmethod
+ def iterate_samples(self, batch_size: int = 32) -> Iterator[list[DatasetSample]]:
+ """Iterate over samples in batches."""
+ pass
+
+
+class DABStepLoader(DatasetLoader):
+ """
+ Loader for DABStep Multi-Step Reasoning Dataset.
+
+ DABStep contains 450+ data analysis tasks requiring sequential,
+ iterative problem-solving, which makes it well suited to training HRM/TRM agents.
+
+ License: CC-BY-4.0 (Attribution required)
+ Source: huggingface.co/datasets/adyen/DABstep
+ """
+
+ DATASET_NAME = "adyen/DABstep"
+ DIFFICULTIES = ["easy", "medium", "hard"]
+
+ def __init__(self, cache_dir: str | None = None):
+ """Initialize DABStep loader."""
+ super().__init__(cache_dir)
+ self._loaded_samples: list[DatasetSample] = []
+
+ def load(self, split: str = "train", difficulty: str | None = None) -> list[DatasetSample]:
+ """
+ Load DABStep dataset.
+
+ Args:
+ split: Dataset split ('train', 'validation', 'test')
+ difficulty: Filter by difficulty ('easy', 'medium', 'hard')
+
+ Returns:
+ List of DatasetSample objects
+ """
+ try:
+ from datasets import load_dataset
+
+ logger.info(f"Loading DABStep dataset (split={split})")
+
+ dataset = load_dataset(
+ self.DATASET_NAME,
+ cache_dir=self.cache_dir,
+ )
+
+ if split not in dataset:
+ available_splits = list(dataset.keys())
+ logger.warning(f"Split '{split}' not found. Available: {available_splits}")
+ split = available_splits[0] if available_splits else "train"
+
+ samples = []
+ for idx, item in enumerate(dataset[split]):
+ sample = DatasetSample(
+ id=f"dabstep_{split}_{idx}",
+ text=str(item.get("question", item.get("text", ""))),
+ metadata={
+ "source": "DABStep",
+ "license": "CC-BY-4.0",
+ "split": split,
+ "original_data": item,
+ },
+ difficulty=item.get("difficulty", "medium"),
+ domain="data_analysis",
+ reasoning_steps=item.get("steps", []),
+ )
+
+ if difficulty and sample.difficulty != difficulty:
+ continue
+
+ samples.append(sample)
+
+ self._loaded_samples = samples
+ logger.info(f"Loaded {len(samples)} DABStep samples")
+ return samples
+
+ except ImportError:
+ logger.error("datasets library not installed. Run: pip install datasets")
+ raise
+ except Exception as e:
+ logger.error(f"Failed to load DABStep: {e}")
+ raise
+
+ def get_statistics(self) -> DatasetStatistics:
+ """Get statistics about loaded DABStep data."""
+ if not self._loaded_samples:
+ raise ValueError("No samples loaded. Call load() first.")
+
+ difficulty_dist = {}
+ total_length = 0
+
+ for sample in self._loaded_samples:
+ diff = sample.difficulty or "unknown"
+ difficulty_dist[diff] = difficulty_dist.get(diff, 0) + 1
+ total_length += len(sample.text)
+
+ return DatasetStatistics(
+ total_samples=len(self._loaded_samples),
+ domains={"data_analysis": len(self._loaded_samples)},
+ avg_text_length=total_length / len(self._loaded_samples),
+ difficulty_distribution=difficulty_dist,
+ )
+
+ def iterate_samples(self, batch_size: int = 32) -> Iterator[list[DatasetSample]]:
+ """Iterate over samples in batches."""
+ if not self._loaded_samples:
+ raise ValueError("No samples loaded. Call load() first.")
+
+ for i in range(0, len(self._loaded_samples), batch_size):
+ yield self._loaded_samples[i : i + batch_size]
+
+ def get_reasoning_tasks(self) -> list[DatasetSample]:
+ """Get only samples with explicit reasoning steps."""
+ return [s for s in self._loaded_samples if s.reasoning_steps]
+
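+# Example (illustrative): loading DABStep and batching the hard subset.
+# Requires the `datasets` package and network access on first download.
+#
+#     loader = DABStepLoader()
+#     hard_tasks = loader.load(split="train", difficulty="hard")
+#     print(loader.get_statistics().difficulty_distribution)
+#     for batch in loader.iterate_samples(batch_size=16):
+#         ...  # feed the batch to the training pipeline
+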
+
+class PRIMUSLoader(DatasetLoader):
+ """
+ Loader for PRIMUS Cybersecurity Dataset Suite.
+
+ PRIMUS contains:
+ - Seed: 674,848 cybersecurity documents (190M tokens)
+ - Instruct: 835 instruction-tuning samples
+ - Reasoning: Self-reflection reasoning data (not loaded by this class)
+
+ License: ODC-BY (Open Data Commons Attribution)
+ Source: huggingface.co/datasets/trendmicro-ailab/Primus-Seed
+ """
+
+ SEED_DATASET = "trendmicro-ailab/Primus-Seed"
+ INSTRUCT_DATASET = "trendmicro-ailab/Primus-Instruct"
+
+ DOMAINS = [
+ "mitre_attack",
+ "wikipedia",
+ "company_sites",
+ "threat_intelligence",
+ "vulnerability_db",
+ ]
+
+ def __init__(self, cache_dir: str | None = None):
+ """Initialize PRIMUS loader."""
+ super().__init__(cache_dir)
+ self._seed_samples: list[DatasetSample] = []
+ self._instruct_samples: list[DatasetSample] = []
+
+ def load(
+ self,
+ split: str = "train",
+ dataset_type: str = "seed",
+ domains: list[str] | None = None,
+ max_samples: int | None = None,
+ streaming: bool = True,
+ ) -> list[DatasetSample]:
+ """
+ Load PRIMUS dataset.
+
+ Args:
+ split: Dataset split ('train', 'validation', 'test')
+ dataset_type: 'seed' for knowledge base, 'instruct' for fine-tuning
+ domains: Filter by specific domains
+ max_samples: Limit number of samples (useful for large datasets)
+ streaming: Stream the Seed corpus instead of downloading it in full
+ (only applied when dataset_type='seed' and max_samples is set)
+
+ Returns:
+ List of DatasetSample objects
+ """
+ try:
+ from datasets import load_dataset
+
+ dataset_name = self.SEED_DATASET if dataset_type == "seed" else self.INSTRUCT_DATASET
+
+ logger.info(f"Loading PRIMUS {dataset_type} dataset")
+
+ # Stream the large Seed corpus when a sample cap is given, so the full
+ # multi-hundred-thousand-document download is avoided
+ use_streaming = streaming and dataset_type == "seed" and max_samples is not None
+
+ if use_streaming:
+ logger.info(f"Using streaming mode (max_samples={max_samples})")
+ dataset = load_dataset(
+ dataset_name,
+ "default",
+ streaming=True,
+ cache_dir=self.cache_dir,
+ )
+ # For streaming, iterate the first available split
+ data_iter = iter(dataset["train"]) if "train" in dataset else iter(dataset[list(dataset.keys())[0]])
+ else:
+ dataset = load_dataset(
+ dataset_name,
+ cache_dir=self.cache_dir,
+ )
+
+ if split not in dataset:
+ available_splits = list(dataset.keys())
+ logger.warning(f"Split '{split}' not found. Using: {available_splits[0]}")
+ split = available_splits[0]
+
+ data_iter = iter(dataset[split])
+
+ samples = []
+ count = 0
+
+ for idx, item in enumerate(data_iter):
+ if max_samples and count >= max_samples:
+ break
+
+ domain = item.get("domain", item.get("source", "unknown"))
+
+ if domains and domain not in domains:
+ continue
+
+ if dataset_type == "instruct":
+ text = f"Instruction: {item.get('instruction', '')}\nResponse: {item.get('response', '')}"
+ else:
+ text = str(item.get("text", item.get("content", "")))
+
+ sample = DatasetSample(
+ id=f"primus_{dataset_type}_{split}_{idx}",
+ text=text,
+ metadata={
+ "source": f"PRIMUS-{dataset_type.capitalize()}",
+ "license": "ODC-BY",
+ "split": split,
+ "original_domain": domain,
+ },
+ domain=domain,
+ labels=item.get("labels", item.get("tags", [])),
+ )
+
+ samples.append(sample)
+ count += 1
+
+ if dataset_type == "seed":
+ self._seed_samples = samples
+ else:
+ self._instruct_samples = samples
+
+ logger.info(f"Loaded {len(samples)} PRIMUS {dataset_type} samples")
+ return samples
+
+ except ImportError:
+ logger.error("datasets library not installed. Run: pip install datasets")
+ raise
+ except Exception as e:
+ if "gated dataset" in str(e):
+ logger.error(
+ f"PRIMUS is a gated dataset. Please authenticate with HuggingFace:\n"
+ f"1. Create account at https://huggingface.co/\n"
+ f"2. Accept dataset terms at https://huggingface.co/datasets/{dataset_name}\n"
+ f"3. Create token at https://huggingface.co/settings/tokens\n"
+ f"4. Run: huggingface-cli login"
+ )
+ else:
+ logger.error(f"Failed to load PRIMUS: {e}")
+ raise
+
+ def load_seed(self, max_samples: int | None = None) -> list[DatasetSample]:
+ """Load PRIMUS-Seed knowledge base."""
+ return self.load(dataset_type="seed", max_samples=max_samples)
+
+ def load_instruct(self) -> list[DatasetSample]:
+ """Load PRIMUS-Instruct fine-tuning data."""
+ return self.load(dataset_type="instruct", streaming=False)
+
+ def get_statistics(self) -> DatasetStatistics:
+ """Get statistics about loaded PRIMUS data."""
+ all_samples = self._seed_samples + self._instruct_samples
+
+ if not all_samples:
+ raise ValueError("No samples loaded. Call load() first.")
+
+ domain_dist = {}
+ total_length = 0
+
+ for sample in all_samples:
+ domain = sample.domain or "unknown"
+ domain_dist[domain] = domain_dist.get(domain, 0) + 1
+ total_length += len(sample.text)
+
+ return DatasetStatistics(
+ total_samples=len(all_samples),
+ domains=domain_dist,
+ avg_text_length=total_length / len(all_samples),
+ difficulty_distribution={"cybersecurity": len(all_samples)},
+ )
+
+ def iterate_samples(self, batch_size: int = 32) -> Iterator[list[DatasetSample]]:
+ """Iterate over all loaded samples in batches."""
+ all_samples = self._seed_samples + self._instruct_samples
+
+ if not all_samples:
+ raise ValueError("No samples loaded. Call load() first.")
+
+ for i in range(0, len(all_samples), batch_size):
+ yield all_samples[i : i + batch_size]
+
+ def get_mitre_attack_samples(self) -> list[DatasetSample]:
+ """Get samples specifically from MITRE ATT&CK."""
+ return [s for s in self._seed_samples if "mitre" in (s.domain or "").lower()]
+
+ def get_threat_intelligence_samples(self) -> list[DatasetSample]:
+ """Get threat intelligence related samples."""
+ return [
+ s
+ for s in self._seed_samples
+ if any(kw in (s.domain or "").lower() for kw in ["threat", "cti", "intelligence"])
+ ]
+
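+# Example (illustrative): sampling a bounded slice of the gated PRIMUS corpus.
+# Assumes `huggingface-cli login` has been run and the dataset terms accepted.
+#
+#     loader = PRIMUSLoader()
+#     seed = loader.load_seed(max_samples=5000)   # streamed, bounded
+#     instruct = loader.load_instruct()           # small, downloaded in full
+#     mitre = loader.get_mitre_attack_samples()
+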
+
+class CombinedDatasetLoader:
+ """
+ Unified loader for combining multiple datasets.
+
+ Provides a single interface for loading and managing:
+ - DABStep (multi-step reasoning)
+ - PRIMUS (cybersecurity knowledge)
+ - Custom tactical datasets
+ """
+
+ def __init__(self, cache_dir: str | None = None):
+ """Initialize combined loader."""
+ self.cache_dir = cache_dir
+ self.dabstep_loader = DABStepLoader(cache_dir)
+ self.primus_loader = PRIMUSLoader(cache_dir)
+ self._all_samples: list[DatasetSample] = []
+
+ def load_all(
+ self,
+ dabstep_split: str = "train",
+ primus_max_samples: int | None = 10000,
+ include_instruct: bool = True,
+ ) -> list[DatasetSample]:
+ """
+ Load all datasets.
+
+ Args:
+ dabstep_split: Split for DABStep
+ primus_max_samples: Max samples from PRIMUS-Seed (None for all)
+ include_instruct: Whether to include PRIMUS-Instruct
+
+ Returns:
+ Combined list of all samples
+ """
+ logger.info("Loading combined datasets")
+
+ # Load DABStep
+ dabstep_samples = self.dabstep_loader.load(split=dabstep_split)
+ logger.info(f"DABStep: {len(dabstep_samples)} samples")
+
+ # Load PRIMUS-Seed
+ primus_seed = self.primus_loader.load_seed(max_samples=primus_max_samples)
+ logger.info(f"PRIMUS-Seed: {len(primus_seed)} samples")
+
+ # Load PRIMUS-Instruct
+ primus_instruct = []
+ if include_instruct:
+ primus_instruct = self.primus_loader.load_instruct()
+ logger.info(f"PRIMUS-Instruct: {len(primus_instruct)} samples")
+
+ self._all_samples = dabstep_samples + primus_seed + primus_instruct
+ logger.info(f"Total combined samples: {len(self._all_samples)}")
+
+ return self._all_samples
+
+ def get_domain_distribution(self) -> dict[str, int]:
+ """Get distribution of samples across domains."""
+ dist = {}
+ for sample in self._all_samples:
+ domain = sample.domain or "unknown"
+ dist[domain] = dist.get(domain, 0) + 1
+ return dist
+
+ def filter_by_domain(self, domain: str) -> list[DatasetSample]:
+ """Filter samples by domain."""
+ return [s for s in self._all_samples if s.domain == domain]
+
+ def get_multi_step_reasoning_samples(self) -> list[DatasetSample]:
+ """Get samples suitable for multi-step reasoning training."""
+ return [
+ s
+ for s in self._all_samples
+ if s.reasoning_steps or s.domain == "data_analysis" or "instruct" in s.metadata.get("source", "").lower()
+ ]
+
+ def export_for_training(self, output_path: str, format: str = "jsonl") -> str:
+ """
+ Export dataset for training.
+
+ Args:
+ output_path: Path to save exported data
+ format: Export format (currently only 'jsonl' is implemented)
+
+ Returns:
+ Path to exported file
+ """
+ import json
+
+ output_file = Path(output_path)
+ output_file.parent.mkdir(parents=True, exist_ok=True)
+
+ if format == "jsonl":
+ with open(output_file, "w", encoding="utf-8") as f:
+ for sample in self._all_samples:
+ record = {
+ "id": sample.id,
+ "text": sample.text,
+ "domain": sample.domain,
+ "difficulty": sample.difficulty,
+ "labels": sample.labels,
+ "metadata": sample.metadata,
+ }
+ f.write(json.dumps(record) + "\n")
+ else:
+ raise NotImplementedError(f"Format {format} not yet supported")
+
+ logger.info(f"Exported {len(self._all_samples)} samples to {output_file}")
+ return str(output_file)
+
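+# Example (illustrative): building one combined corpus and exporting it as
+# JSONL for downstream training jobs.
+#
+#     combined = CombinedDatasetLoader(cache_dir="/tmp/mcts_datasets")
+#     combined.load_all(primus_max_samples=10000)
+#     print(combined.get_domain_distribution())
+#     combined.export_for_training("data/combined.jsonl", format="jsonl")
+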
+
+def load_dataset(
+ dataset_name: str,
+ split: str = "train",
+ cache_dir: str | None = None,
+ **kwargs,
+) -> Any:
+ """
+ Unified interface for loading datasets from HuggingFace.
+
+ This function provides compatibility with the standard HuggingFace datasets API.
+ It wraps the underlying load_dataset function from the datasets library.
+
+ Args:
+ dataset_name: HuggingFace dataset identifier (e.g., "adyen/DABstep")
+ split: Dataset split to load ("train", "validation", "test")
+ cache_dir: Optional directory for caching downloaded datasets
+ **kwargs: Additional arguments passed to datasets.load_dataset
+
+ Returns:
+ HuggingFace Dataset object or dict of Dataset objects
+
+ Raises:
+ ImportError: If datasets library is not installed
+ Exception: If dataset loading fails
+
+ Examples:
+ >>> # Load DABStep dataset
+ >>> dataset = load_dataset("adyen/DABstep")
+ >>> samples = dataset["train"]
+
+ >>> # Load PRIMUS-Seed with custom cache
+ >>> dataset = load_dataset("trendmicro-ailab/Primus-Seed", cache_dir="/tmp/cache")
+
+ License Attribution:
+ - DABStep: CC-BY-4.0 (Creative Commons Attribution 4.0)
+ - PRIMUS: ODC-BY (Open Data Commons Attribution)
+ """
+ try:
+ from datasets import load_dataset as hf_load_dataset
+
+ logger.info(f"Loading dataset: {dataset_name} (split={split})")
+
+ load_kwargs = {
+ **kwargs,
+ }
+
+ if cache_dir:
+ load_kwargs["cache_dir"] = cache_dir
+
+ dataset = hf_load_dataset(dataset_name, **load_kwargs)
+
+ logger.info(f"Successfully loaded dataset: {dataset_name}")
+ return dataset
+
+ except ImportError:
+ logger.error("datasets library not installed. Run: pip install datasets")
+ raise ImportError("The datasets library is required but not installed. Install it with: pip install datasets")
+ except Exception as e:
+ logger.error(f"Failed to load dataset {dataset_name}: {e}")
+ raise
diff --git a/src/data/preprocessing.py b/src/data/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..4809af5e6b2cec0774c74f1e8cbfd53a3bf81276
--- /dev/null
+++ b/src/data/preprocessing.py
@@ -0,0 +1,406 @@
+"""
+Text Preprocessing Module for Training Data.
+
+Provides utilities for:
+- Text cleaning and normalization
+- Tokenization with various backends
+- Feature extraction for meta-controller training
+"""
+
+import logging
+import re
+from dataclasses import dataclass
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class PreprocessedText:
+ """Preprocessed text with metadata."""
+
+ original: str
+ cleaned: str
+ tokens: list[str]
+ token_ids: list[int] | None = None
+ features: dict[str, Any] | None = None
+
+
+class TextPreprocessor:
+ """
+ Text preprocessing pipeline for multi-agent training data.
+
+ Handles:
+ - HTML/XML tag removal
+ - Special character normalization
+ - Whitespace cleanup
+ - Domain-specific preprocessing (cyber, military, etc.)
+ """
+
+ # Patterns for cleaning
+ HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+ URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
+ MULTIPLE_SPACES = re.compile(r"\s+")
+ SPECIAL_CHARS = re.compile(r"[^\w\s\-.,!?;:()[\]{}\"'/]")
+
+ # Domain-specific patterns
+ IP_ADDRESS_PATTERN = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")
+ CVE_PATTERN = re.compile(r"CVE-\d{4}-\d{4,}")
+ MITRE_TECHNIQUE_PATTERN = re.compile(r"T\d{4}(?:\.\d{3})?")
+
+ def __init__(
+ self,
+ remove_html: bool = True,
+ normalize_urls: bool = True,
+ lowercase: bool = False,
+ preserve_domain_patterns: bool = True,
+ ):
+ """
+ Initialize preprocessor.
+
+ Args:
+ remove_html: Remove HTML/XML tags
+ normalize_urls: Replace URLs with placeholder
+ lowercase: Convert to lowercase
+ preserve_domain_patterns: Keep domain-specific patterns (IPs, CVEs, etc.)
+ """
+ self.remove_html = remove_html
+ self.normalize_urls = normalize_urls
+ self.lowercase = lowercase
+ self.preserve_domain_patterns = preserve_domain_patterns
+
+ def clean(self, text: str) -> str:
+ """
+ Clean and normalize text.
+
+ Args:
+ text: Raw input text
+
+ Returns:
+ Cleaned text
+ """
+ if not text:
+ return ""
+
+ result = text
+
+ # Remove HTML tags
+ if self.remove_html:
+ result = self.HTML_TAG_PATTERN.sub(" ", result)
+
+ # Preserve or normalize URLs
+ if self.normalize_urls:
+ if self.preserve_domain_patterns:
+ result = self.URL_PATTERN.sub("[URL]", result)
+ else:
+ result = self.URL_PATTERN.sub("", result)
+
+ # Normalize whitespace
+ result = self.MULTIPLE_SPACES.sub(" ", result)
+
+ # Lowercase if requested
+ if self.lowercase:
+ result = result.lower()
+
+ # Strip leading/trailing whitespace
+ result = result.strip()
+
+ return result
+
+ def extract_domain_features(self, text: str) -> dict[str, Any]:
+ """
+ Extract domain-specific features from text.
+
+ Args:
+ text: Input text
+
+ Returns:
+ Dictionary of extracted features
+ """
+ features = {
+ "has_ip_addresses": bool(self.IP_ADDRESS_PATTERN.search(text)),
+ "ip_count": len(self.IP_ADDRESS_PATTERN.findall(text)),
+ "has_cve": bool(self.CVE_PATTERN.search(text)),
+ "cve_ids": self.CVE_PATTERN.findall(text),
+ "has_mitre_techniques": bool(self.MITRE_TECHNIQUE_PATTERN.search(text)),
+ "mitre_techniques": self.MITRE_TECHNIQUE_PATTERN.findall(text),
+ "text_length": len(text),
+ "word_count": len(text.split()),
+ "sentence_count": len(re.findall(r"[.!?]+", text)),
+ }
+
+ # Detect domain indicators
+ domain_keywords = {
+ "cybersecurity": ["attack", "vulnerability", "exploit", "malware", "threat"],
+ "military": ["tactical", "reconnaissance", "deployment", "terrain", "objective"],
+ "data_analysis": ["dataset", "analysis", "correlation", "statistics", "visualization"],
+ }
+
+ for domain, keywords in domain_keywords.items():
+ features[f"is_{domain}"] = any(kw in text.lower() for kw in keywords)
+
+ return features
+
+ def preprocess(self, text: str) -> PreprocessedText:
+ """
+ Full preprocessing pipeline.
+
+ Args:
+ text: Raw input text
+
+ Returns:
+ PreprocessedText object with all preprocessing results
+ """
+ cleaned = self.clean(text)
+ tokens = cleaned.split() # Simple whitespace tokenization
+ features = self.extract_domain_features(text)
+
+ return PreprocessedText(
+ original=text,
+ cleaned=cleaned,
+ tokens=tokens,
+ features=features,
+ )
+
+ def batch_preprocess(self, texts: list[str]) -> list[PreprocessedText]:
+ """
+ Preprocess multiple texts.
+
+ Args:
+ texts: List of raw texts
+
+ Returns:
+ List of PreprocessedText objects
+ """
+ return [self.preprocess(text) for text in texts]
+
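+# Example (illustrative): cleaning a threat-report snippet while preserving
+# domain tokens such as CVE IDs and MITRE technique IDs.
+#
+#     pre = TextPreprocessor(normalize_urls=True)
+#     result = pre.preprocess("<p>CVE-2024-1234 delivered via T1566.001</p>")
+#     result.cleaned               # "CVE-2024-1234 delivered via T1566.001"
+#     result.features["cve_ids"]   # ["CVE-2024-1234"]
+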
+
+class TokenizerWrapper:
+ """
+ Wrapper for various tokenization backends.
+
+ Supports:
+ - Simple whitespace tokenization
+ - HuggingFace tokenizers
+ - Custom vocabularies
+ """
+
+ def __init__(
+ self,
+ backend: str = "simple",
+ model_name: str | None = None,
+ max_length: int = 512,
+ ):
+ """
+ Initialize tokenizer.
+
+ Args:
+ backend: Tokenizer backend ('simple', 'huggingface', 'custom')
+ model_name: Model name for HuggingFace tokenizer
+ max_length: Maximum sequence length
+ """
+ self.backend = backend
+ self.model_name = model_name
+ self.max_length = max_length
+ self._tokenizer = None
+
+ if backend == "huggingface" and model_name:
+ self._load_huggingface_tokenizer()
+
+ def _load_huggingface_tokenizer(self):
+ """Load HuggingFace tokenizer."""
+ try:
+ from transformers import AutoTokenizer
+
+ self._tokenizer = AutoTokenizer.from_pretrained(
+ self.model_name,
+ model_max_length=self.max_length,
+ )
+ logger.info(f"Loaded HuggingFace tokenizer: {self.model_name}")
+ except ImportError:
+ logger.error("transformers library not installed. Run: pip install transformers")
+ raise
+
+ def tokenize(self, text: str) -> tuple[list[str], list[int] | None]:
+ """
+ Tokenize text.
+
+ Args:
+ text: Input text
+
+ Returns:
+ Tuple of (tokens, token_ids)
+ """
+ if self.backend == "simple":
+ tokens = text.split()[: self.max_length]
+ return tokens, None
+
+ elif self.backend == "huggingface" and self._tokenizer:
+ encoded = self._tokenizer(
+ text,
+ truncation=True,
+ max_length=self.max_length,
+ return_tensors=None,
+ )
+ tokens = self._tokenizer.convert_ids_to_tokens(encoded["input_ids"])
+ token_ids = encoded["input_ids"]
+ return tokens, token_ids
+
+ else:
+ raise ValueError(f"Unsupported backend: {self.backend}")
+
+ def batch_tokenize(self, texts: list[str]) -> list[tuple[list[str], list[int] | None]]:
+ """
+ Tokenize multiple texts.
+
+ Args:
+ texts: List of input texts
+
+ Returns:
+ List of (tokens, token_ids) tuples
+ """
+ return [self.tokenize(text) for text in texts]
+
+ def encode_for_training(self, texts: list[str]) -> dict[str, Any]:
+ """
+ Encode texts for model training.
+
+ Args:
+ texts: List of input texts
+
+ Returns:
+ Dictionary with encoded data ready for training
+ """
+ if self.backend != "huggingface" or not self._tokenizer:
+ raise ValueError("encode_for_training requires HuggingFace backend")
+
+ encoded = self._tokenizer(
+ texts,
+ truncation=True,
+ padding=True,
+ max_length=self.max_length,
+ return_tensors="pt",
+ )
+
+ return encoded
+
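+# Example (illustrative): tokenizing with the HuggingFace backend. The model
+# name below is just an assumption; any tokenizer on the Hub would work.
+#
+#     tok = TokenizerWrapper(backend="huggingface", model_name="bert-base-uncased")
+#     tokens, ids = tok.tokenize("Analyze the lateral movement pattern.")
+#     batch = tok.encode_for_training(["query one", "query two"])  # padded tensors
+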
+
+class MetaControllerFeatureExtractor:
+ """
+ Extract features for meta-controller training.
+
+ Converts text and agent state information into numerical features
+ suitable for RNN/BERT routing decisions.
+ """
+
+ def __init__(self):
+ """Initialize feature extractor."""
+ self.preprocessor = TextPreprocessor()
+
+ def extract_query_features(self, query: str) -> dict[str, float]:
+ """
+ Extract numerical features from query text.
+
+ Args:
+ query: User query text
+
+ Returns:
+ Dictionary of numerical features
+ """
+ domain_features = self.preprocessor.extract_domain_features(query)
+
+ features = {
+ "query_length": domain_features["text_length"] / 10000, # Normalize
+ "word_count": domain_features["word_count"] / 500,
+ "sentence_count": domain_features["sentence_count"] / 50,
+ "has_technical_terms": float(
+ domain_features["has_ip_addresses"]
+ or domain_features["has_cve"]
+ or domain_features["has_mitre_techniques"]
+ ),
+ "is_cybersecurity": float(domain_features["is_cybersecurity"]),
+ "is_military": float(domain_features["is_military"]),
+ "is_data_analysis": float(domain_features["is_data_analysis"]),
+ "complexity_score": self._estimate_complexity(query),
+ }
+
+ return features
+
+ def _estimate_complexity(self, text: str) -> float:
+ """
+ Estimate query complexity (0-1 scale).
+
+ Args:
+ text: Input text
+
+ Returns:
+ Complexity score
+ """
+ # Simple heuristic based on length, technical terms, etc.
+ score = 0.0
+
+ # Length factor
+ word_count = len(text.split())
+ if word_count > 50:
+ score += 0.3
+ elif word_count > 20:
+ score += 0.1
+
+ # Technical term factor
+ technical_indicators = [
+ "analyze",
+ "compare",
+ "evaluate",
+ "synthesize",
+ "strategic",
+ "tactical",
+ "multi-step",
+ "consider",
+ ]
+ for term in technical_indicators:
+ if term in text.lower():
+ score += 0.1
+
+ # Question complexity
+ if "?" in text:
+ if any(kw in text.lower() for kw in ["why", "how", "what if"]):
+ score += 0.2
+ else:
+ score += 0.1
+
+ return min(score, 1.0)
+
+ def extract_agent_state_features(
+ self,
+ hrm_confidence: float = 0.0,
+ trm_confidence: float = 0.0,
+ mcts_iterations: int = 0,
+ consensus_score: float = 0.0,
+ rag_retrieved: int = 0,
+ ) -> list[float]:
+ """
+ Extract features from current agent state.
+
+ Args:
+ hrm_confidence: HRM agent confidence
+ trm_confidence: TRM agent confidence
+ mcts_iterations: MCTS iterations completed
+ consensus_score: Inter-agent consensus
+ rag_retrieved: Number of RAG documents retrieved
+
+ Returns:
+ List of normalized features (10-dimensional)
+ """
+ return [
+ hrm_confidence,
+ trm_confidence,
+ min(mcts_iterations / 1000, 1.0),
+ consensus_score,
+ min(rag_retrieved / 20, 1.0),
+ # Derived features
+ abs(hrm_confidence - trm_confidence), # Disagreement
+ (hrm_confidence + trm_confidence) / 2, # Average confidence
+ float(mcts_iterations > 0), # MCTS active
+ float(consensus_score > 0.7), # High consensus
+ float(rag_retrieved > 0), # RAG used
+ ]
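+
+
+# Example (illustrative): building the feature vector a meta-controller
+# (RNN/BERT router) might consume when choosing an agent.
+#
+#     extractor = MetaControllerFeatureExtractor()
+#     q_feats = extractor.extract_query_features("How would APT29 escalate privileges?")
+#     state = extractor.extract_agent_state_features(
+#         hrm_confidence=0.8, trm_confidence=0.6, mcts_iterations=100,
+#         consensus_score=0.75, rag_retrieved=4,
+#     )
+#     assert len(state) == 10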
diff --git a/src/data/tactical_augmentation.py b/src/data/tactical_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..da3456a68bb491c1bd36c9f5df449610cd67cdcd
--- /dev/null
+++ b/src/data/tactical_augmentation.py
@@ -0,0 +1,484 @@
+"""
+Tactical Data Augmentation Module.
+
+Provides domain-specific data augmentation techniques for:
+- Cybersecurity threat scenarios
+- Military tactical situations
+- Multi-step reasoning problems
+
+These augmentations help increase training data diversity and improve
+model robustness for tactical analysis tasks.
+"""
+
+import logging
+import random
+import re
+from dataclasses import dataclass
+
+from .dataset_loader import DatasetSample
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AugmentationResult:
+ """Result of data augmentation."""
+
+ original: DatasetSample
+ augmented: list[DatasetSample]
+ augmentation_types: list[str]
+
+
+class TacticalAugmenter:
+ """
+ Domain-specific data augmentation for tactical analysis.
+
+ Augmentation techniques:
+ - Paraphrasing tactical scenarios
+ - Varying urgency levels
+ - Adding/removing constraints
+ - Scenario parameter variation
+ - Threat actor substitution
+ - Temporal shifting
+ """
+
+ # Tactical scenario templates
+ URGENCY_MODIFIERS = {
+ "high": ["IMMEDIATE", "CRITICAL", "URGENT", "TIME-SENSITIVE"],
+ "medium": ["PRIORITY", "IMPORTANT", "ATTENTION REQUIRED"],
+ "low": ["ROUTINE", "STANDARD", "WHEN POSSIBLE"],
+ }
+
+ THREAT_ACTORS = [
+ "APT28",
+ "APT29",
+ "Lazarus Group",
+ "Cozy Bear",
+ "Fancy Bear",
+ "Unknown Actor",
+ "Nation-State Actor",
+ "Criminal Organization",
+ ]
+
+ ATTACK_VECTORS = [
+ "phishing",
+ "spear-phishing",
+ "watering hole",
+ "supply chain compromise",
+ "zero-day exploit",
+ "credential stuffing",
+ "brute force",
+ "social engineering",
+ ]
+
+ MILITARY_OBJECTIVES = [
+ "secure perimeter",
+ "establish forward position",
+ "conduct reconnaissance",
+ "neutralize threat",
+ "protect assets",
+ "maintain operational security",
+ "coordinate with allied forces",
+ "execute tactical withdrawal",
+ ]
+
+ ENVIRONMENTAL_CONDITIONS = [
+ "night operations",
+ "adverse weather",
+ "limited visibility",
+ "urban terrain",
+ "mountainous region",
+ "coastal area",
+ "contested airspace",
+ "electronic warfare environment",
+ ]
+
+ def __init__(self, seed: int = 42):
+ """
+ Initialize augmenter.
+
+ Args:
+ seed: Random seed for reproducibility
+ """
+ self.rng = random.Random(seed)
+ self._augmentation_count = 0
+
+ def augment_sample(
+ self,
+ sample: DatasetSample,
+ num_augmentations: int = 3,
+ techniques: list[str] | None = None,
+ ) -> AugmentationResult:
+ """
+ Augment a single sample.
+
+ Args:
+ sample: Original dataset sample
+ num_augmentations: Number of augmented versions to create
+ techniques: Specific techniques to use (None for random selection)
+
+ Returns:
+ AugmentationResult with augmented samples
+ """
+ available_techniques = [
+ "urgency_variation",
+ "parameter_substitution",
+ "constraint_addition",
+ "temporal_shift",
+ "perspective_change",
+ ]
+
+ if techniques:
+ available_techniques = [t for t in techniques if t in available_techniques]
+ if not available_techniques:
+ raise ValueError(f"No valid augmentation techniques in {techniques}")
+
+ augmented_samples = []
+ used_techniques = []
+
+ for _i in range(num_augmentations):
+ technique = self.rng.choice(available_techniques)
+ used_techniques.append(technique)
+
+ augmented_text = self._apply_technique(sample.text, sample.domain, technique)
+
+ aug_sample = DatasetSample(
+ id=f"{sample.id}_aug_{self._augmentation_count}",
+ text=augmented_text,
+ metadata={
+ **sample.metadata,
+ "augmentation": technique,
+ "original_id": sample.id,
+ },
+ labels=sample.labels,
+ difficulty=sample.difficulty,
+ domain=sample.domain,
+ reasoning_steps=sample.reasoning_steps,
+ )
+
+ augmented_samples.append(aug_sample)
+ self._augmentation_count += 1
+
+ return AugmentationResult(
+ original=sample,
+ augmented=augmented_samples,
+ augmentation_types=used_techniques,
+ )
+
+ def _apply_technique(self, text: str, domain: str | None, technique: str) -> str:
+ """Apply specific augmentation technique."""
+ if technique == "urgency_variation":
+ return self._augment_urgency(text)
+ elif technique == "parameter_substitution":
+ return self._augment_parameters(text, domain)
+ elif technique == "constraint_addition":
+ return self._augment_constraints(text, domain)
+ elif technique == "temporal_shift":
+ return self._augment_temporal(text)
+ elif technique == "perspective_change":
+ return self._augment_perspective(text, domain)
+ else:
+ return text
+
+ def _augment_urgency(self, text: str) -> str:
+ """Vary urgency level in the text."""
+ urgency_level = self.rng.choice(list(self.URGENCY_MODIFIERS.keys()))
+ modifier = self.rng.choice(self.URGENCY_MODIFIERS[urgency_level])
+
+ # Add urgency prefix
+ if urgency_level == "high":
+ return f"[{modifier}] {text}"
+ elif urgency_level == "medium":
+ return f"{modifier}: {text}"
+ else:
+ return f"({modifier}) {text}"
+
+ def _augment_parameters(self, text: str, domain: str | None) -> str:
+ """Substitute domain-specific parameters."""
+ if domain == "cybersecurity" or "cyber" in text.lower():
+ # Substitute threat actors
+ for actor in self.THREAT_ACTORS:
+ if actor in text:
+ new_actor = self.rng.choice([a for a in self.THREAT_ACTORS if a != actor])
+ text = text.replace(actor, new_actor)
+ break
+
+ # Substitute attack vectors
+ for vector in self.ATTACK_VECTORS:
+ if vector in text.lower():
+ new_vector = self.rng.choice([v for v in self.ATTACK_VECTORS if v != vector])
+ # Replace case-insensitively so capitalized mentions are substituted too.
+ text = re.sub(re.escape(vector), new_vector, text, flags=re.IGNORECASE)
+ break
+
+ elif domain == "military" or any(kw in text.lower() for kw in ["tactical", "military", "reconnaissance"]):
+ # Substitute objectives
+ for obj in self.MILITARY_OBJECTIVES:
+ if obj in text.lower():
+ new_obj = self.rng.choice([o for o in self.MILITARY_OBJECTIVES if o != obj])
+ # Replace case-insensitively, matching the lowercase membership test above.
+ text = re.sub(re.escape(obj), new_obj, text, flags=re.IGNORECASE)
+ break
+
+ return text
+
+ def _augment_constraints(self, text: str, domain: str | None) -> str:
+ """Add additional constraints to the scenario."""
+ constraints = []
+
+ if domain == "cybersecurity":
+ constraints = [
+ "with limited network visibility",
+ "under active attack",
+ "with compromised credentials",
+ "during maintenance window",
+ "with restricted access to logs",
+ ]
+ elif domain == "military":
+ constraints = [
+ "with limited ammunition",
+ "under communication blackout",
+ "with reduced personnel",
+ "in contested environment",
+ "with time constraint of 2 hours",
+ ]
+ else:
+ constraints = [
+ "with incomplete information",
+ "under time pressure",
+ "with resource constraints",
+ "considering multiple stakeholders",
+ "with conflicting objectives",
+ ]
+
+ if constraints:
+ constraint = self.rng.choice(constraints)
+ return f"{text} [{constraint}]"
+
+ return text
+
+ def _augment_temporal(self, text: str) -> str:
+ """Shift temporal context."""
+ temporal_contexts = [
+ "In the past 24 hours, ",
+ "Over the next week, ",
+ "Immediately, ",
+ "During the upcoming operation, ",
+ "Following initial assessment, ",
+ ]
+
+ context = self.rng.choice(temporal_contexts)
+ return f"{context}{text.lower()}" if text else text
+
+ def _augment_perspective(self, text: str, domain: str | None) -> str:
+ """Change analytical perspective."""
+ perspectives = {
+ "cybersecurity": [
+ "From a threat hunter's perspective: ",
+ "Considering the attacker's viewpoint: ",
+ "For incident response purposes: ",
+ "From a risk management standpoint: ",
+ ],
+ "military": [
+ "From the commander's perspective: ",
+ "Considering enemy capabilities: ",
+ "For tactical planning purposes: ",
+ "From a logistics standpoint: ",
+ ],
+ "default": [
+ "From an analytical perspective: ",
+ "Considering all factors: ",
+ "For decision-making purposes: ",
+ "From a strategic viewpoint: ",
+ ],
+ }
+
+ domain_perspectives = perspectives.get(domain or "default", perspectives["default"])
+ perspective = self.rng.choice(domain_perspectives)
+
+ return f"{perspective}{text}"
+
+ def augment_batch(
+ self,
+ samples: list[DatasetSample],
+ augmentations_per_sample: int = 2,
+ ) -> list[DatasetSample]:
+ """
+ Augment a batch of samples.
+
+ Args:
+ samples: List of original samples
+ augmentations_per_sample: Number of augmentations per sample
+
+ Returns:
+ List of all samples (original + augmented)
+ """
+ all_samples = list(samples) # Keep originals
+
+ for sample in samples:
+ result = self.augment_sample(sample, num_augmentations=augmentations_per_sample)
+ all_samples.extend(result.augmented)
+
+ logger.info(
+ f"Augmented {len(samples)} samples to {len(all_samples)} (+{len(all_samples) - len(samples)} augmented)"
+ )
+
+ return all_samples
+
+ def create_tactical_scenarios(self, base_samples: list[DatasetSample]) -> list[DatasetSample]:
+ """
+ Create tactical scenario variations from base samples.
+
+ Combines multiple augmentation techniques to create
+ diverse tactical scenarios for training.
+
+ Args:
+ base_samples: Base dataset samples
+
+ Returns:
+ Extended list with tactical scenario variations
+ """
+ scenarios = list(base_samples)
+
+ for sample in base_samples:
+ # Create high-stakes variant
+ high_stakes = self._augment_urgency(sample.text)
+ high_stakes = self._augment_constraints(high_stakes, sample.domain)
+ scenarios.append(
+ DatasetSample(
+ id=f"{sample.id}_highstakes_{self._augmentation_count}",
+ text=high_stakes,
+ metadata={
+ **sample.metadata,
+ "scenario_type": "high_stakes",
+ "original_id": sample.id,
+ },
+ labels=sample.labels,
+ difficulty="hard", # High stakes scenarios are harder
+ domain=sample.domain,
+ reasoning_steps=sample.reasoning_steps,
+ )
+ )
+ self._augmentation_count += 1
+
+ # Create multi-perspective variant
+ if self.rng.random() > 0.5:
+ multi_perspective = self._augment_perspective(sample.text, sample.domain)
+ scenarios.append(
+ DatasetSample(
+ id=f"{sample.id}_multiperspective_{self._augmentation_count}",
+ text=multi_perspective,
+ metadata={
+ **sample.metadata,
+ "scenario_type": "multi_perspective",
+ "original_id": sample.id,
+ },
+ labels=sample.labels,
+ difficulty=sample.difficulty,
+ domain=sample.domain,
+ reasoning_steps=sample.reasoning_steps,
+ )
+ )
+ self._augmentation_count += 1
+
+ logger.info(f"Created {len(scenarios) - len(base_samples)} tactical scenarios")
+ return scenarios
+
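+# Example (illustrative): generating two augmented variants per sample with a
+# fixed seed for reproducibility.
+#
+#     augmenter = TacticalAugmenter(seed=7)
+#     expanded = augmenter.augment_batch(samples, augmentations_per_sample=2)
+#     # len(expanded) == 3 * len(samples): originals plus two variants each
+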
+
+class CyberSecurityAugmenter(TacticalAugmenter):
+ """
+ Specialized augmenter for cybersecurity scenarios.
+
+ Focuses on:
+ - MITRE ATT&CK technique variations
+ - Threat intelligence context
+ - Incident response scenarios
+ """
+
+ MITRE_TACTICS = [
+ "Initial Access",
+ "Execution",
+ "Persistence",
+ "Privilege Escalation",
+ "Defense Evasion",
+ "Credential Access",
+ "Discovery",
+ "Lateral Movement",
+ "Collection",
+ "Exfiltration",
+ "Impact",
+ ]
+
+ SEVERITY_LEVELS = ["LOW", "MEDIUM", "HIGH", "CRITICAL"]
+
+ def augment_with_mitre_context(self, sample: DatasetSample) -> DatasetSample:
+ """
+ Add MITRE ATT&CK context to sample.
+
+ Args:
+ sample: Original sample
+
+ Returns:
+ Augmented sample with MITRE context
+ """
+ tactic = self.rng.choice(self.MITRE_TACTICS)
+ severity = self.rng.choice(self.SEVERITY_LEVELS)
+
+ augmented_text = f"[MITRE ATT&CK: {tactic}] [Severity: {severity}] {sample.text}"
+
+ return DatasetSample(
+ id=f"{sample.id}_mitre_{self._augmentation_count}",
+ text=augmented_text,
+ metadata={
+ **sample.metadata,
+ "mitre_tactic": tactic,
+ "severity": severity,
+ },
+ labels=sample.labels,
+ difficulty=sample.difficulty,
+ domain="cybersecurity",
+ reasoning_steps=sample.reasoning_steps,
+ )
+
+
+class MilitaryTacticalAugmenter(TacticalAugmenter):
+ """
+ Specialized augmenter for military tactical scenarios.
+
+ Focuses on:
+ - Environmental condition variations
+ - Force composition changes
+ - Mission objective variations
+ """
+
+ FORCE_COMPOSITIONS = [
+ "infantry platoon",
+ "mechanized company",
+ "special operations team",
+ "combined arms battalion",
+ "air assault element",
+ ]
+
+ def augment_with_force_composition(self, sample: DatasetSample) -> DatasetSample:
+ """
+ Add force composition context to sample.
+
+ Args:
+ sample: Original sample
+
+ Returns:
+ Augmented sample with force composition
+ """
+ force = self.rng.choice(self.FORCE_COMPOSITIONS)
+ condition = self.rng.choice(self.ENVIRONMENTAL_CONDITIONS)
+
+ augmented_text = f"[Force: {force}] [Conditions: {condition}] {sample.text}"
+
+ return DatasetSample(
+ id=f"{sample.id}_tactical_{self._augmentation_count}",
+ text=augmented_text,
+ metadata={
+ **sample.metadata,
+ "force_composition": force,
+ "environmental_conditions": condition,
+ },
+ labels=sample.labels,
+ difficulty=sample.difficulty,
+ domain="military",
+ reasoning_steps=sample.reasoning_steps,
+ )
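+
+
+# Example (illustrative): the specialized augmenters prepend structured
+# context to a sample's text.
+#
+#     cyber = CyberSecurityAugmenter(seed=1)
+#     variant = cyber.augment_with_mitre_context(sample)
+#     # variant.text begins with e.g. "[MITRE ATT&CK: Discovery] [Severity: HIGH] ..."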
diff --git a/src/data/train_test_split.py b/src/data/train_test_split.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6a8c59e2e7c9ed879db8eea5d79a7a50151eabf
--- /dev/null
+++ b/src/data/train_test_split.py
@@ -0,0 +1,505 @@
+"""
+Data Splitting Module for Training Pipeline.
+
+Provides utilities for:
+- Train/validation/test splitting
+- Stratified sampling by domain or difficulty
+- Cross-validation fold creation
+- Reproducible splits with seeding
+"""
+
+import logging
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any
+
+from .dataset_loader import DatasetSample
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DataSplit:
+ """Result of dataset splitting."""
+
+ train: list[DatasetSample]
+ validation: list[DatasetSample]
+ test: list[DatasetSample]
+ split_info: dict[str, Any]
+
+
+@dataclass
+class CrossValidationFold:
+ """Single fold for cross-validation."""
+
+ fold_id: int
+ train: list[DatasetSample]
+ validation: list[DatasetSample]
+
+
+class DataSplitter:
+ """
+ Basic dataset splitter with random sampling.
+
+ Provides reproducible train/validation/test splits
+ with configurable ratios.
+ """
+
+ def __init__(self, seed: int = 42):
+ """
+ Initialize splitter.
+
+ Args:
+ seed: Random seed for reproducibility
+ """
+ self.seed = seed
+ import random
+
+ self.rng = random.Random(seed)
+
+ def split(
+ self,
+ samples: list[DatasetSample],
+ train_ratio: float = 0.7,
+ val_ratio: float = 0.15,
+ test_ratio: float = 0.15,
+ shuffle: bool = True,
+ ) -> DataSplit:
+ """
+ Split dataset into train/validation/test sets.
+
+ Args:
+ samples: List of all samples
+ train_ratio: Proportion for training (default 0.7)
+ val_ratio: Proportion for validation (default 0.15)
+ test_ratio: Proportion for testing (default 0.15)
+ shuffle: Whether to shuffle before splitting
+
+ Returns:
+ DataSplit with train, validation, and test sets
+ """
+ if abs(train_ratio + val_ratio + test_ratio - 1.0) > 0.001:
+ raise ValueError("Ratios must sum to 1.0")
+
+ if not samples:
+ raise ValueError("Cannot split empty sample list")
+
+ # Copy and optionally shuffle
+ all_samples = list(samples)
+ if shuffle:
+ self.rng.shuffle(all_samples)
+
+ n = len(all_samples)
+ train_end = int(n * train_ratio)
+ val_end = train_end + int(n * val_ratio)
+
+ train_samples = all_samples[:train_end]
+ val_samples = all_samples[train_end:val_end]
+ test_samples = all_samples[val_end:]
+
+ split_info = {
+ "total_samples": n,
+ "train_samples": len(train_samples),
+ "val_samples": len(val_samples),
+ "test_samples": len(test_samples),
+ "train_ratio": len(train_samples) / n,
+ "val_ratio": len(val_samples) / n,
+ "test_ratio": len(test_samples) / n,
+ "seed": self.seed,
+ "shuffled": shuffle,
+ }
+
+ logger.info(f"Split {n} samples: train={len(train_samples)}, val={len(val_samples)}, test={len(test_samples)}")
+
+ return DataSplit(
+ train=train_samples,
+ validation=val_samples,
+ test=test_samples,
+ split_info=split_info,
+ )
+
+ def create_k_folds(
+ self,
+ samples: list[DatasetSample],
+ k: int = 5,
+ shuffle: bool = True,
+ ) -> list[CrossValidationFold]:
+ """
+ Create k-fold cross-validation splits.
+
+ Args:
+ samples: List of all samples
+ k: Number of folds
+ shuffle: Whether to shuffle before splitting
+
+ Returns:
+ List of CrossValidationFold objects
+ """
+ if k < 2:
+ raise ValueError("k must be at least 2")
+
+ if len(samples) < k:
+ raise ValueError(f"Need at least {k} samples for {k}-fold CV")
+
+ # Copy and optionally shuffle
+ all_samples = list(samples)
+ if shuffle:
+ self.rng.shuffle(all_samples)
+
+ # Calculate fold sizes
+ fold_size = len(all_samples) // k
+ folds = []
+
+ for fold_id in range(k):
+ # Validation is the current fold
+ val_start = fold_id * fold_size
+ val_end = len(all_samples) if fold_id == k - 1 else val_start + fold_size
+
+ val_samples = all_samples[val_start:val_end]
+ train_samples = all_samples[:val_start] + all_samples[val_end:]
+
+ folds.append(
+ CrossValidationFold(
+ fold_id=fold_id,
+ train=train_samples,
+ validation=val_samples,
+ )
+ )
+
+ logger.info(f"Created {k}-fold cross-validation splits")
+ return folds
+
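+# Example (illustrative): a reproducible 70/15/15 split plus 5-fold CV.
+#
+#     splitter = DataSplitter(seed=42)
+#     split = splitter.split(samples)             # defaults: 0.7/0.15/0.15
+#     folds = splitter.create_k_folds(samples, k=5)
+#     print(split.split_info["train_samples"])
+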
+
+class StratifiedSplitter(DataSplitter):
+ """
+ Stratified dataset splitter.
+
+ Ensures proportional representation of categories
+ (domain, difficulty, etc.) across splits.
+ """
+
+ def __init__(self, seed: int = 42, stratify_by: str = "domain"):
+ """
+ Initialize stratified splitter.
+
+ Args:
+ seed: Random seed for reproducibility
+ stratify_by: Attribute to stratify on ('domain', 'difficulty', 'labels')
+ """
+ super().__init__(seed)
+ self.stratify_by = stratify_by
+
+ def split(
+ self,
+ samples: list[DatasetSample],
+ train_ratio: float = 0.7,
+ val_ratio: float = 0.15,
+ test_ratio: float = 0.15,
+ shuffle: bool = True,
+ ) -> DataSplit:
+ """
+ Stratified split maintaining category proportions.
+
+ Args:
+ samples: List of all samples
+ train_ratio: Proportion for training
+ val_ratio: Proportion for validation
+ test_ratio: Proportion for testing
+ shuffle: Whether to shuffle before splitting
+
+ Returns:
+ DataSplit with stratified train, validation, and test sets
+ """
+ if abs(train_ratio + val_ratio + test_ratio - 1.0) > 0.001:
+ raise ValueError("Ratios must sum to 1.0")
+
+ if not samples:
+ raise ValueError("Cannot split empty sample list")
+
+ # Group samples by stratification key
+ groups = defaultdict(list)
+ for sample in samples:
+ key = self._get_stratify_key(sample)
+ groups[key].append(sample)
+
+ # Split each group proportionally
+ train_samples = []
+ val_samples = []
+ test_samples = []
+
+ for _key, group_samples in groups.items():
+ if shuffle:
+ self.rng.shuffle(group_samples)
+
+ n = len(group_samples)
+ train_end = int(n * train_ratio)
+ val_end = train_end + int(n * val_ratio)
+
+ train_samples.extend(group_samples[:train_end])
+ val_samples.extend(group_samples[train_end:val_end])
+ test_samples.extend(group_samples[val_end:])
+
+ # Final shuffle of combined sets
+ if shuffle:
+ self.rng.shuffle(train_samples)
+ self.rng.shuffle(val_samples)
+ self.rng.shuffle(test_samples)
+
+ # Verify stratification
+ stratify_info = self._verify_stratification(train_samples, val_samples, test_samples)
+
+ split_info = {
+ "total_samples": len(samples),
+ "train_samples": len(train_samples),
+ "val_samples": len(val_samples),
+ "test_samples": len(test_samples),
+ "train_ratio": len(train_samples) / len(samples),
+ "val_ratio": len(val_samples) / len(samples),
+ "test_ratio": len(test_samples) / len(samples),
+ "stratify_by": self.stratify_by,
+ "stratification_info": stratify_info,
+ "seed": self.seed,
+ "shuffled": shuffle,
+ }
+
+ logger.info(
+ f"Stratified split ({self.stratify_by}): "
+ f"train={len(train_samples)}, val={len(val_samples)}, "
+ f"test={len(test_samples)}"
+ )
+
+ return DataSplit(
+ train=train_samples,
+ validation=val_samples,
+ test=test_samples,
+ split_info=split_info,
+ )
+
+ def _get_stratify_key(self, sample: DatasetSample) -> str:
+ """Get stratification key for a sample."""
+ if self.stratify_by == "domain":
+ return sample.domain or "unknown"
+ elif self.stratify_by == "difficulty":
+ return sample.difficulty or "unknown"
+ elif self.stratify_by == "labels":
+ return ",".join(sorted(sample.labels)) if sample.labels else "unknown"
+ else:
+ return str(getattr(sample, self.stratify_by, "unknown"))
+
+ def _verify_stratification(
+ self,
+ train: list[DatasetSample],
+ val: list[DatasetSample],
+ test: list[DatasetSample],
+ ) -> dict[str, dict[str, float]]:
+ """
+ Verify that stratification was successful.
+
+ Returns dictionary showing distribution of stratification key
+ across train/val/test splits.
+ """
+
+ def get_distribution(samples: list[DatasetSample]) -> dict[str, float]:
+ if not samples:
+ return {}
+ counts = defaultdict(int)
+ for sample in samples:
+ key = self._get_stratify_key(sample)
+ counts[key] += 1
+ total = len(samples)
+ return {k: v / total for k, v in counts.items()}
+
+ return {
+ "train": get_distribution(train),
+ "validation": get_distribution(val),
+ "test": get_distribution(test),
+ }
+
+ def create_stratified_k_folds(
+ self,
+ samples: list[DatasetSample],
+ k: int = 5,
+ shuffle: bool = True,
+ ) -> list[CrossValidationFold]:
+ """
+ Create stratified k-fold cross-validation splits.
+
+ Args:
+ samples: List of all samples
+ k: Number of folds
+ shuffle: Whether to shuffle before splitting
+
+ Returns:
+ List of CrossValidationFold objects with stratification
+ """
+ if k < 2:
+ raise ValueError("k must be at least 2")
+
+ # Group samples by stratification key
+ groups = defaultdict(list)
+ for sample in samples:
+ key = self._get_stratify_key(sample)
+ groups[key].append(sample)
+
+ # Initialize folds
+ folds_data = [{"train": [], "val": []} for _ in range(k)]
+
+ # Distribute each group across folds
+ for _key, group_samples in groups.items():
+ if shuffle:
+ self.rng.shuffle(group_samples)
+
+ # Assign samples to folds
+ fold_size = len(group_samples) // k
+ for fold_id in range(k):
+ val_start = fold_id * fold_size
+ val_end = len(group_samples) if fold_id == k - 1 else val_start + fold_size
+
+ for i, sample in enumerate(group_samples):
+ if val_start <= i < val_end:
+ folds_data[fold_id]["val"].append(sample)
+ else:
+ folds_data[fold_id]["train"].append(sample)
+
+ # Create fold objects
+ folds = [
+ CrossValidationFold(
+ fold_id=i,
+ train=data["train"],
+ validation=data["val"],
+ )
+ for i, data in enumerate(folds_data)
+ ]
+
+ logger.info(f"Created stratified {k}-fold cross-validation splits")
+ return folds
+
+
+class BalancedSampler:
+ """
+ Balanced sampling for imbalanced datasets.
+
+ Provides utilities for:
+ - Oversampling minority classes
+ - Undersampling majority classes
+    - Inspecting class distributions
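+
+    Usage sketch (illustrative; assumes an imbalanced ``samples`` list):
+
+        sampler = BalancedSampler(seed=42)
+        print(sampler.get_class_distribution(samples, target_key="domain"))
+        balanced = sampler.oversample_minority(samples, target_key="domain")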
+ """
+
+ def __init__(self, seed: int = 42):
+ """Initialize balanced sampler."""
+ self.seed = seed
+ import random
+
+ self.rng = random.Random(seed)
+
+ def oversample_minority(
+ self,
+ samples: list[DatasetSample],
+ target_key: str = "domain",
+ target_ratio: float = 1.0,
+ ) -> list[DatasetSample]:
+ """
+ Oversample minority classes to balance dataset.
+
+ Args:
+ samples: Original samples
+ target_key: Attribute to balance on
+ target_ratio: Target ratio relative to majority (1.0 = equal)
+
+ Returns:
+ Balanced sample list (originals + oversampled)
+ """
+ # Group by target key
+ groups = defaultdict(list)
+ for sample in samples:
+ key = getattr(sample, target_key, "unknown") or "unknown"
+ groups[key].append(sample)
+
+ # Find majority class size
+ max_count = max(len(g) for g in groups.values())
+ target_count = int(max_count * target_ratio)
+
+ # Oversample minority classes
+ balanced = []
+ for _key, group in groups.items():
+ balanced.extend(group)
+
+ # Oversample if needed
+ if len(group) < target_count:
+ num_to_add = target_count - len(group)
+ for _ in range(num_to_add):
+ # Randomly duplicate from group
+ original = self.rng.choice(group)
+ duplicate = DatasetSample(
+ id=f"{original.id}_oversample_{self.rng.randint(0, 999999)}",
+ text=original.text,
+ metadata={**original.metadata, "oversampled": True},
+ labels=original.labels,
+ difficulty=original.difficulty,
+ domain=original.domain,
+ reasoning_steps=original.reasoning_steps,
+ )
+ balanced.append(duplicate)
+
+ logger.info(f"Oversampled from {len(samples)} to {len(balanced)} samples")
+ return balanced
+
+ def undersample_majority(
+ self,
+ samples: list[DatasetSample],
+ target_key: str = "domain",
+ target_ratio: float = 1.0,
+ ) -> list[DatasetSample]:
+ """
+ Undersample majority classes to balance dataset.
+
+ Args:
+ samples: Original samples
+ target_key: Attribute to balance on
+ target_ratio: Target ratio relative to minority (1.0 = equal)
+
+ Returns:
+ Balanced sample list (subset of originals)
+ """
+ # Group by target key
+ groups = defaultdict(list)
+ for sample in samples:
+ key = getattr(sample, target_key, "unknown") or "unknown"
+ groups[key].append(sample)
+
+ # Find minority class size
+ min_count = min(len(g) for g in groups.values())
+ target_count = int(min_count * target_ratio)
+
+ # Undersample majority classes
+ balanced = []
+ for _key, group in groups.items():
+ if len(group) > target_count:
+ # Randomly select target_count samples
+ balanced.extend(self.rng.sample(group, target_count))
+ else:
+ balanced.extend(group)
+
+ logger.info(f"Undersampled from {len(samples)} to {len(balanced)} samples")
+ return balanced
+
+ def get_class_distribution(
+ self,
+ samples: list[DatasetSample],
+ target_key: str = "domain",
+ ) -> dict[str, int]:
+ """
+ Get distribution of classes.
+
+ Args:
+ samples: Sample list
+ target_key: Attribute to analyze
+
+ Returns:
+ Dictionary of class counts
+ """
+ distribution = defaultdict(int)
+ for sample in samples:
+ key = getattr(sample, target_key, "unknown") or "unknown"
+ distribution[key] += 1
+ return dict(distribution)
diff --git a/src/framework/__init__.py b/src/framework/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a26dae4aa5e0cd67a0fdc606e779153ca0ca7b0
--- /dev/null
+++ b/src/framework/__init__.py
@@ -0,0 +1 @@
+# Framework module
diff --git a/src/framework/agents/__init__.py b/src/framework/agents/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bb70202614c10dae0c0b114b12d74eb82eba81e
--- /dev/null
+++ b/src/framework/agents/__init__.py
@@ -0,0 +1,22 @@
+# Agents module for async agent implementations
+from .base import (
+ AgentContext,
+ AgentResult,
+ AsyncAgentBase,
+ CompositeAgent,
+ MetricsCollector,
+ NoOpMetricsCollector,
+ ParallelAgent,
+ SequentialAgent,
+)
+
+__all__ = [
+ "AsyncAgentBase",
+ "AgentContext",
+ "AgentResult",
+ "MetricsCollector",
+ "NoOpMetricsCollector",
+ "CompositeAgent",
+ "ParallelAgent",
+ "SequentialAgent",
+]
diff --git a/src/framework/agents/base.py b/src/framework/agents/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f24fb6960f5c4aac9e0b19919353ce1478959e6
--- /dev/null
+++ b/src/framework/agents/base.py
@@ -0,0 +1,533 @@
+"""
+Base async agent class for Multi-Agent MCTS Framework.
+
+Provides common patterns for all agents with hook points for metrics,
+logging, and extensibility.
+"""
+
+import asyncio
+import logging
+import time
+import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any, Protocol
+
+from src.adapters.llm.base import LLMClient, LLMResponse
+
+
+@dataclass
+class AgentContext:
+ """
+ Context passed to agent during processing.
+
+ Contains all information needed for the agent to process a request.
+ """
+
+ query: str
+ session_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+ rag_context: str | None = None
+ metadata: dict = field(default_factory=dict)
+ conversation_history: list[dict] = field(default_factory=list)
+ max_iterations: int = 5
+ temperature: float = 0.7
+ additional_context: dict = field(default_factory=dict)
+
+ def to_dict(self) -> dict:
+ """Convert context to dictionary."""
+ return {
+ "query": self.query,
+ "session_id": self.session_id,
+ "rag_context": self.rag_context,
+ "metadata": self.metadata,
+ "conversation_history": self.conversation_history,
+ "max_iterations": self.max_iterations,
+ "temperature": self.temperature,
+ "additional_context": self.additional_context,
+ }
+
+
+@dataclass
+class AgentResult:
+ """
+ Result from agent processing.
+
+ Standardized result format for all agents.
+ """
+
+ response: str
+ confidence: float = 0.0
+ metadata: dict = field(default_factory=dict)
+ agent_name: str = ""
+ processing_time_ms: float = 0.0
+ token_usage: dict = field(default_factory=dict)
+ intermediate_steps: list[dict] = field(default_factory=list)
+    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+ error: str | None = None
+ success: bool = True
+
+ def to_dict(self) -> dict:
+ """Convert result to dictionary."""
+ return {
+ "response": self.response,
+ "confidence": self.confidence,
+ "metadata": self.metadata,
+ "agent_name": self.agent_name,
+ "processing_time_ms": self.processing_time_ms,
+ "token_usage": self.token_usage,
+ "intermediate_steps": self.intermediate_steps,
+ "created_at": self.created_at.isoformat(),
+ "error": self.error,
+ "success": self.success,
+ }
+
+
+class MetricsCollector(Protocol):
+ """Protocol for metrics collection."""
+
+ def record_latency(self, agent_name: str, latency_ms: float) -> None: ...
+ def record_tokens(self, agent_name: str, tokens: int) -> None: ...
+ def record_error(self, agent_name: str, error_type: str) -> None: ...
+ def record_success(self, agent_name: str) -> None: ...
+
+
+class NoOpMetricsCollector:
+ """Default no-op metrics collector."""
+
+ def record_latency(self, agent_name: str, latency_ms: float) -> None:
+ pass
+
+ def record_tokens(self, agent_name: str, tokens: int) -> None:
+ pass
+
+ def record_error(self, agent_name: str, error_type: str) -> None:
+ pass
+
+ def record_success(self, agent_name: str) -> None:
+ pass
+
+
+class AsyncAgentBase(ABC):
+ """
+ Base class for async agents in the Multi-Agent MCTS Framework.
+
+ Features:
+ - Async processing by default
+ - Hook points for metrics/logging
+ - Lifecycle management
+ - Error handling patterns
+ - Backward compatibility with existing framework
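+
+    Minimal subclass sketch (illustrative; ``EchoAgent`` is not part of the framework):
+
+        class EchoAgent(AsyncAgentBase):
+            async def _process_impl(self, context: AgentContext) -> AgentResult:
+                return AgentResult(response=context.query, confidence=1.0)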
+ """
+
+ def __init__(
+ self,
+ model_adapter: LLMClient,
+ logger: Any = None,
+ name: str | None = None,
+ metrics_collector: MetricsCollector | None = None,
+ **config: Any,
+ ):
+ """
+ Initialize async agent.
+
+ Args:
+ model_adapter: LLM client for generating responses
+ logger: Logger instance (uses standard logging if None)
+ name: Agent name (uses class name if None)
+ metrics_collector: Optional metrics collector
+ **config: Additional configuration parameters
+ """
+ self.model_adapter = model_adapter
+ self.logger = logger or logging.getLogger(self.__class__.__name__)
+ self.name = name or self.__class__.__name__
+ self.metrics = metrics_collector or NoOpMetricsCollector()
+ self.config = config
+
+ # Runtime state
+ self._request_count = 0
+ self._total_processing_time = 0.0
+ self._error_count = 0
+ self._initialized = False
+
+ async def initialize(self) -> None:
+ """
+ Initialize agent resources.
+
+ Override this method to perform async initialization tasks
+ like loading prompts, setting up connections, etc.
+ """
+ self._initialized = True
+ self.logger.info(f"Agent {self.name} initialized")
+
+ async def shutdown(self) -> None:
+ """
+ Clean up agent resources.
+
+ Override this method to perform cleanup tasks.
+ """
+ self._initialized = False
+ self.logger.info(f"Agent {self.name} shutdown")
+
+ async def __aenter__(self):
+ """Async context manager entry."""
+ await self.initialize()
+ return self
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit."""
+ await self.shutdown()
+
+ # Hook points for subclasses
+ async def pre_process(self, context: AgentContext) -> AgentContext:
+ """
+ Hook called before processing.
+
+ Override to modify context or perform pre-processing.
+
+ Args:
+ context: Agent context
+
+ Returns:
+ Potentially modified context
+ """
+ return context
+
+ async def post_process(self, _context: AgentContext, result: AgentResult) -> AgentResult:
+ """
+ Hook called after processing.
+
+ Override to modify result or perform post-processing.
+
+ Args:
+            _context: Agent context (unused by the default implementation)
+ result: Agent result
+
+ Returns:
+ Potentially modified result
+ """
+ return result
+
+ async def on_error(self, _context: AgentContext, error: Exception) -> AgentResult:
+ """
+ Hook called when processing fails.
+
+ Override to customize error handling.
+
+ Args:
+            _context: Agent context (unused by the default implementation)
+ error: The exception that occurred
+
+ Returns:
+ Error result
+ """
+ self.logger.error(f"Agent {self.name} error: {error}")
+ self._error_count += 1
+ self.metrics.record_error(self.name, type(error).__name__)
+
+ return AgentResult(
+ response="",
+ confidence=0.0,
+ agent_name=self.name,
+ error=str(error),
+ success=False,
+ )
+
+ @abstractmethod
+ async def _process_impl(self, context: AgentContext) -> AgentResult:
+ """
+ Core processing logic to be implemented by subclasses.
+
+ Args:
+ context: Agent context with all necessary information
+
+ Returns:
+ AgentResult with response and metadata
+ """
+ pass
+
+ async def process(
+ self,
+ query: str | None = None,
+ context: AgentContext | None = None,
+ *,
+ rag_context: str | None = None,
+ **kwargs: Any,
+ ) -> dict:
+ """
+ Process a query and return structured response.
+
+ This method provides backward compatibility with the existing
+ LangGraphMultiAgentFramework while using the new async patterns.
+
+ Args:
+ query: Query string (if not using context object)
+ context: Full context object (if not using query string)
+ rag_context: RAG context (used if query provided)
+ **kwargs: Additional parameters merged into context
+
+ Returns:
+ Dictionary with 'response' and 'metadata' keys for backward compatibility
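+
+        Example (illustrative; ``docs_text`` is a placeholder string):
+
+            result = await agent.process(query="Summarize X", rag_context=docs_text)
+            print(result["response"], result["metadata"]["confidence"])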
+ """
+ # Build context if not provided
+ if context is None:
+ if query is None:
+ raise ValueError("Either 'query' or 'context' must be provided")
+ context = AgentContext(
+ query=query,
+ rag_context=rag_context,
+ additional_context=kwargs,
+ )
+
+ # Ensure initialized
+ if not self._initialized:
+ await self.initialize()
+
+ # Track timing
+ start_time = time.perf_counter()
+
+ try:
+ # Pre-processing hook
+ context = await self.pre_process(context)
+
+ # Core processing
+ result = await self._process_impl(context)
+
+ # Calculate timing
+ elapsed_ms = (time.perf_counter() - start_time) * 1000
+ result.processing_time_ms = elapsed_ms
+ result.agent_name = self.name
+
+ # Update stats
+ self._request_count += 1
+ self._total_processing_time += elapsed_ms
+ self.metrics.record_latency(self.name, elapsed_ms)
+ if result.token_usage:
+ self.metrics.record_tokens(self.name, result.token_usage.get("total_tokens", 0))
+ self.metrics.record_success(self.name)
+
+ # Post-processing hook
+ result = await self.post_process(context, result)
+
+ self.logger.info(f"Agent {self.name} processed query in {elapsed_ms:.2f}ms")
+
+ except Exception as e:
+ result = await self.on_error(context, e)
+ result.processing_time_ms = (time.perf_counter() - start_time) * 1000
+
+ # Return backward-compatible format
+ return {
+ "response": result.response,
+ "metadata": {
+ **result.metadata,
+ "agent_name": result.agent_name,
+ "confidence": result.confidence,
+ "processing_time_ms": result.processing_time_ms,
+ "token_usage": result.token_usage,
+ "success": result.success,
+ "error": result.error,
+ },
+ }
+
+ @property
+ def stats(self) -> dict:
+ """Get agent statistics."""
+ return {
+ "name": self.name,
+ "request_count": self._request_count,
+ "total_processing_time_ms": self._total_processing_time,
+ "error_count": self._error_count,
+ "average_processing_time_ms": (
+ self._total_processing_time / self._request_count if self._request_count > 0 else 0.0
+ ),
+ "initialized": self._initialized,
+ }
+
+ async def generate_llm_response(
+ self,
+ prompt: str | None = None,
+ messages: list[dict] | None = None,
+ temperature: float = 0.7,
+ max_tokens: int | None = None,
+ **kwargs: Any,
+ ) -> LLMResponse:
+ """
+        Convenience method to generate a response via the model adapter.
+
+ Args:
+ prompt: Simple string prompt
+ messages: Chat messages
+ temperature: Sampling temperature
+ max_tokens: Max tokens to generate
+ **kwargs: Additional parameters
+
+ Returns:
+ LLMResponse from the model adapter
+ """
+ response = await self.model_adapter.generate(
+ prompt=prompt,
+ messages=messages,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ **kwargs,
+ )
+ return response
+
+
+class CompositeAgent(AsyncAgentBase):
+ """
+ Agent that combines multiple sub-agents.
+
+ Useful for creating complex agents from simpler building blocks.
+ """
+
+ def __init__(
+ self,
+ model_adapter: LLMClient,
+ logger: Any = None,
+ name: str = "CompositeAgent",
+ sub_agents: list[AsyncAgentBase] | None = None,
+ **config: Any,
+ ):
+ super().__init__(model_adapter, logger, name, **config)
+ self.sub_agents = sub_agents or []
+
+ def add_agent(self, agent: AsyncAgentBase) -> None:
+ """Add a sub-agent."""
+ self.sub_agents.append(agent)
+
+ async def initialize(self) -> None:
+ """Initialize all sub-agents."""
+ await super().initialize()
+ for agent in self.sub_agents:
+ await agent.initialize()
+
+ async def shutdown(self) -> None:
+ """Shutdown all sub-agents."""
+ for agent in self.sub_agents:
+ await agent.shutdown()
+ await super().shutdown()
+
+
+class ParallelAgent(CompositeAgent):
+ """
+ Execute multiple agents in parallel and aggregate results.
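+
+    Usage sketch (illustrative; ``hrm`` and ``trm`` are assumed AsyncAgentBase instances):
+
+        parallel = ParallelAgent(model_adapter, sub_agents=[hrm, trm])
+        result = await parallel.process(query="Compare options")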
+ """
+
+ async def _process_impl(self, context: AgentContext) -> AgentResult:
+ """Execute all sub-agents in parallel."""
+ if not self.sub_agents:
+ return AgentResult(
+ response="No sub-agents configured",
+ confidence=0.0,
+ agent_name=self.name,
+ )
+
+ # Run all agents concurrently
+ tasks = [agent.process(context=context) for agent in self.sub_agents]
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+
+ # Aggregate results
+ successful_results = []
+ errors = []
+
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                errors.append(f"{self.sub_agents[i].name}: {result}")
+            elif isinstance(result, dict) and result.get("metadata", {}).get("success", True):
+                successful_results.append(result)
+            elif isinstance(result, dict):
+                errors.append(
+                    f"{self.sub_agents[i].name}: {result.get('metadata', {}).get('error', 'Unknown error')}"
+                )
+
+ if not successful_results:
+ return AgentResult(
+ response=f"All sub-agents failed: {'; '.join(errors)}",
+ confidence=0.0,
+ agent_name=self.name,
+ success=False,
+ error="All sub-agents failed",
+ )
+
+ # Aggregate: highest confidence wins (simple strategy)
+ best_result = max(successful_results, key=lambda r: r.get("metadata", {}).get("confidence", 0.0))
+
+ return AgentResult(
+ response=best_result["response"],
+ confidence=best_result.get("metadata", {}).get("confidence", 0.0),
+ metadata={
+ "aggregation_method": "highest_confidence",
+ "sub_agent_results": successful_results,
+ "errors": errors,
+ },
+ agent_name=self.name,
+ )
+
+
+class SequentialAgent(CompositeAgent):
+ """
+ Execute multiple agents sequentially, passing context through each.
+ """
+
+ async def _process_impl(self, context: AgentContext) -> AgentResult:
+ """Execute sub-agents in sequence."""
+ if not self.sub_agents:
+ return AgentResult(
+ response="No sub-agents configured",
+ confidence=0.0,
+ agent_name=self.name,
+ )
+
+ current_context = context
+ intermediate_results = []
+
+ for agent in self.sub_agents:
+ result = await agent.process(context=current_context)
+
+ intermediate_results.append(
+ {
+ "agent": agent.name,
+ "result": result,
+ }
+ )
+
+ # Check for failure
+ if not result.get("metadata", {}).get("success", True):
+ return AgentResult(
+ response=result["response"],
+ confidence=result.get("metadata", {}).get("confidence", 0.0),
+ metadata={
+ "failed_at": agent.name,
+ "intermediate_results": intermediate_results,
+ },
+ agent_name=self.name,
+ success=False,
+ error=result.get("metadata", {}).get("error"),
+ )
+
+            # Update context for next agent, carrying over fields that would
+            # otherwise silently reset to their defaults
+            current_context = AgentContext(
+                query=current_context.query,
+                session_id=current_context.session_id,
+                rag_context=result["response"],  # Previous output becomes context
+                metadata={
+                    **current_context.metadata,
+                    f"{agent.name}_result": result["response"],
+                },
+                conversation_history=current_context.conversation_history,
+                max_iterations=current_context.max_iterations,
+                temperature=current_context.temperature,
+                additional_context=current_context.additional_context,
+            )
+
+ # Final result from last agent
+ final_result = intermediate_results[-1]["result"]
+
+ return AgentResult(
+ response=final_result["response"],
+ confidence=final_result.get("metadata", {}).get("confidence", 0.0),
+ metadata={
+ "pipeline": [r["agent"] for r in intermediate_results],
+ "intermediate_results": intermediate_results,
+ },
+ agent_name=self.name,
+ )
diff --git a/src/framework/graph.py b/src/framework/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..40e379cff6da001cad89905b4055c21c183aa9fe
--- /dev/null
+++ b/src/framework/graph.py
@@ -0,0 +1,915 @@
+"""
+LangGraph Integration Module - extracts graph building and integrates the new MCTS core.
+
+Provides:
+- Graph building extracted from LangGraphMultiAgentFramework
+- Integration with new deterministic MCTS core
+- Backward compatibility with original process() signature
+- Support for parallel HRM/TRM execution
+"""
+
+from __future__ import annotations
+
+import asyncio
+import operator
+import time
+from typing import Annotated, Any, NotRequired, TypedDict
+
+# LangGraph imports (these would be installed dependencies)
+try:
+ from langgraph.checkpoint.memory import MemorySaver
+ from langgraph.graph import END, StateGraph
+except ImportError:
+ # Stubs for development without LangGraph installed
+ StateGraph = None
+ END = "END"
+ MemorySaver = None
+
+# Import new MCTS modules
+from .mcts.config import ConfigPreset, MCTSConfig, create_preset_config
+from .mcts.core import MCTSEngine, MCTSNode, MCTSState
+from .mcts.experiments import ExperimentTracker
+from .mcts.policies import HybridRolloutPolicy
+
+# Neural Meta-Controller imports (optional)
+try:
+ from src.agents.meta_controller.base import (
+ AbstractMetaController,
+ MetaControllerFeatures,
+ )
+ from src.agents.meta_controller.bert_controller import BERTMetaController
+ from src.agents.meta_controller.config_loader import (
+ MetaControllerConfig,
+ MetaControllerConfigLoader,
+ )
+ from src.agents.meta_controller.rnn_controller import RNNMetaController
+
+ _META_CONTROLLER_AVAILABLE = True
+except ImportError:
+ _META_CONTROLLER_AVAILABLE = False
+ AbstractMetaController = None # type: ignore
+ MetaControllerFeatures = None # type: ignore
+ RNNMetaController = None # type: ignore
+ BERTMetaController = None # type: ignore
+ MetaControllerConfig = None # type: ignore
+ MetaControllerConfigLoader = None # type: ignore
+
+
+class AgentState(TypedDict):
+ """Shared state for LangGraph agent framework."""
+
+ # Input
+ query: str
+ use_mcts: bool
+ use_rag: bool
+
+ # RAG context
+ rag_context: NotRequired[str]
+ retrieved_docs: NotRequired[list[dict]]
+
+ # Agent results
+ hrm_results: NotRequired[dict]
+ trm_results: NotRequired[dict]
+ agent_outputs: Annotated[list[dict], operator.add]
+
+ # MCTS simulation (updated for new core)
+ mcts_root: NotRequired[Any] # MCTSNode
+ mcts_iterations: NotRequired[int]
+ mcts_best_action: NotRequired[str]
+ mcts_stats: NotRequired[dict]
+ mcts_config: NotRequired[dict]
+
+ # Evaluation
+ confidence_scores: NotRequired[dict[str, float]]
+ consensus_reached: NotRequired[bool]
+ consensus_score: NotRequired[float]
+
+ # Control flow
+ iteration: int
+ max_iterations: int
+
+ # Neural Meta-Controller (optional)
+ routing_history: NotRequired[list[dict]]
+ meta_controller_predictions: NotRequired[list[dict]]
+ last_routed_agent: NotRequired[str]
+
+ # Output
+ final_response: NotRequired[str]
+ metadata: NotRequired[dict]
+
+
+class GraphBuilder:
+ """
+ Builds and configures the LangGraph state machine for multi-agent orchestration.
+
+ Extracts graph building logic from LangGraphMultiAgentFramework for modularity.
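+
+    Usage sketch (illustrative; agent, adapter, and logger objects are assumed to exist):
+
+        builder = GraphBuilder(hrm_agent, trm_agent, model_adapter, logger)
+        workflow = builder.build_graph()
+        app = workflow.compile()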
+ """
+
+ def __init__(
+ self,
+ hrm_agent,
+ trm_agent,
+ model_adapter,
+ logger,
+ vector_store=None,
+ mcts_config: MCTSConfig | None = None,
+ top_k_retrieval: int = 5,
+ max_iterations: int = 3,
+ consensus_threshold: float = 0.75,
+ enable_parallel_agents: bool = True,
+ meta_controller_config: Any | None = None,
+ ):
+ """
+ Initialize graph builder.
+
+ Args:
+ hrm_agent: HRM agent instance
+ trm_agent: TRM agent instance
+ model_adapter: Model adapter for LLM calls
+ logger: Logger instance
+ vector_store: Optional vector store for RAG
+ mcts_config: MCTS configuration (uses balanced preset if None)
+ top_k_retrieval: Number of documents for RAG
+ max_iterations: Maximum agent iterations
+ consensus_threshold: Threshold for consensus
+ enable_parallel_agents: Run HRM/TRM in parallel
+ meta_controller_config: Optional neural meta-controller configuration
+ """
+ self.hrm_agent = hrm_agent
+ self.trm_agent = trm_agent
+ self.model_adapter = model_adapter
+ self.logger = logger
+ self.vector_store = vector_store
+ self.top_k_retrieval = top_k_retrieval
+ self.max_iterations = max_iterations
+ self.consensus_threshold = consensus_threshold
+ self.enable_parallel_agents = enable_parallel_agents
+
+ # MCTS configuration
+ self.mcts_config = mcts_config or create_preset_config(ConfigPreset.BALANCED)
+
+ # MCTS engine with deterministic behavior
+ self.mcts_engine = MCTSEngine(
+ seed=self.mcts_config.seed,
+ exploration_weight=self.mcts_config.exploration_weight,
+ progressive_widening_k=self.mcts_config.progressive_widening_k,
+ progressive_widening_alpha=self.mcts_config.progressive_widening_alpha,
+ max_parallel_rollouts=self.mcts_config.max_parallel_rollouts,
+ cache_size_limit=self.mcts_config.cache_size_limit,
+ )
+
+ # Experiment tracking
+ self.experiment_tracker = ExperimentTracker(name="langgraph_mcts")
+
+ # Neural Meta-Controller (optional)
+ self.meta_controller: Any | None = None
+ self.meta_controller_config = meta_controller_config
+ self.use_neural_routing = False
+
+ if meta_controller_config is not None:
+ self._init_meta_controller(meta_controller_config)
+
+ def build_graph(self) -> StateGraph:
+ """
+ Build LangGraph state machine.
+
+ Returns:
+ Configured StateGraph
+ """
+ if StateGraph is None:
+ raise ImportError("LangGraph not installed. Install with: pip install langgraph")
+
+ workflow = StateGraph(AgentState)
+
+ # Add nodes
+ workflow.add_node("entry", self._entry_node)
+ workflow.add_node("retrieve_context", self._retrieve_context_node)
+ workflow.add_node("route_decision", self._route_decision_node)
+ workflow.add_node("parallel_agents", self._parallel_agents_node)
+ workflow.add_node("hrm_agent", self._hrm_agent_node)
+ workflow.add_node("trm_agent", self._trm_agent_node)
+ workflow.add_node("mcts_simulator", self._mcts_simulator_node)
+ workflow.add_node("aggregate_results", self._aggregate_results_node)
+ workflow.add_node("evaluate_consensus", self._evaluate_consensus_node)
+ workflow.add_node("synthesize", self._synthesize_node)
+
+ # Define edges
+ workflow.set_entry_point("entry")
+ workflow.add_edge("entry", "retrieve_context")
+ workflow.add_edge("retrieve_context", "route_decision")
+
+ # Conditional routing
+ workflow.add_conditional_edges(
+ "route_decision",
+ self._route_to_agents,
+ {
+ "parallel": "parallel_agents",
+ "hrm": "hrm_agent",
+ "trm": "trm_agent",
+ "mcts": "mcts_simulator",
+ "aggregate": "aggregate_results",
+ },
+ )
+
+ # Parallel agents to aggregation
+ workflow.add_edge("parallel_agents", "aggregate_results")
+
+ # Sequential agent nodes
+ workflow.add_edge("hrm_agent", "aggregate_results")
+ workflow.add_edge("trm_agent", "aggregate_results")
+ workflow.add_edge("mcts_simulator", "aggregate_results")
+
+ # Aggregation to evaluation
+ workflow.add_edge("aggregate_results", "evaluate_consensus")
+
+ # Conditional consensus check
+ workflow.add_conditional_edges(
+ "evaluate_consensus",
+ self._check_consensus,
+ {
+ "synthesize": "synthesize",
+ "iterate": "route_decision",
+ },
+ )
+
+ # Synthesis to end
+ workflow.add_edge("synthesize", END)
+
+ return workflow
+
+ def _entry_node(self, state: AgentState) -> dict:
+ """Initialize state and parse query."""
+ self.logger.info(f"Entry node: {state['query'][:100]}")
+ return {
+ "iteration": 0,
+ "agent_outputs": [],
+ "mcts_config": self.mcts_config.to_dict(),
+ }
+
+ def _retrieve_context_node(self, state: AgentState) -> dict:
+ """Retrieve context from vector store using RAG."""
+ if not state.get("use_rag", True) or not self.vector_store:
+ return {"rag_context": ""}
+
+ query = state["query"]
+
+ # Retrieve documents
+ docs = self.vector_store.similarity_search(query, k=self.top_k_retrieval)
+
+ # Format context
+ context = "\n\n".join([doc.page_content for doc in docs])
+
+ self.logger.info(f"Retrieved {len(docs)} documents")
+
+ return {
+ "rag_context": context,
+ "retrieved_docs": [{"content": doc.page_content, "metadata": doc.metadata} for doc in docs],
+ }
+
+ def _route_decision_node(self, _state: AgentState) -> dict:
+ """Prepare routing decision."""
+ return {}
+
+ def _init_meta_controller(self, config: Any) -> None:
+ """
+ Initialize the neural meta-controller based on configuration.
+
+ Args:
+ config: MetaControllerConfig or dict with configuration
+ """
+ if not _META_CONTROLLER_AVAILABLE:
+ self.logger.warning("Meta-controller modules not available. Falling back to rule-based routing.")
+ return
+
+ try:
+ # Handle both config object and dict
+ mc_config = MetaControllerConfigLoader.load_from_dict(config) if isinstance(config, dict) else config
+
+ if not mc_config.enabled:
+ self.logger.info("Neural meta-controller disabled in config")
+ return
+
+ # Initialize based on type
+ if mc_config.type == "rnn":
+ self.meta_controller = RNNMetaController(
+ name="GraphBuilder_RNN",
+ seed=mc_config.inference.seed,
+ hidden_dim=mc_config.rnn.hidden_dim,
+ num_layers=mc_config.rnn.num_layers,
+ dropout=mc_config.rnn.dropout,
+ device=mc_config.inference.device,
+ )
+ # Load trained model if path specified
+ if mc_config.rnn.model_path:
+ self.meta_controller.load_model(mc_config.rnn.model_path)
+ self.logger.info(f"Loaded RNN model from {mc_config.rnn.model_path}")
+
+ elif mc_config.type == "bert":
+ self.meta_controller = BERTMetaController(
+ name="GraphBuilder_BERT",
+ seed=mc_config.inference.seed,
+ model_name=mc_config.bert.model_name,
+ lora_r=mc_config.bert.lora_r,
+ lora_alpha=mc_config.bert.lora_alpha,
+ lora_dropout=mc_config.bert.lora_dropout,
+ device=mc_config.inference.device,
+ use_lora=mc_config.bert.use_lora,
+ )
+ # Load trained model if path specified
+ if mc_config.bert.model_path:
+ self.meta_controller.load_model(mc_config.bert.model_path)
+ self.logger.info(f"Loaded BERT model from {mc_config.bert.model_path}")
+ else:
+ raise ValueError(f"Unknown meta-controller type: {mc_config.type}")
+
+ self.use_neural_routing = True
+ self.logger.info(f"Initialized {mc_config.type.upper()} neural meta-controller")
+
+ except Exception as e:
+ self.logger.error(f"Failed to initialize meta-controller: {e}")
+ if hasattr(config, "fallback_to_rule_based") and config.fallback_to_rule_based:
+ self.logger.warning("Falling back to rule-based routing")
+ else:
+ raise
+
+ def _extract_meta_controller_features(self, state: AgentState) -> Any:
+ """
+ Extract features from AgentState for meta-controller prediction.
+
+ Args:
+ state: Current agent state
+
+ Returns:
+ MetaControllerFeatures instance
+ """
+ if not _META_CONTROLLER_AVAILABLE or MetaControllerFeatures is None:
+ return None
+
+ # Extract HRM confidence
+ hrm_conf = 0.0
+ if "hrm_results" in state:
+ hrm_conf = state["hrm_results"].get("metadata", {}).get("decomposition_quality_score", 0.5)
+
+ # Extract TRM confidence
+ trm_conf = 0.0
+ if "trm_results" in state:
+ trm_conf = state["trm_results"].get("metadata", {}).get("final_quality_score", 0.5)
+
+ # Extract MCTS value
+ mcts_val = 0.0
+ if "mcts_stats" in state:
+ mcts_val = state["mcts_stats"].get("best_action_value", 0.5)
+
+ # Consensus score
+ consensus = state.get("consensus_score", 0.0)
+
+ # Last agent used
+ last_agent = state.get("last_routed_agent", "none")
+
+ # Iteration
+ iteration = state.get("iteration", 0)
+
+ # Query length
+ query_length = len(state.get("query", ""))
+
+ # Has RAG context
+ has_rag = bool(state.get("rag_context", ""))
+
+ return MetaControllerFeatures(
+ hrm_confidence=hrm_conf,
+ trm_confidence=trm_conf,
+ mcts_value=mcts_val,
+ consensus_score=consensus,
+ last_agent=last_agent,
+ iteration=iteration,
+ query_length=query_length,
+ has_rag_context=has_rag,
+ )
+
+ def _neural_route_decision(self, state: AgentState) -> str:
+ """
+ Make routing decision using neural meta-controller.
+
+ Args:
+ state: Current agent state
+
+ Returns:
+ Route decision string ("parallel", "hrm", "trm", "mcts", "aggregate")
+ """
+ try:
+ features = self._extract_meta_controller_features(state)
+ if features is None:
+ return self._rule_based_route_decision(state)
+
+ prediction = self.meta_controller.predict(features)
+
+ # Log prediction for debugging
+ self.logger.debug(
+ f"Neural routing: agent={prediction.agent}, "
+ f"confidence={prediction.confidence:.3f}, "
+ f"probs={prediction.probabilities}"
+ )
+
+ # Map agent prediction to route
+ agent = prediction.agent
+
+            # Route to the predicted agent only if it has not already run
+            if agent == "hrm" and "hrm_results" not in state:
+                return "hrm"
+            if agent == "trm" and "trm_results" not in state:
+                return "trm"
+            if agent == "mcts" and state.get("use_mcts", False) and "mcts_stats" not in state:
+                return "mcts"
+
+            # If the predicted agent already ran or is not applicable, fall back to rules
+            return self._rule_based_route_decision(state)
+
+ except Exception as e:
+ self.logger.error(f"Neural routing failed: {e}")
+ # Fallback to rule-based routing
+ return self._rule_based_route_decision(state)
+
+ def _rule_based_route_decision(self, state: AgentState) -> str:
+ """
+ Make routing decision using rule-based logic.
+
+ Args:
+ state: Current agent state
+
+ Returns:
+ Route decision string
+ """
+ iteration = state.get("iteration", 0)
+
+ # First iteration: run HRM and TRM
+ if iteration == 0:
+ if self.enable_parallel_agents:
+ if "hrm_results" not in state and "trm_results" not in state:
+ return "parallel"
+ else:
+ if "hrm_results" not in state:
+ return "hrm"
+ elif "trm_results" not in state:
+ return "trm"
+
+ # Run MCTS if enabled and not yet done
+ if state.get("use_mcts", False) and "mcts_stats" not in state:
+ return "mcts"
+
+ return "aggregate"
+
+ def _route_to_agents(self, state: AgentState) -> str:
+ """Route to appropriate agent based on state."""
+ # Use neural routing if enabled
+ if self.use_neural_routing and self.meta_controller is not None:
+ return self._neural_route_decision(state)
+
+ # Fall back to rule-based routing
+ return self._rule_based_route_decision(state)
+
+ async def _parallel_agents_node(self, state: AgentState) -> dict:
+ """Execute HRM and TRM agents in parallel."""
+ self.logger.info("Executing HRM and TRM agents in parallel")
+
+ # Run both agents concurrently
+ hrm_task = asyncio.create_task(
+ self.hrm_agent.process(
+ query=state["query"],
+ rag_context=state.get("rag_context"),
+ )
+ )
+
+ trm_task = asyncio.create_task(
+ self.trm_agent.process(
+ query=state["query"],
+ rag_context=state.get("rag_context"),
+ )
+ )
+
+ # Await both results
+ hrm_result, trm_result = await asyncio.gather(hrm_task, trm_task)
+
+ # Combine outputs
+ return {
+ "hrm_results": {
+ "response": hrm_result["response"],
+ "metadata": hrm_result["metadata"],
+ },
+ "trm_results": {
+ "response": trm_result["response"],
+ "metadata": trm_result["metadata"],
+ },
+ "agent_outputs": [
+ {
+ "agent": "hrm",
+ "response": hrm_result["response"],
+ "confidence": hrm_result["metadata"].get("decomposition_quality_score", 0.7),
+ },
+ {
+ "agent": "trm",
+ "response": trm_result["response"],
+ "confidence": trm_result["metadata"].get("final_quality_score", 0.7),
+ },
+ ],
+ }
+
+ async def _hrm_agent_node(self, state: AgentState) -> dict:
+ """Execute HRM agent."""
+ self.logger.info("Executing HRM agent")
+
+ result = await self.hrm_agent.process(
+ query=state["query"],
+ rag_context=state.get("rag_context"),
+ )
+
+ return {
+ "hrm_results": {
+ "response": result["response"],
+ "metadata": result["metadata"],
+ },
+ "agent_outputs": [
+ {
+ "agent": "hrm",
+ "response": result["response"],
+ "confidence": result["metadata"].get("decomposition_quality_score", 0.7),
+ }
+ ],
+ }
+
+ async def _trm_agent_node(self, state: AgentState) -> dict:
+ """Execute TRM agent."""
+ self.logger.info("Executing TRM agent")
+
+ result = await self.trm_agent.process(
+ query=state["query"],
+ rag_context=state.get("rag_context"),
+ )
+
+ return {
+ "trm_results": {
+ "response": result["response"],
+ "metadata": result["metadata"],
+ },
+ "agent_outputs": [
+ {
+ "agent": "trm",
+ "response": result["response"],
+ "confidence": result["metadata"].get("final_quality_score", 0.7),
+ }
+ ],
+ }
+
+ async def _mcts_simulator_node(self, state: AgentState) -> dict:
+ """Execute MCTS simulation using new deterministic engine."""
+ self.logger.info("Executing MCTS simulation with deterministic engine")
+
+ start_time = time.perf_counter()
+
+ # Reset engine for this simulation
+ self.mcts_engine.clear_cache()
+
+ # Create root state
+ root_state = MCTSState(
+ state_id="root",
+ features={
+ "query": state["query"][:100], # Truncate for hashing
+ "has_hrm": "hrm_results" in state,
+ "has_trm": "trm_results" in state,
+ },
+ )
+
+ root = MCTSNode(
+ state=root_state,
+ rng=self.mcts_engine.rng,
+ )
+
+ # Define action generator based on domain
+ def action_generator(mcts_state: MCTSState) -> list[str]:
+ """Generate available actions for state."""
+            # Read depth from state features; splitting state_id on "_" would
+            # miscount actions whose names contain underscores (e.g. "action_A").
+            depth = mcts_state.features.get("depth", 0)
+
+ if depth == 0:
+ # Root level actions
+ return ["action_A", "action_B", "action_C", "action_D"]
+ elif depth < self.mcts_config.max_tree_depth:
+ # Subsequent actions
+ return ["continue", "refine", "fallback", "escalate"]
+ else:
+ return [] # Terminal
+
+ # Define state transition
+ def state_transition(mcts_state: MCTSState, action: str) -> MCTSState:
+ """Compute next state from action."""
+ new_id = f"{mcts_state.state_id}_{action}"
+ new_features = mcts_state.features.copy()
+ new_features["last_action"] = action
+            new_features["depth"] = mcts_state.features.get("depth", 0) + 1
+ return MCTSState(state_id=new_id, features=new_features)
+
+ # Create rollout policy using agent results
+ def heuristic_fn(mcts_state: MCTSState) -> float:
+ """Evaluate state using agent confidence."""
+ base = 0.5
+
+ # Bias based on agent confidence
+ if state.get("hrm_results"):
+ hrm_conf = state["hrm_results"]["metadata"].get("decomposition_quality_score", 0.5)
+ base += hrm_conf * 0.2
+
+ if state.get("trm_results"):
+ trm_conf = state["trm_results"]["metadata"].get("final_quality_score", 0.5)
+ base += trm_conf * 0.2
+
+ return min(base, 1.0)
+
+ rollout_policy = HybridRolloutPolicy(
+ heuristic_fn=heuristic_fn,
+ heuristic_weight=0.7,
+ random_weight=0.3,
+ )
+
+ # Run MCTS search
+ best_action, stats = await self.mcts_engine.search(
+ root=root,
+ num_iterations=self.mcts_config.num_iterations,
+ action_generator=action_generator,
+ state_transition=state_transition,
+ rollout_policy=rollout_policy,
+ max_rollout_depth=self.mcts_config.max_rollout_depth,
+ selection_policy=self.mcts_config.selection_policy,
+ )
+
+ end_time = time.perf_counter()
+ execution_time_ms = (end_time - start_time) * 1000
+
+ # Compute tree statistics
+ tree_depth = self.mcts_engine.get_tree_depth(root)
+ tree_node_count = self.mcts_engine.count_nodes(root)
+
+ # Track experiment
+ self.experiment_tracker.create_result(
+ experiment_id=f"mcts_{int(time.time())}",
+ config=self.mcts_config,
+ mcts_stats=stats,
+ execution_time_ms=execution_time_ms,
+ tree_depth=tree_depth,
+ tree_node_count=tree_node_count,
+ metadata={
+ "query": state["query"][:100],
+ "has_rag": state.get("use_rag", False),
+ },
+ )
+
+ self.logger.info(
+ f"MCTS complete: best_action={best_action}, "
+ f"iterations={stats['iterations']}, "
+ f"cache_hit_rate={stats['cache_hit_rate']:.2%}"
+ )
+
+ return {
+ "mcts_root": root,
+ "mcts_best_action": best_action,
+ "mcts_stats": stats,
+ "agent_outputs": [
+ {
+ "agent": "mcts",
+ "response": (
+ f"Simulated {stats['iterations']} scenarios with "
+ f"seed {self.mcts_config.seed}. "
+ f"Recommended action: {best_action} "
+ f"(visits={stats['best_action_visits']}, "
+ f"value={stats['best_action_value']:.3f})"
+ ),
+ "confidence": min(
+ stats["best_action_visits"] / stats["iterations"] if stats["iterations"] > 0 else 0.5,
+ 1.0,
+ ),
+ }
+ ],
+ }
+
+ def _aggregate_results_node(self, state: AgentState) -> dict:
+ """Aggregate results from all agents."""
+ self.logger.info("Aggregating agent results")
+
+ agent_outputs = state.get("agent_outputs", [])
+
+ confidence_scores = {output["agent"]: output["confidence"] for output in agent_outputs}
+
+ return {"confidence_scores": confidence_scores}
+
+ def _evaluate_consensus_node(self, state: AgentState) -> dict:
+ """Evaluate consensus among agents."""
+ agent_outputs = state.get("agent_outputs", [])
+
+ if len(agent_outputs) < 2:
+ return {
+ "consensus_reached": True,
+ "consensus_score": 1.0,
+ }
+
+ avg_confidence = sum(o["confidence"] for o in agent_outputs) / len(agent_outputs)
+
+ consensus_reached = avg_confidence >= self.consensus_threshold
+
+ self.logger.info(f"Consensus: {consensus_reached} (score={avg_confidence:.2f})")
+
+ return {
+ "consensus_reached": consensus_reached,
+ "consensus_score": avg_confidence,
+ }
+
+ def _check_consensus(self, state: AgentState) -> str:
+ """Check if consensus reached or need more iterations."""
+ if state.get("consensus_reached", False):
+ return "synthesize"
+
+ if state.get("iteration", 0) >= state.get("max_iterations", self.max_iterations):
+ return "synthesize"
+
+ return "iterate"
+
+ async def _synthesize_node(self, state: AgentState) -> dict:
+ """Synthesize final response from agent outputs."""
+ self.logger.info("Synthesizing final response")
+
+ agent_outputs = state.get("agent_outputs", [])
+
+ synthesis_prompt = f"""Query: {state["query"]}
+
+Agent Outputs:
+"""
+
+ for output in agent_outputs:
+ synthesis_prompt += f"""
+{output["agent"].upper()} (confidence={output["confidence"]:.2f}):
+{output["response"]}
+
+"""
+
+ synthesis_prompt += """
+Synthesize these outputs into a comprehensive final response.
+Prioritize higher-confidence outputs. Integrate insights from all agents.
+
+Final Response:"""
+
+ try:
+ response = await self.model_adapter.generate(
+ prompt=synthesis_prompt,
+ temperature=0.5,
+ )
+ final_response = response.text
+ except Exception as e:
+ self.logger.error(f"Synthesis failed: {e}")
+ best_output = max(agent_outputs, key=lambda o: o["confidence"])
+ final_response = best_output["response"]
+
+ metadata = {
+ "agents_used": [o["agent"] for o in agent_outputs],
+ "confidence_scores": state.get("confidence_scores", {}),
+ "consensus_score": state.get("consensus_score", 0.0),
+ "iterations": state.get("iteration", 0),
+ "mcts_config": state.get("mcts_config", {}),
+ }
+
+ if state.get("mcts_stats"):
+ metadata["mcts_stats"] = state["mcts_stats"]
+
+ return {
+ "final_response": final_response,
+ "metadata": metadata,
+ }
+
+
+class IntegratedFramework:
+ """
+ Integrated multi-agent framework with new MCTS core.
+
+ Maintains backward compatibility with original process() signature.
+ """
+
+ def __init__(
+ self,
+ model_adapter,
+ logger,
+ vector_store=None,
+ _embedding_model=None,
+ hrm_config: dict | None = None,
+ trm_config: dict | None = None,
+ mcts_config: MCTSConfig | None = None,
+ top_k_retrieval: int = 5,
+ max_iterations: int = 3,
+ consensus_threshold: float = 0.75,
+ enable_parallel_agents: bool = True,
+ ):
+ """
+ Initialize integrated framework.
+
+ Backward compatible with LangGraphMultiAgentFramework.
+ """
+ self.model_adapter = model_adapter
+ self.logger = logger
+ self.vector_store = vector_store
+
+ # Import agents (would be real imports in production)
+ try:
+ from improved_hrm_agent import HRMAgent
+ from improved_trm_agent import TRMAgent
+
+ self.hrm_agent = HRMAgent(
+ model_adapter=model_adapter,
+ logger=logger,
+ **(hrm_config or {}),
+ )
+ self.trm_agent = TRMAgent(
+ model_adapter=model_adapter,
+ logger=logger,
+ **(trm_config or {}),
+ )
+ except ImportError:
+ self.hrm_agent = None
+ self.trm_agent = None
+ self.logger.warning("Could not import HRM/TRM agents")
+
+ # Build graph
+ self.graph_builder = GraphBuilder(
+ hrm_agent=self.hrm_agent,
+ trm_agent=self.trm_agent,
+ model_adapter=model_adapter,
+ logger=logger,
+ vector_store=vector_store,
+ mcts_config=mcts_config,
+ top_k_retrieval=top_k_retrieval,
+ max_iterations=max_iterations,
+ consensus_threshold=consensus_threshold,
+ enable_parallel_agents=enable_parallel_agents,
+ )
+
+ # Compile graph
+ if StateGraph is not None:
+ self.graph = self.graph_builder.build_graph()
+ self.memory = MemorySaver() if MemorySaver else None
+ self.app = self.graph.compile(checkpointer=self.memory) if self.memory else self.graph.compile()
+ else:
+ self.graph = None
+ self.app = None
+
+ self.logger.info("Integrated framework initialized with new MCTS core")
+
+ async def process(
+ self,
+ query: str,
+ use_rag: bool = True,
+ use_mcts: bool = False,
+ config: dict | None = None,
+ ) -> dict:
+ """
+ Process query through LangGraph.
+
+ Backward compatible with original signature.
+
+ Args:
+ query: User query to process
+ use_rag: Enable RAG context retrieval
+ use_mcts: Enable MCTS simulation
+ config: Optional LangGraph config
+
+ Returns:
+ Dictionary with response, metadata, and state
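+
+        Example (illustrative; ``store`` is a placeholder vector store):
+
+            framework = IntegratedFramework(model_adapter, logger, vector_store=store)
+            result = await framework.process("Plan a rollout", use_mcts=True)
+            print(result["response"], result["metadata"].get("mcts_stats"))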
+ """
+ if self.app is None:
+ raise RuntimeError("LangGraph not available. Install with: pip install langgraph")
+
+ initial_state = {
+ "query": query,
+ "use_rag": use_rag,
+ "use_mcts": use_mcts,
+ "iteration": 0,
+ "max_iterations": self.graph_builder.max_iterations,
+ "agent_outputs": [],
+ }
+
+ config = config or {"configurable": {"thread_id": "default"}}
+
+ result = await self.app.ainvoke(initial_state, config=config)
+
+ return {
+ "response": result.get("final_response", ""),
+ "metadata": result.get("metadata", {}),
+ "state": result,
+ }
+
+ def get_experiment_tracker(self) -> ExperimentTracker:
+ """Get the experiment tracker for analysis."""
+ return self.graph_builder.experiment_tracker
+
+ def set_mcts_seed(self, seed: int) -> None:
+ """Set MCTS seed for deterministic behavior."""
+ self.graph_builder.mcts_engine.reset_seed(seed)
+ self.graph_builder.mcts_config.seed = seed
diff --git a/src/framework/mcts/__init__.py b/src/framework/mcts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..60d1cdeb1251f532a6288042b735f654dadd0ac5
--- /dev/null
+++ b/src/framework/mcts/__init__.py
@@ -0,0 +1,26 @@
+"""
+MCTS (Monte Carlo Tree Search) module for multi-agent framework.
+
+Provides deterministic, testable MCTS with:
+- Progressive widening for controlled branching
+- Simulation result caching
+- Configurable selection and rollout policies
+- Experiment tracking and analysis
+"""
+
+from .config import MCTSConfig, create_preset_config
+from .core import MCTSEngine, MCTSNode
+from .experiments import ExperimentResult, ExperimentTracker
+from .policies import RolloutPolicy, SelectionPolicy, ucb1
+
+__all__ = [
+ "MCTSNode",
+ "MCTSEngine",
+ "ucb1",
+ "RolloutPolicy",
+ "SelectionPolicy",
+ "MCTSConfig",
+ "create_preset_config",
+ "ExperimentTracker",
+ "ExperimentResult",
+]
diff --git a/src/framework/mcts/config.py b/src/framework/mcts/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..767d19ca0c79ba5bf4aa82f75db4e30c9186546b
--- /dev/null
+++ b/src/framework/mcts/config.py
@@ -0,0 +1,355 @@
+"""
+MCTS Configuration Module - Parameter management and presets.
+
+Provides:
+- MCTSConfig dataclass with all parameters
+- Validation of parameter bounds
+- Preset configurations (fast, balanced, thorough)
+- Serialization support for experiment tracking
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import asdict, dataclass
+from enum import Enum
+from typing import Any
+
+from .policies import SelectionPolicy
+
+
+class ConfigPreset(Enum):
+ """Preset configuration names."""
+
+ FAST = "fast"
+ BALANCED = "balanced"
+ THOROUGH = "thorough"
+ EXPLORATION_HEAVY = "exploration_heavy"
+ EXPLOITATION_HEAVY = "exploitation_heavy"
+
+
+@dataclass
+class MCTSConfig:
+ """
+ Complete configuration for MCTS engine.
+
+ All MCTS parameters are centralized here with validation.
+ Supports serialization for experiment tracking and reproducibility.
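+
+    Round-trip sketch (illustrative):
+
+        cfg = MCTSConfig(num_iterations=200, name="experiment_1")
+        restored = MCTSConfig.from_json(cfg.to_json())
+        assert restored.num_iterations == 200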
+ """
+
+ # Core MCTS parameters
+ num_iterations: int = 100
+ """Number of MCTS iterations to run."""
+
+ seed: int = 42
+ """Random seed for deterministic behavior."""
+
+ exploration_weight: float = 1.414
+ """UCB1 exploration constant (c). Higher = more exploration."""
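+    # In the standard UCB1 formulation this weights the exploration term:
+    # score(a) = Q(a) + c * sqrt(ln(N_parent) / N(a)).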
+
+ # Progressive widening
+ progressive_widening_k: float = 1.0
+ """Progressive widening coefficient. Higher = more conservative."""
+
+ progressive_widening_alpha: float = 0.5
+ """Progressive widening exponent. Lower = more aggressive expansion."""
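+    # Together, k and alpha bound how quickly a node may add children as its
+    # visit count grows (a progressive-widening rule; see MCTSEngine for the
+    # exact inequality used).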
+
+ # Rollout configuration
+ max_rollout_depth: int = 10
+ """Maximum depth for rollout simulations."""
+
+ rollout_policy: str = "hybrid"
+ """Rollout policy: 'random', 'greedy', 'hybrid'."""
+
+ # Action selection
+ selection_policy: SelectionPolicy = SelectionPolicy.MAX_VISITS
+ """Policy for final action selection."""
+
+ # Parallelization
+ max_parallel_rollouts: int = 4
+ """Maximum concurrent rollout simulations."""
+
+ # Caching
+ enable_cache: bool = True
+ """Enable simulation result caching."""
+
+ cache_size_limit: int = 10000
+ """Maximum number of cached simulation results."""
+
+ # Tree structure
+ max_tree_depth: int = 20
+ """Maximum depth of MCTS tree."""
+
+ max_children_per_node: int = 50
+ """Maximum children per node (action branching limit)."""
+
+ # Early termination
+ early_termination_threshold: float = 0.95
+ """Stop if best action has this fraction of total visits."""
+
+ min_iterations_before_termination: int = 50
+ """Minimum iterations before early termination check."""
+
+ # Value bounds
+ min_value: float = 0.0
+ """Minimum value for normalization."""
+
+ max_value: float = 1.0
+ """Maximum value for normalization."""
+
+ # Metadata
+ name: str = "default"
+ """Configuration name for tracking."""
+
+ description: str = ""
+ """Description of this configuration."""
+
+ def __post_init__(self):
+ """Validate configuration parameters after initialization."""
+ self.validate()
+
+ def validate(self) -> None:
+ """
+ Validate all configuration parameters.
+
+ Raises:
+ ValueError: If any parameter is out of valid bounds.
+ """
+ errors = []
+
+ # Core parameters
+ if self.num_iterations < 1:
+ errors.append("num_iterations must be >= 1")
+ if self.num_iterations > 100000:
+ errors.append("num_iterations should be <= 100000 for practical use")
+
+ if self.exploration_weight < 0:
+ errors.append("exploration_weight must be >= 0")
+ if self.exploration_weight > 10:
+ errors.append("exploration_weight should be <= 10")
+
+ # Progressive widening
+ if self.progressive_widening_k <= 0:
+ errors.append("progressive_widening_k must be > 0")
+ if not 0 < self.progressive_widening_alpha < 1:
+ errors.append("progressive_widening_alpha must be in (0, 1)")
+
+ # Rollout
+ if self.max_rollout_depth < 1:
+ errors.append("max_rollout_depth must be >= 1")
+ if self.rollout_policy not in ["random", "greedy", "hybrid", "llm"]:
+ errors.append("rollout_policy must be one of: random, greedy, hybrid, llm")
+
+ # Parallelization
+ if self.max_parallel_rollouts < 1:
+ errors.append("max_parallel_rollouts must be >= 1")
+ if self.max_parallel_rollouts > 100:
+ errors.append("max_parallel_rollouts should be <= 100")
+
+ # Caching
+ if self.cache_size_limit < 0:
+ errors.append("cache_size_limit must be >= 0")
+
+ # Tree structure
+ if self.max_tree_depth < 1:
+ errors.append("max_tree_depth must be >= 1")
+ if self.max_children_per_node < 1:
+ errors.append("max_children_per_node must be >= 1")
+
+ # Early termination
+ if not 0 < self.early_termination_threshold <= 1:
+ errors.append("early_termination_threshold must be in (0, 1]")
+ if self.min_iterations_before_termination < 1:
+ errors.append("min_iterations_before_termination must be >= 1")
+ if self.min_iterations_before_termination > self.num_iterations:
+ errors.append("min_iterations_before_termination must be <= num_iterations")
+
+ # Value bounds
+ if self.min_value >= self.max_value:
+ errors.append("min_value must be < max_value")
+
+ if errors:
+ raise ValueError("Invalid MCTS configuration:\n" + "\n".join(f" - {e}" for e in errors))
+
+ def to_dict(self) -> dict[str, Any]:
+ """
+ Convert configuration to dictionary for serialization.
+
+ Returns:
+ Dictionary representation of config.
+ """
+ d = asdict(self)
+ # Convert enum to string
+ d["selection_policy"] = self.selection_policy.value
+ return d
+
+ def to_json(self, indent: int = 2) -> str:
+ """
+ Serialize configuration to JSON string.
+
+ Args:
+ indent: JSON indentation level
+
+ Returns:
+ JSON string representation
+ """
+ return json.dumps(self.to_dict(), indent=indent)
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> MCTSConfig:
+ """
+ Create configuration from dictionary.
+
+ Args:
+ data: Dictionary with configuration parameters
+
+ Returns:
+ MCTSConfig instance
+ """
+ # Convert selection_policy string back to enum
+ if "selection_policy" in data and isinstance(data["selection_policy"], str):
+ data["selection_policy"] = SelectionPolicy(data["selection_policy"])
+ return cls(**data)
+
+ @classmethod
+ def from_json(cls, json_str: str) -> MCTSConfig:
+ """
+ Deserialize configuration from JSON string.
+
+ Args:
+ json_str: JSON string
+
+ Returns:
+ MCTSConfig instance
+ """
+ data = json.loads(json_str)
+ return cls.from_dict(data)
+
+ def copy(self, **overrides) -> MCTSConfig:
+ """
+ Create a copy with optional parameter overrides.
+
+ Args:
+ **overrides: Parameters to override
+
+ Returns:
+ New MCTSConfig instance
+ """
+ data = self.to_dict()
+ data.update(overrides)
+ return self.from_dict(data)
+
+ def __repr__(self) -> str:
+ return (
+ f"MCTSConfig(name={self.name!r}, "
+ f"iterations={self.num_iterations}, "
+ f"c={self.exploration_weight}, "
+ f"widening_k={self.progressive_widening_k}, "
+ f"widening_alpha={self.progressive_widening_alpha})"
+ )
+
+
+def create_preset_config(preset: ConfigPreset) -> MCTSConfig:
+ """
+ Create a preset configuration.
+
+ Args:
+ preset: Preset type to create
+
+ Returns:
+ MCTSConfig with preset parameters
+ """
+ if preset == ConfigPreset.FAST:
+ return MCTSConfig(
+ name="fast",
+ description="Fast search with minimal iterations",
+ num_iterations=25,
+ exploration_weight=1.414,
+ progressive_widening_k=0.5, # Aggressive widening
+ progressive_widening_alpha=0.5,
+ max_rollout_depth=5,
+ rollout_policy="random",
+ selection_policy=SelectionPolicy.MAX_VISITS,
+ max_parallel_rollouts=8,
+ cache_size_limit=1000,
+ early_termination_threshold=0.8,
+ min_iterations_before_termination=10,
+ )
+
+ elif preset == ConfigPreset.BALANCED:
+ return MCTSConfig(
+ name="balanced",
+ description="Balanced search for typical use cases",
+ num_iterations=100,
+ exploration_weight=1.414,
+ progressive_widening_k=1.0,
+ progressive_widening_alpha=0.5,
+ max_rollout_depth=10,
+ rollout_policy="hybrid",
+ selection_policy=SelectionPolicy.MAX_VISITS,
+ max_parallel_rollouts=4,
+ cache_size_limit=10000,
+ early_termination_threshold=0.9,
+ min_iterations_before_termination=50,
+ )
+
+ elif preset == ConfigPreset.THOROUGH:
+ return MCTSConfig(
+ name="thorough",
+ description="Thorough search for high-stakes decisions",
+ num_iterations=500,
+ exploration_weight=1.414,
+ progressive_widening_k=2.0, # Conservative widening
+ progressive_widening_alpha=0.6,
+ max_rollout_depth=20,
+ rollout_policy="hybrid",
+ selection_policy=SelectionPolicy.ROBUST_CHILD,
+ max_parallel_rollouts=4,
+ cache_size_limit=50000,
+ early_termination_threshold=0.95,
+ min_iterations_before_termination=200,
+ )
+
+ elif preset == ConfigPreset.EXPLORATION_HEAVY:
+ return MCTSConfig(
+ name="exploration_heavy",
+ description="High exploration for diverse action discovery",
+ num_iterations=200,
+ exploration_weight=2.5, # High exploration
+ progressive_widening_k=0.8, # More widening
+ progressive_widening_alpha=0.4, # Aggressive
+ max_rollout_depth=15,
+ rollout_policy="random",
+ selection_policy=SelectionPolicy.MAX_VISITS,
+ max_parallel_rollouts=6,
+ cache_size_limit=20000,
+ early_termination_threshold=0.95,
+ min_iterations_before_termination=100,
+ )
+
+ elif preset == ConfigPreset.EXPLOITATION_HEAVY:
+ return MCTSConfig(
+ name="exploitation_heavy",
+ description="High exploitation for known-good action refinement",
+ num_iterations=150,
+ exploration_weight=0.5, # Low exploration
+ progressive_widening_k=3.0, # Conservative
+ progressive_widening_alpha=0.7, # Very conservative
+ max_rollout_depth=10,
+ rollout_policy="greedy",
+ selection_policy=SelectionPolicy.MAX_VALUE,
+ max_parallel_rollouts=4,
+ cache_size_limit=10000,
+ early_termination_threshold=0.85,
+ min_iterations_before_termination=75,
+ )
+
+ else:
+ raise ValueError(f"Unknown preset: {preset}")
+
+
+# Default configurations for easy access
+DEFAULT_CONFIG = MCTSConfig()
+FAST_CONFIG = create_preset_config(ConfigPreset.FAST)
+BALANCED_CONFIG = create_preset_config(ConfigPreset.BALANCED)
+THOROUGH_CONFIG = create_preset_config(ConfigPreset.THOROUGH)
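+
+
+# Illustrative usage sketch (relies only on the API defined above):
+#
+#     cfg = FAST_CONFIG.copy(num_iterations=40)
+#     restored = MCTSConfig.from_json(cfg.to_json())
+#     assert restored == cfg  # the selection_policy enum round-trips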
diff --git a/src/framework/mcts/core.py b/src/framework/mcts/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8e1387f6837474100af579bf8d87b16075ab3c3
--- /dev/null
+++ b/src/framework/mcts/core.py
@@ -0,0 +1,619 @@
+"""
+MCTS Core Module - Deterministic, testable Monte Carlo Tree Search implementation.
+
+Features:
+- Seeded RNG for deterministic behavior
+- Progressive widening to control branching factor
+- Simulation result caching with hashable state keys
+- Clear separation of MCTS phases: select, expand, simulate, backpropagate
+- Support for parallel rollouts with asyncio.Semaphore
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+from collections import OrderedDict, deque
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Any
+
+import numpy as np
+
+from .policies import RolloutPolicy, SelectionPolicy, ucb1
+
+
+@dataclass
+class MCTSState:
+ """Hashable state representation for caching."""
+
+ state_id: str
+ features: dict[str, Any] = field(default_factory=dict)
+
+ def to_hash_key(self) -> str:
+ """Generate a hashable key for this state."""
+ # Sort features for deterministic hashing
+ feature_str = str(sorted(self.features.items()))
+ combined = f"{self.state_id}:{feature_str}"
+ return hashlib.sha256(combined.encode()).hexdigest()
+
+
+class MCTSNode:
+ """
+ Monte Carlo Tree Search node with proper state management.
+
+ Attributes:
+ state: The state this node represents
+ parent: Parent node (None for root)
+ action: Action taken to reach this node from parent
+ children: List of child nodes
+ visits: Number of times this node has been visited
+ value_sum: Total accumulated value from simulations
+ rng: Seeded random number generator for deterministic behavior
+ """
+
+ def __init__(
+ self,
+ state: MCTSState,
+ parent: MCTSNode | None = None,
+ action: str | None = None,
+ rng: np.random.Generator | None = None,
+ ):
+ self.state = state
+ self.parent = parent
+ self.action = action
+ self.children: list[MCTSNode] = []
+ self.visits: int = 0
+ self.value_sum: float = 0.0
+ self.terminal: bool = False
+ self.expanded_actions: set = set()
+ self.available_actions: list[str] = []
+
+ # Track depth for O(1) tree statistics
+ self.depth: int = 0 if parent is None else parent.depth + 1
+
+ # Use provided RNG or create default
+ self._rng = rng or np.random.default_rng()
+
+ @property
+ def value(self) -> float:
+ """Average value of this node."""
+ if self.visits == 0:
+ return 0.0
+ return self.value_sum / self.visits
+
+ @property
+ def is_fully_expanded(self) -> bool:
+ """Check if all available actions have been expanded."""
+ return len(self.expanded_actions) >= len(self.available_actions)
+
+ def select_child(self, exploration_weight: float = 1.414) -> MCTSNode:
+ """
+ Select best child using UCB1 policy.
+
+ Args:
+ exploration_weight: Exploration constant (c in UCB1)
+
+ Returns:
+ Best child node according to UCB1
+ """
+ if not self.children:
+ raise ValueError("No children to select from")
+
+ best_child = None
+ best_score = float("-inf")
+
+ for child in self.children:
+ score = ucb1(
+ value_sum=child.value_sum,
+ visits=child.visits,
+ parent_visits=self.visits,
+ c=exploration_weight,
+ )
+ if score > best_score:
+ best_score = score
+ best_child = child
+
+ return best_child
+
+ def add_child(self, action: str, child_state: MCTSState) -> MCTSNode:
+ """
+ Add a child node for the given action.
+
+ Args:
+ action: Action taken to reach child state
+ child_state: State of the child node
+
+ Returns:
+ Newly created child node
+ """
+ child = MCTSNode(
+ state=child_state,
+ parent=self,
+ action=action,
+ rng=self._rng,
+ )
+ self.children.append(child)
+ self.expanded_actions.add(action)
+ return child
+
+ def get_unexpanded_action(self) -> str | None:
+ """Get a random unexpanded action."""
+ unexpanded = [a for a in self.available_actions if a not in self.expanded_actions]
+ if not unexpanded:
+ return None
+ return self._rng.choice(unexpanded)
+
+ def __repr__(self) -> str:
+ return (
+ f"MCTSNode(state={self.state.state_id}, "
+ f"visits={self.visits}, value={self.value:.3f}, "
+ f"children={len(self.children)})"
+ )
+
+
+class MCTSEngine:
+ """
+ Main MCTS engine with deterministic behavior and advanced features.
+
+ Features:
+ - Seeded RNG for reproducibility
+ - Progressive widening to control branching
+ - Simulation result caching
+ - Parallel rollout support with semaphore
+ """
+
+ def __init__(
+ self,
+ seed: int = 42,
+ exploration_weight: float = 1.414,
+ progressive_widening_k: float = 1.0,
+ progressive_widening_alpha: float = 0.5,
+ max_parallel_rollouts: int = 4,
+ cache_size_limit: int = 10000,
+ ):
+ """
+ Initialize MCTS engine.
+
+ Args:
+ seed: Random seed for deterministic behavior
+ exploration_weight: UCB1 exploration constant
+ progressive_widening_k: Progressive widening coefficient
+ progressive_widening_alpha: Progressive widening exponent
+ max_parallel_rollouts: Maximum concurrent rollouts
+ cache_size_limit: Maximum number of cached simulation results
+ """
+ self.seed = seed
+ self.rng = np.random.default_rng(seed)
+ self.exploration_weight = exploration_weight
+ self.progressive_widening_k = progressive_widening_k
+ self.progressive_widening_alpha = progressive_widening_alpha
+
+ # Parallel rollout control
+ self.max_parallel_rollouts = max_parallel_rollouts
+ self._semaphore: asyncio.Semaphore | None = None
+
+ # Simulation cache: state_hash -> (value, visit_count)
+ # Using OrderedDict for LRU eviction
+ self._simulation_cache: OrderedDict[str, tuple[float, int]] = OrderedDict()
+ self.cache_size_limit = cache_size_limit
+
+ # Statistics
+ self.total_simulations = 0
+ self.cache_hits = 0
+ self.cache_misses = 0
+ self.cache_evictions = 0
+
+ # Cached tree statistics for O(1) retrieval
+ self._cached_tree_depth: int = 0
+ self._cached_node_count: int = 0
+
+ def reset_seed(self, seed: int) -> None:
+ """Reset the random seed for new experiment."""
+ self.seed = seed
+ self.rng = np.random.default_rng(seed)
+
+ def clear_cache(self) -> None:
+ """Clear simulation result cache."""
+ self._simulation_cache.clear()
+ self.cache_hits = 0
+ self.cache_misses = 0
+ self.cache_evictions = 0
+
+ def should_expand(self, node: MCTSNode) -> bool:
+ """
+ Check if node should expand based on progressive widening.
+
+ Progressive widening formula: expand when visits > k * n^alpha
+ where n is the number of children.
+
+ This prevents excessive branching and focuses search on promising areas.
+ """
+ if node.terminal or node.is_fully_expanded:
+ return False
+
+ num_children = len(node.children)
+ threshold = self.progressive_widening_k * (num_children**self.progressive_widening_alpha)
+
+ return node.visits > threshold
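+
+    # Worked example of the widening rule above with k=1.0, alpha=0.5:
+    #     0 children -> threshold 0.0 -> expand on the first visit
+    #     4 children -> threshold 2.0 -> expand once visits > 2
+    #     9 children -> threshold 3.0 -> expand once visits > 3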
+
+ def select(self, node: MCTSNode) -> MCTSNode:
+ """
+ MCTS Selection Phase: traverse tree to find leaf node.
+
+ Uses UCB1 to balance exploration and exploitation.
+ """
+ while node.children and not node.terminal:
+ # Check if we should expand instead of selecting
+ if self.should_expand(node):
+ break
+ node = node.select_child(self.exploration_weight)
+ return node
+
+ def expand(
+ self,
+ node: MCTSNode,
+ action_generator: Callable[[MCTSState], list[str]],
+ state_transition: Callable[[MCTSState, str], MCTSState],
+ ) -> MCTSNode:
+ """
+ MCTS Expansion Phase: add a new child node.
+
+ Args:
+ node: Node to expand
+ action_generator: Function to generate available actions
+ state_transition: Function to compute next state from action
+
+ Returns:
+ Newly expanded child node, or original node if cannot expand
+ """
+ if node.terminal:
+ return node
+
+ # Generate available actions if not yet done
+ if not node.available_actions:
+ node.available_actions = action_generator(node.state)
+
+ if not node.available_actions:
+ node.terminal = True
+ return node
+
+ # Check progressive widening
+ if not self.should_expand(node):
+ return node
+
+ # Get unexpanded action
+ action = node.get_unexpanded_action()
+ if action is None:
+ return node
+
+ # Create child state
+ child_state = state_transition(node.state, action)
+ child = node.add_child(action, child_state)
+
+ # Update cached node count for O(1) retrieval
+ self._cached_node_count += 1
+
+ return child
+
+ async def simulate(
+ self,
+ node: MCTSNode,
+ rollout_policy: RolloutPolicy,
+ max_depth: int = 10,
+ ) -> float:
+ """
+ MCTS Simulation Phase: evaluate node value through rollout.
+
+ Uses caching to avoid redundant simulations.
+
+ Args:
+ node: Node to simulate from
+ rollout_policy: Policy for rollout evaluation
+ max_depth: Maximum rollout depth
+
+ Returns:
+ Estimated value from simulation
+ """
+ # Check cache first
+ state_hash = node.state.to_hash_key()
+ if state_hash in self._simulation_cache:
+ cached_value, cached_count = self._simulation_cache[state_hash]
+ # Move to end for LRU (most recently used)
+ self._simulation_cache.move_to_end(state_hash)
+ self.cache_hits += 1
+ # Return cached average with small noise for exploration
+ noise = self.rng.normal(0, 0.01)
+ return cached_value + noise
+
+ self.cache_misses += 1
+
+ # Acquire semaphore for parallel control
+ if self._semaphore is None:
+ self._semaphore = asyncio.Semaphore(self.max_parallel_rollouts)
+
+ async with self._semaphore:
+ # Perform rollout
+ value = await rollout_policy.evaluate(
+ state=node.state,
+ rng=self.rng,
+ max_depth=max_depth,
+ )
+
+ self.total_simulations += 1
+
+ # Update cache with LRU eviction
+ if state_hash in self._simulation_cache:
+ # Update existing cache entry with running average
+ old_value, old_count = self._simulation_cache[state_hash]
+ new_count = old_count + 1
+ new_value = (old_value * old_count + value) / new_count
+ self._simulation_cache[state_hash] = (new_value, new_count)
+ # Move to end for LRU (most recently used)
+ self._simulation_cache.move_to_end(state_hash)
+ else:
+ # Evict oldest entry if cache is full
+ if len(self._simulation_cache) >= self.cache_size_limit:
+ # Remove the first item (least recently used)
+ self._simulation_cache.popitem(last=False)
+ self.cache_evictions += 1
+ # Add new entry at the end (most recently used)
+ self._simulation_cache[state_hash] = (value, 1)
+
+ return value
+
+ def backpropagate(self, node: MCTSNode, value: float) -> None:
+ """
+ MCTS Backpropagation Phase: update ancestor statistics.
+
+ Args:
+ node: Leaf node to start backpropagation
+ value: Value to propagate up the tree
+ """
+ # Update cached tree depth if this node is deeper than current max
+ if node.depth > self._cached_tree_depth:
+ self._cached_tree_depth = node.depth
+
+ current = node
+ while current is not None:
+ current.visits += 1
+ current.value_sum += value
+ current = current.parent
+
+ async def run_iteration(
+ self,
+ root: MCTSNode,
+ action_generator: Callable[[MCTSState], list[str]],
+ state_transition: Callable[[MCTSState, str], MCTSState],
+ rollout_policy: RolloutPolicy,
+ max_rollout_depth: int = 10,
+ ) -> None:
+ """
+ Run a single MCTS iteration (select, expand, simulate, backpropagate).
+
+ Args:
+ root: Root node of the tree
+ action_generator: Function to generate actions
+ state_transition: Function to compute state transitions
+ rollout_policy: Policy for rollout evaluation
+ max_rollout_depth: Maximum depth for rollouts
+ """
+ # Selection
+ leaf = self.select(root)
+
+ # Expansion
+ if not leaf.terminal and leaf.visits > 0:
+ leaf = self.expand(leaf, action_generator, state_transition)
+
+ # Simulation
+ value = await self.simulate(leaf, rollout_policy, max_rollout_depth)
+
+ # Backpropagation
+ self.backpropagate(leaf, value)
+
+ async def search(
+ self,
+ root: MCTSNode,
+ num_iterations: int,
+ action_generator: Callable[[MCTSState], list[str]],
+ state_transition: Callable[[MCTSState, str], MCTSState],
+ rollout_policy: RolloutPolicy,
+ max_rollout_depth: int = 10,
+ selection_policy: SelectionPolicy = SelectionPolicy.MAX_VISITS,
+ ) -> tuple[str | None, dict[str, Any]]:
+ """
+ Run MCTS search for specified number of iterations.
+
+ Args:
+ root: Root node to search from
+ num_iterations: Number of MCTS iterations
+ action_generator: Function to generate available actions
+ state_transition: Function to compute state transitions
+ rollout_policy: Policy for rollout simulation
+ max_rollout_depth: Maximum rollout depth
+ selection_policy: Policy for final action selection
+
+ Returns:
+ Tuple of (best_action, statistics_dict)
+ """
+ # Reset cached tree statistics for new search
+ self._cached_tree_depth = 0
+ self._cached_node_count = 1 # Start with root node
+
+ # Initialize root's available actions
+ if not root.available_actions:
+ root.available_actions = action_generator(root.state)
+
+ # Run iterations
+ for _i in range(num_iterations):
+ await self.run_iteration(
+ root=root,
+ action_generator=action_generator,
+ state_transition=state_transition,
+ rollout_policy=rollout_policy,
+ max_rollout_depth=max_rollout_depth,
+ )
+
+ # Select best action based on policy
+ best_action = self._select_best_action(root, selection_policy)
+
+ # Compute statistics
+ stats = self._compute_statistics(root, num_iterations)
+
+ return best_action, stats
+
+ def _select_best_action(
+ self,
+ root: MCTSNode,
+ policy: SelectionPolicy,
+ ) -> str | None:
+ """
+ Select the best action from root based on selection policy.
+
+ Args:
+ root: Root node with children
+ policy: Selection policy to use
+
+ Returns:
+ Best action string or None if no children
+ """
+ if not root.children:
+ return None
+
+ if policy == SelectionPolicy.MAX_VISITS:
+ # Most robust: select action with most visits
+ best_child = max(root.children, key=lambda c: c.visits)
+ elif policy == SelectionPolicy.MAX_VALUE:
+ # Greedy: select action with highest average value
+ best_child = max(root.children, key=lambda c: c.value)
+ elif policy == SelectionPolicy.ROBUST_CHILD:
+ # Robust: require both high visits and high value
+ # Normalize both metrics and combine
+ max_visits = max(c.visits for c in root.children)
+            max_value = max(c.value for c in root.children)
+
+ def robust_score(child):
+ visit_score = child.visits / max_visits if max_visits > 0 else 0
+ value_score = child.value / max_value if max_value > 0 else 0
+ return 0.5 * visit_score + 0.5 * value_score
+
+ best_child = max(root.children, key=robust_score)
+ else:
+            # Fallback (includes SECURE_CHILD, which has no dedicated
+            # implementation yet): select the most-visited child
+ best_child = max(root.children, key=lambda c: c.visits)
+
+ return best_child.action
+
+ def _compute_statistics(
+ self,
+ root: MCTSNode,
+ num_iterations: int,
+ ) -> dict[str, Any]:
+ """
+ Compute comprehensive MCTS statistics.
+
+ Args:
+ root: Root node
+ num_iterations: Number of iterations run
+
+ Returns:
+ Dictionary of statistics
+ """
+ # Best child info
+ best_child = None
+ if root.children:
+ best_child = max(root.children, key=lambda c: c.visits)
+
+ # Action statistics
+ action_stats = {}
+ for child in root.children:
+ action_stats[child.action] = {
+ "visits": child.visits,
+ "value": child.value,
+ "value_sum": child.value_sum,
+ "num_children": len(child.children),
+ }
+
+ return {
+ "iterations": num_iterations,
+ "root_visits": root.visits,
+ "root_value": root.value,
+ "num_children": len(root.children),
+ "best_action": best_child.action if best_child else None,
+ "best_action_visits": best_child.visits if best_child else 0,
+ "best_action_value": best_child.value if best_child else 0.0,
+ "action_stats": action_stats,
+ "total_simulations": self.total_simulations,
+ "cache_hits": self.cache_hits,
+ "cache_misses": self.cache_misses,
+ "cache_evictions": self.cache_evictions,
+ "cache_hit_rate": (
+ self.cache_hits / (self.cache_hits + self.cache_misses)
+ if (self.cache_hits + self.cache_misses) > 0
+ else 0.0
+ ),
+ "cache_size": len(self._simulation_cache),
+ "seed": self.seed,
+ }
+
+ def get_tree_depth(self, node: MCTSNode) -> int:
+ """Get maximum depth of the tree from given node.
+
+        Uses iterative BFS rather than recursion, so deep trees cannot hit
+        Python's recursion limit; depth is tracked alongside each node.
+        """
+        if not node.children:
+            return 0
+
+ max_depth = 0
+ # Queue contains tuples of (node, depth)
+ queue = deque([(node, 0)])
+
+ while queue:
+ current_node, depth = queue.popleft()
+ max_depth = max(max_depth, depth)
+
+ for child in current_node.children:
+ queue.append((child, depth + 1))
+
+ return max_depth
+
+ def count_nodes(self, node: MCTSNode) -> int:
+ """Count total number of nodes in tree.
+
+        Uses iterative BFS rather than recursion, so deep trees cannot hit
+        Python's recursion limit.
+        """
+
+ count = 0
+ queue = deque([node])
+
+ while queue:
+ current_node = queue.popleft()
+ count += 1
+
+ for child in current_node.children:
+ queue.append(child)
+
+ return count
+
+ def get_cached_tree_depth(self) -> int:
+ """
+ Get cached maximum tree depth in O(1) time.
+
+ Returns:
+ Maximum depth of tree from last search
+ """
+ return self._cached_tree_depth
+
+ def get_cached_node_count(self) -> int:
+ """
+ Get cached total node count in O(1) time.
+
+ Returns:
+ Total number of nodes in tree from last search
+ """
+ return self._cached_node_count
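+
+
+# Minimal end-to-end sketch (illustrative; the toy action generator and
+# transition below are hypothetical, and RandomRolloutPolicy lives in
+# .policies):
+#
+#     import asyncio
+#     from src.framework.mcts.core import MCTSEngine, MCTSNode, MCTSState
+#     from src.framework.mcts.policies import RandomRolloutPolicy
+#
+#     engine = MCTSEngine(seed=7)
+#     root = MCTSNode(MCTSState(state_id="root"))
+#     gen = lambda s: ["a", "b", "c"]
+#     step = lambda s, act: MCTSState(state_id=f"{s.state_id}/{act}")
+#     best, stats = asyncio.run(
+#         engine.search(root, 50, gen, step, RandomRolloutPolicy())
+#     )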
diff --git a/src/framework/mcts/experiments.py b/src/framework/mcts/experiments.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2a872d80ddf766edd2f9cf6ca92b64ede13854e
--- /dev/null
+++ b/src/framework/mcts/experiments.py
@@ -0,0 +1,440 @@
+"""
+Experiment Tracking Module - Track, analyze, and compare MCTS experiments.
+
+Provides:
+- Experiment run tracking (seed, params, results)
+- Statistical analysis of MCTS performance
+- Comparison utilities for different configurations
+- Export to JSON/CSV for analysis
+"""
+
+from __future__ import annotations
+
+import csv
+import json
+import statistics
+from dataclasses import asdict, dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from .config import MCTSConfig
+
+
+@dataclass
+class ExperimentResult:
+ """Result of a single MCTS experiment run."""
+
+ # Identification
+ experiment_id: str
+ timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+
+ # Configuration
+ config: dict[str, Any] | None = None
+ seed: int = 42
+
+ # Core results
+ best_action: str | None = None
+ best_action_value: float = 0.0
+ best_action_visits: int = 0
+ root_visits: int = 0
+
+ # Performance metrics
+ total_iterations: int = 0
+ total_simulations: int = 0
+ execution_time_ms: float = 0.0
+
+ # Cache statistics
+ cache_hits: int = 0
+ cache_misses: int = 0
+ cache_hit_rate: float = 0.0
+
+ # Tree statistics
+ tree_depth: int = 0
+ tree_node_count: int = 0
+ branching_factor: float = 0.0
+
+ # Action distribution
+ action_stats: dict[str, dict[str, Any]] = field(default_factory=dict)
+
+ # Optional metadata
+ metadata: dict[str, Any] = field(default_factory=dict)
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert to dictionary."""
+ return asdict(self)
+
+ def to_json(self, indent: int = 2) -> str:
+ """Serialize to JSON."""
+ return json.dumps(self.to_dict(), indent=indent)
+
+ @classmethod
+ def from_dict(cls, data: dict[str, Any]) -> ExperimentResult:
+ """Create from dictionary."""
+ return cls(**data)
+
+ @classmethod
+ def from_json(cls, json_str: str) -> ExperimentResult:
+ """Deserialize from JSON."""
+ return cls.from_dict(json.loads(json_str))
+
+
+class ExperimentTracker:
+ """
+ Track and analyze MCTS experiments.
+
+ Features:
+ - Store multiple experiment results
+ - Statistical analysis across runs
+ - Configuration comparison
+ - Export to JSON/CSV
+ """
+
+ def __init__(self, name: str = "mcts_experiments"):
+ """
+ Initialize experiment tracker.
+
+ Args:
+ name: Name of this experiment series
+ """
+ self.name = name
+ self.results: list[ExperimentResult] = []
+ self.created_at = datetime.now().isoformat()
+
+ def add_result(self, result: ExperimentResult) -> None:
+ """
+ Add an experiment result.
+
+ Args:
+ result: ExperimentResult to add
+ """
+ self.results.append(result)
+
+ def create_result(
+ self,
+ experiment_id: str,
+ config: MCTSConfig,
+ mcts_stats: dict[str, Any],
+ execution_time_ms: float = 0.0,
+ tree_depth: int = 0,
+ tree_node_count: int = 0,
+ metadata: dict[str, Any] | None = None,
+ ) -> ExperimentResult:
+ """
+ Create and add an experiment result from MCTS statistics.
+
+ Args:
+ experiment_id: Unique ID for this experiment
+ config: MCTS configuration used
+ mcts_stats: Statistics dict from MCTSEngine.search()
+ execution_time_ms: Execution time in milliseconds
+ tree_depth: Depth of MCTS tree
+ tree_node_count: Total nodes in tree
+ metadata: Optional additional metadata
+
+ Returns:
+ Created ExperimentResult
+ """
+ # Calculate branching factor
+ branching_factor = 0.0
+ if tree_node_count > 1 and tree_depth > 0:
+ branching_factor = (tree_node_count - 1) / tree_depth
+
+ result = ExperimentResult(
+ experiment_id=experiment_id,
+ config=config.to_dict(),
+ seed=config.seed,
+ best_action=mcts_stats.get("best_action"),
+ best_action_value=mcts_stats.get("best_action_value", 0.0),
+ best_action_visits=mcts_stats.get("best_action_visits", 0),
+ root_visits=mcts_stats.get("root_visits", 0),
+ total_iterations=mcts_stats.get("iterations", 0),
+ total_simulations=mcts_stats.get("total_simulations", 0),
+ execution_time_ms=execution_time_ms,
+ cache_hits=mcts_stats.get("cache_hits", 0),
+ cache_misses=mcts_stats.get("cache_misses", 0),
+ cache_hit_rate=mcts_stats.get("cache_hit_rate", 0.0),
+ tree_depth=tree_depth,
+ tree_node_count=tree_node_count,
+ branching_factor=branching_factor,
+ action_stats=mcts_stats.get("action_stats", {}),
+ metadata=metadata or {},
+ )
+
+ self.add_result(result)
+ return result
+
+ def get_summary_statistics(self) -> dict[str, Any]:
+ """
+ Compute summary statistics across all experiments.
+
+ Returns:
+ Dictionary of summary statistics
+ """
+ if not self.results:
+ return {"error": "No results to analyze"}
+
+ # Extract metrics
+ best_values = [r.best_action_value for r in self.results]
+ best_visits = [r.best_action_visits for r in self.results]
+ exec_times = [r.execution_time_ms for r in self.results]
+ cache_rates = [r.cache_hit_rate for r in self.results]
+ tree_depths = [r.tree_depth for r in self.results]
+ node_counts = [r.tree_node_count for r in self.results]
+
+ def compute_stats(values: list[float]) -> dict[str, float]:
+ """Compute basic statistics."""
+ if not values:
+ return {}
+ return {
+ "mean": statistics.mean(values),
+ "std": statistics.stdev(values) if len(values) > 1 else 0.0,
+ "min": min(values),
+ "max": max(values),
+ "median": statistics.median(values),
+ }
+
+ # Best action consistency
+ best_actions = [r.best_action for r in self.results]
+ action_counts = {}
+ for action in best_actions:
+ action_counts[action] = action_counts.get(action, 0) + 1
+ most_common_action = max(action_counts.items(), key=lambda x: x[1])
+ consistency_rate = most_common_action[1] / len(best_actions)
+
+ return {
+ "num_experiments": len(self.results),
+ "best_action_value_stats": compute_stats(best_values),
+ "best_action_visits_stats": compute_stats(best_visits),
+ "execution_time_ms_stats": compute_stats(exec_times),
+ "cache_hit_rate_stats": compute_stats(cache_rates),
+ "tree_depth_stats": compute_stats(tree_depths),
+ "tree_node_count_stats": compute_stats(node_counts),
+ "action_consistency": {
+ "most_common_action": most_common_action[0],
+ "consistency_rate": consistency_rate,
+ "action_distribution": action_counts,
+ },
+ }
+
+ def compare_configs(
+ self,
+ config_names: list[str] | None = None,
+ ) -> dict[str, dict[str, Any]]:
+ """
+ Compare performance across different configurations.
+
+ Args:
+ config_names: Specific config names to compare (all if None)
+
+ Returns:
+ Dictionary mapping config names to their statistics
+ """
+ # Group results by configuration name
+ grouped: dict[str, list[ExperimentResult]] = {}
+
+ for result in self.results:
+ if result.config is None:
+ continue
+
+ config_name = result.config.get("name", "unnamed")
+
+ if config_names and config_name not in config_names:
+ continue
+
+ if config_name not in grouped:
+ grouped[config_name] = []
+ grouped[config_name].append(result)
+
+ # Compute statistics for each group
+ comparison = {}
+ for name, results in grouped.items():
+ values = [r.best_action_value for r in results]
+ times = [r.execution_time_ms for r in results]
+ visits = [r.best_action_visits for r in results]
+
+ comparison[name] = {
+ "num_runs": len(results),
+ "avg_value": statistics.mean(values) if values else 0.0,
+ "std_value": statistics.stdev(values) if len(values) > 1 else 0.0,
+ "avg_time_ms": statistics.mean(times) if times else 0.0,
+ "avg_visits": statistics.mean(visits) if visits else 0.0,
+ "value_per_ms": (
+ statistics.mean(values) / statistics.mean(times) if times and statistics.mean(times) > 0 else 0.0
+ ),
+ }
+
+ return comparison
+
+ def analyze_seed_consistency(self, seed: int) -> dict[str, Any]:
+ """
+ Analyze consistency of results for a specific seed.
+
+ Args:
+ seed: Seed value to analyze
+
+ Returns:
+ Analysis of determinism for this seed
+ """
+ seed_results = [r for r in self.results if r.seed == seed]
+
+ if not seed_results:
+ return {"error": f"No results found for seed {seed}"}
+
+ # Check if all results are identical
+ best_actions = [r.best_action for r in seed_results]
+ best_values = [r.best_action_value for r in seed_results]
+ best_visits = [r.best_action_visits for r in seed_results]
+
+ is_deterministic = len(set(best_actions)) == 1 and len(set(best_values)) == 1 and len(set(best_visits)) == 1
+
+ return {
+ "seed": seed,
+ "num_runs": len(seed_results),
+ "is_deterministic": is_deterministic,
+ "unique_actions": list(set(best_actions)),
+ "value_variance": statistics.variance(best_values) if len(best_values) > 1 else 0.0,
+ "visits_variance": statistics.variance(best_visits) if len(best_visits) > 1 else 0.0,
+ }
+
+ def export_to_json(self, file_path: str) -> None:
+ """
+ Export all results to JSON file.
+
+ Args:
+ file_path: Path to output file
+ """
+ data = {
+ "name": self.name,
+ "created_at": self.created_at,
+ "num_experiments": len(self.results),
+ "results": [r.to_dict() for r in self.results],
+ "summary": self.get_summary_statistics(),
+ }
+
+ path = Path(file_path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+ with open(path, "w") as f:
+ json.dump(data, f, indent=2)
+
+ def export_to_csv(self, file_path: str) -> None:
+ """
+ Export results to CSV file for spreadsheet analysis.
+
+ Args:
+ file_path: Path to output file
+ """
+ if not self.results:
+ return
+
+ path = Path(file_path)
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Define CSV columns
+ fieldnames = [
+ "experiment_id",
+ "timestamp",
+ "seed",
+ "config_name",
+ "num_iterations",
+ "exploration_weight",
+ "best_action",
+ "best_action_value",
+ "best_action_visits",
+ "root_visits",
+ "total_simulations",
+ "execution_time_ms",
+ "cache_hit_rate",
+ "tree_depth",
+ "tree_node_count",
+ "branching_factor",
+ ]
+
+ with open(path, "w", newline="") as f:
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
+ writer.writeheader()
+
+ for result in self.results:
+ row = {
+ "experiment_id": result.experiment_id,
+ "timestamp": result.timestamp,
+ "seed": result.seed,
+ "config_name": (result.config.get("name", "unnamed") if result.config else "unknown"),
+ "num_iterations": (result.config.get("num_iterations", 0) if result.config else 0),
+ "exploration_weight": (result.config.get("exploration_weight", 0) if result.config else 0),
+ "best_action": result.best_action,
+ "best_action_value": result.best_action_value,
+ "best_action_visits": result.best_action_visits,
+ "root_visits": result.root_visits,
+ "total_simulations": result.total_simulations,
+ "execution_time_ms": result.execution_time_ms,
+ "cache_hit_rate": result.cache_hit_rate,
+ "tree_depth": result.tree_depth,
+ "tree_node_count": result.tree_node_count,
+ "branching_factor": result.branching_factor,
+ }
+ writer.writerow(row)
+
+ @classmethod
+ def load_from_json(cls, file_path: str) -> ExperimentTracker:
+ """
+ Load experiment tracker from JSON file.
+
+ Args:
+ file_path: Path to JSON file
+
+ Returns:
+ Loaded ExperimentTracker
+ """
+ with open(file_path) as f:
+ data = json.load(f)
+
+ tracker = cls(name=data.get("name", "loaded_experiments"))
+ tracker.created_at = data.get("created_at", tracker.created_at)
+
+ for result_data in data.get("results", []):
+ tracker.results.append(ExperimentResult.from_dict(result_data))
+
+ return tracker
+
+ def clear(self) -> None:
+ """Clear all results."""
+ self.results.clear()
+
+ def __len__(self) -> int:
+ return len(self.results)
+
+ def __repr__(self) -> str:
+ return f"ExperimentTracker(name={self.name!r}, num_results={len(self.results)})"
+
+
+def run_determinism_test(
+ engine_factory,
+ config: MCTSConfig,
+ num_runs: int = 3,
+) -> tuple[bool, dict[str, Any]]:
+ """
+ Test that MCTS produces deterministic results with same seed.
+
+ Args:
+ engine_factory: Factory function to create MCTSEngine
+ config: Configuration to test
+ num_runs: Number of runs to compare
+
+ Returns:
+ Tuple of (is_deterministic, analysis_dict)
+ """
+    # Stub: a full implementation would build an engine via `engine_factory`,
+    # run it `num_runs` times with config.seed, record each run in an
+    # ExperimentTracker, and compare best actions/values across runs.
+
+ analysis = {
+ "config": config.to_dict(),
+ "num_runs": num_runs,
+ "is_deterministic": True, # Would be computed from actual runs
+ "message": "Determinism test requires actual engine execution",
+ }
+
+ return True, analysis
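+
+
+# Illustrative workflow sketch (`engine_stats` stands in for the statistics
+# dict returned by MCTSEngine.search):
+#
+#     tracker = ExperimentTracker(name="ablation")
+#     tracker.create_result(
+#         "run-001", config, engine_stats,
+#         execution_time_ms=12.5, tree_depth=6, tree_node_count=87,
+#     )
+#     summary = tracker.get_summary_statistics()
+#     tracker.export_to_csv("results/ablation.csv")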
diff --git a/src/framework/mcts/neural_mcts.py b/src/framework/mcts/neural_mcts.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecf340667bc63fe0231e24bcda8562ed785b160f
--- /dev/null
+++ b/src/framework/mcts/neural_mcts.py
@@ -0,0 +1,624 @@
+"""
+Neural-Guided Monte Carlo Tree Search (MCTS).
+
+Implements AlphaZero-style MCTS with:
+- Policy and value network guidance
+- PUCT (Predictor + UCT) selection
+- Dirichlet noise for exploration
+- Virtual loss for parallel search
+- Temperature-based action selection
+
+Based on:
+- "Mastering the Game of Go with Deep Neural Networks and Tree Search" (AlphaGo)
+- "Mastering Chess and Shogi by Self-Play with a General RL Algorithm" (AlphaZero)
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ...training.system_config import MCTSConfig
+
+
+@dataclass
+class GameState:
+ """
+ Abstract game/problem state interface.
+
+ Users should subclass this for their specific domain.
+ """
+
+ def get_legal_actions(self) -> list[Any]:
+ """Return list of legal actions from this state."""
+ raise NotImplementedError
+
+ def apply_action(self, action: Any) -> GameState:
+ """Apply action and return new state."""
+ raise NotImplementedError
+
+ def is_terminal(self) -> bool:
+ """Check if this is a terminal state."""
+ raise NotImplementedError
+
+ def get_reward(self, player: int = 1) -> float:
+ """Get reward for the player (1 or -1)."""
+ raise NotImplementedError
+
+ def to_tensor(self) -> torch.Tensor:
+ """Convert state to tensor for neural network input."""
+ raise NotImplementedError
+
+ def get_canonical_form(self, player: int) -> GameState: # noqa: ARG002
+ """Get state from perspective of given player."""
+ return self
+
+ def get_hash(self) -> str:
+ """Get unique hash for this state (for caching)."""
+ raise NotImplementedError
+
+ def action_to_index(self, action: Any) -> int:
+ """
+ Map action to its index in the neural network's action space.
+
+ This method should return the index corresponding to the action
+ in the network's policy output vector.
+
+ Default implementation uses string-based mapping for Tic-Tac-Toe style
+ actions (e.g., "0,0" -> 0, "0,1" -> 1, etc.). Override this method
+ for custom action mappings.
+
+ Args:
+ action: The action to map
+
+ Returns:
+ Index in the action space (0 to action_size-1)
+ """
+ # Default implementation for grid-based actions like "row,col"
+ if isinstance(action, str) and "," in action:
+ row, col = map(int, action.split(","))
+ # Assume 3x3 grid by default - override for different sizes
+ return row * 3 + col
+ # For other action types, assume they are already indices
+ return int(action)
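+
+
+# Sketch of a domain subclass (hypothetical one-pile counting game, shown
+# only to illustrate the interface):
+#
+#     @dataclass
+#     class PileState(GameState):
+#         stones: int = 5
+#         def get_legal_actions(self): return [1, 2] if self.stones > 0 else []
+#         def apply_action(self, a): return PileState(stones=self.stones - a)
+#         def is_terminal(self): return self.stones <= 0
+#         def get_reward(self, player=1): return 1.0  # e.g., last mover wins
+#         def to_tensor(self): return torch.tensor([float(self.stones)])
+#         def get_hash(self): return str(self.stones)
+#         def action_to_index(self, a): return a - 1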
+
+
+class NeuralMCTSNode:
+ """
+ MCTS node with neural network guidance.
+
+ Stores statistics for PUCT selection and backpropagation.
+ """
+
+ def __init__(
+ self,
+ state: GameState,
+ parent: NeuralMCTSNode | None = None,
+ action: Any | None = None,
+ prior: float = 0.0,
+ ):
+ self.state = state
+ self.parent = parent
+ self.action = action # Action that led to this node
+ self.prior = prior # Prior probability from policy network
+
+ # Statistics
+ self.visit_count: int = 0
+ self.value_sum: float = 0.0
+ self.virtual_loss: float = 0.0
+
+ # Children: action -> NeuralMCTSNode
+ self.children: dict[Any, NeuralMCTSNode] = {}
+
+ # Caching
+ self.is_expanded: bool = False
+ self.is_terminal: bool = state.is_terminal()
+
+ @property
+ def value(self) -> float:
+ """Average value (Q-value) of this node."""
+ if self.visit_count == 0:
+ return 0.0
+ return self.value_sum / self.visit_count
+
+ def expand(
+ self,
+ policy_probs: np.ndarray,
+ valid_actions: list[Any],
+ ):
+ """
+ Expand node by creating children for all legal actions.
+
+ Args:
+ policy_probs: Prior probabilities from policy network
+ valid_actions: List of legal actions
+ """
+ self.is_expanded = True
+
+ for action, prior in zip(valid_actions, policy_probs, strict=True):
+ if action not in self.children:
+ next_state = self.state.apply_action(action)
+ self.children[action] = NeuralMCTSNode(
+ state=next_state,
+ parent=self,
+ action=action,
+ prior=prior,
+ )
+
+ def select_child(self, c_puct: float) -> tuple[Any, NeuralMCTSNode]:
+ """
+ Select best child using PUCT algorithm.
+
+ PUCT = Q(s,a) + c_puct * P(s,a) * sqrt(N(s)) / (1 + N(s,a))
+
+ Args:
+ c_puct: Exploration constant
+
+ Returns:
+ (action, child_node) tuple
+ """
+ best_score = -float("inf")
+ best_action = None
+ best_child = None
+
+ # Precompute sqrt term for efficiency
+ sqrt_parent_visits = math.sqrt(self.visit_count)
+
+ for action, child in self.children.items():
+ # Q-value (average value)
+ q_value = child.value
+
+ # U-value (exploration bonus)
+ u_value = c_puct * child.prior * sqrt_parent_visits / (1 + child.visit_count + child.virtual_loss)
+
+ # PUCT score
+ puct_score = q_value + u_value
+
+ if puct_score > best_score:
+ best_score = puct_score
+ best_action = action
+ best_child = child
+
+ return best_action, best_child
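+
+    # Worked PUCT example: Q = 0.5, prior P = 0.2, c_puct = 1.5,
+    # parent visits N(s) = 16 (sqrt = 4), child visits N(s,a) = 3,
+    # no virtual loss:
+    #     U = 1.5 * 0.2 * 4 / (1 + 3) = 0.3, so PUCT = 0.5 + 0.3 = 0.8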
+
+ def add_virtual_loss(self, virtual_loss: float):
+ """Add virtual loss for parallel search."""
+ self.virtual_loss += virtual_loss
+
+ def revert_virtual_loss(self, virtual_loss: float):
+ """Remove virtual loss after search completes."""
+ self.virtual_loss -= virtual_loss
+
+ def update(self, value: float):
+ """Update node statistics with search result."""
+ self.visit_count += 1
+ self.value_sum += value
+
+ def get_action_probs(self, temperature: float = 1.0) -> dict[Any, float]:
+ """
+ Get action selection probabilities based on visit counts.
+
+ Args:
+ temperature: Temperature parameter
+ - temperature -> 0: argmax (deterministic)
+ - temperature = 1: proportional to visits
+ - temperature -> inf: uniform
+
+ Returns:
+ Dictionary mapping actions to probabilities
+ """
+ if not self.children:
+ return {}
+
+ if temperature == 0:
+ # Deterministic: select most visited
+ visits = {action: child.visit_count for action, child in self.children.items()}
+ max_visits = max(visits.values())
+ best_actions = [a for a, v in visits.items() if v == max_visits]
+
+ # Uniform over best actions
+ prob = 1.0 / len(best_actions)
+ return {a: (prob if a in best_actions else 0.0) for a in self.children}
+
+ # Temperature-scaled visits
+ visits = np.array([child.visit_count for child in self.children.values()])
+ actions = list(self.children.keys())
+
+ if temperature != 1.0:
+ visits = visits ** (1.0 / temperature)
+
+ # Normalize to probabilities
+ probs = visits / visits.sum()
+
+ return dict(zip(actions, probs, strict=True))
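+
+    # Worked temperature example for visit counts [10, 30, 60]:
+    #     T = 1.0 -> [0.10, 0.30, 0.60]   (proportional to visits)
+    #     T = 0.5 -> visits**2 = [100, 900, 3600] -> [0.02, 0.20, 0.78]
+    #     T = 0   -> argmax -> [0.0, 0.0, 1.0]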
+
+
+class NeuralMCTS:
+ """
+ Neural-guided MCTS for decision making.
+
+ Combines tree search with neural network evaluation
+ using the AlphaZero algorithm.
+ """
+
+ def __init__(
+ self,
+ policy_value_network: nn.Module,
+ config: MCTSConfig,
+ device: str = "cpu",
+ ):
+ """
+ Initialize neural MCTS.
+
+ Args:
+ policy_value_network: Network that outputs (policy, value)
+ config: MCTS configuration
+ device: Device for neural network
+ """
+ self.network = policy_value_network
+ self.config = config
+ self.device = device
+
+ # Caching for network evaluations
+ self.cache: dict[str, tuple[np.ndarray, float]] = {}
+ self.cache_hits = 0
+ self.cache_misses = 0
+
+ def add_dirichlet_noise(
+ self,
+ policy_probs: np.ndarray,
+ epsilon: float | None = None,
+ alpha: float | None = None,
+ ) -> np.ndarray:
+ """
+ Add Dirichlet noise to policy for exploration (at root only).
+
+ Policy' = (1 - epsilon) * Policy + epsilon * Noise
+
+ Args:
+ policy_probs: Original policy probabilities
+ epsilon: Mixing parameter (defaults to config)
+ alpha: Dirichlet concentration parameter (defaults to config)
+
+ Returns:
+ Noised policy probabilities
+ """
+        # `is None` checks (rather than `or`) so an explicit 0.0 is honored
+        epsilon = self.config.dirichlet_epsilon if epsilon is None else epsilon
+        alpha = self.config.dirichlet_alpha if alpha is None else alpha
+
+        # Note: draws from NumPy's global RNG; seed it externally if
+        # reproducible root noise is required
+        noise = np.random.dirichlet([alpha] * len(policy_probs))
+        return (1 - epsilon) * policy_probs + epsilon * noise
+
+ @torch.no_grad()
+ async def evaluate_state(self, state: GameState, add_noise: bool = False) -> tuple[np.ndarray, float]:
+ """
+ Evaluate state using neural network.
+
+ Args:
+ state: Game state to evaluate
+ add_noise: Whether to add Dirichlet noise (for root exploration)
+
+ Returns:
+ (policy_probs, value) tuple
+ """
+ # Check cache
+ state_hash = state.get_hash()
+ if not add_noise and state_hash in self.cache:
+ self.cache_hits += 1
+ return self.cache[state_hash]
+
+ self.cache_misses += 1
+
+ # Get legal actions
+ legal_actions = state.get_legal_actions()
+ if not legal_actions:
+ return np.array([]), 0.0
+
+ # Convert state to tensor
+ state_tensor = state.to_tensor().unsqueeze(0).to(self.device)
+
+ # Network forward pass
+ policy_logits, value = self.network(state_tensor)
+
+ # Convert to numpy (detach to remove gradients)
+ policy_logits = policy_logits.squeeze(0).detach().cpu().numpy()
+ value = value.item()
+
+ # Proper action masking: Map legal actions to their indices in the action space
+ # Create a mask for legal actions
+ action_mask = np.full_like(policy_logits, -np.inf) # Mask all actions initially
+ action_indices = []
+
+ # Map legal actions to their network output indices
+ for action in legal_actions:
+ try:
+ action_idx = state.action_to_index(action)
+ if 0 <= action_idx < len(policy_logits):
+ action_mask[action_idx] = 0 # Unmask legal actions
+ action_indices.append(action_idx)
+ except (ValueError, IndexError, AttributeError) as e:
+ # Fallback: if action_to_index fails, use sequential mapping
+ print(f"Warning: action_to_index failed for action {action}: {e}")
+ action_indices = list(range(len(legal_actions)))
+ action_mask = np.full_like(policy_logits, -np.inf)
+ action_mask[action_indices] = 0
+ break
+
+ # Apply mask before softmax for numerical stability
+ masked_logits = policy_logits + action_mask
+
+ # Compute softmax over legal actions only
+ exp_logits = np.exp(masked_logits - np.max(masked_logits)) # Subtract max for stability
+ policy_probs_full = exp_logits / exp_logits.sum()
+
+ # Extract probabilities for legal actions in order
+ policy_probs = policy_probs_full[action_indices]
+
+ # Normalize to ensure probabilities sum to 1 (handle numerical errors)
+ if policy_probs.sum() > 0:
+ policy_probs = policy_probs / policy_probs.sum()
+ else:
+ # Fallback: uniform distribution over legal actions
+ policy_probs = np.ones(len(legal_actions)) / len(legal_actions)
+
+ # Add Dirichlet noise if requested (root exploration)
+ if add_noise:
+ policy_probs = self.add_dirichlet_noise(policy_probs)
+
+ # Cache result (without noise)
+ if not add_noise:
+ self.cache[state_hash] = (policy_probs, value)
+
+ return policy_probs, value
+
+ async def search(
+ self,
+ root_state: GameState,
+ num_simulations: int | None = None,
+ temperature: float = 1.0,
+ add_root_noise: bool = True,
+ ) -> tuple[dict[Any, float], NeuralMCTSNode]:
+ """
+ Run MCTS search from root state.
+
+ Args:
+ root_state: Initial state
+ num_simulations: Number of MCTS simulations
+ temperature: Temperature for action selection
+ add_root_noise: Whether to add Dirichlet noise to root
+
+ Returns:
+ (action_probs, root_node) tuple
+ """
+ num_simulations = num_simulations or self.config.num_simulations
+
+ # Create root node
+ root = NeuralMCTSNode(state=root_state)
+
+ # Expand root
+ policy_probs, _ = await self.evaluate_state(root_state, add_noise=add_root_noise)
+ legal_actions = root_state.get_legal_actions()
+ root.expand(policy_probs, legal_actions)
+
+ # Run simulations
+ for _ in range(num_simulations):
+ await self._simulate(root)
+
+ # Get action probabilities
+ action_probs = root.get_action_probs(temperature)
+
+ return action_probs, root
+
+ async def _simulate(self, node: NeuralMCTSNode) -> float:
+ """
+ Run single MCTS simulation (select, expand, evaluate, backpropagate).
+
+ Args:
+ node: Root node for this simulation
+
+ Returns:
+ Value from this simulation
+ """
+ path: list[NeuralMCTSNode] = []
+
+ # Selection: traverse tree using PUCT
+ current = node
+ while current.is_expanded and not current.is_terminal:
+ # Add virtual loss for parallel search
+ current.add_virtual_loss(self.config.virtual_loss)
+ path.append(current)
+
+ # Select best child
+ _, current = current.select_child(self.config.c_puct)
+
+ # Add leaf to path
+ path.append(current)
+ current.add_virtual_loss(self.config.virtual_loss)
+
+ # Evaluate leaf node
+ if current.is_terminal:
+ # Terminal node: use game result
+ value = current.state.get_reward()
+ else:
+ # Non-terminal: expand and evaluate with network
+ policy_probs, value = await self.evaluate_state(current.state, add_noise=False)
+
+ if not current.is_expanded:
+ legal_actions = current.state.get_legal_actions()
+ current.expand(policy_probs, legal_actions)
+
+ # Backpropagate
+ for node_in_path in reversed(path):
+ node_in_path.update(value)
+ node_in_path.revert_virtual_loss(self.config.virtual_loss)
+
+ # Flip value for opponent
+ value = -value
+
+ return value
+
+ def select_action(
+ self,
+ action_probs: dict[Any, float],
+ temperature: float = 1.0,
+ deterministic: bool = False,
+ ) -> Any:
+ """
+ Select action from probability distribution.
+
+ Args:
+ action_probs: Action probability dictionary
+ temperature: Temperature (unused if deterministic=True)
+ deterministic: If True, select action with highest probability
+
+ Returns:
+ Selected action
+ """
+ if not action_probs:
+ return None
+
+ actions = list(action_probs.keys())
+ probs = list(action_probs.values())
+
+ if deterministic or temperature == 0:
+ return actions[np.argmax(probs)]
+
+ # Sample from distribution
+ return np.random.choice(actions, p=probs)
+
+ def clear_cache(self):
+ """Clear the evaluation cache."""
+ self.cache.clear()
+ self.cache_hits = 0
+ self.cache_misses = 0
+
+ def get_cache_stats(self) -> dict:
+ """Get cache performance statistics."""
+ total = self.cache_hits + self.cache_misses
+ hit_rate = self.cache_hits / total if total > 0 else 0.0
+
+ return {
+ "cache_size": len(self.cache),
+ "cache_hits": self.cache_hits,
+ "cache_misses": self.cache_misses,
+ "hit_rate": hit_rate,
+ }
+
+
+# Training data collection
+@dataclass
+class MCTSExample:
+ """Training example from MCTS self-play."""
+
+ state: torch.Tensor # State representation
+    policy_target: np.ndarray  # Target policy (normalized visit-count distribution)
+ value_target: float # Target value (game outcome)
+ player: int # Player to move (1 or -1)
+
+
+class SelfPlayCollector:
+ """
+ Collect training data from self-play games.
+
+ Uses MCTS to generate high-quality training examples.
+ """
+
+ def __init__(
+ self,
+ mcts: NeuralMCTS,
+ config: MCTSConfig,
+ ):
+ self.mcts = mcts
+ self.config = config
+
+ async def play_game(
+ self,
+ initial_state: GameState,
+ temperature_threshold: int | None = None,
+ ) -> list[MCTSExample]:
+ """
+ Play a single self-play game.
+
+ Args:
+ initial_state: Starting game state
+ temperature_threshold: Move number to switch to greedy play
+
+ Returns:
+ List of training examples from the game
+ """
+ temperature_threshold = temperature_threshold or self.config.temperature_threshold
+
+ examples: list[MCTSExample] = []
+ state = initial_state
+ player = 1 # Current player (1 or -1)
+ move_count = 0
+
+ while not state.is_terminal():
+ # Determine temperature
+ temperature = (
+ self.config.temperature_init if move_count < temperature_threshold else self.config.temperature_final
+ )
+
+ # Run MCTS
+ action_probs, root = await self.mcts.search(state, temperature=temperature, add_root_noise=True)
+
+ # Store training example
+            # Convert action probs to an array ordered over the legal actions
+            # (a full training pipeline would map these into the network's
+            # fixed action space via action_to_index)
+            probs = np.array(list(action_probs.values()))
+
+ examples.append(
+ MCTSExample(
+ state=state.to_tensor(),
+ policy_target=probs,
+ value_target=0.0, # Will be filled with game outcome
+ player=player,
+ )
+ )
+
+ # Select and apply action
+ action = self.mcts.select_action(action_probs, temperature=temperature)
+ state = state.apply_action(action)
+
+ # Switch player
+ player = -player
+ move_count += 1
+
+ # Get game outcome
+ outcome = state.get_reward()
+
+ # Assign values to examples
+ for example in examples:
+ # Value is from perspective of the player who made the move
+ example.value_target = outcome if example.player == 1 else -outcome
+
+ return examples
+
+ async def generate_batch(self, num_games: int, initial_state_fn: Callable[[], GameState]) -> list[MCTSExample]:
+ """
+ Generate a batch of training examples from multiple games.
+
+ Args:
+ num_games: Number of games to play
+ initial_state_fn: Function that returns initial game state
+
+ Returns:
+ Combined list of training examples
+ """
+ all_examples = []
+
+ for _ in range(num_games):
+ initial_state = initial_state_fn()
+ examples = await self.play_game(initial_state)
+ all_examples.extend(examples)
+
+ # Clear cache periodically
+ if len(self.mcts.cache) > 10000:
+ self.mcts.clear_cache()
+
+ return all_examples
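+
+
+# Illustrative self-play sketch (`net`, `cfg`, and `PileState` are
+# hypothetical stand-ins for a real network, config, and GameState subclass):
+#
+#     import asyncio
+#
+#     mcts = NeuralMCTS(policy_value_network=net, config=cfg)
+#     collector = SelfPlayCollector(mcts, cfg)
+#     examples = asyncio.run(collector.generate_batch(10, PileState))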
diff --git a/src/framework/mcts/policies.py b/src/framework/mcts/policies.py
new file mode 100644
index 0000000000000000000000000000000000000000..59c23239dc6799d72aa2910355840b87f3705176
--- /dev/null
+++ b/src/framework/mcts/policies.py
@@ -0,0 +1,397 @@
+"""
+MCTS Policies Module - Selection, rollout, and evaluation policies.
+
+Provides:
+- UCB1 with configurable exploration weight
+- Rollout heuristics (random, greedy, hybrid)
+- Action selection policies (max visits, max value, robust child)
+- Progressive widening parameters
+"""
+
+from __future__ import annotations
+
+import math
+from abc import ABC, abstractmethod
+from collections.abc import Awaitable, Callable
+from enum import Enum
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+ from .core import MCTSState
+
+
+def ucb1(
+ value_sum: float,
+ visits: int,
+ parent_visits: int,
+ c: float = 1.414,
+) -> float:
+ """
+    Upper Confidence Bound (UCB1-style) formula for tree selection.
+
+    Formula: Q(s,a) + c * sqrt(N(s)) / sqrt(N(s,a))
+
+    Note: this is the square-root-ratio variant; classical UCB1 uses
+    c * sqrt(ln N(s) / N(s,a)). Both trade off exploitation and exploration.
+
+ Args:
+ value_sum: Total accumulated value for the node
+ visits: Number of visits to the node
+ parent_visits: Number of visits to the parent node
+ c: Exploration weight constant (default sqrt(2))
+
+ Returns:
+ UCB1 score for node selection
+ """
+ if visits == 0:
+ return float("inf")
+
+ exploitation = value_sum / visits
+ exploration = c * ((parent_visits) ** 0.5 / (visits) ** 0.5)
+
+ return exploitation + exploration
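+
+
+# Worked example of the formula above:
+#     ucb1(value_sum=3.0, visits=5, parent_visits=20, c=1.414)
+#     = 3/5 + 1.414 * sqrt(20)/sqrt(5) = 0.6 + 1.414 * 2.0 ≈ 3.428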
+
+
+def ucb1_tuned(
+ value_sum: float,
+ value_squared_sum: float,
+ visits: int,
+ parent_visits: int,
+ c: float = 1.0,
+) -> float:
+ """
+ UCB1-Tuned variant with variance estimate.
+
+ Provides tighter bounds by considering value variance.
+
+ Args:
+ value_sum: Total accumulated value
+ value_squared_sum: Sum of squared values (for variance)
+ visits: Number of visits
+ parent_visits: Parent visit count
+ c: Exploration constant
+
+ Returns:
+ UCB1-Tuned score
+ """
+ if visits == 0:
+ return float("inf")
+
+ mean_value = value_sum / visits
+ variance = value_squared_sum / visits - mean_value**2
+ variance = max(0, variance) # Ensure non-negative
+
+ # Variance bound term
+ ln_parent = math.log(parent_visits)
+ variance_bound = variance + math.sqrt(2 * ln_parent / visits)
+ min_bound = min(0.25, variance_bound)
+
+ exploitation = mean_value
+ exploration = c * math.sqrt(ln_parent / visits * min_bound)
+
+ return exploitation + exploration
+
+
+class SelectionPolicy(Enum):
+ """Policy for selecting the final action after MCTS search."""
+
+ MAX_VISITS = "max_visits"
+ """Select action with most visits (most robust)."""
+
+ MAX_VALUE = "max_value"
+ """Select action with highest average value (greedy)."""
+
+ ROBUST_CHILD = "robust_child"
+ """Select action balancing visits and value."""
+
+ SECURE_CHILD = "secure_child"
+    """Select action with the highest lower confidence bound (most secure)."""
+
+
+class RolloutPolicy(ABC):
+ """Abstract base class for rollout/simulation policies."""
+
+ @abstractmethod
+ async def evaluate(
+ self,
+ state: MCTSState,
+ rng: np.random.Generator,
+ max_depth: int = 10,
+ ) -> float:
+ """
+ Evaluate a state through rollout simulation.
+
+ Args:
+ state: State to evaluate
+ rng: Seeded random number generator
+ max_depth: Maximum rollout depth
+
+ Returns:
+ Estimated value in [0, 1] range
+ """
+ pass
+
+
+class RandomRolloutPolicy(RolloutPolicy):
+ """Random rollout policy - uniform random evaluation."""
+
+ def __init__(self, base_value: float = 0.5, noise_scale: float = 0.3):
+ """
+ Initialize random rollout policy.
+
+ Args:
+ base_value: Base value for evaluations
+ noise_scale: Scale of random noise
+ """
+ self.base_value = base_value
+ self.noise_scale = noise_scale
+
+ async def evaluate(
+ self,
+ _state: MCTSState,
+ rng: np.random.Generator,
+ _max_depth: int = 10,
+ ) -> float:
+ """Generate random evaluation with noise."""
+ noise = rng.uniform(-self.noise_scale, self.noise_scale)
+ value = self.base_value + noise
+ return max(0.0, min(1.0, value))
+
+
+class GreedyRolloutPolicy(RolloutPolicy):
+ """Greedy rollout policy using domain heuristics."""
+
+ def __init__(
+ self,
+ heuristic_fn: Callable[[MCTSState], float],
+ noise_scale: float = 0.05,
+ ):
+ """
+ Initialize greedy rollout policy.
+
+ Args:
+ heuristic_fn: Function to evaluate state heuristically
+ noise_scale: Small noise for tie-breaking
+ """
+ self.heuristic_fn = heuristic_fn
+ self.noise_scale = noise_scale
+
+ async def evaluate(
+ self,
+ state: MCTSState,
+ rng: np.random.Generator,
+ _max_depth: int = 10,
+ ) -> float:
+ """Evaluate using heuristic with small noise."""
+ base_value = self.heuristic_fn(state)
+ noise = rng.uniform(-self.noise_scale, self.noise_scale)
+ value = base_value + noise
+ return max(0.0, min(1.0, value))
+
+
+class HybridRolloutPolicy(RolloutPolicy):
+ """Hybrid policy combining random and heuristic evaluation."""
+
+ def __init__(
+ self,
+ heuristic_fn: Callable[[MCTSState], float] | None = None,
+ heuristic_weight: float = 0.7,
+ random_weight: float = 0.3,
+ base_random_value: float = 0.5,
+ noise_scale: float = 0.2,
+ ):
+ """
+ Initialize hybrid rollout policy.
+
+ Args:
+ heuristic_fn: Optional heuristic evaluation function
+ heuristic_weight: Weight for heuristic component
+ random_weight: Weight for random component
+ base_random_value: Base value for random component
+ noise_scale: Noise scale for random component
+ """
+ self.heuristic_fn = heuristic_fn
+ self.heuristic_weight = heuristic_weight
+ self.random_weight = random_weight
+ self.base_random_value = base_random_value
+ self.noise_scale = noise_scale
+
+ # Normalize weights
+ total_weight = heuristic_weight + random_weight
+ if total_weight > 0:
+ self.heuristic_weight /= total_weight
+ self.random_weight /= total_weight
+
+ async def evaluate(
+ self,
+ state: MCTSState,
+ rng: np.random.Generator,
+ _max_depth: int = 10,
+ ) -> float:
+ """Combine heuristic and random evaluation."""
+ # Random component
+ random_noise = rng.uniform(-self.noise_scale, self.noise_scale)
+ random_value = self.base_random_value + random_noise
+
+ # Heuristic component
+ heuristic_value = self.heuristic_fn(state) if self.heuristic_fn is not None else self.base_random_value
+
+ # Combine
+ value = self.heuristic_weight * heuristic_value + self.random_weight * random_value
+
+ return max(0.0, min(1.0, value))
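+
+    # Worked example: heuristic 0.8, random component 0.5, weights 0.7/0.3:
+    #     value = 0.7 * 0.8 + 0.3 * 0.5 = 0.71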
+
+
+class LLMRolloutPolicy(RolloutPolicy):
+ """Rollout policy that uses an LLM for state evaluation."""
+
+ def __init__(
+ self,
+ evaluate_fn: Callable[[MCTSState], Awaitable[float]],
+ cache_results: bool = True,
+ ):
+ """
+ Initialize LLM rollout policy.
+
+ Args:
+ evaluate_fn: Async function to evaluate state with LLM
+ cache_results: Whether to cache evaluation results
+ """
+ self.evaluate_fn = evaluate_fn
+ self.cache_results = cache_results
+ self._cache: dict = {}
+
+ async def evaluate(
+ self,
+ state: MCTSState,
+ _rng: np.random.Generator,
+ _max_depth: int = 10,
+ ) -> float:
+ """Evaluate state using LLM."""
+ state_key = state.to_hash_key()
+
+ if self.cache_results and state_key in self._cache:
+ return self._cache[state_key]
+
+ value = await self.evaluate_fn(state)
+ value = max(0.0, min(1.0, value))
+
+ if self.cache_results:
+ self._cache[state_key] = value
+
+ return value
+
+
+class ProgressiveWideningConfig:
+ """Configuration for progressive widening in MCTS."""
+
+ def __init__(
+ self,
+ k: float = 1.0,
+ alpha: float = 0.5,
+ ):
+ """
+ Configure progressive widening parameters.
+
+ Progressive widening expands when: visits > k * num_children^alpha
+
+ Args:
+ k: Coefficient controlling expansion threshold
+ alpha: Exponent controlling growth rate
+
+ Common configurations:
+ - k=1.0, alpha=0.5: Moderate widening (default)
+ - k=2.0, alpha=0.5: Conservative (fewer expansions)
+ - k=0.5, alpha=0.5: Aggressive (more expansions)
+ - k=1.0, alpha=0.3: Very aggressive
+ - k=1.0, alpha=0.7: Very conservative
+ """
+ if k <= 0:
+ raise ValueError("k must be positive")
+ if not 0 < alpha < 1:
+ raise ValueError("alpha must be in (0, 1)")
+
+ self.k = k
+ self.alpha = alpha
+
+ def should_expand(self, visits: int, num_children: int) -> bool:
+ """
+ Check if expansion should occur.
+
+ Args:
+ visits: Number of visits to node
+ num_children: Current number of children
+
+ Returns:
+ True if should expand, False otherwise
+ """
+ threshold = self.k * (num_children**self.alpha)
+ return visits > threshold
+
+ def min_visits_for_expansion(self, num_children: int) -> int:
+ """
+ Calculate minimum visits needed to expand to next child.
+
+ Args:
+ num_children: Current number of children
+
+ Returns:
+ Minimum visit count for expansion
+ """
+        threshold = self.k * (num_children**self.alpha)
+        # should_expand uses a strict inequality (visits > threshold), so the
+        # minimum qualifying count is floor(threshold) + 1; ceil(threshold)
+        # would be off by one whenever the threshold is an exact integer.
+        return int(math.floor(threshold)) + 1
+
+ def __repr__(self) -> str:
+ return f"ProgressiveWideningConfig(k={self.k}, alpha={self.alpha})"
+
+
+def compute_action_probabilities(
+ children_stats: list[dict],
+ temperature: float = 1.0,
+) -> list[float]:
+ """
+    Compute action probabilities from visit counts with temperature scaling.
+
+    Follows the AlphaZero convention pi(a) ∝ N(a)^(1/T) (equivalently, a
+    softmax over log visit counts scaled by 1/T).
+
+    Args:
+        children_stats: List of dicts with a 'visits' key
+        temperature: Temperature parameter (lower = more deterministic;
+            0 deterministically picks the most-visited action)
+
+ Returns:
+ List of probabilities for each action
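+
+    Example (illustrative; visit counts are made up):
+        >>> compute_action_probabilities([{"visits": 8}, {"visits": 2}])
+        [0.8, 0.2]
+        >>> compute_action_probabilities([{"visits": 8}, {"visits": 2}], temperature=0)
+        [1.0, 0.0]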
+ """
+ if not children_stats:
+ return []
+
+ visits = np.array([c["visits"] for c in children_stats], dtype=float)
+
+ if temperature == 0:
+ # Deterministic: assign 1.0 to max, 0 to others
+ probs = np.zeros_like(visits)
+ probs[np.argmax(visits)] = 1.0
+ return probs.tolist()
+
+    # Apply temperature: pi(a) ∝ N(a)^(1/T)
+    scaled_visits = visits ** (1.0 / temperature)
+    total = scaled_visits.sum()
+    if total <= 0:
+        # No child has been visited yet; fall back to a uniform distribution
+        return [1.0 / len(children_stats)] * len(children_stats)
+    probs = scaled_visits / total
+    return probs.tolist()
+
+
+def select_action_stochastic(
+ children_stats: list[dict],
+ rng: np.random.Generator,
+ temperature: float = 1.0,
+) -> int:
+ """
+ Stochastically select action based on visit counts.
+
+ Args:
+ children_stats: List of child statistics
+ rng: Random number generator
+        temperature: Temperature for visit-count scaling (0 = deterministic)
+
+ Returns:
+ Index of selected action
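+
+    Example (a minimal sketch):
+        rng = np.random.default_rng(seed=0)
+        idx = select_action_stochastic([{"visits": 8}, {"visits": 2}], rng)
+        # idx is 0 roughly 80% of the time at temperature 1.0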
+ """
+    probs = compute_action_probabilities(children_stats, temperature)
+    if not probs:
+        raise ValueError("No actions to select from")
+    # rng.choice returns a NumPy integer; cast to match the declared int return
+    return int(rng.choice(len(probs), p=probs))
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/models/policy_value_net.py b/src/models/policy_value_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d2567beb7dde1dfd45d0411b80d50ebe1d1b2d0
--- /dev/null
+++ b/src/models/policy_value_net.py
@@ -0,0 +1,422 @@
+"""
+Policy-Value Network using ResNet Architecture.
+
+Implements the dual-head neural network used in AlphaZero:
+- Policy Head: Outputs action probabilities
+- Value Head: Outputs state value estimation
+
+Based on:
+- "Mastering Chess and Shogi by Self-Play with a General RL Algorithm" (AlphaZero)
+- Deep Residual Learning for Image Recognition (ResNet)
+"""
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..training.system_config import NeuralNetworkConfig
+
+
+class ResidualBlock(nn.Module):
+ """
+ Residual block with batch normalization and skip connections.
+
+ Architecture:
+ Conv -> BN -> ReLU -> Conv -> BN -> Add -> ReLU
+ """
+
+ def __init__(self, channels: int, use_batch_norm: bool = True):
+ super().__init__()
+
+ self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
+ self.bn1 = nn.BatchNorm2d(channels) if use_batch_norm else nn.Identity()
+
+ self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
+ self.bn2 = nn.BatchNorm2d(channels) if use_batch_norm else nn.Identity()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """Apply residual block transformation."""
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = F.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ # Skip connection
+ out = out + residual
+ out = F.relu(out)
+
+ return out
+
+
+class PolicyHead(nn.Module):
+ """
+ Policy head for outputting action probabilities.
+
+ Architecture:
+ Conv -> BN -> ReLU -> FC -> LogSoftmax
+ """
+
+ def __init__(
+ self,
+ input_channels: int,
+ policy_conv_channels: int,
+ action_size: int,
+ board_size: int = 19,
+ ):
+ super().__init__()
+
+ self.conv = nn.Conv2d(input_channels, policy_conv_channels, kernel_size=1, bias=False)
+ self.bn = nn.BatchNorm2d(policy_conv_channels)
+
+ # Assuming square board
+ fc_input_size = policy_conv_channels * board_size * board_size
+
+ self.fc = nn.Linear(fc_input_size, action_size)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Compute policy (action probabilities).
+
+ Args:
+ x: [batch, channels, height, width]
+
+ Returns:
+ Log probabilities: [batch, action_size]
+ """
+ batch_size = x.size(0)
+
+ out = self.conv(x)
+ out = self.bn(out)
+ out = F.relu(out)
+
+ # Flatten spatial dimensions
+ out = out.view(batch_size, -1)
+
+ # Fully connected layer
+ out = self.fc(out)
+
+ # Log probabilities for numerical stability
+ return F.log_softmax(out, dim=1)
+
+
+class ValueHead(nn.Module):
+ """
+ Value head for estimating state value.
+
+ Architecture:
+ Conv -> BN -> ReLU -> FC -> ReLU -> FC -> Tanh
+ """
+
+ def __init__(
+ self,
+ input_channels: int,
+ value_conv_channels: int,
+ value_fc_hidden: int,
+ board_size: int = 19,
+ ):
+ super().__init__()
+
+ self.conv = nn.Conv2d(input_channels, value_conv_channels, kernel_size=1, bias=False)
+ self.bn = nn.BatchNorm2d(value_conv_channels)
+
+ # Assuming square board
+ fc_input_size = value_conv_channels * board_size * board_size
+
+ self.fc1 = nn.Linear(fc_input_size, value_fc_hidden)
+ self.fc2 = nn.Linear(value_fc_hidden, 1)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """
+ Compute value estimation.
+
+ Args:
+ x: [batch, channels, height, width]
+
+ Returns:
+ Value: [batch, 1] in range [-1, 1]
+ """
+ batch_size = x.size(0)
+
+ out = self.conv(x)
+ out = self.bn(out)
+ out = F.relu(out)
+
+ # Flatten spatial dimensions
+ out = out.view(batch_size, -1)
+
+ # Fully connected layers
+ out = self.fc1(out)
+ out = F.relu(out)
+
+ out = self.fc2(out)
+
+ # Tanh to bound value in [-1, 1]
+ return torch.tanh(out)
+
+
+class PolicyValueNetwork(nn.Module):
+ """
+ Combined policy-value network with ResNet backbone.
+
+ This is the core neural network used in AlphaZero-style learning.
+ """
+
+ def __init__(self, config: NeuralNetworkConfig, board_size: int = 19):
+ super().__init__()
+ self.config = config
+ self.board_size = board_size
+
+ # Initial convolution
+ self.conv_input = nn.Conv2d(
+ config.input_channels,
+ config.num_channels,
+ kernel_size=3,
+ padding=1,
+ bias=False,
+ )
+ self.bn_input = nn.BatchNorm2d(config.num_channels) if config.use_batch_norm else nn.Identity()
+
+ # Residual blocks (shared feature extractor)
+ self.res_blocks = nn.ModuleList(
+ [ResidualBlock(config.num_channels, config.use_batch_norm) for _ in range(config.num_res_blocks)]
+ )
+
+ # Policy head
+ self.policy_head = PolicyHead(
+ input_channels=config.num_channels,
+ policy_conv_channels=config.policy_conv_channels,
+ action_size=config.action_size,
+ board_size=board_size,
+ )
+
+ # Value head
+ self.value_head = ValueHead(
+ input_channels=config.num_channels,
+ value_conv_channels=config.value_conv_channels,
+ value_fc_hidden=config.value_fc_hidden,
+ board_size=board_size,
+ )
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Forward pass through the network.
+
+ Args:
+ x: Input state [batch, channels, height, width]
+
+ Returns:
+ (policy_logits, value) tuple
+ - policy_logits: [batch, action_size] log probabilities
+ - value: [batch, 1] state value in [-1, 1]
+ """
+ # Initial convolution
+ out = self.conv_input(x)
+ out = self.bn_input(out)
+ out = F.relu(out)
+
+ # Residual blocks
+ for res_block in self.res_blocks:
+ out = res_block(out)
+
+ # Split into policy and value heads
+ policy = self.policy_head(out)
+ value = self.value_head(out)
+
+ return policy, value
+
+ def predict(self, state: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Inference mode prediction.
+
+ Args:
+ state: Input state tensor
+
+        Returns:
+            (policy_probs, value) tuple with probabilities (not log)
+
+        Note:
+            Callers should switch the model to eval mode (``model.eval()``)
+            first so BatchNorm layers use running statistics.
+ """
+ with torch.no_grad():
+ policy_log_probs, value = self.forward(state)
+ policy_probs = torch.exp(policy_log_probs)
+ return policy_probs, value
+
+ def get_parameter_count(self) -> int:
+ """Return total number of trainable parameters."""
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+
+class AlphaZeroLoss(nn.Module):
+ """
+ Combined loss function for AlphaZero training.
+
+ Loss = (z - v)^2 - π^T log(p) + c||θ||^2
+
+ Where:
+ - z: actual game outcome
+ - v: value prediction
+ - π: MCTS visit count distribution
+ - p: policy prediction
+ - c: L2 regularization coefficient
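+
+    Note:
+        The c||θ||^2 term is not computed in forward(); in practice it is
+        applied through the optimizer's weight decay.
+
+    Example (a minimal sketch; ``network``, ``states``, ``mcts_pi`` and
+    ``outcomes`` are placeholders with shapes [B, A] / [B, 1] as documented):
+        criterion = AlphaZeroLoss(value_loss_weight=1.0)
+        log_probs, value = network(states)
+        loss, parts = criterion(log_probs, value, mcts_pi, outcomes)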
+ """
+
+ def __init__(self, value_loss_weight: float = 1.0):
+ super().__init__()
+ self.value_loss_weight = value_loss_weight
+
+ def forward(
+ self,
+ policy_logits: torch.Tensor,
+ value: torch.Tensor,
+ target_policy: torch.Tensor,
+ target_value: torch.Tensor,
+ ) -> tuple[torch.Tensor, dict]:
+ """
+ Compute AlphaZero loss.
+
+ Args:
+ policy_logits: Predicted policy log probabilities [batch, action_size]
+ value: Predicted values [batch, 1]
+ target_policy: Target policy from MCTS [batch, action_size]
+ target_value: Target value from game outcome [batch, 1]
+
+ Returns:
+ (total_loss, loss_dict) tuple
+ """
+        # Value loss: MSE between predicted and actual outcome.
+        # Squeeze both sides to [batch]; an unsqueezed [batch, 1] target
+        # against a [batch] input would broadcast to [batch, batch] and
+        # silently distort the loss.
+        value_loss = F.mse_loss(value.squeeze(-1), target_value.squeeze(-1))
+
+ # Policy loss: Cross-entropy between MCTS policy and network policy
+ # Target policy is already normalized, policy_logits are log probabilities
+ policy_loss = -torch.sum(target_policy * policy_logits, dim=1).mean()
+
+ # Combined loss
+ total_loss = self.value_loss_weight * value_loss + policy_loss
+
+ loss_dict = {
+ "total": total_loss.item(),
+ "value": value_loss.item(),
+ "policy": policy_loss.item(),
+ }
+
+ return total_loss, loss_dict
+
+
+def create_policy_value_network(
+ config: NeuralNetworkConfig,
+ board_size: int = 19,
+ device: str = "cpu",
+) -> PolicyValueNetwork:
+ """
+ Factory function to create and initialize policy-value network.
+
+ Args:
+ config: Network configuration
+ board_size: Board/grid size (for games)
+ device: Device to place model on
+
+ Returns:
+ Initialized PolicyValueNetwork
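+
+    Example (illustrative; the NeuralNetworkConfig fields shown are inferred
+    from how this module reads them and may not match the real constructor):
+        config = NeuralNetworkConfig(
+            input_channels=17, num_channels=64, num_res_blocks=4,
+            policy_conv_channels=2, value_conv_channels=1,
+            value_fc_hidden=256, action_size=362, use_batch_norm=True,
+        )
+        net = create_policy_value_network(config, board_size=19)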
+ """
+ network = PolicyValueNetwork(config, board_size)
+
+ # He initialization for convolutional layers
+ def init_weights(m):
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.Linear):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ if m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+
+ network.apply(init_weights)
+ network = network.to(device)
+
+ return network
+
+
+# Example: Simpler MLP-based policy-value network for non-spatial tasks
+class MLPPolicyValueNetwork(nn.Module):
+ """
+ MLP-based policy-value network for non-spatial state representations.
+
+ Useful for tasks where state is not naturally represented as an image.
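+
+    Example (illustrative):
+        net = MLPPolicyValueNetwork(state_dim=32, action_size=4)
+        log_probs, value = net(torch.randn(8, 32))  # -> [8, 4] and [8, 1]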
+ """
+
+ def __init__(
+ self,
+ state_dim: int,
+ action_size: int,
+ hidden_dims: list[int] | None = None,
+ use_batch_norm: bool = True,
+ dropout: float = 0.1,
+ ):
+ super().__init__()
+ self.state_dim = state_dim
+ self.action_size = action_size
+
+ if hidden_dims is None:
+ hidden_dims = [512, 256]
+
+ # Shared feature extractor
+ layers = []
+ prev_dim = state_dim
+
+ for hidden_dim in hidden_dims:
+ layers.append(nn.Linear(prev_dim, hidden_dim))
+ if use_batch_norm:
+ layers.append(nn.BatchNorm1d(hidden_dim))
+ layers.append(nn.ReLU())
+ if dropout > 0:
+ layers.append(nn.Dropout(dropout))
+ prev_dim = hidden_dim
+
+ self.shared_network = nn.Sequential(*layers)
+
+ # Policy head
+ self.policy_head = nn.Sequential(
+ nn.Linear(prev_dim, prev_dim // 2),
+ nn.ReLU(),
+ nn.Linear(prev_dim // 2, action_size),
+ )
+
+ # Value head
+ self.value_head = nn.Sequential(
+ nn.Linear(prev_dim, prev_dim // 2),
+ nn.ReLU(),
+ nn.Linear(prev_dim // 2, 1),
+ nn.Tanh(),
+ )
+
+ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Forward pass.
+
+ Args:
+ x: Input state [batch, state_dim]
+
+ Returns:
+ (policy_log_probs, value) tuple
+ """
+ # Shared features
+ features = self.shared_network(x)
+
+ # Policy
+ policy_logits = self.policy_head(features)
+ policy_log_probs = F.log_softmax(policy_logits, dim=1)
+
+ # Value
+ value = self.value_head(features)
+
+ return policy_log_probs, value
+
+ def get_parameter_count(self) -> int:
+ """Return total number of trainable parameters."""
+ return sum(p.numel() for p in self.parameters() if p.requires_grad)
diff --git a/src/models/validation.py b/src/models/validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6a511e884446bc9e267a65ac1f24c58579d0b13
--- /dev/null
+++ b/src/models/validation.py
@@ -0,0 +1,489 @@
+"""
+Input validation models for LangGraph Multi-Agent MCTS framework.
+
+Provides:
+- Pydantic models for all external inputs
+- Query sanitization and length limits
+- Configuration validation
+- MCP tool input validation with strict type checking
+- Security-focused input processing
+"""
+
+import re
+from datetime import datetime
+from typing import Any
+
+from pydantic import (
+ BaseModel,
+ ConfigDict,
+ Field,
+ field_validator,
+ model_validator,
+)
+
+# Constants for validation
+MAX_QUERY_LENGTH = 10000
+MIN_QUERY_LENGTH = 1
+MAX_CONTEXT_LENGTH = 50000
+MAX_ITERATIONS = 10000
+MIN_ITERATIONS = 1
+MAX_EXPLORATION_WEIGHT = 10.0
+MIN_EXPLORATION_WEIGHT = 0.0
+MAX_BATCH_SIZE = 100
+
+
+class QueryInput(BaseModel):
+ """
+ Validated query input for the multi-agent framework.
+
+ Performs sanitization and security checks on user queries.
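+
+    Example (illustrative):
+        q = QueryInput(query="  What   is  MCTS? ")
+        q.query  # -> "What is MCTS?" (stripped, inner whitespace collapsed)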
+ """
+
+ model_config = ConfigDict(
+ strict=True,
+ validate_assignment=True,
+ extra="forbid",
+ )
+
+ query: str = Field(
+ ..., min_length=MIN_QUERY_LENGTH, max_length=MAX_QUERY_LENGTH, description="User query to process"
+ )
+
+ use_rag: bool = Field(default=True, description="Enable RAG context retrieval")
+
+ use_mcts: bool = Field(default=False, description="Enable MCTS simulation for tactical planning")
+
+ thread_id: str | None = Field(
+ default=None,
+ max_length=100,
+ pattern=r"^[a-zA-Z0-9_-]+$",
+ description="Conversation thread ID for state persistence",
+ )
+
+ @field_validator("query")
+ @classmethod
+ def sanitize_query(cls, v: str) -> str:
+ """
+ Sanitize query input for security.
+
+ Removes potentially dangerous patterns while preserving legitimate content.
+ """
+ # Strip leading/trailing whitespace
+ v = v.strip()
+
+ # Check for empty query after stripping
+ if not v:
+ raise ValueError("Query cannot be empty or contain only whitespace")
+
+ # Remove null bytes
+ v = v.replace("\x00", "")
+
+        # Collapse runs of whitespace (including newlines) to a single space
+        v = re.sub(r"\s+", " ", v)
+
+ # Check for suspicious patterns (basic injection prevention)
+ suspicious_patterns = [
+ r"