#!/usr/bin/env python3
"""Evaluation script for MANIT RAG system"""

import os
import time

import pandas as pd
from tabulate import tabulate

from app import MANITChatbot


def evaluate_performance():
    """Evaluate the RAG system with a set of test questions"""
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()

    # Test questions covering different types of queries
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretake of hostel 9?",
        "What are the prices of guest house at manit",
        "What are the dispensary timings and who are the staff present",
        "What research facilities are available at MANIT",
        "What is the contact number for dispensary",
        "Who are the associate deans at MANIT",
        "Tell me about training and placement cell at MANIT",
        "What is the syllabus of aritficial intelligence department",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of student street play society?",
        "On what research areas computer science department is working?",
        "what is the name of person who registered the design for a paver block",
        "What are the objective for intellectual property rights cell at manit",
        "Tell me about mentorship program at MANIT",
        "What are the recent events at manit",
    ]

    results = []
    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the response
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()
        response_time = end_time - start_time

        # Analyze response quality
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response,
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")

    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i+1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        print(result['Response'][:300] + "..."
              if len(result['Response']) > 300 else result['Response'])
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")

    return results


if __name__ == "__main__":
    # Set performance mode via environment variable
    performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")
    evaluate_performance()
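# Usage sketch (assumptions: the script is saved as evaluate.py, and app.py's
# MANITChatbot is what actually consumes PERFORMANCE_MODE; this script itself
# only reads the variable for logging, with "balanced" as the default value):
#
#   PERFORMANCE_MODE=balanced python evaluate.py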