#!/usr/bin/env python3
"""Evaluation script for MANIT RAG system"""

import os
import time

import pandas as pd
from tabulate import tabulate

from app import MANITChatbot


def evaluate_performance():
    """Evaluate the RAG system with a set of test questions"""
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()

    # Test questions covering different types of queries
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretake of hostel 9?",
        "What are the prices of guest house at manit",
        "What are the dispensary timings and who are the staff present",
        "What research facilities are available at MANIT",
        "What is the contact number for dispensary",
        "Who are the associate deans at MANIT",
        "Tell me about training and placement cell at MANIT",
        "What is the syllabus of aritficial intelligence department",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of student street play society?",
        "On what research areas computer science department is working?",
        "what is the name of person who registered the design for a paver block",
        "What are the objective for intellectual property rights cell at manit",
        "Tell me about mentorship program at MANIT",
        "What are the recent events at manit",
    ]

    results = []
    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the response
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()
        response_time = end_time - start_time

        # Analyze response quality
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response,
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")

    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i+1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        print(result['Response'][:300] + "..."
              if len(result['Response']) > 300 else result['Response'])
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")

    return results


if __name__ == "__main__":
    # Set performance mode via environment variable
    performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")
    evaluate_performance()
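# Usage sketch (assumptions: the script is saved as evaluate.py, and app.py's
# MANITChatbot is what actually consumes PERFORMANCE_MODE; this script itself
# only reads the variable for logging, with "balanced" as the default value):
#
#   PERFORMANCE_MODE=balanced python evaluate.py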