#!/usr/bin/env python3
"""Evaluation script for MANIT RAG system"""

import time

import pandas as pd
from tabulate import tabulate

from app import MANITChatbot


def evaluate_performance():
    """Evaluate the RAG system with a set of test questions"""
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()
    # Test questions covering different types of queries
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretaker of hostel 9?",
        "What are the prices of the guest house at MANIT?",
        "What are the dispensary timings and who are the staff present?",
        "What research facilities are available at MANIT?",
        "What is the contact number for the dispensary?",
        "Who are the associate deans at MANIT?",
        "Tell me about the training and placement cell at MANIT",
        "What is the syllabus of the artificial intelligence department?",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of the student street play society?",
        "What research areas is the computer science department working on?",
        "What is the name of the person who registered the design for a paver block?",
        "What are the objectives of the intellectual property rights cell at MANIT?",
        "Tell me about the mentorship program at MANIT",
        "What are the recent events at MANIT?",
    ]
    results = []
    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the response
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()
        response_time = end_time - start_time

        # Analyze response quality
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response,
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")
    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i + 1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        print((result["Response"][:300] + "...") if len(result["Response"]) > 300 else result["Response"])
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")

    return results

if __name__ == "__main__":
    # Read the performance mode from the environment (defaults to "balanced")
    import os

    performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")
    evaluate_performance()
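
# Usage note: assuming this file is saved as evaluate.py alongside app.py (the
# filename is an assumption, not confirmed by the repo), the evaluation can be
# run with the performance mode chosen via the environment, e.g.:
#
#     PERFORMANCE_MODE=balanced python evaluate.py
#
# PERFORMANCE_MODE is only read and logged in this script; MANITChatbot is
# assumed to pick the mode up from the environment on its own.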