#!/usr/bin/env python3
"""Evaluation script for MANIT RAG system"""
import os
import time

import pandas as pd
from tabulate import tabulate

from app import MANITChatbot


def evaluate_performance():
    """Evaluate the RAG system with a set of test questions."""
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()

    # Test questions covering different types of queries
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretaker of hostel 9?",
        "What are the prices of the guest house at MANIT?",
        "What are the dispensary timings, and who are the staff present?",
        "What research facilities are available at MANIT?",
        "What is the contact number for the dispensary?",
        "Who are the associate deans at MANIT?",
        "Tell me about the training and placement cell at MANIT",
        "What is the syllabus of the artificial intelligence department?",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of the student street play society?",
        "What research areas is the computer science department working on?",
        "What is the name of the person who registered the design for a paver block?",
        "What are the objectives of the intellectual property rights cell at MANIT?",
        "Tell me about the mentorship program at MANIT",
        "What are the recent events at MANIT?"
    ]

    results = []
    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the response
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()
        response_time = end_time - start_time

        # Analyze response quality
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")

    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i + 1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        response_text = result["Response"]
        preview = response_text[:300] + "..." if len(response_text) > 300 else response_text
        print(preview)
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")

    return results
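

# Optional, illustrative sketch (not wired into evaluate_performance above): if the
# model leaks ◁think▷ ... ◁/think▷ blocks, a post-processing step along these lines
# could strip them before word counts are computed. The function name and regex are
# assumptions for illustration, not part of the original pipeline.
def strip_thinking_tokens(response: str) -> str:
    """Remove ◁think▷...◁/think▷ blocks and any stray thinking markers."""
    import re  # local import so this optional sketch stays self-contained

    # Drop complete thinking blocks, then any unpaired markers left behind
    cleaned = re.sub(r"◁think▷.*?◁/think▷", "", response, flags=re.DOTALL)
    return cleaned.replace("◁think▷", "").replace("◁/think▷", "").strip()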


if __name__ == "__main__":
    # Report the performance mode requested via the PERFORMANCE_MODE environment variable
    performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")

    evaluate_performance()
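
# Example invocation (assumed: run from the repo root so that `from app import
# MANITChatbot` resolves; the PERFORMANCE_MODE value must be one the chatbot
# understands, "balanced" being the default used above):
#   PERFORMANCE_MODE=balanced python evaluate.py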