#!/usr/bin/env python3
"""Evaluation script for MANIT RAG system"""
import os
import time

import pandas as pd
from tabulate import tabulate

from app import MANITChatbot


def evaluate_performance():
    """Evaluate the RAG system with a set of test questions."""
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()

    # Test questions covering different types of queries
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretaker of hostel 9?",
        "What are the prices of the guest house at MANIT?",
        "What are the dispensary timings, and who are the staff present?",
        "What research facilities are available at MANIT?",
        "What is the contact number for the dispensary?",
        "Who are the associate deans at MANIT?",
        "Tell me about the training and placement cell at MANIT",
        "What is the syllabus of the artificial intelligence department?",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of the student street play society?",
        "What research areas is the computer science department working on?",
        "What is the name of the person who registered the design for a paver block?",
        "What are the objectives of the intellectual property rights cell at MANIT?",
        "Tell me about the mentorship program at MANIT",
        "What are the recent events at MANIT?"
    ]

    results = []
    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the response
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()
        response_time = end_time - start_time

        # Analyze response quality
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")

    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i + 1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        response_text = result["Response"]
        preview = response_text[:300] + "..." if len(response_text) > 300 else response_text
        print(preview)
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")

    return results
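

# Optional, illustrative sketch (not wired into evaluate_performance above): if the
# model leaks ◁think▷ ... ◁/think▷ blocks, a post-processing step along these lines
# could strip them before word counts are computed. The function name and regex are
# assumptions for illustration, not part of the original pipeline.
def strip_thinking_tokens(response: str) -> str:
    """Remove ◁think▷...◁/think▷ blocks and any stray thinking markers."""
    import re  # local import so this optional sketch stays self-contained

    # Drop complete thinking blocks, then any unpaired markers left behind
    cleaned = re.sub(r"◁think▷.*?◁/think▷", "", response, flags=re.DOTALL)
    return cleaned.replace("◁think▷", "").replace("◁/think▷", "").strip()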


if __name__ == "__main__":
    # Report the performance mode requested via the PERFORMANCE_MODE environment variable
    performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")

    evaluate_performance()
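
# Example invocation (assumed: run from the repo root so that `from app import
# MANITChatbot` resolves; the PERFORMANCE_MODE value must be one the chatbot
# understands, "balanced" being the default used above):
#   PERFORMANCE_MODE=balanced python evaluate.py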