hiteshwar21 committed on
Commit
63fb516
·
verified ·
1 Parent(s): 5109373

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +165 -0
  2. build_vector_store.py +31 -0
  3. evaluate.py +120 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Main application for MANIT RAG Chatbot"""
3
+ from typing import List, Dict
4
+ import gradio as gr
5
+ import numpy as np
6
+ import faiss
7
+ import pickle
8
+ import os
9
+ import time
10
+ from sentence_transformers import SentenceTransformer
11
+ from src.retrieval.semantic_retriever import SemanticRetriever
12
+ from src.generation.response_generator import ResponseGenerator
13
+ from config.settings import config
14
+
15
class MANITChatbot:
    """RAG chatbot over the MANIT knowledge base.

    On construction, loads the prebuilt vector-store artifacts (dense
    embeddings, FAISS index, pickled chunks, BM25 index, chunk
    relationships) from ``config.VECTOR_STORE_PATH`` and wires up the
    semantic retriever and the response generator.
    """

    @staticmethod
    def _store_path(name):
        # Resolve a vector-store artifact path under the configured directory.
        return os.path.join(config.VECTOR_STORE_PATH, name)

    def __init__(self):
        # Dense embedding matrix and its FAISS search index.
        self.embeddings = np.load(self._store_path("embeddings.npy"))
        self.faiss_index = faiss.read_index(self._store_path("faiss_index.bin"))

        # Pickled artifacts: text chunks, sparse BM25 index, chunk relationships.
        with open(self._store_path("chunks.pkl"), "rb") as fh:
            self.chunks = pickle.load(fh)
        with open(self._store_path("bm25.pkl"), "rb") as fh:
            self.bm25 = pickle.load(fh)
        with open(self._store_path("relationships.pkl"), "rb") as fh:
            self.relationships = pickle.load(fh)

        # Query encoder (CPU-only) plus the retrieval / generation components.
        self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL, device='cpu')
        self.retriever = SemanticRetriever(
            embedding_model=self.embedding_model,
            faiss_index=self.faiss_index,
            chunks=self.chunks,
            bm25_index=self.bm25,
            relationships=self.relationships
        )
        self.generator = ResponseGenerator()
        print("MANIT Chatbot initialized successfully!")

    def process_query(self, query: str) -> str:
        """Run one query through retrieve -> (optional web search) -> generate.

        Returns a user-facing answer string. All pipeline errors are caught
        and turned into a friendly message rather than raised.
        """
        if not query.strip():
            return "Please enter a question about MANIT Bhopal."

        t_start = time.time()
        try:
            print(f"Processing query: {query}")

            # Retrieve relevant documents, timing the retrieval stage.
            t_retrieve = time.time()
            retrieved_chunks = self.retriever.retrieve(query)
            retrieval_time = time.time() - t_retrieve

            if not retrieved_chunks:
                return "I couldn't find relevant information about this topic. Please try another question."
            print(f"Retrieved {len(retrieved_chunks)} chunks in {retrieval_time:.2f}s")

            context = self._format_context(retrieved_chunks)

            # Fall back to a web search only when the generator judges the
            # local context insufficient for this query.
            web_context = ""
            if self.generator.needs_web_search(query, context):
                hits = self.generator.web_search(query)
                if hits:
                    web_context = "\n\n".join(hits)

            # Generate the final answer, timing the generation stage.
            t_generate = time.time()
            response = self.generator.generate_response(query, context, web_context)
            generation_time = time.time() - t_generate

            total_time = time.time() - t_start
            print(f"Total processing time: {total_time:.2f}s (Retrieval: {retrieval_time:.2f}s, Generation: {generation_time:.2f}s)")
            return response
        except Exception as e:
            # Boundary handler: report the failure to the user instead of crashing the UI.
            print(f"Error processing query: {e}")
            return "I encountered an error processing your question. Please try again."

    def _format_context(self, chunks: List[Dict]) -> str:
        """Join retrieved chunks into one prompt-context string, separated by '---'."""
        return "\n\n---\n\n".join(
            f"Source: {chunk['metadata']['source']}\nContent: {chunk['content']}"
            for chunk in chunks
        )
101
+
102
def create_interface():
    """Build the Gradio Blocks UI wrapping a single MANITChatbot instance."""
    bot = MANITChatbot()

    def chat_fn(message, history):
        """Answer one message; return updated history and an empty textbox value."""
        answer = bot.process_query(message)
        # History entries are [user_message, bot_response] pairs.
        history.append([message, answer])
        # The empty string clears the input textbox after submission.
        return history, ""

    with gr.Blocks(
        title="MANIT Bhopal Expert Assistant",
        theme=gr.themes.Soft(),
        css=""".gradio-container {max-width: 900px; margin: 0 auto;}"""
    ) as demo:
        gr.Markdown("""
        # 🎓 MANIT Bhopal Expert Assistant
        *Powered by Advanced RAG Technology*

        Ask questions about programs, admissions, faculty, facilities, research, and more.
        """)

        chat_window = gr.Chatbot(
            height=500,
            show_label=False,
            avatar_images=[None, "👨‍🎓"],
            show_copy_button=True
        )

        with gr.Row():
            question_box = gr.Textbox(
                label="Your Question",
                placeholder="Ask about MANIT Bhopal...",
                scale=8,
                lines=2
            )
            send_btn = gr.Button("Send", scale=1, variant="primary")

        gr.Examples(
            examples=[
                "Who is the current director of MANIT?",
                "What programs are offered in Computer Applications?",
                "What is the admission cancellation process?",
                "Tell me about the faculty in Mechanical Engineering",
                "What research facilities are available at MANIT?"
            ],
            inputs=question_box,
            label="Example Questions"
        )

        # Both pressing Enter and clicking Send run the same handler, which
        # updates the chat window and clears the textbox.
        question_box.submit(chat_fn, [question_box, chat_window], [chat_window, question_box])
        send_btn.click(chat_fn, [question_box, chat_window], [chat_window, question_box])

    return demo
162
+
163
+ if __name__ == "__main__":
164
+ demo = create_interface()
165
+ demo.launch()
build_vector_store.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Build the vector store from raw text files"""
3
+
4
+ import os
5
+ import sys
6
+ from src.preprocessing.advanced_processor import AdvancedTextProcessor
7
+ from config.settings import config
8
+
9
def main():
    """Build the MANIT RAG vector store from the raw text directory.

    Exits with status 1 when the configured input directory is missing or
    when processing yields no chunks.
    """
    print("Building MANIT RAG Vector Store...")

    # Guard: the raw-text input directory must exist before we start.
    if not os.path.exists(config.RAW_TEXT_PATH):
        print(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")
        sys.exit(1)

    # Chunk all input texts, then persist the resulting vector store.
    text_processor = AdvancedTextProcessor()
    processed = text_processor.process_directory()

    # Guard: an empty result means nothing usable was found in the inputs.
    if not processed:
        print("No chunks were processed. Check your input files.")
        sys.exit(1)

    print(f"Processed {len(processed)} chunks from text files")
    text_processor.build_vector_store(processed)

    print("Vector store built successfully!")
29
+
30
+ if __name__ == "__main__":
31
+ main()
evaluate.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Evaluation script for MANIT RAG system"""
3
+
4
+ import time
5
+ import pandas as pd
6
+ from tabulate import tabulate
7
+ from app import MANITChatbot
8
+
9
def evaluate_performance():
    """Evaluate the RAG system with a set of test questions.

    Runs each test question through the chatbot, records latency and simple
    response-quality heuristics, prints a summary plus a detailed table, and
    writes the full results to a timestamped CSV file.

    Returns:
        list[dict]: one record per question (timings, quality flags, response).
    """
    # Initialize the chatbot
    print("Initializing MANIT Chatbot for evaluation...")
    chatbot = MANITChatbot()

    # Test questions covering different types of queries.
    # Kept verbatim (including spelling) so results stay comparable run-to-run.
    test_questions = [
        "Who is the director of MANIT Bhopal?",
        "Who is the caretake of hostel 9?",
        "What are the prices of guest house at manit",
        "What are the dispensary timings and who are the staff present",
        "What research facilities are available at MANIT",
        "What is the contact number for dispensary",
        "Who are the associate deans at MANIT",
        "Tell me about training and placement cell at MANIT",
        "What is the syllabus of aritficial intelligence department",
        "What are the vision and mission of MANIT?",
        "Who is the faculty advisor of student street play society?",
        "On what research areas computer science department is working?",
        "what is the name of person who registered the design for a paver block",
        "What are the objective for intellectual property rights cell at manit",
        "Tell me about mentorship program at MANIT",
        "What are the recent events at manti"
    ]

    results = []

    print(f"\nEvaluating {len(test_questions)} questions...")
    print("=" * 80)

    for i, question in enumerate(test_questions, 1):
        print(f"\n{i}/{len(test_questions)}: {question}")

        # Time the end-to-end response for this question.
        start_time = time.time()
        response = chatbot.process_query(question)
        end_time = time.time()

        response_time = end_time - start_time

        # Cheap response-quality heuristics: length bounds and leaked
        # "thinking" delimiters (presumably emitted by the generation
        # model when its reasoning isn't stripped — confirm against the
        # generator's output format).
        word_count = len(response.split())
        has_thinking_tokens = "◁think▷" in response or "◁/think▷" in response
        is_short = word_count < 20
        is_too_long = word_count > 200

        results.append({
            "Question": question,
            "Response Time (s)": round(response_time, 2),
            "Word Count": word_count,
            "Has Thinking Tokens": has_thinking_tokens,
            "Too Short": is_short,
            "Too Long": is_too_long,
            "Response": response
        })

        print(f"Time: {response_time:.2f}s, Words: {word_count}")
        if has_thinking_tokens:
            print("⚠️ Warning: Response contains thinking tokens")

    # Aggregate summary statistics across all questions.
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{len(test_questions)}")
    print(f"Too Short Responses: {short_count}/{len(test_questions)}")
    print(f"Too Long Responses: {long_count}/{len(test_questions)}")

    # Print detailed per-question table (responses omitted for readability).
    print("\nDETAILED RESULTS:")
    print("=" * 80)

    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses, truncated to 300 characters.
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)

    for i, result in enumerate(results[:3]):  # Show first 3 responses
        print(f"\n{i+1}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        print(result['Response'][:300] + "..." if len(result['Response']) > 300 else result['Response'])
        print("-" * 60)

    # Save full results to CSV.
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    # Fix: the message previously did not interpolate the output file name,
    # leaving the user with no way to locate the CSV that was just written.
    print(f"\nFull results saved to: {filename}")

    return results
113
+
114
+ if __name__ == "__main__":
115
+ # Set performance mode via environment variable
116
+ import os
117
+ performance_mode = os.getenv("PERFORMANCE_MODE", "balanced")
118
+ print(f"Running evaluation in {performance_mode} mode")
119
+
120
+ evaluate_performance()