Spaces:

hiteshwar21
/

MANIT-RAG-Chatbot

Sleeping

App Files Files Community

hiteshwar21 committed on Sep 4

Commit

63fb516

verified ·

1 Parent(s): 5109373

Upload 3 files

Browse files

Files changed (3) hide show

app.py +165 -0
build_vector_store.py +31 -0
evaluate.py +120 -0

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+#!/usr/bin/env python3
+"""Main application for MANIT RAG Chatbot"""
+from typing import List, Dict
+import gradio as gr
+import numpy as np
+import faiss
+import pickle
+import os
+import time
+from sentence_transformers import SentenceTransformer
+from src.retrieval.semantic_retriever import SemanticRetriever
+from src.generation.response_generator import ResponseGenerator
+from config.settings import config
class MANITChatbot:
    """Retrieval-augmented chatbot answering questions about MANIT Bhopal.

    Construction loads the vector-store artifacts found under
    ``config.VECTOR_STORE_PATH`` and wires them into a ``SemanticRetriever``
    and a ``ResponseGenerator``.  ``process_query`` then runs one question
    through retrieval, an optional web search, and response generation.
    """

    def __init__(self):
        """Load the vector store from disk and build the pipeline components.

        Any error raised by the loaders (for example a missing artifact file)
        propagates to the caller.
        """
        store_dir = config.VECTOR_STORE_PATH

        # Load vector store
        # NOTE(review): ``self.embeddings`` is not read anywhere in this class;
        # it is kept because code outside this class may rely on the attribute.
        self.embeddings = np.load(os.path.join(store_dir, "embeddings.npy"))
        self.faiss_index = faiss.read_index(os.path.join(store_dir, "faiss_index.bin"))
        self.chunks = self._load_pickle(store_dir, "chunks.pkl")
        self.bm25 = self._load_pickle(store_dir, "bm25.pkl")
        self.relationships = self._load_pickle(store_dir, "relationships.pkl")

        # Initialize models
        self.embedding_model = SentenceTransformer(config.EMBEDDING_MODEL, device='cpu')

        # Initialize components
        self.retriever = SemanticRetriever(
            embedding_model=self.embedding_model,
            faiss_index=self.faiss_index,
            chunks=self.chunks,
            bm25_index=self.bm25,
            relationships=self.relationships
        )
        self.generator = ResponseGenerator()
        print("MANIT Chatbot initialized successfully!")

    @staticmethod
    def _load_pickle(directory, filename):
        """Return the object unpickled from ``directory/filename``.

        SECURITY: ``pickle.load`` can execute arbitrary code while loading.
        Only point the vector-store path at files this project produced itself.
        """
        with open(os.path.join(directory, filename), "rb") as f:
            return pickle.load(f)

    def process_query(self, query: str) -> str:
        """Process a user query through the full RAG pipeline.

        Args:
            query: The user's question.

        Returns:
            The generated answer, or a fixed fallback message when the query is
            empty / not a string, when nothing relevant is retrieved, or when
            any pipeline step raises.
        """
        # Guard before the try-block: a None or non-string value from the UI
        # would otherwise raise on ``.strip()`` and escape as an unhandled error.
        if not isinstance(query, str) or not query.strip():
            return "Please enter a question about MANIT Bhopal."

        # perf_counter is monotonic, so durations cannot go negative or jump
        # when the system clock is adjusted.
        start_time = time.perf_counter()
        try:
            print(f"Processing query: {query}")

            # Retrieve relevant documents
            retrieval_start = time.perf_counter()
            retrieved_chunks = self.retriever.retrieve(query)
            retrieval_time = time.perf_counter() - retrieval_start
            if not retrieved_chunks:
                return "I couldn't find relevant information about this topic. Please try another question."
            print(f"Retrieved {len(retrieved_chunks)} chunks in {retrieval_time:.2f}s")

            # Format context
            context = self._format_context(retrieved_chunks)

            # Check if web search is needed
            web_context = ""
            if self.generator.needs_web_search(query, context):
                web_results = self.generator.web_search(query)
                if web_results:
                    web_context = "\n\n".join(web_results)

            # Generate response
            generation_start = time.perf_counter()
            response = self.generator.generate_response(query, context, web_context)
            generation_time = time.perf_counter() - generation_start

            total_time = time.perf_counter() - start_time
            print(f"Total processing time: {total_time:.2f}s (Retrieval: {retrieval_time:.2f}s, Generation: {generation_time:.2f}s)")
            return response
        except Exception as e:
            # Top-level boundary for the UI: never surface a raw exception to
            # the user, but keep the traceback so the failure can be diagnosed.
            import traceback

            print(f"Error processing query: {e}")
            traceback.print_exc()
            return "I encountered an error processing your question. Please try again."

    def _format_context(self, chunks: List[Dict]) -> str:
        """Format retrieved chunks into the context section of the prompt.

        Each chunk becomes a ``Source: ...`` / ``Content: ...`` pair and the
        pairs are joined with a ``---`` separator.  A chunk without a
        ``metadata``/``source`` entry is labelled ``Unknown`` and a missing
        ``content`` becomes an empty string, so one malformed chunk does not
        abort the whole query.

        Args:
            chunks: Chunk dicts as returned by the retriever.

        Returns:
            The formatted context; an empty string for an empty list.
        """
        context_parts = []
        for chunk in chunks:
            metadata = chunk.get('metadata') or {}
            source = metadata.get('source', 'Unknown')
            content = chunk.get('content', '')
            context_parts.append(f"Source: {source}\nContent: {content}")
        return "\n\n---\n\n".join(context_parts)
def create_interface():
    """Create the Gradio interface for the chatbot.

    Instantiates one ``MANITChatbot`` and builds a ``gr.Blocks`` layout with a
    chat window, a question textbox, a send button and example questions.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    chatbot = MANITChatbot()

    def chat_fn(message, history):
        """Answer ``message`` and return the updated history plus an empty
        string that clears the input textbox."""
        response = chatbot.process_query(message)
        # The Chatbot value may arrive as None when no turn has been stored yet
        # (depends on the Gradio version - TODO confirm); work on a copy so the
        # component's incoming value is never mutated in place.
        updated_history = list(history or [])
        # Append to history - format as [user_message, bot_response]
        updated_history.append([message, response])
        # Return updated history AND empty string to clear input
        return updated_history, ""

    with gr.Blocks(
        title="MANIT Bhopal Expert Assistant",
        theme=gr.themes.Soft(),
        css=""".gradio-container {max-width: 900px; margin: 0 auto;}"""
    ) as demo:
        gr.Markdown("""
        # 🎓 MANIT Bhopal Expert Assistant
        *Powered by Advanced RAG Technology*
        Ask questions about programs, admissions, faculty, facilities, research, and more.
        """)
        # NOTE(review): Gradio documents avatar_images as file paths or URLs;
        # an emoji string may not render as intended - confirm against the
        # installed Gradio version.
        chatbot_ui = gr.Chatbot(
            height=500,
            show_label=False,
            avatar_images=[None, "👨‍🎓"],
            show_copy_button=True
        )
        with gr.Row():
            msg = gr.Textbox(
                label="Your Question",
                placeholder="Ask about MANIT Bhopal...",
                scale=8,
                lines=2
            )
            submit = gr.Button("Send", scale=1, variant="primary")
        gr.Examples(
            examples=[
                "Who is the current director of MANIT?",
                "What programs are offered in Computer Applications?",
                "What is the admission cancellation process?",
                "Tell me about the faculty in Mechanical Engineering",
                "What research facilities are available at MANIT?"
            ],
            inputs=msg,
            label="Example Questions"
        )
        # Set up event handlers - both Enter in the textbox and the Send button
        # run chat_fn and write back to the chat window and the textbox.
        msg.submit(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])
        submit.click(chat_fn, [msg, chatbot_ui], [chatbot_ui, msg])
    return demo
if __name__ == "__main__":
    # Script entry point: build the Blocks app, keep it bound to the
    # module-level name ``demo``, then start serving with default settings.
    demo = create_interface()
    demo.launch()

build_vector_store.py ADDED Viewed

	@@ -0,0 +1,31 @@

+#!/usr/bin/env python3
+"""Build the vector store from raw text files"""
+import os
+import sys
+from src.preprocessing.advanced_processor import AdvancedTextProcessor
+from config.settings import config
def main():
    """Build the vector store from the raw text files.

    Checks that ``config.RAW_TEXT_PATH`` exists, processes the directory into
    chunks with ``AdvancedTextProcessor`` and builds the vector store from
    them.

    Raises:
        SystemExit: With exit status 1 (message on stderr) when the raw text
            path does not exist or when no chunks were produced.
    """
    print("Building MANIT RAG Vector Store...")

    # Check if raw texts exist
    if not os.path.exists(config.RAW_TEXT_PATH):
        # sys.exit(str) writes the message to stderr and exits with status 1,
        # so failures do not get mixed into normal stdout output.
        sys.exit(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")

    # Process texts and build vector store
    processor = AdvancedTextProcessor()
    chunks = processor.process_directory()
    if not chunks:
        sys.exit("No chunks were processed. Check your input files.")

    print(f"Processed {len(chunks)} chunks from text files")
    processor.build_vector_store(chunks)
    print("Vector store built successfully!")
if __name__ == "__main__":
    # main() returns None on success, so this exits with status 0; failure
    # paths inside main() raise SystemExit themselves.
    sys.exit(main())

evaluate.py ADDED Viewed

	@@ -0,0 +1,120 @@

+#!/usr/bin/env python3
+"""Evaluation script for MANIT RAG system"""
+import time
+import pandas as pd
+from tabulate import tabulate
+from app import MANITChatbot
# Default test questions covering different types of queries.  The strings are
# sent to the chatbot verbatim, misspellings included.
_DEFAULT_TEST_QUESTIONS = (
    "Who is the director of MANIT Bhopal?",
    "Who is the caretake of hostel 9?",
    "What are the prices of guest house at manit",
    "What are the dispensary timings and who are the staff present",
    "What research facilities are available at MANIT",
    "What is the contact number for dispensary",
    "Who are the associate deans at MANIT",
    "Tell me about training and placement cell at MANIT",
    "What is the syllabus of aritficial intelligence department",
    "What are the vision and mission of MANIT?",
    "Who is the faculty advisor of student street play society?",
    "On what research areas computer science department is working?",
    "what is the name of person who registered the design for a paver block",
    "What are the objective for intellectual property rights cell at manit",
    "Tell me about mentorship program at MANIT",
    "What are the recent events at manti",
)


def _score_response(question, response, response_time):
    """Return the result row (one dict per question) for a single response."""
    word_count = len(response.split())
    return {
        "Question": question,
        "Response Time (s)": round(response_time, 2),
        "Word Count": word_count,
        "Has Thinking Tokens": "◁think▷" in response or "◁/think▷" in response,
        "Too Short": word_count < 20,
        "Too Long": word_count > 200,
        "Response": response,
    }


def evaluate_performance(test_questions=None, chatbot=None):
    """Evaluate the RAG system with a set of test questions.

    Each question is sent through ``chatbot.process_query``; the response is
    timed and scored on word count and leftover thinking tokens.  A summary,
    a result table and the first three responses are printed, and all rows
    are written to ``evaluation_results_<timestamp>.csv`` in the working
    directory.

    Args:
        test_questions: Optional iterable of question strings.  Defaults to
            the built-in question set.
        chatbot: Optional object exposing ``process_query(str) -> str``.
            Defaults to a freshly constructed ``MANITChatbot``.

    Returns:
        A list with one result dict per question (empty when no questions
        were supplied, in which case nothing is printed as summary or saved).
    """
    if chatbot is None:
        # Initialize the chatbot
        print("Initializing MANIT Chatbot for evaluation...")
        chatbot = MANITChatbot()

    questions = list(_DEFAULT_TEST_QUESTIONS if test_questions is None else test_questions)
    if not questions:
        # An empty DataFrame has none of the columns the summary reads.
        print("No test questions supplied; nothing to evaluate.")
        return []

    total = len(questions)
    results = []
    print(f"\nEvaluating {total} questions...")
    print("=" * 80)
    for i, question in enumerate(questions, 1):
        print(f"\n{i}/{total}: {question}")

        # Time the response with a monotonic clock
        start_time = time.perf_counter()
        response = chatbot.process_query(question)
        response_time = time.perf_counter() - start_time

        # Analyze response quality
        row = _score_response(question, response, response_time)
        results.append(row)
        print(f"Time: {response_time:.2f}s, Words: {row['Word Count']}")
        if row["Has Thinking Tokens"]:
            print("⚠️  Warning: Response contains thinking tokens")

    # Create summary statistics
    df = pd.DataFrame(results)
    avg_time = df["Response Time (s)"].mean()
    avg_words = df["Word Count"].mean()
    thinking_tokens_count = df["Has Thinking Tokens"].sum()
    short_count = df["Too Short"].sum()
    long_count = df["Too Long"].sum()

    # Print summary
    print("\n" + "=" * 80)
    print("EVALUATION SUMMARY")
    print("=" * 80)
    print(f"Average Response Time: {avg_time:.2f}s")
    print(f"Average Response Length: {avg_words:.0f} words")
    print(f"Questions with Thinking Tokens: {thinking_tokens_count}/{total}")
    print(f"Too Short Responses: {short_count}/{total}")
    print(f"Too Long Responses: {long_count}/{total}")

    # Print detailed results
    print("\nDETAILED RESULTS:")
    print("=" * 80)
    summary_df = df[["Question", "Response Time (s)", "Word Count", "Has Thinking Tokens"]]
    print(tabulate(summary_df, headers="keys", tablefmt="grid", showindex=False))

    # Print a few sample responses
    print("\nSAMPLE RESPONSES:")
    print("=" * 80)
    for number, result in enumerate(results[:3], 1):  # Show first 3 responses
        print(f"\n{number}. {result['Question']}")
        print(f"Time: {result['Response Time (s)']}s")
        print("Response:")
        text = result["Response"]
        # Truncate long answers to 300 characters for the console preview.
        print((text[:300] + "...") if len(text) > 300 else text)
        print("-" * 60)

    # Save full results to CSV
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"evaluation_results_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"\nFull results saved to: {filename}")
    return results
if __name__ == "__main__":
    import os

    # Report the performance mode taken from the environment.  This block only
    # prints the value; it does not pass it on to evaluate_performance().
    performance_mode = os.environ.get("PERFORMANCE_MODE", "balanced")
    print(f"Running evaluation in {performance_mode} mode")
    evaluate_performance()