# MANIT-RAG-Chatbot / build_vector_store.py
# hiteshwar21's picture
# Upload 3 files
# f8b36e1 verified
#!/usr/bin/env python3
"""Rebuild vector store with optimized processing"""
import os
import sys
from pathlib import Path
# Add the project root to Python path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
from src.preprocessing.advanced_processor import OptimizedTextProcessor
from config.settings import config
def main():
    """Rebuild the MANIT RAG vector store and print a chunk summary.

    Steps:
      1. Verify that ``config.RAW_TEXT_PATH`` exists on disk.
      2. Run ``OptimizedTextProcessor.process_directory()`` to obtain chunks.
      3. Pass the chunks to ``OptimizedTextProcessor.build_vector_store()``.
      4. Print the total chunk count and a per-``chunk_type`` breakdown.

    Exits the process with status 1 (via ``sys.exit``) when the raw text
    path does not exist or when processing yields no chunks.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    from collections import Counter

    print("Rebuilding MANIT RAG Vector Store with optimized chunking...")

    # Check if raw texts exist
    if not os.path.exists(config.RAW_TEXT_PATH):
        print(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")
        sys.exit(1)

    # Process with optimized processor
    processor = OptimizedTextProcessor()
    chunks = processor.process_directory()
    if not chunks:
        print("No chunks were processed. Check your input files.")
        sys.exit(1)

    print(f"Processed {len(chunks)} optimized chunks")
    processor.build_vector_store(chunks)

    # The status markers below were previously mojibake (UTF-8 emoji bytes
    # decoded with a legacy code page); the intended characters are restored.
    print("✅ Optimized vector store built successfully!")
    print(f"📊 Total chunks: {len(chunks)}")

    # Display chunk type distribution; chunks whose metadata has no
    # 'chunk_type' key are tallied under 'unknown'.
    chunk_types = Counter(
        chunk['metadata'].get('chunk_type', 'unknown') for chunk in chunks
    )
    print("\n📈 Chunk distribution:")
    for chunk_type, count in sorted(chunk_types.items()):
        print(f" - {chunk_type}: {count} chunks")
# Script entry point: run the rebuild only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()