#!/usr/bin/env python3
"""Rebuild vector store with optimized processing"""

import os
import sys
from collections import Counter
from pathlib import Path

# Add the project root to Python path. This must run before the project
# imports below so that `src` and `config` resolve when the script is
# launched directly.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from src.preprocessing.advanced_processor import OptimizedTextProcessor  # noqa: E402
from config.settings import config  # noqa: E402


def main() -> None:
    """Rebuild the vector store and print a summary of the processed chunks.

    Verifies that ``config.RAW_TEXT_PATH`` exists, runs
    ``OptimizedTextProcessor.process_directory()`` to obtain the chunks,
    passes them to ``build_vector_store()``, then prints the total chunk
    count and a per-``chunk_type`` breakdown.

    Exits the process with status 1 when the raw text path does not exist
    or when processing yields no chunks.
    """
    print("Rebuilding MANIT RAG Vector Store with optimized chunking...")

    # Check if raw texts exist
    if not os.path.exists(config.RAW_TEXT_PATH):
        print(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")
        sys.exit(1)

    # Process with optimized processor
    processor = OptimizedTextProcessor()
    chunks = processor.process_directory()

    # Stop before touching the vector store if there is nothing to index.
    if not chunks:
        print("No chunks were processed. Check your input files.")
        sys.exit(1)

    print(f"Processed {len(chunks)} optimized chunks")
    processor.build_vector_store(chunks)

    print("✅ Optimized vector store built successfully!")
    print(f"📊 Total chunks: {len(chunks)}")

    # Display chunk type distribution. Each chunk is indexed by 'metadata';
    # chunks whose metadata has no 'chunk_type' are counted as 'unknown'.
    chunk_types = Counter(
        chunk['metadata'].get('chunk_type', 'unknown') for chunk in chunks
    )

    print("\n📈 Chunk distribution:")
    for chunk_type, count in sorted(chunk_types.items()):
        print(f" - {chunk_type}: {count} chunks")


if __name__ == "__main__":
    main()