Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Rebuild vector store with optimized processing""" | |
import os
import sys
from collections import Counter
from pathlib import Path
# Put the directory that holds this script at the front of the import search
# path; the `src.` and `config.` imports that follow presumably rely on it.
project_root = Path(__file__).parent
sys.path.insert(0, os.fspath(project_root))
| from src.preprocessing.advanced_processor import OptimizedTextProcessor | |
| from config.settings import config | |
def main():
    """Rebuild the vector store and print a summary of the processed chunks.

    Steps, in order:
      1. Verify that ``config.RAW_TEXT_PATH`` exists.
      2. Run ``OptimizedTextProcessor.process_directory()`` to obtain chunks.
      3. Pass the chunks to ``build_vector_store``.
      4. Print the total chunk count and a per-``chunk_type`` breakdown.

    Exits the process with status 1, with the message written to stderr,
    when the raw text path is missing or when no chunks were produced.
    """
    print("Rebuilding MANIT RAG Vector Store with optimized chunking...")

    # Fail fast when the configured input path is missing.
    if not os.path.exists(config.RAW_TEXT_PATH):
        # sys.exit(<str>) prints the message to stderr and exits with status 1.
        sys.exit(f"Error: Raw text path {config.RAW_TEXT_PATH} does not exist")

    processor = OptimizedTextProcessor()
    chunks = processor.process_directory()
    if not chunks:
        sys.exit("No chunks were processed. Check your input files.")

    print(f"Processed {len(chunks)} optimized chunks")
    processor.build_vector_store(chunks)

    # NOTE(review): the leading "β" / "π" characters below look like
    # mis-decoded emoji; kept as-is here -- confirm against the original file.
    print("β Optimized vector store built successfully!")
    print(f"π Total chunks: {len(chunks)}")

    # Tally chunks per type. Each chunk is indexed as a mapping with a
    # 'metadata' entry; a missing 'chunk_type' is counted as 'unknown'.
    chunk_types = Counter(
        chunk['metadata'].get('chunk_type', 'unknown') for chunk in chunks
    )
    print("\nπ Chunk distribution:")
    for chunk_type, count in sorted(chunk_types.items()):
        print(f" - {chunk_type}: {count} chunks")


if __name__ == "__main__":
    main()