Spaces:
Runtime error
Runtime error
| """ | |
| ViMedical Disease Dataset - Download & Process | |
| Downloads and processes Vietnamese medical disease dataset into ChromaDB | |
| Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples) | |
| """ | |
| import requests | |
| import pandas as pd | |
| import chromadb | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
| import re | |
def download_vimedical():
    """Download the ViMedical disease CSV from HuggingFace.

    The response is streamed to a temporary ``.part`` file and moved to the
    final path only after the whole body has been written, so a failed or
    interrupted download never leaves a truncated CSV where the processing
    step would pick it up.

    Returns:
        bool: True when the CSV was saved to
        ``data_mining/datasets/vimedical_disease.csv``; False when a network
        or filesystem error occurred (the error is printed).
    """
    print("📥 Downloading ViMedical Disease dataset...")

    # HuggingFace dataset URL
    url = "https://huggingface.co/datasets/PB3002/ViMedical_Disease/resolve/main/ViMedical_Disease.csv"
    output_path = "data_mining/datasets/vimedical_disease.csv"
    partial_path = output_path + ".part"

    try:
        # Directory creation is inside the try so that a permissions problem
        # is reported like any other download failure.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Stream the body in chunks instead of buffering it all in memory.
        with requests.get(url, timeout=60, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Atomic rename: the final path only ever holds a complete file.
        os.replace(partial_path, output_path)
        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    except (requests.RequestException, OSError) as e:
        print(f"❌ Download failed: {e}")
        # Best-effort cleanup of the partial file; it may not exist at all.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False

    print(f"✅ Downloaded: {output_path}")
    print(f"📊 File size: {file_size:.2f} MB")
    return True
def extract_symptoms(question):
    """Extract the symptom description from a patient question.

    Strips at most one known leading phrase (e.g. ``'Tôi bị '``) and at most
    one known trailing question (e.g. ``'. Tôi bị bệnh gì?'``), then trims
    surrounding whitespace.

    Args:
        question: The raw question text. Non-string values (``None``, or the
            float NaN that pandas produces for empty cells) are treated as
            "no symptom".

    Returns:
        str: The symptom text, or an empty string when nothing remains or
        the input is not a string.
    """
    if not isinstance(question, str):
        return ""

    # Remove common prefixes
    prefixes = (
        'Tôi đang có triệu chứng như ',
        'Tôi thường xuyên ',
        'Tôi cảm thấy ',
        'Tôi bị ',
        'Tôi hay ',
        'Tôi có ',
    )
    # Only the left side is trimmed here: the prefixes end with a space, so
    # trimming the right side first could stop a bare prefix from matching.
    symptom = question.lstrip()
    for prefix in prefixes:
        if symptom.startswith(prefix):
            symptom = symptom[len(prefix):]
            break

    # Remove question suffix
    suffixes = (
        '. Tôi bị bệnh gì?',
        '. Tôi có thể bị gì?',
        '. Đó là bệnh gì?',
    )
    # Trailing whitespace/newlines would otherwise defeat the endswith check.
    symptom = symptom.rstrip()
    for suffix in suffixes:
        if symptom.endswith(suffix):
            symptom = symptom[:-len(suffix)]
            break

    return symptom.strip()
def _directory_size_bytes(path):
    """Return the combined size in bytes of every file under *path*."""
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _, filenames in os.walk(path)
        for filename in filenames
    )


def process_vimedical():
    """Process the ViMedical CSV and build the ChromaDB collection.

    Reads ``data_mining/datasets/vimedical_disease.csv``, groups its rows by
    the ``Disease`` column, builds one text document per disease from up to
    ten symptom descriptions extracted from the ``Question`` column, embeds
    it with the ``keepitreal/vietnamese-sbert`` model and stores it in the
    ``medical_diseases`` collection (cosine space) of a persistent ChromaDB
    under ``data_mining/output/medical_chroma``.

    Returns:
        bool: True when processing finished; False when the CSV file is
        missing or lacks the required columns.
    """
    print("\n🔨 Processing ViMedical dataset...")

    # Load dataset
    csv_path = "data_mining/datasets/vimedical_disease.csv"
    db_path = "data_mining/output/medical_chroma"

    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)

    # Fail with a readable message instead of a KeyError further down.
    missing = [col for col in ('Disease', 'Question') if col not in df.columns]
    if missing:
        print(f"❌ Dataset is missing required column(s): {', '.join(missing)}")
        return False

    print(f"📊 Loaded {len(df)} records")
    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path=db_path)

    # Create collection
    collection = client.get_or_create_collection(
        name="medical_diseases",
        metadata={"hnsw:space": "cosine"}
    )

    # Group by disease
    print("📝 Processing diseases...")
    disease_groups = df.groupby('Disease')
    total = len(disease_groups)

    processed = 0
    for index, (disease_name, group) in enumerate(disease_groups):
        # Empty cells arrive as NaN; drop them so extraction only ever sees
        # text, and coerce any stray non-string values.
        questions = group['Question'].dropna().astype(str)
        symptoms = [
            symptom
            for symptom in map(extract_symptoms, questions)
            if symptom
        ]

        # Create document text
        doc_text = f"Bệnh: {disease_name}\n\nTriệu chứng:\n"
        doc_text += "\n".join(f"- {s}" for s in symptoms[:10])  # Limit to 10 examples

        # Generate embedding
        embedding = embedder.encode(doc_text)

        # The collection is persistent and the ids are deterministic, so a
        # second run would collide with existing records. upsert() replaces
        # them with the fresh data, whereas add() does not overwrite records
        # whose ids already exist.
        collection.upsert(
            ids=[f"disease_{index:04d}"],
            embeddings=[embedding.tolist()],
            documents=[doc_text],
            metadatas=[{
                'disease_name': disease_name,
                'num_examples': len(symptoms),
                'source': 'ViMedical_Disease'
            }]
        )

        processed = index + 1
        if processed % 50 == 0:
            print(f" Processed {processed}/{total} diseases...")

    print(f"✅ Processed {processed} diseases")
    print(f"💾 Database saved to: {db_path}/")

    # Get database size
    total_size = _directory_size_bytes(db_path)
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the full pipeline: download the dataset, then process it.

    Returns:
        bool: True when both steps succeeded, False as soon as one fails
        (processing is not attempted after a failed download).
    """
    banner = "=" * 60

    print(banner)
    print("ViMedical Disease Dataset - Download & Process")
    print(banner)

    # Step 1: Download, Step 2: Process. `and` short-circuits, so the
    # processing step only runs when the download reported success.
    pipeline_ok = download_vimedical() and process_vimedical()
    if not pipeline_ok:
        return False

    print("\n" + banner)
    print("✅ ViMedical dataset ready!")
    print(banner)
    return True
if __name__ == "__main__":
    # The process exit status mirrors main()'s result: 0 on success, 1 on
    # failure. SystemExit is raised directly because the bare exit() helper
    # is injected by the site module for interactive use and is not
    # guaranteed to exist (for example under `python -S`).
    success = main()
    raise SystemExit(0 if success else 1)