# my-gradio-app / data_mining / mining_vimedical.py
# Nguyen Trong Lap
# Recreate history without binary blobs
# eeb0f9c
"""
ViMedical Disease Dataset - Download & Process
Downloads and processes Vietnamese medical disease dataset into ChromaDB
Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples)
"""
import requests
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
import re
def download_vimedical():
    """Fetch the ViMedical disease CSV from HuggingFace and save it locally.

    The file is written to ``data_mining/datasets/vimedical_disease.csv``;
    the target directory is created first when it does not exist.

    Returns:
        True when the file was downloaded and written, False when any step
        of the request or the save raised an exception (the error is printed).
    """
    print("📥 Downloading ViMedical Disease dataset...")

    dataset_url = "https://huggingface.co/datasets/PB3002/ViMedical_Disease/resolve/main/ViMedical_Disease.csv"
    output_path = "data_mining/datasets/vimedical_disease.csv"
    bytes_per_mb = 1024 * 1024

    # The directory is prepared outside the try block, so a failure here
    # propagates to the caller instead of being reported as a download error.
    os.makedirs("data_mining/datasets", exist_ok=True)

    try:
        reply = requests.get(dataset_url, timeout=60)
        reply.raise_for_status()  # HTTP error statuses count as failures

        with open(output_path, 'wb') as csv_file:
            csv_file.write(reply.content)

        size_in_mb = os.path.getsize(output_path) / bytes_per_mb
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 File size: {size_in_mb:.2f} MB")
    except Exception as error:
        print(f"❌ Download failed: {error}")
        return False

    return True
def extract_symptoms(question):
    """Extract the bare symptom description from a patient question.

    Surrounding whitespace is trimmed, then at most one known leading
    phrase (e.g. ``'Tôi bị '``) and at most one known trailing question
    (e.g. ``'. Tôi bị bệnh gì?'``) are removed. Phrases are tried in the
    order listed and only the first match is stripped.

    Args:
        question: The raw question text. Non-string values (``None``, or
            the NaN floats pandas yields for empty cells) are treated as
            "no symptom" instead of raising.

    Returns:
        The symptom text with surrounding whitespace removed, or an empty
        string when nothing usable was supplied.
    """
    # Guard against missing values: a float NaN has no .startswith().
    if not isinstance(question, str):
        return ""

    # Longest / most specific phrases first; matching stops at the first hit.
    prefixes = (
        'Tôi đang có triệu chứng như ',
        'Tôi thường xuyên ',
        'Tôi cảm thấy ',
        'Tôi bị ',
        'Tôi hay ',
        'Tôi có ',
    )
    suffixes = (
        '. Tôi bị bệnh gì?',
        '. Tôi có thể bị gì?',
        '. Đó là bệnh gì?',
    )

    # Trim before matching so stray leading/trailing whitespace cannot
    # prevent a prefix or suffix from being recognised.
    symptom = question.strip()

    for prefix in prefixes:
        if symptom.startswith(prefix):
            symptom = symptom[len(prefix):]
            break

    for suffix in suffixes:
        if symptom.endswith(suffix):
            symptom = symptom[:-len(suffix)]
            break

    return symptom.strip()
def _directory_size_bytes(path):
    """Return the combined size in bytes of every file found under *path*."""
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
    )


def process_vimedical():
    """Build the ChromaDB collection from the downloaded ViMedical CSV.

    Reads ``data_mining/datasets/vimedical_disease.csv``, groups the rows by
    the ``Disease`` column, and for every disease stores one document (the
    disease name plus up to 10 extracted symptom descriptions), its
    embedding, and metadata in the ``medical_diseases`` collection persisted
    under ``data_mining/output/medical_chroma``.

    Returns:
        True when the collection was written, False when the CSV file is
        missing or lacks the ``Disease`` / ``Question`` columns.
    """
    print("\n🔨 Processing ViMedical dataset...")

    csv_path = "data_mining/datasets/vimedical_disease.csv"
    db_path = "data_mining/output/medical_chroma"

    # Load dataset
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = [
        column for column in ('Disease', 'Question') if column not in df.columns
    ]
    if missing_columns:
        print(f"❌ Dataset is missing required columns: {', '.join(missing_columns)}")
        return False

    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

    # Initialize embedder
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    # Initialize ChromaDB
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path=db_path)

    collection = client.get_or_create_collection(
        name="medical_diseases",
        metadata={"hnsw:space": "cosine"}
    )

    # Group by disease
    print("📝 Processing diseases...")
    disease_groups = df.groupby('Disease')
    total_diseases = len(disease_groups)  # hoisted: constant for the whole loop

    processed = 0
    for disease_name, group in disease_groups:
        # Empty cells are read as NaN; drop them (and coerce any other
        # non-text values) so symptom extraction only ever sees strings.
        questions = group['Question'].dropna().astype(str)
        symptoms = [
            symptom for symptom in map(extract_symptoms, questions) if symptom
        ]

        # Create document text (limited to the first 10 symptom examples)
        doc_text = f"Bệnh: {disease_name}\n\nTriệu chứng:\n"
        doc_text += "\n".join(f"- {s}" for s in symptoms[:10])

        embedding = embedder.encode(doc_text)

        # upsert, not add: the collection is persistent and the ids are
        # deterministic, so a re-run must replace the stored records rather
        # than collide with (or be ignored in favour of) the old ones.
        collection.upsert(
            ids=[f"disease_{processed:04d}"],
            embeddings=[embedding.tolist()],
            documents=[doc_text],
            metadatas=[{
                'disease_name': disease_name,
                'num_examples': len(symptoms),  # all examples, not just the 10 shown
                'source': 'ViMedical_Disease'
            }]
        )

        processed += 1
        if processed % 50 == 0:
            print(f" Processed {processed}/{total_diseases} diseases...")

    print(f"✅ Processed {processed} diseases")
    print(f"💾 Database saved to: {db_path}/")

    # Report the on-disk size of the persisted database
    total_size = _directory_size_bytes(db_path)
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
def main():
    """Run the whole pipeline: download the dataset, then build the database.

    Returns:
        True when both steps succeed, False as soon as one of them reports
        failure (processing is not attempted after a failed download).
    """
    banner = "=" * 60

    print(banner)
    print("ViMedical Disease Dataset - Download & Process")
    print(banner)

    # `and` short-circuits, so step 2 (process) only runs once step 1
    # (download) has succeeded.
    pipeline_ok = download_vimedical() and process_vimedical()
    if not pipeline_ok:
        return False

    print("\n" + banner)
    print("✅ ViMedical dataset ready!")
    print(banner)
    return True
if __name__ == "__main__":
    # Exit with status 0 on success and 1 on failure. SystemExit is raised
    # directly because the bare exit() helper is injected by the `site`
    # module for interactive use and is not guaranteed to exist in programs
    # (e.g. under `python -S` or in frozen/embedded interpreters).
    success = main()
    raise SystemExit(0 if success else 1)