Spaces:

lapnt3
/

my-gradio-app

Runtime error

my-gradio-app / data_mining /mining_vimedical.py

Nguyen Trong Lap

Recreate history without binary blobs

eeb0f9c about 2 months ago

5.34 kB

	"""
	ViMedical Disease Dataset - Download & Process
	Downloads and processes Vietnamese medical disease dataset into ChromaDB
	Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples)
	"""

	import requests
	import pandas as pd
	import chromadb
	from sentence_transformers import SentenceTransformer
	import os
	import re

	def download_vimedical():
	    """Download the ViMedical disease CSV from HuggingFace.

	    The response is streamed to a temporary ``.part`` file and moved to
	    ``data_mining/datasets/vimedical_disease.csv`` only after the transfer
	    has completed, so an interrupted download never leaves a truncated CSV
	    at the final path.

	    Returns:
	        bool: True if the file was downloaded and saved, False if the HTTP
	        request or the file write failed (the error is printed).

	    Raises:
	        OSError: If the ``data_mining/datasets`` directory cannot be created.
	    """
	    print("📥 Downloading ViMedical Disease dataset...")

	    # HuggingFace dataset URL
	    url = "https://huggingface.co/datasets/PB3002/ViMedical_Disease/resolve/main/ViMedical_Disease.csv"

	    # Create datasets directory
	    os.makedirs("data_mining/datasets", exist_ok=True)
	    output_path = "data_mining/datasets/vimedical_disease.csv"
	    partial_path = output_path + ".part"

	    try:
	        # Stream the body in chunks instead of holding the whole file in memory.
	        with requests.get(url, timeout=60, stream=True) as response:
	            response.raise_for_status()
	            with open(partial_path, 'wb') as f:
	                for chunk in response.iter_content(chunk_size=1024 * 1024):
	                    f.write(chunk)

	        # Publish the file only once it is complete.
	        os.replace(partial_path, output_path)

	        # Check file size
	        file_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
	    except (requests.RequestException, OSError) as e:
	        print(f"❌ Download failed: {e}")
	        # Best-effort cleanup of the incomplete temporary file.
	        try:
	            os.remove(partial_path)
	        except OSError:
	            pass
	        return False

	    print(f"✅ Downloaded: {output_path}")
	    print(f"📊 File size: {file_size:.2f} MB")
	    return True

	def extract_symptoms(question):
	    """Extract the symptom description from a patient question.

	    Removes at most one known leading phrase (for example ``'Tôi bị '``) and
	    at most one known trailing question (for example ``'. Tôi bị bệnh gì?'``).
	    Surrounding whitespace is stripped before matching so that padded
	    values are still recognised.

	    Args:
	        question: Question text. Non-string values (``None``, or the float
	            NaN that pandas produces for empty CSV cells) are treated as
	            having no symptom text.

	    Returns:
	        str: The remaining symptom text, stripped of surrounding whitespace;
	        an empty string when there is nothing to extract.
	    """
	    if not isinstance(question, str):
	        return ""

	    # Order matters: the first matching entry wins.
	    prefixes = (
	        'Tôi đang có triệu chứng như ',
	        'Tôi thường xuyên ',
	        'Tôi cảm thấy ',
	        'Tôi bị ',
	        'Tôi hay ',
	        'Tôi có ',
	    )
	    suffixes = (
	        '. Tôi bị bệnh gì?',
	        '. Tôi có thể bị gì?',
	        '. Đó là bệnh gì?',
	    )

	    symptom = question.strip()

	    # Remove the first matching prefix (an empty match removes nothing).
	    prefix = next((p for p in prefixes if symptom.startswith(p)), "")
	    symptom = symptom[len(prefix):]

	    # Remove the first matching question suffix.
	    suffix = next((s for s in suffixes if symptom.endswith(s)), "")
	    symptom = symptom[:len(symptom) - len(suffix)]

	    return symptom.strip()

	def _build_disease_document(disease_name, symptoms, max_examples=10):
	    """Format a disease name and its first ``max_examples`` symptoms as text."""
	    bullet_lines = "\n".join(f"- {s}" for s in symptoms[:max_examples])
	    return f"Bệnh: {disease_name}\n\nTriệu chứng:\n{bullet_lines}"


	def _directory_size_bytes(path):
	    """Return the total size in bytes of all files found under ``path``."""
	    return sum(
	        os.path.getsize(os.path.join(dirpath, filename))
	        for dirpath, _dirnames, filenames in os.walk(path)
	        for filename in filenames
	    )


	def process_vimedical():
	    """Build the ChromaDB collection from the downloaded ViMedical CSV.

	    Reads ``data_mining/datasets/vimedical_disease.csv``, groups rows by the
	    ``Disease`` column, turns each group's ``Question`` values into a list
	    of symptom descriptions, embeds one document per disease and stores it
	    in the ``medical_diseases`` collection persisted under
	    ``data_mining/output/medical_chroma``.

	    Returns:
	        bool: True when all diseases were written; False when the CSV file
	        is missing or lacks the ``Disease`` / ``Question`` columns.
	    """
	    print("\n🔨 Processing ViMedical dataset...")

	    csv_path = "data_mining/datasets/vimedical_disease.csv"
	    db_path = "data_mining/output/medical_chroma"

	    # Load dataset
	    if not os.path.exists(csv_path):
	        print(f"❌ Dataset not found: {csv_path}")
	        return False

	    df = pd.read_csv(csv_path)
	    print(f"📊 Loaded {len(df)} records")

	    # Fail with a message instead of a KeyError when the schema is unexpected.
	    missing_columns = [c for c in ('Disease', 'Question') if c not in df.columns]
	    if missing_columns:
	        print(f"❌ Dataset is missing required columns: {', '.join(missing_columns)}")
	        return False

	    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

	    # Initialize embedder
	    print("🤖 Loading embedding model...")
	    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

	    # Initialize ChromaDB
	    print("💾 Initializing ChromaDB...")
	    os.makedirs("data_mining/output", exist_ok=True)
	    client = chromadb.PersistentClient(path=db_path)

	    # Create collection
	    collection = client.get_or_create_collection(
	        name="medical_diseases",
	        metadata={"hnsw:space": "cosine"},
	    )

	    # Group by disease
	    print("📝 Processing diseases...")
	    disease_groups = df.groupby('Disease')
	    total_diseases = len(disease_groups)

	    processed = 0
	    for disease_name, group in disease_groups:
	        # Empty Question cells load as NaN; drop them so only text reaches
	        # extract_symptoms.
	        extracted = (
	            extract_symptoms(str(question))
	            for question in group['Question'].dropna()
	        )
	        symptoms = [symptom for symptom in extracted if symptom]

	        doc_text = _build_disease_document(disease_name, symptoms)

	        # Generate embedding
	        embedding = embedder.encode(doc_text)

	        # upsert rather than add: the collection may already exist from an
	        # earlier run, and add() never overwrites an existing ID, which
	        # would leave stale documents in place after the CSV changes.
	        collection.upsert(
	            ids=[f"disease_{processed:04d}"],
	            embeddings=[embedding.tolist()],
	            documents=[doc_text],
	            metadatas=[{
	                'disease_name': disease_name,
	                'num_examples': len(symptoms),
	                'source': 'ViMedical_Disease',
	            }],
	        )

	        processed += 1
	        if processed % 50 == 0:
	            print(f" Processed {processed}/{total_diseases} diseases...")

	    print(f"✅ Processed {processed} diseases")
	    print(f"💾 Database saved to: {db_path}/")

	    # Get database size
	    total_size = _directory_size_bytes(db_path)
	    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

	    return True

	def main():
	    """Run the full pipeline: download the dataset, then build the database.

	    Returns:
	        bool: True when both steps succeeded, False as soon as one fails
	        (processing is not attempted after a failed download).
	    """
	    banner = "=" * 60

	    print(banner)
	    print("ViMedical Disease Dataset - Download & Process")
	    print(banner)

	    # Step 1 (download) must succeed before step 2 (process) is attempted;
	    # `and` short-circuits, so a failed download skips processing.
	    pipeline_ok = download_vimedical() and process_vimedical()
	    if not pipeline_ok:
	        return False

	    print("\n" + banner)
	    print("✅ ViMedical dataset ready!")
	    print(banner)
	    return True

	if __name__ == "__main__":
	    # The bare exit() helper is installed by the site module for interactive
	    # use and may be absent (e.g. under `python -S`); raising SystemExit
	    # sets the same process exit code without relying on it.
	    raise SystemExit(0 if main() else 1)