# (Hugging Face Spaces status banner captured by the page scrape — not part of the source.)
| import torch | |
| import librosa | |
| import numpy as np | |
| import os | |
| import traceback | |
| import subprocess | |
| import shutil | |
| from transformers import ( | |
| Wav2Vec2ForCTC, | |
| AutoTokenizer, | |
| Wav2Vec2FeatureExtractor | |
| ) | |
# Announce import so startup logs show load progress for this module.
print("Loading Pronunciation module...")

# Wav2Vec2 checkpoint fine-tuned (via espeak phonemes) to emit IPA symbols.
MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Populated by the loader below; stay None if model loading fails, and the
# grading function checks for that before running inference.
model = tokenizer = feature_extractor = None
def find_espeak_exe():
    """Locate an eSpeak NG executable on this machine.

    Search order (unchanged priority vs. earlier behavior, with one
    backward-compatible extension at the end):
      1. ``espeak-ng`` on PATH (covers Linux/macOS and well-installed Windows),
      2. common hard-coded Windows install locations,
      3. the legacy ``espeak`` binary name on PATH as a last resort.

    Returns:
        The absolute path to the executable as a string, or ``None`` if no
        eSpeak binary could be found.
    """
    path_in_env = shutil.which("espeak-ng")
    if path_in_env:
        return path_in_env

    # Fallbacks for Windows installs that never got added to PATH.
    candidates = [
        r"C:\Program Files\eSpeak NG\espeak-ng.exe",
        r"C:\Program Files (x86)\eSpeak NG\espeak-ng.exe",
        r"D:\Program Files\eSpeak NG\espeak-ng.exe",
    ]
    for path in candidates:
        if os.path.exists(path):
            return path

    # Some distros ship the older binary name; it accepts the same flags
    # we use (-v, -q, --ipa), so it is an acceptable last resort.
    return shutil.which("espeak")
# Resolve the eSpeak binary once at import time; get_expected_ipa() checks
# this before every subprocess call.
ESPEAK_PATH = find_espeak_exe()

if not ESPEAK_PATH:
    print("WARNING: eSpeak-ng not found. IPA generation will fail.")
else:
    print(f"Found eSpeak at: {ESPEAK_PATH}")
# Eagerly load the three model components at import time so the first grading
# request does not pay the multi-second download/deserialization cost.
# On failure, model/tokenizer/feature_extractor remain None (set above) and
# grade_pronunciation_advanced degrades gracefully to "N/A".
try:
    print("Loading Feature Extractor...")
    # Normalizes raw audio into the input tensors the acoustic model expects.
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
    print("Loading Tokenizer...")
    # Maps CTC output ids back to IPA phoneme strings.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    print("Loading Acoustic Model...")
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    print("Pronunciation module ready.")
except Exception as e:
    # Broad catch is deliberate at this top-level boundary: the module must
    # still import (with grading disabled) if the download or load fails.
    print(f"Failed to load AI model: {e}")
def get_expected_ipa(text):
    """Return the reference IPA transcription of *text* via the espeak-ng CLI.

    Invokes the executable located at module load (``ESPEAK_PATH``) with the
    American English voice in quiet mode, capturing IPA output only.

    Args:
        text: The text to transcribe.

    Returns:
        The IPA string with newlines collapsed to spaces, or ``"N/A"`` when
        espeak is unavailable, exits non-zero, times out, or errors out.
    """
    if not ESPEAK_PATH:
        return "N/A"
    try:
        # -q suppresses audio playback; --ipa prints phonemes only.
        cmd = [ESPEAK_PATH, "-v", "en-us", "-q", "--ipa", text]

        # On Windows, hide the console window the child process would flash.
        startupinfo = None
        if os.name == 'nt':
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',  # never raise on odd bytes in espeak output
            timeout=15,        # guard the caller against a hung espeak process
            startupinfo=startupinfo
        )
        if result.returncode == 0:
            # espeak emits one line per clause; flatten to a single line.
            return result.stdout.strip().replace('\n', ' ')
        return "N/A"
    except Exception as e:
        # Includes TimeoutExpired and OSError. This is a best-effort helper:
        # report and fall back rather than raising into the request path.
        print(f"Subprocess error: {e}")
        return "N/A"
def grade_pronunciation_advanced(audio_path, reference_text):
    """Return the learner's actual IPA (from audio) and the expected IPA (from text).

    Args:
        audio_path: Path to the learner's recording.
        reference_text: The text the learner was asked to read.

    Returns:
        dict: ``{"actual_ipa": ..., "expected_ipa": ...}`` where actual_ipa is
        the IPA decoded from the audio ("N/A" if models are unavailable,
        "Error" on inference failure) and expected_ipa comes from espeak
        ("N/A" if unavailable).
    """
    actual_ipa = "N/A"
    models_ready = model and tokenizer and feature_extractor
    if models_ready:
        try:
            # Resample to the 16 kHz rate the wav2vec2 checkpoint was trained on.
            waveform, _sr = librosa.load(audio_path, sr=16000)
            inputs = feature_extractor(
                waveform, sampling_rate=16000, return_tensors="pt"
            ).input_values
            # Inference only — no gradients needed.
            with torch.no_grad():
                logits = model(inputs).logits
            best_ids = torch.argmax(logits, dim=-1)
            actual_ipa = tokenizer.batch_decode(best_ids, skip_special_tokens=True)[0]
        except Exception as e:
            print(f"AI IPA Error: {e}")
            actual_ipa = "Error"

    return {
        "actual_ipa": actual_ipa,
        "expected_ipa": get_expected_ipa(reference_text),
    }