# ielts-grader-api/src/pronunciation.py
import torch
import librosa
import numpy as np
import os
import traceback
import subprocess
import shutil
from transformers import (
    Wav2Vec2ForCTC,
    AutoTokenizer,
    Wav2Vec2FeatureExtractor,
)

print("Loading Pronunciation module...")
MODEL_ID = "facebook/wav2vec2-lv-60-espeak-cv-ft"
model = None
tokenizer = None
feature_extractor = None
def find_espeak_exe():
    """Locate the espeak-ng executable on PATH or in common Windows install locations."""
    candidates = [
        r"C:\Program Files\eSpeak NG\espeak-ng.exe",
        r"C:\Program Files (x86)\eSpeak NG\espeak-ng.exe",
        r"D:\Program Files\eSpeak NG\espeak-ng.exe",
    ]
    path_in_env = shutil.which("espeak-ng")
    if path_in_env:
        return path_in_env
    for path in candidates:
        if os.path.exists(path):
            return path
    return None

ESPEAK_PATH = find_espeak_exe()
if ESPEAK_PATH:
    print(f"Found eSpeak at: {ESPEAK_PATH}")
else:
    print("WARNING: eSpeak-ng not found. IPA generation will fail.")

try:
    print("Loading Feature Extractor...")
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_ID)
    print("Loading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    print("Loading Acoustic Model...")
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
    print("Pronunciation module ready.")
except Exception as e:
    print(f"Failed to load AI model: {e}")

def get_expected_ipa(text):
    """Run espeak-ng as a subprocess to get the reference IPA transcription of a text."""
    if not ESPEAK_PATH:
        return "N/A"
    try:
        cmd = [ESPEAK_PATH, "-v", "en-us", "-q", "--ipa", text]
        startupinfo = None
        if os.name == 'nt':
            # Keep a console window from flashing up on Windows.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding='utf-8',
            startupinfo=startupinfo
        )
        if result.returncode == 0:
            return result.stdout.strip().replace('\n', ' ')
        else:
            return "N/A"
    except Exception as e:
        print(f"Subprocess error: {e}")
        return "N/A"
def grade_pronunciation_advanced(audio_path, reference_text):
    """
    Return the IPA actually spoken in the audio and the reference IPA for the text.
    """
    actual_ipa = "N/A"
    if model and tokenizer and feature_extractor:
        try:
            # Load the audio at 16 kHz, the sampling rate the wav2vec2 model expects.
            y, sr = librosa.load(audio_path, sr=16000)
            input_values = feature_extractor(y, sampling_rate=16000, return_tensors="pt").input_values
            with torch.no_grad():
                logits = model(input_values).logits
            # Greedy CTC decoding: pick the most likely phoneme token per frame.
            predicted_ids = torch.argmax(logits, dim=-1)
            actual_ipa = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        except Exception as e:
            print(f"AI IPA Error: {e}")
            actual_ipa = "Error"
    expected_ipa = get_expected_ipa(reference_text)
    return {
        "actual_ipa": actual_ipa,
        "expected_ipa": expected_ipa
    }
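

# Minimal local usage sketch. Assumptions: "sample.wav" is a hypothetical recording
# of the reference sentence, and both the wav2vec2 model and eSpeak NG loaded above
# are available; otherwise the fields fall back to "N/A"/"Error" as coded above.
if __name__ == "__main__":
    demo = grade_pronunciation_advanced(
        "sample.wav",
        "The quick brown fox jumps over the lazy dog."
    )
    print("Actual IPA:  ", demo["actual_ipa"])
    print("Expected IPA:", demo["expected_ipa"])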