Spaces:
Running
Running
| # Based on example code of https://huggingface.co/facebook/m2m100_1.2B | |
| # and https://github.com/wannaphong/ttsmms | |
| # See also https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md | |
| import gradio as gr | |
| import os | |
| import re | |
| import soundfile as sf | |
| import json | |
| import nltk | |
| from underthesea import sent_tokenize as vie_sent_tokenize # Vietnamese NLP toolkit | |
| from underthesea import text_normalize as vie_text_normalize | |
| from nltk import sent_tokenize as nltk_sent_tokenize | |
| from ttsmms import download | |
| from ttsmms import TTS | |
| from collections import OrderedDict | |
| import uuid | |
| import datetime | |
| import shutil | |
| from num2words import num2words | |
this_description = """Text To Speech for [1000+ languages](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) - using [fairseq MMS TTS](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/README.md) and [ttsmms](https://github.com/wannaphong/ttsmms) wrapper.
Please note that for some languages, it may not pronounce all words correctly (yet).
"""

# Sentence tokenizer models used by nltk_sent_tokenize below.
nltk.download("punkt")

# Pre-download a few frequently used languages so the first request for
# them does not pay the model-download cost.
tts_models = {}
eng_path = download("eng", "./data")
tts_models["eng"] = eng_path
vie_path = download("vie", "./data")
tts_models["vie"] = vie_path
mya_path = download("mya", "./data")
tts_models["mya"] = mya_path

# Map "Language name (iso)" -> iso code from the tab-separated lang_code.txt.
# Separator lines in that file start with "----".
lang_codes = OrderedDict()
with open("lang_code.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip separators and blank lines (a blank line would make the
        # tab-split below raise ValueError).
        if not line or line.startswith("----"):
            continue
        iso, lang = line.split("\t", 1)
        lang_codes[lang + " (" + iso + ")"] = iso
language_names = list(lang_codes.keys())

# Load num2words_lang_map: iso code -> list of num2words language codes.
with open("num2words_lang_map.json", encoding="utf-8") as f:
    num2words_lang_map = json.load(f, object_pairs_hook=OrderedDict)
def convert_numbers_to_words_num2words(text, lang):
    """Spell out every integer in *text* using num2words.

    Args:
        text: Input text that may contain runs of ASCII digits.
        lang: ISO code used as key into num2words_lang_map; the first
            mapped entry is the num2words language code.

    Returns:
        The text with each maximal digit run replaced by its word form.
    """
    num2words_code = num2words_lang_map[lang][0]
    # A single regex substitution converts each digit run in place. Unlike
    # the previous sorted str.replace() loop, it cannot accidentally touch
    # digits that are part of a different number elsewhere in the text.
    return re.sub(
        r"\d+", lambda m: num2words(int(m.group()), lang=num2words_code), text
    )
def convert_mya_numbers_to_words(text):
    """Replace Burmese numerals in *text* with their spelled-out word form."""
    # Imported lazily so the module loads even where mm_num2word is absent.
    from mm_num2word import mm_num2word, extract_num

    # Longest numbers first, so a short number is never substituted inside
    # a longer number that happens to contain it.
    nums = sorted(extract_num(text), key=len, reverse=True)
    print(nums)
    for num in nums:
        text = text.replace(num, mm_num2word(num))
    return text
def prepare_sentences(text, lang="mya"):
    """Normalize *text* and split it into a flat list of sentences.

    Language-specific pre-processing:
      * "mya": Burmese digits are spelled out and the Burmese section/
        sentence marks (U+104A / U+104B) become "," and ".".
      * any lang present in num2words_lang_map: digit runs are spelled out.
      * "vie": underthesea does both sentence splitting and normalization;
        everything else goes through nltk's sentence tokenizer.
    """
    code = lang.lower()
    if code == "mya":
        text = convert_mya_numbers_to_words(text)
        text = text.replace("\u104A", ",").replace("\u104B", ".")
    if lang in num2words_lang_map:
        print("num2words supports this lang", lang)
        text = convert_numbers_to_words_num2words(text, lang)
        print("Processed text", text)
    # Lower-casing works around unclear pronunciation of the first word
    # (observed for Vietnamese; cause unknown).
    text = text.lower()
    paragraphs = [p for p in text.split("\n") if p.strip()]

    sentences = []
    if code == "vie":
        for paragraph in paragraphs:
            for sentence in vie_sent_tokenize(paragraph):
                if sentence.strip():
                    sentences.append(vie_text_normalize(sentence))
    else:
        for paragraph in paragraphs:
            for sentence in nltk_sent_tokenize(paragraph):
                if sentence.strip():
                    sentences.append(sentence)
    return sentences
def list_dir(lang):
    """Debug helper: log the .wav count in the CWD and the newest output.

    Combined outputs are named "<timestamp>_<lang>.wav", so the
    alphabetically greatest file name is the most recently written one.
    """
    current_dir = os.getcwd()
    print(current_dir)
    wav_files = [f for f in os.listdir(current_dir) if f.endswith(".wav")]
    print("Total wav files:", len(wav_files))
    if wav_files:
        # max() gives the alphabetically last name without a full sort.
        print(lang, max(wav_files))
    else:
        # Previously sorted_list[-1] raised IndexError on the very first run.
        print(lang, "no wav files found")
def combine_wav(source_dir, stamp, lang):
    """Concatenate all per-sentence WAV files in *source_dir* into one file.

    Files are combined alphabetically (they carry zero-padded sentence
    indices), written to "<stamp>_<lang>.wav" in the CWD, and the
    temporary source directory is removed afterwards.

    Returns:
        Path of the combined WAV file.

    Raises:
        ValueError: if *source_dir* contains no .wav files. (Previously
            this path crashed with an unbound-variable NameError on `sr`.)
    """
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"No .wav files to combine in {source_dir}")
    combined_data = []
    sr = None
    for file in wav_files:
        data, sr = sf.read(os.path.join(source_dir, file))
        combined_data.extend(data)
    # NOTE(review): assumes every chunk shares one sample rate (true for a
    # single MMS model); the last file's rate is what gets written.
    combined_file_path = f"{stamp}_{lang}.wav"
    sf.write(combined_file_path, combined_data, sr)
    # The per-sentence files are no longer needed once combined.
    shutil.rmtree(source_dir)
    list_dir(lang)
    return combined_file_path
def mms_tts(Input_Text, lang_name="Burmese (mya)"):
    """Synthesize *Input_Text* with the MMS model for *lang_name*.

    Downloads the model on demand, renders one WAV per sentence into a
    unique per-request directory, then merges them into a single file
    whose path is returned (consumed by the gradio audio output).
    """
    # Unknown dropdown values fall back to Burmese.
    lang_code = lang_codes.get(lang_name, "mya")
    tts = TTS(download(lang_code, "./data"))
    sentences = prepare_sentences(Input_Text, lang_code)

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    if os.path.exists(user_dir):
        # Microsecond-timestamp collision between concurrent requests:
        # disambiguate with a random session id.
        session_id = str(uuid.uuid4())
        user_dir = f"u_{session_id}_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)
    print("New user directory", user_dir)

    for idx, sentence in enumerate(sentences):
        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(idx).zfill(10)}.wav")
    return combine_wav(user_dir, timestamp, lang_code)
# Gradio UI: a text box plus a language dropdown in, an audio player out.
text_input = gr.Textbox(
    lines=5,
    placeholder="Enter text (unlimited sentences)",
    label="Input text (unlimited sentences)",
)
language_dropdown = gr.Dropdown(
    choices=language_names,
    label="Select language 1,000+",
    value="Burmese (mya)",
)
iface = gr.Interface(
    fn=mms_tts,
    title="Massively Multilingual Speech (MMS) - Text To Speech",
    description=this_description,
    inputs=[text_input, language_dropdown],
    outputs="audio",
)
iface.launch()