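# Gradio Space: speech-to-speech translation for Indian languages.
# Pipeline: IndicSeamless (speech -> translated text) feeding
# Indic Parler-TTS (text -> speech in a selected voice).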
import os
import io

import torch
import torchaudio
import numpy as np
import gradio as gr
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
    AutoTokenizer,
    AutoFeatureExtractor,
)
from pydub import AudioSegment
import nltk
from parler_tts import ParlerTTSForConditionalGeneration
from lang_list import LANGUAGE_NAME_TO_CODE, ASR_TARGET_LANGUAGE_NAMES, S2TT_TARGET_LANGUAGE_NAMES

# Sentence-tokenizer data, used to chunk TTS input sentence by sentence.
nltk.download("punkt_tab")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
SAMPLE_RATE = 16000  # Seamless expects 16 kHz input
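# Speech-to-text translation: AI4Bharat's IndicSeamless, a SeamlessM4T-v2
# checkpoint covering English and Indian languages.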
stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    "ai4bharat/indic-seamless",
    torch_dtype=DTYPE,
).to(DEVICE)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless")
stt_tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless")
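# Text-to-speech: Indic Parler-TTS. Both the pretrained and the finetuned
# checkpoints are loaded so the UI can switch between them.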
repo_id = "ai4bharat/indic-parler-tts-pretrained"
finetuned_repo_id = "ai4bharat/indic-parler-tts"

tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)
finetuned_tts = ParlerTTSForConditionalGeneration.from_pretrained(
    finetuned_repo_id,
    attn_implementation="eager",
    torch_dtype=DTYPE,
).to(DEVICE)

tts_tokenizer = AutoTokenizer.from_pretrained(repo_id)
# The voice description is encoded with the model's text encoder tokenizer (Flan-T5).
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]
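# Parler-TTS emits float waveforms; pydub's AudioSegment wants raw 16-bit PCM,
# so floats are peak-normalized and converted before MP3 export.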
def numpy_to_mp3(audio_array, sampling_rate):
    if np.issubdtype(audio_array.dtype, np.floating):
        max_val = np.max(np.abs(audio_array))
        if max_val > 0:  # guard against division by zero on silent audio
            audio_array = audio_array / max_val * 32767
        audio_array = audio_array.astype(np.int16)
    segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )
    mp3_io = io.BytesIO()
    segment.export(mp3_io, format="mp3", bitrate="320k")
    return mp3_io.getvalue()
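# Seamless performs speech-to-text translation in one step: it consumes 16 kHz
# audio and decodes text directly in the requested target language, with no
# separate ASR and MT stages.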
def transcribe_and_translate(audio_path, source_language, target_language):
    # source_language is accepted for UI symmetry but not passed to the model.
    wav, orig_sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
    wav = wav.mean(dim=0)  # downmix to mono; the feature extractor expects one channel
    inputs = feature_extractor(wav.numpy(), sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
    tgt = LANGUAGE_NAME_TO_CODE[target_language]
    gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
    text = stt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return text
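# Speech is synthesized sentence by sentence (NLTK sentence splitting) and the
# clips are concatenated, keeping each generate() call short.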
def generate_tts(text, voice, finetuned=False):
    description = f"{voice} speaks in a neutral tone with clear audio."
    sentences = nltk.sent_tokenize(text)
    model = finetuned_tts if finetuned else tts_model
    # The description is identical for every sentence, so encode it once.
    desc_inputs = description_tokenizer(description, return_tensors="pt").to(DEVICE)
    all_audio = []
    for sent in sentences:
        prompt_inputs = tts_tokenizer(sent, return_tensors="pt").to(DEVICE)
        gen = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
            do_sample=True,
            return_dict_in_generate=True,
        )
        if hasattr(gen, "sequences") and hasattr(gen, "audios_length"):
            # Trim the generated waveform to its true length before concatenating.
            audio = gen.sequences[0, : gen.audios_length[0]]
            all_audio.append(audio.to(torch.float32).cpu().numpy().flatten())
    if not all_audio:
        raise gr.Error("TTS generation returned no audio.")
    combined = np.concatenate(all_audio)
    return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
def pipeline(audio_path, source_language, target_language, voice, finetuned):
    text = transcribe_and_translate(audio_path, source_language, target_language)
    audio_bytes = generate_tts(text, voice, finetuned)
    return text, audio_bytes
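# Gradio front end: audio and language/voice controls in, translated text and
# synthesized speech out.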
def build_ui():
    with gr.Blocks() as demo:
        gr.Markdown(
            "# 🎙 AUDIO TRANSLATOR 🎙\n"
            "\n"
            "**How to Use:**\n"
            "1. Upload or record your audio clip.\n"
            "2. Select source & target languages.\n"
            "3. Choose a voice persona.\n"
            "4. (Optional) Toggle fine-tuned TTS (for better speech).\n"
            '5. Click "Run" for translated text & speech.'
        )
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="Input Audio", type="filepath")
                src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
                tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
                voice = gr.Dropdown(VOICES, label="Voice", value=VOICES[0])
                finetune = gr.Checkbox(label="Use Finetuned TTS", value=False)
                run_btn = gr.Button("Run")
            with gr.Column():
                text_out = gr.Textbox(label="Translated Text")
                audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
        run_btn.click(
            fn=pipeline,
            inputs=[audio_in, src, tgt, voice, finetune],
            outputs=[text_out, audio_out],
        )
    return demo
if __name__ == "__main__":
    ui = build_ui()
    ui.launch(share=True)