{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test MagpieTTS `do_tts` Method\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!export PYTHONPATH=$PYTHONPATH:/workspace/NeMo"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2025-12-09 06:46:22 nemo_logging:364] /usr/local/lib/python3.12/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
" \n",
"[WARNING | nv_one_logger.api.config]: OneLogger: Setting error_handling_strategy to DISABLE_QUIETLY_AND_REPORT_METRIC_ERROR for rank (rank=0) with OneLogger disabled. To override: explicitly set error_handling_strategy parameter.\n",
"[WARNING | nv_one_logger.training_telemetry.api.training_telemetry_provider]: No exporters were provided. This means that no telemetry data will be collected.\n"
]
}
],
"source": [
"import torch\n",
"# import torchaudio\n",
"import soundfile as sf\n",
"import librosa\n",
"\n",
"from IPython.display import Audio\n",
"from nemo.collections.tts.models import MagpieTTSModel\n",
"from examples.tts.magpietts.utils import ModelLoadConfig, load_magpie_model\n",
"from nemo.collections.tts.parts.utils.tts_dataset_utils import load_audio\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Configuration - UPDATE THESE PATHS\n",
"CHECKPOINT_PATH = \"/checkpoints/results/CE-Removed_GRPO_Magpie_TTS_ML_V1_val_cer_gt_0_1014_step_800.ckpt\"\n",
"# CHECKPOINT_PATH = \"/checkpoints/GRPO_Magpie_TTS_ML_V1_val_cer_gt_0_1014_step_800.ckpt\"\n",
"# CONFIG_PATH = \"/checkpoints/hparams.yaml\"\n",
"CONFIG_PATH = \"/checkpoints/results/CE-Removed_GRPO_Magpie_TTS_ML_V1_val_cer_gt_0_1014_step_800_config.yaml\" # Optional\n",
"CHECKPOINT_PATH = \"/checkpoints/results/ML_MagpieTTS/CE-Removed_GRPO_Magpie_TTS_ML_V1.nemo\"\n",
"CODEC_MODEL_PATH = \"nvidia/nemo-nano-codec-22khz-1.89kbps-21.5fps\" # \"/nemo_codec_checkpoints/21fps_causal_codecmodel.nemo\"\n",
"CONTEXT_AUDIO_PATH = \"/LibriTTS_subset/39174/5463_39174_000005_000001.wav\" # \"/Emma_Ref/ref0.wav\"\n"
]
},
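  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`CONTEXT_AUDIO_PATH` is not consumed by the baked-context tests below. As a quick sanity check, the sketch here loads and plays the reference clip with the already-imported `librosa` (it assumes the WAV actually exists at that path):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check sketch: audition the reference clip at CONTEXT_AUDIO_PATH.\n",
    "# Assumes the file exists; sr=None keeps the native sample rate.\n",
    "ref_audio, ref_sr = librosa.load(CONTEXT_AUDIO_PATH, sr=None)\n",
    "display(Audio(ref_audio, rate=ref_sr))"
   ]
  },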
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# model_config = ModelLoadConfig(\n",
"# hparams_file=CONFIG_PATH,\n",
"# checkpoint_file=CHECKPOINT_PATH,\n",
"# codecmodel_path=CODEC_MODEL_PATH,\n",
"# legacy_codebooks=False,\n",
"# legacy_text_conditioning=False,\n",
"# hparams_from_wandb=None,\n",
"# )\n",
"\n",
"\n",
"model_config = ModelLoadConfig(\n",
" nemo_file=CHECKPOINT_PATH,\n",
" codecmodel_path=CODEC_MODEL_PATH,\n",
" legacy_codebooks=False,\n",
" legacy_text_conditioning=False,\n",
" hparams_from_wandb=None,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2025-12-09 06:47:01 modelPT:188] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
" Train config : \n",
" dataset:\n",
" dataset_type: tarred_vocoder\n",
" dataset_args:\n",
" dataset_meta:\n",
" mls_english:\n",
" manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/tarred_audio/train_manifest.json\n",
" tar_filepath: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/tarred_audio/audio_{0..1999}.tar\n",
" cv:\n",
" manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/tarred_audio/train_manifest.json\n",
" tar_filepath: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/tarred_audio/audio_{0..279}.tar\n",
" sample_rate: 22050\n",
" n_samples: 24576\n",
" min_duration: 0.4\n",
" max_duration: null\n",
" shard_strategy: replicate\n",
" sample_type: weighted_random\n",
" sample_args:\n",
" batch_size: 32\n",
" steps_per_epoch: 1000\n",
" dataset_weights:\n",
" - 0.75\n",
" - 0.25\n",
" shuffle_n: 10000\n",
" dataloader_params:\n",
" batch_size: 32\n",
" drop_last: true\n",
" num_workers: 4\n",
" \n",
"[NeMo W 2025-12-09 06:47:01 modelPT:195] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
" Validation config : \n",
" dataset:\n",
" dataset_type: vocoder\n",
" dataset_args:\n",
" sample_rate: 22050\n",
" n_samples: null\n",
" min_duration: null\n",
" max_duration: null\n",
" trunc_duration: 10.0\n",
" dataset_meta:\n",
" mls_english:\n",
" manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/val_manifest_unseen.json\n",
" audio_dir: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/mls_english/filtered_22khz/audio_22khz\n",
" cv:\n",
" manifest_path: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/val_manifest.json\n",
" audio_dir: /lustre/fsw/portfolios/llmservice/projects/llmservice_nemo_speechlm/data/TTS/commonvoice13/22khz/audio_22khz\n",
" dataloader_params:\n",
" batch_size: 4\n",
" num_workers: 2\n",
" \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2025-12-09 06:47:01 audio_codec:99] Vector quantizer does not support commit loss.\n",
"Speaker encoder loaded and frozen !!\n",
"[NeMo I 2025-12-09 06:47:08 save_restore_connector:284] Model AudioCodecModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--nemo-nano-codec-22khz-1.89kbps-21.5fps/snapshots/3c482a402a3c4cf33690a2c0f0a7d41afea6bd6a/nemo-nano-codec-22khz-1.89kbps-21.5fps.nemo.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NeMo W 2025-12-09 06:47:10 i18n_ipa:122] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
"[NeMo W 2025-12-09 06:47:10 i18n_ipa:122] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
"[NeMo W 2025-12-09 06:47:12 i18n_ipa:122] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n",
"[NeMo W 2025-12-09 06:47:12 zh_cn_pinyin:100] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[NeMo I 2025-12-09 06:47:15 magpietts:299] Local transformer type: autoregressive\n",
"[NeMo I 2025-12-09 06:47:16 magpietts:499] Loaded baked context embedding with shape torch.Size([130, 768]), length 130\n",
"[NeMo I 2025-12-09 06:47:16 save_restore_connector:284] Model MagpieTTSModel was successfully restored from /checkpoints/results/ML_MagpieTTS/CE-Removed_GRPO_Magpie_TTS_ML_V1.nemo.\n",
"Model type: decoder_ce\n",
"Has baked context: True\n",
"Supports text conditioning: True\n"
]
}
],
"source": [
"# Load model\n",
"model, checkpoint_name = load_magpie_model(model_config)\n",
"model.eval().cuda()\n",
"\n",
"print(f\"Model type: {model.model_type}\")\n",
"print(f\"Has baked context: {model.has_baked_context_embedding}\")\n",
"print(f\"Supports text conditioning: {model.use_text_conditioning_encoder}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following tests should only work if there is a baked context embedding in the model, if not then it should throw error."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"transcript = {\n",
" \"en\": \"Hello, this is a test of the text to speech system.\",\n",
" \"es\": \"Pero ni el ceremonioso usted ni las razones de Luis convenc\\u00edan a la se\\u00f1ora.\",\n",
" \"fr\": \"J'ai fait ce que vous dites, r\\u00e9pondit le marchand; je ne puis le nier. - Cela \\u00e9tant, reprit le g\\u00e9nie\",\n",
" \"vi\": \"Tôi vừa mở cửa thì bất ngờ reo lên vì thấy một món quà. Chiếc laptop mới nằm gọn trên bàn với dòng chữ happy birthday. Tôi mỉm cười hạnh phúc và bật máy lên ngay.\",\n",
" \"zh\": \" 可使用英特尔X E O N E 五 杠二六八零 T v 二或具有更高性能的服务器。\" \n",
"}\n",
"language = \"zh\""
]
},
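  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal guard for the expectation above (a sketch only; the exact exception type raised when no baked context embedding is present is not pinned down here):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: check for a baked context embedding before running Tests 1 and 2,\n",
    "# and confirm that do_tts fails without one. Assumes do_tts raises an\n",
    "# exception (type unconfirmed) when the embedding is missing.\n",
    "if model.has_baked_context_embedding:\n",
    "    print(\"Baked context embedding present; Tests 1 and 2 can run.\")\n",
    "else:\n",
    "    try:\n",
    "        model.do_tts(transcript[language], language=language, apply_TN=True)\n",
    "    except Exception as e:\n",
    "        print(f\"Expected failure without baked context: {e}\")"
   ]
  },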
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test 1: Baked Context (if available)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Building prefix dict from the default dictionary ...\n",
"[DEBUG | jieba ]: Building prefix dict from the default dictionary ...\n",
"Loading model from cache /tmp/jieba.cache\n",
"[DEBUG | jieba ]: Loading model from cache /tmp/jieba.cache\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Available tokenizers: ['english_phoneme', 'spanish_phoneme', 'german_phoneme', 'mandarin_phoneme', 'french_chartokenizer', 'hindi_phoneme', 'italian_phoneme', 'vietnamese_phoneme', 'text_ce_tokenizer']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading model cost 0.575 seconds.\n",
"[DEBUG | jieba ]: Loading model cost 0.575 seconds.\n",
"Prefix dict has been built successfully.\n",
"[DEBUG | jieba ]: Prefix dict has been built successfully.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Decoding timestep 0\n",
"Decoding timestep 20\n",
"Decoding timestep 40\n",
"Decoding timestep 60\n",
"Decoding timestep 80\n",
"Decoding timestep 100\n",
"Decoding timestep 120\n",
"Decoding timestep 140\n",
"Decoding timestep 160\n",
"End detected for item 0 at decoder timestep: 169\n",
"All ends reached\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"audio, audio_len = model.do_tts(transcript[language], language=language, apply_TN=True)\n",
"audio_np = audio[0, :audio_len[0]].cpu().numpy()\n",
"display(Audio(audio_np, rate=model.sample_rate))"
]
},
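  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, the generated clip can be written to disk with the already-imported `soundfile` (a sketch; the output filename is arbitrary):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: persist the Test 1 output as a WAV at the model's sample rate.\n",
    "# The filename is arbitrary, chosen here for illustration.\n",
    "sf.write(f\"test1_baked_context_{language}.wav\", audio_np, model.sample_rate)"
   ]
  },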
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test 2: Without Text Normalization"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Available tokenizers: ['english_phoneme', 'spanish_phoneme', 'german_phoneme', 'mandarin_phoneme', 'french_chartokenizer', 'hindi_phoneme', 'italian_phoneme', 'vietnamese_phoneme', 'text_ce_tokenizer']\n",
"Decoding timestep 0\n",
"Decoding timestep 20\n",
"Decoding timestep 40\n",
"Decoding timestep 60\n",
"Decoding timestep 80\n",
"Decoding timestep 100\n",
"Decoding timestep 120\n",
"Decoding timestep 140\n",
"Decoding timestep 160\n",
"Decoding timestep 180\n",
"End detected for item 0 at decoder timestep: 198\n",
"All ends reached\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Skip text normalization (input already normalized)\n",
"audio, audio_len = model.do_tts(\n",
" transcript[language], language=language, apply_TN=False,\n",
")\n",
"audio_np = audio[0, :audio_len[0]].cpu().numpy()\n",
"display(Audio(audio_np, rate=model.sample_rate))\n"
]
},
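  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Test 3: All Languages\n",
    "\n",
    "The same call can be looped over every entry in `transcript` (a sketch; it assumes each language code is covered by the tokenizers listed in the output above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: synthesize each transcript in turn and play the result.\n",
    "for lang, text in transcript.items():\n",
    "    print(f\"Language: {lang}\")\n",
    "    audio, audio_len = model.do_tts(text, language=lang, apply_TN=True)\n",
    "    audio_np = audio[0, :audio_len[0]].cpu().numpy()\n",
    "    display(Audio(audio_np, rate=model.sample_rate))"
   ]
  },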
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# model.state_dict()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# model.save_to(\"/checkpoints/results/CE-Removed_GRPO_Magpie_TTS_ML_V1.nemo\") # restore_from"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}