+
+## Acknowledgements
+
+We borrowed a lot of code from the following projects:
+
+1. [FunASR](https://github.com/modelscope/FunASR)
+2. [FunCodec](https://github.com/modelscope/FunCodec)
+3. [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS)
+4. [AcademiCodec](https://github.com/yangdongchao/AcademiCodec)
+5. [WeNet](https://github.com/wenet-e2e/wenet)
+
+## Disclaimer
+The content provided above is for academic purposes only and is intended to demonstrate technical capabilities. Some examples are sourced from the internet. If any content infringes on your rights, please contact us to request its removal.
diff --git a/cosyvoice/__init__.py b/cosyvoice/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/cosyvoice/bin/average_model.py b/cosyvoice/bin/average_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..d095dcd99f915f0ffdbc3a0c14fcb6f8db900be0
--- /dev/null
+++ b/cosyvoice/bin/average_model.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2020 Mobvoi Inc (Di Wu)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+import glob
+
+import yaml
+import torch
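+
+# a usage sketch (paths are illustrative, not fixed by this script): average the
+# checkpoints with the lowest validation loss recorded in the per-epoch yaml files:
+#   python cosyvoice/bin/average_model.py --val_best --num 5 \
+#       --src_path exp/cosyvoice/llm --dst_model exp/cosyvoice/llm/llm.avg5.pt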
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='average model')
+    parser.add_argument('--dst_model', required=True, help='path to save the averaged model')
+    parser.add_argument('--src_path',
+                        required=True,
+                        help='src model path for average')
+    parser.add_argument('--val_best',
+                        action="store_true",
+                        help='select checkpoints by best validation loss')
+    parser.add_argument('--num',
+                        default=5,
+                        type=int,
+                        help='number of checkpoints to average')
+
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    # only validation-best selection is implemented; without --val_best,
+    # path_list below would be undefined, so fail fast
+    assert args.val_best, 'please run with --val_best'
+    val_scores = []
+    if args.val_best:
+        yamls = glob.glob('{}/*.yaml'.format(args.src_path))
+        yamls = [
+            f for f in yamls
+            if not (os.path.basename(f).startswith('train')
+                    or os.path.basename(f).startswith('init'))
+        ]
+        for y in yamls:
+            with open(y, 'r') as f:
+                dic_yaml = yaml.load(f, Loader=yaml.BaseLoader)
+                loss = float(dic_yaml['loss_dict']['loss'])
+                epoch = int(dic_yaml['epoch'])
+                step = int(dic_yaml['step'])
+                tag = dic_yaml['tag']
+                val_scores += [[epoch, step, loss, tag]]
+        sorted_val_scores = sorted(val_scores,
+                                   key=lambda x: x[2],
+                                   reverse=False)
+        print("best val (epoch, step, loss, tag) = " +
+              str(sorted_val_scores[:args.num]))
+        path_list = [
+            args.src_path + '/epoch_{}_whole.pt'.format(score[0])
+            for score in sorted_val_scores[:args.num]
+        ]
+        print(path_list)
+    avg = {}
+    num = args.num
+    assert num == len(path_list)
+    for path in path_list:
+        print('Processing {}'.format(path))
+        states = torch.load(path, map_location=torch.device('cpu'))
+        for k in states.keys():
+            if k not in avg.keys():
+                avg[k] = states[k].clone()
+            else:
+                avg[k] += states[k]
+    # average
+    for k in avg.keys():
+        if avg[k] is not None:
+            # pytorch 1.6 uses true_divide instead of /=
+            avg[k] = torch.true_divide(avg[k], num)
+    print('Saving to {}'.format(args.dst_model))
+    torch.save(avg, args.dst_model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cosyvoice/bin/export_jit.py b/cosyvoice/bin/export_jit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddd486e97117f2086075ba90726e09edf195b7f3
--- /dev/null
+++ b/cosyvoice/bin/export_jit.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import torch
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
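+
+# usage sketch: export TorchScript versions of the llm/flow submodules next to
+# the checkpoint (the model_dir below matches this script's argparse default):
+#   python cosyvoice/bin/export_jit.py --model_dir pretrained_models/CosyVoice-300M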
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def get_optimized_script(model, preserved_attrs=[]):
+    # script, freeze, then apply inference-only graph optimizations
+    script = torch.jit.script(model)
+    if preserved_attrs != []:
+        script = torch.jit.freeze(script, preserved_attrs=preserved_attrs)
+    else:
+        script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    return script
+
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    # static fusion and disabled profiling keep the scripted graphs stable
+    torch._C._jit_set_fusion_strategy([('STATIC', 1)])
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            model = CosyVoice2(args.model_dir)
+        except Exception:
+            raise TypeError('no valid model_type!')
+
+    if not isinstance(model, CosyVoice2):
+        # 1. export llm text_encoder
+        llm_text_encoder = model.model.llm.text_encoder
+        script = get_optimized_script(llm_text_encoder)
+        script.save('{}/llm.text_encoder.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_text_encoder.half())
+        script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+
+        # 2. export llm llm
+        llm_llm = model.model.llm.llm
+        script = get_optimized_script(llm_llm, ['forward_chunk'])
+        script.save('{}/llm.llm.fp32.zip'.format(args.model_dir))
+        script = get_optimized_script(llm_llm.half(), ['forward_chunk'])
+        script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+
+    # 3. export flow encoder
+    flow_encoder = model.model.flow.encoder
+    script = get_optimized_script(flow_encoder)
+    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+    script = get_optimized_script(flow_encoder.half())
+    script.save('{}/flow.encoder.fp16.zip'.format(args.model_dir))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cosyvoice/bin/export_onnx.py b/cosyvoice/bin/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ddd358949d56ebbcded651114e84789e2b908ef
--- /dev/null
+++ b/cosyvoice/bin/export_onnx.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, hexisyztem@icloud.com)
+# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import onnxruntime
+import random
+import torch
+from tqdm import tqdm
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
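+
+# usage sketch: export the flow decoder estimator to onnx, then sanity-check the
+# onnxruntime outputs against the pytorch forward pass on random inputs:
+#   python cosyvoice/bin/export_onnx.py --model_dir pretrained_models/CosyVoice-300M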
+
+
+def get_dummy_input(batch_size, seq_len, out_channels, device):
+    x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
+    mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    t = torch.rand((batch_size), dtype=torch.float32, device=device)
+    spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
+    cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    return x, mask, mu, t, spks, cond
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/CosyVoice-300M',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+
+
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+
+    try:
+        model = CosyVoice(args.model_dir)
+    except Exception:
+        try:
+            model = CosyVoice2(args.model_dir)
+        except Exception:
+            raise TypeError('no valid model_type!')
+
+    # 1. export flow decoder estimator
+    estimator = model.model.flow.decoder.estimator
+
+    device = model.model.device
+    batch_size, seq_len = 2, 256
+    out_channels = model.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {2: 'seq_len'},
+            'mask': {2: 'seq_len'},
+            'mu': {2: 'seq_len'},
+            'cond': {2: 'seq_len'},
+            'estimator_out': {2: 'seq_len'},
+        }
+    )
+
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cosyvoice/bin/export_trt.sh b/cosyvoice/bin/export_trt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..808d02a6e927fe00da51c56f2f4c9f8ccc7b4ba5
--- /dev/null
+++ b/cosyvoice/bin/export_trt.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Copyright 2024 Alibaba Inc. All Rights Reserved.
+# download tensorrt from https://developer.nvidia.com/tensorrt/download/10x, check your system and cuda version for compatibility
+# for example for linux + cuda12.4, you can download https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-10.0.1.6.Linux.x86_64-gnu.cuda-12.4.tar.gz
+TRT_DIR=<YOUR_TENSORRT_DIR>
+MODEL_DIR=<YOUR_COSYVOICE_MODEL_DIR>
+
+export LD_LIBRARY_PATH=$TRT_DIR/lib:$LD_LIBRARY_PATH
+# a minimal sketch: build a fp16 engine from the fp32 onnx exported by export_onnx.py;
+# since the onnx uses dynamic seq_len axes, you may also need --minShapes/--optShapes/--maxShapes for your model
+$TRT_DIR/bin/trtexec --onnx=$MODEL_DIR/flow.decoder.estimator.fp32.onnx --saveEngine=$MODEL_DIR/flow.decoder.estimator.fp16.plan --fp16