# papercast/synthesis/supertonic_tts.py
# Author: batuhanozkose
# feat: Add CPU-based TTS support (Supertonic-66M)
# Commit: 93dccae
"""
Supertonic TTS Integration for PaperCast
CPU-based Text-to-Speech using Supertone/supertonic model.
Provides an interface compatible with the main TTSEngine.
"""
import os
import io
import wave
import numpy as np
from typing import Iterator, Optional
from huggingface_hub import snapshot_download
# Voice mapping for Supertonic: UI display name -> Supertonic voice style ID
# (the ID names a JSON file under <assets_dir>/voice_styles/).
SUPERTONIC_VOICES: dict[str, str] = {
"M1 (Male 1)": "M1",
"M2 (Male 2)": "M2",
"F1 (Female 1)": "F1",
"F2 (Female 2)": "F2",
}
# Default voices for Host and Guest speakers in the podcast dialogue
DEFAULT_HOST_VOICE: str = "M1" # Male voice for Host
DEFAULT_GUEST_VOICE: str = "F1" # Female voice for Guest
class SupertonicWrapper:
    """Wrapper for Supertonic TTS to integrate with PaperCast.

    Lazily downloads the Supertone/supertonic ONNX assets from HuggingFace on
    first use and exposes a chunked streaming synthesis API plus a helper that
    returns a pydub ``AudioSegment``. Heavy dependencies (onnxruntime, pydub,
    the project's ``synthesis.supertonic_helper``) are imported only when
    actually needed so merely constructing the wrapper stays cheap.
    """

    def __init__(self, assets_dir: Optional[str] = None, use_gpu: bool = False):
        """
        Initialize Supertonic TTS.

        Args:
            assets_dir: Path to assets directory
                (default: <project_root>/supertonic_assets)
            use_gpu: Whether to use GPU (default: False, CPU-only)
        """
        self.use_gpu = use_gpu
        # Default to a "supertonic_assets" directory at the project root
        # (two levels above this file: papercast/synthesis/ -> papercast/).
        if assets_dir is None:
            project_root = os.path.dirname(os.path.dirname(__file__))
            self.assets_dir = os.path.join(project_root, "supertonic_assets")
        else:
            self.assets_dir = assets_dir
        self.tts = None  # loaded lazily by initialize()
        self._initialized = False
        print(f"Supertonic assets directory: {self.assets_dir}")

    def _ensure_models_downloaded(self):
        """Download models from HuggingFace if not present.

        Only the existence of ``assets_dir`` is checked; a partial download
        left behind by an interrupted run is not detected here.
        """
        if not os.path.exists(self.assets_dir):
            print(f"Downloading Supertonic models to {self.assets_dir}...")
            print("This is a one-time download (~400MB)...")
            snapshot_download(repo_id="Supertone/supertonic", local_dir=self.assets_dir)
            print("Download complete.")

    def initialize(self):
        """Initialize the TTS model (idempotent).

        Downloads the model assets if needed and loads the ONNX runtime
        session. Subsequent calls return immediately.

        Raises:
            ImportError: If the Supertonic helper module or its dependencies
                (onnxruntime, soundfile) are unavailable.
        """
        if self._initialized:
            return
        print("Initializing Supertonic TTS (CPU mode)...")
        self._ensure_models_downloaded()
        # Lazy import so the heavy runtime is only loaded when TTS is used.
        try:
            from synthesis.supertonic_helper import load_text_to_speech
            onnx_dir = os.path.join(self.assets_dir, "onnx")
            self.tts = load_text_to_speech(onnx_dir, use_gpu=self.use_gpu)
            self._initialized = True
            print("✓ Supertonic TTS ready (CPU mode)")
        except ImportError as e:
            # Chain the original error so the missing dependency is visible.
            raise ImportError(
                f"Failed to import Supertonic helper functions: {e}\n"
                "Make sure required dependencies are installed (onnxruntime, soundfile)."
            ) from e

    def get_available_voices(self) -> list[str]:
        """Get list of available voice display names (e.g. "M1 (Male 1)")."""
        return list(SUPERTONIC_VOICES.keys())

    def get_voice_id(self, voice_name: str) -> str:
        """Convert voice display name to voice ID.

        Unknown names fall back to DEFAULT_HOST_VOICE rather than raising.
        """
        return SUPERTONIC_VOICES.get(voice_name, DEFAULT_HOST_VOICE)

    def get_voice_path(self, voice_id: str) -> str:
        """Get the full path to a voice style file (<assets>/voice_styles/<id>.json)."""
        return os.path.join(self.assets_dir, "voice_styles", f"{voice_id}.json")

    @property
    def sample_rate(self) -> int:
        """Sample rate of the TTS model; triggers lazy initialization if needed."""
        if not self._initialized:
            self.initialize()
        return self.tts.sample_rate

    def synthesize_chunk(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
        silence_duration: float = 0.3,
        max_len: int = 300
    ) -> Iterator[np.ndarray]:
        """
        Synthesize speech from text (streaming).

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50, lower=faster, higher=better quality)
            silence_duration: Duration of silence between chunks (seconds)
            max_len: Maximum length of each chunk

        Yields:
            Audio chunks as numpy arrays (float32, [-1, 1])

        Raises:
            ValueError: If no style file exists for ``voice_id``.
        """
        if not self._initialized:
            self.initialize()
        # Lazy import: helper module pulls in onnxruntime/soundfile.
        from synthesis.supertonic_helper import load_voice_style
        voice_path = self.get_voice_path(voice_id)
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice style '{voice_id}' not found at {voice_path}")
        # load_voice_style takes a list of style files (we use a single style).
        style = load_voice_style([voice_path])
        yield from self.tts.stream(text, style, steps, speed, silence_duration, max_len)

    @staticmethod
    def audio_to_int16(audio_np: np.ndarray) -> np.ndarray:
        """Convert float32 audio in [-1, 1] to int16 PCM (clipping out-of-range)."""
        audio_clipped = np.clip(audio_np, -1.0, 1.0)
        return (audio_clipped * 32767.0).astype(np.int16)

    @staticmethod
    def audio_to_wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
        """Convert int16 mono audio to an in-memory WAV file (bytes)."""
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)   # mono
            wf.setsampwidth(2)   # int16 -> 2 bytes per sample
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        return buffer.getvalue()

    def synthesize_to_audio_segment(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
    ):
        """
        Synthesize speech and return as AudioSegment.

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50)

        Returns:
            pydub.AudioSegment with the full synthesized audio.
        """
        from pydub import AudioSegment
        # Drain the streaming generator into one array.
        chunks = list(self.synthesize_chunk(
            text=text,
            voice_id=voice_id,
            speed=speed,
            steps=steps,
            silence_duration=0.3,
            max_len=300
        ))
        # Empty text/chunks degrade gracefully to a zero-length segment.
        full_audio = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32)
        # float32 [-1, 1] -> int16 PCM -> WAV container bytes.
        audio_int16 = self.audio_to_int16(full_audio)
        wav_bytes = self.audio_to_wav_bytes(audio_int16, self.sample_rate)
        # io is already imported at module level; no need to re-import BytesIO.
        return AudioSegment.from_wav(io.BytesIO(wav_bytes))
# Process-wide singleton so repeated callers share one loaded model.
_supertonic_instance: Optional[SupertonicWrapper] = None


def get_supertonic_engine() -> SupertonicWrapper:
    """Return the shared Supertonic TTS engine, creating it on first call.

    Returns:
        The module-level SupertonicWrapper singleton.
    """
    global _supertonic_instance
    engine = _supertonic_instance
    if engine is None:
        engine = SupertonicWrapper()
        _supertonic_instance = engine
    return engine