Spaces:
Running
Running
| """ | |
| Supertonic TTS Integration for PaperCast | |
| CPU-based Text-to-Speech using Supertone/supertonic model. | |
| Provides an interface compatible with the main TTSEngine. | |
| """ | |
| import os | |
| import io | |
| import wave | |
| import numpy as np | |
| from typing import Iterator, Optional | |
| from huggingface_hub import snapshot_download | |
# Voice mapping for Supertonic: UI display name -> internal voice ID
# (the ID is also the basename of the style JSON under voice_styles/).
SUPERTONIC_VOICES: dict[str, str] = {
    "M1 (Male 1)": "M1",
    "M2 (Male 2)": "M2",
    "F1 (Female 1)": "F1",
    "F2 (Female 2)": "F2",
}
# Default voices for Host and Guest
DEFAULT_HOST_VOICE: str = "M1"  # Male voice for Host
DEFAULT_GUEST_VOICE: str = "F1"  # Female voice for Guest
class SupertonicWrapper:
    """Wrapper for Supertonic TTS to integrate with PaperCast.

    Provides lazy model download from HuggingFace, lazy ONNX initialization,
    streaming synthesis, and helpers to convert the raw float audio into
    int16 PCM / WAV bytes / pydub AudioSegment form.
    """

    def __init__(self, assets_dir: Optional[str] = None, use_gpu: bool = False):
        """
        Initialize Supertonic TTS.

        Args:
            assets_dir: Path to assets directory. Defaults to
                ``<project_root>/supertonic_assets`` (one level above this file).
            use_gpu: Whether to use GPU (default: False, CPU-only).
        """
        self.use_gpu = use_gpu
        # Keep assets inside the papercast project rather than the CWD.
        if assets_dir is None:
            project_root = os.path.dirname(os.path.dirname(__file__))
            self.assets_dir = os.path.join(project_root, "supertonic_assets")
        else:
            self.assets_dir = assets_dir
        self.tts = None  # populated by initialize()
        self._initialized = False
        print(f"Supertonic assets directory: {self.assets_dir}")

    def _ensure_models_downloaded(self):
        """Download models from HuggingFace if not present.

        NOTE(review): this only checks that the directory exists — a
        previously interrupted download will not be detected or resumed.
        """
        if not os.path.exists(self.assets_dir):
            print(f"Downloading Supertonic models to {self.assets_dir}...")
            print("This is a one-time download (~400MB)...")
            snapshot_download(repo_id="Supertone/supertonic", local_dir=self.assets_dir)
            print("Download complete.")

    def initialize(self):
        """Load the ONNX TTS model (idempotent; triggers asset download on first run).

        Raises:
            ImportError: if the Supertonic helper module or its runtime
                dependencies (onnxruntime, soundfile) are unavailable.
        """
        if self._initialized:
            return
        print("Initializing Supertonic TTS (CPU mode)...")
        self._ensure_models_downloaded()
        # Lazy import so heavy ONNX dependencies load only when actually used.
        try:
            from synthesis.supertonic_helper import load_text_to_speech
            onnx_dir = os.path.join(self.assets_dir, "onnx")
            self.tts = load_text_to_speech(onnx_dir, use_gpu=self.use_gpu)
            self._initialized = True
            print("✓ Supertonic TTS ready (CPU mode)")
        except ImportError as e:
            # Chain the original error so the root cause stays visible.
            raise ImportError(
                f"Failed to import Supertonic helper functions: {e}\n"
                "Make sure required dependencies are installed (onnxruntime, soundfile)."
            ) from e

    def get_available_voices(self) -> list[str]:
        """Get list of available voice display names."""
        return list(SUPERTONIC_VOICES.keys())

    def get_voice_id(self, voice_name: str) -> str:
        """Convert voice display name to voice ID (unknown names fall back to the host default)."""
        return SUPERTONIC_VOICES.get(voice_name, DEFAULT_HOST_VOICE)

    def get_voice_path(self, voice_id: str) -> str:
        """Get the full path to a voice style file (``<assets>/voice_styles/<id>.json``)."""
        return os.path.join(self.assets_dir, "voice_styles", f"{voice_id}.json")

    def sample_rate(self) -> int:
        """Get the sample rate of the TTS model, initializing the model if needed."""
        if not self._initialized:
            self.initialize()
        return self.tts.sample_rate

    def synthesize_chunk(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
        silence_duration: float = 0.3,
        max_len: int = 300
    ) -> Iterator[np.ndarray]:
        """
        Synthesize speech from text (streaming).

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50, lower=faster, higher=better quality)
            silence_duration: Duration of silence between chunks
            max_len: Maximum length of each chunk

        Yields:
            Audio chunks as numpy arrays (float32, [-1, 1])

        Raises:
            ValueError: if no style file exists for ``voice_id``.
        """
        if not self._initialized:
            self.initialize()
        # Lazy import to keep module load light.
        from synthesis.supertonic_helper import load_voice_style
        voice_path = self.get_voice_path(voice_id)
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice style '{voice_id}' not found at {voice_path}")
        style = load_voice_style([voice_path])
        yield from self.tts.stream(text, style, steps, speed, silence_duration, max_len)

    # BUGFIX: the two converters below took the audio array as their first
    # parameter but were plain methods, so `self.audio_to_int16(x)` passed
    # `self` as the array and raised TypeError. They are pure functions of
    # their inputs, so they are now @staticmethod (instance calls still work).
    @staticmethod
    def audio_to_int16(audio_np: np.ndarray) -> np.ndarray:
        """Convert float32 audio in [-1, 1] to int16 PCM (values clipped first)."""
        audio_clipped = np.clip(audio_np, -1.0, 1.0)
        return (audio_clipped * 32767.0).astype(np.int16)

    @staticmethod
    def audio_to_wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
        """Wrap mono int16 PCM samples in a WAV container and return the bytes."""
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 2 bytes = 16-bit samples
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        return buffer.getvalue()

    def synthesize_to_audio_segment(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
    ):
        """
        Synthesize speech and return as AudioSegment.

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50)

        Returns:
            AudioSegment object
        """
        from pydub import AudioSegment
        from io import BytesIO
        # Collect all streamed chunks before concatenating.
        chunks = []
        for audio_chunk in self.synthesize_chunk(
            text=text,
            voice_id=voice_id,
            speed=speed,
            steps=steps,
            silence_duration=0.3,
            max_len=300
        ):
            chunks.append(audio_chunk)
        # Empty text yields no chunks; fall back to an empty float buffer.
        full_audio = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32)
        # Convert to int16 and then to WAV bytes.
        audio_int16 = self.audio_to_int16(full_audio)
        # BUGFIX: was `self.sample_rate` (the bound method object) — must be
        # called to get the integer rate, otherwise wave.setframerate raises.
        wav_bytes = self.audio_to_wav_bytes(audio_int16, self.sample_rate())
        # Convert to AudioSegment
        return AudioSegment.from_wav(BytesIO(wav_bytes))
# Process-wide singleton so repeated callers share one loaded model.
_supertonic_instance: Optional[SupertonicWrapper] = None


def get_supertonic_engine() -> SupertonicWrapper:
    """Return the shared Supertonic TTS engine, creating it on first call.

    Returns:
        The process-wide SupertonicWrapper singleton.
    """
    global _supertonic_instance
    if _supertonic_instance is not None:
        return _supertonic_instance
    _supertonic_instance = SupertonicWrapper()
    return _supertonic_instance