# papercast/synthesis/supertonic_tts.py
# Author: batuhanozkose
# feat: Add CPU-based TTS support (Supertonic-66M)
# Commit: 93dccae
"""
Supertonic TTS Integration for PaperCast
CPU-based Text-to-Speech using Supertone/supertonic model.
Provides an interface compatible with the main TTSEngine.
"""
import os
import io
import wave
import numpy as np
from typing import Iterator, Optional
from huggingface_hub import snapshot_download
# Voice mapping for Supertonic: UI display name -> Supertonic voice style ID
# (the ID names a JSON file under <assets_dir>/voice_styles/).
SUPERTONIC_VOICES: dict[str, str] = {
"M1 (Male 1)": "M1",
"M2 (Male 2)": "M2",
"F1 (Female 1)": "F1",
"F2 (Female 2)": "F2",
}
# Default voices for Host and Guest speakers in the podcast dialogue
DEFAULT_HOST_VOICE: str = "M1" # Male voice for Host
DEFAULT_GUEST_VOICE: str = "F1" # Female voice for Guest
class SupertonicWrapper:
    """Wrapper for Supertonic TTS to integrate with PaperCast.

    Lazily downloads the Supertone/supertonic ONNX assets from HuggingFace on
    first use and exposes a chunked streaming synthesis API plus a helper that
    returns a pydub ``AudioSegment``. Heavy dependencies (onnxruntime, pydub,
    the project's ``synthesis.supertonic_helper``) are imported only when
    actually needed so merely constructing the wrapper stays cheap.
    """

    def __init__(self, assets_dir: Optional[str] = None, use_gpu: bool = False):
        """
        Initialize Supertonic TTS.

        Args:
            assets_dir: Path to assets directory
                (default: <project_root>/supertonic_assets)
            use_gpu: Whether to use GPU (default: False, CPU-only)
        """
        self.use_gpu = use_gpu
        # Default to a "supertonic_assets" directory at the project root
        # (two levels above this file: papercast/synthesis/ -> papercast/).
        if assets_dir is None:
            project_root = os.path.dirname(os.path.dirname(__file__))
            self.assets_dir = os.path.join(project_root, "supertonic_assets")
        else:
            self.assets_dir = assets_dir
        self.tts = None  # loaded lazily by initialize()
        self._initialized = False
        print(f"Supertonic assets directory: {self.assets_dir}")

    def _ensure_models_downloaded(self):
        """Download models from HuggingFace if not present.

        Only the existence of ``assets_dir`` is checked; a partial download
        left behind by an interrupted run is not detected here.
        """
        if not os.path.exists(self.assets_dir):
            print(f"Downloading Supertonic models to {self.assets_dir}...")
            print("This is a one-time download (~400MB)...")
            snapshot_download(repo_id="Supertone/supertonic", local_dir=self.assets_dir)
            print("Download complete.")

    def initialize(self):
        """Initialize the TTS model (idempotent).

        Downloads the model assets if needed and loads the ONNX runtime
        session. Subsequent calls return immediately.

        Raises:
            ImportError: If the Supertonic helper module or its dependencies
                (onnxruntime, soundfile) are unavailable.
        """
        if self._initialized:
            return
        print("Initializing Supertonic TTS (CPU mode)...")
        self._ensure_models_downloaded()
        # Lazy import so the heavy runtime is only loaded when TTS is used.
        try:
            from synthesis.supertonic_helper import load_text_to_speech
            onnx_dir = os.path.join(self.assets_dir, "onnx")
            self.tts = load_text_to_speech(onnx_dir, use_gpu=self.use_gpu)
            self._initialized = True
            print("✓ Supertonic TTS ready (CPU mode)")
        except ImportError as e:
            # Chain the original error so the missing dependency is visible.
            raise ImportError(
                f"Failed to import Supertonic helper functions: {e}\n"
                "Make sure required dependencies are installed (onnxruntime, soundfile)."
            ) from e

    def get_available_voices(self) -> list[str]:
        """Get list of available voice display names (e.g. "M1 (Male 1)")."""
        return list(SUPERTONIC_VOICES.keys())

    def get_voice_id(self, voice_name: str) -> str:
        """Convert voice display name to voice ID.

        Unknown names fall back to DEFAULT_HOST_VOICE rather than raising.
        """
        return SUPERTONIC_VOICES.get(voice_name, DEFAULT_HOST_VOICE)

    def get_voice_path(self, voice_id: str) -> str:
        """Get the full path to a voice style file (<assets>/voice_styles/<id>.json)."""
        return os.path.join(self.assets_dir, "voice_styles", f"{voice_id}.json")

    @property
    def sample_rate(self) -> int:
        """Sample rate of the TTS model; triggers lazy initialization if needed."""
        if not self._initialized:
            self.initialize()
        return self.tts.sample_rate

    def synthesize_chunk(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
        silence_duration: float = 0.3,
        max_len: int = 300
    ) -> Iterator[np.ndarray]:
        """
        Synthesize speech from text (streaming).

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50, lower=faster, higher=better quality)
            silence_duration: Duration of silence between chunks (seconds)
            max_len: Maximum length of each chunk

        Yields:
            Audio chunks as numpy arrays (float32, [-1, 1])

        Raises:
            ValueError: If no style file exists for ``voice_id``.
        """
        if not self._initialized:
            self.initialize()
        # Lazy import: helper module pulls in onnxruntime/soundfile.
        from synthesis.supertonic_helper import load_voice_style
        voice_path = self.get_voice_path(voice_id)
        if not os.path.exists(voice_path):
            raise ValueError(f"Voice style '{voice_id}' not found at {voice_path}")
        # load_voice_style takes a list of style files (we use a single style).
        style = load_voice_style([voice_path])
        yield from self.tts.stream(text, style, steps, speed, silence_duration, max_len)

    @staticmethod
    def audio_to_int16(audio_np: np.ndarray) -> np.ndarray:
        """Convert float32 audio in [-1, 1] to int16 PCM (clipping out-of-range)."""
        audio_clipped = np.clip(audio_np, -1.0, 1.0)
        return (audio_clipped * 32767.0).astype(np.int16)

    @staticmethod
    def audio_to_wav_bytes(audio_int16: np.ndarray, sample_rate: int) -> bytes:
        """Convert int16 mono audio to an in-memory WAV file (bytes)."""
        buffer = io.BytesIO()
        with wave.open(buffer, "wb") as wf:
            wf.setnchannels(1)   # mono
            wf.setsampwidth(2)   # int16 -> 2 bytes per sample
            wf.setframerate(sample_rate)
            wf.writeframes(audio_int16.tobytes())
        return buffer.getvalue()

    def synthesize_to_audio_segment(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        steps: int = 5,
    ):
        """
        Synthesize speech and return as AudioSegment.

        Args:
            text: Input text to synthesize
            voice_id: Voice ID (M1, M2, F1, F2)
            speed: Speech speed multiplier (0.5-2.0)
            steps: Number of diffusion steps (1-50)

        Returns:
            pydub.AudioSegment with the full synthesized audio.
        """
        from pydub import AudioSegment
        # Drain the streaming generator into one array.
        chunks = list(self.synthesize_chunk(
            text=text,
            voice_id=voice_id,
            speed=speed,
            steps=steps,
            silence_duration=0.3,
            max_len=300
        ))
        # Empty text/chunks degrade gracefully to a zero-length segment.
        full_audio = np.concatenate(chunks) if chunks else np.array([], dtype=np.float32)
        # float32 [-1, 1] -> int16 PCM -> WAV container bytes.
        audio_int16 = self.audio_to_int16(full_audio)
        wav_bytes = self.audio_to_wav_bytes(audio_int16, self.sample_rate)
        # io is already imported at module level; no need to re-import BytesIO.
        return AudioSegment.from_wav(io.BytesIO(wav_bytes))
# Process-wide singleton so repeated callers share one loaded model.
_supertonic_instance: Optional[SupertonicWrapper] = None


def get_supertonic_engine() -> SupertonicWrapper:
    """Return the shared Supertonic TTS engine, creating it on first call.

    Returns:
        The module-level SupertonicWrapper singleton.
    """
    global _supertonic_instance
    engine = _supertonic_instance
    if engine is None:
        engine = SupertonicWrapper()
        _supertonic_instance = engine
    return engine