import io
import os
import wave

import numpy as np

try:
    import requests
except ImportError:
    # TTS is an optional feature of this module: without `requests` the class
    # must still import so the rest of the app keeps working.
    requests = None


class AudioHandler:
    """Bridge between the app and external speech services.

    Text-to-speech is delegated to an HTTP backend whose URL is read from the
    ``KANITTS_URL`` environment variable. Speech-to-text is a placeholder.
    Every method is best-effort: failures are printed and reported to the
    caller as ``None`` instead of raising.
    """

    def __init__(self):
        # The real sample rate comes from the audio returned by the TTS
        # backend; this default is only a safe playback value.
        self.sample_rate = 22050

        # Optional: URL of an external TTS Space or vLLM-style OpenAI server.
        # Set KANITTS_URL in the Space settings to point at e.g.
        #   https://jblast94-KaniTTS.hf.space (or its /proxy or API endpoint).
        self.kanitts_url = os.getenv("KANITTS_URL")

        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

    @staticmethod
    def _decode_wav(audio_bytes):
        """Decode PCM WAV bytes into ``(sample_rate, mono float32 samples)``.

        Handles 8-bit unsigned and 16/24/32-bit signed little-endian PCM.
        Samples are scaled to [-1.0, 1.0) and multi-channel audio is averaged
        down to one channel.

        Raises:
            wave.Error / EOFError: the bytes are not a WAV file the ``wave``
                module can parse.
            ValueError: the header reports an unsupported sample width or
                channel count.
        """
        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
            sample_rate = wf.getframerate()
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            raw = wf.readframes(wf.getnframes())

        if n_channels < 1:
            raise ValueError(f"unsupported channel count: {n_channels}")
        if sample_width not in (1, 2, 3, 4):
            raise ValueError(f"unsupported sample width: {sample_width} bytes")

        # A truncated response can end in the middle of a frame; drop the
        # partial frame so the conversions and reshapes below cannot fail.
        frame_size = sample_width * n_channels
        raw = raw[: len(raw) - len(raw) % frame_size]

        if sample_width == 1:
            # 8-bit WAV is unsigned with silence at 128.
            unsigned = np.frombuffer(raw, dtype=np.uint8).astype(np.float32)
            samples = (unsigned - 128.0) / 128.0
        elif sample_width == 2:
            # WAV data is little-endian regardless of the host byte order.
            samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
        elif sample_width == 3:
            # NumPy has no 24-bit integer: place each sample in the upper
            # three bytes of an int32 (value << 8), which preserves the sign.
            widened = np.zeros((len(raw) // 3, 4), dtype=np.uint8)
            widened[:, 1:] = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3)
            as_int32 = widened.view("<i4").reshape(-1)
            samples = as_int32.astype(np.float32) / 2147483648.0
        else:
            samples = np.frombuffer(raw, dtype="<i4").astype(np.float32) / 2147483648.0

        if n_channels > 1:
            # Frames are interleaved; average the channels in float so no
            # precision is lost to integer truncation.
            samples = samples.reshape(-1, n_channels).mean(axis=1)

        return sample_rate, samples

    def text_to_speech(self, text: str) -> "tuple[int, np.ndarray] | None":
        """Convert text to speech using the external TTS backend.

        The backend at ``KANITTS_URL`` receives a POST with the JSON body
        ``{"text": "..."}`` and must answer with PCM WAV bytes as the response
        body. Other container formats (e.g. OGG) are not decoded here.

        Recommended setups:
          - A KaniTTS Space exposing an HTTP endpoint (e.g. /tts), or
          - a vLLM/OpenAI-compatible TTS server adapted to the same contract.

        Returns:
            A tuple ``(sample_rate, samples)`` suitable for Gradio's Audio
            component, where ``samples`` is a mono float32 array in
            [-1.0, 1.0); or ``None`` if the text is blank, TTS is not
            configured, or the request/decoding fails.
        """
        if not text or not text.strip():
            return None

        if not self.kanitts_url:
            print("KANITTS_URL is not set; skipping TTS.")
            return None

        if requests is None:
            print("The 'requests' package is not installed; skipping TTS.")
            return None

        try:
            resp = requests.post(
                self.kanitts_url,
                json={"text": text},
                timeout=30,
            )
            resp.raise_for_status()
            audio_bytes = resp.content
        except Exception as e:
            # Best-effort boundary: a TTS outage must not break the app.
            print(f"Error during TTS request to {self.kanitts_url}: {e}")
            return None

        if not audio_bytes:
            print("KaniTTS/vLLM TTS returned empty audio.")
            return None

        try:
            return self._decode_wav(audio_bytes)
        except Exception as e:
            # Reported separately from request failures so a backend that
            # answers with a non-WAV payload is easy to diagnose.
            print(f"Could not decode TTS audio from {self.kanitts_url} as WAV: {e}")
            return None

    def speech_to_text(self, audio_filepath):
        """Placeholder speech-to-text hook.

        No STT backend is wired in yet (candidates: OpenAI Whisper, a local
        model, or another Space), so this always returns ``None``. When a
        path is supplied it is printed so the missing configuration is
        visible in the logs.
        """
        if not audio_filepath:
            return None

        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
        return None