Update src/app/conversation_core.py
src/app/conversation_core.py  (+8 -20)  CHANGED
@@ -10,6 +10,9 @@ from dataclasses import dataclass
 from typing import List, Optional, Tuple
 from .config import get_user_dir
 
+import numpy as np
+from transformers import pipeline
+from pydub import AudioSegment
 
 import torch
 from gtts import gTTS
@@ -109,17 +112,6 @@ _LANG_HINTS = {
 # SPEECH RECOGNITION — faster-whisper
 ##########################################
 
-from faster_whisper import WhisperModel
-
-_whisper_model = None
-
-def load_whisper():
-    global _whisper_model
-    if _whisper_model is None:
-        _whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
-    return _whisper_model
-
-
 def transcribe_audio(audio_segment, spoken_lang=None):
     """
     Accepts a pydub AudioSegment (mono, 16k).
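Note: with the faster-whisper loader removed, lazy construction of the replacement model is left to the transformers pipeline introduced further down. For reference only, a minimal sketch of the same lazy-singleton pattern rebuilt on that pipeline; the helper name _get_asr_pipe and the device argument are illustrative and not part of this commit.

    from transformers import pipeline

    _asr_pipe = None  # hypothetical module-level cache, mirroring the removed _whisper_model

    def _get_asr_pipe():
        # Build the Whisper pipeline once and reuse it on later calls (CPU assumed).
        global _asr_pipe
        if _asr_pipe is None:
            _asr_pipe = pipeline(
                task="automatic-speech-recognition",
                model="openai/whisper-small",
                device=-1,  # -1 selects CPU in transformers pipelines
            )
        return _asr_pipe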
@@ -353,7 +345,10 @@ class ConversationManager:
 
     from transformers import pipeline
 
-
+    ###########################################################
+    # AUDIO TRANSCRIPTION — Transformers Whisper
+    ###########################################################
+
     whisper_pipe = pipeline(
         task="automatic-speech-recognition",
         model="openai/whisper-small",
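Note: the pipeline(...) call above is cut off at the hunk boundary, so any remaining keyword arguments are not visible here. As context only, a standalone sketch of how a transformers ASR pipeline of this kind is typically invoked: a bare float32 array is assumed to already be at the model's 16 kHz sampling rate, a dict makes the rate explicit, and a Whisper language hint can be forwarded through generate_kwargs on recent transformers versions. None of the names below come from this diff.

    import numpy as np
    from transformers import pipeline

    asr = pipeline(
        task="automatic-speech-recognition",
        model="openai/whisper-small",
    )

    # One second of silence at 16 kHz, only to show the expected dtype and shape.
    audio = np.zeros(16000, dtype=np.float32)

    # Passing a dict makes the sampling rate explicit instead of implied.
    result = asr(
        {"raw": audio, "sampling_rate": 16000},
        generate_kwargs={"language": "english"},  # optional decoder hint
    )
    print(result["text"])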
@@ -363,21 +358,14 @@ class ConversationManager:
     def transcribe(self, audio_segment, spoken_lang=None):
         import numpy as np
 
-        # Convert AudioSegment → numpy float32 PCM
         audio = np.array(audio_segment.get_array_of_samples()).astype("float32")
-        audio = audio / np.max(np.abs(audio))
+        audio = audio / np.max(np.abs(audio))
 
-        # Transformers Whisper expects a Python list or numpy array
         result = whisper_pipe(audio)
-
         text = result.get("text", "").strip()
 
-        # transformers Whisper does not provide language predictions on CPU
         return text, spoken_lang or "unknown", 1.0
 
-
-
-
 ###########################################################
 # TEXT → SPEECH
 ###########################################################
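Note: transcribe divides by np.max(np.abs(audio)), so it expects non-silent mono 16 kHz input, as the transcribe_audio docstring states. A standalone sketch of producing such input with pydub follows; the helper name and the silence guard are illustrative additions, not part of this commit.

    import numpy as np
    from pydub import AudioSegment

    def audiosegment_to_float32(path):
        # Illustrative helper: load any audio file, downmix to mono, resample to 16 kHz.
        seg = AudioSegment.from_file(path).set_channels(1).set_frame_rate(16000)
        audio = np.array(seg.get_array_of_samples()).astype("float32")
        peak = np.max(np.abs(audio))
        if peak > 0:  # a silent clip would otherwise divide by zero
            audio = audio / peak
        return audio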