-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspeech_engine.py
More file actions
164 lines (134 loc) · 4.77 KB
/
speech_engine.py
File metadata and controls
164 lines (134 loc) · 4.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import speech_recognition as sr
import asyncio
import logging
import edge_tts
import pygame
import os
import tempfile
import hashlib
from config import GROQ_API_KEY, WHISPER_MODEL
import groq
# logging.basicConfig(level=logging.INFO)
# Groq client used for Whisper transcription. It stays None when no API key is
# configured or when constructing the client raises, so users of the client
# must check it before use.
groq_client = None
if GROQ_API_KEY:
    try:
        groq_client = groq.Groq(api_key=GROQ_API_KEY)
    except Exception as init_error:
        logging.error(f"Failed to initialize Groq client for Whisper: {init_error}")
# Initialize Pygame Mixer Lazily
def _ensure_mixer():
    """Initialize the Pygame mixer if it is not initialized yet.

    A failure to initialize is logged rather than raised, so callers must
    re-check ``pygame.mixer.get_init()`` before relying on the mixer.
    """
    if pygame.mixer.get_init():
        return
    try:
        pygame.mixer.init()
    except Exception as init_error:
        logging.error(f"Failed to initialize Pygame mixer: {init_error}")
# Voice Configuration
# Edge-TTS voice name handed to edge_tts.Communicate.
VOICE = "en-GB-RyanNeural"

# TTS Cache
# Synthesized audio is stored in a "tts_cache" directory next to this module.
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tts_cache")
# exist_ok=True replaces the exists()-then-makedirs() pair, which could raise
# FileExistsError if another process created the directory between the two calls.
os.makedirs(CACHE_DIR, exist_ok=True)

# True while a playback loop is running; cleared to request an early stop.
_is_speaking = False
# When True, speech output is suppressed (the text is only logged).
SILENT_MODE = False
def set_silent_mode(enabled: bool):
    """Switch silent mode on or off.

    Enabling silent mode also stops any playback in progress by calling
    ``stop_speaking()``; disabling it only updates the flag.
    """
    global SILENT_MODE
    SILENT_MODE = enabled
    if not enabled:
        return
    stop_speaking()
def stop_speaking():
    """Stops the current speech playback immediately.

    Clears the module-level speaking flag first, then stops and unloads the
    mixer's music stream if the mixer has been initialized. Any error raised
    by Pygame is logged rather than propagated.
    """
    global _is_speaking
    _is_speaking = False
    try:
        if not pygame.mixer.get_init():
            return
        music = pygame.mixer.music
        music.stop()
        music.unload()
    except Exception as stop_error:
        logging.error(f"Error stopping speech: {stop_error}")
async def speak(text):
    """Generate speech with Edge-TTS and play it with Pygame, caching the audio.

    The audio for each distinct ``text`` is stored in ``CACHE_DIR`` as
    ``<md5-of-text>.mp3`` and reused on later calls. The coroutine returns once
    playback has finished or the module-level ``_is_speaking`` flag has been
    cleared (as ``stop_speaking()`` does).

    Nothing is spoken when ``SILENT_MODE`` is on (the text is only logged),
    when ``text`` is empty, or when the Pygame mixer cannot be initialized.
    Errors during synthesis or playback are logged, not raised.

    Args:
        text: The text to speak.
    """
    global _is_speaking

    if SILENT_MODE:
        logging.info(f"Silent Mode (Speech Suppressed): {text}")
        return
    if not text:
        return

    # Playback needs a working mixer; _ensure_mixer logs any init failure.
    _ensure_mixer()
    if not pygame.mixer.get_init():
        return

    try:
        # Cache key is the MD5 hex digest of the text.
        # NOTE(review): the key does not include VOICE, so changing the voice
        # keeps serving audio cached with the previous one.
        text_hash = hashlib.md5(text.encode()).hexdigest()
        audio_file = os.path.join(CACHE_DIR, f"{text_hash}.mp3")

        if not os.path.exists(audio_file):
            # Synthesize into a temporary file and move it into place only
            # once complete. Writing straight to audio_file would leave a
            # truncated mp3 behind on failure or cancellation, and every later
            # call would replay it as a cache hit.
            fd, partial_file = tempfile.mkstemp(dir=CACHE_DIR, suffix=".part")
            os.close(fd)
            try:
                communicate = edge_tts.Communicate(text, VOICE)
                await communicate.save(partial_file)
                os.replace(partial_file, audio_file)
            finally:
                # After a successful os.replace the partial file is gone;
                # otherwise this removes the incomplete download. Best-effort.
                try:
                    os.remove(partial_file)
                except OSError:
                    pass

        _is_speaking = True
        pygame.mixer.music.load(audio_file)
        pygame.mixer.music.play()

        # Poll so the event loop stays responsive and an external stop request
        # (clearing _is_speaking) ends the wait early.
        while pygame.mixer.music.get_busy() and _is_speaking:
            await asyncio.sleep(0.1)

        if not _is_speaking:
            pygame.mixer.music.stop()
            pygame.mixer.music.unload()
    except Exception as e:
        logging.error(f"Edge-TTS Error: {e}")
    finally:
        # Also runs on task cancellation, which "except Exception" does not
        # catch, so the flag can never be left stuck at True.
        _is_speaking = False
def speak_sync(text):
    """Blocking wrapper around ``speak``.

    Runs the coroutine to completion with ``asyncio.run`` and returns nothing.
    """
    speech_coroutine = speak(text)
    asyncio.run(speech_coroutine)
def _transcribe_audio(recognizer, audio):
    """Return the lowercase transcript of ``audio``.

    Uses Groq Whisper when ``groq_client`` is available and falls back to the
    recognizer's Google backend when there is no client or the Groq request
    fails. Exceptions from the Google fallback propagate to the caller.
    """
    if groq_client:
        try:
            # The WAV bytes are sent from memory. The previous round-trip
            # through a fixed "temp_command.wav" in the shared temp directory
            # could collide between concurrent calls and leaked the file
            # whenever transcription raised.
            transcription = groq_client.audio.transcriptions.create(
                file=("temp_command.wav", audio.get_wav_data()),
                model=WHISPER_MODEL,
                response_format="text",
            )
            return transcription.strip().lower()
        except Exception as e:
            logging.error(f"Groq Whisper transcription failed: {e}. Falling back to Google.")
    return recognizer.recognize_google(audio).lower()


def listen_sync():
    """Capture one spoken phrase from the microphone and return its transcript.

    Audio is recorded with SpeechRecognition and transcribed with Groq Whisper,
    falling back to Google speech recognition when Groq is unavailable or fails.

    Returns:
        The lowercase transcript, or ``None`` when no speech starts within the
        timeout, the audio cannot be understood, or any other error occurs
        while listening or transcribing (such errors are logged).

    Note:
        ``sr.Recognizer()`` and ``sr.Microphone()`` are created outside the
        error handling, so exceptions raised while constructing them propagate.
    """
    recognizer = sr.Recognizer()
    mic = sr.Microphone()
    try:
        with mic as source:
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            logging.info("Listening...")
            # A WaitTimeoutError raised here is handled by the outer handler.
            audio = recognizer.listen(source, timeout=5, phrase_time_limit=10)

        logging.info("Transcribing with Groq Whisper...")
        command = _transcribe_audio(recognizer, audio)
        logging.info(f"User said: {command}")
        return command
    except (sr.UnknownValueError, sr.WaitTimeoutError):
        # Benign errors (silence or timeout)
        return None
    except Exception as e:
        # exc_info=True appends the traceback to the logged message.
        logging.error(f"Speech Error: {e!r}", exc_info=True)
        return None
async def listen():
    """Run ``listen_sync`` in a worker thread and return its result.

    Uses ``asyncio.to_thread`` so the blocking capture and transcription do
    not stall the event loop.
    """
    heard = await asyncio.to_thread(listen_sync)
    return heard