-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_andy.py
More file actions
192 lines (148 loc) · 5.63 KB
/
train_andy.py
File metadata and controls
192 lines (148 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
"""
Train a custom "Andy" wake word model for OpenWakeWord.
Two methods:
1. Synthetic: Generate TTS samples of "Andy" (fast but less accurate)
2. Real: Record yourself saying "Andy" multiple times (more accurate)
This script helps with both approaches.
"""
import os
import sys
import json
import wave
import struct
import numpy as np
from pathlib import Path
# Sample rate in Hz, used for both the recording stream and the saved WAV header.
SAMPLE_RATE = 16000
# Name of the wake word; not referenced by the code visible in this file.
WAKE_WORD = "andy"
# Root folder for audio samples; holds the "positive" and "negative" subfolders.
OUTPUT_DIR = Path("~/projects/voice-control/training_data").expanduser()
# Folder that receives the trained model file.
MODEL_OUTPUT = Path("~/projects/voice-control/models").expanduser()
def setup_directories():
    """Ensure the training-data and model output folders exist.

    Creates OUTPUT_DIR and MODEL_OUTPUT (including missing parents), then
    the "positive" and "negative" sample folders under OUTPUT_DIR.
    Directories that already exist are left as they are.
    """
    for root in (OUTPUT_DIR, MODEL_OUTPUT):
        root.mkdir(parents=True, exist_ok=True)
    for label in ("positive", "negative"):
        OUTPUT_DIR.joinpath(label).mkdir(exist_ok=True)
    print(f"Created directories in {OUTPUT_DIR}")
def record_sample(filename: str, duration: float = 2.0):
    """Record mono 16-bit audio from an input device and save it as a WAV file.

    Args:
        filename: Destination path of the WAV file; it is passed through
            ``str()`` before being opened, so path-like objects work too.
        duration: Length of the recording in seconds.

    SAMPLE_RATE is used for both the input stream and the WAV header.
    PyAudio is imported inside the function, so it is only required when a
    recording is actually made.
    """
    import pyaudio

    chunk = 1024
    # Count frames rather than whole chunks: rounding down to a number of
    # 1024-frame chunks would make the recording shorter than announced
    # whenever the duration is not a multiple of the chunk length.
    remaining = int(SAMPLE_RATE * duration)

    p = pyaudio.PyAudio()
    try:
        stream = p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=chunk,
        )
        try:
            print(f"Recording for {duration} seconds... Say 'Andy'!")
            frames = []
            while remaining > 0:
                to_read = min(chunk, remaining)
                frames.append(stream.read(to_read))
                remaining -= to_read
            stream.stop_stream()
        finally:
            # Release the audio device even if a read fails part-way.
            stream.close()
    finally:
        p.terminate()

    # Save as WAV; the context manager closes the file even if a write fails.
    with wave.open(str(filename), "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # paInt16 samples are 2 bytes wide
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(b"".join(frames))
    print(f"Saved to {filename}")
def record_positive_samples(count: int = 20):
    """Interactively capture `count` recordings of the wake word.

    Each take waits for Enter, then records 1.5 seconds into
    OUTPUT_DIR/positive/andy_NNN.wav, where NNN is the zero-based,
    zero-padded take index.
    """
    target_dir = OUTPUT_DIR / "positive"

    print(f"\n=== Recording {count} positive samples ===")
    print("Say 'Andy' clearly after each prompt.\n")

    for index in range(count):
        take = index + 1  # prompts are numbered from 1, file names from 0
        wav_path = target_dir / f"andy_{index:03d}.wav"
        input(f"Press Enter to record sample {take}/{count}...")
        record_sample(str(wav_path), duration=1.5)

    print(f"\nRecorded {count} positive samples!")
def record_negative_samples(count: int = 20):
    """Interactively capture `count` recordings that are NOT the wake word.

    Each take waits for Enter, then records 2.0 seconds into
    OUTPUT_DIR/negative/negative_NNN.wav, where NNN is the zero-based,
    zero-padded take index.
    """
    target_dir = OUTPUT_DIR / "negative"

    print(f"\n=== Recording {count} negative samples ===")
    print("Say random words, make noise, or stay silent.\n")

    for index in range(count):
        take = index + 1  # prompts are numbered from 1, file names from 0
        wav_path = target_dir / f"negative_{index:03d}.wav"
        input(f"Press Enter to record negative sample {take}/{count}...")
        record_sample(str(wav_path), duration=2.0)

    print(f"\nRecorded {count} negative samples!")
def generate_synthetic_samples():
    """Generate synthetic 'Andy' samples using TTS.

    Synthesises each phrase from a fixed list of "Andy" variants and writes
    it to OUTPUT_DIR/positive/synthetic_andy_NNN.wav. If the TTS package
    cannot be imported, it is first installed with pip for the interpreter
    that is running this script.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
    """
    try:
        from TTS.api import TTS
    except ImportError:
        import importlib
        import subprocess

        print("Installing TTS library...")
        # Run pip through the current interpreter so the package lands in
        # the environment this script uses; a bare "pip" on PATH may belong
        # to a different Python. check_call raises if the install fails
        # instead of letting the import below fail with a confusing error.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "TTS"])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        from TTS.api import TTS

    positive_dir = OUTPUT_DIR / "positive"
    print("Generating synthetic 'Andy' samples...")
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")

    # Generate variations
    phrases = [
        "Andy", "andy", "Andy!", "Hey Andy", "Okay Andy",
        "Andy?", "ANDY", "andi", "Andie",
    ]
    for i, phrase in enumerate(phrases):
        filename = positive_dir / f"synthetic_andy_{i:03d}.wav"
        tts.tts_to_file(text=phrase, file_path=str(filename))
        print(f"Generated: {filename}")

    print(f"\nGenerated {len(phrases)} synthetic samples!")
def train_model():
    """Train the "Andy" wake word model from the collected WAV samples.

    Counts the ``*.wav`` files under OUTPUT_DIR/positive and
    OUTPUT_DIR/negative and prints both totals. With fewer than 10 positive
    samples it prints a hint and returns without training. Otherwise it
    passes the sample directories to the openwakeword trainer (the negative
    directory only when it contains samples) and writes the model to
    MODEL_OUTPUT/andy_v1.onnx.
    """
    print("\n=== Training Custom Wake Word Model ===\n")

    positive_dir = OUTPUT_DIR / "positive"
    negative_dir = OUTPUT_DIR / "negative"

    # Check if we have enough samples
    positive_count = sum(1 for _ in positive_dir.glob("*.wav"))
    negative_count = sum(1 for _ in negative_dir.glob("*.wav"))
    print(f"Positive samples: {positive_count}")
    print(f"Negative samples: {negative_count}")

    if positive_count < 10:
        print("\n⚠️ Need at least 10 positive samples!")
        print("Run: python train_andy.py record")
        return

    print("\nTraining model... (this may take a few minutes)")

    # NOTE(review): assumes openwakeword.train exposes a train_model callable
    # that accepts these keyword arguments -- confirm against the installed
    # openwakeword version.
    from openwakeword.train import train_model as oww_train

    model_path = MODEL_OUTPUT / "andy_v1.onnx"
    if negative_count > 0:
        negative_arg = str(negative_dir)
    else:
        negative_arg = None

    oww_train(
        positive_audio_dir=str(positive_dir),
        negative_audio_dir=negative_arg,
        output_path=str(model_path),
        epochs=100,
    )

    print(f"\n✅ Model saved to: {model_path}")
    print("\nTo use this model, update server.py:")
    print(f' WAKE_WORD_MODELS = ["{model_path}"]')
def main():
    """Command-line entry point: dispatch on the first argument.

    Commands:
        record    - record positive samples, then optionally negative ones
        synthetic - generate synthetic samples with TTS
        train     - train the model from the collected samples

    With no argument the usage text is printed and the function returns.
    An unknown command prints an error to stderr and exits with status 1,
    so shell chains such as ``record && train`` stop on a typo. The data
    directories are only created once a valid command has been given, so
    asking for help has no filesystem side effects.

    Raises:
        SystemExit: With code 1 when the command is not recognised.
    """
    if len(sys.argv) < 2:
        print("""
Usage: python train_andy.py <command>
Commands:
record - Record real samples (recommended, ~5 minutes)
synthetic - Generate synthetic samples using TTS (faster)
train - Train the model after recording samples
Recommended workflow:
1. python train_andy.py record
2. python train_andy.py train
""")
        return

    command = sys.argv[1]
    if command not in ("record", "synthetic", "train"):
        print(f"Unknown command: {command}", file=sys.stderr)
        sys.exit(1)

    setup_directories()

    if command == "record":
        record_positive_samples(20)
        print("\nDo you want to record negative samples too? (recommended)")
        # Tolerate surrounding whitespace and a spelled-out "yes".
        answer = input("Record negative samples? [y/N]: ").strip().lower()
        if answer in ("y", "yes"):
            record_negative_samples(10)
    elif command == "synthetic":
        generate_synthetic_samples()
    else:  # command == "train"
        train_model()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()