voice-control/train_andy.py at main · offloadmywork/voice-control · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env python3
"""
Train custom "Andy" wake word model for OpenWakeWord

Two methods:
1. Synthetic: Generate TTS samples of "Andy" (fast but less accurate)
2. Real: Record yourself saying "Andy" multiple times (more accurate)

This script helps with both approaches.
"""

import os
import sys
import json
import wave
import struct
import numpy as np
from pathlib import Path

# Frames per second; used both when opening the PyAudio input stream and when
# writing the WAV header in record_sample().
SAMPLE_RATE = 16000
# Lower-case wake word. NOTE(review): not referenced by the functions shown in
# this file, which hard-code "andy"/"Andy" in file names and prompts.
WAKE_WORD = "andy"
# Root folder for recorded/generated samples ("positive" and "negative"
# sub-folders are created by setup_directories()).
OUTPUT_DIR = Path("~/projects/voice-control/training_data").expanduser()
# Folder that train_model() writes the trained .onnx model into.
MODEL_OUTPUT = Path("~/projects/voice-control/models").expanduser()

def setup_directories():
    """Ensure the training-data and model output folders exist.

    Creates OUTPUT_DIR and MODEL_OUTPUT (including missing parents), then the
    "positive" and "negative" sample folders inside OUTPUT_DIR. Folders that
    already exist are left as they are.
    """
    for root in (OUTPUT_DIR, MODEL_OUTPUT):
        root.mkdir(parents=True, exist_ok=True)

    for label in ("positive", "negative"):
        OUTPUT_DIR.joinpath(label).mkdir(exist_ok=True)

    print(f"Created directories in {OUTPUT_DIR}")

def record_sample(filename: str, duration: float = 2.0):
    """Record a single mono, 16-bit audio sample and save it as a WAV file.

    Args:
        filename: Destination path of the WAV file (converted with ``str``).
        duration: Length of the recording in seconds.

    The audio stream and the PyAudio instance are released even when reading
    from the device raises, and the WAV file is closed even when writing
    fails. Exceptions from PyAudio or ``wave`` propagate to the caller.
    """
    import pyaudio

    chunk_frames = 1024  # frames requested from the device per read call
    # Read exactly SAMPLE_RATE * duration frames. Rounding down to a whole
    # number of 1024-frame chunks would make the clip shorter than requested.
    remaining = int(SAMPLE_RATE * duration)

    p = pyaudio.PyAudio()
    try:
        stream = p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=chunk_frames,
        )
        try:
            print(f"Recording for {duration} seconds... Say 'Andy'!")
            frames = []
            while remaining > 0:
                count = min(chunk_frames, remaining)
                frames.append(stream.read(count))
                remaining -= count
        finally:
            # Always give the input device back, even if a read failed.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # Save as WAV; the context manager closes the file on any exit path.
    with wave.open(str(filename), 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample, matching paInt16
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(b''.join(frames))

    print(f"Saved to {filename}")

def record_positive_samples(count: int = 20):
    """Interactively capture `count` recordings of the user saying 'Andy'.

    Before each 1.5-second recording the user is asked to press Enter. Clips
    are written to the "positive" folder under OUTPUT_DIR as andy_000.wav,
    andy_001.wav, and so on.
    """
    target_dir = OUTPUT_DIR / "positive"

    print(f"\n=== Recording {count} positive samples ===")
    print("Say 'Andy' clearly after each prompt.\n")

    # `number` is the 1-based position shown to the user; file names are
    # 0-based.
    for number in range(1, count + 1):
        clip_path = target_dir / f"andy_{number - 1:03d}.wav"
        input(f"Press Enter to record sample {number}/{count}...")
        record_sample(str(clip_path), duration=1.5)

    print(f"\nRecorded {count} positive samples!")

def record_negative_samples(count: int = 20):
    """Interactively capture `count` non-wake-word recordings.

    Each clip lasts 2.0 seconds and is stored in the "negative" folder under
    OUTPUT_DIR as negative_000.wav, negative_001.wav, and so on. The user
    presses Enter to start every recording.
    """
    print(f"\n=== Recording {count} negative samples ===")
    print("Say random words, make noise, or stay silent.\n")

    # Work out every destination up front, then walk through them in order.
    clip_paths = [
        OUTPUT_DIR / "negative" / f"negative_{index:03d}.wav"
        for index in range(count)
    ]
    for position, clip_path in enumerate(clip_paths, start=1):
        input(f"Press Enter to record negative sample {position}/{count}...")
        record_sample(str(clip_path), duration=2.0)

    print(f"\nRecorded {count} negative samples!")

def generate_synthetic_samples():
    """Generate synthetic 'Andy' samples using TTS.

    Synthesizes a fixed list of spellings and phrasings of the wake word with
    the "tts_models/en/ljspeech/tacotron2-DDC" model and writes each one to
    the "positive" folder under OUTPUT_DIR as synthetic_andy_NNN.wav.

    If the TTS package cannot be imported, it is installed with pip for the
    interpreter that is running this script, and the import is retried.

    Raises:
        ImportError: If TTS is missing and cannot be installed or imported.
    """
    try:
        from TTS.api import TTS
    except ImportError:
        import importlib
        import subprocess

        print("Installing TTS library...")
        try:
            # `sys.executable -m pip` targets the running interpreter; a bare
            # `pip` on PATH may belong to a different Python installation.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "TTS"]
            )
        except (subprocess.CalledProcessError, OSError) as err:
            # Keep ImportError as the failure type, but report the real cause
            # instead of a confusing second import failure.
            raise ImportError("Could not install the TTS package") from err
        # Let the import system notice the freshly installed package.
        importlib.invalidate_caches()
        from TTS.api import TTS

    positive_dir = OUTPUT_DIR / "positive"

    print("Generating synthetic 'Andy' samples...")
    tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC")

    # Variations in case, punctuation, spelling and carrier phrase.
    phrases = [
        "Andy", "andy", "Andy!", "Hey Andy", "Okay Andy",
        "Andy?", "ANDY", "andi", "Andie",
    ]

    for i, phrase in enumerate(phrases):
        filename = positive_dir / f"synthetic_andy_{i:03d}.wav"
        tts.tts_to_file(text=phrase, file_path=str(filename))
        print(f"Generated: {filename}")

    print(f"\nGenerated {len(phrases)} synthetic samples!")

def train_model():
    """Train the 'Andy' wake word model from the collected WAV samples.

    Counts the *.wav files in the "positive" and "negative" folders under
    OUTPUT_DIR and prints both totals. With fewer than 10 positive samples it
    prints a hint and returns without training. Otherwise it calls the
    openwakeword trainer for 100 epochs, writing to MODEL_OUTPUT/andy_v1.onnx;
    the negative folder is only passed when it contains at least one sample.
    """
    print("\n=== Training Custom Wake Word Model ===\n")

    positive_dir = OUTPUT_DIR / "positive"
    negative_dir = OUTPUT_DIR / "negative"

    # Tally the samples available for training.
    positive_count = sum(1 for _ in positive_dir.glob("*.wav"))
    negative_count = sum(1 for _ in negative_dir.glob("*.wav"))

    print(f"Positive samples: {positive_count}")
    print(f"Negative samples: {negative_count}")

    if positive_count < 10:
        print("\n⚠️  Need at least 10 positive samples!")
        print("Run: python train_andy.py record")
        return

    print("\nTraining model... (this may take a few minutes)")

    # Imported here so the other commands work without openwakeword installed.
    # NOTE(review): confirm that openwakeword.train exposes train_model with
    # these keyword arguments in the installed version.
    from openwakeword.train import train_model as oww_train

    model_path = MODEL_OUTPUT / "andy_v1.onnx"

    if negative_count > 0:
        negative_arg = str(negative_dir)
    else:
        negative_arg = None

    oww_train(
        positive_audio_dir=str(positive_dir),
        negative_audio_dir=negative_arg,
        output_path=str(model_path),
        epochs=100,
    )

    print(f"\n✅ Model saved to: {model_path}")
    print("\nTo use this model, update server.py:")
    print(f'  WAKE_WORD_MODELS = ["{model_path}"]')

def main():
    """Command-line entry point: run the command named by the first argument.

    Commands:
        record    - record 20 positive samples, then optionally 10 negative
        synthetic - generate positive samples with TTS
        train     - train the model from the collected samples

    The data directories are created before the arguments are inspected.
    Without a command the usage text is printed. An unknown command prints an
    error plus the usage text and exits with status 1, so shells and scripts
    can detect the mistake.
    """
    usage = """
Usage: python train_andy.py <command>

Commands:
  record    - Record real samples (recommended, ~5 minutes)
  synthetic - Generate synthetic samples using TTS (faster)
  train     - Train the model after recording samples

Recommended workflow:
  1. python train_andy.py record
  2. python train_andy.py train
"""

    setup_directories()

    if len(sys.argv) < 2:
        print(usage)
        return

    command = sys.argv[1]

    if command == "record":
        record_positive_samples(20)
        print("\nDo you want to record negative samples too? (recommended)")
        # Tolerate surrounding whitespace and a spelled-out "yes".
        answer = input("Record negative samples? [y/N]: ").strip().lower()
        if answer in ("y", "yes"):
            record_negative_samples(10)

    elif command == "synthetic":
        generate_synthetic_samples()

    elif command == "train":
        train_model()

    else:
        print(f"Unknown command: {command}")
        print(usage)
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()