forked from esuyoonn/2026_Spring_DSL_Modeling_CV
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathspeech_features.py
More file actions
29 lines (24 loc) · 1.04 KB
/
speech_features.py
File metadata and controls
29 lines (24 loc) · 1.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# speech_features.py
import librosa
import numpy as np
import opensmile
import torch
# Feature extraction and silence detection
def load_silero_vad():
    """Load the Silero VAD entry point through ``torch.hub``.

    Returns:
        A ``(vad_model, vad_utils)`` pair, exactly as unpacked from the
        ``torch.hub.load`` result for the ``silero_vad`` entry point.
    """
    # trust_repo=True skips torch.hub's interactive trust prompt for this repo.
    hub_options = {
        "repo_or_dir": "snakers4/silero-vad",
        "model": "silero_vad",
        "trust_repo": True,
    }
    model, utils = torch.hub.load(**hub_options)
    return model, utils
def get_speech_timestamps(wav, vad_model, sampling_rate=16000, vad_utils=None):
    """Run Silero VAD speech detection on ``wav``.

    Args:
        wav: Audio to analyse; forwarded unchanged to the Silero helper.
        vad_model: VAD model, e.g. the first value returned by
            ``load_silero_vad()``.
        sampling_rate: Sampling rate of ``wav`` in Hz.
        vad_utils: Utilities sequence returned alongside the model by
            ``load_silero_vad()``; its first element must be the
            ``get_speech_timestamps`` callable. When omitted, a module-level
            ``vad_utils`` is used if one exists, otherwise the utilities are
            loaded via ``load_silero_vad()``.

    Returns:
        Whatever the Silero ``get_speech_timestamps`` helper returns for the
        given audio (the detected speech segments).
    """
    if vad_utils is None:
        # The original body read an undefined global named ``vad_utils``;
        # keep honouring it for callers that set it on the module.
        vad_utils = globals().get("vad_utils")
    if vad_utils is None:
        # Last resort: fetch the utilities. This reloads the hub entry point,
        # so pass ``vad_utils`` explicitly when calling repeatedly.
        _, vad_utils = load_silero_vad()

    # Only the first utility is needed; a distinct local name avoids
    # shadowing this wrapper inside its own body.
    detect_speech = vad_utils[0]

    # Must be passed by keyword: the third positional parameter of the Silero
    # helper is ``threshold``, not ``sampling_rate``.
    return detect_speech(wav, vad_model, sampling_rate=sampling_rate)
def compute_pitch_energy(y, sr, HOP=512, FRAME_LENGTH=2048):
    """Compute pitch (F0) and RMS-energy statistics for an audio signal.

    Args:
        y: Audio signal passed to ``librosa.pyin`` and ``librosa.feature.rms``.
        sr: Sampling rate of ``y``.
        HOP: Hop length used for both the pitch track and the RMS frames.
        FRAME_LENGTH: Frame length used for the RMS computation.

    Returns:
        A tuple ``(pitch_mean, pitch_std, energy_mean, energy_std, f0, rms)``.
        The four statistics are Python floats; ``f0`` is the raw pYIN pitch
        track and ``rms`` the first row of the RMS feature matrix.
        ``pitch_mean`` and ``pitch_std`` are NaN when ``f0`` holds no
        non-NaN value.
    """
    # Pitch estimation with pYIN, searching between 70 Hz and 400 Hz.
    f0, _, _ = librosa.pyin(y, fmin=70, fmax=400, sr=sr, hop_length=HOP)

    # Guard the NaN-aware reductions: on an all-NaN track they would emit a
    # RuntimeWarning, so report NaN directly instead.
    has_pitch = bool(np.any(~np.isnan(f0)))
    pitch_mean = float(np.nanmean(f0)) if has_pitch else float("nan")
    pitch_std = float(np.nanstd(f0)) if has_pitch else float("nan")

    # Frame-wise RMS energy; [0] selects the first row of the result.
    rms = librosa.feature.rms(y=y, frame_length=FRAME_LENGTH, hop_length=HOP)[0]
    energy_mean = float(np.mean(rms))
    energy_std = float(np.std(rms))

    return pitch_mean, pitch_std, energy_mean, energy_std, f0, rms