Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Python
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python
*.egg
*.egg-info/
dist/
build/

# Virtual environments
.venv/
venv/
env/

# Project outputs
outputs/

# Large model files
*.h5
*.pb
*.tflite
*.onnx

# Secrets and Environment
.env
.env.local
.env.*.local
69 changes: 69 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Intelligent CC Suggestion Tool — Goal 1

## Sound Event Detection Module
This module automatically detects and classifies non-speech audio events (like honking, laughter, music) from a video file.

---

## 🛠 Setup Instructions

### 1. Prerequisites
- **Python 3.10+**
- **FFmpeg**: Must be installed and available in your system's PATH.
  - *Windows*: `winget install ffmpeg`
  - *Linux*: `sudo apt install ffmpeg`

### 2. Environment Setup
Clone the repository and set up a virtual environment:

```bash
# Clone the repository
git clone https://github.com/Siddharth-732/Intelligent-cc-generation.git
cd Intelligent-cc-generation

# Create a virtual environment
python -m venv .venv

# Activate the environment
# Windows:
.\.venv\Scripts\activate
# Linux/Mac:
source .venv/bin/activate

# Install dependencies
pip install -r requirements.txt
```

### 3. Usage
Verify the installation by running the test script on a video file:

```bash
python test_goal_1.py "path/to/your/video.mp4"
```

---

## 💻 Programmatic Usage
```python
from cc_tool.audio import extract_audio, SoundEventDetector

# 1. Extract audio from video
wav_path = extract_audio("video.mp4")

# 2. Initialize detector
detector = SoundEventDetector(confidence_threshold=0.3)

# 3. Detect non-speech events
events = detector.detect(wav_path)

for e in events:
    print(f"[{e.start_sec}s - {e.end_sec}s] {e.label} ({e.confidence})")
```

---

## 📁 Project Structure
- `cc_tool/audio/extractor.py`: Audio extraction logic.
- `cc_tool/audio/detector.py`: YAMNet model implementation.
- `cc_tool/audio/mapping.py`: Mapping from YAMNet class names to canonical CC labels.
- `cc_tool/audio/models.py`: Data models.
- `cc_tool/audio/utils.py`: Audio processing utilities.
1 change: 1 addition & 0 deletions cc_tool/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Marks this directory as a Python package
5 changes: 5 additions & 0 deletions cc_tool/audio/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .detector import SoundEventDetector
from .extractor import extract_audio
from .models import AudioEvent

__all__ = ["SoundEventDetector", "extract_audio", "AudioEvent"]
82 changes: 82 additions & 0 deletions cc_tool/audio/detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import numpy as np
import soundfile as sf
import tensorflow_hub as hub
import csv
import os
from cc_tool.audio.models import AudioEvent
from cc_tool.audio.utils import chunk_audio, normalize_waveform
from cc_tool.audio.mapping import get_canonical_label, _IGNORE_GROUPS

# AudioSet/YAMNet class indices treated as speech; detect() skips these rows
# entirely before any grouping or thresholding happens.
# NOTE(review): in the published YAMNet class map, indices 6-9 appear to be
# "Shout", "Bellow", "Whoop" and "Yell" rather than speech proper. If so, the
# "Shout" entry in mapping.LABEL_GROUPS can never fire — confirm against
# yamnet_class_map.csv before relying on a [Shout] caption.
SPEECH_INDICES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

class SoundEventDetector:
    """Detect non-speech sound events in a WAV file using YAMNet.

    The TF-Hub model and its class-name table are loaded lazily on the first
    call to ``detect``. Per-class scores are summed into canonical groups
    (via ``get_canonical_label``) and every group whose summed score reaches
    ``confidence_threshold`` is reported, so co-occurring sounds in the same
    chunk each produce their own event.
    """

    def __init__(self, confidence_threshold=0.25):  # Lowered threshold slightly for grouped detection
        # The threshold is compared against the *summed* score of a canonical
        # group, not against a single YAMNet class score.
        self.confidence_threshold = confidence_threshold
        self._model = None
        self._class_names = []

    def _load_model(self):
        """Load the YAMNet model and its class display names (once).

        Both attributes are committed only after both loads succeed, so a
        failure while reading the class map does not leave the detector with
        a model but no class names (which would make later calls fail with an
        IndexError instead of retrying the load).
        """
        if self._model is not None:
            return

        model = hub.load("https://tfhub.dev/google/yamnet/1")
        class_map_path = model.class_map_path().numpy().decode()
        if os.path.exists(class_map_path):
            # newline="" is what the csv module asks for when given a file object.
            with open(class_map_path, "r", encoding="utf-8", newline="") as f:
                class_names = [row["display_name"] for row in csv.DictReader(f)]
        else:
            # The path is not a local file: fall back to fetching it as a URL.
            import urllib.request
            with urllib.request.urlopen(class_map_path) as f:
                reader = csv.DictReader(line.decode("utf-8") for line in f)
                class_names = [row["display_name"] for row in reader]

        self._class_names = class_names
        self._model = model

    def detect(self, wav_path):
        """Run detection on ``wav_path`` and return merged events.

        Args:
            wav_path: Path to an audio file readable by ``soundfile``.

        Returns:
            A list of ``AudioEvent`` objects ordered by start time. Same-label
            events from touching/overlapping chunks are merged into one event.
            ``confidence`` is the summed group score and is therefore not
            guaranteed to stay within [0, 1].
        """
        self._load_model()
        waveform, sr = sf.read(wav_path, dtype="float32")
        if waveform.ndim > 1:
            # Down-mix multi-channel audio to mono by averaging the channels.
            waveform = waveform.mean(axis=1)
        waveform = normalize_waveform(waveform)

        # NOTE(review): YAMNet is documented to expect 16 kHz mono input;
        # presumably chunk_audio accounts for ``sr`` — confirm in utils.py.
        chunks = chunk_audio(waveform, sr)

        # Resolve class index -> canonical group once instead of once per
        # class per chunk. ``None`` marks classes that never contribute:
        # speech indices and the ignored (ambient catch-all) groups.
        group_by_idx = []
        for idx, name in enumerate(self._class_names):
            if idx in SPEECH_INDICES:
                group_by_idx.append(None)
                continue
            canonical = get_canonical_label(name)
            group_by_idx.append(None if canonical in _IGNORE_GROUPS else canonical)

        raw_events = []
        for start_sec, end_sec, chunk in chunks:
            scores, _, _ = self._model(chunk)
            mean_scores = scores.numpy().mean(axis=0)

            # Multi-label grouped scoring: sum the scores of every class that
            # belongs to the same canonical group.
            group_scores = {}
            for idx, score in enumerate(mean_scores):
                canonical = group_by_idx[idx]
                if canonical is None:
                    continue
                group_scores[canonical] = group_scores.get(canonical, 0) + score

            # Emit EVERY group that clears the threshold — not just the
            # loudest one — so co-occurring sounds are not suppressed.
            for group, score in group_scores.items():
                if score >= self.confidence_threshold:
                    raw_events.append(AudioEvent(
                        label=group,
                        confidence=float(score),
                        start_sec=start_sec,
                        end_sec=end_sec,
                    ))

        return self._merge_events(raw_events)

    def _merge_events(self, events):
        """Merge same-label events whose time ranges touch or overlap.

        Because several labels can be emitted for the same chunk, events of
        different labels interleave once sorted by start time. Each event is
        therefore compared with the most recent event *of its own label*
        rather than with the previously emitted event of any label.

        Args:
            events: Objects exposing ``label``, ``confidence``, ``start_sec``
                and ``end_sec``. The list itself is not reordered, but the
                first event of each merged run is extended in place.

        Returns:
            A new list ordered by start time; a merged event spans the union
            of its members and carries their maximum confidence.
        """
        if not events:
            return []

        merged = []
        last_by_label = {}
        for curr in sorted(events, key=lambda ev: ev.start_sec):
            prev = last_by_label.get(curr.label)
            if prev is not None and curr.start_sec <= prev.end_sec:
                prev.end_sec = max(prev.end_sec, curr.end_sec)
                prev.confidence = max(prev.confidence, curr.confidence)
            else:
                merged.append(curr)
                last_by_label[curr.label] = curr
        return merged
35 changes: 35 additions & 0 deletions cc_tool/audio/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os
import subprocess
from pathlib import Path

def extract_audio(video_path: str, output_wav: str | None = None) -> str:
"""
Extract audio track from video using FFmpeg.
Assumes 'ffmpeg' is available in the system PATH.
"""
video_path = Path(video_path)
if not video_path.exists():
raise FileNotFoundError(f"Video file not found: {video_path}")

if output_wav is None:
os.makedirs("outputs/audio", exist_ok=True)
output_wav = f"outputs/audio/{video_path.stem}.wav"

# Standard command using system 'ffmpeg'
cmd = [
"ffmpeg", "-y",
"-i", str(video_path),
"-vn",
"-acodec", "pcm_s16le",
"-ar", "16000",
"-ac", "1",
str(output_wav)
]

result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise RuntimeError(
f"FFmpeg failed. Ensure FFmpeg is installed and in your PATH.\nError: {result.stderr}"
)

return str(output_wav)
147 changes: 147 additions & 0 deletions cc_tool/audio/mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
Semantic mapping from YAMNet classes to canonical CC labels.
Maps a curated subset of YAMNet's 521 display names onto a few dozen stable
categories; every unmapped class falls back to DEFAULT_GROUP.
"""

# Map of YAMNet Display Names → Canonical CC Labels.
# Keys must match the 'display_name' field from YAMNet's class map CSV exactly:
# a key that does not match is never looked up, so that class silently falls
# through to the default group in get_canonical_label().
# NOTE(review): a few keys look like they may not be verbatim display names
# (e.g. "Chink and clink", "Thud", "Beeping", "Birdsong") — TODO confirm every
# key against yamnet_class_map.csv.
LABEL_GROUPS = {
    # --- Impacts & Explosions ---
    "Explosion": "[Gunshot/Explosion]",
    "Gunshot, gunfire": "[Gunshot/Explosion]",
    "Machine gun": "[Gunshot/Explosion]",
    "Fusillade": "[Gunshot/Explosion]",
    "Firecracker": "[Gunshot/Explosion]",
    "Fireworks": "[Gunshot/Explosion]",
    "Artillery fire": "[Gunshot/Explosion]",
    "Cap gun": "[Gunshot/Explosion]",
    "Burst, pop": "[Impact/Pop]",
    "Boom": "[Impact/Pop]",
    "Thud": "[Impact/Pop]",
    "Slam": "[Impact/Pop]",
    "Hammer": "[Metallic Impact]",
    "Clang": "[Metallic Impact]",
    "Clatter": "[Metallic Impact]",
    "Dishes, pots, and pans": "[Metallic Impact]",
    "Cutlery, silverware": "[Metallic Impact]",
    "Glass": "[Glass Breaking]",
    "Shatter": "[Glass Breaking]",
    "Breaking": "[Glass Breaking]",
    "Chink and clink": "[Glass Breaking]",
    "Ding-dong": "[Bell/Chime]",
    "Bell": "[Bell/Chime]",
    "Church bell": "[Bell/Chime]",
    "Cowbell": "[Bell/Chime]",

    # --- Vehicles & Mechanical ---
    "Motor vehicle (road)": "[Vehicle]",
    "Car": "[Vehicle]",
    "Truck": "[Vehicle]",
    "Bus": "[Vehicle]",
    "Engine": "[Vehicle]",
    "Motorcycle": "[Vehicle]",
    "Race car, auto racing": "[Vehicle]",
    "Car alarm": "[Car Alarm]",
    "Horn": "[Horn/Honking]",
    "Car passing by": "[Vehicle]",
    "Vehicle horn, car horn, honking": "[Horn/Honking]",
    "Bicycle": "[Mechanical]",
    "Skateboard": "[Mechanical]",
    "Tools": "[Mechanical]",
    "Drill": "[Mechanical]",
    "Chainsaw": "[Mechanical]",
    "Power tool": "[Mechanical]",

    # --- Nature ---
    "Rain": "[Rain]",
    "Raindrop": "[Rain]",
    "Heavy rain": "[Rain]",
    "Wind": "[Wind]",
    "Rustling leaves": "[Wind]",
    "Thunderstorm": "[Thunder]",
    "Thunder": "[Thunder]",
    "Lightning": "[Thunder]",
    "Ocean": "[Water/Ocean]",
    "Water": "[Water/Ocean]",
    "Stream": "[Water/Ocean]",
    "Waterfall": "[Water/Ocean]",
    "Fire": "[Fire]",
    "Crackle": "[Fire]",

    # --- Animals ---
    "Dog": "[Animal Sound]",
    "Bark": "[Animal Sound]",
    "Howl": "[Animal Sound]",
    "Growling": "[Animal Sound]",
    "Cat": "[Animal Sound]",
    "Meow": "[Animal Sound]",
    "Purr": "[Animal Sound]",
    "Caterwaul": "[Animal Sound]",
    "Roar": "[Animal Sound]",
    "Animal": "[Animal Sound]",
    "Bird": "[Bird Sound]",
    "Crow": "[Bird Sound]",
    "Chirp, tweet": "[Bird Sound]",
    "Birdsong": "[Bird Sound]",
    "Squawk": "[Bird Sound]",
    # Snake / reptile sounds — this was the missing category
    "Hiss": "[Snake/Hiss]",
    "Snake": "[Snake/Hiss]",
    "Rattle": "[Snake/Hiss]",
    "Rattlesnake": "[Snake/Hiss]",
    "Insect": "[Insect Sound]",
    "Cricket": "[Insect Sound]",
    "Mosquito": "[Insect Sound]",

    # --- Human Non-Speech ---
    "Crowd": "[Crowd]",
    "Cheering": "[Crowd]",
    "Applause": "[Applause]",
    "Clapping": "[Applause]",
    "Laughter": "[Laughter]",
    "Chuckle, chortle": "[Laughter]",
    "Giggle": "[Laughter]",
    "Crying, sobbing": "[Crying]",
    "Whimper": "[Crying]",
    "Screaming": "[Scream]",
    "Shout": "[Shout]",
    "Whistling": "[Whistle]",
    "Walk, footsteps": "[Footsteps]",
    "Run": "[Footsteps]",
    "Gasp": "[Gasp]",
    "Groan": "[Groan]",
    "Snoring": "[Snoring]",
    "Cough": "[Cough]",
    "Sneeze": "[Sneeze]",

    # --- Emergency ---
    "Siren": "[Siren]",
    "Emergency vehicle": "[Siren]",
    "Police car (siren)": "[Siren]",
    "Ambulance (siren)": "[Siren]",
    "Fire engine, fire truck (siren)": "[Siren]",
    "Alarm": "[Alarm]",
    "Smoke detector, smoke alarm": "[Alarm]",
    "Beeping": "[Alarm]",

    # --- Music ---
    "Music": "[Music]",
    "Musical instrument": "[Music]",
    "Singing": "[Music]",
    "Drum": "[Music]",
    "Guitar": "[Music]",
    "Piano": "[Music]",
}

# Fallback group for every class that has no entry in LABEL_GROUPS. Collapsing
# them keeps raw YAMNet labels (like 'Inside, small room') from inflating group
# scores and beating real events.
DEFAULT_GROUP = "_ambient_"

# Internal: groups that are ambient/background and should be ignored entirely
_IGNORE_GROUPS = {DEFAULT_GROUP}


def get_canonical_label(yamnet_label: str) -> str:
    """Translate a raw YAMNet display_name into its canonical CC label.

    Labels without an entry in LABEL_GROUPS resolve to DEFAULT_GROUP.
    """
    try:
        return LABEL_GROUPS[yamnet_label]
    except KeyError:
        return DEFAULT_GROUP
19 changes: 19 additions & 0 deletions cc_tool/audio/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""
Intelligent CC Suggestion Tool — Audio Event Data Models
"""
from dataclasses import dataclass

@dataclass
class AudioEvent:
    """A labelled sound event spanning ``start_sec`` to ``end_sec``."""

    label: str
    confidence: float
    start_sec: float
    end_sec: float

    def to_dict(self) -> dict:
        """Return the event as a plain dict with rounded numeric fields.

        ``confidence`` is rounded to 4 decimal places and both timestamps to
        3 decimal places; ``label`` is passed through unchanged.
        """
        payload = {"label": self.label}
        payload["confidence"] = round(self.confidence, 4)
        for field_name in ("start_sec", "end_sec"):
            payload[field_name] = round(getattr(self, field_name), 3)
        return payload
Loading