PlanetRead · Siddharth-732 · May 12, 2026 · May 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,29 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+.Python
+*.egg
+*.egg-info/
+dist/
+build/
+
+# Virtual environments
+.venv/
+venv/
+env/
+
+# Project outputs
+outputs/
+
+# Large model files
+*.h5
+*.pb
+*.tflite
+*.onnx
+
+# Secrets and Environment
+.env
+.env.local
+.env.*.local
diff --git a/README.md b/README.md
@@ -0,0 +1,69 @@
+# Intelligent CC Suggestion Tool — Goal 1
+
+## Sound Event Detection Module
+This module automatically detects and classifies non-speech audio events (like honking, laughter, music) from a video file.
+
+---
+
+## 🛠 Setup Instructions
+
+### 1. Prerequisites
+- **Python 3.10+**
+- **FFmpeg**: Must be installed and available in your system's PATH.
+  - *Windows*: `winget install ffmpeg`
+  - *Linux*: `sudo apt install ffmpeg`
+
+### 2. Environment Setup
+Clone the repository and set up a virtual environment:
+
+```bash
+# Clone the repository
+git clone https://github.com/Siddharth-732/Intelligent-cc-generation.git
+cd Intelligent-cc-generation
+
+# Create a virtual environment
+python -m venv .venv
+
+# Activate the environment
+# Windows:
+.\.venv\Scripts\activate
+# Linux/Mac:
+source .venv/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### 3. Usage
+Verify the installation by running the test script on a video file:
+
+```bash
+python test_goal_1.py "path/to/your/video.mp4"
+```
+
+---
+
+## 💻 Programmatic Usage
+```python
+from cc_tool.audio import extract_audio, SoundEventDetector
+
+# 1. Extract audio from video
+wav_path = extract_audio("video.mp4")
+
+# 2. Initialize detector
+detector = SoundEventDetector(confidence_threshold=0.3)
+
+# 3. Detect non-speech events
+events = detector.detect(wav_path)
+
+for e in events:
+    print(f"[{e.start_sec}s - {e.end_sec}s] {e.label} ({e.confidence})")
+```
+
+---
+
+## 📁 Project Structure
+- `cc_tool/audio/extractor.py`: Audio extraction logic.
+- `cc_tool/audio/detector.py`: YAMNet-based sound event detector (groups classes via `cc_tool/audio/mapping.py`).
+- `cc_tool/audio/models.py`: Data models.
+- `cc_tool/audio/utils.py`: Audio processing utilities.
diff --git a/cc_tool/__init__.py b/cc_tool/__init__.py
@@ -0,0 +1 @@
+# Marks this directory as a Python package
diff --git a/cc_tool/audio/__init__.py b/cc_tool/audio/__init__.py
@@ -0,0 +1,5 @@
+from .detector import SoundEventDetector
+from .extractor import extract_audio
+from .models import AudioEvent
+
+__all__ = ["SoundEventDetector", "extract_audio", "AudioEvent"]
diff --git a/cc_tool/audio/detector.py b/cc_tool/audio/detector.py
@@ -0,0 +1,131 @@
+import csv
+import os
+import warnings
+
+import numpy as np
+import soundfile as sf
+import tensorflow_hub as hub
+
+from cc_tool.audio.models import AudioEvent
+from cc_tool.audio.utils import chunk_audio, normalize_waveform
+from cc_tool.audio.mapping import get_canonical_label, _IGNORE_GROUPS
+
+# AudioSet indices for speech - we ignore these
+# NOTE(review): in the published YAMNet class map, indices 6-9 appear to be
+# Shout/Bellow/Whoop/Yell; if so, the "Shout" entry in LABEL_GROUPS can never
+# be reached. Confirm against the class map before changing this set.
+SPEECH_INDICES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+
+# Sample rate YAMNet is documented to expect (extract_audio() writes WAVs at this rate).
+YAMNET_SAMPLE_RATE = 16000
+
+
+class SoundEventDetector:
+    """Detect non-speech sound events in a WAV file with YAMNet.
+
+    Per-class scores are summed into canonical caption groups; every group
+    whose summed score reaches ``confidence_threshold`` in a chunk becomes an
+    event, and same-label events from touching chunks are merged.
+    """
+
+    def __init__(self, confidence_threshold=0.25):  # Lowered threshold slightly for grouped detection
+        self.confidence_threshold = confidence_threshold
+        self._model = None  # loaded lazily by _load_model()
+        self._class_names = []
+
+    @staticmethod
+    def _read_class_names(class_map_path):
+        """Return the ``display_name`` column of the class-map CSV, in file order."""
+        if os.path.exists(class_map_path):
+            with open(class_map_path, 'r', encoding='utf-8') as f:
+                return [row["display_name"] for row in csv.DictReader(f)]
+        # Not a local file: fall back to fetching it as a URL.
+        import urllib.request
+        with urllib.request.urlopen(class_map_path) as f:
+            reader = csv.DictReader(line.decode("utf-8") for line in f)
+            return [row["display_name"] for row in reader]
+
+    def _load_model(self):
+        """Load YAMNet and its class names on first use; later calls are no-ops."""
+        if self._model is not None:
+            return
+        model = hub.load("https://tfhub.dev/google/yamnet/1")
+        class_map_path = model.class_map_path().numpy().decode()
+        self._class_names = self._read_class_names(class_map_path)
+        # Publish the model only after the class names loaded, so a failed
+        # class-map read is retried instead of leaving an empty name list.
+        self._model = model
+
+    def detect(self, wav_path):
+        """Return the merged non-speech AudioEvents found in ``wav_path``.
+
+        Multi-channel audio is averaged to mono and normalized before it is
+        chunked. An event's ``confidence`` is the SUM of the chunk-averaged
+        scores of the classes in its group, so it is not capped at 1.0.
+        """
+        self._load_model()
+        waveform, sr = sf.read(wav_path, dtype="float32")
+        if sr != YAMNET_SAMPLE_RATE:
+            warnings.warn(
+                f"{wav_path} is sampled at {sr} Hz but YAMNet expects "
+                f"{YAMNET_SAMPLE_RATE} Hz; detections may be unreliable.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+        if waveform.ndim > 1:
+            waveform = waveform.mean(axis=1)
+        waveform = normalize_waveform(waveform)
+
+        # The class -> group mapping is identical for every chunk, so resolve
+        # it once. Speech classes map to None and never contribute.
+        class_groups = [
+            None if idx in SPEECH_INDICES else get_canonical_label(name)
+            for idx, name in enumerate(self._class_names)
+        ]
+
+        raw_events = []
+        for start_sec, end_sec, chunk in chunk_audio(waveform, sr):
+            scores, _, _ = self._model(chunk)
+            mean_scores = scores.numpy().mean(axis=0)
+
+            # Multi-label grouped scoring: sum the scores of each canonical
+            # group. Ignored groups (the ambient catch-all) are never emitted,
+            # so they are skipped before accumulating.
+            group_scores = {}
+            for group, score in zip(class_groups, mean_scores):
+                if group is None or group in _IGNORE_GROUPS:
+                    continue
+                group_scores[group] = group_scores.get(group, 0) + score
+
+            # Emit EVERY group that clears the threshold, not just the loudest
+            # one, so co-occurring sounds (e.g. crowd + snake hiss) are kept.
+            for group, score in group_scores.items():
+                if score >= self.confidence_threshold:
+                    raw_events.append(AudioEvent(
+                        label=group,
+                        confidence=float(score),
+                        start_sec=start_sec,
+                        end_sec=end_sec,
+                    ))
+
+        return self._merge_events(raw_events)
+
+    def _merge_events(self, events):
+        """Merge same-label events whose time spans touch or overlap.
+
+        Several labels can be emitted for the same chunk, so each event is
+        compared with the latest merged event of the SAME label rather than
+        with whichever event was appended last. A merged event keeps the
+        highest confidence seen. The result is ordered by start time.
+        """
+        merged = []
+        latest_by_label = {}
+        for curr in sorted(events, key=lambda event: event.start_sec):
+            prev = latest_by_label.get(curr.label)
+            if prev is not None and curr.start_sec <= prev.end_sec:
+                prev.end_sec = max(prev.end_sec, curr.end_sec)
+                prev.confidence = max(prev.confidence, curr.confidence)
+            else:
+                merged.append(curr)
+                latest_by_label[curr.label] = curr
+        return merged
diff --git a/cc_tool/audio/extractor.py b/cc_tool/audio/extractor.py
@@ -0,0 +1,61 @@
+import os
+import subprocess
+from pathlib import Path
+
+
+def extract_audio(video_path: str, output_wav: str | None = None) -> str:
+    """
+    Extract the audio track of a video as a 16 kHz mono 16-bit PCM WAV using FFmpeg.
+
+    Assumes 'ffmpeg' is available in the system PATH.
+
+    Args:
+        video_path: Path to the source video.
+        output_wav: Destination WAV path. Defaults to
+            ``outputs/audio/<video stem>.wav`` relative to the working directory.
+
+    Returns:
+        The path of the written WAV file, as a string.
+
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+        RuntimeError: If FFmpeg cannot be launched or exits with an error.
+    """
+    video_path = Path(video_path)
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    if output_wav is None:
+        output_wav = f"outputs/audio/{video_path.stem}.wav"
+
+    # FFmpeg does not create missing directories, so make sure the target
+    # directory exists for caller-supplied paths as well as the default one.
+    output_dir = os.path.dirname(str(output_wav))
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    # Standard command using system 'ffmpeg'
+    cmd = [
+        "ffmpeg", "-y",          # overwrite an existing output file
+        "-i", str(video_path),
+        "-vn",                   # drop the video stream
+        "-acodec", "pcm_s16le",
+        "-ar", "16000",
+        "-ac", "1",
+        str(output_wav)
+    ]
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True)
+    except FileNotFoundError as exc:
+        # The 'ffmpeg' executable itself is missing; report it the same way as
+        # any other FFmpeg failure rather than as a missing video file.
+        raise RuntimeError(
+            f"FFmpeg failed. Ensure FFmpeg is installed and in your PATH.\nError: {exc}"
+        ) from exc
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"FFmpeg failed. Ensure FFmpeg is installed and in your PATH.\nError: {result.stderr}"
+        )
+
+    return str(output_wav)
diff --git a/cc_tool/audio/mapping.py b/cc_tool/audio/mapping.py
@@ -0,0 +1,147 @@
+"""
+Semantic mapping for YAMNet classes to canonical CC labels.
+Groups selected YAMNet classes (out of 521) into a few dozen stable categories; all others count as ambient.
+"""
+
+# Map of YAMNet Display Names → Canonical CC Labels.
+# Keys must match the 'display_name' field from YAMNet's class map CSV exactly.
+LABEL_GROUPS = {
+    # --- Impacts & Explosions ---
+    "Explosion": "[Gunshot/Explosion]",
+    "Gunshot, gunfire": "[Gunshot/Explosion]",
+    "Machine gun": "[Gunshot/Explosion]",
+    "Fusillade": "[Gunshot/Explosion]",
+    "Firecracker": "[Gunshot/Explosion]",
+    "Fireworks": "[Gunshot/Explosion]",
+    "Artillery fire": "[Gunshot/Explosion]",
+    "Cap gun": "[Gunshot/Explosion]",
+    "Burst, pop": "[Impact/Pop]",
+    "Boom": "[Impact/Pop]",
+    "Thud": "[Impact/Pop]",
+    "Slam": "[Impact/Pop]",
+    "Hammer": "[Metallic Impact]",
+    "Clang": "[Metallic Impact]",
+    "Clatter": "[Metallic Impact]",
+    "Dishes, pots, and pans": "[Metallic Impact]",
+    "Cutlery, silverware": "[Metallic Impact]",
+    "Glass": "[Glass Breaking]",
+    "Shatter": "[Glass Breaking]",
+    "Breaking": "[Glass Breaking]",
+    "Chink and clink": "[Glass Breaking]",
+    "Ding-dong": "[Bell/Chime]",
+    "Bell": "[Bell/Chime]",
+    "Church bell": "[Bell/Chime]",
+    "Cowbell": "[Bell/Chime]",
+
+    # --- Vehicles & Mechanical ---
+    "Motor vehicle (road)": "[Vehicle]",
+    "Car": "[Vehicle]",
+    "Truck": "[Vehicle]",
+    "Bus": "[Vehicle]",
+    "Engine": "[Vehicle]",
+    "Motorcycle": "[Vehicle]",
+    "Race car, auto racing": "[Vehicle]",
+    "Car alarm": "[Car Alarm]",
+    "Horn": "[Horn/Honking]",
+    "Car passing by": "[Vehicle]",
+    "Vehicle horn, car horn, honking": "[Horn/Honking]",
+    "Bicycle": "[Mechanical]",
+    "Skateboard": "[Mechanical]",
+    "Tools": "[Mechanical]",
+    "Drill": "[Mechanical]",
+    "Chainsaw": "[Mechanical]",
+    "Power tool": "[Mechanical]",
+
+    # --- Nature ---
+    "Rain": "[Rain]",
+    "Raindrop": "[Rain]",
+    "Heavy rain": "[Rain]",
+    "Wind": "[Wind]",
+    "Rustling leaves": "[Wind]",
+    "Thunderstorm": "[Thunder]",
+    "Thunder": "[Thunder]",
+    "Lightning": "[Thunder]",
+    "Ocean": "[Water/Ocean]",
+    "Water": "[Water/Ocean]",
+    "Stream": "[Water/Ocean]",
+    "Waterfall": "[Water/Ocean]",
+    "Fire": "[Fire]",
+    "Crackle": "[Fire]",
+
+    # --- Animals ---
+    "Dog": "[Animal Sound]",
+    "Bark": "[Animal Sound]",
+    "Howl": "[Animal Sound]",
+    "Growling": "[Animal Sound]",
+    "Cat": "[Animal Sound]",
+    "Meow": "[Animal Sound]",
+    "Purr": "[Animal Sound]",
+    "Caterwaul": "[Animal Sound]",
+    "Roar": "[Animal Sound]",
+    "Animal": "[Animal Sound]",
+    "Bird": "[Bird Sound]",
+    "Crow": "[Bird Sound]",
+    "Chirp, tweet": "[Bird Sound]",
+    "Birdsong": "[Bird Sound]",
+    "Squawk": "[Bird Sound]",
+    # Snake / reptile sounds (hissing, rattling)
+    "Hiss": "[Snake/Hiss]",
+    "Snake": "[Snake/Hiss]",
+    "Rattle": "[Snake/Hiss]",
+    "Rattlesnake": "[Snake/Hiss]",
+    "Insect": "[Insect Sound]",
+    "Cricket": "[Insect Sound]",
+    "Mosquito": "[Insect Sound]",
+
+    # --- Human Non-Speech ---
+    "Crowd": "[Crowd]",
+    "Cheering": "[Crowd]",
+    "Applause": "[Applause]",
+    "Clapping": "[Applause]",
+    "Laughter": "[Laughter]",
+    "Chuckle, chortle": "[Laughter]",
+    "Giggle": "[Laughter]",
+    "Crying, sobbing": "[Crying]",
+    "Whimper": "[Crying]",
+    "Screaming": "[Scream]",
+    "Shout": "[Shout]",
+    "Whistling": "[Whistle]",
+    "Walk, footsteps": "[Footsteps]",
+    "Run": "[Footsteps]",
+    "Gasp": "[Gasp]",
+    "Groan": "[Groan]",
+    "Snoring": "[Snoring]",
+    "Cough": "[Cough]",
+    "Sneeze": "[Sneeze]",
+
+    # --- Emergency ---
+    "Siren": "[Siren]",
+    "Emergency vehicle": "[Siren]",
+    "Police car (siren)": "[Siren]",
+    "Ambulance (siren)": "[Siren]",
+    "Fire engine, fire truck (siren)": "[Siren]",
+    "Alarm": "[Alarm]",
+    "Smoke detector, smoke alarm": "[Alarm]",
+    "Beeping": "[Alarm]",
+
+    # --- Music ---
+    "Music": "[Music]",
+    "Musical instrument": "[Music]",
+    "Singing": "[Music]",
+    "Drum": "[Music]",
+    "Guitar": "[Music]",
+    "Piano": "[Music]",
+}
+
+# Classes not in LABEL_GROUPS collapse to this — prevents raw YAMNet labels
+# (like 'Inside, small room') from inflating group scores and beating real events.
+DEFAULT_GROUP = "_ambient_"
+
+# Internal: canonical groups (not individual classes) that the detector drops instead of emitting as events
+_IGNORE_GROUPS = {"_ambient_"}
+
+def get_canonical_label(yamnet_label: str) -> str:
+    """Return the canonical CC label for a raw YAMNet ``display_name``.
+    Names that have no entry in LABEL_GROUPS resolve to DEFAULT_GROUP.
+    """
+    return LABEL_GROUPS[yamnet_label] if yamnet_label in LABEL_GROUPS else DEFAULT_GROUP
diff --git a/cc_tool/audio/models.py b/cc_tool/audio/models.py
@@ -0,0 +1,25 @@
+"""
+Intelligent CC Suggestion Tool — Audio Event Data Models
+"""
+from dataclasses import dataclass
+
+
+@dataclass
+class AudioEvent:
+    """A labelled sound event covering a time span of the audio track."""
+
+    label: str  # canonical caption label, e.g. "[Music]"
+    confidence: float  # detector score for this event
+    start_sec: float  # start of the span, in seconds
+    end_sec: float  # end of the span, in seconds
+
+    def to_dict(self) -> dict:
+        """Return the event as a plain dict with rounded numeric fields.
+
+        Confidence is rounded to 4 decimal places and both timestamps to 3.
+        """
+        serialised = {"label": self.label}
+        serialised["confidence"] = round(self.confidence, 4)
+        for field_name in ("start_sec", "end_sec"):
+            serialised[field_name] = round(getattr(self, field_name), 3)
+        return serialised