PlanetRead · bhuvan-somisetty · May 16, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,27 @@
+name: tests
+
+on:
+  push:
+    branches: ["main", "feat/**", "fix/**", "chore/**"]  # main plus prefixed work branches
+  pull_request:
+    branches: ["main"]  # only pull requests that target main
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]  # quoted so 3.10 is not read as the number 3.1
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: install test dependencies
+        run: pip install -r requirements-dev.txt  # this file currently lists only pytest
+
+      - name: run tests
+        run: python -m pytest tests/ -v --tb=short
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1 @@
+pytest>=7.4.0
diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/fusion/__init__.py b/src/fusion/__init__.py
@@ -0,0 +1,19 @@
+"""CC Decision Engine — Goal 3 of the intelligent CC suggestion pipeline."""
+
+from src.fusion.engine import (
+    AudioSignal,
+    CCDecision,
+    FusionConfig,
+    VisualSignal,
+    batch_decide,
+    decide,
+)
+
+__all__ = [
+    "AudioSignal",
+    "VisualSignal",
+    "CCDecision",
+    "FusionConfig",
+    "decide",
+    "batch_decide",
+]
diff --git a/src/fusion/engine.py b/src/fusion/engine.py
@@ -0,0 +1,299 @@
+"""CC Decision Engine — Goal 3.
+
+Combines audio event signals and visual reaction signals into a
+CC / no-CC decision.  The key design principle is *category-aware fusion*:
+the balance between audio and visual evidence depends on what kind of
+sound was detected.
+
+  HIGH_IMPACT events (gunshot, explosion, alarm, siren, glass breaking)
+      → audio confidence alone is usually sufficient; visual reaction is
+        a bonus that can rescue lower-confidence detections.
+
+  AMBIENT events (music, rain, wind, traffic)
+      → these sounds play for long stretches without warranting a caption.
+        Visual reaction must confirm that the sound is actually affecting
+        the scene before the engine fires.
+
+  GENERAL events (applause, crying, dog barking, crowd, etc.)
+      → weighted fusion; audio leads but a strong visual reaction can
+        push borderline detections over the threshold.
+
+No ML dependencies — all decisions are deterministic arithmetic on the
+confidence scores produced by upstream modules.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+
+# ---------------------------------------------------------------------------
+# Category sets (mirrors src/audio/labels.py — duplicated so this module
+# is standalone and does not depend on Goal 1 being merged first).
+# ---------------------------------------------------------------------------
+
+_HIGH_IMPACT: frozenset[str] = frozenset({  # labels routed to _decide_high_impact
+    "[gunshot]",
+    "[explosion]",
+    "[alarm]",
+    "[siren]",
+    "[glass breaking]",
+})
+
+_AMBIENT: frozenset[str] = frozenset({  # labels routed to _decide_ambient
+    "[music]",
+    "[rain]",
+    "[wind]",
+    "[traffic]",
+})  # a label found in neither set is treated as GENERAL
+
+
+# ---------------------------------------------------------------------------
+# Data types
+# ---------------------------------------------------------------------------
+
+@dataclasses.dataclass
+class AudioSignal:
+    """Audio event emitted by the sound event detection module."""
+
+    label: str  # category key such as "[gunshot]"; compared exactly, no normalisation
+    start_s: float  # copied unchanged into CCDecision.start_s
+    end_s: float  # copied unchanged into CCDecision.end_s
+    confidence: float  # presumably in [0, 1] — not validated here; TODO confirm
+
+
+@dataclasses.dataclass
+class VisualSignal:
+    """Visual reaction score emitted by the speaker reaction module.
+
+    *reaction_score* is expected in [0, 1]: 0 is no detectable reaction, 1 a
+    strong, unambiguous one.  Pass 0.0 when no visual analysis was performed;
+    ambient events then fail the ``ambient_min_reaction`` gate by default.
+    """
+
+    reaction_score: float  # not range-checked by this class
+
+
+@dataclasses.dataclass
+class CCDecision:
+    """Output of the fusion engine for a single audio event."""
+
+    accepted: bool  # True for CC, False for no-CC
+    label: str  # copied from AudioSignal.label
+    start_s: float  # copied from AudioSignal.start_s
+    end_s: float  # copied from AudioSignal.end_s
+    audio_confidence: float  # copied from AudioSignal.confidence
+    reaction_score: float  # copied from VisualSignal.reaction_score
+    combined_score: float  # weighted sum of both scores, rounded to 4 places
+    # Plain-English explanation of why the event was accepted or rejected.
+    reason: str
+
+
+@dataclasses.dataclass
+class FusionConfig:
+    """Weights and thresholds for the decision engine.
+
+    Values are expected in [0, 1] but are not validated here.  Defaults
+    favour precision over recall — better to miss a marginal caption than
+    to flood a video with ambient-noise captions.
+    """
+
+    # HIGH_IMPACT: audio weight, visual weight, minimum combined score.
+    high_impact_audio_w: float = 0.80
+    high_impact_visual_w: float = 0.20
+    high_impact_min_score: float = 0.40  # accept when combined score >= this
+
+    # AMBIENT: visual weight is dominant; also gate on minimum reaction.
+    ambient_audio_w: float = 0.35
+    ambient_visual_w: float = 0.65
+    ambient_min_reaction: float = 0.35  # gate: a lower reaction score rejects outright
+    ambient_min_score: float = 0.55  # applied only once the reaction gate has passed
+
+    # GENERAL: audio leads but visual reaction can tip borderline events.
+    general_audio_w: float = 0.60
+    general_visual_w: float = 0.40
+    general_min_score: float = 0.45  # accept when combined score >= this
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+_DEFAULT_CONFIG = FusionConfig()  # shared default for decide/batch_decide; this module only reads it
+
+
+def decide(
+    audio: AudioSignal,
+    visual: VisualSignal,
+    config: FusionConfig = _DEFAULT_CONFIG,
+) -> CCDecision:
+    """Fuse one audio event with its visual reaction into a CC decision.
+
+    Parameters
+    ----------
+    audio:
+        Detected sound event; its label selects the fusion rule.
+    visual:
+        Reaction score for the same event.  Supply
+        ``VisualSignal(reaction_score=0.0)`` if no visual analysis ran.
+    config:
+        Weights and thresholds; defaults favour precision over recall.
+
+    Returns
+    -------
+    CCDecision
+        Verdict, both input scores, the combined score and a reason.
+    """
+    rules = {"HIGH_IMPACT": _decide_high_impact, "AMBIENT": _decide_ambient}
+    rule = rules.get(_category(audio.label), _decide_general)
+    return rule(audio, visual, config)
+
+
+def batch_decide(
+    pairs: list[tuple[AudioSignal, VisualSignal]],
+    config: FusionConfig = _DEFAULT_CONFIG,
+) -> list[CCDecision]:
+    """Apply :func:`decide` to every (audio, visual) pair, keeping input order."""
+    return [decide(sound, reaction, config) for sound, reaction in pairs]
+
+
+# ---------------------------------------------------------------------------
+# Category routing
+# ---------------------------------------------------------------------------
+
+def _category(label: str) -> str:
+    """Return "HIGH_IMPACT" or "AMBIENT" on an exact label match, else "GENERAL"."""
+    for name, members in (("HIGH_IMPACT", _HIGH_IMPACT), ("AMBIENT", _AMBIENT)):
+        if label in members:
+            return name
+    return "GENERAL"
+
+
+# ---------------------------------------------------------------------------
+# Per-category decision logic
+# ---------------------------------------------------------------------------
+
+def _decide_high_impact(
+    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
+) -> CCDecision:
+    """Decide a high-impact event from the weighted sum of both scores."""
+    conf, react = audio.confidence, visual.reaction_score
+    score = round(
+        conf * cfg.high_impact_audio_w + react * cfg.high_impact_visual_w, 4
+    )
+    passed = score >= cfg.high_impact_min_score
+
+    if not passed:
+        why = (
+            f"high-impact event below threshold: combined {score:.2f} "
+            f"< {cfg.high_impact_min_score} "
+            f"(audio confidence {conf:.2f} too low)"
+        )
+    else:
+        why = (
+            f"high-impact event: combined score {score:.2f} "
+            f"(audio {conf:.2f} × {cfg.high_impact_audio_w} "
+            f"+ reaction {react:.2f} × {cfg.high_impact_visual_w})"
+        )
+
+    return CCDecision(
+        accepted=passed,
+        label=audio.label,
+        start_s=audio.start_s,
+        end_s=audio.end_s,
+        audio_confidence=conf,
+        reaction_score=react,
+        combined_score=score,
+        reason=why,
+    )
+
+
+def _decide_ambient(
+    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
+) -> CCDecision:
+    """Decide an ambient event: reaction gate first, then weighted score.
+
+    A reaction score below ``cfg.ambient_min_reaction`` rejects the event
+    outright.  Otherwise the rounded weighted sum of audio confidence and
+    reaction score must reach ``cfg.ambient_min_score``.
+
+    The combined score is computed and reported in the returned decision
+    on every path, including gate rejections.
+    """
+    conf, react = audio.confidence, visual.reaction_score
+    # Gate: ambient sounds require a minimum visible reaction.
+    gated = react < cfg.ambient_min_reaction
+    # The score does not depend on the gate, so one computation serves
+    # the gated and the scored outcomes alike.
+    score = round(
+        conf * cfg.ambient_audio_w + react * cfg.ambient_visual_w,
+        4,
+    )
+
+    if gated:
+        # Below the gate: reject regardless of the combined score.
+        passed = False
+        why = (
+            f"ambient sound rejected: reaction score {react:.2f} "
+            f"below minimum {cfg.ambient_min_reaction} "
+            "(no visible scene response — likely background noise)"
+        )
+    else:
+        passed = score >= cfg.ambient_min_score
+        if passed:
+            why = (
+                f"ambient event confirmed by visual reaction: combined {score:.2f} "
+                f"(audio {conf:.2f} × {cfg.ambient_audio_w} "
+                f"+ reaction {react:.2f} × {cfg.ambient_visual_w})"
+            )
+        else:
+            why = (
+                f"ambient event: combined score {score:.2f} "
+                f"below threshold {cfg.ambient_min_score} "
+                f"despite reaction {react:.2f}"
+            )
+
+    return CCDecision(
+        accepted=passed,
+        label=audio.label,
+        start_s=audio.start_s,
+        end_s=audio.end_s,
+        audio_confidence=conf,
+        reaction_score=react,
+        combined_score=score,
+        reason=why,
+    )
+
+
+def _decide_general(
+    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
+) -> CCDecision:
+    """Decide a general event from the weighted sum of both scores."""
+    conf, react = audio.confidence, visual.reaction_score
+    weighted = conf * cfg.general_audio_w + react * cfg.general_visual_w
+    score = round(weighted, 4)
+    passed = score >= cfg.general_min_score
+
+    why = (
+        (
+            f"accepted: combined score {score:.2f} "
+            f"(audio {conf:.2f} × {cfg.general_audio_w} "
+            f"+ reaction {react:.2f} × {cfg.general_visual_w})"
+        )
+        if passed
+        else (
+            f"rejected: combined score {score:.2f} "
+            f"below threshold {cfg.general_min_score} "
+            f"(audio {conf:.2f}, reaction {react:.2f})"
+        )
+    )
+
+    return CCDecision(
+        accepted=passed,
+        label=audio.label,
+        start_s=audio.start_s,
+        end_s=audio.end_s,
+        audio_confidence=conf,
+        reaction_score=react,
+        combined_score=score,
+        reason=why,
+    )
diff --git a/tests/__init__.py b/tests/__init__.py