Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# CI workflow: installs the dev requirements and runs pytest once per Python
# version listed in the matrix below.
name: tests

on:
  # Pushes to main and to feat/, fix/ and chore/ branches.
  push:
    branches: ["main", "feat/**", "fix/**", "chore/**"]
  # Pull requests that target main.
  pull_request:
    branches: ["main"]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # One job is started for each interpreter version in this list.
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      # Select the interpreter for this matrix entry.
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: install test dependencies
        run: pip install -r requirements-dev.txt

      # -v lists every test; --tb=short keeps failure tracebacks compact.
      - name: run tests
        run: python -m pytest tests/ -v --tb=short
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pytest>=7.4.0
Empty file added src/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions src/fusion/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""CC Decision Engine — Goal 3 of the intelligent CC suggestion pipeline."""

from src.fusion.engine import (
AudioSignal,
CCDecision,
FusionConfig,
VisualSignal,
batch_decide,
decide,
)

# Names exported by ``from src.fusion import *``; every entry is re-exported
# from src.fusion.engine by the import above.
__all__ = [
    # Data types
    "AudioSignal",
    "VisualSignal",
    "CCDecision",
    "FusionConfig",
    # Entry points
    "decide",
    "batch_decide",
]
299 changes: 299 additions & 0 deletions src/fusion/engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
"""CC Decision Engine — Goal 3.

Combines audio event signals and visual reaction signals into a
CC / no-CC decision. The key design principle is *category-aware fusion*:
the balance between audio and visual evidence depends on what kind of
sound was detected.

HIGH_IMPACT events (gunshot, explosion, alarm, siren, glass breaking)
→ audio confidence alone is usually sufficient; visual reaction is
a bonus that can rescue lower-confidence detections.

AMBIENT events (music, rain, wind, traffic)
→ these sounds play for long stretches without warranting a caption.
Visual reaction must confirm that the sound is actually affecting
the scene before the engine fires.

GENERAL events (applause, crying, dog barking, crowd, etc.)
→ weighted fusion; audio leads but a strong visual reaction can
push borderline detections over the threshold.

No ML dependencies — all decisions are deterministic arithmetic on the
confidence scores produced by upstream modules.
"""

from __future__ import annotations

import dataclasses

# ---------------------------------------------------------------------------
# Category sets (mirrors src/audio/labels.py — duplicated so this module
# is standalone and does not depend on Goal 1 being merged first).
# ---------------------------------------------------------------------------

# Labels routed to the HIGH_IMPACT rule, where the audio confidence carries
# most of the weight and the visual reaction only adds to it.
_HIGH_IMPACT: frozenset[str] = frozenset(
    (
        "[gunshot]",
        "[explosion]",
        "[alarm]",
        "[siren]",
        "[glass breaking]",
    )
)

# Labels routed to the AMBIENT rule, which additionally requires a minimum
# visual reaction before a caption can be accepted.
_AMBIENT: frozenset[str] = frozenset(
    (
        "[music]",
        "[rain]",
        "[wind]",
        "[traffic]",
    )
)


# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclasses.dataclass()
class AudioSignal:
    """One detected sound event, as produced by the sound event detection module.

    The fusion engine reads ``label`` to pick the decision rule and
    ``confidence`` as the audio term of the weighted score; the time span
    is copied unchanged into the resulting decision.
    """

    # Event label, e.g. "[gunshot]" or "[rain]"; compared verbatim against
    # the category sets.
    label: str
    # Start and end of the event (presumably seconds, per the ``_s`` suffix).
    start_s: float
    end_s: float
    # Detector confidence for this event.
    confidence: float


@dataclasses.dataclass()
class VisualSignal:
    """Visual reaction evidence supplied by the speaker reaction module.

    ``reaction_score`` lies in [0, 1]: 0 means no reaction could be
    detected, 1 means a strong and unambiguous reaction to the audio
    event.  When no visual analysis was run, pass ``reaction_score=0.0``.
    """

    # Strength of the on-screen reaction, in [0, 1].
    reaction_score: float


@dataclasses.dataclass()
class CCDecision:
    """Result of the fusion engine for one audio event.

    Carries the verdict, the inputs it was derived from, the fused score,
    and a human-readable justification.
    """

    # True when the event should receive a caption.
    accepted: bool
    # Label and time span copied from the originating audio event.
    label: str
    start_s: float
    end_s: float
    # The two input scores the decision was based on.
    audio_confidence: float
    reaction_score: float
    # Weighted combination of the two input scores.
    combined_score: float
    # Plain-English explanation of why the event was accepted or rejected.
    reason: str


@dataclasses.dataclass()
class FusionConfig:
    """Weights and thresholds used by the decision engine.

    Every value is expected to lie in [0, 1].  The defaults favour
    precision over recall: missing a marginal caption is preferred to
    flooding a video with ambient-noise captions.
    """

    # --- HIGH_IMPACT ------------------------------------------------------
    # Audio carries most of the weight; the visual reaction is a bonus.
    high_impact_audio_w: float = 0.80
    high_impact_visual_w: float = 0.20
    # Minimum combined score required to accept.
    high_impact_min_score: float = 0.40

    # --- AMBIENT ----------------------------------------------------------
    # Visual evidence dominates the weighted score.
    ambient_audio_w: float = 0.35
    ambient_visual_w: float = 0.65
    # Hard gate: reactions below this value are rejected outright.
    ambient_min_reaction: float = 0.35
    # Minimum combined score required once the gate is passed.
    ambient_min_score: float = 0.55

    # --- GENERAL ----------------------------------------------------------
    # Audio leads, but a visual reaction can tip borderline events.
    general_audio_w: float = 0.60
    general_visual_w: float = 0.40
    general_min_score: float = 0.45


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

# Default configuration used by decide() and batch_decide() when the caller
# does not supply one.  This single FusionConfig instance is shared by every
# such call and the dataclass is mutable, so treat it as read-only.
_DEFAULT_CONFIG = FusionConfig()


def decide(
    audio: AudioSignal,
    visual: VisualSignal,
    config: FusionConfig = _DEFAULT_CONFIG,
) -> CCDecision:
    """Decide whether a single audio event should be captioned.

    The event's label selects one of three category-specific rules
    (high-impact, ambient, or general), which then fuses the audio
    confidence with the visual reaction score.

    Parameters
    ----------
    audio:
        Event from the sound event detection module.
    visual:
        Reaction score from the speaker reaction module.  Pass
        ``VisualSignal(reaction_score=0.0)`` when no visual analysis
        was performed.
    config:
        Weights and thresholds to apply.  The defaults are tuned for
        high precision.

    Returns
    -------
    CCDecision
        The verdict together with the scores and an explanation.
    """
    # Category name -> rule; any category not listed uses the general rule.
    rules = {
        "HIGH_IMPACT": _decide_high_impact,
        "AMBIENT": _decide_ambient,
    }
    rule = rules.get(_category(audio.label), _decide_general)
    return rule(audio, visual, config)


def batch_decide(
    pairs: list[tuple[AudioSignal, VisualSignal]],
    config: FusionConfig = _DEFAULT_CONFIG,
) -> list[CCDecision]:
    """Apply :func:`decide` to each (audio, visual) pair.

    Parameters
    ----------
    pairs:
        Audio events, each paired with its visual reaction signal.
    config:
        Weights and thresholds shared by every decision in the batch.

    Returns
    -------
    list[CCDecision]
        One decision per input pair, in input order.
    """
    return [
        decide(audio_signal, visual_signal, config)
        for audio_signal, visual_signal in pairs
    ]


# ---------------------------------------------------------------------------
# Category routing
# ---------------------------------------------------------------------------

def _category(label: str) -> str:
    """Map an event label to "HIGH_IMPACT", "AMBIENT" or "GENERAL".

    Labels are matched verbatim against the category sets; anything that
    is in neither set is treated as GENERAL.
    """
    # Checked in order, so HIGH_IMPACT is tested before AMBIENT.
    lookup = (
        ("HIGH_IMPACT", _HIGH_IMPACT),
        ("AMBIENT", _AMBIENT),
    )
    for name, members in lookup:
        if label in members:
            return name
    return "GENERAL"


# ---------------------------------------------------------------------------
# Per-category decision logic
# ---------------------------------------------------------------------------

def _decide_high_impact(
    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
) -> CCDecision:
    """Apply the high-impact rule: a weighted sum compared to one threshold.

    The score is ``confidence * high_impact_audio_w + reaction_score *
    high_impact_visual_w`` rounded to four decimals; the event is accepted
    when that score is at least ``high_impact_min_score``.
    """
    conf = audio.confidence
    reaction = visual.reaction_score
    audio_w = cfg.high_impact_audio_w
    visual_w = cfg.high_impact_visual_w
    threshold = cfg.high_impact_min_score

    # The rounded value is both what is compared and what is reported.
    score = round(conf * audio_w + reaction * visual_w, 4)
    passed = score >= threshold

    if not passed:
        explanation = (
            f"high-impact event below threshold: combined {score:.2f} "
            f"< {threshold} "
            f"(audio confidence {conf:.2f} too low)"
        )
    else:
        explanation = (
            f"high-impact event: combined score {score:.2f} "
            f"(audio {conf:.2f} × {audio_w} "
            f"+ reaction {reaction:.2f} × {visual_w})"
        )

    return CCDecision(
        accepted=passed,
        label=audio.label,
        start_s=audio.start_s,
        end_s=audio.end_s,
        audio_confidence=conf,
        reaction_score=reaction,
        combined_score=score,
        reason=explanation,
    )


def _decide_ambient(
    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
) -> CCDecision:
    """Apply the ambient rule: a reaction gate followed by a score threshold.

    An event whose reaction score is below ``ambient_min_reaction`` is
    rejected regardless of its combined score.  Otherwise it is accepted
    when ``confidence * ambient_audio_w + reaction_score *
    ambient_visual_w`` (rounded to four decimals) is at least
    ``ambient_min_score``.  The combined score is reported in both cases.
    """
    conf = audio.confidence
    reaction = visual.reaction_score

    # Gate first: ambient sounds require a minimum visible reaction.
    gated_out = reaction < cfg.ambient_min_reaction

    # The score formula is the same whether or not the gate rejects.
    score = round(
        conf * cfg.ambient_audio_w + reaction * cfg.ambient_visual_w,
        4,
    )

    if gated_out:
        passed = False
        explanation = (
            f"ambient sound rejected: reaction score {reaction:.2f} "
            f"below minimum {cfg.ambient_min_reaction} "
            "(no visible scene response — likely background noise)"
        )
    else:
        passed = score >= cfg.ambient_min_score
        if passed:
            explanation = (
                f"ambient event confirmed by visual reaction: combined {score:.2f} "
                f"(audio {conf:.2f} × {cfg.ambient_audio_w} "
                f"+ reaction {reaction:.2f} × {cfg.ambient_visual_w})"
            )
        else:
            explanation = (
                f"ambient event: combined score {score:.2f} "
                f"below threshold {cfg.ambient_min_score} "
                f"despite reaction {reaction:.2f}"
            )

    return CCDecision(
        accepted=passed,
        label=audio.label,
        start_s=audio.start_s,
        end_s=audio.end_s,
        audio_confidence=conf,
        reaction_score=reaction,
        combined_score=score,
        reason=explanation,
    )


def _decide_general(
    audio: AudioSignal, visual: VisualSignal, cfg: FusionConfig
) -> CCDecision:
    """Apply the general rule: a weighted sum compared to one threshold.

    The score is ``confidence * general_audio_w + reaction_score *
    general_visual_w`` rounded to four decimals; the event is accepted
    when that score is at least ``general_min_score``.
    """
    conf = audio.confidence
    reaction = visual.reaction_score
    audio_w = cfg.general_audio_w
    visual_w = cfg.general_visual_w
    threshold = cfg.general_min_score

    # The rounded value is both what is compared and what is reported.
    score = round(conf * audio_w + reaction * visual_w, 4)
    passed = score >= threshold

    if not passed:
        explanation = (
            f"rejected: combined score {score:.2f} "
            f"below threshold {threshold} "
            f"(audio {conf:.2f}, reaction {reaction:.2f})"
        )
    else:
        explanation = (
            f"accepted: combined score {score:.2f} "
            f"(audio {conf:.2f} × {audio_w} "
            f"+ reaction {reaction:.2f} × {visual_w})"
        )

    return CCDecision(
        accepted=passed,
        label=audio.label,
        start_s=audio.start_s,
        end_s=audio.end_s,
        audio_confidence=conf,
        reaction_score=reaction,
        combined_score=score,
        reason=explanation,
    )
Empty file added tests/__init__.py
Empty file.
Loading