diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 0af074f..74f45cb 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -32,6 +32,7 @@ from synthbanshee.config.scene_config import SceneConfig console = Console() +logger = logging.getLogger(__name__) class DiscoveredScene(NamedTuple): @@ -212,6 +213,23 @@ def _build_preprocessing_metadata(result: PreprocessingResult) -> PreprocessingA ) +def _infer_data_root(output_dir: Path, override: Path | None) -> Path | None: + """Pick a data root for path-rewriting in clip metadata. + + Returns *override* when supplied. Otherwise infers from *output_dir*: + the corpus convention is ``/data/he/``, so two parents up is + the right anchor. Returns ``None`` only when no sensible inference is + possible (e.g. *output_dir* has fewer than two parents), in which case + callers leave paths absolute. + """ + if override is not None: + return Path(override).resolve() + resolved = Path(output_dir).resolve() + if len(resolved.parents) < 2: + return None + return resolved.parent.parent + + def _run_generate_pipeline( config: Path, output_dir: Path, @@ -222,6 +240,7 @@ def _run_generate_pipeline( speaker_overrides: dict[str, str] | None = None, project_profile: ProjectProfile | None = None, enable_breathiness: bool = False, + data_root: Path | None = None, ) -> tuple[Path | None, list[str]]: """Run the full single-clip generate pipeline. @@ -698,6 +717,12 @@ def vlog(msg: str) -> None: effective_prosody_caps=_cap_events, ) + # #108: clip metadata records repo-relative paths so the corpus is + # portable. Anchor at *data_root*; fall back to the legacy absolute form + # if a path is genuinely outside the root (with a warning logged). + from synthbanshee.package._paths import relative_to_data_root + + _resolved_data_root = _infer_data_root(output_dir, data_root) metadata = label_gen.generate_clip_metadata( clip_id=f"{clip_id}_00", project=scene.project, @@ -709,8 +734,12 @@ def vlog(msg: str) -> None: scene_config_path=str(config), random_seed=scene.random_seed, preprocessing=preprocessing_meta, - dirty_file_path=str(result.dirty_path) if result.dirty_path else None, - transcript_path=str(clip_txt), + dirty_file_path=( + relative_to_data_root(result.dirty_path, _resolved_data_root) + if result.dirty_path + else None + ), + transcript_path=relative_to_data_root(clip_txt, _resolved_data_root), acoustic_scene=acoustic_scene_meta, quality_flags=quality_flags, generation_metadata=gen_meta, @@ -788,6 +817,17 @@ def cli() -> None: envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR", help="LLM script generation cache directory.", ) +@click.option( + "--data-root", + type=click.Path(path_type=Path), + default=None, + envvar="SYNTHBANSHEE_DATA_ROOT", + help=( + "Root directory that paths in clip JSON metadata are written " + "relative to (corpus repo root by convention). Defaults to " + "two parents above --output-dir." + ), +) @click.option( "--dry-run", is_flag=True, @@ -817,6 +857,7 @@ def generate( cache_dir: Path, dirty_dir: Path, script_cache_dir: Path, + data_root: Path | None, dry_run: bool, project_profile: str | None, verbose: bool, @@ -858,6 +899,7 @@ def generate( script_cache_dir, verbose=verbose, project_profile=profile, + data_root=data_root, ) if wav_path is None: @@ -948,13 +990,10 @@ def _distribute_speakers( Scenes whose speakers already match the assigned variant get an empty override dict (no-op). """ - import logging import random from synthbanshee.config.speaker_config import SpeakerConfig - logger = logging.getLogger(__name__) - # 1. Discover all available speakers. all_speakers: dict[str, SpeakerConfig] = {} for search_dir in [Path("configs/speakers"), Path("configs/examples")]: @@ -1040,6 +1079,7 @@ def _render_one( speaker_overrides: dict[str, str] | None = None, project_profile: ProjectProfile | None = None, enable_breathiness: bool = False, + data_root: Path | None = None, ) -> tuple[Path | None, list[str]]: """Render a single clip with retries. @@ -1067,6 +1107,7 @@ def _render_one( speaker_overrides=speaker_overrides, project_profile=project_profile, enable_breathiness=enable_breathiness, + data_root=data_root, ) if wav_path is not None: return wav_path, messages @@ -1112,6 +1153,16 @@ def _render_one( envvar="SYNTHBANSHEE_SCRIPT_CACHE_DIR", help="LLM script generation cache directory.", ) +@click.option( + "--data-root", + type=click.Path(path_type=Path), + default=None, + envvar="SYNTHBANSHEE_DATA_ROOT", + help=( + "Root directory that paths in clip JSON metadata and manifest CSV " + "are written relative to. Defaults to two parents above --output-dir." + ), +) @click.option( "--manifest-out", "-m", @@ -1159,6 +1210,7 @@ def generate_batch( cache_dir: Path, dirty_dir: Path, script_cache_dir: Path, + data_root: Path | None, manifest_out: Path | None, dry_run: bool, workers: int, @@ -1283,6 +1335,7 @@ def generate_batch( speaker_overrides=speaker_override_map.get(scene_yaml), project_profile=profile, enable_breathiness=run_cfg.enable_breathiness, + data_root=data_root, ) progress.advance(task_id) if wav_path is None: @@ -1313,10 +1366,11 @@ def generate_batch( script_cache_dir, run_cfg.max_retries, stop_event, - verbose, - speaker_override_map.get(scene_yaml), - profile, - run_cfg.enable_breathiness, + verbose=verbose, + speaker_overrides=speaker_override_map.get(scene_yaml), + project_profile=profile, + enable_breathiness=run_cfg.enable_breathiness, + data_root=data_root, ): scene_yaml for scene_yaml in selected_paths } @@ -1366,7 +1420,15 @@ def generate_batch( ) # --- Manifest --- - rows = generate_manifest(out_dir, manifest_path, splits=splits, clip_ids=set(splits.keys())) + # #108: anchor manifest paths at the same data root used for clip JSON. + _manifest_data_root = _infer_data_root(out_dir, data_root) + rows = generate_manifest( + out_dir, + manifest_path, + splits=splits, + clip_ids=set(splits.keys()), + relative_to=_manifest_data_root, + ) console.print(f"[bold green]Manifest written:[/bold green] {manifest_path} ({len(rows)} rows)") # --- Summary --- diff --git a/synthbanshee/labels/schema.py b/synthbanshee/labels/schema.py index cd0aca4..934b7ea 100644 --- a/synthbanshee/labels/schema.py +++ b/synthbanshee/labels/schema.py @@ -216,7 +216,19 @@ class ClipMetadata(BaseModel): weak_label: WeakLabel preprocessing_applied: PreprocessingApplied = Field(default_factory=PreprocessingApplied) dirty_file_path: str | None = None + """Path to the retained pre-preprocessing WAV (#108). + + Written relative to the data root passed to ``synthbanshee generate`` + via ``--data-root`` / ``SYNTHBANSHEE_DATA_ROOT`` (default: two parents + above ``--output-dir``). Older corpus snapshots may still carry + absolute paths; consumers should treat both forms as accepted but + only emit relative paths going forward.""" + transcript_path: str | None = None + """Path to the ``.txt`` transcript file (#108). + + Written relative to the data root — see ``dirty_file_path`` for the + anchor convention.""" quality_flags: list[str] = Field(default_factory=list) annotator_confidence: float = Field(ge=0.0, le=1.0, default=1.0) iaa_reviewed: bool = False diff --git a/synthbanshee/package/_paths.py b/synthbanshee/package/_paths.py new file mode 100644 index 0000000..3ed80b3 --- /dev/null +++ b/synthbanshee/package/_paths.py @@ -0,0 +1,48 @@ +"""Shared helper for repo-relative path rendering in clip metadata +and manifest CSV. #108 — keep cli.py and manifest.py in sync. +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def relative_to_data_root(path: Path, data_root: Path | None) -> str: + """Render *path* as a string anchored at *data_root* when possible. + + Returns a POSIX-style relative string (``"data/he/clip.wav"``) when + *path* resolves to a location under *data_root* (symlinks resolved + on both sides). Falls back to a POSIX-style **absolute** string in + every other case: + + - *data_root* is ``None``: nothing to anchor against. + - *path* resolves outside *data_root*: also emits a + ``logger.warning`` so a misconfigured ``--data-root`` is loud + rather than silent. + + Both branches always call ``Path.resolve()`` and ``Path.as_posix()`` + so the JSON / CSV path shape is **stable** regardless of: + + - whether the caller passed a relative or absolute ``path`` + (#108 review: a relative input used to produce a relative-string + fallback, contradicting the docstring and making metadata depend + on the working directory), + - the host OS (#108 review: Windows would otherwise emit + backslashes; POSIX separators keep the corpus portable). + """ + resolved = Path(path).resolve() + if data_root is None: + return resolved.as_posix() + try: + return resolved.relative_to(Path(data_root).resolve()).as_posix() + except ValueError: + logger.warning( + "Path %s is outside data_root %s; recording absolute path. " + "Configure --data-root / SYNTHBANSHEE_DATA_ROOT to fix.", + path, + data_root, + ) + return resolved.as_posix() diff --git a/synthbanshee/package/manifest.py b/synthbanshee/package/manifest.py index 9cf37ac..96bf1cf 100644 --- a/synthbanshee/package/manifest.py +++ b/synthbanshee/package/manifest.py @@ -14,6 +14,7 @@ import pydantic from synthbanshee.labels.schema import ClipMetadata +from synthbanshee.package._paths import relative_to_data_root as _maybe_relative _MANIFEST_COLUMNS = [ "clip_id", @@ -57,6 +58,7 @@ def generate_manifest( *, splits: dict[str, str] | None = None, clip_ids: set[str] | None = None, + relative_to: Path | None = None, ) -> list[ManifestRow]: """Scan data_dir recursively for clip JSON files and write a manifest CSV. @@ -73,6 +75,10 @@ def generate_manifest( whose clip_id is in this set are included in the manifest. Use this to restrict the manifest to a specific generation run when ``data_dir`` may contain clips from previous runs. + relative_to: Optional data root for path columns (#108). When + provided, ``wav_path`` and ``strong_labels_path`` are written + relative to this directory; paths outside the root fall back to + absolute form. Returns: List of ManifestRow objects that were written to output_path. @@ -112,8 +118,10 @@ def generate_manifest( max_intensity=metadata.weak_label.max_intensity, quality_flags=",".join(metadata.quality_flags), split=(splits or {}).get(metadata.clip_id, ""), - wav_path=str(json_path.with_suffix(".wav")), - strong_labels_path=str(jsonl_path) if jsonl_path.exists() else "", + wav_path=_maybe_relative(json_path.with_suffix(".wav"), relative_to), + strong_labels_path=( + _maybe_relative(jsonl_path, relative_to) if jsonl_path.exists() else "" + ), ) ) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 9af7acd..d268afb 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -21,12 +21,14 @@ _derive_event_type, _discover_scene_configs, _distribute_speakers, + _infer_data_root, _print_batch_summary, _print_selection_summary, _run_generate_pipeline, _select_configs_by_typology, cli, ) +from synthbanshee.package._paths import relative_to_data_root from synthbanshee.package.validator import ValidationResult SCENES_DIR = Path(__file__).parent.parent.parent / "configs" / "scenes" @@ -405,6 +407,113 @@ def test_full_batch_with_mocked_tts(self, tmp_path): # Manifest CSV must exist with at least a header row assert manifest_path.exists() + def test_full_batch_writes_relative_paths_to_manifest_and_clip_json( + self, tmp_path, monkeypatch + ): + """End-to-end #108 contract: with --data-root explicit, the resulting + manifest CSV's ``wav_path`` column and the per-clip JSON's + ``transcript_path`` are repo-relative — not absolute paths. + + Catches the failure mode where the data_root threading was correct + in `_run_generate_pipeline` but forgotten in the `generate_manifest` + call (or vice versa). Both must be relative-anchored. + """ + import csv as _csv + + import yaml as _yaml + + # #107: ensure none of the dir env vars leak into this CliRunner + # invocation. The strip fixture for that lives on the #107 PR + # branch (#110); this test keeps the protection local until that + # PR merges so the two fixes can land independently. + for _env in ( + "SYNTHBANSHEE_DATA_DIR", + "SYNTHBANSHEE_CACHE_DIR", + "SYNTHBANSHEE_SCRIPT_CACHE_DIR", + ): + monkeypatch.delenv(_env, raising=False) + + # Lay out the corpus layout `/data/he/`. + data_root = tmp_path / "corpus" + out_dir = data_root / "data" / "he" + out_dir.mkdir(parents=True) + + scenes_dir = tmp_path / "scenes" + scenes_dir.mkdir() + scene = { + "scene_id": "SP_NEU_A_0000", + "project": "she_proves", + "language": "he", + "violence_typology": "NEU", + "tier": "A", + "random_seed": 0, + "speakers": [{"speaker_id": "AGG_M_30-45_001", "role": "AGG"}], + "script_template": "synthbanshee/script/templates/she_proves/neutral_domestic_routine.j2", + "script_slots": {}, + "intensity_arc": [1, 1, 1], + "target_duration_minutes": 3.0, + "output_dir": str(out_dir), + } + (scenes_dir / "scene_000.yaml").write_text(_yaml.dump(scene), encoding="utf-8") + + run_cfg_path = tmp_path / "run.yaml" + run_cfg_path.write_text( + _BATCH_RUN_CONFIG_TEMPLATE.format( + output_dir=str(out_dir), + scene_configs_dir=str(scenes_dir), + ), + encoding="utf-8", + ) + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + manifest_path = data_root / "manifest.csv" + runner = CliRunner() + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + result = runner.invoke( + cli, + [ + "generate-batch", + "--run-config", + str(run_cfg_path), + "--manifest-out", + str(manifest_path), + "--cache-dir", + str(data_root / "assets" / "speech"), + "--dirty-dir", + str(data_root / "assets" / "speech" / "dirty"), + "--script-cache-dir", + str(data_root / "assets" / "scripts"), + "--data-root", + str(data_root), + ], + ) + + assert result.exit_code == 0, result.output + + # Manifest CSV: wav_path column must be repo-relative. + with manifest_path.open(encoding="utf-8") as fh: + rows = list(_csv.DictReader(fh)) + assert len(rows) == 1, f"expected one row, got: {rows}" + wav_col = rows[0]["wav_path"] + assert not Path(wav_col).is_absolute(), ( + f"manifest wav_path should be relative, got: {wav_col!r}" + ) + assert wav_col.startswith("data/he/"), wav_col + + # Per-clip JSON: transcript_path must also be repo-relative. + clip_jsons = list(out_dir.rglob("*.json")) + assert clip_jsons, "no clip JSON written" + meta = json.loads(clip_jsons[0].read_text(encoding="utf-8")) + assert not Path(meta["transcript_path"]).is_absolute() + assert meta["transcript_path"].startswith("data/he/"), meta["transcript_path"] + def test_no_distribute_speakers_skips_distribution(self, tmp_path): """--no-distribute-speakers produces empty override maps for every scene.""" import yaml as _yaml @@ -838,6 +947,96 @@ def test_completely_unknown_emotional_state_fails_cleanly(self, tmp_path): "clip_json should not exist on pre-validation failure" ) + def test_metadata_paths_are_repo_relative(self, tmp_path): + """`dirty_file_path` and `transcript_path` in clip JSON are repo-relative (#108). + + Models the corpus layout: ``/data/he//`` + and ``/assets/speech/dirty/_dirty.wav``. With + ``data_root`` resolved to ````, both metadata path + fields must come out relative — not absolute, machine-specific + paths as in pre-#108 corpus snapshots. + """ + data_root = tmp_path / "corpus" + out_dir = data_root / "data" / "he" + cache_dir = data_root / "assets" / "speech" + dirty_dir = cache_dir / "dirty" + script_cache_dir = data_root / "assets" / "scripts" + for d in (out_dir, cache_dir, dirty_dir, script_cache_dir): + d.mkdir(parents=True, exist_ok=True) + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + wav, _messages = _run_generate_pipeline( + SCENES_DIR / "test_scene_001.yaml", + out_dir, + cache_dir, + dirty_dir, + script_cache_dir, + data_root=data_root, + ) + + assert wav is not None + meta_json = json.loads(wav.with_suffix(".json").read_text(encoding="utf-8")) + assert meta_json["transcript_path"].startswith("data/he/"), ( + f"transcript_path should be repo-relative, got: {meta_json['transcript_path']}" + ) + assert not Path(meta_json["transcript_path"]).is_absolute() + if meta_json.get("dirty_file_path"): + assert meta_json["dirty_file_path"].startswith("assets/speech/dirty/"), ( + f"dirty_file_path should be repo-relative, got: {meta_json['dirty_file_path']}" + ) + assert not Path(meta_json["dirty_file_path"]).is_absolute() + + def test_metadata_paths_fall_back_to_absolute_when_outside_root(self, tmp_path): + """Paths outside *data_root* keep their absolute form (graceful fallback). + + Documents the contract: ``_relative_to_data_root`` does not raise + when a path is outside the root — it returns ``str(path)`` so the + clip is still written successfully. Future runs with a correct + ``data_root`` produce relative paths; legacy data is not retro- + actively rewritten. + """ + out_dir = tmp_path / "out" + cache_dir = tmp_path / "cache" + dirty_dir = tmp_path / "dirty" + script_cache_dir = tmp_path / "scripts" + for d in (out_dir, cache_dir, dirty_dir, script_cache_dir): + d.mkdir(parents=True, exist_ok=True) + # data_root deliberately points elsewhere — neither out_dir nor + # dirty_dir are under it. + bogus_root = tmp_path / "elsewhere" + bogus_root.mkdir() + + turns = _make_dialogue_turns(n=1) + mixed = _make_mixed_scene(n_turns=1) + + with ( + patch("synthbanshee.script.generator.ScriptGenerator") as MockGen, + patch("synthbanshee.tts.renderer.TTSRenderer") as MockRenderer, + ): + MockGen.return_value.generate.return_value = turns + MockRenderer.return_value.render_scene.return_value = mixed + wav, _ = _run_generate_pipeline( + SCENES_DIR / "test_scene_001.yaml", + out_dir, + cache_dir, + dirty_dir, + script_cache_dir, + data_root=bogus_root, + ) + + assert wav is not None + meta_json = json.loads(wav.with_suffix(".json").read_text(encoding="utf-8")) + # Out-of-root → absolute fallback. Not great, but not a crash. + assert Path(meta_json["transcript_path"]).is_absolute() + # --------------------------------------------------------------------------- # generate command — additional failure and warning branches @@ -1185,6 +1384,7 @@ def _fail_then_succeed( speaker_overrides=None, project_profile=None, enable_breathiness=False, + data_root=None, ): call_count[0] += 1 if call_count[0] == 1: @@ -1730,6 +1930,116 @@ def _fake_preprocessing_result(peak_dbfs: float): ) +class TestInferDataRoot: + """Direct unit tests for `_infer_data_root` — the helper that owns the + "two parents above output_dir" magic used by #108. Covers every branch + of the function so a future refactor can't silently regress the + inference contract. + """ + + def test_explicit_override_returned_resolved(self, tmp_path): + """An explicit override is returned (resolved, unchanged otherwise).""" + override = tmp_path / "explicit_root" + override.mkdir() + # output_dir is irrelevant when override is set + result = _infer_data_root(Path("/whatever/data/he"), override) + assert result == override.resolve() + + def test_corpus_shape_inferred_to_two_parents_up(self, tmp_path): + """`/data/he/` → infers `` (the corpus convention).""" + data_he = tmp_path / "data" / "he" + data_he.mkdir(parents=True) + result = _infer_data_root(data_he, override=None) + assert result == tmp_path.resolve() + + def test_shallow_root_returns_none(self): + """A path with fewer than two parents has no sensible data root.""" + # "/" has zero parents; "/foo" has one parent ("/"). Both bail. + assert _infer_data_root(Path("/"), override=None) is None + + def test_relative_path_resolves_against_cwd(self, tmp_path, monkeypatch): + """A relative output_dir resolves against cwd before inference.""" + nested = tmp_path / "corpus" / "data" / "he" + nested.mkdir(parents=True) + monkeypatch.chdir(tmp_path / "corpus") + result = _infer_data_root(Path("data/he"), override=None) + # Resolved path: /corpus/data/he → parent.parent → /corpus + assert result == (tmp_path / "corpus").resolve() + + +class TestRelativeToDataRoot: + """Direct unit tests for the shared helper that owns #108's + repo-relative path rendering — exercised by both cli.py + (clip JSON) and manifest.py (manifest CSV).""" + + def test_returns_posix_relative_when_under_root(self, tmp_path): + root = tmp_path / "corpus" + path = root / "data" / "he" / "clip.wav" + path.parent.mkdir(parents=True) + path.touch() + # POSIX separators, regardless of host OS (#108 Copilot review). + assert relative_to_data_root(path, root) == "data/he/clip.wav" + + def test_data_root_none_returns_resolved_absolute_posix(self, tmp_path): + """`data_root=None` returns a *resolved* absolute POSIX string.""" + target = tmp_path / "corpus" / "data" / "he" / "clip.wav" + target.parent.mkdir(parents=True) + target.touch() + result = relative_to_data_root(target, None) + assert Path(result).is_absolute() + # No backslashes — POSIX everywhere. + assert "\\" not in result + + def test_relative_input_data_root_none_still_returns_absolute(self, tmp_path, monkeypatch): + """A relative input + ``data_root=None`` must resolve to absolute. + + Copilot's #108 review caught the original bug: the function + claimed "absolute fallback" but returned ``str(path)`` verbatim, + which is relative if the input was relative. The fix calls + ``Path.resolve()`` on every code path. + """ + nested = tmp_path / "corpus" / "data" / "he" + nested.mkdir(parents=True) + (nested / "clip.wav").touch() + monkeypatch.chdir(tmp_path / "corpus") + result = relative_to_data_root(Path("data/he/clip.wav"), None) + assert Path(result).is_absolute(), ( + f"relative input with data_root=None must resolve to absolute, got: {result!r}" + ) + + def test_path_outside_root_falls_back_to_resolved_absolute_with_warning(self, tmp_path, caplog): + root = tmp_path / "corpus" + root.mkdir() + outside = tmp_path / "elsewhere" / "clip.wav" + outside.parent.mkdir(parents=True) + outside.touch() + with caplog.at_level("WARNING", logger="synthbanshee.package._paths"): + result = relative_to_data_root(outside, root) + assert Path(result).is_absolute() + # Must be resolved (no backslashes, no '..'). + assert "\\" not in result and ".." not in result + # The point of the warning: misconfigured data_root must not be silent. + assert any("is outside data_root" in rec.message for rec in caplog.records), ( + f"No fallback warning emitted. caplog records: {caplog.records}" + ) + + def test_relative_input_outside_root_also_resolves_to_absolute( + self, tmp_path, monkeypatch, caplog + ): + """Combined edge case from Copilot's #108 review: relative input + + path outside the configured data_root must still produce an + absolute string (not a misleading relative one).""" + root = tmp_path / "corpus" + root.mkdir() + elsewhere = tmp_path / "elsewhere" + elsewhere.mkdir() + (elsewhere / "clip.wav").touch() + monkeypatch.chdir(tmp_path) + with caplog.at_level("WARNING", logger="synthbanshee.package._paths"): + result = relative_to_data_root(Path("elsewhere/clip.wav"), root) + assert Path(result).is_absolute(), f"expected absolute fallback, got: {result!r}" + + class TestBuildPreprocessingMetadata: """`_build_preprocessing_metadata` must pass the *measured* peak from `PreprocessingResult.peak_dbfs` into `PreprocessingApplied.normalized_dbfs` diff --git a/tests/unit/test_manifest.py b/tests/unit/test_manifest.py index 6f306d4..20ecf00 100644 --- a/tests/unit/test_manifest.py +++ b/tests/unit/test_manifest.py @@ -175,6 +175,37 @@ def test_csv_has_all_expected_columns(self, tmp_path): reader = csv.DictReader(fh) assert set(reader.fieldnames or []) == set(_MANIFEST_COLUMNS) + def test_relative_to_rewrites_path_columns(self, tmp_path): + """When ``relative_to`` is set, ``wav_path`` / ``strong_labels_path`` + are written relative to that root (#108).""" + speaker_dir = tmp_path / "data" / "he" / "agg_m_30-45_001" + wav_path = _write_valid_clip(speaker_dir, "clip_rel_00") + # Also drop a .jsonl so the strong_labels_path column is populated. + wav_path.with_suffix(".jsonl").write_text("{}\n", encoding="utf-8") + out = tmp_path / "manifest.csv" + + rows = generate_manifest(tmp_path, out, relative_to=tmp_path) + + assert len(rows) == 1 + assert rows[0].wav_path == "data/he/agg_m_30-45_001/clip_rel_00.wav" + assert rows[0].strong_labels_path == "data/he/agg_m_30-45_001/clip_rel_00.jsonl" + assert not Path(rows[0].wav_path).is_absolute() + + def test_relative_to_none_keeps_absolute_paths(self, tmp_path): + """Default ``relative_to=None`` preserves pre-#108 absolute-path behaviour. + + Documents the back-compat contract — callers that don't opt into + the new ``relative_to`` argument see no behaviour change. + """ + speaker_dir = tmp_path / "spk_abs" + _write_valid_clip(speaker_dir, "clip_abs_00") + out = tmp_path / "manifest.csv" + + rows = generate_manifest(tmp_path, out) + + assert len(rows) == 1 + assert Path(rows[0].wav_path).is_absolute() + def test_dirty_files_excluded(self, tmp_path): """JSON files whose stems contain '_dirty' must not appear in the manifest.""" clip_dir = tmp_path / "spk_000"