diff --git a/src/hal0/slots/manager.py b/src/hal0/slots/manager.py
index aa6c0dc1..3ce547da 100644
--- a/src/hal0/slots/manager.py
+++ b/src/hal0/slots/manager.py
@@ -29,6 +29,7 @@
 import asyncio
 import contextlib
 import logging
+import os
 import re
 import shutil
 import time
@@ -111,6 +112,8 @@
 # /health) is demoted to ERROR — but only after this many CONSECUTIVE failures,
 # so a single transient blip doesn't trigger a disruptive model reload.
 _HEALTH_FAIL_STRIKES: int = 2
+# Must match the exact flag spellings emitted by ContainerProvider's llama
+# launch renderer; drift checks compare argv text, not llama-server aliases.
 _CONFIG_DRIFT_KEYS: tuple[str, ...] = ("--ctx-size", "--model", "--alias", "-b", "-ub")
 
 # Idle-monitor defaults. A READY slot whose last activity is older than
@@ -1252,7 +1255,7 @@ async def compute_config_drift(
         diffs = [
             {"key": key, "running": running_flags.get(key), "rendered": rendered_flags.get(key)}
             for key in _CONFIG_DRIFT_KEYS
-            if running_flags.get(key) != rendered_flags.get(key)
+            if not _config_drift_values_equal(key, running_flags.get(key), rendered_flags.get(key))
         ]
         return {"drifted": bool(diffs), "diffs": diffs}
 
@@ -2567,6 +2570,25 @@ def _argv_values(argv: list[str], keys: tuple[str, ...]) -> dict[str, str | None
     return out
 
 
+def _config_drift_values_equal(key: str, running: str | None, rendered: str | None) -> bool:
+    """Return whether one watched flag has the same value in both argvs.
+
+    Values are compared as text, except ``--model``: two different spellings
+    are equal when ``os.path.realpath`` maps them to the same path, so a
+    symlinked model directory is not reported as drift (#863). ``None``
+    (no value available) only equals ``None``.
+    """
+    # Identical text (or both ``None``) is equal for every key; checking it
+    # first keeps the common no-drift case free of filesystem lookups.
+    if running == rendered:
+        return True
+    # NOTE(review): realpath resolves against this process's filesystem view;
+    # confirm both argvs carry paths that are meaningful there.
+    if key == "--model" and running is not None and rendered is not None:
+        return os.path.realpath(running) == os.path.realpath(rendered)
+    return False
+
+
 def _normalize_ctx_key(cfg_dict: dict[str, Any]) -> None:
     """Fold the legacy ``[model].ctx_size`` alias into the canonical
     ``context_size`` (SlotConfig's field), in place (#585).
diff --git a/tests/providers/test_container.py b/tests/providers/test_container.py
index be81fe32..b3380016 100644
--- a/tests/providers/test_container.py
+++ b/tests/providers/test_container.py
@@ -524,6 +524,31 @@ def test_expected_argv_uses_launch_plan_context_derive(self) -> None:
         assert argv is not None
         assert argv[argv.index("--ctx-size") + 1] == "32768"
 
+    def test_expected_argv_emits_config_drift_watched_flag_spellings(self) -> None:
+        """#863: drift watches exact argv spellings, so renderer renames must fail."""
+        provider = self._provider()
+        with patch(
+            "hal0.providers.container._resolve_profile",
+            return_value=_moe_profile(),
+        ):
+            argv = provider.expected_argv(
+                _slot_cfg(model={"default": "chadrock-35b-ace-saber", "context_size": 131072}),
+                _model_info(),
+            )
+
+        assert argv is not None
+        # Every flag spelling this test pins must be present in the rendered argv.
+        assert "--model" in argv
+        assert "--alias" in argv
+        assert "--ctx-size" in argv
+        assert "-b" in argv
+        assert "-ub" in argv
+        # Presumably alternate spellings of the flags above; a renderer that
+        # switched to them would no longer match the pinned spellings.
+        assert "-c" not in argv
+        assert "--batch-size" not in argv
+        assert "--ubatch-size" not in argv
+
 
 # ── load_sync / unload_sync systemd interaction ───────────────────────────────
 
diff --git a/tests/slots/test_manager.py b/tests/slots/test_manager.py
index ac1d0b62..312e13a7 100644
--- a/tests/slots/test_manager.py
+++ b/tests/slots/test_manager.py
@@ -527,6 +527,43 @@ async def test_status_omits_config_drift_when_argv_matches(
     assert snap.metadata.get("config_drift") == {"drifted": False, "diffs": []}
 
 
+async def test_status_omits_config_drift_when_model_paths_resolve_to_same_file(
+    slot_root: Path,
+    tmp_path: Path,
+    container_stub: FakeContainerProvider,
+) -> None:
+    """#863: --model compares real paths so symlink/remount spelling is stable."""
+    # Lay out one model file that is also reachable through a symlinked directory.
+    real_dir = tmp_path / "models"
+    real_dir.mkdir()
+    model_path = real_dir / "qwen.gguf"
+    model_path.write_text("", encoding="utf-8")
+    link_dir = tmp_path / "model-link"
+    link_dir.symlink_to(real_dir, target_is_directory=True)
+
+    expected = [
+        "--host",
+        "0.0.0.0",
+        "--port",
+        "8081",
+        "--model",
+        str(model_path),
+        "--ctx-size",
+        "131072",
+    ]
+    # Same argv, except --model is spelled through the symlink.
+    running = list(expected)
+    running[running.index("--model") + 1] = str(link_dir / "qwen.gguf")
+    container_stub.expected_argv_by_slot["chat"] = expected
+    container_stub.running_argv_by_slot["chat"] = running
+
+    sm = SlotManager()
+    await sm.load("chat")
+    snap = await sm.status("chat", include_config_drift=True)
+
+    assert snap.metadata.get("config_drift") == {"drifted": False, "diffs": []}
+
+
 async def test_list_does_not_compute_config_drift_on_poll_path(
     slot_root: Path,
     container_stub: FakeContainerProvider,