From 5daada572a51e4544c7882883044dac177781f49 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 20:43:46 -0800 Subject: [PATCH 01/89] add docker example --- examples/docker_hello_world/Dockerfile | 15 +++ examples/docker_hello_world/hello_docker.py | 134 ++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 examples/docker_hello_world/Dockerfile create mode 100644 examples/docker_hello_world/hello_docker.py diff --git a/examples/docker_hello_world/Dockerfile b/examples/docker_hello_world/Dockerfile new file mode 100644 index 0000000000..3ceb24b3b4 --- /dev/null +++ b/examples/docker_hello_world/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + iproute2 \ + libx11-6 libgl1 libglib2.0-0 \ + libidn2-0 libgfortran5 libgomp1 \ + cowsay \ + && rm -rf /var/lib/apt/lists/* + + +# Copy example module so it's importable inside the container +COPY examples/docker_hello_world/hello_docker.py /dimos/source/examples/docker_hello_world/hello_docker.py +RUN touch /dimos/source/examples/__init__.py /dimos/source/examples/docker_hello_world/__init__.py + +WORKDIR /app diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py new file mode 100644 index 0000000000..c6a5f0bb3e --- /dev/null +++ b/examples/docker_hello_world/hello_docker.py @@ -0,0 +1,134 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Hello World Docker Module +========================== + +Minimal example showing a DimOS module running inside Docker. + +The module receives a string on its ``prompt`` input stream, runs it through +cowsay inside the container, and publishes the ASCII art on its ``greeting`` +output stream. + +NOTE: Requires Linux. Docker Desktop on macOS does not support host networking, +which is needed for LCM multicast between host and container. + +Usage: + python examples/docker_hello_world/hello_docker.py +""" + +from __future__ import annotations + +from pathlib import Path +import subprocess +import time + +from dimos.core.blueprints import autoconnect +from dimos.core.core import rpc +from dimos.core.docker_runner import DockerModuleConfig +from dimos.core.module import Module +from dimos.core.stream import In, Out + +# --------------------------------------------------------------------------- +# Docker module (runs inside container) +# --------------------------------------------------------------------------- + + +class HelloDockerConfig(DockerModuleConfig): + docker_image: str = "dimos-hello-docker:latest" + docker_file: Path | None = Path(__file__).parent / "Dockerfile" + docker_build_context: Path | None = Path(__file__).parents[2] # repo root + docker_gpus: str | None = None # no GPU needed + docker_rm: bool = True + docker_restart_policy: str = "no" + docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + + +class HelloDockerModule(Module["HelloDockerConfig"]): + """A trivial module that runs inside Docker and echoes greetings.""" + + default_config = HelloDockerConfig + + prompt: In[str] + greeting: Out[str] + + @rpc + def start(self) -> None: + super().start() + self.prompt.subscribe(self._on_prompt) + + def _cowsay(self, text: str) -> str: + """Run cowsay inside the container and return the ASCII art.""" + result = subprocess.run( + ["/usr/games/cowsay", text], + capture_output=True, + text=True, + ) + return result.stdout + 
+ def _on_prompt(self, text: str) -> None: + art = self._cowsay(text) + print(f"[HelloDockerModule]\n{art}") + self.greeting.publish(art) + + @rpc + def greet(self, name: str) -> str: + """RPC method that can be called directly.""" + return self._cowsay(f"Hello, {name}!") + + +# --------------------------------------------------------------------------- +# Host-side module (sends prompts and prints greetings) +# --------------------------------------------------------------------------- + + +class PromptModule(Module): + """Publishes prompts and listens to greetings.""" + + prompt: Out[str] + greeting: In[str] + + @rpc + def start(self) -> None: + super().start() + self.greeting.subscribe(self._on_greeting) + + def _on_greeting(self, text: str) -> None: + print(f"[PromptModule] Received: {text}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + coordinator = autoconnect( + PromptModule.blueprint(), + HelloDockerModule.blueprint(), + ).build() + + # Get module proxies + prompt_mod = coordinator.get_instance(PromptModule) + docker_mod = coordinator.get_instance(HelloDockerModule) + + # Test RPC + print(docker_mod.greet("World")) + + # Test stream + prompt_mod.prompt.publish("stream test") + time.sleep(2) + + coordinator.close_all() + print("Done!") From 1412542bfdd6e762729d77e01e3ce08c441ebaca Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 21:10:12 -0800 Subject: [PATCH 02/89] add docker module system --- dimos/core/docker_worker_manager.py | 57 ++++++++++++++++++++++++++++ dimos/core/module_coordinator.py | 58 +++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..42843577ba --- /dev/null +++ 
b/dimos/core/docker_worker_manager.py @@ -0,0 +1,57 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from dimos.core.docker_runner import DockerModule +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from dimos.core.module import Module + +logger = setup_logger() + + +class DockerWorkerManager: + """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" + + def __init__(self) -> None: + self._docker_modules: list[DockerModule] = [] + self._closed = False + + def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + if self._closed: + raise RuntimeError("DockerWorkerManager is closed") + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + self._docker_modules.append(dm) + return dm + + def close_all(self) -> None: + if self._closed: + return + self._closed = True + + logger.info("Stopping all Docker modules...") + for dm in reversed(self._docker_modules): + try: + dm.stop() + except Exception: + logger.error("Error stopping Docker module", exc_info=True) + + self._docker_modules.clear() + logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 86afb9ebc4..9d33255d4c 100644 --- 
a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,8 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -33,6 +35,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None + _docker_client: DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -53,6 +56,7 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() + self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -73,15 +77,23 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) + if self._docker_client is not None: + self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module: ModuleProxy = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - self._deployed_modules[module_class] = module - return module + if is_docker_module(module_class): + if not self._docker_client: + self._docker_client = DockerWorkerManager() + module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + else: + module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + + 
self._deployed_modules[module_class] = module # type: ignore[assignment] + return module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] @@ -89,10 +101,42 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - modules = self._client.deploy_parallel(module_specs) - for (module_class, _, _), module in zip(module_specs, modules, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] - return modules # type: ignore[return-value] + # Separate docker modules from regular modules + docker_specs = [] + worker_specs = [] + spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) + + for spec in module_specs: + module_class = spec[0] + if is_docker_module(module_class): + spec_indices.append(("docker", len(docker_specs))) + docker_specs.append(spec) + else: + spec_indices.append(("worker", len(worker_specs))) + worker_specs.append(spec) + + # Deploy worker modules in parallel via WorkerManager + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + + # Deploy docker modules (each gets its own DockerModule) + docker_results: list[Any] = [] + for module_class, args, kwargs in docker_specs: + if not self._docker_client: + self._docker_client = DockerWorkerManager() + dm = self._docker_client.deploy(module_class, *args, **kwargs) + docker_results.append(dm) + + # Reassemble results in original order + results: list[Any] = [] + for kind, idx in spec_indices: + if kind == "docker": + results.append(docker_results[idx]) + else: + results.append(worker_results[idx]) + + for (module_class, _, _), module in zip(module_specs, results, strict=True): + self._deployed_modules[module_class] = module + return results # type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From b63bf73177f0ef2fd8ff138d232f1a97d10cbbd5 Mon Sep 17 00:00:00 2001 
From: Jeff Hykin Date: Wed, 4 Mar 2026 22:15:46 -0800 Subject: [PATCH 03/89] fixup --- .gitignore | 1 + dimos/core/docker_runner.py | 41 +++- dimos/core/docker_worker_manager.py | 1 + dimos/core/module.py | 3 +- dimos/core/module_coordinator.py | 15 +- dimos/core/tests/test_docker_deployment.py | 223 ++++++++++++++++++++ examples/docker_hello_world/hello_docker.py | 9 +- pyproject.toml | 2 + uv.lock | 4 + 9 files changed, 285 insertions(+), 14 deletions(-) create mode 100644 dimos/core/tests/test_docker_deployment.py diff --git a/.gitignore b/.gitignore index 4045db012e..12b2f19ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ package-lock.json # Ignore build artifacts dist/ build/ +.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index ee56163ca6..566e28a70e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,17 +25,20 @@ import time from typing import TYPE_CHECKING, Any -from dimos.core.docker_build import build_image, image_exists -from dimos.core.module import Module, ModuleConfig +from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall -from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT + +# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path + from dimos.core.module import Module + logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution @@ -186,7 +189,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" ) - # RPC setup + # RPC setup (lazy import to keep container-side 
imports light) + from dimos.protocol.rpc import LCMRPC + self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) @@ -194,6 +199,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._bound_rpc_calls: dict[str, RpcCall] = {} # Build image if needed (but don't start - caller must call start() explicitly) + from dimos.core.docker_build import build_image, image_exists + if not image_exists(config): logger.info(f"Building {config.docker_image}") build_image(config) @@ -400,7 +407,29 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if cfg.docker_command: return list(cfg.docker_command) - module_path = f"{self._module_class.__module__}.{self._module_class.__name__}" + module_name = self._module_class.__module__ + if module_name == "__main__": + # When run as `python script.py`, __module__ is "__main__". + # Resolve to the actual dotted module path so the container can import it. + import __main__ + + spec = getattr(__main__, "__spec__", None) + if spec and spec.name: + module_name = spec.name + else: + # Fallback: derive from file path relative to cwd + main_file = getattr(__main__, "__file__", None) + if main_file: + import pathlib + + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + module_name = str(rel.with_suffix("")).replace("/", ".") + else: + raise RuntimeError( + "Cannot determine module path for __main__. " + "Run with `python -m` or set docker_command explicitly." + ) + module_path = f"{module_name}.{self._module_class.__name__}" # Filter out docker-specific kwargs (paths, etc.) 
- only pass module config kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 42843577ba..97f27a6d7a 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,6 +38,7 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) + dm.start() # Docker modules must be running before streams/RPC can be wired self._docker_modules.append(dm) return dm diff --git a/dimos/core/module.py b/dimos/core/module.py index 48a99a79a3..127be545fe 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,11 +218,12 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: + _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name != "rpcs" # Exclude the rpcs property itself to prevent recursion + and name not in _skip and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9d33255d4c..dae1760b9e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,14 +18,13 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: + from dimos.core.docker_worker_manager import 
DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -53,6 +52,8 @@ def __init__( self._deployed_modules = {} def start(self) -> None: + from dimos.core.docker_worker_manager import DockerWorkerManager + n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -85,6 +86,9 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -101,9 +105,12 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + # Separate docker modules from regular modules - docker_specs = [] - worker_specs = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) for spec in module_specs: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py new file mode 100644 index 0000000000..85f2b0508a --- /dev/null +++ b/dimos/core/tests/test_docker_deployment.py @@ -0,0 +1,223 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Smoke tests for Docker module deployment routing. + +These tests verify that the ModuleCoordinator correctly detects and routes +docker modules to the DockerWorkerManager WITHOUT actually running Docker. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +import pytest + +from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.module import Module +from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.stream import Out + +if TYPE_CHECKING: + from pathlib import Path + +# -- Fixtures: fake module classes ------------------------------------------- + + +@dataclass +class FakeDockerConfig(DockerModuleConfig): + docker_image: str = "fake:latest" + docker_file: Path | None = None + docker_gpus: str | None = None + docker_rm: bool = True + docker_restart_policy: str = "no" + + +class FakeDockerModule(Module["FakeDockerConfig"]): + default_config = FakeDockerConfig + output: Out[str] + + +class FakeRegularModule(Module): + output: Out[str] + + +# -- Tests ------------------------------------------------------------------- + + +class TestIsDockerModule: + def test_docker_module_detected(self): + assert is_docker_module(FakeDockerModule) is True + + def test_regular_module_not_detected(self): + assert is_docker_module(FakeRegularModule) is False + + def test_plain_class_not_detected(self): + assert is_docker_module(str) is False 
+ + def test_no_default_config(self): + class Bare(Module): + pass + + # Module has default_config = ModuleConfig, which is not DockerModuleConfig + assert is_docker_module(Bare) is False + + +class TestDockerWorkerManager: + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_creates_docker_module(self, mock_docker_module_cls): + mock_instance = MagicMock() + mock_docker_module_cls.return_value = mock_instance + + mgr = DockerWorkerManager() + result = mgr.deploy(FakeDockerModule, some_kwarg="value") + + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") + assert result is mock_instance + assert len(mgr._docker_modules) == 1 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): + dm1 = MagicMock() + dm2 = MagicMock() + mock_docker_module_cls.side_effect = [dm1, dm2] + + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.deploy(FakeDockerModule) + mgr.close_all() + + # Stopped in reverse order + assert dm2.stop.call_count == 1 + assert dm1.stop.call_count == 1 + assert dm2.stop.called + assert dm1.stop.called + assert len(mgr._docker_modules) == 0 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_idempotent(self, mock_docker_module_cls): + mock_docker_module_cls.return_value = MagicMock() + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.close_all() + mgr.close_all() # second call should be no-op + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_after_close_raises(self, mock_docker_module_cls): + mgr = DockerWorkerManager() + mgr.close_all() + with pytest.raises(RuntimeError, match="closed"): + mgr.deploy(FakeDockerModule) + + +class TestModuleCoordinatorDockerRouting: + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_docker_module_to_docker_manager( + 
self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeDockerModule) + + # Should NOT go through worker manager + mock_worker_mgr.deploy.assert_not_called() + # Should create a DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + assert result is mock_dm + # Should be tracked + assert coordinator.get_instance(FakeDockerModule) is mock_dm + + coordinator.stop() + + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + mock_proxy = MagicMock() + mock_worker_mgr.deploy.return_value = mock_proxy + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeRegularModule) + + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + assert result is mock_proxy + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_parallel_separates_docker_and_regular( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + regular_proxy = MagicMock() + mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + specs = [ + (FakeRegularModule, (), {}), + (FakeDockerModule, (), {}), + ] + results = coordinator.deploy_parallel(specs) + + # Regular module goes through worker manager + 
mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + # Docker module gets its own DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + + # Results are in original order + assert results[0] is regular_proxy + assert results[1] is mock_dm + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + coordinator.deploy(FakeDockerModule) + coordinator.stop() + + # The deployed module's stop() is called during coordinator.stop() loop + mock_dm.stop.assert_called() + # Worker manager also closed + mock_worker_mgr.close_all.assert_called_once() diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index c6a5f0bb3e..871be6f5d2 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -31,11 +31,11 @@ from __future__ import annotations +from dataclasses import dataclass, field from pathlib import Path import subprocess import time -from dimos.core.blueprints import autoconnect from dimos.core.core import rpc from dimos.core.docker_runner import DockerModuleConfig from dimos.core.module import Module @@ -46,6 +46,7 @@ # --------------------------------------------------------------------------- +@dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): docker_image: str = "dimos-hello-docker:latest" docker_file: Path | None = Path(__file__).parent / "Dockerfile" @@ -53,7 +54,7 @@ class HelloDockerConfig(DockerModuleConfig): docker_gpus: str | None = None # no GPU needed docker_rm: bool = True 
docker_restart_policy: str = "no" - docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) class HelloDockerModule(Module["HelloDockerConfig"]): @@ -114,6 +115,8 @@ def _on_greeting(self, text: str) -> None: # --------------------------------------------------------------------------- if __name__ == "__main__": + from dimos.core.blueprints import autoconnect + coordinator = autoconnect( PromptModule.blueprint(), HelloDockerModule.blueprint(), @@ -130,5 +133,5 @@ def _on_greeting(self, text: str) -> None: prompt_mod.prompt.publish("stream test") time.sleep(2) - coordinator.close_all() + coordinator.stop() print("Done!") diff --git a/pyproject.toml b/pyproject.toml index cb4607ced5..55eb570836 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -294,6 +294,8 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", + "langchain-core", + "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", ] diff --git a/uv.lock b/uv.lock index 2f53ef0e6f..a7e9070a7d 100644 --- a/uv.lock +++ b/uv.lock @@ -1848,6 +1848,7 @@ dev = [ ] docker = [ { name = "dimos-lcm" }, + { name = "langchain-core" }, { name = "lcm" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1865,6 +1866,7 @@ docker = [ { name = "sortedcontainers" }, { name = "structlog" }, { name = "typer" }, + { name = "typing-extensions" }, ] drone = [ { name = "pymavlink" }, @@ -2003,6 +2005,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = "==1.2.3" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = 
"langchain-core", marker = "extra == 'agents'", specifier = "==1.2.3" }, + { name = "langchain-core", marker = "extra == 'docker'" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -2118,6 +2121,7 @@ requires-dist = [ { name = "types-tabulate", marker = "extra == 'dev'", specifier = ">=0.9.0.20241207,<1" }, { name = "types-tensorflow", marker = "extra == 'dev'", specifier = ">=2.18.0.20251008,<3" }, { name = "types-tqdm", marker = "extra == 'dev'", specifier = ">=4.67.0.20250809,<5" }, + { name = "typing-extensions", marker = "extra == 'docker'" }, { name = "ultralytics", marker = "extra == 'perception'", specifier = ">=8.3.70" }, { name = "unitree-webrtc-connect-leshy", marker = "extra == 'unitree'", specifier = ">=2.0.7" }, { name = "uvicorn", marker = "extra == 'web'", specifier = ">=0.34.0" }, From 580eda4d6a621644f7fec36a6846bbf6b827672a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:33:41 -0800 Subject: [PATCH 04/89] fix rerun imports --- dimos/core/docker_runner.py | 5 +---- dimos/visualization/rerun/bridge.py | 3 --- dimos/visualization/rerun/constants.py | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 566e28a70e..2735b0cefe 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,10 +28,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall from dimos.utils.logging_config import setup_logger - -# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 +from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT if 
TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 47bce27dcf..420ffd1769 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,9 +39,6 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 - # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py new file mode 100644 index 0000000000..e1c98176ad --- /dev/null +++ b/dimos/visualization/rerun/constants.py @@ -0,0 +1,17 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# isolated so that they can be imported into lightweight modules without importing all of rerun +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 From 5374de612c2942c5553fda4b37b2eaa07522755c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:37:43 -0800 Subject: [PATCH 05/89] fixup imports --- dimos/core/module_coordinator.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index dae1760b9e..155ffb28db 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,13 +18,14 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: - from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -52,8 +53,6 @@ def __init__( self._deployed_modules = {} def start(self) -> None: - from dimos.core.docker_worker_manager import DockerWorkerManager - n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -86,9 +85,6 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -105,9 +101,6 @@ def deploy_parallel( if not self._client: raise 
ValueError("Not started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - # Separate docker modules from regular modules docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] From bc66a45fdaba0d81453575942537e6f6fd5b78fd Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:48:14 -0800 Subject: [PATCH 06/89] fixup --- dimos/core/docker_runner.py | 9 ++++++++- dimos/core/docker_worker_manager.py | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 2735b0cefe..f6bbd98325 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -419,7 +419,14 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if main_file: import pathlib - rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + try: + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + except ValueError: + raise RuntimeError( + f"Cannot derive module path: '{main_file}' is not under cwd " + f"'{pathlib.Path.cwd()}'. " + "Run with `python -m` or set docker_command explicitly." 
+ ) from None module_name = str(rel.with_suffix("")).replace("/", ".") else: raise RuntimeError( diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 97f27a6d7a..bd432f18e2 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -14,6 +14,7 @@ from __future__ import annotations +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.docker_runner import DockerModule @@ -38,7 +39,12 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) - dm.start() # Docker modules must be running before streams/RPC can be wired + try: + dm.start() # Docker modules must be running before streams/RPC can be wired + except Exception: + with suppress(Exception): + dm.stop() + raise self._docker_modules.append(dm) return dm From d8436097a1b3bc43a6d30e0df334f45d63a29cde Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:52:39 -0800 Subject: [PATCH 07/89] simplify stop logic --- dimos/core/docker_worker_manager.py | 21 --------------------- dimos/core/module_coordinator.py | 2 -- 2 files changed, 23 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index bd432f18e2..8e368d15a8 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -31,13 +31,8 @@ class DockerWorkerManager: def __init__(self) -> None: self._docker_modules: list[DockerModule] = [] - self._closed = False def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - if self._closed: - raise RuntimeError("DockerWorkerManager is closed") - - logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) try: dm.start() # Docker modules must be running before streams/RPC can be wired 
@@ -45,20 +40,4 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke with suppress(Exception): dm.stop() raise - self._docker_modules.append(dm) return dm - - def close_all(self) -> None: - if self._closed: - return - self._closed = True - - logger.info("Stopping all Docker modules...") - for dm in reversed(self._docker_modules): - try: - dm.stop() - except Exception: - logger.error("Error stopping Docker module", exc_info=True) - - self._docker_modules.clear() - logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 155ffb28db..97541640dc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -77,8 +77,6 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - if self._docker_client is not None: - self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] From 30254a140324cac2c541c7825cf58a144151bb90 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:10:06 -0800 Subject: [PATCH 08/89] simplify and explain --- dimos/core/docker_worker_manager.py | 43 ---------- dimos/core/module_coordinator.py | 36 ++++++--- dimos/core/tests/test_docker_deployment.py | 91 ++++++++-------------- 3 files changed, 57 insertions(+), 113 deletions(-) delete mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py deleted file mode 100644 index 8e368d15a8..0000000000 --- a/dimos/core/docker_worker_manager.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from contextlib import suppress -from typing import TYPE_CHECKING, Any - -from dimos.core.docker_runner import DockerModule -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from dimos.core.module import Module - -logger = setup_logger() - - -class DockerWorkerManager: - """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" - - def __init__(self) -> None: - self._docker_modules: list[DockerModule] = [] - - def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - dm = DockerModule(module_class, *args, **kwargs) - try: - dm.start() # Docker modules must be running before streams/RPC can be wired - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 97541640dc..25f8fdbc22 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,8 +18,7 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -35,7 +34,6 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None - _docker_client: 
DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -56,7 +54,6 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() - self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -79,14 +76,30 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] + def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + from contextlib import suppress + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + try: + # why are docker modules started here? shouldn't they be started in start_all_modules? + # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries + # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
+ # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that + # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things + dm.start() + except Exception: + with suppress(Exception): + dm.stop() + raise + return dm + def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") if is_docker_module(module_class): - if not self._docker_client: - self._docker_client = DockerWorkerManager() - module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] else: module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] @@ -119,9 +132,7 @@ def deploy_parallel( # Deploy docker modules (each gets its own DockerModule) docker_results: list[Any] = [] for module_class, args, kwargs in docker_specs: - if not self._docker_client: - self._docker_client = DockerWorkerManager() - dm = self._docker_client.deploy(module_class, *args, **kwargs) + dm = self._deploy_docker(module_class, *args, **kwargs) docker_results.append(dm) # Reassemble results in original order @@ -137,9 +148,10 @@ def deploy_parallel( return results # type: ignore[return-value] def start_all_modules(self) -> None: - modules = list(self._deployed_modules.values()) + # Docker modules are already started during deploy, (see their deploy as to why this is) + modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=len(modules)) as executor: + with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), 
modules)) else: for module in modules: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 85f2b0508a..99c1debbb6 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -16,7 +16,7 @@ Smoke tests for Docker module deployment routing. These tests verify that the ModuleCoordinator correctly detects and routes -docker modules to the DockerWorkerManager WITHOUT actually running Docker. +docker modules to DockerModule WITHOUT actually running Docker. """ from __future__ import annotations @@ -28,7 +28,6 @@ import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out @@ -78,59 +77,10 @@ class Bare(Module): assert is_docker_module(Bare) is False -class TestDockerWorkerManager: - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_creates_docker_module(self, mock_docker_module_cls): - mock_instance = MagicMock() - mock_docker_module_cls.return_value = mock_instance - - mgr = DockerWorkerManager() - result = mgr.deploy(FakeDockerModule, some_kwarg="value") - - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") - assert result is mock_instance - assert len(mgr._docker_modules) == 1 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): - dm1 = MagicMock() - dm2 = MagicMock() - mock_docker_module_cls.side_effect = [dm1, dm2] - - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.deploy(FakeDockerModule) - mgr.close_all() - - # Stopped in reverse order - assert dm2.stop.call_count == 1 - assert dm1.stop.call_count == 1 - assert dm2.stop.called - assert dm1.stop.called - assert len(mgr._docker_modules) == 
0 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_idempotent(self, mock_docker_module_cls): - mock_docker_module_cls.return_value = MagicMock() - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.close_all() - mgr.close_all() # second call should be no-op - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_after_close_raises(self, mock_docker_module_cls): - mgr = DockerWorkerManager() - mgr.close_all() - with pytest.raises(RuntimeError, match="closed"): - mgr.deploy(FakeDockerModule) - - class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_docker_module_to_docker_manager( - self, mock_worker_manager_cls, mock_docker_module_cls - ): + def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -144,14 +94,38 @@ def test_deploy_routes_docker_module_to_docker_manager( # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule + # Should create a DockerModule and start it mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() assert result is mock_dm # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() + @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_docker_cleans_up_on_start_failure( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_dm.start.side_effect = RuntimeError("start failed") + mock_docker_module_cls.return_value = mock_dm + + 
coordinator = ModuleCoordinator() + coordinator.start() + + with pytest.raises(RuntimeError, match="start failed"): + coordinator.deploy(FakeDockerModule) + + # stop() called to clean up the failed container + mock_dm.stop.assert_called_once() + + coordinator.stop() + @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): mock_worker_mgr = MagicMock() @@ -169,7 +143,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -196,6 +170,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() # Results are in original order assert results[0] is regular_proxy @@ -203,7 +178,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -217,7 +192,7 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke coordinator.deploy(FakeDockerModule) coordinator.stop() - # The deployed module's stop() is called during coordinator.stop() loop - mock_dm.stop.assert_called() + # stop() called exactly once (no double cleanup) + assert mock_dm.stop.call_count == 1 # Worker manager also closed 
mock_worker_mgr.close_all.assert_called_once() From 16565b02070275669ad18fdcf45aa501c5849290 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:26:39 -0800 Subject: [PATCH 09/89] parallel start of docker modules --- dimos/core/module_coordinator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 25f8fdbc22..b16812a4dd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -129,11 +129,16 @@ def deploy_parallel( # Deploy worker modules in parallel via WorkerManager worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules (each gets its own DockerModule) - docker_results: list[Any] = [] - for module_class, args, kwargs in docker_specs: - dm = self._deploy_docker(module_class, *args, **kwargs) - docker_results.append(dm) + # Deploy docker modules in parallel (each starts its own container) + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + futures = [ + executor.submit(self._deploy_docker, module_class, *args, **kwargs) + for module_class, args, kwargs in docker_specs + ] + docker_results: list[Any] = [f.result() for f in futures] + else: + docker_results: list[Any] = [] # Reassemble results in original order results: list[Any] = [] From 91170176f7a820eb1b059cd1c7a32b3780f5b3ee Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:33:09 -0800 Subject: [PATCH 10/89] fix container name to be stable --- dimos/core/docker_runner.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index f6bbd98325..1fc281c035 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -18,7 +18,6 @@ from dataclasses import dataclass, field import importlib import json -import os import signal import subprocess import 
threading @@ -181,9 +180,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = ( - config.docker_container_name - or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" + self._container_name = config.docker_container_name or self._default_container_name( + module_class, config ) # RPC setup (lazy import to keep container-side imports light) @@ -202,6 +200,16 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Building {config.docker_image}") build_image(config) + @staticmethod + def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: + import hashlib + + name = module_class.__name__.lower() + path_hash = hashlib.sha256( + str(config.docker_file.resolve()).encode() # type: ignore[union-attr] + ).hexdigest()[:12] + return f"dimos_{name}_{path_hash}" + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable From ab150fa1b663784191daf4644f13e7dcf0c4c1ec Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:51:12 -0800 Subject: [PATCH 11/89] lazy import --- dimos/core/o3dpickle.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1912ab7739..1c1464fece 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,25 +14,34 @@ import copyreg -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - +# open3d is imported lazily (inside functions) rather than at module level. +# dimos.core.core imports this module just to register pickle handlers, and core is +# imported by almost everything — including lightweight docker modules that don't use +# open3d. 
A module-level import would drag in open3d's sklearn/scipy chain everywhere, +# which crashes in environments where those packages aren't installed or version-matched. +# (i.e. minimal docker envs) def reduce_external(obj): # type: ignore[no-untyped-def] + import numpy as np + # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - # Create new PointCloud and assign the points + import open3d as o3d # type: ignore[import-untyped] + pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - # Register for the actual PointCloud class that gets instantiated - # We need to create a dummy PointCloud to get its actual class + try: + import open3d as o3d # type: ignore[import-untyped] + except ImportError: + return # open3d not installed in this environment; skip registration + _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) From 84c045e106bbf7b91dcaa0cc3e3238c891355780 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:08:21 -0800 Subject: [PATCH 12/89] clean up --- dimos/core/docker_runner.py | 139 ++++++++++++-------- dimos/core/module.py | 25 +++- dimos/core/module_coordinator.py | 89 ++++--------- dimos/core/o3dpickle.py | 21 +-- dimos/core/tests/test_docker_deployment.py | 21 ++- examples/docker_hello_world/hello_docker.py | 7 +- 6 files changed, 155 insertions(+), 147 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1fc281c035..c6a196b7a7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -26,6 +26,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall +from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.constants 
import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -139,6 +140,32 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") +def _prompt_restart(container_name: str) -> bool: + """Ask the user whether to restart a running container. + + Returns True to restart, False to reuse. + Falls back to restart when stdin is not a TTY (e.g. CI). + """ + import sys + + if not sys.stdin.isatty(): + logger.warning( + f"Container '{container_name}' already running — restarting (non-interactive)." + ) + return True + + print(f"\nContainer '{container_name}' is already running.") + print(" [r] Restart — stop the existing container and start a fresh one") + print(" [u] Use — attach to the existing container as-is") + while True: + choice = input("Choice [r/u]: ").strip().lower() + if choice in ("r", "restart"): + return True + if choice in ("u", "use"): + return False + print("Please enter 'r' or 'u'.") + + def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -161,21 +188,22 @@ class DockerModule: Host-side handle for a module running inside Docker. Lifecycle: - - start(): launches container, waits for module ready via RPC - - stop(): stops container - - __getattr__: exposes RpcCall for @rpc methods on remote module + - start(): builds the image if needed, launches the container, waits for readiness, calls the remote module's start() RPC (after streams are wired) + - stop(): stops the container and cleans up Communication: All RPC happens via LCM multicast (requires --network=host). 
""" + config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: # Config config_class = getattr(module_class, "default_config", DockerModuleConfig) + assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - + # Module info self._module_class = module_class - self._config = config + self.config = config self._args = args self._kwargs = kwargs self._running = False @@ -184,21 +212,13 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - # RPC setup (lazy import to keep container-side imports light) - from dimos.protocol.rpc import LCMRPC self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - - # Build image if needed (but don't start - caller must call start() explicitly) - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + self._deferred_transports: dict[str, str] = {} # stream_name -> topic @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -210,44 +230,56 @@ def _default_container_name(module_class: type[Module], config: DockerModuleConf ).hexdigest()[:12] return f"dimos_{name}_{path_hash}" + def get_rpc_method_names(self) -> list[str]: + return self.rpc_calls + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: - # Check all requested methods exist missing = set(methods) - self._bound_rpc_calls.keys() if missing: raise ValueError(f"RPC methods not found: {missing}") - # Return single RpcCall or tuple calls = 
tuple(self._bound_rpc_calls[m] for m in methods) return calls[0] if len(calls) == 1 else calls def start(self) -> None: - if self._running: - return + """Invoke the remote module's start() RPC. - cfg = self._config + Called after stream transports are wired so the module can subscribe + to its streams with valid transports. + """ + from dimos.core.docker_build import build_image, image_exists - # Prevent accidental kill of running container with same name - if _is_container_running(cfg, self._container_name): - raise RuntimeError( - f"Container '{self._container_name}' already running. " - "Choose a different container_name or stop the existing container." - ) - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) + if not image_exists(self.config): + logger.info(f"Building {self.config.docker_image}") + build_image(self.config) + try: - self.rpc.start() - self._running = True - self._wait_for_ready() + cfg = self.config + if _is_container_running(cfg, self._container_name): + restart = _prompt_restart(self._container_name) + if restart: + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(cfg, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + self._configure_streams(self._deferred_transports) + self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) + except Exception: + with suppress(Exception): + self.stop() + raise def stop(self) -> None: 
"""Gracefully stop the Docker container and clean up resources.""" @@ -263,13 +295,13 @@ def stop(self) -> None: self._unsub_fns.clear() # Stop and remove container - _run([_docker_bin(self._config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self._config, self._container_name) + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") def status(self) -> dict[str, Any]: - cfg = self._config + cfg = self.config return { "module": self.remote_name, "container_name": self._container_name, @@ -278,19 +310,17 @@ def status(self) -> dict[str, Any]: } def tail_logs(self, n: int = 200) -> str: - return _tail_logs(self._config, self._container_name, n=n) + return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> bool: - """Configure stream transport in container. 
Mirrors Module.set_transport() for autoconnect().""" + """Defer stream transport config until start() when the container is running.""" topic = getattr(transport, "topic", None) if topic is None: return False if hasattr(topic, "topic"): topic = topic.topic - result, _ = self.rpc.call_sync( - f"{self.remote_name}/configure_stream", ([stream_name, str(topic)], {}) - ) - return bool(result) + self._deferred_transports[stream_name] = str(topic) + return True def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -302,7 +332,7 @@ def __getattr__(self, name: str) -> Any: def _build_docker_run_command(self) -> list[str]: """Build the complete `docker run` command.""" - cfg = self._config + cfg = self.config self._validate_config(cfg) cmd = [_docker_bin(cfg), "run", "-d"] @@ -448,9 +478,13 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _wait_for_ready(self) -> None: - """Poll the module's RPC endpoint until ready, crashed, or timeout.""" - cfg = self._config + def _configure_streams(self, streams: dict[str, str]) -> None: + """Poll configure_streams RPC until the container's RPC server is up, then wire streams. + + Also serves as the liveness gate — the first successful call proves the + container is ready to accept RPCs. 
+ """ + cfg = self.config start_time = time.time() logger.info(f"Waiting for {self.remote_name} to be ready...") @@ -462,13 +496,14 @@ def _wait_for_ready(self) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/start", ([], {}), rpc_timeout=RPC_READY_TIMEOUT + f"{self.remote_name}/configure_streams", + ([streams], {}), + rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") return except (TimeoutError, ConnectionError, OSError): - # Module not ready yet - retry after poll interval time.sleep(cfg.docker_poll_interval) logs = _tail_logs(cfg, self._container_name) diff --git a/dimos/core/module.py b/dimos/core/module.py index 127be545fe..72df61d4c7 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -446,15 +446,26 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type return True @rpc - def configure_stream(self, stream_name: str, topic: str) -> bool: - """Configure a stream's transport by topic. Called by DockerModule for stream wiring.""" + def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: + """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring. 
+ + Args: + streams: mapping of stream_name -> topic + + Returns: + mapping of stream_name -> success + """ from dimos.core.transport import pLCMTransport - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - return False - stream._transport = pLCMTransport(topic) - return True + results: dict[str, bool] = {} + for stream_name, topic in streams.items(): + stream = getattr(self, stream_name, None) + if not isinstance(stream, (Out, In)): + results[stream_name] = False + else: + stream._transport = pLCMTransport(topic) + results[stream_name] = True + return results # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index b16812a4dd..3d71e8776b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -76,33 +76,14 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] - def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - from contextlib import suppress - - logger.info("Deploying module in Docker.", module=module_class.__name__) - dm = DockerModule(module_class, *args, **kwargs) - try: - # why are docker modules started here? shouldn't they be started in start_all_modules? - # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries - # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
- # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that - # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things - dm.start() - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm - def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - if is_docker_module(module_class): - module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] - else: - module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - + module = ( + DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] + if is_docker_module(module_class) + else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + ) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] @@ -112,49 +93,38 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for spec in module_specs: - module_class = spec[0] - if is_docker_module(module_class): - spec_indices.append(("docker", len(docker_specs))) - docker_specs.append(spec) - else: - spec_indices.append(("worker", len(worker_specs))) - worker_specs.append(spec) - - # Deploy worker modules in parallel via WorkerManager + docker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if 
is_docker_module(module_class) + ] + worker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class) + ] + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules in parallel (each starts its own container) + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - futures = [ - executor.submit(self._deploy_docker, module_class, *args, **kwargs) - for module_class, args, kwargs in docker_specs - ] - docker_results: list[Any] = [f.result() for f in futures] - else: - docker_results: list[Any] = [] - - # Reassemble results in original order - results: list[Any] = [] - for kind, idx in spec_indices: - if kind == "docker": - results.append(docker_results[idx]) - else: - results.append(worker_results[idx]) + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) + ) + + # Reassemble in original order + worker_iter = iter(worker_results) + docker_iter = iter(docker_results) + results: list[Any] = [ + next(docker_iter) if is_docker_module(module_class) else next(worker_iter) + for module_class, _, _ in module_specs + ] for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module + self._deployed_modules[module_class] = module # type: ignore[assignment] return results # type: ignore[return-value] def start_all_modules(self) -> None: - # Docker modules are already started during deploy, (see their deploy as to why this is) - modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] + modules = list(self._deployed_modules.values()) if isinstance(self._client, WorkerManager): with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), modules)) @@ -162,10 +132,9 @@ def start_all_modules(self) -> 
None: for module in modules: module.start() - module_list = list(self._deployed_modules.values()) for module in modules: if hasattr(module, "on_system_modules"): - module.on_system_modules(module_list) + module.on_system_modules(modules) def get_instance(self, module: type[ModuleT]) -> ModuleProxy: return self._deployed_modules.get(module) # type: ignore[return-value, no-any-return] diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1c1464fece..1912ab7739 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,34 +14,25 @@ import copyreg -# open3d is imported lazily (inside functions) rather than at module level. -# dimos.core.core imports this module just to register pickle handlers, and core is -# imported by almost everything — including lightweight docker modules that don't use -# open3d. A module-level import would drag in open3d's sklearn/scipy chain everywhere, -# which crashes in environments where those packages aren't installed or version-matched. -# (i.e. 
minimal docker envs) +import numpy as np +import open3d as o3d # type: ignore[import-untyped] -def reduce_external(obj): # type: ignore[no-untyped-def] - import numpy as np +def reduce_external(obj): # type: ignore[no-untyped-def] # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - import open3d as o3d # type: ignore[import-untyped] - + # Create new PointCloud and assign the points pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - try: - import open3d as o3d # type: ignore[import-untyped] - except ImportError: - return # open3d not installed in this environment; skip registration - + # Register for the actual PointCloud class that gets instantiated + # We need to create a dummy PointCloud to get its actual class _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 99c1debbb6..7a02682fda 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -94,36 +94,32 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule and start it + # Should construct a DockerModule (container launch happens inside __init__) mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() assert result is mock_dm - # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() @patch("dimos.core.module_coordinator.DockerModule") 
@patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_docker_cleans_up_on_start_failure( + def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr - mock_dm = MagicMock() - mock_dm.start.side_effect = RuntimeError("start failed") - mock_docker_module_cls.return_value = mock_dm + # Container launch fails inside __init__; DockerModule handles its own cleanup + mock_docker_module_cls.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() coordinator.start() - with pytest.raises(RuntimeError, match="start failed"): + with pytest.raises(RuntimeError, match="launch failed"): coordinator.deploy(FakeDockerModule) - # stop() called to clean up the failed container - mock_dm.stop.assert_called_once() - coordinator.stop() @patch("dimos.core.module_coordinator.WorkerManager") @@ -170,7 +166,8 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() # Results are in original order assert results[0] is regular_proxy diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 871be6f5d2..187384854e 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -106,6 +106,11 @@ def start(self) -> None: super().start() self.greeting.subscribe(self._on_greeting) + @rpc + def send(self, text: str) -> None: + """Publish a prompt message onto the stream.""" + self.prompt.publish(text) + def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") @@ 
-130,7 +135,7 @@ def _on_greeting(self, text: str) -> None: print(docker_mod.greet("World")) # Test stream - prompt_mod.prompt.publish("stream test") + prompt_mod.send("stream test") time.sleep(2) coordinator.stop() From 9ead3fd350f7f10e987c2da4e090ed259cc284a1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:19:20 -0800 Subject: [PATCH 13/89] revert --- dimos/core/docker_runner.py | 2 +- dimos/visualization/rerun/bridge.py | 3 +++ dimos/visualization/rerun/constants.py | 17 ----------------- 3 files changed, 4 insertions(+), 18 deletions(-) delete mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c6a196b7a7..e1a583b285 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,7 +28,7 @@ from dimos.core.rpc_client import RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT +from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT if TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 9cadbc617f..cc4b13ecb9 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,6 +39,9 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 + # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py deleted file mode 100644 index e1c98176ad..0000000000 --- a/dimos/visualization/rerun/constants.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# isolated so that they can be imported into lightweight modules without importing all of rerun -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 From b98d5d0469ccf77e8f8e976fe9b4e816fce0c829 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:29:35 -0800 Subject: [PATCH 14/89] cleanup --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module.py | 3 +-- dimos/core/module_coordinator.py | 11 ++++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index e1a583b285..3f1b3031c7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall +from dimos.core.rpc_client import RpcCall, ModuleProxy from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -183,7 +183,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule: +class DockerModule(ModuleProxy): """ Host-side handle for a module running inside Docker. 
diff --git a/dimos/core/module.py b/dimos/core/module.py index 72df61d4c7..24be321ee2 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,12 +218,11 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: - _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name not in _skip + and name != "rpcs" # Exclude the rpcs property itself to prevent recursion and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3d71e8776b..c2483bdd74 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -79,11 +79,12 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module = ( - DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] - if is_docker_module(module_class) - else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - ) + + deployed_module : ModuleProxy + if is_docker_module(module_class): + deployed_module = DockerModule(module_class, *args, **kwargs) + else: + deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] From 2fed467ebee9d855a99493ceeed923388902456c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:01:11 -0800 Subject: [PATCH 15/89] fixup deploy_parallel --- dimos/core/module.py | 2 +- dimos/core/module_coordinator.py | 33 ++++++++++++++------------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dimos/core/module.py 
b/dimos/core/module.py index 24be321ee2..14aeea6da5 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -446,7 +446,7 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type @rpc def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: - """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring. + """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring. Args: streams: mapping of stream_name -> topic diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index c2483bdd74..8698af55cf 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -94,16 +94,19 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - docker_specs = [ - (module_class, args, kwargs) for module_class, args, kwargs in module_specs if is_docker_module(module_class) - ] - worker_specs = [ - (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class) - ] + # Separate docker modules from regular modules + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + for module_class, args, kwargs in module_specs: + if is_docker_module(module_class): + docker_specs.append(spec) + else: + worker_specs.append(spec) - docker_results: list[Any] = [] + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + docker_results = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: docker_results = list( @@ -111,17 +114,9 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), 
docker_specs ) ) - - # Reassemble in original order - worker_iter = iter(worker_results) - docker_iter = iter(docker_results) - results: list[Any] = [ - next(docker_iter) if is_docker_module(module_class) else next(worker_iter) - for module_class, _, _ in module_specs - ] - - for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] + + for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + self._deployed_modules[module_class] = module return results # type: ignore[return-value] def start_all_modules(self) -> None: From 9b7696bc19dffca53cca0526fa0395f3e791ac12 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:08:13 -0800 Subject: [PATCH 16/89] clean up reconnect logic --- dimos/core/docker_runner.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3f1b3031c7..c7e40f0997 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,7 +140,7 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_restart(container_name: str) -> bool: +def _prompt_reconnect(container_name: str) -> bool: """Ask the user whether to restart a running container. Returns True to restart, False to reuse. @@ -152,7 +152,7 @@ def _prompt_restart(container_name: str) -> bool: logger.warning( f"Container '{container_name}' already running — restarting (non-interactive)." 
) - return True + return False print(f"\nContainer '{container_name}' is already running.") print(" [r] Restart — stop the existing container and start a fresh one") @@ -160,9 +160,9 @@ def _prompt_restart(container_name: str) -> bool: while True: choice = input("Choice [r/u]: ").strip().lower() if choice in ("r", "restart"): - return True - if choice in ("u", "use"): return False + if choice in ("u", "use"): + return True print("Please enter 'r' or 'u'.") @@ -258,12 +258,14 @@ def start(self) -> None: try: cfg = self.config + reconnect = False if _is_container_running(cfg, self._container_name): - restart = _prompt_restart(self._container_name) - if restart: + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(cfg, self._container_name) - + if not reconnect: + _remove_container(cfg, self._container_name) + cmd = self._build_docker_run_command() logger.info(f"Starting docker container: {self._container_name}") r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) From 3ec607089a5d739f2386712056872f928743ce3d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:20:53 -0800 Subject: [PATCH 17/89] fixup --- dimos/core/module_coordinator.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8698af55cf..9689a6119b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -114,19 +114,17 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) + specs = worker_specs+docker_specs + results = worker_results+docker_results - for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + for (module_class, _, _), module in zip(specs, results, strict=True): self._deployed_modules[module_class] = module return results # 
type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) - if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: - list(executor.map(lambda m: m.start(), modules)) - else: - for module in modules: - module.start() + with ThreadPoolExecutor(max_workers=len(modules)) as executor: + list(executor.map(lambda m: m.start(), modules)) for module in modules: if hasattr(module, "on_system_modules"): From e06be8ec142e554e14ff1103b9bcc6e19619afc0 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:22:15 -0800 Subject: [PATCH 18/89] - --- dimos/core/module_coordinator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9689a6119b..2d15734b30 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -85,8 +85,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) - self._deployed_modules[module_class] = module # type: ignore[assignment] - return module # type: ignore[return-value] + self._deployed_modules[module_class] = deployed_module + return deployed_module def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] From 06181edae2b61d2d1c6abc98953772c0907e4db5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:53:32 -0800 Subject: [PATCH 19/89] fix deployment/coordinator timeline --- dimos/core/docker_runner.py | 110 ++++++++++++++++++------------------ dimos/core/module.py | 22 -------- 2 files changed, 54 insertions(+), 78 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7e40f0997..fb3fc28af7 100644 --- a/dimos/core/docker_runner.py +++ 
b/dimos/core/docker_runner.py @@ -196,12 +196,12 @@ class DockerModule(ModuleProxy): config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - # Config + from dimos.core.docker_build import build_image, image_exists + config_class = getattr(module_class, "default_config", DockerModuleConfig) assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - - # Module info + self._module_class = module_class self.config = config self._args = args @@ -212,13 +212,43 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - self._deferred_transports: dict[str, str] = {} # stream_name -> topic + + # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + try: + if not image_exists(config): + logger.info(f"Building {config.docker_image}") + build_image(config) + + reconnect = False + if _is_container_running(config, self._container_name): + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: + _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not reconnect: + _remove_container(config, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + # docker run -d returns before Module.__init__ finishes in the container, + # so we poll until the RPC server is reachable before returning. 
+ self._wait_for_rpc() + except Exception: + with suppress(Exception): + self.stop() + raise @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -236,6 +266,11 @@ def get_rpc_method_names(self) -> list[str]: def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable + # Forward to container — Module.set_rpc_method unpickles the RpcCall + # and wires it with the container's own LCMRPC + self.rpc.call_sync( + f"{self.remote_name}/set_rpc_method", ([method, callable], {}) + ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -245,38 +280,8 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: return calls[0] if len(calls) == 1 else calls def start(self) -> None: - """Invoke the remote module's start() RPC. - - Called after stream transports are wired so the module can subscribe - to its streams with valid transports. 
- """ - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(self.config): - logger.info(f"Building {self.config.docker_image}") - build_image(self.config) + """Invoke the remote module's start() RPC.""" try: - - cfg = self.config - reconnect = False - if _is_container_running(cfg, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - if not reconnect: - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - - self.rpc.start() - self._running = True - self._configure_streams(self._deferred_transports) self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) except Exception: with suppress(Exception): @@ -285,10 +290,11 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - # Signal remote module, stop RPC, unsubscribe handlers (ignore failures) + if not self._running: + return + with suppress(Exception): - if self._running: - self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: @@ -296,7 +302,6 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() - # Stop and remove container _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) _remove_container(self.config, self._container_name) self._running = False @@ -315,14 +320,11 @@ def tail_logs(self, n: int = 200) -> str: return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> 
bool: - """Defer stream transport config until start() when the container is running.""" - topic = getattr(transport, "topic", None) - if topic is None: - return False - if hasattr(topic, "topic"): - topic = topic.topic - self._deferred_transports[stream_name] = str(topic) - return True + """Forward to the container's Module.set_transport RPC.""" + result, _ = self.rpc.call_sync( + f"{self.remote_name}/set_transport", ([stream_name, transport], {}) + ) + return bool(result) def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -480,12 +482,8 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _configure_streams(self, streams: dict[str, str]) -> None: - """Poll configure_streams RPC until the container's RPC server is up, then wire streams. - - Also serves as the liveness gate — the first successful call proves the - container is ready to accept RPCs. - """ + def _wait_for_rpc(self) -> None: + """Poll until the container's RPC server is reachable.""" cfg = self.config start_time = time.time() @@ -498,8 +496,8 @@ def _configure_streams(self, streams: dict[str, str]) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/configure_streams", - ([streams], {}), + f"{self.remote_name}/get_rpc_method_names", + ([], {}), rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time diff --git a/dimos/core/module.py b/dimos/core/module.py index 14aeea6da5..af642b71bd 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -444,28 +444,6 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type stream._transport = transport return True - @rpc - def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: - """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring. 
- - Args: - streams: mapping of stream_name -> topic - - Returns: - mapping of stream_name -> success - """ - from dimos.core.transport import pLCMTransport - - results: dict[str, bool] = {} - for stream_name, topic in streams.items(): - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - results[stream_name] = False - else: - stream._transport = pLCMTransport(topic) - results[stream_name] = True - return results - # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] input_stream = getattr(self, input_name, None) From d87ab954912212cf71e0bdcde5366a011913ec30 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:32:36 -0800 Subject: [PATCH 20/89] fir enforcement of either dockerfile or image pull --- dimos/core/docker_runner.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index fb3fc28af7..c7b2528969 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -221,8 +221,17 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ try: if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + if config.docker_file is not None: + logger.info(f"Building {config.docker_image}") + build_image(config) + else: + logger.info(f"Pulling {config.docker_image}") + r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image '{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) reconnect = False if _is_container_running(config, self._container_name): From b3d24ef364f8de94aa0666b4b94d544ffaddb17d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:24 -0800 
Subject: [PATCH 21/89] fix reconnect system --- dimos/core/docker_runner.py | 49 ++++++++++--------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7b2528969..8cca64ca16 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,31 +140,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_reconnect(container_name: str) -> bool: - """Ask the user whether to restart a running container. - - Returns True to restart, False to reuse. - Falls back to restart when stdin is not a TTY (e.g. CI). - """ - import sys - - if not sys.stdin.isatty(): - logger.warning( - f"Container '{container_name}' already running — restarting (non-interactive)." - ) - return False - - print(f"\nContainer '{container_name}' is already running.") - print(" [r] Restart — stop the existing container and start a fresh one") - print(" [u] Use — attach to the existing container as-is") - while True: - choice = input("Choice [r/u]: ").strip().lower() - if choice in ("r", "restart"): - return False - if choice in ("u", "use"): - return True - print("Please enter 'r' or 'u'.") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" @@ -235,20 +210,22 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: + if config.docker_reconnect_container: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: + logger.info(f"Stopping existing container: {self._container_name}") _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not 
reconnect: _remove_container(config, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) self.rpc.start() self._running = True # docker run -d returns before Module.__init__ finishes in the container, From 83cb7c7422dbcb14168a4fd3be444beca6a6ef98 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:42 -0800 Subject: [PATCH 22/89] - --- dimos/core/docker_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 8cca64ca16..15677a0e03 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -98,6 +98,9 @@ class DockerModuleConfig(ModuleConfig): docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 + # Reconnect to a running container instead of restarting it + docker_reconnect_container: bool = False + # Advanced docker_bin: str = "docker" From d62396bda3844a05f2ae66b8fd081bfb08f7176c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:14 -0800 Subject: [PATCH 23/89] fix deploy_parallel --- dimos/core/module_coordinator.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 2d15734b30..59e1e5a657 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -94,19 +94,11 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker 
modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for module_class, args, kwargs in module_specs: - if is_docker_module(module_class): - docker_specs.append(spec) - else: - worker_specs.append(spec) + docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] + worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - docker_results = [] + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: docker_results = list( @@ -114,12 +106,13 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) - specs = worker_specs+docker_specs - results = worker_results+docker_results - - for (module_class, _, _), module in zip(specs, results, strict=True): + + results = worker_results + docker_results + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=True + ): self._deployed_modules[module_class] = module - return results # type: ignore[return-value] + return results def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From 20aa4f1e0c0964a2ad6493015e62cb857aa62367 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:38 -0800 Subject: [PATCH 24/89] better error --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1e5a657..3dda7c38b0 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -116,6 +116,8 @@ def deploy_parallel( def start_all_modules(self) -> None: modules = 
list(self._deployed_modules.values()) + if not modules: + raise ValueError("No modules deployed. Call deploy() before start_all_modules().") with ThreadPoolExecutor(max_workers=len(modules)) as executor: list(executor.map(lambda m: m.start(), modules)) From 34598539176d7f763825aa7febeeca027b833de5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:36:59 -0800 Subject: [PATCH 25/89] clean container name generation --- dimos/core/docker_runner.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 15677a0e03..d11a68e2a1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -186,9 +186,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = config.docker_container_name or self._default_container_name( - module_class, config - ) + # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" + image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_base}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -239,16 +239,6 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.stop() raise - @staticmethod - def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: - import hashlib - - name = module_class.__name__.lower() - path_hash = hashlib.sha256( - str(config.docker_file.resolve()).encode() # type: ignore[union-attr] - ).hexdigest()[:12] - return f"dimos_{name}_{path_hash}" - def get_rpc_method_names(self) -> list[str]: return self.rpc_calls From 5538b6517e6808e06596aa761a4ab457476ed66c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:43:44 -0800 Subject: [PATCH 26/89] 
fixup typing for ModuleProxy --- dimos/core/docker_runner.py | 24 +++++++++++++----------- dimos/core/module_coordinator.py | 10 +++++----- dimos/core/rpc_client.py | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index d11a68e2a1..74e7c840c8 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall, ModuleProxy +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -161,7 +161,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule(ModuleProxy): +class DockerModule(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -171,13 +171,17 @@ class DockerModule(ModuleProxy): Communication: All RPC happens via LCM multicast (requires --network=host). 
""" - config : DockerModuleConfig + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: from dimos.core.docker_build import build_image, image_exists config_class = getattr(module_class, "default_config", DockerModuleConfig) - assert issubclass(config_class, DockerModuleConfig) + if not issubclass(config_class, DockerModuleConfig): + raise TypeError( + f"{module_class.__name__}.default_config must be a DockerModuleConfig subclass, " + f"got {config_class.__name__}" + ) config = config_class(**kwargs) self._module_class = module_class @@ -196,7 +200,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + # Build or pull image, launch container, wait for RPC server try: if not image_exists(config): if config.docker_file is not None: @@ -269,9 +273,6 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: - return - with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): @@ -280,9 +281,10 @@ def stop(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self.config, self._container_name) + with suppress(Exception): + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3dda7c38b0..5534d9f9a7 100644 --- a/dimos/core/module_coordinator.py +++ 
b/dimos/core/module_coordinator.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor - from dimos.core.rpc_client import ModuleProxy + from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = setup_logger() @@ -37,7 +37,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" - _deployed_modules: dict[type[Module], ModuleProxy] + _deployed_modules: dict[type[Module], ModuleProxyProtocol] _stats_monitor: StatsMonitor | None = None def __init__( @@ -79,14 +79,14 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - deployed_module : ModuleProxy + + deployed_module: ModuleProxyProtocol if is_docker_module(module_class): deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = deployed_module - return deployed_module + return deployed_module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index e46124469c..a89c54caf0 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -13,7 +13,7 @@ # limitations under the License. 
from collections.abc import Callable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy @@ -80,7 +80,18 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] self._stop_rpc_client = None -class RPCClient: +class ModuleProxyProtocol(Protocol): + """Protocol for host-side handles to remote modules (worker or Docker).""" + + def start(self) -> None: ... + def stop(self) -> None: ... + def set_transport(self, stream_name: str, transport: Any) -> bool: ... + def get_rpc_method_names(self) -> list[str]: ... + def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... + def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... + + +class RPCClient(ModuleProxyProtocol): def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class From 52146f21fb94953cb0f086c276bd673185fed2f3 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:53:53 -0800 Subject: [PATCH 27/89] misc --- dimos/core/docker_runner.py | 6 ++--- dimos/core/module_coordinator.py | 28 ++++++++++++---------- dimos/core/tests/test_docker_deployment.py | 2 +- pyproject.toml | 4 +++- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 74e7c840c8..1a0fc718ae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -190,9 +190,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" - image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_base}" + # Derive container name from image name: 
"my-registry/foo:v2" → "dimos_foo_v2" + image_ref = config.docker_image.rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 5534d9f9a7..90538cfc0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,21 +97,25 @@ def deploy_parallel( docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + worker_results: list[Any] = [] docker_results: list[Any] = [] - if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + try: + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) ) - ) + finally: + results = worker_results + docker_results + # Register whatever succeeded so stop() can clean them up + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=False + ): + self._deployed_modules[module_class] = module - results = worker_results + docker_results - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=True - ): - self._deployed_modules[module_class] = module return results def start_all_modules(self) -> None: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 7a02682fda..e6ddbc4a73 100644 --- 
a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are in original order + # Results are worker-first, then docker assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/pyproject.toml b/pyproject.toml index dcd2a5d987..31d3322453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -321,10 +321,12 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", - "langchain-core", "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", + # these below should be removed later, right now they are needed even for running `dimos --help` (seperate non-docker issue) + "langchain-core", + "matplotlib", ] base = [ From 3cf2dff187037a59f46d1b507fb53d3a93a2024e Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 13:48:44 -0800 Subject: [PATCH 28/89] testing fixup --- dimos/core/docker_runner.py | 25 +++++++++++++++------- dimos/core/module_coordinator.py | 5 ++++- dimos/core/rpc_client.py | 2 +- dimos/core/test_core.py | 2 +- dimos/core/tests/test_docker_deployment.py | 8 +++---- uv.lock | 2 ++ 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1a0fc718ae..7ce89c40e6 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -143,7 +143,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -171,6 +170,7 @@ class 
DockerModule(ModuleProxyProtocol): Communication: All RPC happens via LCM multicast (requires --network=host). """ + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: @@ -192,7 +192,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.remote_name = module_class.__name__ # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + self._container_name = ( + config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + ) self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -208,7 +210,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) else: logger.info(f"Pulling {config.docker_image}") - r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=DOCKER_RUN_TIMEOUT, + ) if r.returncode != 0: raise RuntimeError( f"Failed to pull image '{config.docker_image}'.\n" @@ -222,7 +227,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = True else: logger.info(f"Stopping existing container: {self._container_name}") - _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) if not reconnect: _remove_container(config, self._container_name) @@ -251,9 +259,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: self._bound_rpc_calls[method] = callable # Forward to container — Module.set_rpc_method unpickles the RpcCall # and wires it with the container's own LCMRPC - self.rpc.call_sync( - f"{self.remote_name}/set_rpc_method", 
([method, callable], {}) - ) + self.rpc.call_sync(f"{self.remote_name}/set_rpc_method", ([method, callable], {})) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -282,7 +288,10 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() with suppress(Exception): - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(self.config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) with suppress(Exception): _remove_container(self.config, self._container_name) self._running = False diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 90538cfc0a..3e8ff31018 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,7 +18,6 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -77,6 +76,8 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") @@ -91,6 +92,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Not started") diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index a89c54caf0..c9e73ac54e 100644 --- a/dimos/core/rpc_client.py +++ 
b/dimos/core/rpc_client.py @@ -91,7 +91,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... -class RPCClient(ModuleProxyProtocol): +class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class diff --git a/dimos/core/test_core.py b/dimos/core/test_core.py index 197539ef67..30f14c93b4 100644 --- a/dimos/core/test_core.py +++ b/dimos/core/test_core.py @@ -80,7 +80,7 @@ def test_classmethods() -> None: # Check that we have the expected RPC methods assert "navigate_to" in class_rpcs, "navigate_to should be in rpcs" assert "start" in class_rpcs, "start should be in rpcs" - assert len(class_rpcs) == 9 + assert len(class_rpcs) == 8 # Check that the values are callable assert callable(class_rpcs["navigate_to"]), "navigate_to should be callable" diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e6ddbc4a73..f60f37a21a 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -78,7 +78,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -103,7 +103,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -139,7 +139,7 @@ def 
test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -175,7 +175,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() diff --git a/uv.lock b/uv.lock index 084e157ee5..820bb92f2d 100644 --- a/uv.lock +++ b/uv.lock @@ -1852,6 +1852,7 @@ docker = [ { name = "dimos-lcm" }, { name = "langchain-core" }, { name = "lcm" }, + { name = "matplotlib" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "open3d", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, @@ -2022,6 +2023,7 @@ requires-dist = [ { name = "lcm", marker = "extra == 'docker'" }, { name = "llvmlite", specifier = ">=0.42.0" }, { name = "lxml-stubs", marker = "extra == 'dev'", specifier = ">=0.5.1,<1" }, + { name = "matplotlib", marker = "extra == 'docker'" }, { name = "matplotlib", marker = "extra == 'manipulation'", specifier = ">=3.7.1" }, { name = "md-babel-py", marker = "extra == 'dev'", specifier = "==1.1.1" }, { name = "moondream", marker = "extra == 'perception'" }, From 8eaed575ede4a356f3f9507b742f3e214f09a687 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 15:54:12 -0800 Subject: [PATCH 29/89] maintain order --- 
dimos/core/docker_runner.py | 50 ++++++++++++++++++++++++++++---- dimos/core/module_coordinator.py | 29 +++++++++++++----- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 7ce89c40e6..776cef516d 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,6 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution +DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -136,6 +137,31 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" +def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: + """Return the container's start time as a Unix timestamp, or None on failure.""" + r = _run( + [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], + timeout=DOCKER_STATUS_TIMEOUT, + ) + if r.returncode != 0: + return None + from datetime import datetime + + try: + # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" + raw = r.stdout.strip() + # Truncate nanoseconds to microseconds for fromisoformat compatibility + if "." 
in raw: + base, frac = raw.split(".", 1) + frac = frac.rstrip("Z")[:6] + raw = f"{base}.{frac}+00:00" + else: + raw = raw.rstrip("Z") + "+00:00" + return datetime.fromisoformat(raw).timestamp() + except (ValueError, OSError): + return None + + def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -190,10 +216,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" + # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] self._container_name = ( - config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + config.docker_container_name + or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) self.rpc = LCMRPC() @@ -212,7 +239,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_RUN_TIMEOUT, + timeout=DOCKER_PULL_TIMEOUT, ) if r.returncode != 0: raise RuntimeError( @@ -223,9 +250,18 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - else: + # Verify the container hasn't restarted since we last ran + container_start = _container_started_at(config, self._container_name) + process_start = time.time() # conservative: current time as upper bound + if container_start is not None and container_start > process_start 
- 5: + logger.warning( + f"Container {self._container_name} appears to have restarted recently " + f"(started at {container_start:.0f}). Treating as fresh start." + ) + else: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + if not reconnect: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -279,6 +315,8 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" + if not self._running: + return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3e8ff31018..01f657dd1a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,8 +97,19 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] - worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] + # Split by type, tracking original indices for reassembly + docker_indices: list[int] = [] + worker_indices: list[int] = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + # the i is needed for maintaining order on the returned output + for i, spec in enumerate(module_specs): + if is_docker_module(spec[0]): + docker_indices.append(i) + docker_specs.append(spec) + else: + worker_indices.append(i) + worker_specs.append(spec) worker_results: list[Any] = [] docker_results: list[Any] = [] @@ -112,12 +123,16 @@ def deploy_parallel( ) ) finally: - results = worker_results + docker_results + # Reassemble results in original input order + results: list[Any] = [None] * len(module_specs) + for idx, mod in zip(worker_indices, 
worker_results, strict=False): + results[idx] = mod + for idx, mod in zip(docker_indices, docker_results, strict=False): + results[idx] = mod # Register whatever succeeded so stop() can clean them up - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=False - ): - self._deployed_modules[module_class] = module + for spec, module in zip(module_specs, results, strict=False): + if module is not None: + self._deployed_modules[spec[0]] = module return results From 55c234df7d4351c1e737f97396d7b4a96b0b211b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:12:44 -0800 Subject: [PATCH 30/89] refine --- dimos/core/docker_runner.py | 46 ++++----------------- dimos/core/tests/test_docker_deployment.py | 2 +- examples/docker_hello_world/hello_docker.py | 1 + 3 files changed, 10 insertions(+), 39 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 776cef516d..aacdbe7c19 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -137,31 +137,6 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" -def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: - """Return the container's start time as a Unix timestamp, or None on failure.""" - r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], - timeout=DOCKER_STATUS_TIMEOUT, - ) - if r.returncode != 0: - return None - from datetime import datetime - - try: - # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" - raw = r.stdout.strip() - # Truncate nanoseconds to microseconds for fromisoformat compatibility - if "." 
in raw: - base, frac = raw.split(".", 1) - frac = frac.rstrip("Z")[:6] - raw = f"{base}.{frac}+00:00" - else: - raw = raw.rstrip("Z") + "+00:00" - return datetime.fromisoformat(raw).timestamp() - except (ValueError, OSError): - return None - - def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -250,18 +225,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - # Verify the container hasn't restarted since we last ran - container_start = _container_started_at(config, self._container_name) - process_start = time.time() # conservative: current time as upper bound - if container_start is not None and container_start > process_start - 5: - logger.warning( - f"Container {self._container_name} appears to have restarted recently " - f"(started at {container_start:.0f}). Treating as fresh start." - ) - else: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - if not reconnect: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -284,7 +250,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._wait_for_rpc() except Exception: with suppress(Exception): - self.stop() + self._cleanup() raise def get_rpc_method_names(self) -> list[str]: @@ -319,6 +285,10 @@ def stop(self) -> None: return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self._cleanup() + + def _cleanup(self) -> None: + """Release all resources. 
Safe to call multiple times or from partial init.""" with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index f60f37a21a..95db171e1c 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are worker-first, then docker + # Results preserve input order assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 187384854e..eb4765a629 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -76,6 +76,7 @@ def _cowsay(self, text: str) -> str: ["/usr/games/cowsay", text], capture_output=True, text=True, + check=True, ) return result.stdout From d9d4716ec159f286dc86a00e3937cb005f091093 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:37:59 -0800 Subject: [PATCH 31/89] make pull out configurable --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index aacdbe7c19..89fa9d9af3 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) +DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command 
(graceful shutdown) @@ -95,7 +95,8 @@ class DockerModuleConfig(ModuleConfig): docker_command: list[str] | None = None docker_extra_args: list[str] = field(default_factory=list) - # Startup readiness + # Timeouts + docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 @@ -214,7 +215,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_PULL_TIMEOUT, + timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( From 215e9ba7aa732d8ac974962b675164fc98496dd2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:38:25 -0800 Subject: [PATCH 32/89] have example show using normal config --- examples/docker_hello_world/hello_docker.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index eb4765a629..66e95df316 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -56,6 +56,9 @@ class HelloDockerConfig(DockerModuleConfig): docker_restart_policy: str = "no" docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) + # Custom (non-docker) config field — passed to the container via JSON + greeting_prefix: str = "Hello" + class HelloDockerModule(Module["HelloDockerConfig"]): """A trivial module that runs inside Docker and echoes greetings.""" @@ -88,7 +91,13 @@ def _on_prompt(self, text: str) -> None: @rpc def greet(self, name: str) -> str: """RPC method that can be called directly.""" - return self._cowsay(f"Hello, {name}!") + prefix = self.config.greeting_prefix + return self._cowsay(f"{prefix}, {name}!") + + @rpc + def get_greeting_prefix(self) -> str: + """Return the config value to verify it was passed to the 
container.""" + return self.config.greeting_prefix # --------------------------------------------------------------------------- @@ -125,14 +134,19 @@ def _on_greeting(self, text: str) -> None: coordinator = autoconnect( PromptModule.blueprint(), - HelloDockerModule.blueprint(), + HelloDockerModule.blueprint(greeting_prefix="Howdy"), ).build() # Get module proxies prompt_mod = coordinator.get_instance(PromptModule) docker_mod = coordinator.get_instance(HelloDockerModule) - # Test RPC + # Test that custom config was passed to the container + prefix = docker_mod.get_greeting_prefix() + assert prefix == "Howdy", f"Expected 'Howdy', got {prefix!r}" + print(f"Config passed to container: greeting_prefix={prefix!r}") + + # Test RPC (should use the custom prefix) print(docker_mod.greet("World")) # Test stream From 1f8ab0a31ef2da34012b41eecbd8eee323b5c3fc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 17:55:27 -0800 Subject: [PATCH 33/89] Add DockerWorkerManager --- dimos/core/docker_runner.py | 7 ++- dimos/core/docker_worker_manager.py | 59 ++++++++++++++++++++++ dimos/core/module_coordinator.py | 13 ++--- dimos/core/tests/test_docker_deployment.py | 10 ++-- 4 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 89fa9d9af3..26d822ce73 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -110,7 +110,11 @@ class DockerModuleConfig(ModuleConfig): def is_docker_module(module_class: type) -> bool: """Check if a module class should run in Docker based on its default_config.""" default_config = getattr(module_class, "default_config", None) - return default_config is not None and issubclass(default_config, DockerModuleConfig) + return ( + default_config is not None + and isinstance(default_config, type) + and issubclass(default_config, DockerModuleConfig) + ) # Docker helpers @@ -284,6 +288,7 @@ def stop(self) -> 
None: """Gracefully stop the Docker container and clean up resources.""" if not self._running: return + self._running = False # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..52317d984b --- /dev/null +++ b/dimos/core/docker_worker_manager.py @@ -0,0 +1,59 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from dimos.core.docker_runner import DockerModule + from dimos.core.module import Module + + +class DockerWorkerManager: + """Parallel deployment of Docker-backed modules.""" + + @staticmethod + def deploy_parallel( + specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + ) -> list[DockerModule]: + """Deploy multiple DockerModules in parallel, collecting partial results on failure. + + Returns all successfully-created DockerModules. If any deployment fails, + the successful ones are still returned (so the caller can register them + for cleanup), and the first exception is re-raised. 
+ """ + from dimos.core.docker_runner import DockerModule + + results: dict[int, DockerModule] = {} + first_exc: Exception | None = None + + with ThreadPoolExecutor(max_workers=len(specs)) as executor: + futures: dict[Future[DockerModule], int] = { + executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i + for i, spec in enumerate(specs) + } + for fut in as_completed(futures): + idx = futures[fut] + try: + results[idx] = fut.result() + except Exception as e: + if first_exc is None: + first_exc = e + + # Return in input order (missing indices = failed deployments) + ordered = [results[i] for i in sorted(results)] + if first_exc is not None: + raise first_exc + return ordered diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 01f657dd1a..4ede195571 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,7 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -76,6 +77,7 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator from dimos.core.docker_runner import DockerModule, is_docker_module if not self._client: @@ -92,7 +94,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: - from dimos.core.docker_runner import DockerModule, is_docker_module + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → 
module_coordinator + from dimos.core.docker_runner import is_docker_module if not self._client: raise ValueError("Not started") @@ -102,7 +105,6 @@ def deploy_parallel( worker_indices: list[int] = [] docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - # the i is needed for maintaining order on the returned output for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): docker_indices.append(i) @@ -116,12 +118,7 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs - ) - ) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 95db171e1c..17d1290916 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -139,10 +139,10 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( - self, mock_worker_manager_cls, mock_docker_module_cls + self, mock_worker_manager_cls, mock_docker_deploy ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -151,7 +151,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] mock_dm = MagicMock() - 
mock_docker_module_cls.return_value = mock_dm + mock_docker_deploy.return_value = [mock_dm] coordinator = ModuleCoordinator() coordinator.start() @@ -164,8 +164,8 @@ def test_deploy_parallel_separates_docker_and_regular( # Regular module goes through worker manager mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker module gets its own DockerModule - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + # Docker specs go through DockerWorkerManager + mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() From 4536ce12c1efffd47004cb57443c7a6f9cfda65a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:44:22 -0800 Subject: [PATCH 34/89] add proper cleanup handling if a module fails to deploy correctly --- dimos/core/docker_worker_manager.py | 43 ++-- dimos/core/module_coordinator.py | 11 +- .../tests/test_parallel_deploy_cleanup.py | 219 ++++++++++++++++++ dimos/core/worker_manager.py | 30 ++- dimos/utils/safe_thread_map.py | 92 ++++++++ 5 files changed, 350 insertions(+), 45 deletions(-) create mode 100644 dimos/core/tests/test_parallel_deploy_cleanup.py create mode 100644 dimos/utils/safe_thread_map.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 52317d984b..b70ff3ba52 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -13,9 +13,11 @@ # limitations under the License. 
from __future__ import annotations -from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.utils.safe_thread_map import safe_thread_map + if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule from dimos.core.module import Module @@ -28,32 +30,21 @@ class DockerWorkerManager: def deploy_parallel( specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], ) -> list[DockerModule]: - """Deploy multiple DockerModules in parallel, collecting partial results on failure. + """Deploy multiple DockerModules in parallel. - Returns all successfully-created DockerModules. If any deployment fails, - the successful ones are still returned (so the caller can register them - for cleanup), and the first exception is re-raised. + If any deployment fails, all successfully-started containers are + stopped before an ExceptionGroup is raised. """ from dimos.core.docker_runner import DockerModule - results: dict[int, DockerModule] = {} - first_exc: Exception | None = None - - with ThreadPoolExecutor(max_workers=len(specs)) as executor: - futures: dict[Future[DockerModule], int] = { - executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i - for i, spec in enumerate(specs) - } - for fut in as_completed(futures): - idx = futures[fut] - try: - results[idx] = fut.result() - except Exception as e: - if first_exc is None: - first_exc = e - - # Return in input order (missing indices = failed deployments) - ordered = [results[i] for i in sorted(results)] - if first_exc is not None: - raise first_exc - return ordered + def _on_errors( + _outcomes: list, successes: list[DockerModule], errors: list[Exception] + ) -> None: + for mod in successes: + with suppress(Exception): + mod.stop() + raise ExceptionGroup("docker deploy_parallel failed", errors) + + return safe_thread_map( + specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + ) diff --git 
a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 4ede195571..48546c5568 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,12 +113,9 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - worker_results: list[Any] = [] - docker_results: list[Any] = [] try: - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - if docker_specs: - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + worker_results = self._client.deploy_parallel(worker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) @@ -127,9 +124,9 @@ def deploy_parallel( for idx, mod in zip(docker_indices, docker_results, strict=False): results[idx] = mod # Register whatever succeeded so stop() can clean them up - for spec, module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: - self._deployed_modules[spec[0]] = module + self._deployed_modules[module_class] = module return results diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py new file mode 100644 index 0000000000..1987fa4be7 --- /dev/null +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -0,0 +1,219 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests that deploy_parallel cleans up successfully-started modules when a +sibling deployment fails ("middle module throws" scenario). +""" + +from __future__ import annotations + +import threading +from unittest.mock import MagicMock, patch + +import pytest + + +class TestDockerWorkerManagerPartialFailure: + """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" + + @patch("dimos.core.docker_runner.DockerModule") + def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): + """Deploy 3 modules where the middle one fails. The other two must be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_c = MagicMock(name="ModuleC") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed to start") + return mod_a if label == "A" else mod_c + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 1 + assert "B failed to start" in str(exc_info.value.exceptions[0]) + + # Both successful modules must have been stopped exactly once + mod_a.stop.assert_called_once() + mod_c.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): + """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed") + if label == "C": + raise ValueError("C failed") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 2 + messages = {str(e) for e in exc_info.value.exceptions} + assert "B failed" in messages + assert "C failed" in messages + + # The one successful module must have been stopped + mod_a.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_all_succeed_no_stops(self, mock_docker_module_cls): + """When all deployments succeed, no modules should be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mocks = [MagicMock(name=f"Mod{i}") for i in range(3)] + + def fake_constructor(cls, *args, **kwargs): + return mocks[["A", "B", "C"].index(cls.__name__)] + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + results = DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(results) == 3 + for m in mocks: + m.stop.assert_not_called() + + @patch("dimos.core.docker_runner.DockerModule") + def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): + """If stop() itself raises during cleanup, the original deploy error still propagates.""" + from 
dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_a.stop.side_effect = OSError("stop failed") + + barrier = threading.Barrier(2, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + barrier.wait() + if cls.__name__ == "B": + raise RuntimeError("B exploded") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed"): + DockerWorkerManager.deploy_parallel([(FakeA, (), {}), (FakeB, (), {})]) + + # stop was attempted despite it raising + mod_a.stop.assert_called_once() + + +class TestWorkerManagerPartialFailure: + """WorkerManager.deploy_parallel must clean up successful RPCClients when one fails.""" + + def test_middle_module_fails_cleans_up_siblings(self): + from dimos.core.worker_manager import WorkerManager + + manager = WorkerManager(n_workers=2) + + mock_workers = [MagicMock(name=f"Worker{i}") for i in range(2)] + for w in mock_workers: + w.module_count = 0 + w.reserve_slot = MagicMock( + side_effect=lambda w=w: setattr(w, "module_count", w.module_count + 1) + ) + + manager._workers = mock_workers + manager._started = True + + def fake_deploy_module(module_class, args=(), kwargs=None): + if module_class.__name__ == "B": + raise RuntimeError("B failed to deploy") + return MagicMock(name=f"actor_{module_class.__name__}") + + for w in mock_workers: + w.deploy_module = fake_deploy_module + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + rpc_clients_created: list[MagicMock] = [] + + with patch("dimos.core.worker_manager.RPCClient") as mock_rpc_cls: + + def make_rpc(actor, cls): + client = MagicMock(name=f"rpc_{cls.__name__}") + rpc_clients_created.append(client) + return client + + mock_rpc_cls.side_effect = make_rpc + + with pytest.raises(ExceptionGroup, match="worker deploy_parallel failed"): + manager.deploy_parallel( 
+ [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + # Every successfully-created RPC client must have been cleaned up exactly once + for client in rpc_clients_created: + client.stop_rpc_client.assert_called_once() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 4dbb51eb54..25a052590c 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -14,12 +14,13 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT @@ -65,6 +66,9 @@ def deploy_parallel( if self._closed: raise RuntimeError("WorkerManager is closed") + if len(module_specs) == 0: + return [] + # Auto-start for backward compatibility if not self._started: self.start() @@ -78,17 +82,19 @@ def deploy_parallel( worker.reserve_slot() assignments.append((worker, module_class, args, kwargs)) - def _deploy( - item: tuple[Worker, type[ModuleT], tuple[Any, ...], dict[Any, Any]], - ) -> RPCClient: - worker, module_class, args, kwargs = item - actor = worker.deploy_module(module_class, args=args, kwargs=kwargs) - return RPCClient(actor, module_class) - - with ThreadPoolExecutor(max_workers=len(assignments)) as pool: - results = list(pool.map(_deploy, assignments)) - - return results + def _on_errors( + _outcomes: list, successes: list[RPCClient], errors: list[Exception] + ) -> None: + for rpc_client in successes: + with suppress(Exception): + rpc_client.stop_rpc_client() + raise ExceptionGroup("worker deploy_parallel failed", errors) + + return safe_thread_map( + assignments, + lambda item: RPCClient(item[0].deploy_module(item[1], item[2], item[3]), item[1]), + _on_errors, + ) @property def 
workers(self) -> list[Worker]: diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py new file mode 100644 index 0000000000..f051b0d950 --- /dev/null +++ b/dimos/utils/safe_thread_map.py @@ -0,0 +1,92 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any, TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable, Sequence + +T = TypeVar("T") +R = TypeVar("R") + + +def safe_thread_map( + items: Sequence[T], + fn: Callable[[T], R], + on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] + | None = None, +) -> list[R]: + """Thread-pool map that waits for all items to finish before raising and a cleanup handler + + - Empty *items* → returns ``[]`` immediately. + - All succeed → returns results in input order. + - Any fail → calls ``on_errors(outcomes, successes, errors)`` where + *outcomes* is a list of ``(input, result_or_exception)`` pairs in input + order, *successes* is the list of successful results, and *errors* is + the list of exceptions. If *on_errors* raises, that exception propagates. + If *on_errors* returns normally, its return value is returned from + ``safe_thread_map``. If *on_errors* is ``None``, raises an + ``ExceptionGroup``. 
+ + Example:: + + def start_service(name: str) -> Connection: + return connect(name) + + def cleanup( + outcomes: list[tuple[str, Connection | Exception]], + successes: list[Connection], + errors: list[Exception], + ) -> None: + for conn in successes: + conn.close() + raise ExceptionGroup("failed to start services", errors) + + connections = safe_thread_map( + ["db", "cache", "queue"], + start_service, + cleanup, # called only if any start_service() raises + ) + """ + if not items: + return [] + + outcomes: dict[int, R | Exception] = {} + + with ThreadPoolExecutor(max_workers=len(items)) as pool: + futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)} + for fut in as_completed(futures): + idx = futures[fut] + try: + outcomes[idx] = fut.result() + except Exception as e: + outcomes[idx] = e + + successes: list[R] = [] + errors: list[Exception] = [] + for v in outcomes.values(): + if isinstance(v, Exception): + errors.append(v) + else: + successes.append(v) + + if errors: + if on_errors is not None: + zipped = [(items[i], outcomes[i]) for i in range(len(items))] + return on_errors(zipped, successes, errors) # type: ignore[return-value] + raise ExceptionGroup("safe_thread_map failed", errors) + + return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 59c5cc065e30f355d0011bf00b18ae31994774fd Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:48:13 -0800 Subject: [PATCH 35/89] mypy fixup --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 4 ++-- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index b70ff3ba52..34183fda9f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,7 +38,7 @@ def deploy_parallel( from dimos.core.docker_runner import DockerModule def 
_on_errors( - _outcomes: list, successes: list[DockerModule], errors: list[Exception] + _outcomes: list[Any], successes: list[DockerModule], errors: list[Exception] ) -> None: for mod in successes: with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 48546c5568..8269a47bf9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -115,13 +115,13 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): + for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 25a052590c..b9c25c8445 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -83,7 +83,7 @@ def deploy_parallel( assignments.append((worker, module_class, args, kwargs)) def _on_errors( - _outcomes: list, successes: list[RPCClient], errors: list[Exception] + _outcomes: list[Any], successes: list[RPCClient], errors: list[Exception] ) -> None: for rpc_client in successes: with suppress(Exception): diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index f051b0d950..240f5e7099 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -86,7 +86,7 @@ def cleanup( if errors: if on_errors is not None: zipped = [(items[i], outcomes[i]) for i in 
range(len(items))] - return on_errors(zipped, successes, errors) # type: ignore[return-value] + return on_errors(zipped, successes, errors) # type: ignore[return-value, no-any-return] raise ExceptionGroup("safe_thread_map failed", errors) return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 5d46c8b659f99c74a6f1aa55b025d396ec4c23a4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 22:18:08 -0800 Subject: [PATCH 36/89] - --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8269a47bf9..cbcdb179e9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,6 +113,8 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + worker_results: list[Any] = [] + docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] From c72b380ba8fab997e09a4fa6406067226a715533 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:35:53 -0800 Subject: [PATCH 37/89] add docker_build_ssh and image rebuild check --- dimos/core/docker_build.py | 41 +++++++++++++++++++++++++++++++++++++ dimos/core/docker_runner.py | 36 ++++++++++++++++++++------------ 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 7ee90fc5c3..2679450269 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -20,6 +20,7 @@ from __future__ import annotations +import hashlib import subprocess from typing import TYPE_CHECKING @@ -90,14 +91,52 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted +_BUILD_HASH_LABEL = "dimos.build.hash" + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents, build args, and build context path.""" + assert 
cfg.docker_file is not None + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + return digest.hexdigest() + + +def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = _run( + [ + docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + image_name, + ], + timeout=DOCKER_CMD_TIMEOUT, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + def build_image(cfg: DockerModuleConfig) -> None: """Build Docker image using footer mode conversion.""" if cfg.docker_file is None: raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + if cfg.docker_build_ssh: + cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) cmd.append(str(context)) @@ -115,6 +154,8 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", + "_compute_build_hash", + "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 26d822ce73..4a19746c5e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -54,6 +54,8 @@ class DockerModuleConfig(ModuleConfig): For advanced Docker options not listed here, use docker_extra_args. 
Example: docker_extra_args=["--cap-add=SYS_ADMIN", "--read-only"] + + NOTE: a DockerModule will rebuild automatically if the Dockerfile or build args change """ # Build / image @@ -61,6 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) + docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) # Identity docker_container_name: str | None = None @@ -180,7 +183,12 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - from dimos.core.docker_build import build_image, image_exists + from dimos.core.docker_build import ( + _compute_build_hash, + _get_image_build_hash, + build_image, + image_exists, + ) config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): @@ -211,21 +219,23 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build or pull image, launch container, wait for RPC server try: - if not image_exists(config): - if config.docker_file is not None: + if config.docker_file is not None: + current_hash = _compute_build_hash(config) + stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) - else: - logger.info(f"Pulling {config.docker_image}") - r = _run( - [_docker_bin(config), "pull", config.docker_image], - timeout=config.docker_pull_timeout, + elif not image_exists(config): + logger.info(f"Pulling {config.docker_image}") + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=config.docker_pull_timeout, + ) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image 
'{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) - if r.returncode != 0: - raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) reconnect = False if _is_container_running(config, self._container_name): From 6e0a5c5886af3f311b8317fcc5028016eeb8a256 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:42:48 -0800 Subject: [PATCH 38/89] simplify --- dimos/core/docker_build.py | 52 ++++++++++++++----------------------- dimos/core/docker_runner.py | 21 ++++++--------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 2679450269..d3fbcec685 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -33,10 +33,11 @@ logger = setup_logger() -# Timeout for quick Docker commands +_BUILD_HASH_LABEL = "dimos.build.hash" + DOCKER_CMD_TIMEOUT = 20 -# Sentinel value to detect already-converted Dockerfiles (UUID ensures uniqueness) +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" # Footer appended to Dockerfiles for DimOS module conversion @@ -54,28 +55,6 @@ """ -def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: - """Run a command and return the result.""" - return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) - - -def _run_streaming(cmd: list[str]) -> int: - """Run command and stream output to terminal. 
Returns exit code.""" - result = subprocess.run(cmd, text=True) - return result.returncode - - -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path.""" - return cfg.docker_bin or "docker" - - -def _image_exists(docker_bin: str, image_name: str) -> bool: - """Check if a Docker image exists locally.""" - r = _run([docker_bin, "image", "inspect", image_name], timeout=DOCKER_CMD_TIMEOUT) - return r.returncode == 0 - - def _convert_dockerfile(dockerfile: Path) -> Path: """Append DimOS footer to Dockerfile. Returns path to converted file.""" content = dockerfile.read_text() @@ -91,9 +70,6 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted -_BUILD_HASH_LABEL = "dimos.build.hash" - - def _compute_build_hash(cfg: DockerModuleConfig) -> str: """Hash Dockerfile contents, build args, and build context path.""" assert cfg.docker_file is not None @@ -106,7 +82,7 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: """Read the build hash label from an existing Docker image.""" - r = _run( + r = subprocess.run( [ docker_bin, "image", @@ -115,7 +91,10 @@ def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', image_name, ], + capture_output=True, + text=True, timeout=DOCKER_CMD_TIMEOUT, + check=False, ) if r.returncode != 0: return None @@ -133,7 +112,7 @@ def build_image(cfg: DockerModuleConfig) -> None: dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) if cfg.docker_build_ssh: cmd.extend(["--ssh", "default"]) @@ -142,14 +121,21 @@ def build_image(cfg: DockerModuleConfig) -> None: 
cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - exit_code = _run_streaming(cmd) - if exit_code != 0: - raise RuntimeError(f"Docker build failed with exit code {exit_code}") + result = subprocess.run(cmd, text=True) + if result.returncode != 0: + raise RuntimeError(f"Docker build failed with exit code {result.returncode}") def image_exists(cfg: DockerModuleConfig) -> bool: """Check if the configured Docker image exists locally.""" - return _image_exists(_docker_bin(cfg), cfg.docker_image) + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 __all__ = [ diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 4a19746c5e..c81d4367bc 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -128,25 +128,20 @@ def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.Complete return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path, defaulting to 'docker' if empty/None.""" - return cfg.docker_bin or "docker" - - def _remove_container(cfg: DockerModuleConfig, name: str) -> None: - _run([_docker_bin(cfg), "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) + _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.Running}}", name], + [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], timeout=DOCKER_STATUS_TIMEOUT, ) return r.returncode == 0 and r.stdout.strip() == "true" def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: - r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) + r = _run([cfg.docker_bin, "logs", "--tail", 
str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() err = (r.stderr or "").rstrip() return out + ("\n" + err if err else "") @@ -221,14 +216,14 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") r = _run( - [_docker_bin(config), "pull", config.docker_image], + [config.docker_bin, "pull", config.docker_image], timeout=config.docker_pull_timeout, ) if r.returncode != 0: @@ -245,7 +240,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non else: logger.info(f"Stopping existing container: {self._container_name}") _run( - [_docker_bin(config), "stop", self._container_name], + [config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) @@ -313,7 +308,7 @@ def _cleanup(self) -> None: self._unsub_fns.clear() with suppress(Exception): _run( - [_docker_bin(self.config), "stop", self._container_name], + [self.config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) with suppress(Exception): @@ -353,7 +348,7 @@ def _build_docker_run_command(self) -> list[str]: cfg = self.config self._validate_config(cfg) - cmd = [_docker_bin(cfg), "run", "-d"] + cmd = [cfg.docker_bin, "run", "-d"] self._add_lifecycle_args(cmd, cfg) self._add_network_args(cmd, cfg) self._add_port_args(cmd, cfg) From 2b4adaeb06ca9fa0cd45334d8d1c122453e47fed Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 23:07:30 -0800 Subject: [PATCH 39/89] misc --- dimos/core/docker_build.py | 19 +++++++++++-------- dimos/core/docker_runner.py | 2 +- 
dimos/core/module_coordinator.py | 9 +++++++-- dimos/utils/safe_thread_map.py | 2 ++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index d3fbcec685..036c4cfd6c 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,25 +71,26 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and build context path.""" + """Hash Dockerfile contents, build args, and SSH flag.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) + digest.update(f"ssh={cfg.docker_build_ssh}".encode()) return digest.hexdigest() -def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: """Read the build hash label from an existing Docker image.""" r = subprocess.run( [ - docker_bin, + cfg.docker_bin, "image", "inspect", "-f", '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - image_name, + cfg.docker_image, ], capture_output=True, text=True, @@ -121,9 +122,13 @@ def build_image(cfg: DockerModuleConfig) -> None: cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - result = subprocess.run(cmd, text=True) + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. 
+ result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) if result.returncode != 0: - raise RuntimeError(f"Docker build failed with exit code {result.returncode}") + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) def image_exists(cfg: DockerModuleConfig) -> bool: @@ -140,8 +145,6 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", - "_compute_build_hash", - "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c81d4367bc..97dbe5e209 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -216,7 +216,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) + stored_hash = _get_image_build_hash(config) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index cbcdb179e9..7e42f566fa 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,19 +113,24 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + # Intentionally sequential: worker deploys first, then docker. + # Both internally parallelize across their own items. Running them + # concurrently would add complexity for minimal gain since they use + # different resource pools (processes vs containers). 
worker_results: list[Any] = [] docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: - # Reassemble results in original input order + # Reassemble whatever succeeded into original input order so + # stop() can clean them up even if a later deploy raised. + # zip(strict=False) safely handles partial results (empty lists). results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod - # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 240f5e7099..6729c989f3 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -75,6 +75,8 @@ def cleanup( except Exception as e: outcomes[idx] = e + # Note: successes/errors are in completion order, not input order. + # This is fine — on_errors only needs them for cleanup, not ordering. 
successes: list[R] = [] errors: list[Exception] = [] for v in outcomes.values(): From cb83f9aca2cc866f3f74f8d18f61d6fc0a6cc926 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 00:11:47 -0800 Subject: [PATCH 40/89] add docker_build_extra_args --- dimos/core/docker_build.py | 6 +++--- dimos/core/docker_runner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 036c4cfd6c..5b54ecbf22 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -77,7 +77,8 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) - digest.update(f"ssh={cfg.docker_build_ssh}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) return digest.hexdigest() @@ -115,10 +116,9 @@ def build_image(cfg: DockerModuleConfig) -> None: context = cfg.docker_build_context or cfg.docker_file.parent cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - if cfg.docker_build_ssh: - cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 97dbe5e209..a72718b564 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -63,7 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) - docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) + 
docker_build_extra_args: list[str] = field(default_factory=list) # Extra args for docker build # Identity docker_container_name: str | None = None From c74c5b907fc2f5447a6d1397a7de6c43119da63f Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:49:39 -0800 Subject: [PATCH 41/89] PR review fixes: better error messages, consistent API, restore install.sh - Include docker_build_ssh in build hash so toggling SSH triggers rebuild - Capture stderr on build failure for actionable error messages - Change _get_image_build_hash to take cfg instead of raw docker_bin str - Remove private names from __all__ in docker_build.py - Add helpful TypeError when DockerModule payload isn't JSON-serializable - Replace ThreadPoolExecutor.map in start_all_modules with safe_thread_map to surface all failures via ExceptionGroup instead of losing all but first - Restore scripts/install.sh and README.md (accidentally removed) - Add intent comments on deploy_parallel and safe_thread_map design choices --- dimos/core/docker_runner.py | 10 +++++++++- dimos/core/module_coordinator.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index a72718b564..6d12705521 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -489,7 +489,15 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} # DimOS base image entrypoint already runs "dimos.core.docker_runner run" - return ["--payload", json.dumps(payload, separators=(",", ":"))] + try: + payload_json = json.dumps(payload, separators=(",", ":")) + except TypeError as e: + raise TypeError( + f"Cannot serialize DockerModule payload to JSON: {e}\n" + f"Ensure all constructor args/kwargs for {self._module_class.__name__} are " + f"JSON-serializable, or use docker_command to bypass 
automatic payload generation." + ) from e + return ["--payload", payload_json] def _wait_for_rpc(self) -> None: """Poll until the container's RPC server is reachable.""" diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ac693c1795..6c639117bc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -14,7 +14,6 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor import threading from typing import TYPE_CHECKING, Any @@ -173,11 +172,18 @@ def deploy_parallel( return results def start_all_modules(self) -> None: + from dimos.utils.safe_thread_map import safe_thread_map + modules = list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") - with ThreadPoolExecutor(max_workers=len(modules)) as executor: - list(executor.map(lambda m: m.start(), modules)) + + def _on_start_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + raise ExceptionGroup("start_all_modules failed", errors) + + safe_thread_map(modules, lambda m: m.start(), _on_start_errors) for module in modules: if hasattr(module, "on_system_modules"): From 45ee6fe1501d701ebc6669d8d18735e36907f9c0 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:56:50 -0800 Subject: [PATCH 42/89] fix pull problem --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6d12705521..987e834eae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -222,14 +222,15 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") - r = _run( + r = subprocess.run( [config.docker_bin, "pull", config.docker_image], + text=True, + stderr=subprocess.PIPE, 
timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + f"Failed to pull image '{config.docker_image}'.\nSTDERR:\n{r.stderr}" ) reconnect = False From 029a8633d579460656bb270ec50dbb30740e129f Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:31:00 -0800 Subject: [PATCH 43/89] fix reconnect edgecase and __getattr__ loop edgecase --- dimos/core/docker_runner.py | 22 ++--- dimos/core/tests/test_docker_deployment.py | 97 ++++++++++++++++++++++ 2 files changed, 109 insertions(+), 10 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 987e834eae..db5f804659 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -307,15 +307,16 @@ def _cleanup(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - with suppress(Exception): - _run( - [self.config.docker_bin, "stop", self._container_name], - timeout=DOCKER_STOP_TIMEOUT, - ) - with suppress(Exception): - _remove_container(self.config, self._container_name) + if not self.config.docker_reconnect_container: + with suppress(Exception): + _run( + [self.config.docker_bin, "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False - logger.info(f"Stopped container: {self._container_name}") + logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: cfg = self.config @@ -337,10 +338,11 @@ def set_transport(self, stream_name: str, transport: Any) -> bool: return bool(result) def __getattr__(self, name: str) -> Any: - if name in self.rpcs: + rpcs = self.__dict__.get("rpcs") + if rpcs is not None and name in rpcs: original_method = getattr(self._module_class, name, None) return RpcCall(original_method, self.rpc, name, self.remote_name, self._unsub_fns, None) - 
raise AttributeError(f"{name} not found on {self._module_class.__name__}") + raise AttributeError(f"{name} not found on {type(self).__name__}") # Docker command building (split into focused helpers for readability) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 17d1290916..e89b88e327 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -193,3 +193,100 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke assert mock_dm.stop.call_count == 1 # Worker manager also closed mock_worker_mgr.close_all.assert_called_once() + + +class TestDockerModuleGetattr: + """Tests for DockerModule.__getattr__ avoiding infinite recursion.""" + + def test_getattr_no_recursion_when_rpcs_not_set(self): + """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure + with pytest.raises(AttributeError): + _ = dm.some_method + + def test_getattr_no_recursion_on_cleanup_attrs(self): + """Accessing cleanup-related attrs before they exist must raise, not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse + for attr in ("rpc", "config", "_container_name", "_unsub_fns"): + with pytest.raises(AttributeError): + getattr(dm, attr) + + def test_getattr_delegates_to_rpc_when_rpcs_set(self): + from dimos.core.docker_runner import DockerModule + from dimos.core.rpc_client import RpcCall + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + # _module_class needs a real method with __name__ for RpcCall + class FakeMod: + def do_thing(self) -> None: ... 
+ + dm._module_class = FakeMod + dm.rpc = MagicMock() + dm.remote_name = "FakeMod" + dm._unsub_fns = [] + + result = dm.do_thing + assert isinstance(result, RpcCall) + + def test_getattr_raises_for_unknown_method(self): + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + with pytest.raises(AttributeError, match="not found"): + _ = dm.nonexistent + + +class TestDockerModuleCleanupReconnect: + """Tests for DockerModule._cleanup with docker_reconnect_container.""" + + def test_cleanup_skips_stop_when_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # reconnect mode: should NOT stop/rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=True) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_not_called() + mock_rm.assert_not_called() + + def test_cleanup_stops_container_when_not_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # normal mode: should stop and rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=False) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_called_once() # docker stop + mock_rm.assert_called_once() # docker rm -f From 5106445d9e203c207336a96b672461a104755e29 Mon Sep 17 
00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:36:00 -0800 Subject: [PATCH 44/89] change the ignore postfix --- .gitignore | 1 - dimos/core/docker_build.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 12b2f19ca3..4045db012e 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,6 @@ package-lock.json # Ignore build artifacts dist/ build/ -.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 5b54ecbf22..1e357d987b 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -65,7 +65,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: logger.info(f"Converting {dockerfile.name} to DimOS format") - converted = dockerfile.parent / f".{dockerfile.name}.dimos" + converted = dockerfile.parent / f".{dockerfile.name}.ignore" converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) return converted From 882976167cb664909c0b2ec6ecbd48607814f001 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 17:31:53 -0800 Subject: [PATCH 45/89] fix docker defaults, make deploy better --- dimos/core/docker_build.py | 2 +- dimos/core/docker_runner.py | 15 +++++----- dimos/core/module_coordinator.py | 49 +++++++++++++++++++------------- dimos/core/worker.py | 38 +++++++++++++------------ 4 files changed, 58 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 1e357d987b..24fd2b3e44 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,7 +71,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and SSH flag.""" + """Hash Dockerfile contents and build args.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) diff --git a/dimos/core/docker_runner.py 
b/dimos/core/docker_runner.py index db5f804659..6f0b2e777c 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -77,9 +77,9 @@ class DockerModuleConfig(ModuleConfig): ) # (host, container, proto) # Runtime resources - docker_gpus: str | None = "all" - docker_shm_size: str = "2g" - docker_restart_policy: str = "on-failure:3" + docker_gpus: str | None = None + docker_shm_size: str = "4g" + docker_restart_policy: str = "no" # Env + volumes + devices docker_env_files: list[str] = field(default_factory=list) @@ -300,14 +300,15 @@ def stop(self) -> None: self._cleanup() def _cleanup(self) -> None: - """Release all resources. Safe to call multiple times or from partial init.""" + """Release all resources. Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in self._unsub_fns: + for unsub in getattr(self, "_unsub_fns", []): with suppress(Exception): unsub() - self._unsub_fns.clear() - if not self.config.docker_reconnect_container: + with suppress(Exception): + self._unsub_fns.clear() + if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( [self.config.docker_bin, "stop", self._container_name], diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 6c639117bc..59e1013175 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,6 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT @@ -147,33 +148,41 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - # Intentionally sequential: worker deploys first, then docker. - # Both internally parallelize across their own items. 
Running them - # concurrently would add complexity for minimal gain since they use - # different resource pools (processes vs containers). - worker_results: list[Any] = [] - docker_results: list[Any] = [] - try: - worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] - finally: - # Reassemble whatever succeeded into original input order so - # stop() can clean them up even if a later deploy raised. - # zip(strict=False) safely handles partial results (empty lists). - results: list[Any] = [None] * len(module_specs) - for idx, mod in zip(worker_indices, worker_results, strict=False): - results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] - results[idx] = mod + # Deploy worker and docker modules in parallel. + results: list[Any] = [None] * len(module_specs) + + def _deploy_workers() -> None: + if not worker_specs: + return + for (index, _), module in zip( + worker_indices, self._client.deploy_parallel(worker_specs), strict=False + ): # type: ignore[union-attr] + results[index] = module + + def _deploy_docker() -> None: + if not docker_specs: + return + for (index, _), module in zip( + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + ): # type: ignore[arg-type] + results[index] = module + + def _register() -> None: for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module + def _on_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + _register() + raise ExceptionGroup("deploy_parallel failed", errors) + + safe_thread_map([_deploy_workers, _deploy_docker], lambda fn: fn(), _on_errors) + _register() return results def start_all_modules(self) -> None: - from dimos.utils.safe_thread_map import safe_thread_map - modules = 
list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") diff --git a/dimos/core/worker.py b/dimos/core/worker.py index b0dd802841..cce79796f5 100644 --- a/dimos/core/worker.py +++ b/dimos/core/worker.py @@ -206,25 +206,27 @@ def deploy_module( "args": args, "kwargs": kwargs, } - with self._lock: - self._conn.send(request) - response = self._conn.recv() + try: + with self._lock: + self._conn.send(request) + response = self._conn.recv() - if response.get("error"): - raise RuntimeError(f"Failed to deploy module: {response['error']}") - - actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) - actor.set_ref(actor).result() - - self._modules[module_id] = actor - self._reserved = max(0, self._reserved - 1) - logger.info( - "Deployed module.", - module=module_class.__name__, - worker_id=self._worker_id, - module_id=module_id, - ) - return actor + if response.get("error"): + raise RuntimeError(f"Failed to deploy module: {response['error']}") + + actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) + actor.set_ref(actor).result() + + self._modules[module_id] = actor + logger.info( + "Deployed module.", + module=module_class.__name__, + worker_id=self._worker_id, + module_id=module_id, + ) + return actor + finally: + self._reserved = max(0, self._reserved - 1) def shutdown(self) -> None: if self._conn is not None: From 068b0ad6d17baaa50fb75fad550383429a22686d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 14:06:34 -0700 Subject: [PATCH 46/89] misc --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module_coordinator.py | 7 ++++--- dimos/core/run_registry.py | 4 +--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6f0b2e777c..10438298b1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = 
setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` +DOCKER_PULL_TIMEOUT_DEFAULT = None # No timeout for `docker pull` (images can be large) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -99,7 +99,7 @@ class DockerModuleConfig(ModuleConfig): docker_extra_args: list[str] = field(default_factory=list) # Timeouts - docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT + docker_pull_timeout: float | None = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1013175..7d2478dcb1 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -109,7 +109,8 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - self._client.close_all() # type: ignore[union-attr] + if self._client is not None: + self._client.close_all() def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator @@ -154,7 +155,7 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return - for (index, _), module in zip( + for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False ): # type: ignore[union-attr] results[index] = module @@ -162,7 +163,7 @@ def _deploy_workers() -> None: def _deploy_docker() -> None: if not docker_specs: return - for (index, _), module in zip( + for index, module in zip( docker_indices, 
DockerWorkerManager.deploy_parallel(docker_specs), strict=False ): # type: ignore[arg-type] results[index] = module diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py index 9f8e7f3358..848eafde4e 100644 --- a/dimos/core/run_registry.py +++ b/dimos/core/run_registry.py @@ -21,6 +21,7 @@ import os from pathlib import Path import re +import signal import time from dimos.utils.logging_config import setup_logger @@ -142,9 +143,6 @@ def get_most_recent(alive_only: bool = True) -> RunEntry | None: return runs[-1] if runs else None -import signal - - def stop_entry(entry: RunEntry, force: bool = False) -> tuple[str, bool]: """Stop a DimOS instance by registry entry. From 75268debe341cd7bdfffbc41cf296ffe5d72d0f8 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 17:53:29 -0700 Subject: [PATCH 47/89] fix mypy --- dimos/core/module_coordinator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 7d2478dcb1..ee417f93cb 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -139,7 +139,7 @@ def deploy_parallel( # Split by type, tracking original indices for reassembly docker_indices: list[int] = [] worker_indices: list[int] = [] - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + docker_specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): @@ -155,9 +155,10 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return + assert self._client is not None for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False - ): # type: ignore[union-attr] + ): results[index] = module def _deploy_docker() -> None: @@ -165,7 +166,7 @@ def _deploy_docker() -> None: return for index, 
module in zip( docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False - ): # type: ignore[arg-type] + ): results[index] = module def _register() -> None: From f38e4beb895762dcaa96a71998df754118d45027 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Mon, 9 Mar 2026 13:05:17 -0700 Subject: [PATCH 48/89] fix ExceptionGroup edgecase --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 2 +- dimos/core/resource_monitor/stats.py | 2 +- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 16 ++++++++++++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 34183fda9f..29c7c2a29d 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,7 +16,7 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ee417f93cb..deb867453e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,7 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index c020c853e0..f401358890 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -90,7 +90,7 @@ class IoStats(TypedDict): def _collect_io(proc: psutil.Process) -> IoStats: """Collect IO counters 
in bytes. Call inside oneshot().""" try: - io = proc.io_counters() + io = proc.io_counters() # type: ignore[attr-defined] # Linux-only return IoStats(io_read_bytes=io.read_bytes, io_write_bytes=io.write_bytes) except (psutil.AccessDenied, AttributeError): return IoStats(io_read_bytes=0, io_write_bytes=0) diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index b9c25c8445..fa448cb15d 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -20,7 +20,7 @@ from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 6729c989f3..f480f2c97d 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -14,8 +14,24 @@ from __future__ import annotations from concurrent.futures import Future, ThreadPoolExecutor, as_completed +import sys from typing import TYPE_CHECKING, Any, TypeVar +if sys.version_info < (3, 11): + + class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 + """Minimal ExceptionGroup polyfill for Python 3.10.""" + + exceptions: tuple[BaseException, ...] 
+ + def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: + super().__init__(message) + self.exceptions = tuple(exceptions) +else: + import builtins + + ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] + if TYPE_CHECKING: from collections.abc import Callable, Sequence From 985ecd7e262b9dafb1fa8116c723f9b6835476a2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 15:09:33 -0700 Subject: [PATCH 49/89] fix: update Docker deployment to use ModuleSpec format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docker_worker_manager: accept ModuleSpec format, pass global_config - module_coordinator: add type: ignore for ModuleBase→Module cast - worker_manager: convert Iterable to list for len() check - test_docker_deployment: fix Path import, update test assertions for new global_config signature --- dimos/core/docker_worker_manager.py | 8 +++++--- dimos/core/module_coordinator.py | 2 +- dimos/core/tests/test_docker_deployment.py | 14 ++++++-------- dimos/core/worker_manager.py | 1 + 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 29c7c2a29d..520468182f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,11 +16,11 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.core.module import ModuleSpec from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule - from dimos.core.module import Module class DockerWorkerManager: @@ -28,7 +28,7 @@ class DockerWorkerManager: @staticmethod def deploy_parallel( - specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + specs: list[ModuleSpec], ) -> list[DockerModule]: """Deploy multiple DockerModules in parallel. 
@@ -46,5 +46,7 @@ def _on_errors( raise ExceptionGroup("docker deploy_parallel failed", errors) return safe_thread_map( - specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + specs, + lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] + _on_errors, ) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d9931b7876..43e3e44f0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -131,7 +131,7 @@ def deploy( deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) + deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) # type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e89b88e327..a3bb0b716d 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -21,24 +21,20 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING +from pathlib import Path from unittest.mock import MagicMock, patch import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out -if TYPE_CHECKING: - from pathlib import Path - # -- Fixtures: fake module classes ------------------------------------------- -@dataclass class FakeDockerConfig(DockerModuleConfig): docker_image: str = "fake:latest" docker_file: Path | None = None @@ -95,7 +91,9 @@ def test_deploy_routes_docker_module(self, 
mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_docker_module_cls.assert_called_once_with( + FakeDockerModule, global_config=global_config + ) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() assert result is mock_dm @@ -134,7 +132,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage result = coordinator.deploy(FakeRegularModule) - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) assert result is mock_proxy coordinator.stop() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 52313ca5d4..2b778c433e 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -66,6 +66,7 @@ def deploy_parallel(self, module_specs: Iterable[ModuleSpec]) -> list[RPCClient] if self._closed: raise RuntimeError("WorkerManager is closed") + module_specs = list(module_specs) if len(module_specs) == 0: return [] From 5d994c15136aed54e84f2e884cb079e8bd020fbb Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:43:35 -0700 Subject: [PATCH 50/89] fix(mypy): cover import-not-found for onnxruntime type: ignore Pre-existing mypy errors: onnxruntime is excluded from install (--no-extra cuda) so import-not-found needs to be ignored alongside import-untyped. 
--- dimos/agents_deprecated/memory/image_embedding.py | 2 +- dimos/simulation/mujoco/policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index 27e16f1aa8..d6b0967642 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++ b/dimos/agents_deprecated/memory/image_embedding.py @@ -63,7 +63,7 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: def _initialize_model(self): # type: ignore[no-untyped-def] """Initialize the specified embedding model.""" try: - import onnxruntime as ort # type: ignore[import-untyped] + import onnxruntime as ort # type: ignore[import-untyped,import-not-found] import torch # noqa: F401 from transformers import ( # type: ignore[import-untyped] AutoFeatureExtractor, diff --git a/dimos/simulation/mujoco/policy.py b/dimos/simulation/mujoco/policy.py index 212c7ac60a..1d0598ce46 100644 --- a/dimos/simulation/mujoco/policy.py +++ b/dimos/simulation/mujoco/policy.py @@ -20,7 +20,7 @@ import mujoco import numpy as np -import onnxruntime as ort # type: ignore[import-untyped] +import onnxruntime as ort # type: ignore[import-untyped,import-not-found] from dimos.simulation.mujoco.input_controller import InputController from dimos.utils.logging_config import setup_logger From dd3251e73544f6d303f4b2cad158f27d3947782b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:28 -0700 Subject: [PATCH 51/89] fix: remove section markers from hello_docker.py and untrack .venv - Remove comment section markers (dashed lines) that violate the no-section-markers test policy - Remove .venv symlink from git tracking (already in .gitignore) --- .venv | 1 - examples/docker_hello_world/hello_docker.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) delete mode 120000 .venv diff --git a/.venv b/.venv deleted file mode 120000 index 3c94680097..0000000000 --- a/.venv +++ 
/dev/null @@ -1 +0,0 @@ -/home/dimos/auto/dimos/.venv \ No newline at end of file diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 66e95df316..af3bfc19d3 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -41,10 +41,6 @@ from dimos.core.module import Module from dimos.core.stream import In, Out -# --------------------------------------------------------------------------- -# Docker module (runs inside container) -# --------------------------------------------------------------------------- - @dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): @@ -100,10 +96,6 @@ def get_greeting_prefix(self) -> str: return self.config.greeting_prefix -# --------------------------------------------------------------------------- -# Host-side module (sends prompts and prints greetings) -# --------------------------------------------------------------------------- - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -125,9 +117,7 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- + if __name__ == "__main__": from dimos.core.blueprints import autoconnect From 9830a8e86550c31001d886117ea2deee1860eda7 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:41 -0700 Subject: [PATCH 52/89] style: fix formatting in hello_docker.py --- examples/docker_hello_world/hello_docker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index af3bfc19d3..3b8e96e49b 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -96,7 +96,6 @@ def get_greeting_prefix(self) -> str: return 
self.config.greeting_prefix - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -117,8 +116,6 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") - - if __name__ == "__main__": from dimos.core.blueprints import autoconnect From cbc46178167d32e39d4d14c58a8e7862a0239e55 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 14 Mar 2026 18:49:06 -0700 Subject: [PATCH 53/89] fix: address review comments on hello_docker example - Add proper _disposables cleanup for stream subscriptions - Use subprocess.check_output instead of subprocess.run - Move inline import (autoconnect) to top of file --- examples/docker_hello_world/hello_docker.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 3b8e96e49b..6c30228089 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -36,6 +36,9 @@ import subprocess import time +from reactivex.disposable import Disposable + +from dimos.core.blueprints import autoconnect from dimos.core.core import rpc from dimos.core.docker_runner import DockerModuleConfig from dimos.core.module import Module @@ -67,17 +70,11 @@ class HelloDockerModule(Module["HelloDockerConfig"]): @rpc def start(self) -> None: super().start() - self.prompt.subscribe(self._on_prompt) + self._disposables.add(Disposable(self.prompt.subscribe(self._on_prompt))) def _cowsay(self, text: str) -> str: """Run cowsay inside the container and return the ASCII art.""" - result = subprocess.run( - ["/usr/games/cowsay", text], - capture_output=True, - text=True, - check=True, - ) - return result.stdout + return subprocess.check_output(["/usr/games/cowsay", text], text=True) def _on_prompt(self, text: str) -> None: art = self._cowsay(text) @@ -105,7 +102,7 @@ class PromptModule(Module): @rpc def start(self) -> None: super().start() - 
self.greeting.subscribe(self._on_greeting) + self._disposables.add(Disposable(self.greeting.subscribe(self._on_greeting))) @rpc def send(self, text: str) -> None: @@ -117,8 +114,6 @@ def _on_greeting(self, text: str) -> None: if __name__ == "__main__": - from dimos.core.blueprints import autoconnect - coordinator = autoconnect( PromptModule.blueprint(), HelloDockerModule.blueprint(greeting_prefix="Howdy"), From 780736c7b9095ac441a876010411af86eb777653 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 11:58:23 -0700 Subject: [PATCH 54/89] make timeout not hardcoded --- dimos/core/module.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/core/module.py b/dimos/core/module.py index 6b12843a3a..c400e697f6 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -104,6 +104,7 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): _bound_rpc_calls: dict[str, RpcCall] = {} _module_closed: bool = False _module_closed_lock: threading.Lock + _loop_thread_timeout: float = 2.0 rpc_calls: list[str] = [] @@ -151,7 +152,7 @@ def _close_module(self) -> None: if loop_thread.is_alive(): if loop: loop.call_soon_threadsafe(loop.stop) - loop_thread.join(timeout=2) + loop_thread.join(timeout=self._loop_thread_timeout) self._loop = None self._loop_thread = None From 66a6567a9407677127abc54f3e36572618ee8acc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 10:38:58 -0700 Subject: [PATCH 55/89] docs: add clarifying comment for deploy_parallel lambda tuple --- dimos/core/worker_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 2b778c433e..3cd836b3ed 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -93,6 +93,7 @@ def _on_errors( return safe_thread_map( assignments, + # item = [worker, module_class, global_config, kwargs] lambda item: RPCClient(item[0].deploy_module(item[1], item[2], item[3]), item[1]), _on_errors, ) 
From 1d3f1230abb7cc71127bdd85f07926e604f882b1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 11:45:10 -0700 Subject: [PATCH 56/89] feat: port rpc_timeouts system from jeff/fix/rosnav3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Module.rpc_timeouts dict allows per-method timeout overrides - RPCClient resolves timeouts from module's rpc_timeouts, with defaults: start=1200s, everything else=120s - RpcCall carries resolved timeout, passes it to call_sync - DockerModule mirrors the same pattern via _resolve_timeout() - call_sync no longer auto-detects 'start' — caller is responsible - Pickle compat: RpcCall supports both old 2-tuple and new 3-tuple state --- dimos/core/docker_runner.py | 30 +++++++++++++++++++++++++----- dimos/core/module.py | 6 ++++++ dimos/core/rpc_client.py | 25 ++++++++++++++++++++++--- dimos/protocol/rpc/spec.py | 13 ++++++++----- 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 16727a8dd1..b879d29be1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall, RPCClient from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -210,6 +210,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} + self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} # Build or pull image, launch container, wait for RPC 
server try: @@ -266,12 +267,19 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non def get_rpc_method_names(self) -> list[str]: return self.rpc_calls + def _resolve_timeout(self, method: str) -> float: + return self._rpc_timeouts.get(method, RPCClient.default_rpc_timeout) + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable # Forward to container — Module.set_rpc_method unpickles the RpcCall # and wires it with the container's own LCMRPC - self.rpc.call_sync(f"{self.remote_name}/set_rpc_method", ([method, callable], {})) + self.rpc.call_sync( + f"{self.remote_name}/set_rpc_method", + ([method, callable], {}), + rpc_timeout=self._resolve_timeout("set_rpc_method"), + ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -283,7 +291,9 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: def start(self) -> None: """Invoke the remote module's start() RPC.""" try: - self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) + self.rpc.call_sync( + f"{self.remote_name}/start", ([], {}), rpc_timeout=self._resolve_timeout("start") + ) except Exception: with suppress(Exception): self.stop() @@ -333,7 +343,9 @@ def tail_logs(self, n: int = 200) -> str: def set_transport(self, stream_name: str, transport: Any) -> bool: """Forward to the container's Module.set_transport RPC.""" result, _ = self.rpc.call_sync( - f"{self.remote_name}/set_transport", ([stream_name, transport], {}) + f"{self.remote_name}/set_transport", + ([stream_name, transport], {}), + rpc_timeout=self._resolve_timeout("set_transport"), ) return bool(result) @@ -341,7 +353,15 @@ def __getattr__(self, name: str) -> Any: rpcs = self.__dict__.get("rpcs") if rpcs is not None and name in rpcs: original_method = getattr(self._module_class, name, None) - return RpcCall(original_method, self.rpc, name, 
self.remote_name, self._unsub_fns, None) + return RpcCall( + original_method, + self.rpc, + name, + self.remote_name, + self._unsub_fns, + None, + timeout=self._resolve_timeout(name), + ) raise AttributeError(f"{name} not found on {type(self).__name__}") # Docker command building (split into focused helpers for readability) diff --git a/dimos/core/module.py b/dimos/core/module.py index c400e697f6..bcd61bd435 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -108,6 +108,12 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): rpc_calls: list[str] = [] + # Per-method RPC timeout overrides (seconds). Keys are method names. + # Used by RPCClient when calling methods on this module from the host. + # Example: rpc_timeouts = {"on_system_modules": 600.0} + # Methods not listed here use RPCClient.default_rpc_timeout (120s). + rpc_timeouts: dict[str, float] = {} + def __init__(self, config_args: dict[str, Any]): super().__init__(**config_args) self._module_closed_lock = threading.Lock() diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 13add06a02..4877a2acd9 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -39,12 +39,14 @@ def __init__( remote_name: str, unsub_fns: list, # type: ignore[type-arg] stop_client: Callable[[], None] | None = None, + timeout: float = 0, ) -> None: self._rpc = rpc self._name = name self._remote_name = remote_name self._unsub_fns = unsub_fns self._stop_rpc_client = stop_client + self._timeout = timeout if original_method: self.__doc__ = original_method.__doc__ @@ -67,15 +69,24 @@ def __call__(self, *args, **kwargs): # type: ignore[no-untyped-def] self._stop_rpc_client() return None - result, unsub_fn = self._rpc.call_sync(f"{self._remote_name}/{self._name}", (args, kwargs)) # type: ignore[arg-type] + result, unsub_fn = self._rpc.call_sync( + f"{self._remote_name}/{self._name}", + (args, kwargs), # type: ignore[arg-type] + rpc_timeout=self._timeout, + ) self._unsub_fns.append(unsub_fn) 
return result def __getstate__(self): # type: ignore[no-untyped-def] - return (self._name, self._remote_name) + return (self._name, self._remote_name, self._timeout) def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] - self._name, self._remote_name = state + # Support both old 2-tuple and new 3-tuple state for pickle compat. + if len(state) == 2: + self._name, self._remote_name = state + self._timeout = 0 + else: + self._name, self._remote_name, self._timeout = state self._unsub_fns = [] self._rpc = None self._stop_rpc_client = None @@ -93,6 +104,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: + # Default timeout for all RPC calls (seconds). Override per-method via + # the module's rpc_timeouts dict. + default_rpc_timeout: float = 120.0 + def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class @@ -101,6 +116,8 @@ def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-unty self.rpcs = actor_class.rpcs.keys() self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] + # Merge module-level rpc_timeouts over the defaults from RPCSpec. 
+ self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: @@ -139,6 +156,7 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] if name in self.rpcs: original_method = getattr(self.actor_class, name, None) + timeout = self._rpc_timeouts.get(name, self.default_rpc_timeout) return RpcCall( original_method, self.rpc, @@ -146,6 +164,7 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] self.remote_name, self._unsub_fns, self.stop_rpc_client, + timeout=timeout, ) # return super().__getattr__(name) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 47ad77e825..d311e45c6a 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -43,13 +43,16 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # we expect to crash if we don't get a return value after 10 seconds - # but callers can override this timeout for extra long functions + # Default RPC timeout. Callers (RpcCall, DockerModule) resolve via + # rpc_timeouts dict; raw call_sync uses this as fallback. 
+ default_rpc_timeout: float = 120.0 + rpc_timeouts: dict[str, float] = {"start": 1200.0} + def call_sync( - self, name: str, arguments: Args, rpc_timeout: float | None = 120.0 + self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: - if name == "start": - rpc_timeout = 1200.0 # starting modules can take longer + if rpc_timeout is None: + rpc_timeout = self.rpc_timeouts.get(name, self.default_rpc_timeout) event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] From 747bbe2e897b515437705377c6a76693dfdab0d1 Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Sun, 15 Mar 2026 19:20:34 +0000 Subject: [PATCH 57/89] CI code cleanup --- dimos/core/docker_runner.py | 5 ++++- dimos/core/rpc_client.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index b879d29be1..ee98b59705 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -210,7 +210,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} + self._rpc_timeouts: dict[str, float] = { + **self.rpc.rpc_timeouts, + **getattr(module_class, "rpc_timeouts", {}), + } # Build or pull image, launch container, wait for RPC server try: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 4877a2acd9..417830a49c 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -117,7 +117,10 @@ def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-unty self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] # Merge module-level rpc_timeouts over the 
defaults from RPCSpec. - self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} + self._rpc_timeouts: dict[str, float] = { + **self.rpc.rpc_timeouts, + **getattr(actor_class, "rpc_timeouts", {}), + } def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: From c2d264350480a7f2ba8db4c8b3b782d0d13c511b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:40:33 -0700 Subject: [PATCH 58/89] fixup rpc timeouts, cause they matter for docker --- dimos/core/docker_runner.py | 15 +++------------ dimos/core/module.py | 11 +++-------- dimos/core/rpc_client.py | 30 +++++++++++------------------- dimos/protocol/rpc/pubsubrpc.py | 17 +++++++++-------- dimos/protocol/rpc/spec.py | 16 +++++++++++----- dimos/protocol/rpc/test_lcmrpc.py | 2 +- dimos/protocol/rpc/test_spec.py | 8 ++++---- 7 files changed, 42 insertions(+), 57 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index b879d29be1..cebb7fb49b 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall, RPCClient +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -205,12 +205,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) - self.rpc = LCMRPC() + self.rpc = LCMRPC(rpc_timeouts=self.config.rpc_timeouts) self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - 
self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} # Build or pull image, launch container, wait for RPC server try: @@ -267,9 +266,6 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non def get_rpc_method_names(self) -> list[str]: return self.rpc_calls - def _resolve_timeout(self, method: str) -> float: - return self._rpc_timeouts.get(method, RPCClient.default_rpc_timeout) - def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable @@ -278,7 +274,6 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: self.rpc.call_sync( f"{self.remote_name}/set_rpc_method", ([method, callable], {}), - rpc_timeout=self._resolve_timeout("set_rpc_method"), ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: @@ -291,9 +286,7 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: def start(self) -> None: """Invoke the remote module's start() RPC.""" try: - self.rpc.call_sync( - f"{self.remote_name}/start", ([], {}), rpc_timeout=self._resolve_timeout("start") - ) + self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) except Exception: with suppress(Exception): self.stop() @@ -345,7 +338,6 @@ def set_transport(self, stream_name: str, transport: Any) -> bool: result, _ = self.rpc.call_sync( f"{self.remote_name}/set_transport", ([stream_name, transport], {}), - rpc_timeout=self._resolve_timeout("set_transport"), ) return bool(result) @@ -360,7 +352,6 @@ def __getattr__(self, name: str) -> Any: self.remote_name, self._unsub_fns, None, - timeout=self._resolve_timeout(name), ) raise AttributeError(f"{name} not found on {type(self).__name__}") diff --git a/dimos/core/module.py b/dimos/core/module.py index bcd61bd435..c6c557b825 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,7 +40,7 @@ from dimos.core.rpc_client import RpcCall from 
dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec from dimos.utils import colors @@ -79,6 +79,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC + rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -108,19 +109,13 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): rpc_calls: list[str] = [] - # Per-method RPC timeout overrides (seconds). Keys are method names. - # Used by RPCClient when calling methods on this module from the host. - # Example: rpc_timeouts = {"on_system_modules": 600.0} - # Methods not listed here use RPCClient.default_rpc_timeout (120s). 
- rpc_timeouts: dict[str, float] = {} - def __init__(self, config_args: dict[str, Any]): super().__init__(**config_args) self._module_closed_lock = threading.Lock() self._loop, self._loop_thread = get_loop() self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport() + self.rpc = self.config.rpc_transport(rpc_timeouts=self.config.rpc_timeouts) self.rpc.serve_module_rpc(self) self.rpc.start() # type: ignore[attr-defined] except ValueError: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 4877a2acd9..3fd120a1fc 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -18,7 +18,7 @@ from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -39,14 +39,12 @@ def __init__( remote_name: str, unsub_fns: list, # type: ignore[type-arg] stop_client: Callable[[], None] | None = None, - timeout: float = 0, ) -> None: self._rpc = rpc self._name = name self._remote_name = remote_name self._unsub_fns = unsub_fns self._stop_rpc_client = stop_client - self._timeout = timeout if original_method: self.__doc__ = original_method.__doc__ @@ -72,21 +70,19 @@ def __call__(self, *args, **kwargs): # type: ignore[no-untyped-def] result, unsub_fn = self._rpc.call_sync( f"{self._remote_name}/{self._name}", (args, kwargs), # type: ignore[arg-type] - rpc_timeout=self._timeout, ) self._unsub_fns.append(unsub_fn) return result def __getstate__(self): # type: ignore[no-untyped-def] - return (self._name, self._remote_name, self._timeout) + return (self._name, self._remote_name) def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] - # Support both old 2-tuple and new 3-tuple state for pickle compat. 
- if len(state) == 2: - self._name, self._remote_name = state - self._timeout = 0 + # Support both old 2-tuple and new 3-tuple (legacy) state for pickle compat. + if len(state) == 3: + self._name, self._remote_name, _ = state else: - self._name, self._remote_name, self._timeout = state + self._name, self._remote_name = state self._unsub_fns = [] self._rpc = None self._stop_rpc_client = None @@ -95,6 +91,8 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" + rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS + def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -104,20 +102,16 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: - # Default timeout for all RPC calls (seconds). Override per-method via - # the module's rpc_timeouts dict. - default_rpc_timeout: float = 120.0 - def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] - self.rpc = LCMRPC() + default_config = getattr(actor_class, "default_config", None) + self.rpc_timeouts: dict[str, float] = getattr(default_config, "rpc_timeouts", DEFAULT_RPC_TIMEOUTS) + self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) self.actor_class = actor_class self.remote_name = actor_class.__name__ self.actor_instance = actor_instance self.rpcs = actor_class.rpcs.keys() self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] - # Merge module-level rpc_timeouts over the defaults from RPCSpec. 
- self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: @@ -156,7 +150,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] if name in self.rpcs: original_method = getattr(self.actor_class, name, None) - timeout = self._rpc_timeouts.get(name, self.default_rpc_timeout) return RpcCall( original_method, self.rpc, @@ -164,7 +157,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] self.remote_name, self._unsub_fns, self.stop_rpc_client, - timeout=timeout, ) # return super().__getattr__(name) diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 3b77227218..c440710e5f 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -32,7 +32,7 @@ from dimos.protocol.pubsub.impl.shmpubsub import PickleSharedMemory from dimos.protocol.pubsub.spec import PubSub from dimos.protocol.rpc.rpc_utils import deserialize_exception, serialize_exception -from dimos.protocol.rpc.spec import Args, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, Args, RPCSpec from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -62,8 +62,9 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): - def __init__(self, *args: Any, **kwargs: Any) -> None: + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: super().__init__(*args, **kwargs) + self.rpc_timeouts = {**DEFAULT_RPC_TIMEOUTS, **rpc_timeouts} # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() @@ -290,12 +291,12 @@ def execute_and_respond() -> None: class LCMRPC(PubSubRPCMixin[Topic, Any], PickleLCM): - def __init__(self, **kwargs: Any) -> None: + def 
__init__(self, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: # Need to ensure PickleLCM gets initialized properly # This is due to the diamond inheritance pattern with multiple base classes PickleLCM.__init__(self, **kwargs) - # Initialize PubSubRPCMixin's thread pool - PubSubRPCMixin.__init__(self, **kwargs) + # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) + PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) def topicgen(self, name: str, req_or_res: bool) -> Topic: suffix = "res" if req_or_res else "req" @@ -306,12 +307,12 @@ def topicgen(self, name: str, req_or_res: bool) -> Topic: class ShmRPC(PubSubRPCMixin[str, Any], PickleSharedMemory): - def __init__(self, prefer: str = "cpu", **kwargs: Any) -> None: + def __init__(self, rpc_timeouts: dict[str, float], prefer: str = "cpu", **kwargs: Any) -> None: # Need to ensure SharedMemory gets initialized properly # This is due to the diamond inheritance pattern with multiple base classes PickleSharedMemory.__init__(self, prefer=prefer, **kwargs) - # Initialize PubSubRPCMixin's thread pool - PubSubRPCMixin.__init__(self, **kwargs) + # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) + PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) def topicgen(self, name: str, req_or_res: bool) -> str: suffix = "res" if req_or_res else "req" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index d311e45c6a..3d17d65948 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -30,6 +30,10 @@ class RPCInspectable(Protocol): def rpcs(self) -> dict[str, Callable]: ... 
# type: ignore[type-arg] +DEFAULT_RPC_TIMEOUT: float = 120.0 +DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} + + class RPCClient(Protocol): # if we don't provide callback, we don't get a return unsub f @overload @@ -43,16 +47,18 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # Default RPC timeout. Callers (RpcCall, DockerModule) resolve via - # rpc_timeouts dict; raw call_sync uses this as fallback. - default_rpc_timeout: float = 120.0 - rpc_timeouts: dict[str, float] = {"start": 1200.0} + # call_sync resolves per-method overrides from rpc_timeouts, + # falling back to default_rpc_timeout. + default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT + rpc_timeouts: dict[str, float] def call_sync( self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: if rpc_timeout is None: - rpc_timeout = self.rpc_timeouts.get(name, self.default_rpc_timeout) + # Try full topic name first, then bare method name (after last "/"). 
+ method = name.rsplit("/", 1)[-1] + rpc_timeout = self.rpc_timeouts.get(name, self.rpc_timeouts.get(method, self.default_rpc_timeout)) event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] diff --git a/dimos/protocol/rpc/test_lcmrpc.py b/dimos/protocol/rpc/test_lcmrpc.py index 5baa5ac40c..700618ab72 100644 --- a/dimos/protocol/rpc/test_lcmrpc.py +++ b/dimos/protocol/rpc/test_lcmrpc.py @@ -22,7 +22,7 @@ @pytest.fixture def lcmrpc() -> Generator[LCMRPC, None, None]: - ret = LCMRPC() + ret = LCMRPC(rpc_timeouts={}) ret.start() yield ret ret.stop() diff --git a/dimos/protocol/rpc/test_spec.py b/dimos/protocol/rpc/test_spec.py index cfee044548..12bdc98c85 100644 --- a/dimos/protocol/rpc/test_spec.py +++ b/dimos/protocol/rpc/test_spec.py @@ -46,8 +46,8 @@ def lcm_rpc_context(): from dimos.protocol.service.lcmservice import autoconf autoconf() - server = LCMRPC() - client = LCMRPC() + server = LCMRPC(rpc_timeouts={}) + client = LCMRPC(rpc_timeouts={}) server.start() client.start() @@ -65,8 +65,8 @@ def lcm_rpc_context(): def shm_rpc_context(): """Context manager for Shared Memory RPC implementation.""" # Create two separate instances that communicate through shared memory segments - server = ShmRPC(prefer="cpu") - client = ShmRPC(prefer="cpu") + server = ShmRPC(rpc_timeouts={}, prefer="cpu") + client = ShmRPC(rpc_timeouts={}, prefer="cpu") server.start() client.start() From 54d45920c7b161250f1d6b52a5a91a0e2327cf2c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:53:10 -0700 Subject: [PATCH 59/89] better matching logic for rpc_timeouts --- dimos/protocol/rpc/spec.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 3d17d65948..6b344d0719 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -57,8 +57,13 @@ def call_sync( ) -> tuple[Any, Callable[[], None]]: if rpc_timeout is None: # Try full topic name first, 
then bare method name (after last "/"). - method = name.rsplit("/", 1)[-1] - rpc_timeout = self.rpc_timeouts.get(name, self.rpc_timeouts.get(method, self.default_rpc_timeout)) + rpc_timeout = self.rpc_timeouts.get(name) + if rpc_timeout is None: + method = name.rsplit("/", 1)[-1] + if method is not name: + rpc_timeout = self.rpc_timeouts.get(method, self.default_rpc_timeout) + else: + rpc_timeout = self.default_rpc_timeout event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] From 159854568e70ed14e1140ecf4a49d9afd95bd007 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:54:08 -0700 Subject: [PATCH 60/89] enforce RPCSpec's to have rpc_timeouts in constructor --- dimos/protocol/rpc/spec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 6b344d0719..0f48cab05e 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -115,4 +115,5 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] self.serve_rpc(override_f, topic) -class RPCSpec(RPCServer, RPCClient): ... +class RPCSpec(RPCServer, RPCClient): + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: ... From 8a3684389163ea6a14ee994b4f85b77a3e08d9c5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:29:40 -0700 Subject: [PATCH 61/89] Remove pr-name-check from this branch Not related to rosnav feature work. 
--- bin/pr-name-check | 69 --------------------------------- dimos/core/docker_runner.py | 2 - dimos/core/rpc_client.py | 7 ++-- dimos/protocol/rpc/pubsubrpc.py | 5 +-- dimos/protocol/rpc/spec.py | 14 +++++-- 5 files changed, 17 insertions(+), 80 deletions(-) delete mode 100755 bin/pr-name-check diff --git a/bin/pr-name-check b/bin/pr-name-check deleted file mode 100755 index 0f67e6172a..0000000000 --- a/bin/pr-name-check +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -branch="$(git rev-parse --abbrev-ref HEAD)" - -# based on: https://github.com/dimensionalOS/wiki/wiki -allowed_types="feat fix chore refactor docs" -allowed_names="stash ivan paul alexl mustafa miguel christie ruthwik jalaj yashas yash matt jing juan jeff unknown" - -if [[ "$branch" != */*/* ]]; then - echo "Invalid branch name: '$branch'" - echo "Expected format: //" - echo "Allowed names: $allowed_names" - echo "Allowed types: $allowed_types" - exit 1 -fi - -branch_name="${branch%%/*}" -rest="${branch#*/}" -branch_type="${rest%%/*}" -branch_description="${branch#*/*/}" - -if [[ -z "$branch_description" || "$branch_description" == "$branch" ]]; then - echo "Invalid branch name: '$branch'" - echo "Expected format: //" - exit 1 -fi - -name_ok=0 -for n in $allowed_names; do - if [[ "$branch_name" == "$n" ]]; then - name_ok=1 - break - fi -done - -type_ok=0 -for t in $allowed_types; do - if [[ "$branch_type" == "$t" ]]; then - type_ok=1 - break - fi -done - -if [[ "$name_ok" -ne 1 || "$type_ok" -ne 1 ]]; then - echo - echo - echo - echo - echo - echo "Invalid branch name: '$branch'" - echo - echo " Expected format: //" - echo " Example: jeff/fix/ci-divergence" - echo " Parsed name: $branch_name" - echo " Allowed names: $allowed_names" - echo " Parsed type: $branch_type" - echo " Allowed types: $allowed_types" - echo - echo "Wait 4 seconds if you want to ignore this error" - sleep 1; echo 4 - sleep 1; echo 3 - sleep 1; echo 2 - sleep 1; echo 1 - exit 1 -else - echo "Branch 
naming check passed: $branch" -fi diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index cebb7fb49b..c5e1a929f0 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -43,7 +43,6 @@ DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) -RPC_READY_TIMEOUT = 3.0 # Timeout for RPC readiness probe during container startup LOG_TAIL_LINES = 200 # Number of log lines to include in error messages @@ -529,7 +528,6 @@ def _wait_for_rpc(self) -> None: self.rpc.call_sync( f"{self.remote_name}/get_rpc_method_names", ([], {}), - rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 3fd120a1fc..46354dd257 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -91,8 +91,6 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" - rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS - def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -104,7 +102,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... 
class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] default_config = getattr(actor_class, "default_config", None) - self.rpc_timeouts: dict[str, float] = getattr(default_config, "rpc_timeouts", DEFAULT_RPC_TIMEOUTS) + self.rpc_timeouts: dict[str, float] = { + **DEFAULT_RPC_TIMEOUTS, + **getattr(default_config, "rpc_timeouts", {}), + } self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) self.actor_class = actor_class self.remote_name = actor_class.__name__ diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index c440710e5f..628c5b0a0b 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -32,7 +32,7 @@ from dimos.protocol.pubsub.impl.shmpubsub import PickleSharedMemory from dimos.protocol.pubsub.spec import PubSub from dimos.protocol.rpc.rpc_utils import deserialize_exception, serialize_exception -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, Args, RPCSpec +from dimos.protocol.rpc.spec import Args, RPCSpec from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -63,8 +63,7 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.rpc_timeouts = {**DEFAULT_RPC_TIMEOUTS, **rpc_timeouts} + super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 0f48cab05e..a4d7e614e8 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -29,12 +29,19 @@ class RPCInspectable(Protocol): @property def rpcs(self) -> dict[str, Callable]: ... 
# type: ignore[type-arg] - DEFAULT_RPC_TIMEOUT: float = 120.0 DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} - class RPCClient(Protocol): + # call_sync resolves per-method overrides from rpc_timeouts, + # falling back to default_rpc_timeout. + rpc_timeouts: dict[str, float] + default_rpc_timeout: float + + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.rpc_timeouts = dict(rpc_timeouts) + # if we don't provide callback, we don't get a return unsub f @overload def call(self, name: str, arguments: Args, cb: None) -> None: ... @@ -116,4 +123,5 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: ... + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) From 7ad090fcd4eadb7b0f8d55c91c17f0b3c26305ad Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:38:40 -0700 Subject: [PATCH 62/89] fixup rpc timeouts --- dimos/core/docker_runner.py | 5 ++++- dimos/core/module.py | 10 ++++++--- dimos/core/rpc_client.py | 11 +++++----- dimos/protocol/rpc/pubsubrpc.py | 34 ++++++++++++++++++++----------- dimos/protocol/rpc/spec.py | 27 +++++++++++++++--------- dimos/protocol/rpc/test_lcmrpc.py | 3 ++- dimos/protocol/rpc/test_spec.py | 9 ++++---- 7 files changed, 62 insertions(+), 37 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c5e1a929f0..30468bccd5 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -204,7 +204,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) - self.rpc = LCMRPC(rpc_timeouts=self.config.rpc_timeouts) + self.rpc = LCMRPC( + 
rpc_timeouts=self.config.rpc_timeouts, + default_rpc_timeout=self.config.default_rpc_timeout, + ) self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] diff --git a/dimos/core/module.py b/dimos/core/module.py index c6c557b825..64f7dd65cf 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,7 +40,7 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec from dimos.utils import colors @@ -79,7 +79,8 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC - rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS + default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT + rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -115,7 +116,10 @@ def __init__(self, config_args: dict[str, Any]): self._loop, self._loop_thread = get_loop() self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport(rpc_timeouts=self.config.rpc_timeouts) + self.rpc = self.config.rpc_transport( + rpc_timeouts=self.config.rpc_timeouts, + default_rpc_timeout=self.config.default_rpc_timeout, + ) self.rpc.serve_module_rpc(self) self.rpc.start() # type: ignore[attr-defined] except ValueError: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 46354dd257..7ac34bb645 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -18,7 
+18,7 @@ from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -102,11 +102,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] default_config = getattr(actor_class, "default_config", None) - self.rpc_timeouts: dict[str, float] = { - **DEFAULT_RPC_TIMEOUTS, - **getattr(default_config, "rpc_timeouts", {}), - } - self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) + self.rpc = LCMRPC( + rpc_timeouts=getattr(default_config, "rpc_timeouts", dict(DEFAULT_RPC_TIMEOUTS)), + default_rpc_timeout=getattr(default_config, "default_rpc_timeout", DEFAULT_RPC_TIMEOUT), + ) self.actor_class = actor_class self.remote_name = actor_class.__name__ self.actor_instance = actor_instance diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 628c5b0a0b..565a9af227 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -62,8 +62,12 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: + super().__init__( + *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = 
threading.RLock() @@ -290,12 +294,13 @@ def execute_and_respond() -> None: class LCMRPC(PubSubRPCMixin[Topic, Any], PickleLCM): - def __init__(self, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - # Need to ensure PickleLCM gets initialized properly - # This is due to the diamond inheritance pattern with multiple base classes + def __init__( + self, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: PickleLCM.__init__(self, **kwargs) - # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) - PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) + PubSubRPCMixin.__init__( + self, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) def topicgen(self, name: str, req_or_res: bool) -> Topic: suffix = "res" if req_or_res else "req" @@ -306,12 +311,17 @@ def topicgen(self, name: str, req_or_res: bool) -> Topic: class ShmRPC(PubSubRPCMixin[str, Any], PickleSharedMemory): - def __init__(self, rpc_timeouts: dict[str, float], prefer: str = "cpu", **kwargs: Any) -> None: - # Need to ensure SharedMemory gets initialized properly - # This is due to the diamond inheritance pattern with multiple base classes + def __init__( + self, + rpc_timeouts: dict[str, float], + default_rpc_timeout: float, + prefer: str = "cpu", + **kwargs: Any, + ) -> None: PickleSharedMemory.__init__(self, prefer=prefer, **kwargs) - # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) - PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) + PubSubRPCMixin.__init__( + self, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) def topicgen(self, name: str, req_or_res: bool) -> str: suffix = "res" if req_or_res else "req" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index a4d7e614e8..f80f77bf3a 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -15,6 +15,7 @@ import asyncio from 
collections.abc import Callable import threading +from types import MappingProxyType from typing import Any, Protocol, overload @@ -29,18 +30,25 @@ class RPCInspectable(Protocol): @property def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] + +# module.py and other places imports these constants and choose what to give RPCClient +# the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({"start": 1200.0}) + class RPCClient(Protocol): # call_sync resolves per-method overrides from rpc_timeouts, # falling back to default_rpc_timeout. rpc_timeouts: dict[str, float] default_rpc_timeout: float - - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: super().__init__(*args, **kwargs) self.rpc_timeouts = dict(rpc_timeouts) + self.default_rpc_timeout = default_rpc_timeout # if we don't provide callback, we don't get a return unsub f @overload @@ -54,11 +62,6 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # call_sync resolves per-method overrides from rpc_timeouts, - # falling back to default_rpc_timeout. 
- default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] - def call_sync( self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: @@ -123,5 +126,9 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: + super().__init__( + *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) diff --git a/dimos/protocol/rpc/test_lcmrpc.py b/dimos/protocol/rpc/test_lcmrpc.py index 700618ab72..3c2b87761d 100644 --- a/dimos/protocol/rpc/test_lcmrpc.py +++ b/dimos/protocol/rpc/test_lcmrpc.py @@ -18,11 +18,12 @@ from dimos.constants import LCM_MAX_CHANNEL_NAME_LENGTH from dimos.protocol.rpc.pubsubrpc import LCMRPC +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT @pytest.fixture def lcmrpc() -> Generator[LCMRPC, None, None]: - ret = LCMRPC(rpc_timeouts={}) + ret = LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) ret.start() yield ret ret.stop() diff --git a/dimos/protocol/rpc/test_spec.py b/dimos/protocol/rpc/test_spec.py index 12bdc98c85..0b374f7d6c 100644 --- a/dimos/protocol/rpc/test_spec.py +++ b/dimos/protocol/rpc/test_spec.py @@ -27,6 +27,7 @@ from dimos.protocol.rpc.pubsubrpc import LCMRPC, ShmRPC from dimos.protocol.rpc.rpc_utils import RemoteError +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT class CustomTestError(Exception): @@ -46,8 +47,8 @@ def lcm_rpc_context(): from dimos.protocol.service.lcmservice import autoconf autoconf() - server = LCMRPC(rpc_timeouts={}) - client = LCMRPC(rpc_timeouts={}) + server = LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) + client = 
LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) server.start() client.start() @@ -65,8 +66,8 @@ def lcm_rpc_context(): def shm_rpc_context(): """Context manager for Shared Memory RPC implementation.""" # Create two separate instances that communicate through shared memory segments - server = ShmRPC(rpc_timeouts={}, prefer="cpu") - client = ShmRPC(rpc_timeouts={}, prefer="cpu") + server = ShmRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT, prefer="cpu") + client = ShmRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT, prefer="cpu") server.start() client.start() From d0563a89f6e6ee5382ebec4007c4fad3420c11b4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:40:21 -0700 Subject: [PATCH 63/89] mypy issue on dev --- dimos/core/resource.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dimos/core/resource.py b/dimos/core/resource.py index 63b1eec4f0..a4c008b806 100644 --- a/dimos/core/resource.py +++ b/dimos/core/resource.py @@ -15,7 +15,13 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Self +import sys +from typing import TYPE_CHECKING + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self if TYPE_CHECKING: from types import TracebackType From 639e90c9caa030779de5071c3a9dd3e509ea7ca4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:40:34 -0700 Subject: [PATCH 64/89] equality --- dimos/protocol/rpc/spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index f80f77bf3a..f833b032ad 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -70,7 +70,7 @@ def call_sync( rpc_timeout = self.rpc_timeouts.get(name) if rpc_timeout is None: method = name.rsplit("/", 1)[-1] - if method is not name: + if method != name: rpc_timeout = self.rpc_timeouts.get(method, 
self.default_rpc_timeout) else: rpc_timeout = self.default_rpc_timeout From 5c85dc20e0c00d0fa6f7e04b8e3d9357f3d9e7c1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 14:26:15 -0700 Subject: [PATCH 65/89] fix: docker module init + rpc timeout bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove @dataclass(kw_only=True) from HelloDockerConfig (conflicts with Pydantic) - Pop global_config from kwargs before passing to config class - Store rpc_timeouts/default_rpc_timeout in PubSubRPCMixin (not Protocol) - Remove __init__ from RPCClient Protocol and RPCSpec (structural typing only) - Use short 3s timeout for readiness probe polling (was using 120s default) - Extract NavigationStrategy/VlModelName into lightweight types.py files (same fix as jeff/fix/help — prevents torch import in Docker containers) --- dimos/core/docker_runner.py | 4 ++++ dimos/core/global_config.py | 4 ++-- dimos/mapping/occupancy/path_map.py | 3 +-- dimos/mapping/occupancy/types.py | 3 +++ dimos/models/vl/create.py | 4 ++-- dimos/models/vl/types.py | 3 +++ dimos/protocol/rpc/pubsubrpc.py | 6 +++--- dimos/protocol/rpc/spec.py | 17 +++-------------- examples/docker_hello_world/hello_docker.py | 3 +-- 9 files changed, 22 insertions(+), 25 deletions(-) create mode 100644 dimos/mapping/occupancy/types.py create mode 100644 dimos/models/vl/types.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 30468bccd5..3efc05f316 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -183,6 +183,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non image_exists, ) + # global_config is passed by deploy pipeline but isn't a config field + kwargs.pop("global_config", None) + config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): raise TypeError( @@ -531,6 +534,7 @@ def _wait_for_rpc(self) -> None: 
self.rpc.call_sync( f"{self.remote_name}/get_rpc_method_names", ([], {}), + rpc_timeout=3.0, # short timeout for polling readiness ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") diff --git a/dimos/core/global_config.py b/dimos/core/global_config.py index 60072ae7fd..49f4d4f325 100644 --- a/dimos/core/global_config.py +++ b/dimos/core/global_config.py @@ -17,8 +17,8 @@ from pydantic_settings import BaseSettings, SettingsConfigDict -from dimos.mapping.occupancy.path_map import NavigationStrategy -from dimos.models.vl.create import VlModelName +from dimos.mapping.occupancy.types import NavigationStrategy +from dimos.models.vl.types import VlModelName ViewerBackend: TypeAlias = Literal["rerun", "rerun-web", "rerun-connect", "foxglove", "none"] diff --git a/dimos/mapping/occupancy/path_map.py b/dimos/mapping/occupancy/path_map.py index a99a423de8..7392030298 100644 --- a/dimos/mapping/occupancy/path_map.py +++ b/dimos/mapping/occupancy/path_map.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Literal, TypeAlias +from dimos.mapping.occupancy.types import NavigationStrategy from dimos.mapping.occupancy.gradient import voronoi_gradient from dimos.mapping.occupancy.inflation import simple_inflate from dimos.mapping.occupancy.operations import overlay_occupied, smooth_occupied from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid -NavigationStrategy: TypeAlias = Literal["simple", "mixed"] def make_navigation_map( diff --git a/dimos/mapping/occupancy/types.py b/dimos/mapping/occupancy/types.py new file mode 100644 index 0000000000..e6b7d5bd6b --- /dev/null +++ b/dimos/mapping/occupancy/types.py @@ -0,0 +1,3 @@ +from typing import Literal, TypeAlias + +NavigationStrategy: TypeAlias = Literal["simple", "mixed"] diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index 6c778d4104..bb14758bcb 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -1,8 +1,8 @@ -from typing import Any, Literal +from typing import Any from dimos.models.vl.base import VlModel -VlModelName = Literal["qwen", "moondream"] +from dimos.models.vl.types import VlModelName def create(name: VlModelName) -> VlModel[Any]: diff --git a/dimos/models/vl/types.py b/dimos/models/vl/types.py new file mode 100644 index 0000000000..ac8b0f024d --- /dev/null +++ b/dimos/models/vl/types.py @@ -0,0 +1,3 @@ +from typing import Literal + +VlModelName = Literal["qwen", "moondream"] diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 565a9af227..52cb89a199 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -65,9 +65,9 @@ class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): def __init__( self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any ) -> None: - super().__init__( - *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs - ) + super().__init__(*args, **kwargs) + self.rpc_timeouts = 
dict(rpc_timeouts) + self.default_rpc_timeout = default_rpc_timeout # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index f833b032ad..993f6044bb 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -39,17 +39,11 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] class RPCClient(Protocol): # call_sync resolves per-method overrides from rpc_timeouts, - # falling back to default_rpc_timeout. + # falling back to default_rpc_timeout. These are set by + # PubSubRPCMixin.__init__ at runtime. rpc_timeouts: dict[str, float] default_rpc_timeout: float - def __init__( - self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any - ) -> None: - super().__init__(*args, **kwargs) - self.rpc_timeouts = dict(rpc_timeouts) - self.default_rpc_timeout = default_rpc_timeout - # if we don't provide callback, we don't get a return unsub f @overload def call(self, name: str, arguments: Args, cb: None) -> None: ... 
@@ -126,9 +120,4 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__( - self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any - ) -> None: - super().__init__( - *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs - ) + pass diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 6c30228089..0fb56959ea 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -31,7 +31,7 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import field from pathlib import Path import subprocess import time @@ -45,7 +45,6 @@ from dimos.core.stream import In, Out -@dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): docker_image: str = "dimos-hello-docker:latest" docker_file: Path | None = Path(__file__).parent / "Dockerfile" From 9668e3afda98ab9af0643505e8fdaabe7ca1d068 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 14:54:51 -0700 Subject: [PATCH 66/89] fix(example): use 'cowsay' not '/usr/games/cowsay' per review Address Paul's review comment to use check_output with plain 'cowsay'. 
--- examples/docker_hello_world/hello_docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 0fb56959ea..a9913d770b 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -73,7 +73,7 @@ def start(self) -> None: def _cowsay(self, text: str) -> str: """Run cowsay inside the container and return the ASCII art.""" - return subprocess.check_output(["/usr/games/cowsay", text], text=True) + return subprocess.check_output(["cowsay", text], text=True) def _on_prompt(self, text: str) -> None: art = self._cowsay(text) From fba0a7128ad99f7656e58bf409930067c482d85b Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:51:48 +0000 Subject: [PATCH 67/89] CI code cleanup --- dimos/mapping/occupancy/path_map.py | 4 +--- dimos/mapping/occupancy/types.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/dimos/mapping/occupancy/path_map.py b/dimos/mapping/occupancy/path_map.py index 7392030298..a1a4640007 100644 --- a/dimos/mapping/occupancy/path_map.py +++ b/dimos/mapping/occupancy/path_map.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dimos.mapping.occupancy.types import NavigationStrategy - from dimos.mapping.occupancy.gradient import voronoi_gradient from dimos.mapping.occupancy.inflation import simple_inflate from dimos.mapping.occupancy.operations import overlay_occupied, smooth_occupied +from dimos.mapping.occupancy.types import NavigationStrategy from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid - def make_navigation_map( occupancy_grid: OccupancyGrid, robot_width: float, strategy: NavigationStrategy ) -> OccupancyGrid: diff --git a/dimos/mapping/occupancy/types.py b/dimos/mapping/occupancy/types.py index e6b7d5bd6b..87f2084698 100644 --- a/dimos/mapping/occupancy/types.py +++ b/dimos/mapping/occupancy/types.py @@ -1,3 +1,17 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Literal, TypeAlias NavigationStrategy: TypeAlias = Literal["simple", "mixed"] From 593c4180c17c3a857e1ef023e9a5ac264915731d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 17 Mar 2026 10:27:10 -0700 Subject: [PATCH 68/89] fix: address Paul's PR review comments - Use strict=True instead of strict=False in zip() calls (module_coordinator.py) - Fix mutable default dict for rpc_timeouts using Field(default_factory=...) 
(module.py) - Remove unnecessary getattr() for _unsub_fns in _cleanup() (docker_runner.py) - Use threading.Event instead of bool for _running flag (docker_runner.py) - Rename global_config kwarg to g to match ModuleConfig field name (docker_runner.py, module_coordinator.py, docker_worker_manager.py) - Move inline test imports to top of file (test_docker_deployment.py) - Sort imports in hello_docker.py example --- dimos/core/docker_runner.py | 21 +++++++++--------- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module.py | 3 ++- dimos/core/module_coordinator.py | 8 +++---- dimos/core/tests/test_docker_deployment.py | 25 +++++++--------------- 5 files changed, 25 insertions(+), 34 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3efc05f316..fb5770325b 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -183,8 +183,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non image_exists, ) - # global_config is passed by deploy pipeline but isn't a config field - kwargs.pop("global_config", None) + # g (GlobalConfig) is passed by deploy pipeline but handled by the base config + kwargs.pop("g", None) config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): @@ -198,7 +198,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.config = config self._args = args self._kwargs = kwargs - self._running = False + self._running = threading.Event() self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -259,7 +259,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) self.rpc.start() - self._running = True + self._running.set() # docker run 
-d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. self._wait_for_rpc() @@ -299,9 +299,9 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: + if not self._running.is_set(): return - self._running = False # claim shutdown before any side-effects + self._running.clear() # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() @@ -310,11 +310,10 @@ def _cleanup(self) -> None: """Release all resources. Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in getattr(self, "_unsub_fns", []): + for unsub in self._unsub_fns: with suppress(Exception): unsub() - with suppress(Exception): - self._unsub_fns.clear() + self._unsub_fns.clear() if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( @@ -323,7 +322,7 @@ def _cleanup(self) -> None: ) with suppress(Exception): _remove_container(self.config, self._container_name) - self._running = False + self._running.clear() logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: @@ -332,7 +331,7 @@ def status(self) -> dict[str, Any]: "module": self.remote_name, "container_name": self._container_name, "image": cfg.docker_image, - "running": bool(self._running and _is_container_running(cfg, self._container_name)), + "running": self._running.is_set() and _is_container_running(cfg, self._container_name), } def tail_logs(self, n: int = 200) -> str: diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 520468182f..94a5793c3d 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -47,6 +47,6 @@ def _on_errors( return safe_thread_map( specs, - 
lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] + lambda spec: DockerModule(spec[0], g=spec[1], **spec[2]), # type: ignore[arg-type] _on_errors, ) diff --git a/dimos/core/module.py b/dimos/core/module.py index 64f7dd65cf..2e03e2484e 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -30,6 +30,7 @@ ) from langchain_core.tools import tool +from pydantic import Field from reactivex.disposable import CompositeDisposable from dimos.core.core import T, rpc @@ -80,7 +81,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) + rpc_timeouts: dict[str, float] = Field(default_factory=lambda: dict(DEFAULT_RPC_TIMEOUTS)) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 43e3e44f0a..d2d1db67be 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -131,7 +131,7 @@ def deploy( deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) # type: ignore[arg-type] + deployed_module = DockerModule(module_class, g=global_config, **kwargs) # type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] @@ -165,7 +165,7 @@ def _deploy_workers() -> None: return assert self._client is not None for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=False + worker_indices, self._client.deploy_parallel(worker_specs), strict=True ): results[index] = module @@ -173,12 +173,12 @@ def 
_deploy_docker() -> None: if not docker_specs: return for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True ): results[index] = module def _register() -> None: - for (module_class, _, _), module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=True): if module is not None: self._deployed_modules[module_class] = module diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a3bb0b716d..3dfb9242c6 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -22,14 +22,16 @@ from __future__ import annotations from pathlib import Path +import threading from unittest.mock import MagicMock, patch import pytest -from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.rpc_client import RpcCall from dimos.core.stream import Out # -- Fixtures: fake module classes ------------------------------------------- @@ -91,9 +93,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with( - FakeDockerModule, global_config=global_config - ) + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() assert result is mock_dm @@ -198,7 +198,6 @@ class 
TestDockerModuleGetattr: def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure @@ -207,7 +206,6 @@ def test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse @@ -216,9 +214,6 @@ def test_getattr_no_recursion_on_cleanup_attrs(self): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - from dimos.core.docker_runner import DockerModule - from dimos.core.rpc_client import RpcCall - dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -235,8 +230,6 @@ def do_thing(self) -> None: ... 
assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - from dimos.core.docker_runner import DockerModule - dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -248,11 +241,10 @@ class TestDockerModuleCleanupReconnect: """Tests for DockerModule._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - from dimos.core.docker_runner import DockerModule - with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event() + dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() @@ -269,11 +261,10 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - from dimos.core.docker_runner import DockerModule - with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event() + dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() From 427816618a935917e71ddaf11258fbc8229a8016 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 02:42:22 -0700 Subject: [PATCH 69/89] fix(ci): fix _DummyRPC init and mypy type-ignore for rpc_transport kwargs - Add __init__(**kwargs) to _DummyRPC in test_sim_module.py to accept rpc_timeouts/default_rpc_timeout kwargs passed by Module.__init__ - Add type: ignore[call-arg] for RPCSpec Protocol constructor call --- dimos/core/module.py | 2 +- dimos/simulation/manipulators/test_sim_module.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dimos/core/module.py b/dimos/core/module.py index 2e03e2484e..59c8833ea8 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -117,7 +117,7 @@ def __init__(self, config_args: dict[str, Any]): self._loop, self._loop_thread = get_loop() 
self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport( + self.rpc = self.config.rpc_transport( # type: ignore[call-arg] rpc_timeouts=self.config.rpc_timeouts, default_rpc_timeout=self.config.default_rpc_timeout, ) diff --git a/dimos/simulation/manipulators/test_sim_module.py b/dimos/simulation/manipulators/test_sim_module.py index 951d4790e3..54d8f21da3 100644 --- a/dimos/simulation/manipulators/test_sim_module.py +++ b/dimos/simulation/manipulators/test_sim_module.py @@ -22,6 +22,9 @@ class _DummyRPC(RPCSpec): + def __init__(self, **kwargs: object) -> None: # type: ignore[no-untyped-def] + pass + def serve_module_rpc(self, _module) -> None: # type: ignore[no-untyped-def] return None From 47737b09eec1fec17bdce898a6a74c4c5d48fcdf Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 03:15:27 -0700 Subject: [PATCH 70/89] fix(mypy): add __all__ to vl/create.py for explicit VlModelName export --- dimos/models/vl/create.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index bb14758bcb..9d2a908532 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -1,9 +1,10 @@ from typing import Any from dimos.models.vl.base import VlModel - from dimos.models.vl.types import VlModelName +__all__ = ["VlModelName", "create"] + def create(name: VlModelName) -> VlModel[Any]: # This uses inline imports to only import what's needed. From 157ce93717695bb9145f5260cc561bc408a3a6ce Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 04:27:47 -0700 Subject: [PATCH 71/89] fix(test): wrap coordinator in try/finally for proper cleanup on test failure Address Paul's review comment: if an assertion fails before coordinator.stop(), cleanup won't run. Use try/finally to ensure stop() is always called, even when tests fail. 
--- dimos/core/tests/test_docker_deployment.py | 92 +++++++++++----------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 3dfb9242c6..d8eb9448ff 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -87,19 +87,19 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ coordinator = ModuleCoordinator() coordinator.start() - - result = coordinator.deploy(FakeDockerModule) - - # Should NOT go through worker manager - mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() - assert result is mock_dm - assert coordinator.get_instance(FakeDockerModule) is mock_dm - - coordinator.stop() + try: + result = coordinator.deploy(FakeDockerModule) + + # Should NOT go through worker manager + mock_worker_mgr.deploy.assert_not_called() + # Should construct a DockerModule (container launch happens inside __init__) + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() + assert result is mock_dm + assert coordinator.get_instance(FakeDockerModule) is mock_dm + finally: + coordinator.stop() @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") @@ -114,11 +114,11 @@ def test_deploy_docker_propagates_constructor_failure( coordinator = ModuleCoordinator() coordinator.start() - - with pytest.raises(RuntimeError, match="launch failed"): - coordinator.deploy(FakeDockerModule) - - coordinator.stop() + try: + with pytest.raises(RuntimeError, match="launch 
failed"): + coordinator.deploy(FakeDockerModule) + finally: + coordinator.stop() @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): @@ -129,13 +129,13 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator = ModuleCoordinator() coordinator.start() + try: + result = coordinator.deploy(FakeRegularModule) - result = coordinator.deploy(FakeRegularModule) - - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) - assert result is mock_proxy - - coordinator.stop() + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) + assert result is mock_proxy + finally: + coordinator.stop() @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") @patch("dimos.core.module_coordinator.WorkerManager") @@ -153,25 +153,25 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator = ModuleCoordinator() coordinator.start() - - specs = [ - (FakeRegularModule, (), {}), - (FakeDockerModule, (), {}), - ] - results = coordinator.deploy_parallel(specs) - - # Regular module goes through worker manager - mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker specs go through DockerWorkerManager - mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() - - # Results preserve input order - assert results[0] is regular_proxy - assert results[1] is mock_dm - - coordinator.stop() + try: + specs = [ + (FakeRegularModule, (), {}), + (FakeDockerModule, (), {}), + ] + results = coordinator.deploy_parallel(specs) + + # Regular module goes through worker manager + mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + # Docker specs go through DockerWorkerManager + 
mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() + + # Results preserve input order + assert results[0] is regular_proxy + assert results[1] is mock_dm + finally: + coordinator.stop() @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") @@ -184,8 +184,10 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke coordinator = ModuleCoordinator() coordinator.start() - coordinator.deploy(FakeDockerModule) - coordinator.stop() + try: + coordinator.deploy(FakeDockerModule) + finally: + coordinator.stop() # stop() called exactly once (no double cleanup) assert mock_dm.stop.call_count == 1 From 07b33dd8fd67fd7899bc8a3bf0f6a96c0afda61b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 16:49:01 -0700 Subject: [PATCH 72/89] add build --- dimos/core/blueprints.py | 1 + dimos/core/docker_runner.py | 48 ++++++++++++++-------- dimos/core/docker_worker_manager.py | 11 ++--- dimos/core/module.py | 13 +++++- dimos/core/module_coordinator.py | 24 +++++++++-- dimos/core/rpc_client.py | 2 + dimos/core/tests/test_docker_deployment.py | 16 +++----- dimos/protocol/rpc/spec.py | 5 ++- 8 files changed, 82 insertions(+), 38 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index cac8507881..823488c611 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -494,6 +494,7 @@ def build( self._connect_rpc_methods(module_coordinator) self._connect_module_refs(module_coordinator) + module_coordinator.build_all_modules() module_coordinator.start_all_modules() return module_coordinator diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3efc05f316..3e7376a66d 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -167,7 +167,9 @@ class DockerModule(ModuleProxyProtocol): Host-side handle 
for a module running inside Docker. Lifecycle: - - start(): builds the image if needed, launches the container, waits for readiness, calls the remote module's start() RPC (after streams are wired) + - __init__(): lightweight setup — config, names, RPC client, no side-effects + - build(): heavy work — docker build/pull image, launch container, wait for RPC readiness + - start(): invoke remote module's start() RPC (after streams are wired) - stop(): stops the container and cleans up Communication: All RPC happens via LCM multicast (requires --network=host). @@ -176,13 +178,6 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - from dimos.core.docker_build import ( - _compute_build_hash, - _get_image_build_hash, - build_image, - image_exists, - ) - # global_config is passed by deploy pipeline but isn't a config field kwargs.pop("global_config", None) @@ -198,7 +193,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.config = config self._args = args self._kwargs = kwargs - self._running = False + self._running = threading.Event() + self._is_built = False self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -216,7 +212,23 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - # Build or pull image, launch container, wait for RPC server + def build(self) -> None: + """Build/pull docker image, launch container, wait for RPC readiness. + + Idempotent — safe to call multiple times. Has no RPC timeout since + this runs host-side (not via RPC to a worker process). 
+ """ + if self._is_built: + return + + from dimos.core.docker_build import ( + _compute_build_hash, + _get_image_build_hash, + build_image, + image_exists, + ) + + config = self.config try: if config.docker_file is not None: current_hash = _compute_build_hash(config) @@ -259,10 +271,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) self.rpc.start() - self._running = True + self._running.set() # docker run -d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. self._wait_for_rpc() + self._is_built = True except Exception: with suppress(Exception): self._cleanup() @@ -299,9 +312,9 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: + if not self._running.is_set(): return - self._running = False # claim shutdown before any side-effects + self._running.clear() # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() @@ -310,11 +323,10 @@ def _cleanup(self) -> None: """Release all resources. 
Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in getattr(self, "_unsub_fns", []): + for unsub in self._unsub_fns: with suppress(Exception): unsub() - with suppress(Exception): - self._unsub_fns.clear() + self._unsub_fns.clear() if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( @@ -323,7 +335,7 @@ def _cleanup(self) -> None: ) with suppress(Exception): _remove_container(self.config, self._container_name) - self._running = False + self._running.clear() logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: @@ -332,7 +344,7 @@ def status(self) -> dict[str, Any]: "module": self.remote_name, "container_name": self._container_name, "image": cfg.docker_image, - "running": bool(self._running and _is_container_running(cfg, self._container_name)), + "running": self._running.is_set() and _is_container_running(cfg, self._container_name), } def tail_logs(self, n: int = 200) -> str: diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 520468182f..824bccdaed 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -45,8 +45,9 @@ def _on_errors( mod.stop() raise ExceptionGroup("docker deploy_parallel failed", errors) - return safe_thread_map( - specs, - lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] - _on_errors, - ) + def _deploy_one(spec: ModuleSpec) -> DockerModule: + mod = DockerModule(spec[0], global_config=spec[1], **spec[2]) # type: ignore[arg-type] + mod.build() + return mod + + return safe_thread_map(specs, _deploy_one, _on_errors) diff --git a/dimos/core/module.py b/dimos/core/module.py index 64f7dd65cf..4d7ad37719 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,6 +40,8 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import 
In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC +from types import MappingProxyType + from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec @@ -80,7 +82,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) + rpc_timeouts: MappingProxyType[str, float] = DEFAULT_RPC_TIMEOUTS tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -132,6 +134,15 @@ def frame_id(self) -> str: return f"{self.config.frame_id_prefix}/{base}" return base + @rpc + def build(self) -> None: + """Optional build step for heavy one-time work (docker builds, LFS downloads, etc.). + + Called after deploy and stream wiring but before start(). + Has a very long timeout (24h) so long-running builds don't fail. + Default is a no-op — override in subclasses that need a build step. 
+ """ + @rpc def start(self) -> None: pass diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 43e3e44f0a..f5fd340f02 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -165,7 +165,7 @@ def _deploy_workers() -> None: return assert self._client is not None for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=False + worker_indices, self._client.deploy_parallel(worker_specs), strict=True ): results[index] = module @@ -173,12 +173,12 @@ def _deploy_docker() -> None: if not docker_specs: return for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True ): results[index] = module def _register() -> None: - for (module_class, _, _), module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=True): if module is not None: self._deployed_modules[module_class] = module @@ -192,6 +192,24 @@ def _on_errors( _register() return results + def build_all_modules(self) -> None: + """Call build() on all deployed modules in parallel. + + build() handles heavy one-time work (docker builds, LFS downloads, etc.) + with a very long timeout. Must be called after deploy and stream wiring + but before start_all_modules(). + """ + modules = list(self._deployed_modules.values()) + if not modules: + raise ValueError("No modules deployed. 
Call deploy() before build_all_modules().") + + def _on_build_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + raise ExceptionGroup("build_all_modules failed", errors) + + safe_thread_map(modules, lambda m: m.build(), _on_build_errors) + def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) if not modules: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 7ac34bb645..46182b7556 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -91,6 +91,7 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" + def build(self) -> None: ... def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -179,5 +180,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] # why? because the RPCClient instance is going to have all the methods of a Module # but those methods/attributes are super dynamic, so the type hints can't figure that out class ModuleProxy(RPCClient, Module): # type: ignore[misc] + def build(self) -> None: ... def start(self) -> None: ... def stop(self) -> None: ... 
diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a3bb0b716d..d4c0d579d4 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -26,10 +26,13 @@ import pytest -from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +import threading + +from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.rpc_client import RpcCall from dimos.core.stream import Out # -- Fixtures: fake module classes ------------------------------------------- @@ -198,7 +201,6 @@ class TestDockerModuleGetattr: def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure @@ -207,7 +209,6 @@ def test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse @@ -216,8 +217,6 @@ def test_getattr_no_recursion_on_cleanup_attrs(self): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - from dimos.core.docker_runner import DockerModule - from dimos.core.rpc_client import RpcCall dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -235,7 +234,6 @@ def do_thing(self) -> None: ... 
assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -248,11 +246,10 @@ class TestDockerModuleCleanupReconnect: """Tests for DockerModule._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - from dimos.core.docker_runner import DockerModule with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event(); dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() @@ -269,11 +266,10 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - from dimos.core.docker_runner import DockerModule with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event(); dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 993f6044bb..5b1b8bcb67 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -34,7 +34,10 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] # module.py and other places imports these constants and choose what to give RPCClient # the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({"start": 1200.0}) +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({ + "build": 86400.0, # 24h — docker builds, LFS downloads, etc. 
+ "start": 1200.0, +}) class RPCClient(Protocol): From 97b7e0df0296160a018fec6660afd9b5bd221980 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 20 Mar 2026 13:58:28 -0700 Subject: [PATCH 73/89] fix: thread leak in native module test + show docker pull output - test_process_crash_triggers_stop: call mod.stop() after watchdog cleanup to release LCM transport and event loop threads (fixes CI thread leak error) - docker pull: remove stderr=subprocess.PIPE so both stdout and stderr are visible during pulls (progress bars, layer downloads) --- dimos/core/docker_runner.py | 3 +-- dimos/core/test_native_module.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index fb5770325b..d76845bb1a 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -229,12 +229,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non r = subprocess.run( [config.docker_bin, "pull", config.docker_image], text=True, - stderr=subprocess.PIPE, timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\nSTDERR:\n{r.stderr}" + f"Failed to pull image '{config.docker_image}'." 
) reconnect = False diff --git a/dimos/core/test_native_module.py b/dimos/core/test_native_module.py index e77b8f9a53..31d6050818 100644 --- a/dimos/core/test_native_module.py +++ b/dimos/core/test_native_module.py @@ -107,6 +107,9 @@ def test_process_crash_triggers_stop() -> None: assert mod._process is None, f"Watchdog did not clean up after process {pid} died" + # Ensure all threads (LCM transport, event loop) are cleaned up + mod.stop() + @pytest.mark.slow def test_manual(dimos_cluster: ModuleCoordinator, args_file: str) -> None: From fbc146ac97fad63d49f7d54cde17fe02738dbba2 Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:07:00 +0000 Subject: [PATCH 74/89] CI code cleanup --- dimos/core/docker_runner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index d76845bb1a..10a194a920 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -232,9 +232,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non timeout=config.docker_pull_timeout, ) if r.returncode != 0: - raise RuntimeError( - f"Failed to pull image '{config.docker_image}'." 
- ) + raise RuntimeError(f"Failed to pull image '{config.docker_image}'.") reconnect = False if _is_container_running(config, self._container_name): From f09875c3a507d31cff0b12ae44194379c4b29184 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 21:59:10 -0700 Subject: [PATCH 75/89] chore: regenerate uv.lock after merge with dev --- uv.lock | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/uv.lock b/uv.lock index 529842294b..5d1272f673 100644 --- a/uv.lock +++ b/uv.lock @@ -1859,7 +1859,9 @@ dev = [ ] docker = [ { name = "dimos-lcm" }, + { name = "langchain-core" }, { name = "lcm" }, + { name = "matplotlib" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "open3d", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, @@ -1877,6 +1879,7 @@ docker = [ { name = "sortedcontainers" }, { name = "structlog" }, { name = "typer" }, + { name = "typing-extensions" }, ] drone = [ { name = "pymavlink" }, @@ -2020,6 +2023,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = "==1.2.3" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-core", marker = "extra == 'agents'", specifier = "==1.2.3" }, + { name = "langchain-core", marker = "extra == 'docker'" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -2031,6 +2035,7 @@ requires-dist = [ { name = "llvmlite", specifier = ">=0.42.0" }, { name = "lxml-stubs", marker = "extra == 'dev'", specifier = ">=0.5.1,<1" }, { name = "lz4", specifier = ">=4.4.5" }, + { name = "matplotlib", 
marker = "extra == 'docker'" }, { name = "matplotlib", marker = "extra == 'manipulation'", specifier = ">=3.7.1" }, { name = "md-babel-py", marker = "extra == 'dev'", specifier = "==1.1.1" }, { name = "moondream", marker = "extra == 'perception'" }, @@ -2142,6 +2147,7 @@ requires-dist = [ { name = "types-tensorflow", marker = "extra == 'dev'", specifier = ">=2.18.0.20251008,<3" }, { name = "types-tqdm", marker = "extra == 'dev'", specifier = ">=4.67.0.20250809,<5" }, { name = "typing-extensions", marker = "python_full_version < '3.11'", specifier = ">=4.0" }, + { name = "typing-extensions", marker = "extra == 'docker'" }, { name = "ultralytics", marker = "extra == 'perception'", specifier = ">=8.3.70" }, { name = "unitree-webrtc-connect-leshy", marker = "extra == 'unitree'", specifier = ">=2.0.7" }, { name = "uvicorn", marker = "extra == 'web'", specifier = ">=0.34.0" }, From 317c487a2f05543059cabc159b422d9d9f79cb39 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:21:53 -0700 Subject: [PATCH 76/89] fix(docker): include stdout/stderr in pull error message When docker pull fails, the error message now includes the actual output to help diagnose auth/network/registry issues. 
Revert: git revert HEAD --- dimos/core/docker_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 10a194a920..06b12c7512 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -229,10 +229,14 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non r = subprocess.run( [config.docker_bin, "pull", config.docker_image], text=True, + capture_output=True, timeout=config.docker_pull_timeout, ) if r.returncode != 0: - raise RuntimeError(f"Failed to pull image '{config.docker_image}'.") + raise RuntimeError( + f"Failed to pull image '{config.docker_image}'.\n" + f"stdout: {r.stdout}\nstderr: {r.stderr}" + ) reconnect = False if _is_container_running(config, self._container_name): From 91a13f1e7251fbab1657cc290da75e0c1976fe3c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:22:30 -0700 Subject: [PATCH 77/89] fix(tests): import ExceptionGroup in test_parallel_deploy_cleanup Test file used ExceptionGroup without importing it, causing NameError on Python < 3.11. Import from safe_thread_map where it's polyfilled. 
Revert: git revert HEAD --- dimos/core/tests/test_parallel_deploy_cleanup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 1987fa4be7..ef6bf4b879 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -24,6 +24,8 @@ import pytest +from dimos.utils.safe_thread_map import ExceptionGroup + class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" From 42f3797fe6c7eb9e13c046be26c02ed72a947c2d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:24:45 -0700 Subject: [PATCH 78/89] docs: add changes.md with fix descriptions and revert instructions --- changes.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 changes.md diff --git a/changes.md b/changes.md new file mode 100644 index 0000000000..f5982fce13 --- /dev/null +++ b/changes.md @@ -0,0 +1,24 @@ +# PR #1431 (Docker Restoration) — Paul Review Fixes + +## Commits (local, not pushed) + +### 1. `317c487a2` — Include stdout/stderr in docker pull error +- Pull failures were silent — no diagnostic output +- Now includes both stdout and stderr in exception +- **Revert:** `git revert 317c487a2` + +### 2. `91a13f1e7` — Import ExceptionGroup in test file +- Test used ExceptionGroup without import → NameError on Python < 3.11 +- Now imports from safe_thread_map polyfill +- **Revert:** `git revert 91a13f1e7` + +## Reviewer was wrong on +- `rpc_timeouts` class-level mutable dict — it's in ModuleConfig (pydantic) with `Field(default_factory=...)`, which is correct + +## Not addressed (need Jeff's input / bigger refactor) +- Container launch in `__init__` vs `start()` — lifecycle redesign +- Deterministic container naming (removed PID+timestamp) — collision risk +- `docker_gpus` default None (was "all") — intentional breaking change? 
+- `docker_restart_policy` default "no" (was "on-failure:3") — same +- Build hash includes original Dockerfile, not converted (with footer) +- `getattr(default_config, "rpc_timeouts", ...)` returns FieldInfo on class From 30d87a6c30d589102315fbdf22c2ba2d6deb74e8 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 14:57:04 -0700 Subject: [PATCH 79/89] cleanup g passing --- dimos/core/docker_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 005b55fb3b..9b9c658bcb 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -178,9 +178,6 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - # g (GlobalConfig) is passed by deploy pipeline but isn't a config field - kwargs.pop("g", None) - config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): raise TypeError( From 8f23d0996912e2d50e4a5e5532b9b94e5e7629f5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:23:39 -0700 Subject: [PATCH 80/89] cleanup --- dimos/core/docker_runner.py | 3 ++- dimos/core/module.py | 5 ++--- dimos/core/module_coordinator.py | 6 +++++- dimos/core/test_core.py | 2 +- dimos/protocol/rpc/spec.py | 10 ++++++---- dimos/utils/safe_thread_map.py | 6 ++---- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 9b9c658bcb..8b3e39995a 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -155,7 +155,8 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: json.dumps(v) out[k] = v except (TypeError, ValueError): - logger.debug(f"Config field '{k}' not JSON-serializable, skipping") + level = "debug" if k.startswith("_") else "warning" + getattr(logger, level)(f"Config field '{k}' not 
JSON-serializable, skipping") return out diff --git a/dimos/core/module.py b/dimos/core/module.py index 28971f0e4a..ebe1879681 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -30,6 +30,7 @@ ) from langchain_core.tools import tool +from pydantic import Field from reactivex.disposable import CompositeDisposable from dimos.core.core import T, rpc @@ -40,8 +41,6 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from types import MappingProxyType - from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec @@ -82,7 +81,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: MappingProxyType[str, float] = DEFAULT_RPC_TIMEOUTS + rpc_timeouts: dict[str, float] = Field(default_factory=lambda: dict(DEFAULT_RPC_TIMEOUTS)) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index a8b6ec0922..d4778a5c0d 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -14,6 +14,7 @@ from __future__ import annotations +from contextlib import suppress import threading from typing import TYPE_CHECKING, Any @@ -204,8 +205,11 @@ def build_all_modules(self) -> None: raise ValueError("No modules deployed. 
Call deploy() before build_all_modules().") def _on_build_errors( - _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + _outcomes: list[Any], successes: list[Any], errors: list[Exception] ) -> None: + for mod in successes: + with suppress(Exception): + mod.stop() raise ExceptionGroup("build_all_modules failed", errors) safe_thread_map(modules, lambda m: m.build(), _on_build_errors) diff --git a/dimos/core/test_core.py b/dimos/core/test_core.py index 7cd0f89b36..f9a89829d5 100644 --- a/dimos/core/test_core.py +++ b/dimos/core/test_core.py @@ -77,7 +77,7 @@ def test_classmethods() -> None: # Check that we have the expected RPC methods assert "navigate_to" in class_rpcs, "navigate_to should be in rpcs" assert "start" in class_rpcs, "start should be in rpcs" - assert len(class_rpcs) == 8 + assert len(class_rpcs) == 9 # Check that the values are callable assert callable(class_rpcs["navigate_to"]), "navigate_to should be callable" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 5b1b8bcb67..cefd89f449 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -34,10 +34,12 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] # module.py and other places imports these constants and choose what to give RPCClient # the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({ - "build": 86400.0, # 24h — docker builds, LFS downloads, etc. - "start": 1200.0, -}) +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType( + { + "build": 86400.0, # 24h — docker builds, LFS downloads, etc. 
+ "start": 1200.0, + } +) class RPCClient(Protocol): diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index f480f2c97d..514fac2026 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -13,9 +13,10 @@ # limitations under the License. from __future__ import annotations +from collections.abc import Callable, Sequence from concurrent.futures import Future, ThreadPoolExecutor, as_completed import sys -from typing import TYPE_CHECKING, Any, TypeVar +from typing import Any, TypeVar if sys.version_info < (3, 11): @@ -32,9 +33,6 @@ def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] -if TYPE_CHECKING: - from collections.abc import Callable, Sequence - T = TypeVar("T") R = TypeVar("R") From d37a9229cf9f5368725cf6177739484b1078bf77 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:24:04 -0700 Subject: [PATCH 81/89] combine docker_build and runner --- dimos/core/docker_build.py | 150 ------------------ dimos/core/docker_runner.py | 149 +++++++++++++++-- dimos/core/docker_worker_manager.py | 12 +- dimos/core/module_coordinator.py | 4 +- dimos/core/tests/test_docker_deployment.py | 38 ++--- .../tests/test_parallel_deploy_cleanup.py | 8 +- dimos/manipulation/pick_and_place_module.py | 2 +- dimos/test_no_sections.py | 2 +- 8 files changed, 168 insertions(+), 197 deletions(-) delete mode 100644 dimos/core/docker_build.py diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py deleted file mode 100644 index 24fd2b3e44..0000000000 --- a/dimos/core/docker_build.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Docker image building and Dockerfile conversion utilities. -Converts any Dockerfile into a DimOS module container by appending a footer -that installs DimOS and creates the module entrypoint. -""" - -from __future__ import annotations - -import hashlib -import subprocess -from typing import TYPE_CHECKING - -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from pathlib import Path - - from dimos.core.docker_runner import DockerModuleConfig - -logger = setup_logger() - -_BUILD_HASH_LABEL = "dimos.build.hash" - -DOCKER_CMD_TIMEOUT = 20 - -# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) -DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" - -# Footer appended to Dockerfiles for DimOS module conversion -DIMOS_FOOTER = f""" -# ==== {DIMOS_SENTINEL} ==== -# Copy DimOS source from build context -COPY dimos /dimos/source/dimos/ -COPY pyproject.toml /dimos/source/ -COPY docker/python/module-install.sh /tmp/module-install.sh - -# Install DimOS and create entrypoint -RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh - -ENTRYPOINT ["/dimos/entrypoint.sh"] -""" - - -def _convert_dockerfile(dockerfile: Path) -> Path: - """Append DimOS footer to Dockerfile. Returns path to converted file.""" - content = dockerfile.read_text() - - # Already converted? 
- if DIMOS_SENTINEL in content: - return dockerfile - - logger.info(f"Converting {dockerfile.name} to DimOS format") - - converted = dockerfile.parent / f".{dockerfile.name}.ignore" - converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) - return converted - - -def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents and build args.""" - assert cfg.docker_file is not None - digest = hashlib.sha256() - digest.update(cfg.docker_file.read_bytes()) - for key, val in sorted(cfg.docker_build_args.items()): - digest.update(f"{key}={val}".encode()) - for arg in cfg.docker_build_extra_args: - digest.update(arg.encode()) - return digest.hexdigest() - - -def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: - """Read the build hash label from an existing Docker image.""" - r = subprocess.run( - [ - cfg.docker_bin, - "image", - "inspect", - "-f", - '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - cfg.docker_image, - ], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - if r.returncode != 0: - return None - value = r.stdout.strip() - # docker prints "" when the label is missing - return value if value and value != "" else None - - -def build_image(cfg: DockerModuleConfig) -> None: - """Build Docker image using footer mode conversion.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for building Docker images") - - build_hash = _compute_build_hash(cfg) - dockerfile = _convert_dockerfile(cfg.docker_file) - - context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] - cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - for k, v in cfg.docker_build_args.items(): - cmd.extend(["--build-arg", f"{k}={v}"]) - cmd.extend(cfg.docker_build_extra_args) - cmd.append(str(context)) - - logger.info(f"Building Docker image: {cfg.docker_image}") - # Stream stdout to 
terminal so the user sees build progress, but capture - # stderr separately so we can include it in the error message on failure. - result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) - if result.returncode != 0: - raise RuntimeError( - f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" - ) - - -def image_exists(cfg: DockerModuleConfig) -> bool: - """Check if the configured Docker image exists locally.""" - r = subprocess.run( - [cfg.docker_bin, "image", "inspect", cfg.docker_image], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - return r.returncode == 0 - - -__all__ = [ - "DIMOS_FOOTER", - "build_image", - "image_exists", -] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 8b3e39995a..61a050e2f5 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -11,11 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +""" +Docker module support: image building, Dockerfile conversion, host-side +proxy (DockerModuleOuter), and container-side runner (DockerModuleInner). +""" + from __future__ import annotations import argparse from contextlib import suppress from dataclasses import field +import hashlib import importlib import json import signal @@ -53,7 +60,7 @@ class DockerModuleConfig(ModuleConfig): For advanced Docker options not listed here, use docker_extra_args. 
Example: docker_extra_args=["--cap-add=SYS_ADMIN", "--read-only"] - NOTE: a DockerModule will rebuild automatically if the Dockerfile or build args change + NOTE: a DockerModuleOuter will rebuild automatically if the Dockerfile or build args change """ # Build / image @@ -160,10 +167,122 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: return out +# Image building and Dockerfile conversion + + +_BUILD_HASH_LABEL = "dimos.build.hash" + +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) +DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" + +# Footer appended to Dockerfiles for DimOS module conversion +DIMOS_FOOTER = f""" +# ==== {DIMOS_SENTINEL} ==== +# Copy DimOS source from build context +COPY dimos /dimos/source/dimos/ +COPY pyproject.toml /dimos/source/ +COPY docker/python/module-install.sh /tmp/module-install.sh + +# Install DimOS and create entrypoint +RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh + +ENTRYPOINT ["/dimos/entrypoint.sh"] +""" + + +def _convert_dockerfile(dockerfile: Path) -> Path: + """Append DimOS footer to Dockerfile. Returns path to converted file.""" + content = dockerfile.read_text() + + # Already converted? 
+ if DIMOS_SENTINEL in content: + return dockerfile + + logger.info(f"Converting {dockerfile.name} to DimOS format") + + converted = dockerfile.parent / f".{dockerfile.name}.ignore" + converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) + return converted + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents and build args.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for computing build hash") + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) + return digest.hexdigest() + + +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = subprocess.run( + [ + cfg.docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + cfg.docker_image, + ], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + +def build_image(cfg: DockerModuleConfig) -> None: + """Build Docker image using footer mode conversion.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) + dockerfile = _convert_dockerfile(cfg.docker_file) + + context = cfg.docker_build_context or cfg.docker_file.parent + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + for k, v in cfg.docker_build_args.items(): + cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) + cmd.append(str(context)) + + 
logger.info(f"Building Docker image: {cfg.docker_image}") + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. + result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) + if result.returncode != 0: + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) + + +def image_exists(cfg: DockerModuleConfig) -> bool: + """Check if the configured Docker image exists locally.""" + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 + + # Host-side Docker-backed Module handle -class DockerModule(ModuleProxyProtocol): +class DockerModuleOuter(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -219,13 +338,6 @@ def build(self) -> None: if self._is_built: return - from dimos.core.docker_build import ( - _compute_build_hash, - _get_image_build_hash, - build_image, - image_exists, - ) - config = self.config try: if config.docker_file is not None: @@ -401,7 +513,7 @@ def _validate_config(self, cfg: DockerModuleConfig) -> None: using_host_network = cfg.docker_network is None and cfg.docker_network_mode == "host" if not using_host_network: logger.warning( - "DockerModule not using host network. LCM multicast requires --network=host. " + "DockerModuleOuter not using host network. LCM multicast requires --network=host. " "RPC communication may not work with bridge/custom networks." 
) @@ -523,7 +635,7 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: payload_json = json.dumps(payload, separators=(",", ":")) except TypeError as e: raise TypeError( - f"Cannot serialize DockerModule payload to JSON: {e}\n" + f"Cannot serialize DockerModuleOuter payload to JSON: {e}\n" f"Ensure all constructor args/kwargs for {self._module_class.__name__} are " f"JSON-serializable, or use docker_command to bypass automatic payload generation." ) from e @@ -559,10 +671,14 @@ def _wait_for_rpc(self) -> None: ) +# Backwards compatibility alias +DockerModule = DockerModuleOuter + + # Container-side runner -class StandaloneModuleRunner: +class DockerModuleInner: """Runs a module inside Docker container. Blocks until SIGTERM/SIGINT.""" def __init__(self, module_path: str, args: list[Any], kwargs: dict[str, Any]) -> None: @@ -597,7 +713,7 @@ def wait(self) -> None: self._shutdown.wait() -def _install_signal_handlers(runner: StandaloneModuleRunner) -> None: +def _install_signal_handlers(runner: DockerModuleInner) -> None: def shutdown(_sig: int, _frame: Any) -> None: runner.stop() @@ -607,7 +723,7 @@ def shutdown(_sig: int, _frame: Any) -> None: def _cli_run(payload_json: str) -> None: payload = json.loads(payload_json) - runner = StandaloneModuleRunner( + runner = DockerModuleInner( payload["module_path"], payload.get("args", []), payload.get("kwargs", {}), @@ -640,5 +756,10 @@ def main(argv: list[str] | None = None) -> None: __all__ = [ "DockerModule", "DockerModuleConfig", + "DockerModuleInner", + "DockerModuleOuter", + "DIMOS_FOOTER", + "build_image", + "image_exists", "is_docker_module", ] diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 08ea7e3958..4a85bd59f9 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -20,7 +20,7 @@ from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: - from dimos.core.docker_runner import 
DockerModule + from dimos.core.docker_runner import DockerModuleOuter class DockerWorkerManager: @@ -29,24 +29,24 @@ class DockerWorkerManager: @staticmethod def deploy_parallel( specs: list[ModuleSpec], - ) -> list[DockerModule]: + ) -> list[DockerModuleOuter]: """Deploy multiple DockerModules in parallel. If any deployment fails, all successfully-started containers are stopped before an ExceptionGroup is raised. """ - from dimos.core.docker_runner import DockerModule + from dimos.core.docker_runner import DockerModuleOuter def _on_errors( - _outcomes: list[Any], successes: list[DockerModule], errors: list[Exception] + _outcomes: list[Any], successes: list[DockerModuleOuter], errors: list[Exception] ) -> None: for mod in successes: with suppress(Exception): mod.stop() raise ExceptionGroup("docker deploy_parallel failed", errors) - def _deploy_one(spec: ModuleSpec) -> DockerModule: - mod = DockerModule(spec[0], g=spec[1], **spec[2]) # type: ignore[arg-type] + def _deploy_one(spec: ModuleSpec) -> DockerModuleOuter: + mod = DockerModuleOuter(spec[0], g=spec[1], **spec[2]) # type: ignore[arg-type] mod.build() return mod diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d4778a5c0d..5d7b76db78 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -125,14 +125,14 @@ def deploy( **kwargs: Any, ) -> ModuleProxy: # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import DockerModule, is_docker_module + from dimos.core.docker_runner import DockerModuleOuter, is_docker_module if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, g=global_config, **kwargs) # type: ignore[arg-type] + deployed_module = DockerModuleOuter(module_class, g=global_config, **kwargs) 
# type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index d8eb9448ff..a528e07ad9 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -16,7 +16,7 @@ Smoke tests for Docker module deployment routing. These tests verify that the ModuleCoordinator correctly detects and routes -docker modules to DockerModule WITHOUT actually running Docker. +docker modules to DockerModuleOuter WITHOUT actually running Docker. """ from __future__ import annotations @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module +from dimos.core.docker_runner import DockerModuleOuter, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator @@ -76,7 +76,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -92,7 +92,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModule (container launch happens inside __init__) + # Should construct a DockerModuleOuter (container launch happens inside __init__) mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() 
@@ -101,7 +101,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -109,7 +109,7 @@ def test_deploy_docker_propagates_constructor_failure( mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr - # Container launch fails inside __init__; DockerModule handles its own cleanup + # Container launch fails inside __init__; DockerModuleOuter handles its own cleanup mock_docker_module_cls.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() @@ -173,7 +173,7 @@ def test_deploy_parallel_separates_docker_and_regular( finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -195,13 +195,13 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke mock_worker_mgr.close_all.assert_called_once() -class TestDockerModuleGetattr: - """Tests for DockerModule.__getattr__ avoiding infinite recursion.""" +class TestDockerModuleOuterGetattr: + """Tests for DockerModuleOuter.__getattr__ avoiding infinite recursion.""" def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure with pytest.raises(AttributeError): _ = dm.some_method @@ -209,14 +209,14 @@ def 
test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse for attr in ("rpc", "config", "_container_name", "_unsub_fns"): with pytest.raises(AttributeError): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm.rpcs = {"do_thing"} # _module_class needs a real method with __name__ for RpcCall @@ -232,19 +232,19 @@ def do_thing(self) -> None: ... assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm.rpcs = {"do_thing"} with pytest.raises(AttributeError, match="not found"): _ = dm.nonexistent -class TestDockerModuleCleanupReconnect: - """Tests for DockerModule._cleanup with docker_reconnect_container.""" +class TestDockerModuleOuterCleanupReconnect: + """Tests for DockerModuleOuter._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - with patch.object(DockerModule, "__init__", lambda self: None): - dm = DockerModule.__new__(DockerModule) + with patch.object(DockerModuleOuter, "__init__", lambda self: None): + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm._running = threading.Event() dm._running.set() dm._container_name = "test_container" @@ -263,8 +263,8 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - with patch.object(DockerModule, "__init__", lambda self: None): - dm = DockerModule.__new__(DockerModule) + with patch.object(DockerModuleOuter, "__init__", lambda self: None): + dm = 
DockerModuleOuter.__new__(DockerModuleOuter) dm._running = threading.Event() dm._running.set() dm._container_name = "test_container" diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index ef6bf4b879..adfd1f7a36 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -30,7 +30,7 @@ class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -69,7 +69,7 @@ def fake_constructor(cls, *args, **kwargs): mod_a.stop.assert_called_once() mod_c.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -110,7 +110,7 @@ def fake_constructor(cls, *args, **kwargs): # The one successful module must have been stopped mod_a.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -138,7 +138,7 @@ def fake_constructor(cls, *args, **kwargs): for m in mocks: m.stop.assert_not_called() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" from dimos.core.docker_worker_manager import DockerWorkerManager diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index 81e7bcf2d3..e519d82c87 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -30,7 +30,7 @@ from dimos.agents.annotation import skill from dimos.constants import DIMOS_PROJECT_ROOT from dimos.core.core import rpc -from dimos.core.docker_runner import DockerModule as DockerRunner +from dimos.core.docker_runner import DockerModuleOuter as DockerRunner from dimos.core.stream import In from dimos.manipulation.grasping.graspgen_module import GraspGenModule from dimos.manipulation.manipulation_module import ( diff --git a/dimos/test_no_sections.py b/dimos/test_no_sections.py index 9523c0aae2..63c6c42c81 100644 --- a/dimos/test_no_sections.py +++ b/dimos/test_no_sections.py @@ -58,7 +58,7 @@ # Each entry is (relative_path, line_substring) — if both match, the line is skipped. 
WHITELIST = [ # Sentinel marker used at runtime to detect already-converted Dockerfiles - ("dimos/core/docker_build.py", "DIMOS_SENTINEL"), + ("dimos/core/docker_runner.py", "DIMOS_SENTINEL"), ] From 79d7817fc242b0f1ce2e731408b8f0039918cc9b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:28:25 -0700 Subject: [PATCH 82/89] rename docker_runner to module --- .../core/{docker_runner.py => docker_module.py} | 4 ++-- dimos/core/docker_worker_manager.py | 4 ++-- dimos/core/module_coordinator.py | 8 ++++---- dimos/core/tests/test_docker_deployment.py | 16 ++++++++-------- dimos/core/tests/test_parallel_deploy_cleanup.py | 8 ++++---- dimos/manipulation/grasping/graspgen_module.py | 2 +- dimos/manipulation/pick_and_place_module.py | 2 +- dimos/test_no_sections.py | 2 +- docker/python/module-install.sh | 2 +- examples/docker_hello_world/hello_docker.py | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) rename dimos/core/{docker_runner.py => docker_module.py} (99%) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_module.py similarity index 99% rename from dimos/core/docker_runner.py rename to dimos/core/docker_module.py index 61a050e2f5..1880aa0dbd 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_module.py @@ -630,7 +630,7 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # Filter out docker-specific kwargs (paths, etc.) 
- only pass module config kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} - # DimOS base image entrypoint already runs "dimos.core.docker_runner run" + # DimOS base image entrypoint already runs "dimos.core.docker_module run" try: payload_json = json.dumps(payload, separators=(",", ":")) except TypeError as e: @@ -734,7 +734,7 @@ def _cli_run(payload_json: str) -> None: def main(argv: list[str] | None = None) -> None: - parser = argparse.ArgumentParser(prog="dimos.core.docker_runner") + parser = argparse.ArgumentParser(prog="dimos.core.docker_module") sub = parser.add_subparsers(dest="cmd", required=True) runp = sub.add_parser("run", help="Run a module inside a container") diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 4a85bd59f9..94b4e973c8 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -20,7 +20,7 @@ from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: - from dimos.core.docker_runner import DockerModuleOuter + from dimos.core.docker_module import DockerModuleOuter class DockerWorkerManager: @@ -35,7 +35,7 @@ def deploy_parallel( If any deployment fails, all successfully-started containers are stopped before an ExceptionGroup is raised. 
""" - from dimos.core.docker_runner import DockerModuleOuter + from dimos.core.docker_module import DockerModuleOuter def _on_errors( _outcomes: list[Any], successes: list[DockerModuleOuter], errors: list[Exception] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 5d7b76db78..4937a2e121 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -124,8 +124,8 @@ def deploy( global_config: GlobalConfig = global_config, **kwargs: Any, ) -> ModuleProxy: - # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import DockerModuleOuter, is_docker_module + # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator + from dimos.core.docker_module import DockerModuleOuter, is_docker_module if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") @@ -139,8 +139,8 @@ def deploy( return deployed_module # type: ignore[return-value] def deploy_parallel(self, module_specs: list[ModuleSpec]) -> list[ModuleProxy]: - # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import is_docker_module + # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator + from dimos.core.docker_module import is_docker_module if not self._client: raise ValueError("Not started") diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a528e07ad9..55e96d3b72 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_runner import DockerModuleOuter, DockerModuleConfig, is_docker_module +from dimos.core.docker_module import DockerModuleOuter, DockerModuleConfig, 
is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator @@ -76,7 +76,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -101,7 +101,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -173,7 +173,7 @@ def test_deploy_parallel_separates_docker_and_regular( finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -255,8 +255,8 @@ def test_cleanup_skips_stop_when_reconnect(self): # reconnect mode: should NOT stop/rm the container dm.config = FakeDockerConfig(docker_reconnect_container=True) with ( - patch("dimos.core.docker_runner._run") as mock_run, - patch("dimos.core.docker_runner._remove_container") as mock_rm, + patch("dimos.core.docker_module._run") as mock_run, + patch("dimos.core.docker_module._remove_container") as mock_rm, ): dm._cleanup() mock_run.assert_not_called() @@ -275,8 +275,8 @@ def test_cleanup_stops_container_when_not_reconnect(self): # normal mode: should stop and rm the container dm.config = FakeDockerConfig(docker_reconnect_container=False) with ( 
- patch("dimos.core.docker_runner._run") as mock_run, - patch("dimos.core.docker_runner._remove_container") as mock_rm, + patch("dimos.core.docker_module._run") as mock_run, + patch("dimos.core.docker_module._remove_container") as mock_rm, ): dm._cleanup() mock_run.assert_called_once() # docker stop diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index adfd1f7a36..212daa9a49 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -30,7 +30,7 @@ class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -69,7 +69,7 @@ def fake_constructor(cls, *args, **kwargs): mod_a.stop.assert_called_once() mod_c.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -110,7 +110,7 @@ def fake_constructor(cls, *args, **kwargs): # The one successful module must have been stopped mod_a.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -138,7 +138,7 @@ def fake_constructor(cls, *args, **kwargs): for m in mocks: m.stop.assert_not_called() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" from dimos.core.docker_worker_manager import DockerWorkerManager diff --git a/dimos/manipulation/grasping/graspgen_module.py b/dimos/manipulation/grasping/graspgen_module.py index ae2d59512a..3cca54dc2f 100644 --- a/dimos/manipulation/grasping/graspgen_module.py +++ b/dimos/manipulation/grasping/graspgen_module.py @@ -22,7 +22,7 @@ import numpy as np from dimos.core.core import rpc -from dimos.core.docker_runner import DockerModuleConfig +from dimos.core.docker_module import DockerModuleConfig from dimos.core.module import Module from dimos.core.stream import Out from dimos.msgs.geometry_msgs.PoseArray import PoseArray diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index e519d82c87..2d8bcd1584 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -30,7 +30,7 @@ from dimos.agents.annotation import skill from dimos.constants import DIMOS_PROJECT_ROOT from dimos.core.core import rpc -from dimos.core.docker_runner import 
DockerModuleOuter as DockerRunner +from dimos.core.docker_module import DockerModuleOuter as DockerRunner from dimos.core.stream import In from dimos.manipulation.grasping.graspgen_module import GraspGenModule from dimos.manipulation.manipulation_module import ( diff --git a/dimos/test_no_sections.py b/dimos/test_no_sections.py index 63c6c42c81..902288b2e6 100644 --- a/dimos/test_no_sections.py +++ b/dimos/test_no_sections.py @@ -58,7 +58,7 @@ # Each entry is (relative_path, line_substring) — if both match, the line is skipped. WHITELIST = [ # Sentinel marker used at runtime to detect already-converted Dockerfiles - ("dimos/core/docker_runner.py", "DIMOS_SENTINEL"), + ("dimos/core/docker_module.py", "DIMOS_SENTINEL"), ] diff --git a/docker/python/module-install.sh b/docker/python/module-install.sh index ab0aea1032..7c0c54b5f8 100644 --- a/docker/python/module-install.sh +++ b/docker/python/module-install.sh @@ -66,7 +66,7 @@ cat > /dimos/entrypoint.sh < Date: Wed, 25 Mar 2026 17:38:19 -0700 Subject: [PATCH 83/89] add ModuleCoordinator docstring --- dimos/core/module_coordinator.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 4937a2e121..d70c10035c 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -35,6 +35,15 @@ class ModuleCoordinator(Resource): # type: ignore[misc] + """ + There should only ever be one module coordinator instance (this is a singleton) + - Module (classes) should be able to be deployed, stopped, and re-deployed in on one instance of ModuleCoordinator + - Arguably ModuleCoordinator could be called the "DimosRuntime" + - ModuleCoordinator is responsible for all global "addresses". 
+ Ex: it should make sure all modules are using the same LCM url, the same rerun port, etc + (it may not do all of that at time of writing but that is the intention/job of this class) + - Modules shouldn't be deployed on their own (except for testing) + """ _client: WorkerManager | None = None _global_config: GlobalConfig _n: int | None = None From 2d321e3d808f98ada0a2766ca8c8a0d71da21808 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 18:51:44 -0700 Subject: [PATCH 84/89] use threading utils --- dimos/core/docker_module.py | 326 ++++--- dimos/core/module_coordinator.py | 4 +- dimos/core/tests/test_docker_deployment.py | 2 +- .../tests/test_parallel_deploy_cleanup.py | 2 +- dimos/core/worker_manager.py | 3 +- ...er_manager.py => worker_manager_docker.py} | 3 +- dimos/utils/safe_thread_map.py | 108 --- dimos/utils/test_thread_utils.py | 888 ++++++++++++++++++ dimos/utils/thread_utils.py | 550 +++++++++++ dimos/utils/typing_utils.py | 45 + 10 files changed, 1654 insertions(+), 277 deletions(-) rename dimos/core/{docker_worker_manager.py => worker_manager_docker.py} (94%) delete mode 100644 dimos/utils/safe_thread_map.py create mode 100644 dimos/utils/test_thread_utils.py create mode 100644 dimos/utils/thread_utils.py create mode 100644 dimos/utils/typing_utils.py diff --git a/dimos/core/docker_module.py b/dimos/core/docker_module.py index 1880aa0dbd..dc0ffd533f 100644 --- a/dimos/core/docker_module.py +++ b/dimos/core/docker_module.py @@ -35,6 +35,7 @@ from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger +from dimos.utils.thread_utils import ThreadSafeVal from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT if TYPE_CHECKING: @@ -125,163 +126,6 @@ def is_docker_module(module_class: type) -> bool: ) -# Docker helpers - - -def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: - 
logger.debug(f"exec: {' '.join(cmd)}") - return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) - - -def _remove_container(cfg: DockerModuleConfig, name: str) -> None: - _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) - - -def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: - r = _run( - [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], - timeout=DOCKER_STATUS_TIMEOUT, - ) - return r.returncode == 0 and r.stdout.strip() == "true" - - -def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: - r = _run([cfg.docker_bin, "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) - out = (r.stdout or "").rstrip() - err = (r.stderr or "").rstrip() - return out + ("\n" + err if err else "") - - -def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: - """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" - out: dict[str, Any] = {} - for k, v in cfg.__dict__.items(): - if k.startswith("docker_") or isinstance(v, type) or callable(v): - continue - try: - json.dumps(v) - out[k] = v - except (TypeError, ValueError): - level = "debug" if k.startswith("_") else "warning" - getattr(logger, level)(f"Config field '{k}' not JSON-serializable, skipping") - return out - - -# Image building and Dockerfile conversion - - -_BUILD_HASH_LABEL = "dimos.build.hash" - -# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) -DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" - -# Footer appended to Dockerfiles for DimOS module conversion -DIMOS_FOOTER = f""" -# ==== {DIMOS_SENTINEL} ==== -# Copy DimOS source from build context -COPY dimos /dimos/source/dimos/ -COPY pyproject.toml /dimos/source/ -COPY docker/python/module-install.sh /tmp/module-install.sh - -# Install DimOS and create entrypoint -RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh - 
-ENTRYPOINT ["/dimos/entrypoint.sh"] -""" - - -def _convert_dockerfile(dockerfile: Path) -> Path: - """Append DimOS footer to Dockerfile. Returns path to converted file.""" - content = dockerfile.read_text() - - # Already converted? - if DIMOS_SENTINEL in content: - return dockerfile - - logger.info(f"Converting {dockerfile.name} to DimOS format") - - converted = dockerfile.parent / f".{dockerfile.name}.ignore" - converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) - return converted - - -def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents and build args.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for computing build hash") - digest = hashlib.sha256() - digest.update(cfg.docker_file.read_bytes()) - for key, val in sorted(cfg.docker_build_args.items()): - digest.update(f"{key}={val}".encode()) - for arg in cfg.docker_build_extra_args: - digest.update(arg.encode()) - return digest.hexdigest() - - -def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: - """Read the build hash label from an existing Docker image.""" - r = subprocess.run( - [ - cfg.docker_bin, - "image", - "inspect", - "-f", - '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - cfg.docker_image, - ], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - if r.returncode != 0: - return None - value = r.stdout.strip() - # docker prints "" when the label is missing - return value if value and value != "" else None - - -def build_image(cfg: DockerModuleConfig) -> None: - """Build Docker image using footer mode conversion.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for building Docker images") - - build_hash = _compute_build_hash(cfg) - dockerfile = _convert_dockerfile(cfg.docker_file) - - context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] - 
cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - for k, v in cfg.docker_build_args.items(): - cmd.extend(["--build-arg", f"{k}={v}"]) - cmd.extend(cfg.docker_build_extra_args) - cmd.append(str(context)) - - logger.info(f"Building Docker image: {cfg.docker_image}") - # Stream stdout to terminal so the user sees build progress, but capture - # stderr separately so we can include it in the error message on failure. - result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) - if result.returncode != 0: - raise RuntimeError( - f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" - ) - - -def image_exists(cfg: DockerModuleConfig) -> bool: - """Check if the configured Docker image exists locally.""" - r = subprocess.run( - [cfg.docker_bin, "image", "inspect", cfg.docker_image], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - return r.returncode == 0 - - -# Host-side Docker-backed Module handle - - class DockerModuleOuter(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -311,7 +155,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._args = args self._kwargs = kwargs self._running = threading.Event() - self._is_built = False + self._is_built = ThreadSafeVal(False) self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -335,7 +179,7 @@ def build(self) -> None: Idempotent — safe to call multiple times. Has no RPC timeout since this runs host-side (not via RPC to a worker process). """ - if self._is_built: + if self._is_built.get(): return config = self.config @@ -386,7 +230,7 @@ def build(self) -> None: # docker run -d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. 
self._wait_for_rpc() - self._is_built = True + self._is_built.set(True) except Exception: with suppress(Exception): self._cleanup() @@ -675,9 +519,6 @@ def _wait_for_rpc(self) -> None: DockerModule = DockerModuleOuter -# Container-side runner - - class DockerModuleInner: """Runs a module inside Docker container. Blocks until SIGTERM/SIGINT.""" @@ -713,6 +554,159 @@ def wait(self) -> None: self._shutdown.wait() +# --------------------------------------------------------------------------- +# Helpers (private — used by the classes above) +# --------------------------------------------------------------------------- + + +def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: + logger.debug(f"exec: {' '.join(cmd)}") + return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) + + +def _remove_container(cfg: DockerModuleConfig, name: str) -> None: + _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) + + +def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: + r = _run( + [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], + timeout=DOCKER_STATUS_TIMEOUT, + ) + return r.returncode == 0 and r.stdout.strip() == "true" + + +def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: + r = _run([cfg.docker_bin, "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) + out = (r.stdout or "").rstrip() + err = (r.stderr or "").rstrip() + return out + ("\n" + err if err else "") + + +def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: + """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" + out: dict[str, Any] = {} + for k, v in cfg.__dict__.items(): + if k.startswith("docker_") or isinstance(v, type) or callable(v): + continue + try: + json.dumps(v) + out[k] = v + except (TypeError, ValueError): + level = "debug" if k.startswith("_") else "warning" + getattr(logger, 
level)(f"Config field '{k}' not JSON-serializable, skipping") + return out + + +_BUILD_HASH_LABEL = "dimos.build.hash" + +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) +DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" + +# Footer appended to Dockerfiles for DimOS module conversion +DIMOS_FOOTER = f""" +# ==== {DIMOS_SENTINEL} ==== +# Copy DimOS source from build context +COPY dimos /dimos/source/dimos/ +COPY pyproject.toml /dimos/source/ +COPY docker/python/module-install.sh /tmp/module-install.sh + +# Install DimOS and create entrypoint +RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh + +ENTRYPOINT ["/dimos/entrypoint.sh"] +""" + + +def _convert_dockerfile(dockerfile: Path) -> Path: + """Append DimOS footer to Dockerfile. Returns path to converted file.""" + content = dockerfile.read_text() + + # Already converted? + if DIMOS_SENTINEL in content: + return dockerfile + + logger.info(f"Converting {dockerfile.name} to DimOS format") + + converted = dockerfile.parent / f".{dockerfile.name}.ignore" + converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) + return converted + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents and build args.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for computing build hash") + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) + return digest.hexdigest() + + +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = subprocess.run( + [ + cfg.docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + cfg.docker_image, + ], + capture_output=True, + 
text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + +def build_image(cfg: DockerModuleConfig) -> None: + """Build Docker image using footer mode conversion.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) + dockerfile = _convert_dockerfile(cfg.docker_file) + + context = cfg.docker_build_context or cfg.docker_file.parent + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + for k, v in cfg.docker_build_args.items(): + cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) + cmd.append(str(context)) + + logger.info(f"Building Docker image: {cfg.docker_image}") + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. 
+ result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) + if result.returncode != 0: + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) + + +def image_exists(cfg: DockerModuleConfig) -> bool: + """Check if the configured Docker image exists locally.""" + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 + + def _install_signal_handlers(runner: DockerModuleInner) -> None: def shutdown(_sig: int, _frame: Any) -> None: runner.stop() @@ -733,6 +727,10 @@ def _cli_run(payload_json: str) -> None: runner.wait() +# Container-side entrypoint: invoked as `python -m dimos.core.docker_module run --payload '...'` +# by the generated entrypoint.sh inside Docker containers (see docker/python/module-install.sh). +# This is what makes `DockerModuleInner` actually run — without it, containers would have no +# way to bootstrap the module from the JSON payload that `DockerModuleOuter` passes via `docker run`. 
def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(prog="dimos.core.docker_module") sub = parser.add_subparsers(dest="cmd", required=True) @@ -754,11 +752,11 @@ def main(argv: list[str] | None = None) -> None: __all__ = [ + "DIMOS_FOOTER", "DockerModule", "DockerModuleConfig", "DockerModuleInner", "DockerModuleOuter", - "DIMOS_FOOTER", "build_image", "image_exists", "is_docker_module", diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d70c10035c..7902072570 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -24,7 +24,8 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -44,6 +45,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] (it may not do all of that at time of writing but that is the intention/job of this class) - Modules shouldn't be deployed on their own (except for testing) """ + _client: WorkerManager | None = None _global_config: GlobalConfig _n: int | None = None diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 55e96d3b72..982bc656b4 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_module import DockerModuleOuter, DockerModuleConfig, is_docker_module +from dimos.core.docker_module import DockerModuleConfig, DockerModuleOuter, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator diff --git 
a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 212daa9a49..795401d80e 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -24,7 +24,7 @@ import pytest -from dimos.utils.safe_thread_map import ExceptionGroup +from dimos.utils.typing_utils import ExceptionGroup class TestDockerWorkerManagerPartialFailure: diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 3cd836b3ed..f12bffac66 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -23,7 +23,8 @@ from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup logger = setup_logger() diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/worker_manager_docker.py similarity index 94% rename from dimos/core/docker_worker_manager.py rename to dimos/core/worker_manager_docker.py index 94b4e973c8..78bc9928c4 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/worker_manager_docker.py @@ -17,7 +17,8 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleSpec -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.docker_module import DockerModuleOuter diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py deleted file mode 100644 index 514fac2026..0000000000 --- a/dimos/utils/safe_thread_map.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -from collections.abc import Callable, Sequence -from concurrent.futures import Future, ThreadPoolExecutor, as_completed -import sys -from typing import Any, TypeVar - -if sys.version_info < (3, 11): - - class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 - """Minimal ExceptionGroup polyfill for Python 3.10.""" - - exceptions: tuple[BaseException, ...] - - def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: - super().__init__(message) - self.exceptions = tuple(exceptions) -else: - import builtins - - ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] - -T = TypeVar("T") -R = TypeVar("R") - - -def safe_thread_map( - items: Sequence[T], - fn: Callable[[T], R], - on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] - | None = None, -) -> list[R]: - """Thread-pool map that waits for all items to finish before raising and a cleanup handler - - - Empty *items* → returns ``[]`` immediately. - - All succeed → returns results in input order. - - Any fail → calls ``on_errors(outcomes, successes, errors)`` where - *outcomes* is a list of ``(input, result_or_exception)`` pairs in input - order, *successes* is the list of successful results, and *errors* is - the list of exceptions. If *on_errors* raises, that exception propagates. 
- If *on_errors* returns normally, its return value is returned from - ``safe_thread_map``. If *on_errors* is ``None``, raises an - ``ExceptionGroup``. - - Example:: - - def start_service(name: str) -> Connection: - return connect(name) - - def cleanup( - outcomes: list[tuple[str, Connection | Exception]], - successes: list[Connection], - errors: list[Exception], - ) -> None: - for conn in successes: - conn.close() - raise ExceptionGroup("failed to start services", errors) - - connections = safe_thread_map( - ["db", "cache", "queue"], - start_service, - cleanup, # called only if any start_service() raises - ) - """ - if not items: - return [] - - outcomes: dict[int, R | Exception] = {} - - with ThreadPoolExecutor(max_workers=len(items)) as pool: - futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)} - for fut in as_completed(futures): - idx = futures[fut] - try: - outcomes[idx] = fut.result() - except Exception as e: - outcomes[idx] = e - - # Note: successes/errors are in completion order, not input order. - # This is fine — on_errors only needs them for cleanup, not ordering. - successes: list[R] = [] - errors: list[Exception] = [] - for v in outcomes.values(): - if isinstance(v, Exception): - errors.append(v) - else: - successes.append(v) - - if errors: - if on_errors is not None: - zipped = [(items[i], outcomes[i]) for i in range(len(items))] - return on_errors(zipped, successes, errors) # type: ignore[return-value, no-any-return] - raise ExceptionGroup("safe_thread_map failed", errors) - - return [outcomes[i] for i in range(len(items))] # type: ignore[misc] diff --git a/dimos/utils/test_thread_utils.py b/dimos/utils/test_thread_utils.py new file mode 100644 index 0000000000..07047c6d92 --- /dev/null +++ b/dimos/utils/test_thread_utils.py @@ -0,0 +1,888 @@ +# Copyright 2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Exhaustive tests for dimos/utils/thread_utils.py + +Covers: ThreadSafeVal, ModuleThread, AsyncModuleThread, ModuleProcess, safe_thread_map. +Focuses on deadlocks, race conditions, idempotency, and edge cases under load. +""" + +from __future__ import annotations + +import asyncio +import os +import pickle +import sys +import threading +import time +from unittest import mock + +import pytest +from reactivex.disposable import CompositeDisposable + +from dimos.utils.thread_utils import ( + AsyncModuleThread, + ModuleProcess, + ModuleThread, + ThreadSafeVal, + safe_thread_map, +) + +# Helpers: fake ModuleBase for testing ModuleThread / AsyncModuleThread / ModuleProcess + + +class FakeModule: + """Minimal stand-in for ModuleBase — just needs _disposables.""" + + def __init__(self) -> None: + self._disposables = CompositeDisposable() + + def dispose(self) -> None: + self._disposables.dispose() + + +# ThreadSafeVal Tests + + +class TestThreadSafeVal: + def test_basic_get_set(self) -> None: + v = ThreadSafeVal(42) + assert v.get() == 42 + v.set(99) + assert v.get() == 99 + + def test_bool_truthy(self) -> None: + v = ThreadSafeVal(True) + assert bool(v) is True + v.set(False) + assert bool(v) is False + + def test_bool_zero(self) -> None: + v = ThreadSafeVal(0) + assert bool(v) is False + v.set(1) + assert bool(v) is True + + def test_context_manager_returns_value(self) -> None: + v = ThreadSafeVal("hello") + with v as val: 
+ assert val == "hello" + + def test_set_inside_context_manager_no_deadlock(self) -> None: + """The critical test: set() inside a with block must NOT deadlock. + + This was a confirmed bug when using threading.Lock (non-reentrant). + Fixed by using threading.RLock. + """ + v = ThreadSafeVal(0) + result = threading.Event() + + def do_it() -> None: + with v as val: + v.set(val + 1) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! set() inside with block hung" + assert v.get() == 1 + + def test_get_inside_context_manager_no_deadlock(self) -> None: + v = ThreadSafeVal(10) + result = threading.Event() + + def do_it() -> None: + with v: + _ = v.get() + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! get() inside with block hung" + + def test_bool_inside_context_manager_no_deadlock(self) -> None: + v = ThreadSafeVal(True) + result = threading.Event() + + def do_it() -> None: + with v: + _ = bool(v) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! 
bool() inside with block hung" + + def test_context_manager_blocks_other_threads(self) -> None: + """While one thread holds the lock via `with`, others should block on set().""" + v = ThreadSafeVal(0) + gate = threading.Event() + other_started = threading.Event() + other_finished = threading.Event() + + def holder() -> None: + with v: + gate.wait(timeout=5) # hold the lock until signaled + + def setter() -> None: + other_started.set() + v.set(42) # should block until holder releases + other_finished.set() + + t1 = threading.Thread(target=holder) + t2 = threading.Thread(target=setter) + t1.start() + time.sleep(0.05) # let holder acquire lock + t2.start() + other_started.wait(timeout=2) + time.sleep(0.1) + # setter should be blocked + assert not other_finished.is_set(), "set() did not block while lock was held" + gate.set() # release holder + t1.join(timeout=2) + t2.join(timeout=2) + assert other_finished.is_set() + assert v.get() == 42 + + def test_concurrent_increments(self) -> None: + """Many threads doing atomic read-modify-write should not lose updates.""" + v = ThreadSafeVal(0) + n_threads = 50 + n_increments = 100 + + def incrementer() -> None: + for _ in range(n_increments): + with v as val: + v.set(val + 1) + + threads = [threading.Thread(target=incrementer) for _ in range(n_threads)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=10) + assert v.get() == n_threads * n_increments + + def test_concurrent_increments_stress(self) -> None: + """Run the concurrent increment test multiple times to catch races.""" + for _ in range(10): + self.test_concurrent_increments() + + def test_pickle_roundtrip(self) -> None: + v = ThreadSafeVal({"key": [1, 2, 3]}) + data = pickle.dumps(v) + v2 = pickle.loads(data) + assert v2.get() == {"key": [1, 2, 3]} + # Verify the new instance has a working lock + with v2 as val: + v2.set({**val, "new": True}) + assert v2.get()["new"] is True + + def test_repr(self) -> None: + v = ThreadSafeVal("test") + assert 
repr(v) == "ThreadSafeVal('test')" + + def test_dict_type(self) -> None: + v = ThreadSafeVal({"running": False, "count": 0}) + with v as s: + v.set({**s, "running": True}) + assert v.get() == {"running": True, "count": 0} + + def test_string_literal_type(self) -> None: + """Simulates the ModState pattern from module.py.""" + v = ThreadSafeVal("init") + with v as state: + if state == "init": + v.set("started") + assert v.get() == "started" + + with v as state: + if state == "stopped": + pass # no-op + else: + v.set("stopped") + assert v.get() == "stopped" + + def test_nested_with_no_deadlock(self) -> None: + """RLock should allow the same thread to nest with blocks.""" + v = ThreadSafeVal(0) + result = threading.Event() + + def do_it() -> None: + with v: + with v as val2: + v.set(val2 + 1) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Nested with blocks deadlocked!" + + +# ModuleThread Tests + + +class TestModuleThread: + def test_basic_lifecycle(self) -> None: + mod = FakeModule() + ran = threading.Event() + + def target() -> None: + ran.set() + + mt = ModuleThread(module=mod, target=target, name="test-basic") + ran.wait(timeout=2) + assert ran.is_set() + mt.stop() + assert not mt.is_alive + + def test_auto_start(self) -> None: + mod = FakeModule() + started = threading.Event() + mt = ModuleThread(module=mod, target=started.set, name="test-autostart") + started.wait(timeout=2) + assert started.is_set() + mt.stop() + + def test_deferred_start(self) -> None: + mod = FakeModule() + started = threading.Event() + mt = ModuleThread(module=mod, target=started.set, name="test-deferred", start=False) + time.sleep(0.1) + assert not started.is_set() + mt.start() + started.wait(timeout=2) + assert started.is_set() + mt.stop() + + def test_stopping_property(self) -> None: + mod = FakeModule() + saw_stopping = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + while not 
holder[0].stopping: + time.sleep(0.01) + saw_stopping.set() + + mt = ModuleThread(module=mod, target=target, name="test-stopping", start=False) + holder.append(mt) + mt.start() + time.sleep(0.05) + mt.stop() + saw_stopping.wait(timeout=2) + assert saw_stopping.is_set() + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + mt = ModuleThread(module=mod, target=lambda: time.sleep(0.01), name="test-idem") + time.sleep(0.05) + mt.stop() + mt.stop() # second call should not raise + mt.stop() # third call should not raise + + def test_stop_from_managed_thread_no_deadlock(self) -> None: + """The thread calling stop() on itself should not deadlock.""" + mod = FakeModule() + result = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + holder[0].stop() # stop ourselves — should not deadlock + result.set() + + mt = ModuleThread(module=mod, target=target, name="test-self-stop", start=False) + holder.append(mt) + mt.start() + result.wait(timeout=3) + assert result.is_set(), "Deadlocked when thread called stop() on itself" + + def test_dispose_stops_thread(self) -> None: + """Module dispose should stop the thread via the registered Disposable.""" + mod = FakeModule() + running = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + running.set() + while not holder[0].stopping: + time.sleep(0.01) + + mt = ModuleThread(module=mod, target=target, name="test-dispose", start=False) + holder.append(mt) + mt.start() + running.wait(timeout=2) + mod.dispose() + time.sleep(0.1) + assert not mt.is_alive + + def test_concurrent_stop_calls(self) -> None: + """Multiple threads calling stop() concurrently should not crash.""" + mod = FakeModule() + holder: list[ModuleThread] = [] + + def target() -> None: + while not holder[0].stopping: + time.sleep(0.01) + + mt = ModuleThread(module=mod, target=target, name="test-concurrent-stop", start=False) + holder.append(mt) + mt.start() + time.sleep(0.05) + + errors = [] + + def stop_it() 
-> None: + try: + mt.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=5) + assert not errors, f"Concurrent stop() raised: {errors}" + + def test_close_timeout_respected(self) -> None: + """If the thread ignores the stop signal, stop() should return after close_timeout.""" + mod = FakeModule() + bail = threading.Event() + + def stubborn_target() -> None: + bail.wait(timeout=10) # ignores stopping signal, but we can bail it out + + mt = ModuleThread( + module=mod, target=stubborn_target, name="test-timeout", close_timeout=0.2 + ) + start = time.monotonic() + mt.stop() + elapsed = time.monotonic() - start + assert elapsed < 1.0, f"stop() took {elapsed}s, expected ~0.2s" + bail.set() # let the thread exit so conftest thread-leak detector is happy + mt.join(timeout=2) + + def test_stop_concurrent_with_dispose(self) -> None: + """Calling stop() and dispose() concurrently should not crash.""" + for _ in range(20): + mod = FakeModule() + holder: list[ModuleThread] = [] + + def target(h: list[ModuleThread] = holder) -> None: + while not h[0].stopping: + time.sleep(0.001) + + mt = ModuleThread(module=mod, target=target, name="test-stop-dispose", start=False) + holder.append(mt) + mt.start() + time.sleep(0.02) + # Race: stop and dispose from different threads + t1 = threading.Thread(target=mt.stop) + t2 = threading.Thread(target=mod.dispose) + t1.start() + t2.start() + t1.join(timeout=3) + t2.join(timeout=3) + + +# AsyncModuleThread Tests + + +class TestAsyncModuleThread: + def test_creates_loop_and_thread(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + assert amt.loop is not None + assert amt.loop.is_running() + assert amt.is_alive + amt.stop() + assert not amt.is_alive + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + amt.stop() + amt.stop() # should not 
raise + amt.stop() + + def test_dispose_stops_loop(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + assert amt.is_alive + mod.dispose() + time.sleep(0.1) + assert not amt.is_alive + + def test_can_schedule_coroutine(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + result = [] + + async def coro() -> None: + result.append(42) + + future = asyncio.run_coroutine_threadsafe(coro(), amt.loop) + future.result(timeout=2) + assert result == [42] + amt.stop() + + def test_stop_with_pending_work(self) -> None: + """Stop should succeed even with long-running tasks on the loop.""" + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + started = threading.Event() + + async def slow_coro() -> None: + started.set() + await asyncio.sleep(10) + + asyncio.run_coroutine_threadsafe(slow_coro(), amt.loop) + started.wait(timeout=2) + # stop() should not hang waiting for the coroutine + start = time.monotonic() + amt.stop() + elapsed = time.monotonic() - start + assert elapsed < 5.0, f"stop() hung for {elapsed}s with pending coroutine" + + def test_concurrent_stop(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + errors = [] + + def stop_it() -> None: + try: + amt.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=5) + assert not errors + + +# ModuleProcess Tests + + +# Helper: path to a python that sleeps or echoes +PYTHON = sys.executable + + +class TestModuleProcess: + def test_basic_lifecycle(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + assert mp.is_alive + assert mp.pid is not None + mp.stop() + assert not mp.is_alive + assert mp.pid is None + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + 
args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=1.0, + ) + mp.stop() + mp.stop() # should not raise + mp.stop() + + def test_dispose_stops_process(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + mod.dispose() + time.sleep(0.5) + assert not mp.is_alive + + def test_on_exit_fires_on_natural_exit(self) -> None: + """on_exit should fire when the process exits on its own.""" + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('done')"], + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + assert exit_called.is_set(), "on_exit was not called after natural process exit" + + def test_on_exit_fires_on_crash(self) -> None: + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import sys; sys.exit(1)"], + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + assert exit_called.is_set(), "on_exit was not called after process crash" + + def test_on_exit_not_fired_on_stop(self) -> None: + """on_exit should NOT fire when stop() kills the process.""" + mod = FakeModule() + exit_called = threading.Event() + + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + on_exit=exit_called.set, + shutdown_timeout=2.0, + ) + time.sleep(0.2) # let watchdog start + mp.stop() + time.sleep(1.0) # give watchdog time to potentially fire + assert not exit_called.is_set(), "on_exit fired after intentional stop()" + + def test_stdout_logged(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('hello from subprocess')"], + ) + time.sleep(1.0) # let output be read + mp.stop() + + def test_stderr_logged(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import sys; sys.stderr.write('error msg\\n')"], + ) + 
time.sleep(1.0) + mp.stop() + + def test_log_json_mode(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + """import json; print(json.dumps({"event": "test", "key": "val"}))""", + ], + log_json=True, + ) + time.sleep(1.0) + mp.stop() + + def test_log_json_malformed(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('not json')"], + log_json=True, + ) + time.sleep(1.0) + mp.stop() + + def test_stop_process_that_ignores_sigterm(self) -> None: + """Process that ignores SIGTERM should be killed with SIGKILL.""" + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + "import signal, time; signal.signal(signal.SIGTERM, signal.SIG_IGN); time.sleep(60)", + ], + shutdown_timeout=0.5, + kill_timeout=2.0, + ) + time.sleep(0.2) + start = time.monotonic() + mp.stop() + elapsed = time.monotonic() - start + assert not mp.is_alive + # Should take roughly shutdown_timeout (0.5) + a bit for SIGKILL + assert elapsed < 5.0 + + def test_stop_already_dead_process(self) -> None: + """stop() on a process that already exited should not raise.""" + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], # exits immediately + ) + time.sleep(1.0) # let it die + mp.stop() # should not raise + + def test_concurrent_stop(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + errors = [] + + def stop_it() -> None: + try: + mp.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=10) + assert not errors, f"Concurrent stop() raised: {errors}" + + def test_on_exit_calls_module_stop_no_deadlock(self) -> None: + """Simulate the real pattern: on_exit=module.stop, which disposes the + ModuleProcess, which tries to 
stop its watchdog from inside the watchdog. + Must not deadlock. + """ + mod = FakeModule() + stop_called = threading.Event() + + def fake_module_stop() -> None: + """Simulates module.stop() -> _stop() -> dispose()""" + mod.dispose() + stop_called.set() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], # exits immediately + on_exit=fake_module_stop, + ) + stop_called.wait(timeout=5) + assert stop_called.is_set(), "Deadlocked! on_exit -> dispose -> stop chain hung" + + def test_on_exit_calls_module_stop_no_deadlock_stress(self) -> None: + """Run the deadlock test multiple times under load.""" + for _i in range(10): + self.test_on_exit_calls_module_stop_no_deadlock() + + def test_deferred_start(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + start=False, + ) + assert not mp.is_alive + mp.start() + assert mp.is_alive + mp.stop() + + def test_env_passed(self) -> None: + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + "import os, sys; sys.exit(0 if os.environ.get('MY_VAR') == '42' else 1)", + ], + env={**os.environ, "MY_VAR": "42"}, + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + # Process should have exited with 0 (our on_exit fires for all unmanaged exits) + assert exit_called.is_set() + + def test_cwd_passed(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import os; print(os.getcwd())"], + cwd="/tmp", + ) + time.sleep(1.0) + mp.stop() + + +# safe_thread_map Tests + + +class TestSafeThreadMap: + def test_empty_input(self) -> None: + assert safe_thread_map([], lambda x: x) == [] + + def test_all_succeed(self) -> None: + result = safe_thread_map([1, 2, 3], lambda x: x * 2) + assert result == [2, 4, 6] + + def test_preserves_order(self) -> None: + def slow(x: int) -> int: + time.sleep(0.01 * (10 - x)) + return x + + result = 
safe_thread_map(list(range(10)), slow) + assert result == list(range(10)) + + def test_all_fail_raises_exception_group(self) -> None: + def fail(x: int) -> int: + raise ValueError(f"fail-{x}") + + with pytest.raises(ExceptionGroup) as exc_info: + safe_thread_map([1, 2, 3], fail) + assert len(exc_info.value.exceptions) == 3 + + def test_partial_failure(self) -> None: + def maybe_fail(x: int) -> int: + if x == 2: + raise ValueError("fail") + return x + + with pytest.raises(ExceptionGroup) as exc_info: + safe_thread_map([1, 2, 3], maybe_fail) + assert len(exc_info.value.exceptions) == 1 + + def test_on_errors_callback(self) -> None: + def fail(x: int) -> int: + if x == 2: + raise ValueError("boom") + return x * 10 + + cleanup_called = False + + def on_errors(outcomes, successes, errors): + nonlocal cleanup_called + cleanup_called = True + assert len(errors) == 1 + assert len(successes) == 2 + return successes # return successful results + + result = safe_thread_map([1, 2, 3], fail, on_errors) + assert cleanup_called + assert sorted(result) == [10, 30] + + def test_on_errors_can_raise(self) -> None: + def fail(x: int) -> int: + raise ValueError("boom") + + def on_errors(outcomes, successes, errors): + raise RuntimeError("custom error") + + with pytest.raises(RuntimeError, match="custom error"): + safe_thread_map([1], fail, on_errors) + + def test_waits_for_all_before_raising(self) -> None: + """Even if one fails fast, all others should complete.""" + completed = [] + + def work(x: int) -> int: + if x == 0: + raise ValueError("fast fail") + time.sleep(0.2) + completed.append(x) + return x + + with pytest.raises(ExceptionGroup): + safe_thread_map([0, 1, 2, 3], work) + # All non-failing items should have completed + assert sorted(completed) == [1, 2, 3] + + +# Integration: ModuleProcess on_exit -> dispose chain (the CI bug scenario) + + +class TestModuleProcessDisposeChain: + """Tests the exact pattern that caused the CI bug: + process exits -> watchdog fires on_exit -> 
module.stop() -> dispose -> + ModuleProcess.stop() -> tries to stop watchdog from inside watchdog thread. + """ + + @staticmethod + def _make_fake_stop(mod: FakeModule, done: threading.Event) -> Callable: + def fake_stop() -> None: + mod.dispose() + done.set() + + return fake_stop + + def test_chain_no_deadlock_fast_exit(self) -> None: + """Process exits immediately.""" + for _ in range(20): + mod = FakeModule() + done = threading.Event() + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], + on_exit=self._make_fake_stop(mod, done), + ) + assert done.wait(timeout=5), "Deadlock in dispose chain (fast exit)" + + def test_chain_no_deadlock_slow_exit(self) -> None: + """Process runs briefly then exits.""" + for _ in range(10): + mod = FakeModule() + done = threading.Event() + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(0.1)"], + on_exit=self._make_fake_stop(mod, done), + ) + assert done.wait(timeout=5), "Deadlock in dispose chain (slow exit)" + + def test_chain_concurrent_with_external_stop(self) -> None: + """Process exits naturally while external code calls stop().""" + for _ in range(20): + mod = FakeModule() + done = threading.Event() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(0.05)"], + on_exit=self._make_fake_stop(mod, done), + shutdown_timeout=1.0, + ) + # Race: the process might exit naturally or we might stop it + time.sleep(0.03) + mp.stop() + # Either way, should not deadlock + time.sleep(1.0) + + def test_dispose_with_artificial_delay(self) -> None: + """Add artificial delay near cleanup to simulate heavy CPU load.""" + original_stop = ModuleThread.stop + + def slow_stop(self_mt: ModuleThread) -> None: + time.sleep(0.05) # simulate load + original_stop(self_mt) + + for _ in range(10): + mod = FakeModule() + done = threading.Event() + with mock.patch.object(ModuleThread, "stop", slow_stop): + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], + 
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Thread utilities: safe values, managed threads, safe parallel map."""

from __future__ import annotations

import asyncio
import collections
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
import json
import signal
import subprocess
import threading
from typing import IO, TYPE_CHECKING, Any, Generic

from reactivex.disposable import Disposable

from dimos.utils.logging_config import setup_logger
from dimos.utils.typing_utils import ExceptionGroup, TypeVar

logger = setup_logger()

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence

    from dimos.core.module import ModuleBase

T = TypeVar("T")
R = TypeVar("R")


# ThreadSafeVal: a lock-protected value with context-manager support


class ThreadSafeVal(Generic[T]):
    """A thread-safe value wrapper.

    Wraps any value with a lock and provides atomic read-modify-write
    via a context manager::

        counter = ThreadSafeVal(0)

        # Simple get/set (each acquires the lock briefly):
        counter.set(10)
        print(counter.get())  # 10

        # Atomic read-modify-write:
        with counter as value:
            # Lock is held for the entire block.
            # Other threads block on get/set/with until this exits.
            if value < 100:
                counter.set(value + 1)

        # Works with any type:
        status = ThreadSafeVal({"running": False, "count": 0})
        with status as s:
            status.set({**s, "running": True})

        # Bool check (for flag-like usage):
        stopping = ThreadSafeVal(False)
        stopping.set(True)
        if stopping:
            print("stopping!")
    """

    def __init__(self, initial: T) -> None:
        # RLock (not Lock): set()/get() must be callable from inside a
        # `with self` block on the same thread without deadlocking.
        self._lock = threading.RLock()
        self._value = initial

    def get(self) -> T:
        """Return the current value (acquires the lock briefly)."""
        with self._lock:
            return self._value

    def set(self, value: T) -> None:
        """Replace the value (acquires the lock briefly)."""
        with self._lock:
            self._value = value

    def __bool__(self) -> bool:
        with self._lock:
            return bool(self._value)

    def __enter__(self) -> T:
        # Hold the lock for the whole `with` block; released in __exit__.
        self._lock.acquire()
        return self._value

    def __exit__(self, *exc: object) -> None:
        self._lock.release()

    def __getstate__(self) -> dict[str, Any]:
        # Locks are not picklable; persist only the value.
        return {"_value": self._value}

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Recreate a fresh lock on unpickle.
        self._lock = threading.RLock()
        self._value = state["_value"]

    def __repr__(self) -> str:
        # Fix: take the lock like every other accessor — an unlocked read
        # could observe a torn/mid-update value from another thread.
        with self._lock:
            return f"ThreadSafeVal({self._value!r})"


# ModuleThread: a thread that auto-registers with a module's disposables


class ModuleThread:
    """A thread that registers cleanup with a module's disposables.

    Passes most kwargs through to ``threading.Thread``. On construction,
    registers a disposable with the module so that when the module stops,
    the thread is automatically joined. Cleanup is idempotent — safe to
    call ``stop()`` manually even if the module also disposes it.

    Example::

        class MyModule(Module):
            @rpc
            def start(self) -> None:
                self._worker = ModuleThread(
                    module=self,
                    target=self._run_loop,
                    name="my-worker",
                )

            def _run_loop(self) -> None:
                while not self._worker.stopping:
                    do_work()
    """

    def __init__(
        self,
        module: ModuleBase[Any],
        *,
        start: bool = True,
        close_timeout: float = 2.0,
        **thread_kwargs: Any,
    ) -> None:
        # Daemon by default so a leaked thread never blocks interpreter exit.
        thread_kwargs.setdefault("daemon", True)
        self._thread = threading.Thread(**thread_kwargs)
        self._stop_event = threading.Event()
        self._close_timeout = close_timeout
        self._stopped = False
        self._stop_lock = threading.Lock()
        # Module dispose -> Disposable fires -> self.stop() (idempotent).
        module._disposables.add(Disposable(self.stop))
        if start:
            self.start()

    @property
    def stopping(self) -> bool:
        """True after ``stop()`` has been called."""
        return self._stop_event.is_set()

    def start(self) -> None:
        """Start the underlying thread."""
        self._stop_event.clear()
        self._thread.start()

    def stop(self) -> None:
        """Signal the thread to stop and join it.

        Safe to call multiple times, from any thread (including the
        managed thread itself — it will skip the join in that case).
        """
        # Only the lock guards the idempotency flag; the join below runs
        # outside the lock so concurrent stop() callers don't serialize on it.
        with self._stop_lock:
            if self._stopped:
                return
            self._stopped = True

        self._stop_event.set()
        # Joining the current thread would deadlock — skip the join when the
        # managed thread stops itself.
        if self._thread.is_alive() and self._thread is not threading.current_thread():
            self._thread.join(timeout=self._close_timeout)

    def join(self, timeout: float | None = None) -> None:
        """Join the underlying thread."""
        self._thread.join(timeout=timeout)

    @property
    def is_alive(self) -> bool:
        return self._thread.is_alive()
Otherwise creates a new loop and drives it in a daemon thread. + + On stop (or module dispose), the loop is shut down gracefully and the + thread is joined. Idempotent — safe to call ``stop()`` multiple times. + + Example:: + + class MyModule(Module): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._async = AsyncModuleThread(module=self) + + @rpc + def start(self) -> None: + future = asyncio.run_coroutine_threadsafe( + self._do_work(), self._async.loop + ) + + async def _do_work(self) -> None: + ... + """ + + def __init__( + self, + module: ModuleBase[Any], + *, + close_timeout: float = 2.0, + ) -> None: + self._close_timeout = close_timeout + self._stopped = False + self._stop_lock = threading.Lock() + self._owns_loop = False + self._thread: threading.Thread | None = None + + try: + self._loop = asyncio.get_running_loop() + except RuntimeError: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + self._owns_loop = True + self._thread = threading.Thread( + target=self._loop.run_forever, + daemon=True, + name=f"{type(module).__name__}-event-loop", + ) + self._thread.start() + + module._disposables.add(Disposable(self.stop)) + + @property + def loop(self) -> asyncio.AbstractEventLoop: + """The managed event loop.""" + return self._loop + + @property + def is_alive(self) -> bool: + return self._thread is not None and self._thread.is_alive() + + def stop(self) -> None: + """Stop the event loop and join the thread. + + No-op if the loop was not created by this instance (reused an + existing running loop). Safe to call multiple times. 
+ """ + with self._stop_lock: + if self._stopped: + return + self._stopped = True + + if self._owns_loop and self._loop.is_running(): + self._loop.call_soon_threadsafe(self._loop.stop) + + if self._thread is not None and self._thread.is_alive(): + self._thread.join(timeout=self._close_timeout) + + +# ModuleProcess: managed subprocess with log piping, auto-registered cleanup + + +class ModuleProcess: + """A managed subprocess that pipes stdout/stderr through the logger. + + Registers with a module's disposables so the process is automatically + stopped on module teardown. A watchdog thread monitors the process and + calls ``on_exit`` if the process exits on its own (i.e. not via + ``ModuleProcess.stop()``). + + Most constructor kwargs mirror ``subprocess.Popen``. ``stdout`` and + ``stderr`` are always captured (set to ``PIPE`` internally). + + Example:: + + class MyModule(Module): + @rpc + def start(self) -> None: + self._proc = ModuleProcess( + module=self, + args=["./my_binary", "--flag"], + cwd="/opt/bin", + on_exit=self.stop, # stops the whole module if process exits on its own + ) + + @rpc + def stop(self) -> None: + super().stop() + """ + + def __init__( + self, + module: ModuleBase[Any], + args: list[str] | str, + *, + env: dict[str, str] | None = None, + cwd: str | None = None, + shell: bool = False, + on_exit: Callable[[], Any] | None = None, + shutdown_timeout: float = 10.0, + kill_timeout: float = 5.0, + log_json: bool = False, + log_tail_lines: int = 50, + start: bool = True, + **popen_kwargs: Any, + ) -> None: + self._args = args + self._env = env + self._cwd = cwd + self._shell = shell + self._on_exit = on_exit + self._shutdown_timeout = shutdown_timeout + self._kill_timeout = kill_timeout + self._log_json = log_json + self._log_tail_lines = log_tail_lines + self._popen_kwargs = popen_kwargs + self._process: subprocess.Popen[bytes] | None = None + self._watchdog: ModuleThread | None = None + self._module = module + self._stopped = False + 
self._stop_lock = threading.Lock() + self.last_stdout: collections.deque[str] = collections.deque(maxlen=log_tail_lines) + self.last_stderr: collections.deque[str] = collections.deque(maxlen=log_tail_lines) + + module._disposables.add(Disposable(self.stop)) + if start: + self.start() + + @property + def pid(self) -> int | None: + return self._process.pid if self._process is not None else None + + @property + def returncode(self) -> int | None: + if self._process is None: + return None + return self._process.poll() + + @property + def is_alive(self) -> bool: + return self._process is not None and self._process.poll() is None + + def start(self) -> None: + """Launch the subprocess and start the watchdog.""" + if self._process is not None and self._process.poll() is None: + logger.warning("Process already running", pid=self._process.pid) + return + + with self._stop_lock: + self._stopped = False + + self.last_stdout = collections.deque(maxlen=self._log_tail_lines) + self.last_stderr = collections.deque(maxlen=self._log_tail_lines) + + logger.info( + "Starting process", + cmd=self._args if isinstance(self._args, str) else " ".join(self._args), + cwd=self._cwd, + ) + self._process = subprocess.Popen( + self._args, + env=self._env, + cwd=self._cwd, + shell=self._shell, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **self._popen_kwargs, + ) + logger.info("Process started", pid=self._process.pid) + + self._watchdog = ModuleThread( + module=self._module, + target=self._watch, + name=f"proc-{self._process.pid}-watchdog", + ) + + def stop(self) -> None: + """Send SIGTERM, wait, escalate to SIGKILL if needed. 
Idempotent.""" + with self._stop_lock: + if self._stopped: + return + self._stopped = True + + if self._process is not None and self._process.poll() is None: + logger.info("Stopping process", pid=self._process.pid) + try: + self._process.send_signal(signal.SIGTERM) + except OSError: + pass # process already dead (PID recycled or exited between poll and signal) + else: + try: + self._process.wait(timeout=self._shutdown_timeout) + except subprocess.TimeoutExpired: + logger.warning( + "Process did not exit, sending SIGKILL", + pid=self._process.pid, + ) + self._process.kill() + try: + self._process.wait(timeout=self._kill_timeout) + except subprocess.TimeoutExpired: + logger.error( + "Process did not exit after SIGKILL", + pid=self._process.pid, + ) + self._process = None + + def _watch(self) -> None: + """Watchdog: pipe logs, detect crashes.""" + proc = self._process + if proc is None: + return + + stdout_t = self._start_reader(proc.stdout, "info") + stderr_t = self._start_reader(proc.stderr, "warning") + rc = proc.wait() + stdout_t.join(timeout=2) + stderr_t.join(timeout=2) + + with self._stop_lock: + if self._stopped: + return + + last_stdout = "\n".join(self.last_stdout) or None + last_stderr = "\n".join(self.last_stderr) or None + logger.error( + "Process died unexpectedly", + pid=proc.pid, + returncode=rc, + last_stdout=last_stdout, + last_stderr=last_stderr, + ) + if self._on_exit is not None: + self._on_exit() + + def _start_reader(self, stream: IO[bytes] | None, level: str) -> threading.Thread: + t = threading.Thread(target=self._read_stream, args=(stream, level), daemon=True) + t.start() + return t + + def _read_stream(self, stream: IO[bytes] | None, level: str) -> None: + if stream is None: + return + log_fn = getattr(logger, level) + is_stderr = level == "warning" + buf = self.last_stderr if is_stderr else self.last_stdout + for raw in stream: + line = raw.decode("utf-8", errors="replace").rstrip() + if not line: + continue + buf.append(line) + if 
self._log_json: + try: + data = json.loads(line) + event = data.pop("event", line) + log_fn(event, **data) + continue + except (json.JSONDecodeError, TypeError): + logger.warning("malformed JSON from process", raw=line) + proc = self._process + log_fn(line, pid=proc.pid if proc else None) + stream.close() + + +# safe_thread_map: parallel map that collects all results before raising + + +def safe_thread_map( + items: Sequence[T], + fn: Callable[[T], R], + on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] + | None = None, +) -> list[R]: + """Thread-pool map that waits for all items to finish before raising and a cleanup handler + + - Empty *items* → returns ``[]`` immediately. + - All succeed → returns results in input order. + - Any fail → calls ``on_errors(outcomes, successes, errors)`` where + *outcomes* is a list of ``(input, result_or_exception)`` pairs in input + order, *successes* is the list of successful results, and *errors* is + the list of exceptions. If *on_errors* raises, that exception propagates. + If *on_errors* returns normally, its return value is returned from + ``safe_thread_map``. If *on_errors* is ``None``, raises an + ``ExceptionGroup``. 
def safe_thread_map(
    items: Sequence[T],
    fn: Callable[[T], R],
    on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any]
    | None = None,
) -> list[R]:
    """Thread-pool map that waits for all items to finish before raising,
    with an optional cleanup handler.

    - Empty *items* → returns ``[]`` immediately.
    - All succeed → returns results in input order.
    - Any fail → calls ``on_errors(outcomes, successes, errors)`` where
      *outcomes* is a list of ``(input, result_or_exception)`` pairs in input
      order, *successes* is the list of successful results, and *errors* is
      the list of exceptions (both also in input order). If *on_errors*
      raises, that exception propagates. If *on_errors* returns normally,
      its return value is returned from ``safe_thread_map``. If *on_errors*
      is ``None``, raises an ``ExceptionGroup``.

    Example::

        def start_service(name: str) -> Connection:
            return connect(name)

        def cleanup(
            outcomes: list[tuple[str, Connection | Exception]],
            successes: list[Connection],
            errors: list[Exception],
        ) -> None:
            for conn in successes:
                conn.close()
            raise ExceptionGroup("failed to start services", errors)

        connections = safe_thread_map(
            ["db", "cache", "queue"],
            start_service,
            cleanup,  # called only if any start_service() raises
        )
    """
    if not items:
        return []

    outcomes: dict[int, R | Exception] = {}

    # One worker per item: every fn() runs concurrently and we always wait
    # for all of them, even when some fail early.
    with ThreadPoolExecutor(max_workers=len(items)) as pool:
        futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)}
        for fut in as_completed(futures):
            idx = futures[fut]
            try:
                outcomes[idx] = fut.result()
            except Exception as e:
                outcomes[idx] = e

    # Classify in input order. (Iterating outcomes.values() here would yield
    # completion order, making successes/errors nondeterministic across runs.)
    successes: list[R] = []
    errors: list[Exception] = []
    for i in range(len(items)):
        v = outcomes[i]
        if isinstance(v, Exception):
            errors.append(v)
        else:
            successes.append(v)

    if errors:
        if on_errors is not None:
            zipped = [(items[i], outcomes[i]) for i in range(len(items))]
            return on_errors(zipped, successes, errors)  # type: ignore[return-value, no-any-return]
        raise ExceptionGroup("safe_thread_map failed", errors)

    return [outcomes[i] for i in range(len(items))]  # type: ignore[misc]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unify typing compatibility across multiple Python versions.""" + +from __future__ import annotations + +from collections.abc import Sequence +import sys + +if sys.version_info < (3, 13): + from typing_extensions import TypeVar +else: + from typing import TypeVar + +if sys.version_info < (3, 11): + + class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 + """Minimal ExceptionGroup polyfill for Python 3.10.""" + + exceptions: tuple[BaseException, ...] + + def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: + super().__init__(message) + self.exceptions = tuple(exceptions) +else: + import builtins + + ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] + +__all__ = [ + "ExceptionGroup", + "TypeVar", +] From 0bff0cf0303dba93d1c7a8c1456406d3e38c133e Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 18:59:28 -0700 Subject: [PATCH 85/89] proper design of WorkerManagers --- dimos/core/blueprints.py | 2 +- dimos/core/docker_module.py | 5 - dimos/core/global_config.py | 1 + dimos/core/module_coordinator.py | 157 ++++++------------ dimos/core/rpc_client.py | 2 +- dimos/core/test_daemon.py | 49 +++--- dimos/core/test_e2e_daemon.py | 24 ++- dimos/core/test_worker.py | 9 +- dimos/core/tests/test_docker_deployment.py | 112 +++++++------ .../tests/test_parallel_deploy_cleanup.py | 35 ++-- dimos/core/worker_manager_docker.py | 64 +++++-- ...er_manager.py => worker_manager_python.py} | 78 +++++++-- dimos/core/{worker.py => worker_python.py} | 0 .../sensors/camera/realsense/camera.py | 2 +- 
dimos/hardware/sensors/camera/zed/camera.py | 2 +- dimos/robot/cli/dimos.py | 3 +- dimos/robot/unitree/b1/unitree_b1.py | 2 +- dimos/utils/demo_image_encoding.py | 2 +- 18 files changed, 287 insertions(+), 262 deletions(-) rename dimos/core/{worker_manager.py => worker_manager_python.py} (62%) rename dimos/core/{worker.py => worker_python.py} (100%) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index 8f9d59182d..314724386d 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -485,7 +485,7 @@ def build( self._verify_no_name_conflicts() logger.info("Starting the modules") - module_coordinator = ModuleCoordinator(cfg=global_config) + module_coordinator = ModuleCoordinator(g=global_config) module_coordinator.start() # all module constructors are called here (each of them setup their own) diff --git a/dimos/core/docker_module.py b/dimos/core/docker_module.py index dc0ffd533f..8cf01c41af 100644 --- a/dimos/core/docker_module.py +++ b/dimos/core/docker_module.py @@ -554,11 +554,6 @@ def wait(self) -> None: self._shutdown.wait() -# --------------------------------------------------------------------------- -# Helpers (private — used by the classes above) -# --------------------------------------------------------------------------- - - def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: logger.debug(f"exec: {' '.join(cmd)}") return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) diff --git a/dimos/core/global_config.py b/dimos/core/global_config.py index 90461932a2..5a5f7ba7bc 100644 --- a/dimos/core/global_config.py +++ b/dimos/core/global_config.py @@ -38,6 +38,7 @@ class GlobalConfig(BaseSettings): new_memory: bool = False viewer: ViewerBackend = "rerun" n_workers: int = 2 + worker_to_module_ratio: float = 1.0 memory_limit: str = "auto" mujoco_camera_position: str | None = None mujoco_room: str | None = None diff --git a/dimos/core/module_coordinator.py 
b/dimos/core/module_coordinator.py index 7902072570..d1020e61bd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,19 +18,17 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.resource import Resource -from dimos.core.worker_manager import WorkerManager +from dimos.core.worker_manager_docker import WorkerManagerDocker +from dimos.core.worker_manager_python import WorkerManagerPython from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: - from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol - from dimos.core.worker import Worker logger = setup_logger() @@ -46,88 +44,59 @@ class ModuleCoordinator(Resource): # type: ignore[misc] - Modules shouldn't be deployed on their own (except for testing) """ - _client: WorkerManager | None = None + _managers: list[WorkerManagerDocker | WorkerManagerPython] _global_config: GlobalConfig - _n: int | None = None - _memory_limit: str = "auto" _deployed_modules: dict[type[ModuleBase], ModuleProxyProtocol] - _stats_monitor: StatsMonitor | None = None def __init__( self, - n: int | None = None, - cfg: GlobalConfig = global_config, + g: GlobalConfig = global_config, ) -> None: - self._n = n if n is not None else cfg.n_workers - self._memory_limit = cfg.memory_limit - self._global_config = cfg + self._global_config = g + self._managers = [] self._deployed_modules = {} - @property - def workers(self) -> list[Worker]: - """Active worker processes.""" - if self._client is None: - return [] - return self._client.workers - - @property - def n_workers(self) -> int: - """Number of active workers.""" - return 
len(self.workers) + def start(self) -> None: + self._managers = [ + WorkerManagerDocker(g=self._global_config), + WorkerManagerPython(g=self._global_config), + ] + for m in self._managers: + m.start() + + def _find_manager( + self, module_class: type[ModuleBase[Any]] + ) -> WorkerManagerDocker | WorkerManagerPython: + for m in self._managers: + if m.should_manage(module_class): + return m + raise ValueError(f"No manager found for {module_class.__name__}") def health_check(self) -> bool: - """Verify all workers are alive after build. - - Since ``blueprint.build()`` is synchronous, every module should be - started by the time this runs. We just confirm no worker has died. - """ - if self.n_workers == 0: - logger.error("health_check: no workers found") - return False - - for w in self.workers: - if w.pid is None: - logger.error("health_check: worker died", worker_id=w.worker_id) - return False - - return True + return all(m.health_check() for m in self._managers) @property def n_modules(self) -> int: - """Number of deployed modules.""" return len(self._deployed_modules) def suppress_console(self) -> None: - """Silence console output in all worker processes.""" - if self._client is not None: - self._client.suppress_console() - - def start(self) -> None: - n = self._n if self._n is not None else 2 - self._client = WorkerManager(n_workers=n) - self._client.start() - - if self._global_config.dtop: - from dimos.core.resource_monitor.monitor import StatsMonitor - - self._stats_monitor = StatsMonitor(self._client) - self._stats_monitor.start() + for m in self._managers: + m.suppress_console() def stop(self) -> None: - if self._stats_monitor is not None: - self._stats_monitor.stop() - self._stats_monitor = None - for module_class, module in reversed(self._deployed_modules.items()): logger.info("Stopping module...", module=module_class.__name__) - try: + with suppress(Exception): module.stop() - except Exception: - logger.error("Error stopping module", 
module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - if self._client is not None: - self._client.close_all() + def _stop_manager(m: WorkerManagerDocker | WorkerManagerPython) -> None: + try: + m.stop() + except Exception: + logger.error("Error stopping manager", manager=type(m).__name__, exc_info=True) + + safe_thread_map(self._managers, _stop_manager) def deploy( self, @@ -135,58 +104,34 @@ def deploy( global_config: GlobalConfig = global_config, **kwargs: Any, ) -> ModuleProxy: - # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator - from dimos.core.docker_module import DockerModuleOuter, is_docker_module - - if not self._client: + if not self._managers: raise ValueError("Trying to dimos.deploy before the client has started") - deployed_module: ModuleProxyProtocol - if is_docker_module(module_class): - deployed_module = DockerModuleOuter(module_class, g=global_config, **kwargs) # type: ignore[arg-type] - else: - deployed_module = self._client.deploy(module_class, global_config, kwargs) + manager = self._find_manager(module_class) + deployed_module = manager.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] return deployed_module # type: ignore[return-value] def deploy_parallel(self, module_specs: list[ModuleSpec]) -> list[ModuleProxy]: - # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator - from dimos.core.docker_module import is_docker_module - - if not self._client: + if not self._managers: raise ValueError("Not started") - # Split by type, tracking original indices for reassembly - docker_indices: list[int] = [] - worker_indices: list[int] = [] - docker_specs: list[ModuleSpec] = [] - worker_specs: list[ModuleSpec] = [] - for i, spec in enumerate(module_specs): - if is_docker_module(spec[0]): - 
docker_indices.append(i) - docker_specs.append(spec) - else: - worker_indices.append(i) - worker_specs.append(spec) - - # Deploy worker and docker modules in parallel. - results: list[Any] = [None] * len(module_specs) + # Group specs by manager, tracking original indices for reassembly + groups: dict[int, WorkerManagerDocker | WorkerManagerPython] = {} + indices_by_manager: dict[int, list[int]] = {} + specs_by_manager: dict[int, list[ModuleSpec]] = {} + for index, spec in enumerate(module_specs): + manager = self._find_manager(spec[0]) + mid = id(manager) + groups.setdefault(mid, manager) + indices_by_manager.setdefault(mid, []).append(index) + specs_by_manager.setdefault(mid, []).append(spec) - def _deploy_workers() -> None: - if not worker_specs: - return - assert self._client is not None - for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=True - ): - results[index] = module + results: list[Any] = [None] * len(module_specs) - def _deploy_docker() -> None: - if not docker_specs: - return - for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True - ): + def _deploy_group(mid: int) -> None: + deployed = groups[mid].deploy_parallel(specs_by_manager[mid]) + for index, module in zip(indices_by_manager[mid], deployed, strict=True): results[index] = module def _register() -> None: @@ -200,7 +145,7 @@ def _on_errors( _register() raise ExceptionGroup("deploy_parallel failed", errors) - safe_thread_map([_deploy_workers, _deploy_docker], lambda fn: fn(), _on_errors) + safe_thread_map(list(groups.keys()), _deploy_group, _on_errors) _register() return results diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 46182b7556..f051cbfdb1 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Protocol from dimos.core.stream import RemoteStream -from dimos.core.worker import MethodCallProxy +from 
dimos.core.worker_python import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py index f6dae51433..821e2378de 100644 --- a/dimos/core/test_daemon.py +++ b/dimos/core/test_daemon.py @@ -158,50 +158,41 @@ def test_port_conflict_no_false_positive(self, tmp_registry: Path): from dimos.core.module_coordinator import ModuleCoordinator -def _mock_worker(pid: int | None = 1234, worker_id: int = 0): - """Create a mock Worker with a controllable pid.""" - w = mock.MagicMock() - w.worker_id = worker_id - w.pid = pid - return w - - -def _mock_coordinator(workers: list | None = None) -> ModuleCoordinator: - """Create a ModuleCoordinator with mocked internals and controllable workers.""" +def _mock_coordinator(manager_health: list[bool] | None = None) -> ModuleCoordinator: + """Create a ModuleCoordinator with mocked managers and controllable health.""" coord = mock.MagicMock(spec=ModuleCoordinator) # Bind the real health_check method so it runs actual logic coord.health_check = ModuleCoordinator.health_check.__get__(coord) - if workers is not None: - coord.workers = workers - coord.n_workers = len(workers) + if manager_health is not None: + managers = [] + for healthy in manager_health: + m = mock.MagicMock() + m.health_check.return_value = healthy + managers.append(m) + coord._managers = managers else: - coord.workers = [] - coord.n_workers = 0 + coord._managers = [] return coord class TestHealthCheck: - """health_check verifies all workers are alive after synchronous build.""" + """health_check delegates to managers and returns all() of their results.""" def test_all_healthy(self): - workers = [_mock_worker(pid=os.getpid(), worker_id=i) for i in range(3)] - coord = _mock_coordinator(workers) + coord = _mock_coordinator([True, True]) assert coord.health_check() 
is True - def test_dead_worker(self): - dead = _mock_worker(pid=None, worker_id=0) - coord = _mock_coordinator([dead]) + def test_one_unhealthy(self): + coord = _mock_coordinator([True, False]) assert coord.health_check() is False - def test_no_workers(self): - coord = _mock_coordinator(workers=[]) - assert coord.health_check() is False + def test_no_managers(self): + coord = _mock_coordinator([]) + # all([]) is True — no managers means nothing to fail + assert coord.health_check() is True - def test_partial_death(self): - w1 = _mock_worker(pid=os.getpid(), worker_id=0) - w2 = _mock_worker(pid=os.getpid(), worker_id=1) - w3 = _mock_worker(pid=None, worker_id=2) - coord = _mock_coordinator([w1, w2, w3]) + def test_all_unhealthy(self): + coord = _mock_coordinator([False, False]) assert coord.health_check() is False diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py index d8ac016faa..b52bf14ea6 100644 --- a/dimos/core/test_e2e_daemon.py +++ b/dimos/core/test_e2e_daemon.py @@ -111,7 +111,6 @@ class TestDaemonE2E: def test_single_worker_lifecycle(self, coordinator, registry_entry): """Build -> health check -> registry -> status (1 worker).""" - assert len(coordinator.workers) == 1 assert coordinator.n_modules == 2 assert coordinator.health_check(), "Health check should pass" @@ -126,15 +125,14 @@ def test_single_worker_lifecycle(self, coordinator, registry_entry): def test_multiple_workers(self, coordinator_2w): """Build with 2 workers — both should be alive.""" - assert len(coordinator_2w.workers) == 2 - for w in coordinator_2w.workers: - assert w.pid is not None, f"Worker {w.worker_id} has no PID" - assert coordinator_2w.health_check(), "Health check should pass" def test_health_check_detects_dead_worker(self, coordinator): """Kill a worker process — health check should fail.""" - worker = coordinator.workers[0] + from dimos.core.worker_manager_python import WorkerManagerPython + + py_mgr = next(m for m in coordinator._managers if isinstance(m, 
WorkerManagerPython)) + worker = py_mgr.workers[0] worker_pid = worker.pid assert worker_pid is not None @@ -237,21 +235,19 @@ def test_status_shows_live_blueprint(self, live_blueprint): assert "ping-pong" in result.output assert str(os.getpid()) in result.output - def test_status_shows_worker_count_via_registry(self, live_blueprint): - coord, entry = live_blueprint - - assert len(coord.workers) >= 1 - for w in coord.workers: - assert w.pid is not None + def test_status_shows_live_entry_via_registry(self, live_blueprint): + _coord, entry = live_blueprint runs = list_runs(alive_only=True) matching = [r for r in runs if r.run_id == entry.run_id] assert len(matching) == 1 def test_stop_kills_real_workers(self, live_blueprint): - coord, _entry = live_blueprint + from dimos.core.worker_manager_python import WorkerManagerPython - worker_pids = [w.pid for w in coord.workers if w.pid] + coord, _entry = live_blueprint + py_mgr = next(m for m in coord._managers if isinstance(m, WorkerManagerPython)) + worker_pids = [w.pid for w in py_mgr.workers if w.pid] assert len(worker_pids) >= 1 coord.stop() diff --git a/dimos/core/test_worker.py b/dimos/core/test_worker.py index 021b2e21c4..ced51dfa76 100644 --- a/dimos/core/test_worker.py +++ b/dimos/core/test_worker.py @@ -17,10 +17,10 @@ import pytest from dimos.core.core import rpc -from dimos.core.global_config import global_config +from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import Module from dimos.core.stream import In, Out -from dimos.core.worker_manager import WorkerManager +from dimos.core.worker_manager_python import WorkerManagerPython from dimos.msgs.geometry_msgs.Vector3 import Vector3 if TYPE_CHECKING: @@ -87,14 +87,15 @@ def create_worker_manager(): def _create(n_workers): nonlocal manager - manager = WorkerManager(n_workers=n_workers) + g = GlobalConfig(n_workers=n_workers) + manager = WorkerManagerPython(g=g) manager.start() return manager yield _create if manager is not 
None: - manager.close_all() + manager.stop() @pytest.mark.slow diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 982bc656b4..5bb18d4a24 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -76,41 +76,38 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_module.DockerModuleOuter") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_routes_docker_module(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py + + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_module_cls.return_value = mock_dm + mock_docker.deploy.return_value = mock_dm coordinator = ModuleCoordinator() coordinator.start() try: result = coordinator.deploy(FakeDockerModule) - # Should NOT go through worker manager - mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModuleOuter (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() + # Docker manager should handle it + mock_docker.deploy.assert_called_once_with(FakeDockerModule, global_config, {}) + # Python manager should NOT be used + mock_py.deploy.assert_not_called() assert result is mock_dm assert coordinator.get_instance(FakeDockerModule) is mock_dm finally: coordinator.stop() - @patch("dimos.core.docker_module.DockerModuleOuter") - 
@patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_docker_propagates_constructor_failure( - self, mock_worker_manager_cls, mock_docker_module_cls - ): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - - # Container launch fails inside __init__; DockerModuleOuter handles its own cleanup - mock_docker_module_cls.side_effect = RuntimeError("launch failed") + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_docker_propagates_failure(self, mock_py_cls, mock_docker_cls): + mock_py_cls.return_value = MagicMock() + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker + mock_docker.deploy.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() coordinator.start() @@ -120,36 +117,43 @@ def test_deploy_docker_propagates_constructor_failure( finally: coordinator.stop() - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_routes_regular_module_to_python_manager(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py mock_proxy = MagicMock() - mock_worker_mgr.deploy.return_value = mock_proxy + mock_py.deploy.return_value = mock_proxy + + # Docker manager rejects regular modules + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker + mock_docker.should_manage.return_value = False coordinator = ModuleCoordinator() coordinator.start() try: result = coordinator.deploy(FakeRegularModule) - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) + 
mock_py.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) assert result is mock_proxy finally: coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_parallel_separates_docker_and_regular( - self, mock_worker_manager_cls, mock_docker_deploy - ): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_parallel_separates_docker_and_regular(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py regular_proxy = MagicMock() - mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] + mock_py.deploy_parallel.return_value = [regular_proxy] + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_deploy.return_value = [mock_dm] + mock_docker.deploy_parallel.return_value = [mock_dm] + # Docker manager only claims FakeDockerModule + mock_docker.should_manage.side_effect = lambda cls: cls is FakeDockerModule coordinator = ModuleCoordinator() coordinator.start() @@ -160,27 +164,24 @@ def test_deploy_parallel_separates_docker_and_regular( ] results = coordinator.deploy_parallel(specs) - # Regular module goes through worker manager - mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker specs go through DockerWorkerManager - mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) - # start() is NOT called during deploy — it's called in start_all_modules + mock_py.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + mock_docker.deploy_parallel.assert_called_once_with([(FakeDockerModule, (), {})]) mock_dm.start.assert_not_called() - # Results preserve input order assert results[0] is 
regular_proxy assert results[1] is mock_dm finally: coordinator.stop() - @patch("dimos.core.docker_module.DockerModuleOuter") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_stop_cleans_up_all_managers(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_module_cls.return_value = mock_dm + mock_docker.deploy.return_value = mock_dm coordinator = ModuleCoordinator() coordinator.start() @@ -189,10 +190,11 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke finally: coordinator.stop() - # stop() called exactly once (no double cleanup) + # Module stop() called assert mock_dm.stop.call_count == 1 - # Worker manager also closed - mock_worker_mgr.close_all.assert_called_once() + # Both managers stopped + mock_py.stop.assert_called_once() + mock_docker.stop.assert_called_once() class TestDockerModuleOuterGetattr: diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 795401d80e..bf6e7d1ed4 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -27,13 +27,14 @@ from dimos.utils.typing_utils import ExceptionGroup -class TestDockerWorkerManagerPartialFailure: - """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" +class TestWorkerManagerDockerPartialFailure: + """WorkerManagerDocker.deploy_parallel must stop successful containers when one fails.""" @patch("dimos.core.docker_module.DockerModuleOuter") def 
test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") mod_c = MagicMock(name="ModuleC") @@ -54,7 +55,7 @@ def fake_constructor(cls, *args, **kwargs): FakeC = type("C", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: - DockerWorkerManager.deploy_parallel( + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -72,7 +73,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. Should raise ExceptionGroup with both errors.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") @@ -94,7 +96,7 @@ def fake_constructor(cls, *args, **kwargs): FakeC = type("C", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: - DockerWorkerManager.deploy_parallel( + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -113,7 +115,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mocks = [MagicMock(name=f"Mod{i}") for i in 
range(3)] @@ -126,7 +129,7 @@ def fake_constructor(cls, *args, **kwargs): FakeB = type("B", (), {}) FakeC = type("C", (), {}) - results = DockerWorkerManager.deploy_parallel( + results = WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -141,7 +144,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") mod_a.stop.side_effect = OSError("stop failed") @@ -160,19 +164,22 @@ def fake_constructor(cls, *args, **kwargs): FakeB = type("B", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed"): - DockerWorkerManager.deploy_parallel([(FakeA, (), {}), (FakeB, (), {})]) + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( + [(FakeA, (), {}), (FakeB, (), {})] + ) # stop was attempted despite it raising mod_a.stop.assert_called_once() class TestWorkerManagerPartialFailure: - """WorkerManager.deploy_parallel must clean up successful RPCClients when one fails.""" + """WorkerManagerPython.deploy_parallel must clean up successful RPCClients when one fails.""" def test_middle_module_fails_cleans_up_siblings(self): - from dimos.core.worker_manager import WorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_python import WorkerManagerPython - manager = WorkerManager(n_workers=2) + manager = WorkerManagerPython(g=GlobalConfig(n_workers=2)) mock_workers = [MagicMock(name=f"Worker{i}") for i in range(2)] for w in mock_workers: @@ -198,7 +205,7 @@ def fake_deploy_module(module_class, args=(), kwargs=None): rpc_clients_created: 
list[MagicMock] = [] - with patch("dimos.core.worker_manager.RPCClient") as mock_rpc_cls: + with patch("dimos.core.worker_manager_python.RPCClient") as mock_rpc_cls: def make_rpc(actor, cls): client = MagicMock(name=f"rpc_{cls.__name__}") diff --git a/dimos/core/worker_manager_docker.py b/dimos/core/worker_manager_docker.py index 78bc9928c4..b35a7f000d 100644 --- a/dimos/core/worker_manager_docker.py +++ b/dimos/core/worker_manager_docker.py @@ -16,26 +16,51 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any -from dimos.core.module import ModuleSpec +from dimos.core.global_config import GlobalConfig +from dimos.core.module import ModuleBase, ModuleSpec +from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.docker_module import DockerModuleOuter + from dimos.core.rpc_client import ModuleProxyProtocol +logger = setup_logger() -class DockerWorkerManager: - """Parallel deployment of Docker-backed modules.""" - @staticmethod - def deploy_parallel( - specs: list[ModuleSpec], - ) -> list[DockerModuleOuter]: - """Deploy multiple DockerModules in parallel. +class WorkerManagerDocker: + """Manages deployment of Docker-backed modules.""" - If any deployment fails, all successfully-started containers are - stopped before an ExceptionGroup is raised. 
- """ + def __init__(self, g: GlobalConfig) -> None: + self._cfg = g + self._deployed: list[DockerModuleOuter] = [] + + def should_manage(self, module_class: type) -> bool: + # inlined to prevent circular dependency + from dimos.core.docker_module import is_docker_module + + return is_docker_module(module_class) + + def start(self) -> None: + """No-op — Docker manager has no persistent workers.""" + + def deploy( + self, + module_class: type[ModuleBase], + global_config: GlobalConfig, + kwargs: dict[str, Any], + ) -> ModuleProxyProtocol: + # inlined to prevent circular dependency + from dimos.core.docker_module import DockerModuleOuter + + mod = DockerModuleOuter(module_class, g=global_config, **kwargs) # type: ignore[arg-type] + mod.build() + self._deployed.append(mod) + return mod + + def deploy_parallel(self, specs: list[ModuleSpec]) -> list[ModuleProxyProtocol]: + # inlined to prevent circular dependency from dimos.core.docker_module import DockerModuleOuter def _on_errors( @@ -51,4 +76,19 @@ def _deploy_one(spec: ModuleSpec) -> DockerModuleOuter: mod.build() return mod - return safe_thread_map(specs, _deploy_one, _on_errors) + results = safe_thread_map(specs, _deploy_one, _on_errors) + self._deployed.extend(results) + return results # type: ignore[return-value] + + def stop(self) -> None: + for mod in reversed(self._deployed): + with suppress(Exception): + mod.stop() + self._deployed.clear() + + def health_check(self) -> bool: + # TODO: in the future decide on what a meaninful health check would be + return True + + def suppress_console(self) -> None: + """No-op — Docker containers manage their own stdio.""" diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager_python.py similarity index 62% rename from dimos/core/worker_manager.py rename to dimos/core/worker_manager_python.py index f12bffac66..12a0d11f68 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager_python.py @@ -16,35 +16,61 @@ from collections.abc import Iterable 
from contextlib import suppress -from typing import Any +from typing import TYPE_CHECKING, Any from dimos.core.global_config import GlobalConfig from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.rpc_client import RPCClient -from dimos.core.worker import Worker +from dimos.core.worker_python import Worker from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup +if TYPE_CHECKING: + from dimos.core.resource_monitor.monitor import StatsMonitor + logger = setup_logger() -class WorkerManager: - def __init__(self, n_workers: int = 2) -> None: - self._n_workers = n_workers +_MIN_WORKERS = 2 + + +class WorkerManagerPython: + def __init__(self, g: GlobalConfig) -> None: + self._cfg = g + self._max_workers = g.n_workers + self._worker_to_module_ratio = g.worker_to_module_ratio self._workers: list[Worker] = [] + self._n_modules = 0 self._closed = False self._started = False + self._stats_monitor: StatsMonitor | None = None + + def _desired_workers(self, n_modules: int) -> int: + """Target worker count: ratio * modules, clamped to [_MIN_WORKERS, max_workers].""" + from_ratio = int(n_modules * self._worker_to_module_ratio + 0.5) + return max(_MIN_WORKERS, min(from_ratio, self._max_workers)) + + def _ensure_workers(self, n_modules: int) -> None: + """Grow the worker pool to match the desired count for *n_modules*.""" + target = self._desired_workers(n_modules) + while len(self._workers) < target: + worker = Worker() + worker.start_process() + self._workers.append(worker) def start(self) -> None: if self._started: return self._started = True - for _ in range(self._n_workers): - worker = Worker() - worker.start_process() - self._workers.append(worker) - logger.info("Worker pool started.", n_workers=self._n_workers) + self._ensure_workers(self._n_modules) + logger.info("Worker pool started.", n_workers=len(self._workers)) + + if self._cfg.dtop: + from 
dimos.core.resource_monitor.monitor import StatsMonitor + + self._stats_monitor = StatsMonitor(self) + self._stats_monitor.start() def _select_worker(self) -> Worker: return min(self._workers, key=lambda w: w.module_count) @@ -53,28 +79,31 @@ def deploy( self, module_class: type[ModuleBase], global_config: GlobalConfig, kwargs: dict[str, Any] ) -> RPCClient: if self._closed: - raise RuntimeError("WorkerManager is closed") + raise RuntimeError("WorkerManagerPython is closed") - # Auto-start for backward compatibility if not self._started: self.start() + self._n_modules += 1 + self._ensure_workers(self._n_modules) worker = self._select_worker() actor = worker.deploy_module(module_class, global_config, kwargs=kwargs) return RPCClient(actor, module_class) def deploy_parallel(self, module_specs: Iterable[ModuleSpec]) -> list[RPCClient]: if self._closed: - raise RuntimeError("WorkerManager is closed") + raise RuntimeError("WorkerManagerPython is closed") module_specs = list(module_specs) if len(module_specs) == 0: return [] - # Auto-start for backward compatibility if not self._started: self.start() + self._n_modules += len(module_specs) + self._ensure_workers(self._n_modules) + # Pre-assign workers sequentially (so least-loaded accounting is # correct), then deploy concurrently via threads. The per-worker lock # serializes deploys that land on the same worker process. 
@@ -99,6 +128,21 @@ def _on_errors( _on_errors, ) + def should_manage(self, module_class: type) -> bool: + """Catch-all — accepts any module not claimed by another manager.""" + return True + + def health_check(self) -> bool: + """Verify all worker processes are alive.""" + if len(self._workers) == 0: + logger.error("health_check: no workers found") + return False + for w in self._workers: + if w.pid is None: + logger.error("health_check: worker died", worker_id=w.worker_id) + return False + return True + def suppress_console(self) -> None: """Tell all workers to redirect stdout/stderr to /dev/null.""" for worker in self._workers: @@ -108,11 +152,15 @@ def suppress_console(self) -> None: def workers(self) -> list[Worker]: return list(self._workers) - def close_all(self) -> None: + def stop(self) -> None: if self._closed: return self._closed = True + if self._stats_monitor is not None: + self._stats_monitor.stop() + self._stats_monitor = None + logger.info("Shutting down all workers...") for worker in reversed(self._workers): diff --git a/dimos/core/worker.py b/dimos/core/worker_python.py similarity index 100% rename from dimos/core/worker.py rename to dimos/core/worker_python.py diff --git a/dimos/hardware/sensors/camera/realsense/camera.py b/dimos/hardware/sensors/camera/realsense/camera.py index 821982981d..ca87ec3c1b 100644 --- a/dimos/hardware/sensors/camera/realsense/camera.py +++ b/dimos/hardware/sensors/camera/realsense/camera.py @@ -445,7 +445,7 @@ def get_depth_scale(self) -> float: def main() -> None: - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() camera = dimos.deploy(RealSenseCamera, enable_pointcloud=True, pointcloud_fps=5.0) # type: ignore[type-var] diff --git a/dimos/hardware/sensors/camera/zed/camera.py b/dimos/hardware/sensors/camera/zed/camera.py index dd429c29cf..d39a37f82f 100644 --- a/dimos/hardware/sensors/camera/zed/camera.py +++ b/dimos/hardware/sensors/camera/zed/camera.py @@ -491,7 +491,7 @@ def 
get_depth_scale(self) -> float: def main() -> None: - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() camera = dimos.deploy(ZEDCamera, enable_pointcloud=True, pointcloud_fps=5.0) # type: ignore[type-var] diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py index 1137a612f3..8a2be16668 100644 --- a/dimos/robot/cli/dimos.py +++ b/dimos/robot/cli/dimos.py @@ -177,9 +177,8 @@ def run( coordinator.stop() raise typer.Exit(1) - n_workers = coordinator.n_workers n_modules = coordinator.n_modules - typer.echo(f"✓ All modules started ({n_modules} modules, {n_workers} workers)") + typer.echo(f"✓ All modules started ({n_modules} modules)") typer.echo("✓ Health check passed") typer.echo("✓ DimOS running in background\n") typer.echo(f" Run ID: {run_id}") diff --git a/dimos/robot/unitree/b1/unitree_b1.py b/dimos/robot/unitree/b1/unitree_b1.py index 9a6d04a7ff..ab36850643 100644 --- a/dimos/robot/unitree/b1/unitree_b1.py +++ b/dimos/robot/unitree/b1/unitree_b1.py @@ -80,7 +80,7 @@ def __init__( self.capabilities = [RobotCapability.LOCOMOTION] self.connection = None self.joystick = None - self._dimos = ModuleCoordinator(n=2) + self._dimos = ModuleCoordinator() os.makedirs(self.output_dir, exist_ok=True) logger.info(f"Robot outputs will be saved to: {self.output_dir}") diff --git a/dimos/utils/demo_image_encoding.py b/dimos/utils/demo_image_encoding.py index 84b91acf79..148b5e842d 100644 --- a/dimos/utils/demo_image_encoding.py +++ b/dimos/utils/demo_image_encoding.py @@ -97,7 +97,7 @@ def main() -> None: ) args = parser.parse_args() - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() emitter = dimos.deploy(EmitterModule) receiver = dimos.deploy(ReceiverModule) From 434c321962f8271a09a23c9f2d2905e838dcb648 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 21:22:57 -0700 Subject: [PATCH 86/89] refactor blueprint build --- dimos/core/blueprints.py | 271 ++++++++++++++++--------------- 
dimos/core/module_coordinator.py | 35 ++++ 2 files changed, 176 insertions(+), 130 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index 314724386d..b1c855d4d7 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -55,6 +55,36 @@ class ModuleRef: spec: type[Spec] | type[ModuleBase] +@dataclass(frozen=True) +class StreamWiring: + """Compiled instruction: set a transport on a module's stream.""" + + module_class: type[ModuleBase] + stream_name: str + transport: PubSubTransport[Any] + + +@dataclass(frozen=True) +class ModuleRefWiring: + """Compiled instruction: link base_module.ref_name → target_module.""" + + base_module: type[ModuleBase] + ref_name: str + target_module: type[ModuleBase] + + +@dataclass(frozen=True) +class RpcWiringPlan: + """Compiled RPC wiring: registry of methods + per-module binding requests.""" + + # rpc_key -> (module_class, method_name) — the full callable registry + registry: dict[str, tuple[type[ModuleBase], str]] + # (module_class, set_method_name, linked_rpc_key) — for set_X pattern + set_methods: tuple[tuple[type[ModuleBase], str, str], ...] + # (module_class, requested_name, rpc_key) — for rpc_calls pattern + rpc_call_bindings: tuple[tuple[type[ModuleBase], str, str], ...] 
+ + @dataclass(frozen=True) class _BlueprintAtom: kwargs: dict[str, Any] @@ -166,7 +196,7 @@ def _active_blueprints(self) -> tuple[_BlueprintAtom, ...]: def _check_ambiguity( self, requested_method_name: str, - interface_methods: Mapping[str, list[tuple[type[ModuleBase], Callable[..., Any]]]], + interface_methods: Mapping[str, list[tuple[type[ModuleBase], str]]], requesting_module: type[ModuleBase], ) -> None: if ( @@ -189,17 +219,13 @@ def _get_transport_for(self, name: str, stream_type: type) -> PubSubTransport[An use_pickled = getattr(stream_type, "lcm_encode", None) is None topic = f"/{name}" if self._is_name_unique(name) else f"/{short_id()}" - transport = pLCMTransport(topic) if use_pickled else LCMTransport(topic, stream_type) - - return transport + return pLCMTransport(topic) if use_pickled else LCMTransport(topic, stream_type) @cached_property def _all_name_types(self) -> set[tuple[str, type]]: - # Apply remappings to get the actual names that will be used result = set() for blueprint in self._active_blueprints: for conn in blueprint.streams: - # Check if this stream should be remapped remapped_name = self.remapping_map.get((blueprint.module, conn.name), conn.name) if isinstance(remapped_name, str): result.add((remapped_name, conn.type)) @@ -274,65 +300,69 @@ def _verify_no_name_conflicts(self) -> None: raise ValueError("\n".join(error_lines)) - def _deploy_all_modules( - self, module_coordinator: ModuleCoordinator, global_config: GlobalConfig - ) -> None: - module_specs: list[ModuleSpec] = [] + def _compile_module_specs(self, g: GlobalConfig) -> list[ModuleSpec]: + """Compile the list of module deployment specs (pure — no side effects).""" + specs: list[ModuleSpec] = [] for blueprint in self._active_blueprints: - module_specs.append((blueprint.module, global_config, blueprint.kwargs)) - - module_coordinator.deploy_parallel(module_specs) + specs.append((blueprint.module, g, blueprint.kwargs)) + return specs - def _connect_streams(self, module_coordinator: 
ModuleCoordinator) -> None: - # dict when given (final/remapped) stream name+type, provides a list of modules + original (non-remapped) stream names - streams = defaultdict(list) + def _compile_stream_wiring(self) -> list[StreamWiring]: + """Compile stream transport assignments (pure — no side effects).""" + # Group streams by (remapped_name, type) -> [(module_class, original_name)] + streams: dict[ + tuple[str | type[ModuleBase] | type[Spec], type], list[tuple[type[ModuleBase], str]] + ] = defaultdict(list) for blueprint in self._active_blueprints: for conn in blueprint.streams: - # Check if this stream should be remapped remapped_name = self.remapping_map.get((blueprint.module, conn.name), conn.name) if isinstance(remapped_name, str): - # Group by remapped name and type streams[remapped_name, conn.type].append((blueprint.module, conn.name)) - # Connect all In/Out streams by remapped name and type. - for remapped_name, stream_type in streams.keys(): + wiring: list[StreamWiring] = [] + for (remapped_name, stream_type), module_streams in streams.items(): + assert isinstance(remapped_name, str) transport = self._get_transport_for(remapped_name, stream_type) - for module, original_name in streams[(remapped_name, stream_type)]: - instance = module_coordinator.get_instance(module) # type: ignore[assignment] - instance.set_transport(original_name, transport) # type: ignore[union-attr] + for module_class, original_name in module_streams: + wiring.append( + StreamWiring( + module_class=module_class, + stream_name=original_name, + transport=transport, + ) + ) logger.info( "Transport", name=remapped_name, original_name=original_name, topic=str(getattr(transport, "topic", None)), type=f"{stream_type.__module__}.{stream_type.__qualname__}", - module=module.__name__, + module=module_class.__name__, transport=transport.__class__.__name__, ) + return wiring + + def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: + """Resolve module references and return wiring plan 
(pure — no side effects).""" + mod_and_mod_ref_to_target: dict[tuple[type[ModuleBase], str], type[ModuleBase]] = {} - def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: - # partly fill out the mod_and_mod_ref_to_proxy - mod_and_mod_ref_to_proxy = { - (module, name): replacement - for (module, name), replacement in self.remapping_map.items() - if is_spec(replacement) or is_module_type(replacement) - } + # Seed with explicit remappings that point to modules/specs + for (module, name), replacement in self.remapping_map.items(): + if is_module_type(replacement): + mod_and_mod_ref_to_target[module, name] = replacement # type: ignore[assignment] - # after this loop we should have an exact module for every module_ref on every blueprint for blueprint in self._active_blueprints: for each_module_ref in blueprint.module_refs: - # we've got to find a another module that implements this spec - spec = mod_and_mod_ref_to_proxy.get( - (blueprint.module, each_module_ref.name), each_module_ref.spec - ) + key = (blueprint.module, each_module_ref.name) + if key in mod_and_mod_ref_to_target: + continue - # if the spec is actually module, use that (basically a user override) + spec = self.remapping_map.get(key, each_module_ref.spec) if is_module_type(spec): - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = spec + mod_and_mod_ref_to_target[key] = spec # type: ignore[assignment] continue - # find all available candidates possible_module_candidates = [ each_other_blueprint.module for each_other_blueprint in self._active_blueprints @@ -341,33 +371,26 @@ def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: and spec_structural_compliance(each_other_blueprint.module, spec) ) ] - # we keep valid separate from invalid to provide a better error message for "almost" valid cases valid_module_candidates = [ each_candidate for each_candidate in possible_module_candidates if spec_annotation_compliance(each_candidate, spec) ] - # 
none + if len(possible_module_candidates) == 0: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I couldn't find a module that met that spec.\n""" ) - # exactly one structurally valid candidate elif len(possible_module_candidates) == 1: if len(valid_module_candidates) == 0: logger.warning( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. I found a module ({possible_module_candidates[0].__name__}) that met that spec structurally, but it had a mismatch in type annotations.\nPlease either change the {each_module_ref.spec.__name__} spec or the {possible_module_candidates[0].__name__} module.\n""" ) - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = ( - possible_module_candidates[0] - ) - continue - # more than one + mod_and_mod_ref_to_target[key] = possible_module_candidates[0] elif len(valid_module_candidates) > 1: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I found multiple modules that met that spec: {possible_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" ) - # structural candidates, but no valid candidates elif len(valid_module_candidates) == 0: possible_module_candidates_str = ", ".join( [each_candidate.__name__ for each_candidate in possible_module_candidates] @@ -375,129 +398,118 @@ def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. 
Some modules ({possible_module_candidates_str}) met the spec structurally but had a mismatch in type annotations\n""" ) - # one valid candidate (and more than one structurally valid candidate) else: - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = ( - valid_module_candidates[0] - ) - - # now that we know the streams, we mutate the RPCClient objects - for (base_module, module_ref_name), target_module in mod_and_mod_ref_to_proxy.items(): - base_module_proxy = module_coordinator.get_instance(base_module) - target_module_proxy = module_coordinator.get_instance(target_module) # type: ignore[type-var,arg-type] - setattr( - base_module_proxy, - module_ref_name, - target_module_proxy, - ) - # Ensure the remote module instance can use the module ref inside its own RPC handlers. - base_module_proxy.set_module_ref(module_ref_name, target_module_proxy) - - def _connect_rpc_methods(self, module_coordinator: ModuleCoordinator) -> None: - # Gather all RPC methods. - rpc_methods = {} - rpc_methods_dot = {} - - # Track interface methods to detect ambiguity. 
- interface_methods: defaultdict[str, list[tuple[type[ModuleBase], Callable[..., Any]]]] = ( - defaultdict(list) - ) # interface_name_method -> [(module_class, method)] - interface_methods_dot: defaultdict[ - str, list[tuple[type[ModuleBase], Callable[..., Any]]] - ] = defaultdict(list) # interface_name.method -> [(module_class, method)] + mod_and_mod_ref_to_target[key] = valid_module_candidates[0] + + return [ + ModuleRefWiring(base_module=base_module, ref_name=ref_name, target_module=target) + for (base_module, ref_name), target in mod_and_mod_ref_to_target.items() + ] + + def _compile_rpc_wiring(self) -> RpcWiringPlan: + """Compile the RPC method registry and binding requests (pure — no side effects).""" + # registry: rpc_key -> (module_class, method_name) + registry: dict[str, tuple[type[ModuleBase], str]] = {} + + # Track interface methods to detect ambiguity + interface_methods: defaultdict[str, list[tuple[type[ModuleBase], str]]] = defaultdict(list) + interface_methods_dot: defaultdict[str, list[tuple[type[ModuleBase], str]]] = defaultdict( + list + ) for blueprint in self._active_blueprints: for method_name in blueprint.module.rpcs.keys(): # type: ignore[attr-defined] - module_proxy = module_coordinator.get_instance(blueprint.module) # type: ignore[assignment] - method_for_rpc_client = getattr(module_proxy, method_name) - # Register under concrete class name (backward compatibility) - rpc_methods[f"{blueprint.module.__name__}_{method_name}"] = method_for_rpc_client - rpc_methods_dot[f"{blueprint.module.__name__}.{method_name}"] = ( - method_for_rpc_client + registry[f"{blueprint.module.__name__}_{method_name}"] = ( + blueprint.module, + method_name, + ) + registry[f"{blueprint.module.__name__}.{method_name}"] = ( + blueprint.module, + method_name, ) - # Also register under any interface names for base in blueprint.module.mro(): - # Check if this base is an abstract interface with the method if ( base is not Module and issubclass(base, ABC) and hasattr(base, 
method_name) and getattr(base, method_name, None) is not None ): - interface_key = f"{base.__name__}.{method_name}" - interface_methods_dot[interface_key].append( - (blueprint.module, method_for_rpc_client) + interface_methods_dot[f"{base.__name__}.{method_name}"].append( + (blueprint.module, method_name) ) - interface_key_underscore = f"{base.__name__}_{method_name}" - interface_methods[interface_key_underscore].append( - (blueprint.module, method_for_rpc_client) + interface_methods[f"{base.__name__}_{method_name}"].append( + (blueprint.module, method_name) ) - # Check for ambiguity in interface methods and add non-ambiguous ones - for interface_key, implementations in interface_methods_dot.items(): + # Add non-ambiguous interface methods to registry + for key, implementations in interface_methods_dot.items(): if len(implementations) == 1: - rpc_methods_dot[interface_key] = implementations[0][1] - for interface_key, implementations in interface_methods.items(): + registry[key] = implementations[0] + for key, implementations in interface_methods.items(): if len(implementations) == 1: - rpc_methods[interface_key] = implementations[0][1] + registry[key] = implementations[0] - # Fulfil method requests (so modules can call each other). 
+ # Compile set_ method bindings + set_methods: list[tuple[type[ModuleBase], str, str]] = [] for blueprint in self._active_blueprints: - instance = module_coordinator.get_instance(blueprint.module) # type: ignore[assignment] - for method_name in blueprint.module.rpcs.keys(): # type: ignore[attr-defined] if not method_name.startswith("set_"): continue - linked_name = method_name.removeprefix("set_") - self._check_ambiguity(linked_name, interface_methods, blueprint.module) + if linked_name in registry: + set_methods.append((blueprint.module, method_name, linked_name)) - if linked_name not in rpc_methods: - continue - - getattr(instance, method_name)(rpc_methods[linked_name]) - - for requested_method_name in instance.get_rpc_method_names(): # type: ignore[union-attr] - self._check_ambiguity( - requested_method_name, interface_methods_dot, blueprint.module - ) - - if requested_method_name not in rpc_methods_dot: - continue - - instance.set_rpc_method( # type: ignore[union-attr] - requested_method_name, rpc_methods_dot[requested_method_name] - ) + # Compile rpc_call bindings (uses rpc_calls list from module) + rpc_call_bindings: list[tuple[type[ModuleBase], str, str]] = [] + for blueprint in self._active_blueprints: + rpc_call_names: list[str] = getattr(blueprint.module, "rpc_calls", []) + for requested_name in rpc_call_names: + self._check_ambiguity(requested_name, interface_methods_dot, blueprint.module) + if requested_name in registry: + rpc_call_bindings.append((blueprint.module, requested_name, requested_name)) + + return RpcWiringPlan( + registry=registry, + set_methods=tuple(set_methods), + rpc_call_bindings=tuple(rpc_call_bindings), + ) def build( self, cli_config_overrides: Mapping[str, Any] | None = None, ) -> ModuleCoordinator: logger.info("Building the blueprint") + + # Phase 1: Configuration global_config.update(**dict(self.global_config_overrides)) if cli_config_overrides: global_config.update(**dict(cli_config_overrides)) + # Phase 2: Validation 
self._run_configurators() self._check_requirements() self._verify_no_name_conflicts() - logger.info("Starting the modules") - module_coordinator = ModuleCoordinator(g=global_config) - module_coordinator.start() + # Phase 3: Compile wiring plans (pure — no side effects) + module_specs = self._compile_module_specs(global_config) + stream_wiring = self._compile_stream_wiring() + module_ref_wiring = self._compile_module_ref_wiring() + rpc_wiring = self._compile_rpc_wiring() - # all module constructors are called here (each of them setup their own) - self._deploy_all_modules(module_coordinator, global_config) - self._connect_streams(module_coordinator) - self._connect_rpc_methods(module_coordinator) - self._connect_module_refs(module_coordinator) - - module_coordinator.build_all_modules() - module_coordinator.start_all_modules() + # Phase 4: Execute (all mutations go through coordinator) + logger.info("Starting the modules") + coordinator = ModuleCoordinator(g=global_config) + coordinator.start() + coordinator.deploy_parallel(module_specs) + coordinator.wire_streams(stream_wiring) + coordinator.wire_rpc_methods(rpc_wiring) + coordinator.wire_module_refs(module_ref_wiring) + coordinator.build_all_modules() + coordinator.start_all_modules() - return module_coordinator + return coordinator def autoconnect(*blueprints: Blueprint) -> Blueprint: @@ -528,7 +540,6 @@ def autoconnect(*blueprints: Blueprint) -> Blueprint: def _eliminate_duplicates(blueprints: list[_BlueprintAtom]) -> list[_BlueprintAtom]: - # The duplicates are eliminated in reverse so that newer blueprints override older ones. 
seen = set() unique_blueprints = [] for bp in reversed(blueprints): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d1020e61bd..bf828eecda 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -28,6 +28,7 @@ from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: + from dimos.core.blueprints import ModuleRefWiring, RpcWiringPlan, StreamWiring from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = setup_logger() @@ -149,6 +150,40 @@ def _on_errors( _register() return results + def wire_streams(self, wiring: list[StreamWiring]) -> None: + """Apply stream transports to deployed modules.""" + for w in wiring: + instance = self.get_instance(w.module_class) + instance.set_transport(w.stream_name, w.transport) # type: ignore[union-attr] + + def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: + """Wire RPC methods between modules using the compiled plan.""" + # Build callable registry from deployed instances + callables: dict[str, Any] = {} + for rpc_key, (module_class, method_name) in plan.registry.items(): + proxy = self.get_instance(module_class) + callables[rpc_key] = getattr(proxy, method_name) + + # Apply set_ methods + for module_class, set_method, linked_key in plan.set_methods: + if linked_key in callables: + instance = self.get_instance(module_class) + getattr(instance, set_method)(callables[linked_key]) + + # Apply rpc_call bindings + for module_class, requested_name, rpc_key in plan.rpc_call_bindings: + if rpc_key in callables: + instance = self.get_instance(module_class) + instance.set_rpc_method(requested_name, callables[rpc_key]) # type: ignore[union-attr] + + def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: + """Set module references between deployed modules.""" + for w in wiring: + base_proxy = self.get_instance(w.base_module) + target_proxy = self.get_instance(w.target_module) + setattr(base_proxy, w.ref_name, target_proxy) + 
base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] + def build_all_modules(self) -> None: """Call build() on all deployed modules in parallel. From 97c3ab6255499448cb571227be7ee9f042e0dcab Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 26 Mar 2026 00:03:31 -0700 Subject: [PATCH 87/89] split ModuleCoordinator from Blueprint --- dimos/core/blueprints.py | 33 ++++++++++++++++++-------------- dimos/core/module_coordinator.py | 24 +++++++++++++++++------ 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index b1c855d4d7..a91714bd9f 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -85,6 +85,16 @@ class RpcWiringPlan: rpc_call_bindings: tuple[tuple[type[ModuleBase], str, str], ...] +@dataclass(frozen=True) +class DeploySpec: + """Complete deployment specification compiled by Blueprint.build().""" + + module_specs: list[ModuleSpec] + stream_wiring: list[StreamWiring] + rpc_wiring: RpcWiringPlan + module_ref_wiring: list[ModuleRefWiring] + + @dataclass(frozen=True) class _BlueprintAtom: kwargs: dict[str, Any] @@ -389,7 +399,7 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: mod_and_mod_ref_to_target[key] = possible_module_candidates[0] elif len(valid_module_candidates) > 1: raise Exception( - f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I found multiple modules that met that spec: {possible_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" + f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. 
But I found multiple modules that met that spec: {valid_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" ) elif len(valid_module_candidates) == 0: possible_module_candidates_str = ", ".join( @@ -492,23 +502,18 @@ def build( self._check_requirements() self._verify_no_name_conflicts() - # Phase 3: Compile wiring plans (pure — no side effects) - module_specs = self._compile_module_specs(global_config) - stream_wiring = self._compile_stream_wiring() - module_ref_wiring = self._compile_module_ref_wiring() - rpc_wiring = self._compile_rpc_wiring() + # Phase 3: Compile deploy spec (pure — no side effects) + deploy_spec = DeploySpec( + module_specs=self._compile_module_specs(global_config), + stream_wiring=self._compile_stream_wiring(), + module_ref_wiring=self._compile_module_ref_wiring(), + rpc_wiring=self._compile_rpc_wiring(), + ) # Phase 4: Execute (all mutations go through coordinator) logger.info("Starting the modules") - coordinator = ModuleCoordinator(g=global_config) + coordinator = ModuleCoordinator(g=global_config, deploy_spec=deploy_spec) coordinator.start() - coordinator.deploy_parallel(module_specs) - coordinator.wire_streams(stream_wiring) - coordinator.wire_rpc_methods(rpc_wiring) - coordinator.wire_module_refs(module_ref_wiring) - coordinator.build_all_modules() - coordinator.start_all_modules() - return coordinator diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index bf828eecda..6af0bf78dd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -28,7 +28,7 @@ from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: - from dimos.core.blueprints import ModuleRefWiring, RpcWiringPlan, StreamWiring + from dimos.core.blueprints import DeploySpec, ModuleRefWiring, RpcWiringPlan, StreamWiring from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = 
setup_logger() @@ -47,13 +47,16 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _managers: list[WorkerManagerDocker | WorkerManagerPython] _global_config: GlobalConfig + _deploy_spec: DeploySpec | None _deployed_modules: dict[type[ModuleBase], ModuleProxyProtocol] def __init__( self, g: GlobalConfig = global_config, + deploy_spec: DeploySpec | None = None, ) -> None: self._global_config = g + self._deploy_spec = deploy_spec self._managers = [] self._deployed_modules = {} @@ -65,6 +68,15 @@ def start(self) -> None: for m in self._managers: m.start() + if self._deploy_spec is not None: + spec = self._deploy_spec + self.deploy_parallel(spec.module_specs) + self._wire_streams(spec.stream_wiring) + self._wire_rpc_methods(spec.rpc_wiring) + self._wire_module_refs(spec.module_ref_wiring) + self._build_all_modules() + self.start_all_modules() + def _find_manager( self, module_class: type[ModuleBase[Any]] ) -> WorkerManagerDocker | WorkerManagerPython: @@ -143,20 +155,20 @@ def _register() -> None: def _on_errors( _outcomes: list[Any], _successes: list[Any], errors: list[Exception] ) -> None: - _register() + # Don't register partially-deployed modules — managers handle their own cleanup. 
raise ExceptionGroup("deploy_parallel failed", errors) safe_thread_map(list(groups.keys()), _deploy_group, _on_errors) _register() return results - def wire_streams(self, wiring: list[StreamWiring]) -> None: + def _wire_streams(self, wiring: list[StreamWiring]) -> None: """Apply stream transports to deployed modules.""" for w in wiring: instance = self.get_instance(w.module_class) instance.set_transport(w.stream_name, w.transport) # type: ignore[union-attr] - def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: + def _wire_rpc_methods(self, plan: RpcWiringPlan) -> None: """Wire RPC methods between modules using the compiled plan.""" # Build callable registry from deployed instances callables: dict[str, Any] = {} @@ -176,7 +188,7 @@ def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: instance = self.get_instance(module_class) instance.set_rpc_method(requested_name, callables[rpc_key]) # type: ignore[union-attr] - def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: + def _wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: """Set module references between deployed modules.""" for w in wiring: base_proxy = self.get_instance(w.base_module) @@ -184,7 +196,7 @@ def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: setattr(base_proxy, w.ref_name, target_proxy) base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] - def build_all_modules(self) -> None: + def _build_all_modules(self) -> None: """Call build() on all deployed modules in parallel. build() handles heavy one-time work (docker builds, LFS downloads, etc.) 
From 7ded86939fbfbdb5b96b9229d20683cf063f52a5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 31 Mar 2026 17:01:41 -0700 Subject: [PATCH 88/89] docs: remove outdated singleton claim from ModuleCoordinator --- dimos/core/module_coordinator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8ec88920bf..d331f3c385 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -36,7 +36,6 @@ class ModuleCoordinator(Resource): # type: ignore[misc] """ - There should only ever be one module coordinator instance (this is a singleton) - Module (classes) should be able to be deployed, stopped, and re-deployed in on one instance of ModuleCoordinator - Arguably ModuleCoordinator could be called the "DimosRuntime" - ModuleCoordinator is responsible for all global "addresses". From e3b508686162bfd96245892d2d97d1773df5a1f3 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 31 Mar 2026 17:05:34 -0700 Subject: [PATCH 89/89] feat: restore _DisabledModuleProxy for disabled module ref wiring - Added _DisabledModuleProxy class back to blueprints.py - Added optional field to ModuleRef - _compile_module_ref_wiring now detects disabled providers and creates no-op proxies - DeploySpec carries disabled_ref_proxies - ModuleCoordinator._wire_disabled_ref_proxies wires them after module refs --- dimos/core/blueprints.py | 60 ++++++++++++++++++++++++++++++-- dimos/core/module_coordinator.py | 8 +++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index a91714bd9f..6f6f0cd793 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -42,6 +42,30 @@ logger = setup_logger() +class _DisabledModuleProxy: + def __init__(self, spec_name: str) -> None: + object.__setattr__(self, "_spec_name", spec_name) + + def __getattr__(self, name: str) -> Any: + spec = object.__getattribute__(self, "_spec_name") + + def 
_noop(*_args: Any, **_kwargs: Any) -> None: + logger.warning( + "Called on disabled module (no-op)", + method=name, + spec=spec, + ) + return None + + return _noop + + def __reduce__(self) -> tuple[type, tuple[str]]: + return (_DisabledModuleProxy, (self._spec_name,)) + + def __repr__(self) -> str: + return f"<_DisabledModuleProxy spec={self._spec_name}>" + + + @dataclass(frozen=True) class StreamRef: name: str @@ -53,6 +77,7 @@ class ModuleRef: name: str spec: type[Spec] | type[ModuleBase] + optional: bool = False @dataclass(frozen=True) @@ -93,6 +118,7 @@ class DeploySpec: stream_wiring: list[StreamWiring] rpc_wiring: RpcWiringPlan module_ref_wiring: list[ModuleRefWiring] + disabled_ref_proxies: dict[tuple[type[ModuleBase], str], _DisabledModuleProxy] = field(default_factory=dict) @dataclass(frozen=True) @@ -356,6 +382,8 @@ def _compile_stream_wiring(self) -> list[StreamWiring]: def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: """Resolve module references and return wiring plan (pure — no side effects).""" mod_and_mod_ref_to_target: dict[tuple[type[ModuleBase], str], type[ModuleBase]] = {} + disabled_ref_proxies: dict[tuple[type[ModuleBase], str], _DisabledModuleProxy] = {} + disabled_set = set(self.disabled_modules_tuple) # Seed with explicit remappings that point to modules/specs for (module, name), replacement in self.remapping_map.items(): @@ -388,6 +416,31 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: ] if len(possible_module_candidates) == 0: + if each_module_ref.optional: + continue + # Check whether a *disabled* module would have satisfied this ref. 
+ disabled_candidate = next( + ( + bp.module + for bp in self.blueprints + if bp.module in disabled_set + and spec_structural_compliance(bp.module, spec) + ), + None, + ) + if disabled_candidate is not None: + logger.warning( + "Module ref unsatisfied because provider is disabled; " + "installing no-op proxy", + ref=each_module_ref.name, + consumer=blueprint.module.__name__, + disabled_provider=disabled_candidate.__name__, + spec=each_module_ref.spec.__name__, + ) + disabled_ref_proxies[blueprint.module, each_module_ref.name] = ( + _DisabledModuleProxy(each_module_ref.spec.__name__) + ) + continue raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I couldn't find a module that met that spec.\n""" ) @@ -411,10 +464,11 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: else: mod_and_mod_ref_to_target[key] = valid_module_candidates[0] - return [ + wiring = [ ModuleRefWiring(base_module=base_module, ref_name=ref_name, target_module=target) for (base_module, ref_name), target in mod_and_mod_ref_to_target.items() ] + return wiring, disabled_ref_proxies def _compile_rpc_wiring(self) -> RpcWiringPlan: """Compile the RPC method registry and binding requests (pure — no side effects).""" @@ -503,11 +557,13 @@ def build( self._verify_no_name_conflicts() # Phase 3: Compile deploy spec (pure — no side effects) + module_ref_wiring, disabled_ref_proxies = self._compile_module_ref_wiring() deploy_spec = DeploySpec( module_specs=self._compile_module_specs(global_config), stream_wiring=self._compile_stream_wiring(), - module_ref_wiring=self._compile_module_ref_wiring(), + module_ref_wiring=module_ref_wiring, rpc_wiring=self._compile_rpc_wiring(), + disabled_ref_proxies=disabled_ref_proxies, ) # Phase 4: Execute (all mutations go through coordinator) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 
d331f3c385..63154541db 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -75,6 +75,7 @@ def start(self) -> None: self._wire_streams(spec.stream_wiring) self._wire_rpc_methods(spec.rpc_wiring) self._wire_module_refs(spec.module_ref_wiring) + self._wire_disabled_ref_proxies(spec.disabled_ref_proxies) self._build_all_modules() self.start_all_modules() @@ -189,6 +190,13 @@ def _wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: setattr(base_proxy, w.ref_name, target_proxy) base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] + def _wire_disabled_ref_proxies(self, proxies: dict[tuple[type[ModuleBase], str], Any]) -> None: + """Wire up no-op proxies for refs whose providers were disabled.""" + for (base_module, module_ref_name), proxy in proxies.items(): + base_module_proxy = self.get_instance(base_module) + setattr(base_module_proxy, module_ref_name, proxy) + base_module_proxy.set_module_ref(module_ref_name, proxy) # type: ignore[union-attr] + def _build_all_modules(self) -> None: """Call build() on all deployed modules in parallel.