From 5daada572a51e4544c7882883044dac177781f49 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 20:43:46 -0800 Subject: [PATCH 01/89] add docker example --- examples/docker_hello_world/Dockerfile | 15 +++ examples/docker_hello_world/hello_docker.py | 134 ++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 examples/docker_hello_world/Dockerfile create mode 100644 examples/docker_hello_world/hello_docker.py diff --git a/examples/docker_hello_world/Dockerfile b/examples/docker_hello_world/Dockerfile new file mode 100644 index 0000000000..3ceb24b3b4 --- /dev/null +++ b/examples/docker_hello_world/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.12-slim + +RUN apt-get update && apt-get install -y \ + iproute2 \ + libx11-6 libgl1 libglib2.0-0 \ + libidn2-0 libgfortran5 libgomp1 \ + cowsay \ + && rm -rf /var/lib/apt/lists/* + + +# Copy example module so it's importable inside the container +COPY examples/docker_hello_world/hello_docker.py /dimos/source/examples/docker_hello_world/hello_docker.py +RUN touch /dimos/source/examples/__init__.py /dimos/source/examples/docker_hello_world/__init__.py + +WORKDIR /app diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py new file mode 100644 index 0000000000..c6a5f0bb3e --- /dev/null +++ b/examples/docker_hello_world/hello_docker.py @@ -0,0 +1,134 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Hello World Docker Module +========================== + +Minimal example showing a DimOS module running inside Docker. + +The module receives a string on its ``prompt`` input stream, runs it through +cowsay inside the container, and publishes the ASCII art on its ``greeting`` +output stream. + +NOTE: Requires Linux. Docker Desktop on macOS does not support host networking, +which is needed for LCM multicast between host and container. + +Usage: + python examples/docker_hello_world/hello_docker.py +""" + +from __future__ import annotations + +from pathlib import Path +import subprocess +import time + +from dimos.core.blueprints import autoconnect +from dimos.core.core import rpc +from dimos.core.docker_runner import DockerModuleConfig +from dimos.core.module import Module +from dimos.core.stream import In, Out + +# --------------------------------------------------------------------------- +# Docker module (runs inside container) +# --------------------------------------------------------------------------- + + +class HelloDockerConfig(DockerModuleConfig): + docker_image: str = "dimos-hello-docker:latest" + docker_file: Path | None = Path(__file__).parent / "Dockerfile" + docker_build_context: Path | None = Path(__file__).parents[2] # repo root + docker_gpus: str | None = None # no GPU needed + docker_rm: bool = True + docker_restart_policy: str = "no" + docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + + +class HelloDockerModule(Module["HelloDockerConfig"]): + """A trivial module that runs inside Docker and echoes greetings.""" + + default_config = HelloDockerConfig + + prompt: In[str] + greeting: Out[str] + + @rpc + def start(self) -> None: + super().start() + self.prompt.subscribe(self._on_prompt) + + def _cowsay(self, text: str) -> str: + """Run cowsay inside the container and return the ASCII art.""" + result = subprocess.run( + ["/usr/games/cowsay", text], + capture_output=True, + text=True, + ) + return result.stdout + 
+ def _on_prompt(self, text: str) -> None: + art = self._cowsay(text) + print(f"[HelloDockerModule]\n{art}") + self.greeting.publish(art) + + @rpc + def greet(self, name: str) -> str: + """RPC method that can be called directly.""" + return self._cowsay(f"Hello, {name}!") + + +# --------------------------------------------------------------------------- +# Host-side module (sends prompts and prints greetings) +# --------------------------------------------------------------------------- + + +class PromptModule(Module): + """Publishes prompts and listens to greetings.""" + + prompt: Out[str] + greeting: In[str] + + @rpc + def start(self) -> None: + super().start() + self.greeting.subscribe(self._on_greeting) + + def _on_greeting(self, text: str) -> None: + print(f"[PromptModule] Received: {text}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + coordinator = autoconnect( + PromptModule.blueprint(), + HelloDockerModule.blueprint(), + ).build() + + # Get module proxies + prompt_mod = coordinator.get_instance(PromptModule) + docker_mod = coordinator.get_instance(HelloDockerModule) + + # Test RPC + print(docker_mod.greet("World")) + + # Test stream + prompt_mod.prompt.publish("stream test") + time.sleep(2) + + coordinator.close_all() + print("Done!") From 1412542bfdd6e762729d77e01e3ce08c441ebaca Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 21:10:12 -0800 Subject: [PATCH 02/89] add docker module system --- dimos/core/docker_worker_manager.py | 57 ++++++++++++++++++++++++++++ dimos/core/module_coordinator.py | 58 +++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..42843577ba --- /dev/null +++ 
b/dimos/core/docker_worker_manager.py @@ -0,0 +1,57 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from dimos.core.docker_runner import DockerModule +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from dimos.core.module import Module + +logger = setup_logger() + + +class DockerWorkerManager: + """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" + + def __init__(self) -> None: + self._docker_modules: list[DockerModule] = [] + self._closed = False + + def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + if self._closed: + raise RuntimeError("DockerWorkerManager is closed") + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + self._docker_modules.append(dm) + return dm + + def close_all(self) -> None: + if self._closed: + return + self._closed = True + + logger.info("Stopping all Docker modules...") + for dm in reversed(self._docker_modules): + try: + dm.stop() + except Exception: + logger.error("Error stopping Docker module", exc_info=True) + + self._docker_modules.clear() + logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 86afb9ebc4..9d33255d4c 100644 --- 
a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,8 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -33,6 +35,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None + _docker_client: DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -53,6 +56,7 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() + self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -73,15 +77,23 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) + if self._docker_client is not None: + self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module: ModuleProxy = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - self._deployed_modules[module_class] = module - return module + if is_docker_module(module_class): + if not self._docker_client: + self._docker_client = DockerWorkerManager() + module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + else: + module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + + 
self._deployed_modules[module_class] = module # type: ignore[assignment] + return module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] @@ -89,10 +101,42 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - modules = self._client.deploy_parallel(module_specs) - for (module_class, _, _), module in zip(module_specs, modules, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] - return modules # type: ignore[return-value] + # Separate docker modules from regular modules + docker_specs = [] + worker_specs = [] + spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) + + for spec in module_specs: + module_class = spec[0] + if is_docker_module(module_class): + spec_indices.append(("docker", len(docker_specs))) + docker_specs.append(spec) + else: + spec_indices.append(("worker", len(worker_specs))) + worker_specs.append(spec) + + # Deploy worker modules in parallel via WorkerManager + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + + # Deploy docker modules (each gets its own DockerModule) + docker_results: list[Any] = [] + for module_class, args, kwargs in docker_specs: + if not self._docker_client: + self._docker_client = DockerWorkerManager() + dm = self._docker_client.deploy(module_class, *args, **kwargs) + docker_results.append(dm) + + # Reassemble results in original order + results: list[Any] = [] + for kind, idx in spec_indices: + if kind == "docker": + results.append(docker_results[idx]) + else: + results.append(worker_results[idx]) + + for (module_class, _, _), module in zip(module_specs, results, strict=True): + self._deployed_modules[module_class] = module + return results # type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From b63bf73177f0ef2fd8ff138d232f1a97d10cbbd5 Mon Sep 17 00:00:00 2001 
From: Jeff Hykin Date: Wed, 4 Mar 2026 22:15:46 -0800 Subject: [PATCH 03/89] fixup --- .gitignore | 1 + dimos/core/docker_runner.py | 41 +++- dimos/core/docker_worker_manager.py | 1 + dimos/core/module.py | 3 +- dimos/core/module_coordinator.py | 15 +- dimos/core/tests/test_docker_deployment.py | 223 ++++++++++++++++++++ examples/docker_hello_world/hello_docker.py | 9 +- pyproject.toml | 2 + uv.lock | 4 + 9 files changed, 285 insertions(+), 14 deletions(-) create mode 100644 dimos/core/tests/test_docker_deployment.py diff --git a/.gitignore b/.gitignore index 4045db012e..12b2f19ca3 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ package-lock.json # Ignore build artifacts dist/ build/ +.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index ee56163ca6..566e28a70e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,17 +25,20 @@ import time from typing import TYPE_CHECKING, Any -from dimos.core.docker_build import build_image, image_exists -from dimos.core.module import Module, ModuleConfig +from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall -from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT + +# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 if TYPE_CHECKING: from collections.abc import Callable from pathlib import Path + from dimos.core.module import Module + logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution @@ -186,7 +189,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" ) - # RPC setup + # RPC setup (lazy import to keep container-side 
imports light) + from dimos.protocol.rpc import LCMRPC + self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) @@ -194,6 +199,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._bound_rpc_calls: dict[str, RpcCall] = {} # Build image if needed (but don't start - caller must call start() explicitly) + from dimos.core.docker_build import build_image, image_exists + if not image_exists(config): logger.info(f"Building {config.docker_image}") build_image(config) @@ -400,7 +407,29 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if cfg.docker_command: return list(cfg.docker_command) - module_path = f"{self._module_class.__module__}.{self._module_class.__name__}" + module_name = self._module_class.__module__ + if module_name == "__main__": + # When run as `python script.py`, __module__ is "__main__". + # Resolve to the actual dotted module path so the container can import it. + import __main__ + + spec = getattr(__main__, "__spec__", None) + if spec and spec.name: + module_name = spec.name + else: + # Fallback: derive from file path relative to cwd + main_file = getattr(__main__, "__file__", None) + if main_file: + import pathlib + + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + module_name = str(rel.with_suffix("")).replace("/", ".") + else: + raise RuntimeError( + "Cannot determine module path for __main__. " + "Run with `python -m` or set docker_command explicitly." + ) + module_path = f"{module_name}.{self._module_class.__name__}" # Filter out docker-specific kwargs (paths, etc.) 
- only pass module config kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 42843577ba..97f27a6d7a 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,6 +38,7 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) + dm.start() # Docker modules must be running before streams/RPC can be wired self._docker_modules.append(dm) return dm diff --git a/dimos/core/module.py b/dimos/core/module.py index 48a99a79a3..127be545fe 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,11 +218,12 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: + _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name != "rpcs" # Exclude the rpcs property itself to prevent recursion + and name not in _skip and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9d33255d4c..dae1760b9e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,14 +18,13 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: + from dimos.core.docker_worker_manager import 
DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -53,6 +52,8 @@ def __init__( self._deployed_modules = {} def start(self) -> None: + from dimos.core.docker_worker_manager import DockerWorkerManager + n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -85,6 +86,9 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -101,9 +105,12 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") + from dimos.core.docker_runner import is_docker_module + from dimos.core.docker_worker_manager import DockerWorkerManager + # Separate docker modules from regular modules - docker_specs = [] - worker_specs = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) for spec in module_specs: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py new file mode 100644 index 0000000000..85f2b0508a --- /dev/null +++ b/dimos/core/tests/test_docker_deployment.py @@ -0,0 +1,223 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Smoke tests for Docker module deployment routing. + +These tests verify that the ModuleCoordinator correctly detects and routes +docker modules to the DockerWorkerManager WITHOUT actually running Docker. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +import pytest + +from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.module import Module +from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.stream import Out + +if TYPE_CHECKING: + from pathlib import Path + +# -- Fixtures: fake module classes ------------------------------------------- + + +@dataclass +class FakeDockerConfig(DockerModuleConfig): + docker_image: str = "fake:latest" + docker_file: Path | None = None + docker_gpus: str | None = None + docker_rm: bool = True + docker_restart_policy: str = "no" + + +class FakeDockerModule(Module["FakeDockerConfig"]): + default_config = FakeDockerConfig + output: Out[str] + + +class FakeRegularModule(Module): + output: Out[str] + + +# -- Tests ------------------------------------------------------------------- + + +class TestIsDockerModule: + def test_docker_module_detected(self): + assert is_docker_module(FakeDockerModule) is True + + def test_regular_module_not_detected(self): + assert is_docker_module(FakeRegularModule) is False + + def test_plain_class_not_detected(self): + assert is_docker_module(str) is False 
+ + def test_no_default_config(self): + class Bare(Module): + pass + + # Module has default_config = ModuleConfig, which is not DockerModuleConfig + assert is_docker_module(Bare) is False + + +class TestDockerWorkerManager: + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_creates_docker_module(self, mock_docker_module_cls): + mock_instance = MagicMock() + mock_docker_module_cls.return_value = mock_instance + + mgr = DockerWorkerManager() + result = mgr.deploy(FakeDockerModule, some_kwarg="value") + + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") + assert result is mock_instance + assert len(mgr._docker_modules) == 1 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): + dm1 = MagicMock() + dm2 = MagicMock() + mock_docker_module_cls.side_effect = [dm1, dm2] + + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.deploy(FakeDockerModule) + mgr.close_all() + + # Stopped in reverse order + assert dm2.stop.call_count == 1 + assert dm1.stop.call_count == 1 + assert dm2.stop.called + assert dm1.stop.called + assert len(mgr._docker_modules) == 0 + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_close_all_idempotent(self, mock_docker_module_cls): + mock_docker_module_cls.return_value = MagicMock() + mgr = DockerWorkerManager() + mgr.deploy(FakeDockerModule) + mgr.close_all() + mgr.close_all() # second call should be no-op + + @patch("dimos.core.docker_worker_manager.DockerModule") + def test_deploy_after_close_raises(self, mock_docker_module_cls): + mgr = DockerWorkerManager() + mgr.close_all() + with pytest.raises(RuntimeError, match="closed"): + mgr.deploy(FakeDockerModule) + + +class TestModuleCoordinatorDockerRouting: + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_docker_module_to_docker_manager( + 
self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeDockerModule) + + # Should NOT go through worker manager + mock_worker_mgr.deploy.assert_not_called() + # Should create a DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + assert result is mock_dm + # Should be tracked + assert coordinator.get_instance(FakeDockerModule) is mock_dm + + coordinator.stop() + + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + mock_proxy = MagicMock() + mock_worker_mgr.deploy.return_value = mock_proxy + + coordinator = ModuleCoordinator() + coordinator.start() + + result = coordinator.deploy(FakeRegularModule) + + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + assert result is mock_proxy + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_parallel_separates_docker_and_regular( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + regular_proxy = MagicMock() + mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + + specs = [ + (FakeRegularModule, (), {}), + (FakeDockerModule, (), {}), + ] + results = coordinator.deploy_parallel(specs) + + # Regular module goes through worker manager + 
mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + # Docker module gets its own DockerModule + mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + + # Results are in original order + assert results[0] is regular_proxy + assert results[1] is mock_dm + + coordinator.stop() + + @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_docker_module_cls.return_value = mock_dm + + coordinator = ModuleCoordinator() + coordinator.start() + coordinator.deploy(FakeDockerModule) + coordinator.stop() + + # The deployed module's stop() is called during coordinator.stop() loop + mock_dm.stop.assert_called() + # Worker manager also closed + mock_worker_mgr.close_all.assert_called_once() diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index c6a5f0bb3e..871be6f5d2 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -31,11 +31,11 @@ from __future__ import annotations +from dataclasses import dataclass, field from pathlib import Path import subprocess import time -from dimos.core.blueprints import autoconnect from dimos.core.core import rpc from dimos.core.docker_runner import DockerModuleConfig from dimos.core.module import Module @@ -46,6 +46,7 @@ # --------------------------------------------------------------------------- +@dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): docker_image: str = "dimos-hello-docker:latest" docker_file: Path | None = Path(__file__).parent / "Dockerfile" @@ -53,7 +54,7 @@ class HelloDockerConfig(DockerModuleConfig): docker_gpus: str | None = None # no GPU needed docker_rm: bool = True 
docker_restart_policy: str = "no" - docker_env: dict[str, str] = {"CI": "1"} # skip interactive system configurator + docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) class HelloDockerModule(Module["HelloDockerConfig"]): @@ -114,6 +115,8 @@ def _on_greeting(self, text: str) -> None: # --------------------------------------------------------------------------- if __name__ == "__main__": + from dimos.core.blueprints import autoconnect + coordinator = autoconnect( PromptModule.blueprint(), HelloDockerModule.blueprint(), @@ -130,5 +133,5 @@ def _on_greeting(self, text: str) -> None: prompt_mod.prompt.publish("stream test") time.sleep(2) - coordinator.close_all() + coordinator.stop() print("Done!") diff --git a/pyproject.toml b/pyproject.toml index cb4607ced5..55eb570836 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -294,6 +294,8 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", + "langchain-core", + "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", ] diff --git a/uv.lock b/uv.lock index 2f53ef0e6f..a7e9070a7d 100644 --- a/uv.lock +++ b/uv.lock @@ -1848,6 +1848,7 @@ dev = [ ] docker = [ { name = "dimos-lcm" }, + { name = "langchain-core" }, { name = "lcm" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -1865,6 +1866,7 @@ docker = [ { name = "sortedcontainers" }, { name = "structlog" }, { name = "typer" }, + { name = "typing-extensions" }, ] drone = [ { name = "pymavlink" }, @@ -2003,6 +2005,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = "==1.2.3" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = 
"langchain-core", marker = "extra == 'agents'", specifier = "==1.2.3" }, + { name = "langchain-core", marker = "extra == 'docker'" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -2118,6 +2121,7 @@ requires-dist = [ { name = "types-tabulate", marker = "extra == 'dev'", specifier = ">=0.9.0.20241207,<1" }, { name = "types-tensorflow", marker = "extra == 'dev'", specifier = ">=2.18.0.20251008,<3" }, { name = "types-tqdm", marker = "extra == 'dev'", specifier = ">=4.67.0.20250809,<5" }, + { name = "typing-extensions", marker = "extra == 'docker'" }, { name = "ultralytics", marker = "extra == 'perception'", specifier = ">=8.3.70" }, { name = "unitree-webrtc-connect-leshy", marker = "extra == 'unitree'", specifier = ">=2.0.7" }, { name = "uvicorn", marker = "extra == 'web'", specifier = ">=0.34.0" }, From 580eda4d6a621644f7fec36a6846bbf6b827672a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:33:41 -0800 Subject: [PATCH 04/89] fix rerun imports --- dimos/core/docker_runner.py | 5 +---- dimos/visualization/rerun/bridge.py | 3 --- dimos/visualization/rerun/constants.py | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 566e28a70e..2735b0cefe 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,10 +28,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall from dimos.utils.logging_config import setup_logger - -# Inlined from dimos.visualization.rerun.bridge to avoid heavy import chain in containers -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 +from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT if 
TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 47bce27dcf..420ffd1769 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,9 +39,6 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 - # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py new file mode 100644 index 0000000000..e1c98176ad --- /dev/null +++ b/dimos/visualization/rerun/constants.py @@ -0,0 +1,17 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# isolated so that they can be imported into lightweight modules without importing all of rerun +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 From 5374de612c2942c5553fda4b37b2eaa07522755c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:37:43 -0800 Subject: [PATCH 05/89] fixup imports --- dimos/core/module_coordinator.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index dae1760b9e..155ffb28db 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,13 +18,14 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_runner import is_docker_module +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: - from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy @@ -52,8 +53,6 @@ def __init__( self._deployed_modules = {} def start(self) -> None: - from dimos.core.docker_worker_manager import DockerWorkerManager - n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() @@ -86,9 +85,6 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - if is_docker_module(module_class): if not self._docker_client: self._docker_client = DockerWorkerManager() @@ -105,9 +101,6 @@ def deploy_parallel( if not self._client: raise 
ValueError("Not started") - from dimos.core.docker_runner import is_docker_module - from dimos.core.docker_worker_manager import DockerWorkerManager - # Separate docker modules from regular modules docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] From bc66a45fdaba0d81453575942537e6f6fd5b78fd Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:48:14 -0800 Subject: [PATCH 06/89] fixup --- dimos/core/docker_runner.py | 9 ++++++++- dimos/core/docker_worker_manager.py | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 2735b0cefe..f6bbd98325 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -419,7 +419,14 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: if main_file: import pathlib - rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + try: + rel = pathlib.Path(main_file).resolve().relative_to(pathlib.Path.cwd()) + except ValueError: + raise RuntimeError( + f"Cannot derive module path: '{main_file}' is not under cwd " + f"'{pathlib.Path.cwd()}'. " + "Run with `python -m` or set docker_command explicitly." 
+ ) from None module_name = str(rel.with_suffix("")).replace("/", ".") else: raise RuntimeError( diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 97f27a6d7a..bd432f18e2 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -14,6 +14,7 @@ from __future__ import annotations +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.docker_runner import DockerModule @@ -38,7 +39,12 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) - dm.start() # Docker modules must be running before streams/RPC can be wired + try: + dm.start() # Docker modules must be running before streams/RPC can be wired + except Exception: + with suppress(Exception): + dm.stop() + raise self._docker_modules.append(dm) return dm From d8436097a1b3bc43a6d30e0df334f45d63a29cde Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 22:52:39 -0800 Subject: [PATCH 07/89] simplify stop logic --- dimos/core/docker_worker_manager.py | 21 --------------------- dimos/core/module_coordinator.py | 2 -- 2 files changed, 23 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index bd432f18e2..8e368d15a8 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -31,13 +31,8 @@ class DockerWorkerManager: def __init__(self) -> None: self._docker_modules: list[DockerModule] = [] - self._closed = False def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - if self._closed: - raise RuntimeError("DockerWorkerManager is closed") - - logger.info("Deploying module in Docker.", module=module_class.__name__) dm = DockerModule(module_class, *args, **kwargs) try: dm.start() # Docker modules must be running before streams/RPC can be wired 
@@ -45,20 +40,4 @@ def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Docke with suppress(Exception): dm.stop() raise - self._docker_modules.append(dm) return dm - - def close_all(self) -> None: - if self._closed: - return - self._closed = True - - logger.info("Stopping all Docker modules...") - for dm in reversed(self._docker_modules): - try: - dm.stop() - except Exception: - logger.error("Error stopping Docker module", exc_info=True) - - self._docker_modules.clear() - logger.info("All Docker modules stopped.") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 155ffb28db..97541640dc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -77,8 +77,6 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - if self._docker_client is not None: - self._docker_client.close_all() self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] From 30254a140324cac2c541c7825cf58a144151bb90 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:10:06 -0800 Subject: [PATCH 08/89] simplify and explain --- dimos/core/docker_worker_manager.py | 43 ---------- dimos/core/module_coordinator.py | 36 ++++++--- dimos/core/tests/test_docker_deployment.py | 91 ++++++++-------------- 3 files changed, 57 insertions(+), 113 deletions(-) delete mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py deleted file mode 100644 index 8e368d15a8..0000000000 --- a/dimos/core/docker_worker_manager.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from contextlib import suppress -from typing import TYPE_CHECKING, Any - -from dimos.core.docker_runner import DockerModule -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from dimos.core.module import Module - -logger = setup_logger() - - -class DockerWorkerManager: - """Manages DockerModule instances, mirroring WorkerManager's interface for docker-based modules.""" - - def __init__(self) -> None: - self._docker_modules: list[DockerModule] = [] - - def deploy(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - dm = DockerModule(module_class, *args, **kwargs) - try: - dm.start() # Docker modules must be running before streams/RPC can be wired - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 97541640dc..25f8fdbc22 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,8 +18,7 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager +from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -35,7 +34,6 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _client: WorkerManager | None = None - _docker_client: 
DockerWorkerManager | None = None _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" @@ -56,7 +54,6 @@ def start(self) -> None: n = self._n if self._n is not None else 2 self._client = WorkerManager(n_workers=n) self._client.start() - self._docker_client = DockerWorkerManager() if self._global_config.dtop: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -79,14 +76,30 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] + def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: + from contextlib import suppress + + logger.info("Deploying module in Docker.", module=module_class.__name__) + dm = DockerModule(module_class, *args, **kwargs) + try: + # why are docker modules started here? shouldn't they be started in start_all_modules? + # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries + # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
+ # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that + # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things + dm.start() + except Exception: + with suppress(Exception): + dm.stop() + raise + return dm + def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") if is_docker_module(module_class): - if not self._docker_client: - self._docker_client = DockerWorkerManager() - module = self._docker_client.deploy(module_class, *args, **kwargs) # type: ignore[assignment] + module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] else: module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] @@ -119,9 +132,7 @@ def deploy_parallel( # Deploy docker modules (each gets its own DockerModule) docker_results: list[Any] = [] for module_class, args, kwargs in docker_specs: - if not self._docker_client: - self._docker_client = DockerWorkerManager() - dm = self._docker_client.deploy(module_class, *args, **kwargs) + dm = self._deploy_docker(module_class, *args, **kwargs) docker_results.append(dm) # Reassemble results in original order @@ -137,9 +148,10 @@ def deploy_parallel( return results # type: ignore[return-value] def start_all_modules(self) -> None: - modules = list(self._deployed_modules.values()) + # Docker modules are already started during deploy, (see their deploy as to why this is) + modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=len(modules)) as executor: + with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), 
modules)) else: for module in modules: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 85f2b0508a..99c1debbb6 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -16,7 +16,7 @@ Smoke tests for Docker module deployment routing. These tests verify that the ModuleCoordinator correctly detects and routes -docker modules to the DockerWorkerManager WITHOUT actually running Docker. +docker modules to DockerModule WITHOUT actually running Docker. """ from __future__ import annotations @@ -28,7 +28,6 @@ import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out @@ -78,59 +77,10 @@ class Bare(Module): assert is_docker_module(Bare) is False -class TestDockerWorkerManager: - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_creates_docker_module(self, mock_docker_module_cls): - mock_instance = MagicMock() - mock_docker_module_cls.return_value = mock_instance - - mgr = DockerWorkerManager() - result = mgr.deploy(FakeDockerModule, some_kwarg="value") - - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, some_kwarg="value") - assert result is mock_instance - assert len(mgr._docker_modules) == 1 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_stops_in_reverse_order(self, mock_docker_module_cls): - dm1 = MagicMock() - dm2 = MagicMock() - mock_docker_module_cls.side_effect = [dm1, dm2] - - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.deploy(FakeDockerModule) - mgr.close_all() - - # Stopped in reverse order - assert dm2.stop.call_count == 1 - assert dm1.stop.call_count == 1 - assert dm2.stop.called - assert dm1.stop.called - assert len(mgr._docker_modules) == 
0 - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_close_all_idempotent(self, mock_docker_module_cls): - mock_docker_module_cls.return_value = MagicMock() - mgr = DockerWorkerManager() - mgr.deploy(FakeDockerModule) - mgr.close_all() - mgr.close_all() # second call should be no-op - - @patch("dimos.core.docker_worker_manager.DockerModule") - def test_deploy_after_close_raises(self, mock_docker_module_cls): - mgr = DockerWorkerManager() - mgr.close_all() - with pytest.raises(RuntimeError, match="closed"): - mgr.deploy(FakeDockerModule) - - class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_docker_module_to_docker_manager( - self, mock_worker_manager_cls, mock_docker_module_cls - ): + def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -144,14 +94,38 @@ def test_deploy_routes_docker_module_to_docker_manager( # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule + # Should create a DockerModule and start it mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() assert result is mock_dm # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() + @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.module_coordinator.WorkerManager") + def test_deploy_docker_cleans_up_on_start_failure( + self, mock_worker_manager_cls, mock_docker_module_cls + ): + mock_worker_mgr = MagicMock() + mock_worker_manager_cls.return_value = mock_worker_mgr + + mock_dm = MagicMock() + mock_dm.start.side_effect = RuntimeError("start failed") + mock_docker_module_cls.return_value = mock_dm + + 
coordinator = ModuleCoordinator() + coordinator.start() + + with pytest.raises(RuntimeError, match="start failed"): + coordinator.deploy(FakeDockerModule) + + # stop() called to clean up the failed container + mock_dm.stop.assert_called_once() + + coordinator.stop() + @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): mock_worker_mgr = MagicMock() @@ -169,7 +143,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -196,6 +170,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_dm.start.assert_called_once() # Results are in original order assert results[0] is regular_proxy @@ -203,7 +178,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerModule") + @patch("dimos.core.module_coordinator.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -217,7 +192,7 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke coordinator.deploy(FakeDockerModule) coordinator.stop() - # The deployed module's stop() is called during coordinator.stop() loop - mock_dm.stop.assert_called() + # stop() called exactly once (no double cleanup) + assert mock_dm.stop.call_count == 1 # Worker manager also closed 
mock_worker_mgr.close_all.assert_called_once() From 16565b02070275669ad18fdcf45aa501c5849290 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:26:39 -0800 Subject: [PATCH 09/89] parallel start of docker modules --- dimos/core/module_coordinator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 25f8fdbc22..b16812a4dd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -129,11 +129,16 @@ def deploy_parallel( # Deploy worker modules in parallel via WorkerManager worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules (each gets its own DockerModule) - docker_results: list[Any] = [] - for module_class, args, kwargs in docker_specs: - dm = self._deploy_docker(module_class, *args, **kwargs) - docker_results.append(dm) + # Deploy docker modules in parallel (each starts its own container) + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + futures = [ + executor.submit(self._deploy_docker, module_class, *args, **kwargs) + for module_class, args, kwargs in docker_specs + ] + docker_results: list[Any] = [f.result() for f in futures] + else: + docker_results: list[Any] = [] # Reassemble results in original order results: list[Any] = [] From 91170176f7a820eb1b059cd1c7a32b3780f5b3ee Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:33:09 -0800 Subject: [PATCH 10/89] fix container name to be stable --- dimos/core/docker_runner.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index f6bbd98325..1fc281c035 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -18,7 +18,6 @@ from dataclasses import dataclass, field import importlib import json -import os import signal import subprocess import 
threading @@ -181,9 +180,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = ( - config.docker_container_name - or f"dimos_{module_class.__name__.lower()}_{os.getpid()}_{int(time.time())}" + self._container_name = config.docker_container_name or self._default_container_name( + module_class, config ) # RPC setup (lazy import to keep container-side imports light) @@ -202,6 +200,16 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Building {config.docker_image}") build_image(config) + @staticmethod + def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: + import hashlib + + name = module_class.__name__.lower() + path_hash = hashlib.sha256( + str(config.docker_file.resolve()).encode() # type: ignore[union-attr] + ).hexdigest()[:12] + return f"dimos_{name}_{path_hash}" + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable From ab150fa1b663784191daf4644f13e7dcf0c4c1ec Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 4 Mar 2026 23:51:12 -0800 Subject: [PATCH 11/89] lazy import --- dimos/core/o3dpickle.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1912ab7739..1c1464fece 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,25 +14,34 @@ import copyreg -import numpy as np -import open3d as o3d # type: ignore[import-untyped] - +# open3d is imported lazily (inside functions) rather than at module level. +# dimos.core.core imports this module just to register pickle handlers, and core is +# imported by almost everything — including lightweight docker modules that don't use +# open3d. 
A module-level import would drag in open3d's sklearn/scipy chain everywhere, +# which crashes in environments where those packages aren't installed or version-matched. +# (i.e. minimal docker envs) def reduce_external(obj): # type: ignore[no-untyped-def] + import numpy as np + # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - # Create new PointCloud and assign the points + import open3d as o3d # type: ignore[import-untyped] + pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - # Register for the actual PointCloud class that gets instantiated - # We need to create a dummy PointCloud to get its actual class + try: + import open3d as o3d # type: ignore[import-untyped] + except ImportError: + return # open3d not installed in this environment; skip registration + _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) From 84c045e106bbf7b91dcaa0cc3e3238c891355780 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:08:21 -0800 Subject: [PATCH 12/89] clean up --- dimos/core/docker_runner.py | 139 ++++++++++++-------- dimos/core/module.py | 25 +++- dimos/core/module_coordinator.py | 89 ++++--------- dimos/core/o3dpickle.py | 21 +-- dimos/core/tests/test_docker_deployment.py | 21 ++- examples/docker_hello_world/hello_docker.py | 7 +- 6 files changed, 155 insertions(+), 147 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1fc281c035..c6a196b7a7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -26,6 +26,7 @@ from dimos.core.module import ModuleConfig from dimos.core.rpc_client import RpcCall +from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.constants 
import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -139,6 +140,32 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") +def _prompt_restart(container_name: str) -> bool: + """Ask the user whether to restart a running container. + + Returns True to restart, False to reuse. + Falls back to restart when stdin is not a TTY (e.g. CI). + """ + import sys + + if not sys.stdin.isatty(): + logger.warning( + f"Container '{container_name}' already running — restarting (non-interactive)." + ) + return True + + print(f"\nContainer '{container_name}' is already running.") + print(" [r] Restart — stop the existing container and start a fresh one") + print(" [u] Use — attach to the existing container as-is") + while True: + choice = input("Choice [r/u]: ").strip().lower() + if choice in ("r", "restart"): + return True + if choice in ("u", "use"): + return False + print("Please enter 'r' or 'u'.") + + def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -161,21 +188,22 @@ class DockerModule: Host-side handle for a module running inside Docker. Lifecycle: - - start(): launches container, waits for module ready via RPC - - stop(): stops container - - __getattr__: exposes RpcCall for @rpc methods on remote module + - start(): builds the image if needed, launches the container, waits for readiness, calls the remote module's start() RPC (after streams are wired) + - stop(): stops the container and cleans up Communication: All RPC happens via LCM multicast (requires --network=host). 
""" + config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: # Config config_class = getattr(module_class, "default_config", DockerModuleConfig) + assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - + # Module info self._module_class = module_class - self._config = config + self.config = config self._args = args self._kwargs = kwargs self._running = False @@ -184,21 +212,13 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - # RPC setup (lazy import to keep container-side imports light) - from dimos.protocol.rpc import LCMRPC self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - - # Build image if needed (but don't start - caller must call start() explicitly) - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + self._deferred_transports: dict[str, str] = {} # stream_name -> topic @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -210,44 +230,56 @@ def _default_container_name(module_class: type[Module], config: DockerModuleConf ).hexdigest()[:12] return f"dimos_{name}_{path_hash}" + def get_rpc_method_names(self) -> list[str]: + return self.rpc_calls + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: - # Check all requested methods exist missing = set(methods) - self._bound_rpc_calls.keys() if missing: raise ValueError(f"RPC methods not found: {missing}") - # Return single RpcCall or tuple calls = 
tuple(self._bound_rpc_calls[m] for m in methods) return calls[0] if len(calls) == 1 else calls def start(self) -> None: - if self._running: - return + """Invoke the remote module's start() RPC. - cfg = self._config + Called after stream transports are wired so the module can subscribe + to its streams with valid transports. + """ + from dimos.core.docker_build import build_image, image_exists - # Prevent accidental kill of running container with same name - if _is_container_running(cfg, self._container_name): - raise RuntimeError( - f"Container '{self._container_name}' already running. " - "Choose a different container_name or stop the existing container." - ) - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) + if not image_exists(self.config): + logger.info(f"Building {self.config.docker_image}") + build_image(self.config) + try: - self.rpc.start() - self._running = True - self._wait_for_ready() + cfg = self.config + if _is_container_running(cfg, self._container_name): + restart = _prompt_restart(self._container_name) + if restart: + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(cfg, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + self._configure_streams(self._deferred_transports) + self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) + except Exception: + with suppress(Exception): + self.stop() + raise def stop(self) -> None: 
"""Gracefully stop the Docker container and clean up resources.""" @@ -263,13 +295,13 @@ def stop(self) -> None: self._unsub_fns.clear() # Stop and remove container - _run([_docker_bin(self._config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self._config, self._container_name) + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") def status(self) -> dict[str, Any]: - cfg = self._config + cfg = self.config return { "module": self.remote_name, "container_name": self._container_name, @@ -278,19 +310,17 @@ def status(self) -> dict[str, Any]: } def tail_logs(self, n: int = 200) -> str: - return _tail_logs(self._config, self._container_name, n=n) + return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> bool: - """Configure stream transport in container. 
Mirrors Module.set_transport() for autoconnect().""" + """Defer stream transport config until start() when the container is running.""" topic = getattr(transport, "topic", None) if topic is None: return False if hasattr(topic, "topic"): topic = topic.topic - result, _ = self.rpc.call_sync( - f"{self.remote_name}/configure_stream", ([stream_name, str(topic)], {}) - ) - return bool(result) + self._deferred_transports[stream_name] = str(topic) + return True def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -302,7 +332,7 @@ def __getattr__(self, name: str) -> Any: def _build_docker_run_command(self) -> list[str]: """Build the complete `docker run` command.""" - cfg = self._config + cfg = self.config self._validate_config(cfg) cmd = [_docker_bin(cfg), "run", "-d"] @@ -448,9 +478,13 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _wait_for_ready(self) -> None: - """Poll the module's RPC endpoint until ready, crashed, or timeout.""" - cfg = self._config + def _configure_streams(self, streams: dict[str, str]) -> None: + """Poll configure_streams RPC until the container's RPC server is up, then wire streams. + + Also serves as the liveness gate — the first successful call proves the + container is ready to accept RPCs. 
+ """ + cfg = self.config start_time = time.time() logger.info(f"Waiting for {self.remote_name} to be ready...") @@ -462,13 +496,14 @@ def _wait_for_ready(self) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/start", ([], {}), rpc_timeout=RPC_READY_TIMEOUT + f"{self.remote_name}/configure_streams", + ([streams], {}), + rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") return except (TimeoutError, ConnectionError, OSError): - # Module not ready yet - retry after poll interval time.sleep(cfg.docker_poll_interval) logs = _tail_logs(cfg, self._container_name) diff --git a/dimos/core/module.py b/dimos/core/module.py index 127be545fe..72df61d4c7 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -446,15 +446,26 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type return True @rpc - def configure_stream(self, stream_name: str, topic: str) -> bool: - """Configure a stream's transport by topic. Called by DockerModule for stream wiring.""" + def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: + """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring. 
+ + Args: + streams: mapping of stream_name -> topic + + Returns: + mapping of stream_name -> success + """ from dimos.core.transport import pLCMTransport - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - return False - stream._transport = pLCMTransport(topic) - return True + results: dict[str, bool] = {} + for stream_name, topic in streams.items(): + stream = getattr(self, stream_name, None) + if not isinstance(stream, (Out, In)): + results[stream_name] = False + else: + stream._transport = pLCMTransport(topic) + results[stream_name] = True + return results # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index b16812a4dd..3d71e8776b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -76,33 +76,14 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] - def _deploy_docker(self, module_class: type[Module], *args: Any, **kwargs: Any) -> DockerModule: - from contextlib import suppress - - logger.info("Deploying module in Docker.", module=module_class.__name__) - dm = DockerModule(module_class, *args, **kwargs) - try: - # why are docker modules started here? shouldn't they be started in start_all_modules? - # this is a bigger design problem we have with how blueprints, ModuleCoordinator, and WorkerManager are leaky abstractions with imperfect boundaries - # the Stream/RPC wiring (in blueprints) happens after deploy but before start. For docker modules, wiring needs the container's LCM transport to be reachable — which requires the container to be running. 
- # self.rpc.call_sync() send an RPC call to the container during wiring, the container must be running to handle that - # if we defer start() to start_all_modules, the container won't be up yet when _connect_streams and _connect_rpc_methods try to wire things - dm.start() - except Exception: - with suppress(Exception): - dm.stop() - raise - return dm - def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - if is_docker_module(module_class): - module = self._deploy_docker(module_class, *args, **kwargs) # type: ignore[assignment] - else: - module = self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - + module = ( + DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] + if is_docker_module(module_class) + else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] + ) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] @@ -112,49 +93,38 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for spec in module_specs: - module_class = spec[0] - if is_docker_module(module_class): - spec_indices.append(("docker", len(docker_specs))) - docker_specs.append(spec) - else: - spec_indices.append(("worker", len(worker_specs))) - worker_specs.append(spec) - - # Deploy worker modules in parallel via WorkerManager + docker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if 
is_docker_module(module_class) + ] + worker_specs = [ + (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class) + ] + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - # Deploy docker modules in parallel (each starts its own container) + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - futures = [ - executor.submit(self._deploy_docker, module_class, *args, **kwargs) - for module_class, args, kwargs in docker_specs - ] - docker_results: list[Any] = [f.result() for f in futures] - else: - docker_results: list[Any] = [] - - # Reassemble results in original order - results: list[Any] = [] - for kind, idx in spec_indices: - if kind == "docker": - results.append(docker_results[idx]) - else: - results.append(worker_results[idx]) + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) + ) + + # Reassemble in original order + worker_iter = iter(worker_results) + docker_iter = iter(docker_results) + results: list[Any] = [ + next(docker_iter) if is_docker_module(module_class) else next(worker_iter) + for module_class, _, _ in module_specs + ] for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module + self._deployed_modules[module_class] = module # type: ignore[assignment] return results # type: ignore[return-value] def start_all_modules(self) -> None: - # Docker modules are already started during deploy, (see their deploy as to why this is) - modules = [m for cls, m in self._deployed_modules.items() if not is_docker_module(cls)] + modules = list(self._deployed_modules.values()) if isinstance(self._client, WorkerManager): with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: list(executor.map(lambda m: m.start(), modules)) @@ -162,10 +132,9 @@ def start_all_modules(self) -> 
None: for module in modules: module.start() - module_list = list(self._deployed_modules.values()) for module in modules: if hasattr(module, "on_system_modules"): - module.on_system_modules(module_list) + module.on_system_modules(modules) def get_instance(self, module: type[ModuleT]) -> ModuleProxy: return self._deployed_modules.get(module) # type: ignore[return-value, no-any-return] diff --git a/dimos/core/o3dpickle.py b/dimos/core/o3dpickle.py index 1c1464fece..1912ab7739 100644 --- a/dimos/core/o3dpickle.py +++ b/dimos/core/o3dpickle.py @@ -14,34 +14,25 @@ import copyreg -# open3d is imported lazily (inside functions) rather than at module level. -# dimos.core.core imports this module just to register pickle handlers, and core is -# imported by almost everything — including lightweight docker modules that don't use -# open3d. A module-level import would drag in open3d's sklearn/scipy chain everywhere, -# which crashes in environments where those packages aren't installed or version-matched. -# (i.e. 
minimal docker envs) +import numpy as np +import open3d as o3d # type: ignore[import-untyped] -def reduce_external(obj): # type: ignore[no-untyped-def] - import numpy as np +def reduce_external(obj): # type: ignore[no-untyped-def] # Convert Vector3dVector to numpy array for pickling points_array = np.asarray(obj.points) return (reconstruct_pointcloud, (points_array,)) def reconstruct_pointcloud(points_array): # type: ignore[no-untyped-def] - import open3d as o3d # type: ignore[import-untyped] - + # Create new PointCloud and assign the points pc = o3d.geometry.PointCloud() pc.points = o3d.utility.Vector3dVector(points_array) return pc def register_picklers() -> None: - try: - import open3d as o3d # type: ignore[import-untyped] - except ImportError: - return # open3d not installed in this environment; skip registration - + # Register for the actual PointCloud class that gets instantiated + # We need to create a dummy PointCloud to get its actual class _dummy_pc = o3d.geometry.PointCloud() copyreg.pickle(_dummy_pc.__class__, reduce_external) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 99c1debbb6..7a02682fda 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -94,36 +94,32 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should create a DockerModule and start it + # Should construct a DockerModule (container launch happens inside __init__) mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() assert result is mock_dm - # Should be tracked assert coordinator.get_instance(FakeDockerModule) is mock_dm coordinator.stop() @patch("dimos.core.module_coordinator.DockerModule") 
@patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_docker_cleans_up_on_start_failure( + def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr - mock_dm = MagicMock() - mock_dm.start.side_effect = RuntimeError("start failed") - mock_docker_module_cls.return_value = mock_dm + # Container launch fails inside __init__; DockerModule handles its own cleanup + mock_docker_module_cls.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() coordinator.start() - with pytest.raises(RuntimeError, match="start failed"): + with pytest.raises(RuntimeError, match="launch failed"): coordinator.deploy(FakeDockerModule) - # stop() called to clean up the failed container - mock_dm.stop.assert_called_once() - coordinator.stop() @patch("dimos.core.module_coordinator.WorkerManager") @@ -170,7 +166,8 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) # Docker module gets its own DockerModule mock_docker_module_cls.assert_called_once_with(FakeDockerModule) - mock_dm.start.assert_called_once() + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() # Results are in original order assert results[0] is regular_proxy diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 871be6f5d2..187384854e 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -106,6 +106,11 @@ def start(self) -> None: super().start() self.greeting.subscribe(self._on_greeting) + @rpc + def send(self, text: str) -> None: + """Publish a prompt message onto the stream.""" + self.prompt.publish(text) + def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") @@ 
-130,7 +135,7 @@ def _on_greeting(self, text: str) -> None: print(docker_mod.greet("World")) # Test stream - prompt_mod.prompt.publish("stream test") + prompt_mod.send("stream test") time.sleep(2) coordinator.stop() From 9ead3fd350f7f10e987c2da4e090ed259cc284a1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:19:20 -0800 Subject: [PATCH 13/89] revert --- dimos/core/docker_runner.py | 2 +- dimos/visualization/rerun/bridge.py | 3 +++ dimos/visualization/rerun/constants.py | 17 ----------------- 3 files changed, 4 insertions(+), 18 deletions(-) delete mode 100644 dimos/visualization/rerun/constants.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c6a196b7a7..e1a583b285 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -28,7 +28,7 @@ from dimos.core.rpc_client import RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger -from dimos.visualization.rerun.constants import RERUN_GRPC_PORT, RERUN_WEB_PORT +from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT if TYPE_CHECKING: from collections.abc import Callable diff --git a/dimos/visualization/rerun/bridge.py b/dimos/visualization/rerun/bridge.py index 9cadbc617f..cc4b13ecb9 100644 --- a/dimos/visualization/rerun/bridge.py +++ b/dimos/visualization/rerun/bridge.py @@ -39,6 +39,9 @@ from dimos.protocol.pubsub.patterns import Glob, pattern_matches from dimos.utils.logging_config import setup_logger +RERUN_GRPC_PORT = 9876 +RERUN_WEB_PORT = 9090 + # TODO OUT visual annotations # # In the future it would be nice if modules can annotate their individual OUTs with (general or rerun specific) diff --git a/dimos/visualization/rerun/constants.py b/dimos/visualization/rerun/constants.py deleted file mode 100644 index e1c98176ad..0000000000 --- a/dimos/visualization/rerun/constants.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# isolated so that they can be imported into lightweight modules without importing all of rerun -RERUN_GRPC_PORT = 9876 -RERUN_WEB_PORT = 9090 From b98d5d0469ccf77e8f8e976fe9b4e816fce0c829 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 10:29:35 -0800 Subject: [PATCH 14/89] cleanup --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module.py | 3 +-- dimos/core/module_coordinator.py | 11 ++++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index e1a583b285..3f1b3031c7 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall +from dimos.core.rpc_client import RpcCall, ModuleProxy from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -183,7 +183,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule: +class DockerModule(ModuleProxy): """ Host-side handle for a module running inside Docker. 
diff --git a/dimos/core/module.py b/dimos/core/module.py index 72df61d4c7..24be321ee2 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -218,12 +218,11 @@ def inputs(self) -> dict[str, In]: # type: ignore[type-arg] @classproperty def rpcs(self) -> dict[str, Callable[..., Any]]: - _skip = {"rpcs", "blueprint", "module_info", "io"} return { name: getattr(self, name) for name in dir(self) if not name.startswith("_") - and name not in _skip + and name != "rpcs" # Exclude the rpcs property itself to prevent recursion and callable(getattr(self, name, None)) and hasattr(getattr(self, name), "__rpc__") } diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3d71e8776b..c2483bdd74 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -79,11 +79,12 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - module = ( - DockerModule(module_class, *args, **kwargs) # type: ignore[assignment] - if is_docker_module(module_class) - else self._client.deploy(module_class, *args, **kwargs) # type: ignore[union-attr, attr-defined, assignment] - ) + + deployed_module : ModuleProxy + if is_docker_module(module_class): + deployed_module = DockerModule(module_class, *args, **kwargs) + else: + deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = module # type: ignore[assignment] return module # type: ignore[return-value] From 2fed467ebee9d855a99493ceeed923388902456c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:01:11 -0800 Subject: [PATCH 15/89] fixup deploy_parallel --- dimos/core/module.py | 2 +- dimos/core/module_coordinator.py | 33 ++++++++++++++------------------ 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/dimos/core/module.py 
b/dimos/core/module.py index 24be321ee2..14aeea6da5 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -446,7 +446,7 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type @rpc def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: - """Configure stream transports in bulk by topic. Called by DockerModule for stream wiring. + """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring. Args: streams: mapping of stream_name -> topic diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index c2483bdd74..8698af55cf 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -94,16 +94,19 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - docker_specs = [ - (module_class, args, kwargs) for module_class, args, kwargs in module_specs if is_docker_module(module_class) - ] - worker_specs = [ - (module_class, args, kwargs) for module_class, args, kwargs in module_specs if not is_docker_module(module_class) - ] + # Separate docker modules from regular modules + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + for module_class, args, kwargs in module_specs: + if is_docker_module(module_class): + docker_specs.append(spec) + else: + worker_specs.append(spec) - docker_results: list[Any] = [] + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + docker_results = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: docker_results = list( @@ -111,17 +114,9 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), 
docker_specs ) ) - - # Reassemble in original order - worker_iter = iter(worker_results) - docker_iter = iter(docker_results) - results: list[Any] = [ - next(docker_iter) if is_docker_module(module_class) else next(worker_iter) - for module_class, _, _ in module_specs - ] - - for (module_class, _, _), module in zip(module_specs, results, strict=True): - self._deployed_modules[module_class] = module # type: ignore[assignment] + + for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + self._deployed_modules[module_class] = module return results # type: ignore[return-value] def start_all_modules(self) -> None: From 9b7696bc19dffca53cca0526fa0395f3e791ac12 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:08:13 -0800 Subject: [PATCH 16/89] clean up reconnect logic --- dimos/core/docker_runner.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3f1b3031c7..c7e40f0997 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,7 +140,7 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_restart(container_name: str) -> bool: +def _prompt_reconnect(container_name: str) -> bool: """Ask the user whether to restart a running container. Returns True to restart, False to reuse. @@ -152,7 +152,7 @@ def _prompt_restart(container_name: str) -> bool: logger.warning( f"Container '{container_name}' already running — restarting (non-interactive)." 
) - return True + return False print(f"\nContainer '{container_name}' is already running.") print(" [r] Restart — stop the existing container and start a fresh one") @@ -160,9 +160,9 @@ def _prompt_restart(container_name: str) -> bool: while True: choice = input("Choice [r/u]: ").strip().lower() if choice in ("r", "restart"): - return True - if choice in ("u", "use"): return False + if choice in ("u", "use"): + return True print("Please enter 'r' or 'u'.") @@ -258,12 +258,14 @@ def start(self) -> None: try: cfg = self.config + reconnect = False if _is_container_running(cfg, self._container_name): - restart = _prompt_restart(self._container_name) - if restart: + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(cfg, self._container_name) - + if not reconnect: + _remove_container(cfg, self._container_name) + cmd = self._build_docker_run_command() logger.info(f"Starting docker container: {self._container_name}") r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) From 3ec607089a5d739f2386712056872f928743ce3d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:20:53 -0800 Subject: [PATCH 17/89] fixup --- dimos/core/module_coordinator.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8698af55cf..9689a6119b 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -114,19 +114,17 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) + specs = worker_specs+docker_specs + results = worker_results+docker_results - for (module_class, _, _), module in zip(worker_specs+docker_specs, worker_results+docker_results, strict=True): + for (module_class, _, _), module in zip(specs, results, strict=True): self._deployed_modules[module_class] = module return results # 
type: ignore[return-value] def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) - if isinstance(self._client, WorkerManager): - with ThreadPoolExecutor(max_workers=max(len(modules), 1)) as executor: - list(executor.map(lambda m: m.start(), modules)) - else: - for module in modules: - module.start() + with ThreadPoolExecutor(max_workers=len(modules)) as executor: + list(executor.map(lambda m: m.start(), modules)) for module in modules: if hasattr(module, "on_system_modules"): From e06be8ec142e554e14ff1103b9bcc6e19619afc0 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:22:15 -0800 Subject: [PATCH 18/89] - --- dimos/core/module_coordinator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 9689a6119b..2d15734b30 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -85,8 +85,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) - self._deployed_modules[module_class] = module # type: ignore[assignment] - return module # type: ignore[return-value] + self._deployed_modules[module_class] = deployed_module + return deployed_module def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] From 06181edae2b61d2d1c6abc98953772c0907e4db5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 11:53:32 -0800 Subject: [PATCH 19/89] fix deployment/coordinator timeline --- dimos/core/docker_runner.py | 110 ++++++++++++++++++------------------ dimos/core/module.py | 22 -------- 2 files changed, 54 insertions(+), 78 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7e40f0997..fb3fc28af7 100644 --- a/dimos/core/docker_runner.py +++ 
b/dimos/core/docker_runner.py @@ -196,12 +196,12 @@ class DockerModule(ModuleProxy): config : DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - # Config + from dimos.core.docker_build import build_image, image_exists + config_class = getattr(module_class, "default_config", DockerModuleConfig) assert issubclass(config_class, DockerModuleConfig) config = config_class(**kwargs) - - # Module info + self._module_class = module_class self.config = config self._args = args @@ -212,13 +212,43 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non module_class, config ) - self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - self._deferred_transports: dict[str, str] = {} # stream_name -> topic + + # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + try: + if not image_exists(config): + logger.info(f"Building {config.docker_image}") + build_image(config) + + reconnect = False + if _is_container_running(config, self._container_name): + reconnect = _prompt_reconnect(self._container_name) + if not reconnect: + _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not reconnect: + _remove_container(config, self._container_name) + + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) + + self.rpc.start() + self._running = True + # docker run -d returns before Module.__init__ finishes in the container, + # so we poll until the RPC server is reachable before returning. 
+ self._wait_for_rpc() + except Exception: + with suppress(Exception): + self.stop() + raise @staticmethod def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: @@ -236,6 +266,11 @@ def get_rpc_method_names(self) -> list[str]: def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable + # Forward to container — Module.set_rpc_method unpickles the RpcCall + # and wires it with the container's own LCMRPC + self.rpc.call_sync( + f"{self.remote_name}/set_rpc_method", ([method, callable], {}) + ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -245,38 +280,8 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: return calls[0] if len(calls) == 1 else calls def start(self) -> None: - """Invoke the remote module's start() RPC. - - Called after stream transports are wired so the module can subscribe - to its streams with valid transports. 
- """ - from dimos.core.docker_build import build_image, image_exists - - if not image_exists(self.config): - logger.info(f"Building {self.config.docker_image}") - build_image(self.config) + """Invoke the remote module's start() RPC.""" try: - - cfg = self.config - reconnect = False - if _is_container_running(cfg, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - if not reconnect: - _remove_container(cfg, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - - self.rpc.start() - self._running = True - self._configure_streams(self._deferred_transports) self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) except Exception: with suppress(Exception): @@ -285,10 +290,11 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - # Signal remote module, stop RPC, unsubscribe handlers (ignore failures) + if not self._running: + return + with suppress(Exception): - if self._running: - self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: @@ -296,7 +302,6 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() - # Stop and remove container _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) _remove_container(self.config, self._container_name) self._running = False @@ -315,14 +320,11 @@ def tail_logs(self, n: int = 200) -> str: return _tail_logs(self.config, self._container_name, n=n) def set_transport(self, stream_name: str, transport: Any) -> 
bool: - """Defer stream transport config until start() when the container is running.""" - topic = getattr(transport, "topic", None) - if topic is None: - return False - if hasattr(topic, "topic"): - topic = topic.topic - self._deferred_transports[stream_name] = str(topic) - return True + """Forward to the container's Module.set_transport RPC.""" + result, _ = self.rpc.call_sync( + f"{self.remote_name}/set_transport", ([stream_name, transport], {}) + ) + return bool(result) def __getattr__(self, name: str) -> Any: if name in self.rpcs: @@ -480,12 +482,8 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # DimOS base image entrypoint already runs "dimos.core.docker_runner run" return ["--payload", json.dumps(payload, separators=(",", ":"))] - def _configure_streams(self, streams: dict[str, str]) -> None: - """Poll configure_streams RPC until the container's RPC server is up, then wire streams. - - Also serves as the liveness gate — the first successful call proves the - container is ready to accept RPCs. - """ + def _wait_for_rpc(self) -> None: + """Poll until the container's RPC server is reachable.""" cfg = self.config start_time = time.time() @@ -498,8 +496,8 @@ def _configure_streams(self, streams: dict[str, str]) -> None: try: self.rpc.call_sync( - f"{self.remote_name}/configure_streams", - ([streams], {}), + f"{self.remote_name}/get_rpc_method_names", + ([], {}), rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time diff --git a/dimos/core/module.py b/dimos/core/module.py index 14aeea6da5..af642b71bd 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -444,28 +444,6 @@ def set_transport(self, stream_name: str, transport: Transport) -> bool: # type stream._transport = transport return True - @rpc - def configure_streams(self, streams: dict[str, str]) -> dict[str, bool]: - """Configure stream transports in bulk by topic. NOTE: called before start, used by DockerModule for stream wiring. 
- - Args: - streams: mapping of stream_name -> topic - - Returns: - mapping of stream_name -> success - """ - from dimos.core.transport import pLCMTransport - - results: dict[str, bool] = {} - for stream_name, topic in streams.items(): - stream = getattr(self, stream_name, None) - if not isinstance(stream, (Out, In)): - results[stream_name] = False - else: - stream._transport = pLCMTransport(topic) - results[stream_name] = True - return results - # called from remote def connect_stream(self, input_name: str, remote_stream: RemoteOut[T]): # type: ignore[no-untyped-def] input_stream = getattr(self, input_name, None) From d87ab954912212cf71e0bdcde5366a011913ec30 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:32:36 -0800 Subject: [PATCH 20/89] fir enforcement of either dockerfile or image pull --- dimos/core/docker_runner.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index fb3fc28af7..c7b2528969 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -221,8 +221,17 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ try: if not image_exists(config): - logger.info(f"Building {config.docker_image}") - build_image(config) + if config.docker_file is not None: + logger.info(f"Building {config.docker_image}") + build_image(config) + else: + logger.info(f"Pulling {config.docker_image}") + r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image '{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) reconnect = False if _is_container_running(config, self._container_name): From b3d24ef364f8de94aa0666b4b94d544ffaddb17d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:24 -0800 
Subject: [PATCH 21/89] fix reconnect system --- dimos/core/docker_runner.py | 49 ++++++++++--------------------------- 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c7b2528969..8cca64ca16 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -140,31 +140,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") -def _prompt_reconnect(container_name: str) -> bool: - """Ask the user whether to restart a running container. - - Returns True to restart, False to reuse. - Falls back to restart when stdin is not a TTY (e.g. CI). - """ - import sys - - if not sys.stdin.isatty(): - logger.warning( - f"Container '{container_name}' already running — restarting (non-interactive)." - ) - return False - - print(f"\nContainer '{container_name}' is already running.") - print(" [r] Restart — stop the existing container and start a fresh one") - print(" [u] Use — attach to the existing container as-is") - while True: - choice = input("Choice [r/u]: ").strip().lower() - if choice in ("r", "restart"): - return False - if choice in ("u", "use"): - return True - print("Please enter 'r' or 'u'.") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" @@ -235,20 +210,22 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): - reconnect = _prompt_reconnect(self._container_name) - if not reconnect: + if config.docker_reconnect_container: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: + logger.info(f"Stopping existing container: {self._container_name}") _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + if not 
reconnect: _remove_container(config, self._container_name) - - cmd = self._build_docker_run_command() - logger.info(f"Starting docker container: {self._container_name}") - r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) - if r.returncode != 0: - raise RuntimeError( - f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) - + cmd = self._build_docker_run_command() + logger.info(f"Starting docker container: {self._container_name}") + r = _run(cmd, timeout=DOCKER_RUN_TIMEOUT) + if r.returncode != 0: + raise RuntimeError( + f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + ) self.rpc.start() self._running = True # docker run -d returns before Module.__init__ finishes in the container, From 83cb7c7422dbcb14168a4fd3be444beca6a6ef98 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:33:42 -0800 Subject: [PATCH 22/89] - --- dimos/core/docker_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 8cca64ca16..15677a0e03 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -98,6 +98,9 @@ class DockerModuleConfig(ModuleConfig): docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 + # Reconnect to a running container instead of restarting it + docker_reconnect_container: bool = False + # Advanced docker_bin: str = "docker" From d62396bda3844a05f2ae66b8fd081bfb08f7176c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:14 -0800 Subject: [PATCH 23/89] fix deploy_parallel --- dimos/core/module_coordinator.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 2d15734b30..59e1e5a657 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -94,19 +94,11 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - # Separate docker 
modules from regular modules - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - spec_indices: list[tuple[str, int]] = [] # ("docker"|"worker", index_in_sublist) - - for module_class, args, kwargs in module_specs: - if is_docker_module(module_class): - docker_specs.append(spec) - else: - worker_specs.append(spec) + docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] + worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - docker_results = [] + docker_results: list[Any] = [] if docker_specs: with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: docker_results = list( @@ -114,12 +106,13 @@ def deploy_parallel( lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs ) ) - specs = worker_specs+docker_specs - results = worker_results+docker_results - - for (module_class, _, _), module in zip(specs, results, strict=True): + + results = worker_results + docker_results + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=True + ): self._deployed_modules[module_class] = module - return results # type: ignore[return-value] + return results def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) From 20aa4f1e0c0964a2ad6493015e62cb857aa62367 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:34:38 -0800 Subject: [PATCH 24/89] better error --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1e5a657..3dda7c38b0 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -116,6 +116,8 @@ def deploy_parallel( def start_all_modules(self) -> None: modules = 
list(self._deployed_modules.values()) + if not modules: + raise ValueError("No modules deployed. Call deploy() before start_all_modules().") with ThreadPoolExecutor(max_workers=len(modules)) as executor: list(executor.map(lambda m: m.start(), modules)) From 34598539176d7f763825aa7febeeca027b833de5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:36:59 -0800 Subject: [PATCH 25/89] clean container name generation --- dimos/core/docker_runner.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 15677a0e03..d11a68e2a1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -186,9 +186,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - self._container_name = config.docker_container_name or self._default_container_name( - module_class, config - ) + # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" + image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_base}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -239,16 +239,6 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.stop() raise - @staticmethod - def _default_container_name(module_class: type[Module], config: DockerModuleConfig) -> str: - import hashlib - - name = module_class.__name__.lower() - path_hash = hashlib.sha256( - str(config.docker_file.resolve()).encode() # type: ignore[union-attr] - ).hexdigest()[:12] - return f"dimos_{name}_{path_hash}" - def get_rpc_method_names(self) -> list[str]: return self.rpc_calls From 5538b6517e6808e06596aa761a4ab457476ed66c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:43:44 -0800 Subject: [PATCH 26/89] 
fixup typing for ModuleProxy --- dimos/core/docker_runner.py | 24 +++++++++++++----------- dimos/core/module_coordinator.py | 10 +++++----- dimos/core/rpc_client.py | 15 +++++++++++++-- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index d11a68e2a1..74e7c840c8 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import RpcCall, ModuleProxy +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -161,7 +161,7 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: # Host-side Docker-backed Module handle -class DockerModule(ModuleProxy): +class DockerModule(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -171,13 +171,17 @@ class DockerModule(ModuleProxy): Communication: All RPC happens via LCM multicast (requires --network=host). 
""" - config : DockerModuleConfig + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: from dimos.core.docker_build import build_image, image_exists config_class = getattr(module_class, "default_config", DockerModuleConfig) - assert issubclass(config_class, DockerModuleConfig) + if not issubclass(config_class, DockerModuleConfig): + raise TypeError( + f"{module_class.__name__}.default_config must be a DockerModuleConfig subclass, " + f"got {config_class.__name__}" + ) config = config_class(**kwargs) self._module_class = module_class @@ -196,7 +200,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - # Build image, launch container, wait for RPC server — mirrors worker Module.__init__ + # Build or pull image, launch container, wait for RPC server try: if not image_exists(config): if config.docker_file is not None: @@ -269,9 +273,6 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: - return - with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): @@ -280,9 +281,10 @@ def stop(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) - _remove_container(self.config, self._container_name) + with suppress(Exception): + _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False logger.info(f"Stopped container: {self._container_name}") diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3dda7c38b0..5534d9f9a7 100644 --- a/dimos/core/module_coordinator.py +++ 
b/dimos/core/module_coordinator.py @@ -27,7 +27,7 @@ if TYPE_CHECKING: from dimos.core.module import Module, ModuleT from dimos.core.resource_monitor.monitor import StatsMonitor - from dimos.core.rpc_client import ModuleProxy + from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = setup_logger() @@ -37,7 +37,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _global_config: GlobalConfig _n: int | None = None _memory_limit: str = "auto" - _deployed_modules: dict[type[Module], ModuleProxy] + _deployed_modules: dict[type[Module], ModuleProxyProtocol] _stats_monitor: StatsMonitor | None = None def __init__( @@ -79,14 +79,14 @@ def stop(self) -> None: def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") - - deployed_module : ModuleProxy + + deployed_module: ModuleProxyProtocol if is_docker_module(module_class): deployed_module = DockerModule(module_class, *args, **kwargs) else: deployed_module = self._client.deploy(module_class, *args, **kwargs) self._deployed_modules[module_class] = deployed_module - return deployed_module + return deployed_module # type: ignore[return-value] def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index e46124469c..a89c54caf0 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -13,7 +13,7 @@ # limitations under the License. 
from collections.abc import Callable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy @@ -80,7 +80,18 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] self._stop_rpc_client = None -class RPCClient: +class ModuleProxyProtocol(Protocol): + """Protocol for host-side handles to remote modules (worker or Docker).""" + + def start(self) -> None: ... + def stop(self) -> None: ... + def set_transport(self, stream_name: str, transport: Any) -> bool: ... + def get_rpc_method_names(self) -> list[str]: ... + def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... + def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... + + +class RPCClient(ModuleProxyProtocol): def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class From 52146f21fb94953cb0f086c276bd673185fed2f3 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 12:53:53 -0800 Subject: [PATCH 27/89] misc --- dimos/core/docker_runner.py | 6 ++--- dimos/core/module_coordinator.py | 28 ++++++++++++---------- dimos/core/tests/test_docker_deployment.py | 2 +- pyproject.toml | 4 +++- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 74e7c840c8..1a0fc718ae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -190,9 +190,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo" - image_base = config.docker_image.rsplit(":", 1)[0].rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_base}" + # Derive container name from image name: 
"my-registry/foo:v2" → "dimos_foo_v2" + image_ref = config.docker_image.rsplit("/", 1)[-1] + self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 5534d9f9a7..90538cfc0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,21 +97,25 @@ def deploy_parallel( docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + worker_results: list[Any] = [] docker_results: list[Any] = [] - if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + try: + worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] + if docker_specs: + with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: + docker_results = list( + executor.map( + lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs + ) ) - ) + finally: + results = worker_results + docker_results + # Register whatever succeeded so stop() can clean them up + for (module_class, _, _), module in zip( + worker_specs + docker_specs, results, strict=False + ): + self._deployed_modules[module_class] = module - results = worker_results + docker_results - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=True - ): - self._deployed_modules[module_class] = module return results def start_all_modules(self) -> None: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 7a02682fda..e6ddbc4a73 100644 --- 
a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are in original order + # Results are worker-first, then docker assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/pyproject.toml b/pyproject.toml index dcd2a5d987..31d3322453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -321,10 +321,12 @@ docker = [ "sortedcontainers", "PyTurboJPEG", "rerun-sdk", - "langchain-core", "typing_extensions", "open3d-unofficial-arm; platform_system == 'Linux' and platform_machine == 'aarch64'", "open3d>=0.18.0; platform_system != 'Linux' or platform_machine != 'aarch64'", + # these below should be removed later, right now they are needed even for running `dimos --help` (seperate non-docker issue) + "langchain-core", + "matplotlib", ] base = [ From 3cf2dff187037a59f46d1b507fb53d3a93a2024e Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 13:48:44 -0800 Subject: [PATCH 28/89] testing fixup --- dimos/core/docker_runner.py | 25 +++++++++++++++------- dimos/core/module_coordinator.py | 5 ++++- dimos/core/rpc_client.py | 2 +- dimos/core/test_core.py | 2 +- dimos/core/tests/test_docker_deployment.py | 8 +++---- uv.lock | 2 ++ 6 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 1a0fc718ae..7ce89c40e6 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -143,7 +143,6 @@ def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> s return out + ("\n" + err if err else "") - def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" out: dict[str, Any] = {} @@ -171,6 +170,7 @@ class 
DockerModule(ModuleProxyProtocol): Communication: All RPC happens via LCM multicast (requires --network=host). """ + config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: @@ -192,7 +192,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.remote_name = module_class.__name__ # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] - self._container_name = config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + self._container_name = ( + config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + ) self.rpc = LCMRPC() self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] @@ -208,7 +210,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) else: logger.info(f"Pulling {config.docker_image}") - r = _run([_docker_bin(config), "pull", config.docker_image], timeout=DOCKER_RUN_TIMEOUT) + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=DOCKER_RUN_TIMEOUT, + ) if r.returncode != 0: raise RuntimeError( f"Failed to pull image '{config.docker_image}'.\n" @@ -222,7 +227,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = True else: logger.info(f"Stopping existing container: {self._container_name}") - _run([_docker_bin(config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) if not reconnect: _remove_container(config, self._container_name) @@ -251,9 +259,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: self._bound_rpc_calls[method] = callable # Forward to container — Module.set_rpc_method unpickles the RpcCall # and wires it with the container's own LCMRPC - self.rpc.call_sync( - f"{self.remote_name}/set_rpc_method", 
([method, callable], {}) - ) + self.rpc.call_sync(f"{self.remote_name}/set_rpc_method", ([method, callable], {})) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -282,7 +288,10 @@ def stop(self) -> None: unsub() self._unsub_fns.clear() with suppress(Exception): - _run([_docker_bin(self.config), "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT) + _run( + [_docker_bin(self.config), "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) with suppress(Exception): _remove_container(self.config, self._container_name) self._running = False diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 90538cfc0a..3e8ff31018 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,7 +18,6 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_runner import DockerModule, is_docker_module from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -77,6 +76,8 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") @@ -91,6 +92,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: + from dimos.core.docker_runner import DockerModule, is_docker_module + if not self._client: raise ValueError("Not started") diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index a89c54caf0..c9e73ac54e 100644 --- a/dimos/core/rpc_client.py +++ 
b/dimos/core/rpc_client.py @@ -91,7 +91,7 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: ... def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... -class RPCClient(ModuleProxyProtocol): +class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class diff --git a/dimos/core/test_core.py b/dimos/core/test_core.py index 197539ef67..30f14c93b4 100644 --- a/dimos/core/test_core.py +++ b/dimos/core/test_core.py @@ -80,7 +80,7 @@ def test_classmethods() -> None: # Check that we have the expected RPC methods assert "navigate_to" in class_rpcs, "navigate_to should be in rpcs" assert "start" in class_rpcs, "start should be in rpcs" - assert len(class_rpcs) == 9 + assert len(class_rpcs) == 8 # Check that the values are callable assert callable(class_rpcs["navigate_to"]), "navigate_to should be callable" diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e6ddbc4a73..f60f37a21a 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -78,7 +78,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -103,7 +103,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -139,7 +139,7 @@ def 
test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( self, mock_worker_manager_cls, mock_docker_module_cls @@ -175,7 +175,7 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator.stop() - @patch("dimos.core.module_coordinator.DockerModule") + @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() diff --git a/uv.lock b/uv.lock index 084e157ee5..820bb92f2d 100644 --- a/uv.lock +++ b/uv.lock @@ -1852,6 +1852,7 @@ docker = [ { name = "dimos-lcm" }, { name = "langchain-core" }, { name = "lcm" }, + { name = "matplotlib" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "open3d", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, @@ -2022,6 +2023,7 @@ requires-dist = [ { name = "lcm", marker = "extra == 'docker'" }, { name = "llvmlite", specifier = ">=0.42.0" }, { name = "lxml-stubs", marker = "extra == 'dev'", specifier = ">=0.5.1,<1" }, + { name = "matplotlib", marker = "extra == 'docker'" }, { name = "matplotlib", marker = "extra == 'manipulation'", specifier = ">=3.7.1" }, { name = "md-babel-py", marker = "extra == 'dev'", specifier = "==1.1.1" }, { name = "moondream", marker = "extra == 'perception'" }, From 8eaed575ede4a356f3f9507b742f3e214f09a687 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 15:54:12 -0800 Subject: [PATCH 29/89] maintain order --- 
dimos/core/docker_runner.py | 50 ++++++++++++++++++++++++++++---- dimos/core/module_coordinator.py | 29 +++++++++++++----- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 7ce89c40e6..776cef516d 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,6 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution +DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -136,6 +137,31 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" +def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: + """Return the container's start time as a Unix timestamp, or None on failure.""" + r = _run( + [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], + timeout=DOCKER_STATUS_TIMEOUT, + ) + if r.returncode != 0: + return None + from datetime import datetime + + try: + # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" + raw = r.stdout.strip() + # Truncate nanoseconds to microseconds for fromisoformat compatibility + if "." 
in raw: + base, frac = raw.split(".", 1) + frac = frac.rstrip("Z")[:6] + raw = f"{base}.{frac}+00:00" + else: + raw = raw.rstrip("Z") + "+00:00" + return datetime.fromisoformat(raw).timestamp() + except (ValueError, OSError): + return None + + def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -190,10 +216,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._kwargs = kwargs self._running = False self.remote_name = module_class.__name__ - # Derive container name from image name: "my-registry/foo:v2" → "dimos_foo_v2" + # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] self._container_name = ( - config.docker_container_name or f"dimos_{image_ref.replace(':', '_')}" + config.docker_container_name + or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) self.rpc = LCMRPC() @@ -212,7 +239,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_RUN_TIMEOUT, + timeout=DOCKER_PULL_TIMEOUT, ) if r.returncode != 0: raise RuntimeError( @@ -223,9 +250,18 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - else: + # Verify the container hasn't restarted since we last ran + container_start = _container_started_at(config, self._container_name) + process_start = time.time() # conservative: current time as upper bound + if container_start is not None and container_start > process_start 
- 5: + logger.warning( + f"Container {self._container_name} appears to have restarted recently " + f"(started at {container_start:.0f}). Treating as fresh start." + ) + else: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + if not reconnect: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -279,6 +315,8 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" + if not self._running: + return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 3e8ff31018..01f657dd1a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -97,8 +97,19 @@ def deploy_parallel( if not self._client: raise ValueError("Not started") - docker_specs = [spec for spec in module_specs if is_docker_module(spec[0])] - worker_specs = [spec for spec in module_specs if not is_docker_module(spec[0])] + # Split by type, tracking original indices for reassembly + docker_indices: list[int] = [] + worker_indices: list[int] = [] + docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + # the i is needed for maintaining order on the returned output + for i, spec in enumerate(module_specs): + if is_docker_module(spec[0]): + docker_indices.append(i) + docker_specs.append(spec) + else: + worker_indices.append(i) + worker_specs.append(spec) worker_results: list[Any] = [] docker_results: list[Any] = [] @@ -112,12 +123,16 @@ def deploy_parallel( ) ) finally: - results = worker_results + docker_results + # Reassemble results in original input order + results: list[Any] = [None] * len(module_specs) + for idx, mod in zip(worker_indices, 
worker_results, strict=False): + results[idx] = mod + for idx, mod in zip(docker_indices, docker_results, strict=False): + results[idx] = mod # Register whatever succeeded so stop() can clean them up - for (module_class, _, _), module in zip( - worker_specs + docker_specs, results, strict=False - ): - self._deployed_modules[module_class] = module + for spec, module in zip(module_specs, results, strict=False): + if module is not None: + self._deployed_modules[spec[0]] = module return results From 55c234df7d4351c1e737f97396d7b4a96b0b211b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:12:44 -0800 Subject: [PATCH 30/89] refine --- dimos/core/docker_runner.py | 46 ++++----------------- dimos/core/tests/test_docker_deployment.py | 2 +- examples/docker_hello_world/hello_docker.py | 1 + 3 files changed, 10 insertions(+), 39 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 776cef516d..aacdbe7c19 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -137,31 +137,6 @@ def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: return r.returncode == 0 and r.stdout.strip() == "true" -def _container_started_at(cfg: DockerModuleConfig, name: str) -> float | None: - """Return the container's start time as a Unix timestamp, or None on failure.""" - r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.StartedAt}}", name], - timeout=DOCKER_STATUS_TIMEOUT, - ) - if r.returncode != 0: - return None - from datetime import datetime - - try: - # Docker returns RFC 3339 with nanoseconds, e.g. "2024-01-02T03:04:05.123456789Z" - raw = r.stdout.strip() - # Truncate nanoseconds to microseconds for fromisoformat compatibility - if "." 
in raw: - base, frac = raw.split(".", 1) - frac = frac.rstrip("Z")[:6] - raw = f"{base}.{frac}+00:00" - else: - raw = raw.rstrip("Z") + "+00:00" - return datetime.fromisoformat(raw).timestamp() - except (ValueError, OSError): - return None - - def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() @@ -250,18 +225,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non reconnect = False if _is_container_running(config, self._container_name): if config.docker_reconnect_container: - # Verify the container hasn't restarted since we last ran - container_start = _container_started_at(config, self._container_name) - process_start = time.time() # conservative: current time as upper bound - if container_start is not None and container_start > process_start - 5: - logger.warning( - f"Container {self._container_name} appears to have restarted recently " - f"(started at {container_start:.0f}). Treating as fresh start." - ) - else: - logger.info(f"Reconnecting to running container: {self._container_name}") - reconnect = True - if not reconnect: + logger.info(f"Reconnecting to running container: {self._container_name}") + reconnect = True + else: logger.info(f"Stopping existing container: {self._container_name}") _run( [_docker_bin(config), "stop", self._container_name], @@ -284,7 +250,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._wait_for_rpc() except Exception: with suppress(Exception): - self.stop() + self._cleanup() raise def get_rpc_method_names(self) -> list[str]: @@ -319,6 +285,10 @@ def stop(self) -> None: return with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) + self._cleanup() + + def _cleanup(self) -> None: + """Release all resources. 
Safe to call multiple times or from partial init.""" with suppress(Exception): self.rpc.stop() for unsub in self._unsub_fns: diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index f60f37a21a..95db171e1c 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -169,7 +169,7 @@ def test_deploy_parallel_separates_docker_and_regular( # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() - # Results are worker-first, then docker + # Results preserve input order assert results[0] is regular_proxy assert results[1] is mock_dm diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 187384854e..eb4765a629 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -76,6 +76,7 @@ def _cowsay(self, text: str) -> str: ["/usr/games/cowsay", text], capture_output=True, text=True, + check=True, ) return result.stdout From d9d4716ec159f286dc86a00e3937cb005f091093 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:37:59 -0800 Subject: [PATCH 31/89] make pull out configurable --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index aacdbe7c19..89fa9d9af3 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT = 600 # Timeout for `docker pull` (large images over slow connections) +DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command 
(graceful shutdown) @@ -95,7 +95,8 @@ class DockerModuleConfig(ModuleConfig): docker_command: list[str] | None = None docker_extra_args: list[str] = field(default_factory=list) - # Startup readiness + # Timeouts + docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 @@ -214,7 +215,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non logger.info(f"Pulling {config.docker_image}") r = _run( [_docker_bin(config), "pull", config.docker_image], - timeout=DOCKER_PULL_TIMEOUT, + timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( From 215e9ba7aa732d8ac974962b675164fc98496dd2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 16:38:25 -0800 Subject: [PATCH 32/89] have example show using normal config --- examples/docker_hello_world/hello_docker.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index eb4765a629..66e95df316 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -56,6 +56,9 @@ class HelloDockerConfig(DockerModuleConfig): docker_restart_policy: str = "no" docker_env: dict[str, str] = field(default_factory=lambda: {"CI": "1"}) + # Custom (non-docker) config field — passed to the container via JSON + greeting_prefix: str = "Hello" + class HelloDockerModule(Module["HelloDockerConfig"]): """A trivial module that runs inside Docker and echoes greetings.""" @@ -88,7 +91,13 @@ def _on_prompt(self, text: str) -> None: @rpc def greet(self, name: str) -> str: """RPC method that can be called directly.""" - return self._cowsay(f"Hello, {name}!") + prefix = self.config.greeting_prefix + return self._cowsay(f"{prefix}, {name}!") + + @rpc + def get_greeting_prefix(self) -> str: + """Return the config value to verify it was passed to the 
container.""" + return self.config.greeting_prefix # --------------------------------------------------------------------------- @@ -125,14 +134,19 @@ def _on_greeting(self, text: str) -> None: coordinator = autoconnect( PromptModule.blueprint(), - HelloDockerModule.blueprint(), + HelloDockerModule.blueprint(greeting_prefix="Howdy"), ).build() # Get module proxies prompt_mod = coordinator.get_instance(PromptModule) docker_mod = coordinator.get_instance(HelloDockerModule) - # Test RPC + # Test that custom config was passed to the container + prefix = docker_mod.get_greeting_prefix() + assert prefix == "Howdy", f"Expected 'Howdy', got {prefix!r}" + print(f"Config passed to container: greeting_prefix={prefix!r}") + + # Test RPC (should use the custom prefix) print(docker_mod.greet("World")) # Test stream From 1f8ab0a31ef2da34012b41eecbd8eee323b5c3fc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 17:55:27 -0800 Subject: [PATCH 33/89] Add DockerWorkerManager --- dimos/core/docker_runner.py | 7 ++- dimos/core/docker_worker_manager.py | 59 ++++++++++++++++++++++ dimos/core/module_coordinator.py | 13 ++--- dimos/core/tests/test_docker_deployment.py | 10 ++-- 4 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 dimos/core/docker_worker_manager.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 89fa9d9af3..26d822ce73 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -110,7 +110,11 @@ class DockerModuleConfig(ModuleConfig): def is_docker_module(module_class: type) -> bool: """Check if a module class should run in Docker based on its default_config.""" default_config = getattr(module_class, "default_config", None) - return default_config is not None and issubclass(default_config, DockerModuleConfig) + return ( + default_config is not None + and isinstance(default_config, type) + and issubclass(default_config, DockerModuleConfig) + ) # Docker helpers @@ -284,6 +288,7 @@ def stop(self) -> 
None: """Gracefully stop the Docker container and clean up resources.""" if not self._running: return + self._running = False # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py new file mode 100644 index 0000000000..52317d984b --- /dev/null +++ b/dimos/core/docker_worker_manager.py @@ -0,0 +1,59 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from dimos.core.docker_runner import DockerModule + from dimos.core.module import Module + + +class DockerWorkerManager: + """Parallel deployment of Docker-backed modules.""" + + @staticmethod + def deploy_parallel( + specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + ) -> list[DockerModule]: + """Deploy multiple DockerModules in parallel, collecting partial results on failure. + + Returns all successfully-created DockerModules. If any deployment fails, + the successful ones are still returned (so the caller can register them + for cleanup), and the first exception is re-raised. 
+ """ + from dimos.core.docker_runner import DockerModule + + results: dict[int, DockerModule] = {} + first_exc: Exception | None = None + + with ThreadPoolExecutor(max_workers=len(specs)) as executor: + futures: dict[Future[DockerModule], int] = { + executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i + for i, spec in enumerate(specs) + } + for fut in as_completed(futures): + idx = futures[fut] + try: + results[idx] = fut.result() + except Exception as e: + if first_exc is None: + first_exc = e + + # Return in input order (missing indices = failed deployments) + ordered = [results[i] for i in sorted(results)] + if first_exc is not None: + raise first_exc + return ordered diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 01f657dd1a..4ede195571 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,6 +18,7 @@ import threading from typing import TYPE_CHECKING, Any +from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager @@ -76,6 +77,7 @@ def stop(self) -> None: self._client.close_all() # type: ignore[union-attr] def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator from dimos.core.docker_runner import DockerModule, is_docker_module if not self._client: @@ -92,7 +94,8 @@ def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: def deploy_parallel( self, module_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] ) -> list[ModuleProxy]: - from dimos.core.docker_runner import DockerModule, is_docker_module + # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → 
module_coordinator + from dimos.core.docker_runner import is_docker_module if not self._client: raise ValueError("Not started") @@ -102,7 +105,6 @@ def deploy_parallel( worker_indices: list[int] = [] docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] - # the i is needed for maintaining order on the returned output for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): docker_indices.append(i) @@ -116,12 +118,7 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] if docker_specs: - with ThreadPoolExecutor(max_workers=len(docker_specs)) as executor: - docker_results = list( - executor.map( - lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), docker_specs - ) - ) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 95db171e1c..17d1290916 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -139,10 +139,10 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_parallel_separates_docker_and_regular( - self, mock_worker_manager_cls, mock_docker_module_cls + self, mock_worker_manager_cls, mock_docker_deploy ): mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr @@ -151,7 +151,7 @@ def test_deploy_parallel_separates_docker_and_regular( mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] mock_dm = MagicMock() - 
mock_docker_module_cls.return_value = mock_dm + mock_docker_deploy.return_value = [mock_dm] coordinator = ModuleCoordinator() coordinator.start() @@ -164,8 +164,8 @@ def test_deploy_parallel_separates_docker_and_regular( # Regular module goes through worker manager mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker module gets its own DockerModule - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + # Docker specs go through DockerWorkerManager + mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() From 4536ce12c1efffd47004cb57443c7a6f9cfda65a Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:44:22 -0800 Subject: [PATCH 34/89] add proper cleanup handling if a module fails to deploy correctly --- dimos/core/docker_worker_manager.py | 43 ++-- dimos/core/module_coordinator.py | 11 +- .../tests/test_parallel_deploy_cleanup.py | 219 ++++++++++++++++++ dimos/core/worker_manager.py | 30 ++- dimos/utils/safe_thread_map.py | 92 ++++++++ 5 files changed, 350 insertions(+), 45 deletions(-) create mode 100644 dimos/core/tests/test_parallel_deploy_cleanup.py create mode 100644 dimos/utils/safe_thread_map.py diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 52317d984b..b70ff3ba52 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -13,9 +13,11 @@ # limitations under the License. 
from __future__ import annotations -from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.utils.safe_thread_map import safe_thread_map + if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule from dimos.core.module import Module @@ -28,32 +30,21 @@ class DockerWorkerManager: def deploy_parallel( specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], ) -> list[DockerModule]: - """Deploy multiple DockerModules in parallel, collecting partial results on failure. + """Deploy multiple DockerModules in parallel. - Returns all successfully-created DockerModules. If any deployment fails, - the successful ones are still returned (so the caller can register them - for cleanup), and the first exception is re-raised. + If any deployment fails, all successfully-started containers are + stopped before an ExceptionGroup is raised. """ from dimos.core.docker_runner import DockerModule - results: dict[int, DockerModule] = {} - first_exc: Exception | None = None - - with ThreadPoolExecutor(max_workers=len(specs)) as executor: - futures: dict[Future[DockerModule], int] = { - executor.submit(lambda s=spec: DockerModule(s[0], *s[1], **s[2])): i - for i, spec in enumerate(specs) - } - for fut in as_completed(futures): - idx = futures[fut] - try: - results[idx] = fut.result() - except Exception as e: - if first_exc is None: - first_exc = e - - # Return in input order (missing indices = failed deployments) - ordered = [results[i] for i in sorted(results)] - if first_exc is not None: - raise first_exc - return ordered + def _on_errors( + _outcomes: list, successes: list[DockerModule], errors: list[Exception] + ) -> None: + for mod in successes: + with suppress(Exception): + mod.stop() + raise ExceptionGroup("docker deploy_parallel failed", errors) + + return safe_thread_map( + specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + ) diff --git 
a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 4ede195571..48546c5568 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,12 +113,9 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - worker_results: list[Any] = [] - docker_results: list[Any] = [] try: - worker_results = self._client.deploy_parallel(worker_specs) if worker_specs else [] - if docker_specs: - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + worker_results = self._client.deploy_parallel(worker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) @@ -127,9 +124,9 @@ def deploy_parallel( for idx, mod in zip(docker_indices, docker_results, strict=False): results[idx] = mod # Register whatever succeeded so stop() can clean them up - for spec, module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: - self._deployed_modules[spec[0]] = module + self._deployed_modules[module_class] = module return results diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py new file mode 100644 index 0000000000..1987fa4be7 --- /dev/null +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -0,0 +1,219 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Tests that deploy_parallel cleans up successfully-started modules when a +sibling deployment fails ("middle module throws" scenario). +""" + +from __future__ import annotations + +import threading +from unittest.mock import MagicMock, patch + +import pytest + + +class TestDockerWorkerManagerPartialFailure: + """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" + + @patch("dimos.core.docker_runner.DockerModule") + def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): + """Deploy 3 modules where the middle one fails. The other two must be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_c = MagicMock(name="ModuleC") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed to start") + return mod_a if label == "A" else mod_c + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 1 + assert "B failed to start" in str(exc_info.value.exceptions[0]) + + # Both successful modules must have been stopped exactly once + mod_a.stop.assert_called_once() + mod_c.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): + """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + + barrier = threading.Barrier(3, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + label = cls.__name__ + barrier.wait() + if label == "B": + raise RuntimeError("B failed") + if label == "C": + raise ValueError("C failed") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: + DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(exc_info.value.exceptions) == 2 + messages = {str(e) for e in exc_info.value.exceptions} + assert "B failed" in messages + assert "C failed" in messages + + # The one successful module must have been stopped + mod_a.stop.assert_called_once() + + @patch("dimos.core.docker_runner.DockerModule") + def test_all_succeed_no_stops(self, mock_docker_module_cls): + """When all deployments succeed, no modules should be stopped.""" + from dimos.core.docker_worker_manager import DockerWorkerManager + + mocks = [MagicMock(name=f"Mod{i}") for i in range(3)] + + def fake_constructor(cls, *args, **kwargs): + return mocks[["A", "B", "C"].index(cls.__name__)] + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + results = DockerWorkerManager.deploy_parallel( + [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + assert len(results) == 3 + for m in mocks: + m.stop.assert_not_called() + + @patch("dimos.core.docker_runner.DockerModule") + def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): + """If stop() itself raises during cleanup, the original deploy error still propagates.""" + from 
dimos.core.docker_worker_manager import DockerWorkerManager + + mod_a = MagicMock(name="ModuleA") + mod_a.stop.side_effect = OSError("stop failed") + + barrier = threading.Barrier(2, timeout=5) + + def fake_constructor(cls, *args, **kwargs): + barrier.wait() + if cls.__name__ == "B": + raise RuntimeError("B exploded") + return mod_a + + mock_docker_module_cls.side_effect = fake_constructor + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + + with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed"): + DockerWorkerManager.deploy_parallel([(FakeA, (), {}), (FakeB, (), {})]) + + # stop was attempted despite it raising + mod_a.stop.assert_called_once() + + +class TestWorkerManagerPartialFailure: + """WorkerManager.deploy_parallel must clean up successful RPCClients when one fails.""" + + def test_middle_module_fails_cleans_up_siblings(self): + from dimos.core.worker_manager import WorkerManager + + manager = WorkerManager(n_workers=2) + + mock_workers = [MagicMock(name=f"Worker{i}") for i in range(2)] + for w in mock_workers: + w.module_count = 0 + w.reserve_slot = MagicMock( + side_effect=lambda w=w: setattr(w, "module_count", w.module_count + 1) + ) + + manager._workers = mock_workers + manager._started = True + + def fake_deploy_module(module_class, args=(), kwargs=None): + if module_class.__name__ == "B": + raise RuntimeError("B failed to deploy") + return MagicMock(name=f"actor_{module_class.__name__}") + + for w in mock_workers: + w.deploy_module = fake_deploy_module + + FakeA = type("A", (), {}) + FakeB = type("B", (), {}) + FakeC = type("C", (), {}) + + rpc_clients_created: list[MagicMock] = [] + + with patch("dimos.core.worker_manager.RPCClient") as mock_rpc_cls: + + def make_rpc(actor, cls): + client = MagicMock(name=f"rpc_{cls.__name__}") + rpc_clients_created.append(client) + return client + + mock_rpc_cls.side_effect = make_rpc + + with pytest.raises(ExceptionGroup, match="worker deploy_parallel failed"): + manager.deploy_parallel( 
+ [ + (FakeA, (), {}), + (FakeB, (), {}), + (FakeC, (), {}), + ] + ) + + # Every successfully-created RPC client must have been cleaned up exactly once + for client in rpc_clients_created: + client.stop_rpc_client.assert_called_once() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 4dbb51eb54..25a052590c 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -14,12 +14,13 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor +from contextlib import suppress from typing import TYPE_CHECKING, Any from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT @@ -65,6 +66,9 @@ def deploy_parallel( if self._closed: raise RuntimeError("WorkerManager is closed") + if len(module_specs) == 0: + return [] + # Auto-start for backward compatibility if not self._started: self.start() @@ -78,17 +82,19 @@ def deploy_parallel( worker.reserve_slot() assignments.append((worker, module_class, args, kwargs)) - def _deploy( - item: tuple[Worker, type[ModuleT], tuple[Any, ...], dict[Any, Any]], - ) -> RPCClient: - worker, module_class, args, kwargs = item - actor = worker.deploy_module(module_class, args=args, kwargs=kwargs) - return RPCClient(actor, module_class) - - with ThreadPoolExecutor(max_workers=len(assignments)) as pool: - results = list(pool.map(_deploy, assignments)) - - return results + def _on_errors( + _outcomes: list, successes: list[RPCClient], errors: list[Exception] + ) -> None: + for rpc_client in successes: + with suppress(Exception): + rpc_client.stop_rpc_client() + raise ExceptionGroup("worker deploy_parallel failed", errors) + + return safe_thread_map( + assignments, + lambda item: RPCClient(item[0].deploy_module(item[1], item[2], item[3]), item[1]), + _on_errors, + ) @property def 
workers(self) -> list[Worker]: diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py new file mode 100644 index 0000000000..f051b0d950 --- /dev/null +++ b/dimos/utils/safe_thread_map.py @@ -0,0 +1,92 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from concurrent.futures import Future, ThreadPoolExecutor, as_completed +from typing import TYPE_CHECKING, Any, TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable, Sequence + +T = TypeVar("T") +R = TypeVar("R") + + +def safe_thread_map( + items: Sequence[T], + fn: Callable[[T], R], + on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] + | None = None, +) -> list[R]: + """Thread-pool map that waits for all items to finish before raising and a cleanup handler + + - Empty *items* → returns ``[]`` immediately. + - All succeed → returns results in input order. + - Any fail → calls ``on_errors(outcomes, successes, errors)`` where + *outcomes* is a list of ``(input, result_or_exception)`` pairs in input + order, *successes* is the list of successful results, and *errors* is + the list of exceptions. If *on_errors* raises, that exception propagates. + If *on_errors* returns normally, its return value is returned from + ``safe_thread_map``. If *on_errors* is ``None``, raises an + ``ExceptionGroup``. 
+ + Example:: + + def start_service(name: str) -> Connection: + return connect(name) + + def cleanup( + outcomes: list[tuple[str, Connection | Exception]], + successes: list[Connection], + errors: list[Exception], + ) -> None: + for conn in successes: + conn.close() + raise ExceptionGroup("failed to start services", errors) + + connections = safe_thread_map( + ["db", "cache", "queue"], + start_service, + cleanup, # called only if any start_service() raises + ) + """ + if not items: + return [] + + outcomes: dict[int, R | Exception] = {} + + with ThreadPoolExecutor(max_workers=len(items)) as pool: + futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)} + for fut in as_completed(futures): + idx = futures[fut] + try: + outcomes[idx] = fut.result() + except Exception as e: + outcomes[idx] = e + + successes: list[R] = [] + errors: list[Exception] = [] + for v in outcomes.values(): + if isinstance(v, Exception): + errors.append(v) + else: + successes.append(v) + + if errors: + if on_errors is not None: + zipped = [(items[i], outcomes[i]) for i in range(len(items))] + return on_errors(zipped, successes, errors) # type: ignore[return-value] + raise ExceptionGroup("safe_thread_map failed", errors) + + return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 59c5cc065e30f355d0011bf00b18ae31994774fd Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 21:48:13 -0800 Subject: [PATCH 35/89] mypy fixup --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 4 ++-- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index b70ff3ba52..34183fda9f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -38,7 +38,7 @@ def deploy_parallel( from dimos.core.docker_runner import DockerModule def 
_on_errors( - _outcomes: list, successes: list[DockerModule], errors: list[Exception] + _outcomes: list[Any], successes: list[DockerModule], errors: list[Exception] ) -> None: for mod in successes: with suppress(Exception): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 48546c5568..8269a47bf9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -115,13 +115,13 @@ def deploy_parallel( try: worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) + docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: # Reassemble results in original input order results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): + for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 25a052590c..b9c25c8445 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -83,7 +83,7 @@ def deploy_parallel( assignments.append((worker, module_class, args, kwargs)) def _on_errors( - _outcomes: list, successes: list[RPCClient], errors: list[Exception] + _outcomes: list[Any], successes: list[RPCClient], errors: list[Exception] ) -> None: for rpc_client in successes: with suppress(Exception): diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index f051b0d950..240f5e7099 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -86,7 +86,7 @@ def cleanup( if errors: if on_errors is not None: zipped = [(items[i], outcomes[i]) for i in 
range(len(items))] - return on_errors(zipped, successes, errors) # type: ignore[return-value] + return on_errors(zipped, successes, errors) # type: ignore[return-value, no-any-return] raise ExceptionGroup("safe_thread_map failed", errors) return [outcomes[i] for i in range(len(items))] # type: ignore[misc] From 5d46c8b659f99c74a6f1aa55b025d396ec4c23a4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 5 Mar 2026 22:18:08 -0800 Subject: [PATCH 36/89] - --- dimos/core/module_coordinator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8269a47bf9..cbcdb179e9 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,6 +113,8 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + worker_results: list[Any] = [] + docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] From c72b380ba8fab997e09a4fa6406067226a715533 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:35:53 -0800 Subject: [PATCH 37/89] add docker_build_ssh and image rebuild check --- dimos/core/docker_build.py | 41 +++++++++++++++++++++++++++++++++++++ dimos/core/docker_runner.py | 36 ++++++++++++++++++++------------ 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 7ee90fc5c3..2679450269 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -20,6 +20,7 @@ from __future__ import annotations +import hashlib import subprocess from typing import TYPE_CHECKING @@ -90,14 +91,52 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted +_BUILD_HASH_LABEL = "dimos.build.hash" + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents, build args, and build context path.""" + assert 
cfg.docker_file is not None + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + return digest.hexdigest() + + +def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = _run( + [ + docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + image_name, + ], + timeout=DOCKER_CMD_TIMEOUT, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + def build_image(cfg: DockerModuleConfig) -> None: """Build Docker image using footer mode conversion.""" if cfg.docker_file is None: raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + if cfg.docker_build_ssh: + cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) cmd.append(str(context)) @@ -115,6 +154,8 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", + "_compute_build_hash", + "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 26d822ce73..4a19746c5e 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -54,6 +54,8 @@ class DockerModuleConfig(ModuleConfig): For advanced Docker options not listed here, use docker_extra_args. 
Example: docker_extra_args=["--cap-add=SYS_ADMIN", "--read-only"] + + NOTE: a DockerModule will rebuild automatically if the Dockerfile or build args change """ # Build / image @@ -61,6 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) + docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) # Identity docker_container_name: str | None = None @@ -180,7 +183,12 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - from dimos.core.docker_build import build_image, image_exists + from dimos.core.docker_build import ( + _compute_build_hash, + _get_image_build_hash, + build_image, + image_exists, + ) config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): @@ -211,21 +219,23 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non # Build or pull image, launch container, wait for RPC server try: - if not image_exists(config): - if config.docker_file is not None: + if config.docker_file is not None: + current_hash = _compute_build_hash(config) + stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) - else: - logger.info(f"Pulling {config.docker_image}") - r = _run( - [_docker_bin(config), "pull", config.docker_image], - timeout=config.docker_pull_timeout, + elif not image_exists(config): + logger.info(f"Pulling {config.docker_image}") + r = _run( + [_docker_bin(config), "pull", config.docker_image], + timeout=config.docker_pull_timeout, + ) + if r.returncode != 0: + raise RuntimeError( + f"Failed to pull image 
'{config.docker_image}'.\n" + f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) - if r.returncode != 0: - raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" - ) reconnect = False if _is_container_running(config, self._container_name): From 6e0a5c5886af3f311b8317fcc5028016eeb8a256 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 21:42:48 -0800 Subject: [PATCH 38/89] simplify --- dimos/core/docker_build.py | 52 ++++++++++++++----------------------- dimos/core/docker_runner.py | 21 ++++++--------- 2 files changed, 27 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 2679450269..d3fbcec685 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -33,10 +33,11 @@ logger = setup_logger() -# Timeout for quick Docker commands +_BUILD_HASH_LABEL = "dimos.build.hash" + DOCKER_CMD_TIMEOUT = 20 -# Sentinel value to detect already-converted Dockerfiles (UUID ensures uniqueness) +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" # Footer appended to Dockerfiles for DimOS module conversion @@ -54,28 +55,6 @@ """ -def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: - """Run a command and return the result.""" - return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) - - -def _run_streaming(cmd: list[str]) -> int: - """Run command and stream output to terminal. 
Returns exit code.""" - result = subprocess.run(cmd, text=True) - return result.returncode - - -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path.""" - return cfg.docker_bin or "docker" - - -def _image_exists(docker_bin: str, image_name: str) -> bool: - """Check if a Docker image exists locally.""" - r = _run([docker_bin, "image", "inspect", image_name], timeout=DOCKER_CMD_TIMEOUT) - return r.returncode == 0 - - def _convert_dockerfile(dockerfile: Path) -> Path: """Append DimOS footer to Dockerfile. Returns path to converted file.""" content = dockerfile.read_text() @@ -91,9 +70,6 @@ def _convert_dockerfile(dockerfile: Path) -> Path: return converted -_BUILD_HASH_LABEL = "dimos.build.hash" - - def _compute_build_hash(cfg: DockerModuleConfig) -> str: """Hash Dockerfile contents, build args, and build context path.""" assert cfg.docker_file is not None @@ -106,7 +82,7 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: """Read the build hash label from an existing Docker image.""" - r = _run( + r = subprocess.run( [ docker_bin, "image", @@ -115,7 +91,10 @@ def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', image_name, ], + capture_output=True, + text=True, timeout=DOCKER_CMD_TIMEOUT, + check=False, ) if r.returncode != 0: return None @@ -133,7 +112,7 @@ def build_image(cfg: DockerModuleConfig) -> None: dockerfile = _convert_dockerfile(cfg.docker_file) context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [_docker_bin(cfg), "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) if cfg.docker_build_ssh: cmd.extend(["--ssh", "default"]) @@ -142,14 +121,21 @@ def build_image(cfg: DockerModuleConfig) -> None: 
cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - exit_code = _run_streaming(cmd) - if exit_code != 0: - raise RuntimeError(f"Docker build failed with exit code {exit_code}") + result = subprocess.run(cmd, text=True) + if result.returncode != 0: + raise RuntimeError(f"Docker build failed with exit code {result.returncode}") def image_exists(cfg: DockerModuleConfig) -> bool: """Check if the configured Docker image exists locally.""" - return _image_exists(_docker_bin(cfg), cfg.docker_image) + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 __all__ = [ diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 4a19746c5e..c81d4367bc 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -128,25 +128,20 @@ def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.Complete return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) -def _docker_bin(cfg: DockerModuleConfig) -> str: - """Get docker binary path, defaulting to 'docker' if empty/None.""" - return cfg.docker_bin or "docker" - - def _remove_container(cfg: DockerModuleConfig, name: str) -> None: - _run([_docker_bin(cfg), "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) + _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: r = _run( - [_docker_bin(cfg), "inspect", "-f", "{{.State.Running}}", name], + [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], timeout=DOCKER_STATUS_TIMEOUT, ) return r.returncode == 0 and r.stdout.strip() == "true" def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: - r = _run([_docker_bin(cfg), "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) + r = _run([cfg.docker_bin, "logs", "--tail", 
str(n), name], timeout=DOCKER_CMD_TIMEOUT) out = (r.stdout or "").rstrip() err = (r.stderr or "").rstrip() return out + ("\n" + err if err else "") @@ -221,14 +216,14 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(_docker_bin(config), config.docker_image) + stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") r = _run( - [_docker_bin(config), "pull", config.docker_image], + [config.docker_bin, "pull", config.docker_image], timeout=config.docker_pull_timeout, ) if r.returncode != 0: @@ -245,7 +240,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non else: logger.info(f"Stopping existing container: {self._container_name}") _run( - [_docker_bin(config), "stop", self._container_name], + [config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) @@ -313,7 +308,7 @@ def _cleanup(self) -> None: self._unsub_fns.clear() with suppress(Exception): _run( - [_docker_bin(self.config), "stop", self._container_name], + [self.config.docker_bin, "stop", self._container_name], timeout=DOCKER_STOP_TIMEOUT, ) with suppress(Exception): @@ -353,7 +348,7 @@ def _build_docker_run_command(self) -> list[str]: cfg = self.config self._validate_config(cfg) - cmd = [_docker_bin(cfg), "run", "-d"] + cmd = [cfg.docker_bin, "run", "-d"] self._add_lifecycle_args(cmd, cfg) self._add_network_args(cmd, cfg) self._add_port_args(cmd, cfg) From 2b4adaeb06ca9fa0cd45334d8d1c122453e47fed Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 6 Mar 2026 23:07:30 -0800 Subject: [PATCH 39/89] misc --- dimos/core/docker_build.py | 19 +++++++++++-------- dimos/core/docker_runner.py | 2 +- 
dimos/core/module_coordinator.py | 9 +++++++-- dimos/utils/safe_thread_map.py | 2 ++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index d3fbcec685..036c4cfd6c 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,25 +71,26 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and build context path.""" + """Hash Dockerfile contents, build args, and SSH flag.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) + digest.update(f"ssh={cfg.docker_build_ssh}".encode()) return digest.hexdigest() -def _get_image_build_hash(docker_bin: str, image_name: str) -> str | None: +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: """Read the build hash label from an existing Docker image.""" r = subprocess.run( [ - docker_bin, + cfg.docker_bin, "image", "inspect", "-f", '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - image_name, + cfg.docker_image, ], capture_output=True, text=True, @@ -121,9 +122,13 @@ def build_image(cfg: DockerModuleConfig) -> None: cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") - result = subprocess.run(cmd, text=True) + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. 
+ result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) if result.returncode != 0: - raise RuntimeError(f"Docker build failed with exit code {result.returncode}") + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) def image_exists(cfg: DockerModuleConfig) -> bool: @@ -140,8 +145,6 @@ def image_exists(cfg: DockerModuleConfig) -> bool: __all__ = [ "DIMOS_FOOTER", - "_compute_build_hash", - "_get_image_build_hash", "build_image", "image_exists", ] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c81d4367bc..97dbe5e209 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -216,7 +216,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non try: if config.docker_file is not None: current_hash = _compute_build_hash(config) - stored_hash = _get_image_build_hash(config.docker_bin, config.docker_image) + stored_hash = _get_image_build_hash(config) if current_hash != stored_hash: logger.info(f"Building {config.docker_image}") build_image(config) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index cbcdb179e9..7e42f566fa 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -113,19 +113,24 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) + # Intentionally sequential: worker deploys first, then docker. + # Both internally parallelize across their own items. Running them + # concurrently would add complexity for minimal gain since they use + # different resource pools (processes vs containers). 
worker_results: list[Any] = [] docker_results: list[Any] = [] try: worker_results = self._client.deploy_parallel(worker_specs) docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] finally: - # Reassemble results in original input order + # Reassemble whatever succeeded into original input order so + # stop() can clean them up even if a later deploy raised. + # zip(strict=False) safely handles partial results (empty lists). results: list[Any] = [None] * len(module_specs) for idx, mod in zip(worker_indices, worker_results, strict=False): results[idx] = mod for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] results[idx] = mod - # Register whatever succeeded so stop() can clean them up for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 240f5e7099..6729c989f3 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -75,6 +75,8 @@ def cleanup( except Exception as e: outcomes[idx] = e + # Note: successes/errors are in completion order, not input order. + # This is fine — on_errors only needs them for cleanup, not ordering. 
successes: list[R] = [] errors: list[Exception] = [] for v in outcomes.values(): From cb83f9aca2cc866f3f74f8d18f61d6fc0a6cc926 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 00:11:47 -0800 Subject: [PATCH 40/89] add docker_build_extra_args --- dimos/core/docker_build.py | 6 +++--- dimos/core/docker_runner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 036c4cfd6c..5b54ecbf22 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -77,7 +77,8 @@ def _compute_build_hash(cfg: DockerModuleConfig) -> str: digest.update(cfg.docker_file.read_bytes()) for key, val in sorted(cfg.docker_build_args.items()): digest.update(f"{key}={val}".encode()) - digest.update(f"ssh={cfg.docker_build_ssh}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) return digest.hexdigest() @@ -115,10 +116,9 @@ def build_image(cfg: DockerModuleConfig) -> None: context = cfg.docker_build_context or cfg.docker_file.parent cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - if cfg.docker_build_ssh: - cmd.extend(["--ssh", "default"]) for k, v in cfg.docker_build_args.items(): cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) cmd.append(str(context)) logger.info(f"Building Docker image: {cfg.docker_image}") diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 97dbe5e209..a72718b564 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -63,7 +63,7 @@ class DockerModuleConfig(ModuleConfig): docker_file: Path | None = None # Required on host for building, not needed in container docker_build_context: Path | None = None docker_build_args: dict[str, str] = field(default_factory=dict) - docker_build_ssh: bool = False # Pass --ssh default to docker build (for private repo clones) + 
docker_build_extra_args: list[str] = field(default_factory=list) # Extra args for docker build # Identity docker_container_name: str | None = None From c74c5b907fc2f5447a6d1397a7de6c43119da63f Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:49:39 -0800 Subject: [PATCH 41/89] PR review fixes: better error messages, consistent API, restore install.sh - Include docker_build_ssh in build hash so toggling SSH triggers rebuild - Capture stderr on build failure for actionable error messages - Change _get_image_build_hash to take cfg instead of raw docker_bin str - Remove private names from __all__ in docker_build.py - Add helpful TypeError when DockerModule payload isn't JSON-serializable - Replace ThreadPoolExecutor.map in start_all_modules with safe_thread_map to surface all failures via ExceptionGroup instead of losing all but first - Restore scripts/install.sh and README.md (accidentally removed) - Add intent comments on deploy_parallel and safe_thread_map design choices --- dimos/core/docker_runner.py | 10 +++++++++- dimos/core/module_coordinator.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index a72718b564..6d12705521 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -489,7 +489,15 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} # DimOS base image entrypoint already runs "dimos.core.docker_runner run" - return ["--payload", json.dumps(payload, separators=(",", ":"))] + try: + payload_json = json.dumps(payload, separators=(",", ":")) + except TypeError as e: + raise TypeError( + f"Cannot serialize DockerModule payload to JSON: {e}\n" + f"Ensure all constructor args/kwargs for {self._module_class.__name__} are " + f"JSON-serializable, or use docker_command to bypass 
automatic payload generation." + ) from e + return ["--payload", payload_json] def _wait_for_rpc(self) -> None: """Poll until the container's RPC server is reachable.""" diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ac693c1795..6c639117bc 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -14,7 +14,6 @@ from __future__ import annotations -from concurrent.futures import ThreadPoolExecutor import threading from typing import TYPE_CHECKING, Any @@ -173,11 +172,18 @@ def deploy_parallel( return results def start_all_modules(self) -> None: + from dimos.utils.safe_thread_map import safe_thread_map + modules = list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") - with ThreadPoolExecutor(max_workers=len(modules)) as executor: - list(executor.map(lambda m: m.start(), modules)) + + def _on_start_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + raise ExceptionGroup("start_all_modules failed", errors) + + safe_thread_map(modules, lambda m: m.start(), _on_start_errors) for module in modules: if hasattr(module, "on_system_modules"): From 45ee6fe1501d701ebc6669d8d18735e36907f9c0 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 02:56:50 -0800 Subject: [PATCH 42/89] fix pull problem --- dimos/core/docker_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6d12705521..987e834eae 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -222,14 +222,15 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non build_image(config) elif not image_exists(config): logger.info(f"Pulling {config.docker_image}") - r = _run( + r = subprocess.run( [config.docker_bin, "pull", config.docker_image], + text=True, + stderr=subprocess.PIPE, 
timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\n" - f"STDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" + f"Failed to pull image '{config.docker_image}'.\nSTDERR:\n{r.stderr}" ) reconnect = False From 029a8633d579460656bb270ec50dbb30740e129f Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:31:00 -0800 Subject: [PATCH 43/89] fix reconnect edgecase and __getattr__ loop edgecase --- dimos/core/docker_runner.py | 22 ++--- dimos/core/tests/test_docker_deployment.py | 97 ++++++++++++++++++++++ 2 files changed, 109 insertions(+), 10 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 987e834eae..db5f804659 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -307,15 +307,16 @@ def _cleanup(self) -> None: with suppress(Exception): unsub() self._unsub_fns.clear() - with suppress(Exception): - _run( - [self.config.docker_bin, "stop", self._container_name], - timeout=DOCKER_STOP_TIMEOUT, - ) - with suppress(Exception): - _remove_container(self.config, self._container_name) + if not self.config.docker_reconnect_container: + with suppress(Exception): + _run( + [self.config.docker_bin, "stop", self._container_name], + timeout=DOCKER_STOP_TIMEOUT, + ) + with suppress(Exception): + _remove_container(self.config, self._container_name) self._running = False - logger.info(f"Stopped container: {self._container_name}") + logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: cfg = self.config @@ -337,10 +338,11 @@ def set_transport(self, stream_name: str, transport: Any) -> bool: return bool(result) def __getattr__(self, name: str) -> Any: - if name in self.rpcs: + rpcs = self.__dict__.get("rpcs") + if rpcs is not None and name in rpcs: original_method = getattr(self._module_class, name, None) return RpcCall(original_method, self.rpc, name, self.remote_name, self._unsub_fns, None) - 
raise AttributeError(f"{name} not found on {self._module_class.__name__}") + raise AttributeError(f"{name} not found on {type(self).__name__}") # Docker command building (split into focused helpers for readability) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 17d1290916..e89b88e327 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -193,3 +193,100 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke assert mock_dm.stop.call_count == 1 # Worker manager also closed mock_worker_mgr.close_all.assert_called_once() + + +class TestDockerModuleGetattr: + """Tests for DockerModule.__getattr__ avoiding infinite recursion.""" + + def test_getattr_no_recursion_when_rpcs_not_set(self): + """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure + with pytest.raises(AttributeError): + _ = dm.some_method + + def test_getattr_no_recursion_on_cleanup_attrs(self): + """Accessing cleanup-related attrs before they exist must raise, not recurse.""" + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse + for attr in ("rpc", "config", "_container_name", "_unsub_fns"): + with pytest.raises(AttributeError): + getattr(dm, attr) + + def test_getattr_delegates_to_rpc_when_rpcs_set(self): + from dimos.core.docker_runner import DockerModule + from dimos.core.rpc_client import RpcCall + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + # _module_class needs a real method with __name__ for RpcCall + class FakeMod: + def do_thing(self) -> None: ... 
+ + dm._module_class = FakeMod + dm.rpc = MagicMock() + dm.remote_name = "FakeMod" + dm._unsub_fns = [] + + result = dm.do_thing + assert isinstance(result, RpcCall) + + def test_getattr_raises_for_unknown_method(self): + from dimos.core.docker_runner import DockerModule + + dm = DockerModule.__new__(DockerModule) + dm.rpcs = {"do_thing"} + + with pytest.raises(AttributeError, match="not found"): + _ = dm.nonexistent + + +class TestDockerModuleCleanupReconnect: + """Tests for DockerModule._cleanup with docker_reconnect_container.""" + + def test_cleanup_skips_stop_when_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # reconnect mode: should NOT stop/rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=True) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_not_called() + mock_rm.assert_not_called() + + def test_cleanup_stops_container_when_not_reconnect(self): + from dimos.core.docker_runner import DockerModule + + with patch.object(DockerModule, "__init__", lambda self: None): + dm = DockerModule.__new__(DockerModule) + dm._running = True + dm._container_name = "test_container" + dm._unsub_fns = [] + dm.rpc = MagicMock() + dm.remote_name = "TestModule" + + # normal mode: should stop and rm the container + dm.config = FakeDockerConfig(docker_reconnect_container=False) + with ( + patch("dimos.core.docker_runner._run") as mock_run, + patch("dimos.core.docker_runner._remove_container") as mock_rm, + ): + dm._cleanup() + mock_run.assert_called_once() # docker stop + mock_rm.assert_called_once() # docker rm -f From 5106445d9e203c207336a96b672461a104755e29 Mon Sep 17 
00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 14:36:00 -0800 Subject: [PATCH 44/89] change the ignore postfix --- .gitignore | 1 - dimos/core/docker_build.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 12b2f19ca3..4045db012e 100644 --- a/.gitignore +++ b/.gitignore @@ -42,7 +42,6 @@ package-lock.json # Ignore build artifacts dist/ build/ -.Dockerfile.dimos # Ignore data directory but keep .lfs subdirectory data/* diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 5b54ecbf22..1e357d987b 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -65,7 +65,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: logger.info(f"Converting {dockerfile.name} to DimOS format") - converted = dockerfile.parent / f".{dockerfile.name}.dimos" + converted = dockerfile.parent / f".{dockerfile.name}.ignore" converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) return converted From 882976167cb664909c0b2ec6ecbd48607814f001 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 7 Mar 2026 17:31:53 -0800 Subject: [PATCH 45/89] fix docker defaults, make deploy better --- dimos/core/docker_build.py | 2 +- dimos/core/docker_runner.py | 15 +++++----- dimos/core/module_coordinator.py | 49 +++++++++++++++++++------------- dimos/core/worker.py | 38 +++++++++++++------------ 4 files changed, 58 insertions(+), 46 deletions(-) diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py index 1e357d987b..24fd2b3e44 100644 --- a/dimos/core/docker_build.py +++ b/dimos/core/docker_build.py @@ -71,7 +71,7 @@ def _convert_dockerfile(dockerfile: Path) -> Path: def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents, build args, and SSH flag.""" + """Hash Dockerfile contents and build args.""" assert cfg.docker_file is not None digest = hashlib.sha256() digest.update(cfg.docker_file.read_bytes()) diff --git a/dimos/core/docker_runner.py 
b/dimos/core/docker_runner.py index db5f804659..6f0b2e777c 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -77,9 +77,9 @@ class DockerModuleConfig(ModuleConfig): ) # (host, container, proto) # Runtime resources - docker_gpus: str | None = "all" - docker_shm_size: str = "2g" - docker_restart_policy: str = "on-failure:3" + docker_gpus: str | None = None + docker_shm_size: str = "4g" + docker_restart_policy: str = "no" # Env + volumes + devices docker_env_files: list[str] = field(default_factory=list) @@ -300,14 +300,15 @@ def stop(self) -> None: self._cleanup() def _cleanup(self) -> None: - """Release all resources. Safe to call multiple times or from partial init.""" + """Release all resources. Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in self._unsub_fns: + for unsub in getattr(self, "_unsub_fns", []): with suppress(Exception): unsub() - self._unsub_fns.clear() - if not self.config.docker_reconnect_container: + with suppress(Exception): + self._unsub_fns.clear() + if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( [self.config.docker_bin, "stop", self._container_name], diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 6c639117bc..59e1013175 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,6 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger +from dimos.utils.safe_thread_map import safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT @@ -147,33 +148,41 @@ def deploy_parallel( worker_indices.append(i) worker_specs.append(spec) - # Intentionally sequential: worker deploys first, then docker. - # Both internally parallelize across their own items. 
Running them - # concurrently would add complexity for minimal gain since they use - # different resource pools (processes vs containers). - worker_results: list[Any] = [] - docker_results: list[Any] = [] - try: - worker_results = self._client.deploy_parallel(worker_specs) - docker_results = DockerWorkerManager.deploy_parallel(docker_specs) # type: ignore[arg-type] - finally: - # Reassemble whatever succeeded into original input order so - # stop() can clean them up even if a later deploy raised. - # zip(strict=False) safely handles partial results (empty lists). - results: list[Any] = [None] * len(module_specs) - for idx, mod in zip(worker_indices, worker_results, strict=False): - results[idx] = mod - for idx, mod in zip(docker_indices, docker_results, strict=False): # type: ignore[assignment] - results[idx] = mod + # Deploy worker and docker modules in parallel. + results: list[Any] = [None] * len(module_specs) + + def _deploy_workers() -> None: + if not worker_specs: + return + for (index, _), module in zip( + worker_indices, self._client.deploy_parallel(worker_specs), strict=False + ): # type: ignore[union-attr] + results[index] = module + + def _deploy_docker() -> None: + if not docker_specs: + return + for (index, _), module in zip( + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + ): # type: ignore[arg-type] + results[index] = module + + def _register() -> None: for (module_class, _, _), module in zip(module_specs, results, strict=False): if module is not None: self._deployed_modules[module_class] = module + def _on_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + _register() + raise ExceptionGroup("deploy_parallel failed", errors) + + safe_thread_map([_deploy_workers, _deploy_docker], lambda fn: fn(), _on_errors) + _register() return results def start_all_modules(self) -> None: - from dimos.utils.safe_thread_map import safe_thread_map - modules = 
list(self._deployed_modules.values()) if not modules: raise ValueError("No modules deployed. Call deploy() before start_all_modules().") diff --git a/dimos/core/worker.py b/dimos/core/worker.py index b0dd802841..cce79796f5 100644 --- a/dimos/core/worker.py +++ b/dimos/core/worker.py @@ -206,25 +206,27 @@ def deploy_module( "args": args, "kwargs": kwargs, } - with self._lock: - self._conn.send(request) - response = self._conn.recv() + try: + with self._lock: + self._conn.send(request) + response = self._conn.recv() - if response.get("error"): - raise RuntimeError(f"Failed to deploy module: {response['error']}") - - actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) - actor.set_ref(actor).result() - - self._modules[module_id] = actor - self._reserved = max(0, self._reserved - 1) - logger.info( - "Deployed module.", - module=module_class.__name__, - worker_id=self._worker_id, - module_id=module_id, - ) - return actor + if response.get("error"): + raise RuntimeError(f"Failed to deploy module: {response['error']}") + + actor = Actor(self._conn, module_class, self._worker_id, module_id, self._lock) + actor.set_ref(actor).result() + + self._modules[module_id] = actor + logger.info( + "Deployed module.", + module=module_class.__name__, + worker_id=self._worker_id, + module_id=module_id, + ) + return actor + finally: + self._reserved = max(0, self._reserved - 1) def shutdown(self) -> None: if self._conn is not None: From 068b0ad6d17baaa50fb75fad550383429a22686d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 14:06:34 -0700 Subject: [PATCH 46/89] misc --- dimos/core/docker_runner.py | 4 ++-- dimos/core/module_coordinator.py | 7 ++++--- dimos/core/run_registry.py | 4 +--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 6f0b2e777c..10438298b1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -39,7 +39,7 @@ logger = 
setup_logger() DOCKER_RUN_TIMEOUT = 120 # Timeout for `docker run` command execution -DOCKER_PULL_TIMEOUT_DEFAULT = 600 # Default timeout for `docker pull` +DOCKER_PULL_TIMEOUT_DEFAULT = None # No timeout for `docker pull` (images can be large) DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) @@ -99,7 +99,7 @@ class DockerModuleConfig(ModuleConfig): docker_extra_args: list[str] = field(default_factory=list) # Timeouts - docker_pull_timeout: float = DOCKER_PULL_TIMEOUT_DEFAULT + docker_pull_timeout: float | None = DOCKER_PULL_TIMEOUT_DEFAULT docker_startup_timeout: float = 120.0 docker_poll_interval: float = 1.0 diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 59e1013175..7d2478dcb1 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -109,7 +109,8 @@ def stop(self) -> None: logger.error("Error stopping module", module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - self._client.close_all() # type: ignore[union-attr] + if self._client is not None: + self._client.close_all() def deploy(self, module_class: type[ModuleT], *args, **kwargs) -> ModuleProxy: # type: ignore[no-untyped-def] # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator @@ -154,7 +155,7 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return - for (index, _), module in zip( + for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False ): # type: ignore[union-attr] results[index] = module @@ -162,7 +163,7 @@ def _deploy_workers() -> None: def _deploy_docker() -> None: if not docker_specs: return - for (index, _), module in zip( + for index, module in zip( docker_indices, 
DockerWorkerManager.deploy_parallel(docker_specs), strict=False ): # type: ignore[arg-type] results[index] = module diff --git a/dimos/core/run_registry.py b/dimos/core/run_registry.py index 9f8e7f3358..848eafde4e 100644 --- a/dimos/core/run_registry.py +++ b/dimos/core/run_registry.py @@ -21,6 +21,7 @@ import os from pathlib import Path import re +import signal import time from dimos.utils.logging_config import setup_logger @@ -142,9 +143,6 @@ def get_most_recent(alive_only: bool = True) -> RunEntry | None: return runs[-1] if runs else None -import signal - - def stop_entry(entry: RunEntry, force: bool = False) -> tuple[str, bool]: """Stop a DimOS instance by registry entry. From 75268debe341cd7bdfffbc41cf296ffe5d72d0f8 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 8 Mar 2026 17:53:29 -0700 Subject: [PATCH 47/89] fix mypy --- dimos/core/module_coordinator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 7d2478dcb1..ee417f93cb 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -139,7 +139,7 @@ def deploy_parallel( # Split by type, tracking original indices for reassembly docker_indices: list[int] = [] worker_indices: list[int] = [] - docker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] + docker_specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]] = [] worker_specs: list[tuple[type[ModuleT], tuple[Any, ...], dict[str, Any]]] = [] for i, spec in enumerate(module_specs): if is_docker_module(spec[0]): @@ -155,9 +155,10 @@ def deploy_parallel( def _deploy_workers() -> None: if not worker_specs: return + assert self._client is not None for index, module in zip( worker_indices, self._client.deploy_parallel(worker_specs), strict=False - ): # type: ignore[union-attr] + ): results[index] = module def _deploy_docker() -> None: @@ -165,7 +166,7 @@ def _deploy_docker() -> None: return for index, 
module in zip( docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False - ): # type: ignore[arg-type] + ): results[index] = module def _register() -> None: From f38e4beb895762dcaa96a71998df754118d45027 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Mon, 9 Mar 2026 13:05:17 -0700 Subject: [PATCH 48/89] fix ExceptionGroup edgecase --- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module_coordinator.py | 2 +- dimos/core/resource_monitor/stats.py | 2 +- dimos/core/worker_manager.py | 2 +- dimos/utils/safe_thread_map.py | 16 ++++++++++++++++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 34183fda9f..29c7c2a29d 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,7 +16,7 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index ee417f93cb..deb867453e 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -22,7 +22,7 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import Module, ModuleT diff --git a/dimos/core/resource_monitor/stats.py b/dimos/core/resource_monitor/stats.py index c020c853e0..f401358890 100644 --- a/dimos/core/resource_monitor/stats.py +++ b/dimos/core/resource_monitor/stats.py @@ -90,7 +90,7 @@ class IoStats(TypedDict): def _collect_io(proc: psutil.Process) -> IoStats: """Collect IO counters 
in bytes. Call inside oneshot().""" try: - io = proc.io_counters() + io = proc.io_counters() # type: ignore[attr-defined] # Linux-only return IoStats(io_read_bytes=io.read_bytes, io_write_bytes=io.write_bytes) except (psutil.AccessDenied, AttributeError): return IoStats(io_read_bytes=0, io_write_bytes=0) diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index b9c25c8445..fa448cb15d 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -20,7 +20,7 @@ from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import safe_thread_map +from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.module import ModuleT diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index 6729c989f3..f480f2c97d 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -14,8 +14,24 @@ from __future__ import annotations from concurrent.futures import Future, ThreadPoolExecutor, as_completed +import sys from typing import TYPE_CHECKING, Any, TypeVar +if sys.version_info < (3, 11): + + class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 + """Minimal ExceptionGroup polyfill for Python 3.10.""" + + exceptions: tuple[BaseException, ...] 
+ + def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: + super().__init__(message) + self.exceptions = tuple(exceptions) +else: + import builtins + + ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] + if TYPE_CHECKING: from collections.abc import Callable, Sequence From 985ecd7e262b9dafb1fa8116c723f9b6835476a2 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 15:09:33 -0700 Subject: [PATCH 49/89] fix: update Docker deployment to use ModuleSpec format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docker_worker_manager: accept ModuleSpec format, pass global_config - module_coordinator: add type: ignore for ModuleBase→Module cast - worker_manager: convert Iterable to list for len() check - test_docker_deployment: fix Path import, update test assertions for new global_config signature --- dimos/core/docker_worker_manager.py | 8 +++++--- dimos/core/module_coordinator.py | 2 +- dimos/core/tests/test_docker_deployment.py | 14 ++++++-------- dimos/core/worker_manager.py | 1 + 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 29c7c2a29d..520468182f 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -16,11 +16,11 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any +from dimos.core.module import ModuleSpec from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: from dimos.core.docker_runner import DockerModule - from dimos.core.module import Module class DockerWorkerManager: @@ -28,7 +28,7 @@ class DockerWorkerManager: @staticmethod def deploy_parallel( - specs: list[tuple[type[Module], tuple[Any, ...], dict[str, Any]]], + specs: list[ModuleSpec], ) -> list[DockerModule]: """Deploy multiple DockerModules in parallel. 
@@ -46,5 +46,7 @@ def _on_errors( raise ExceptionGroup("docker deploy_parallel failed", errors) return safe_thread_map( - specs, lambda spec: DockerModule(spec[0], *spec[1], **spec[2]), _on_errors + specs, + lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] + _on_errors, ) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d9931b7876..43e3e44f0a 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -131,7 +131,7 @@ def deploy( deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) + deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) # type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index e89b88e327..a3bb0b716d 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -21,24 +21,20 @@ from __future__ import annotations -from dataclasses import dataclass -from typing import TYPE_CHECKING +from pathlib import Path from unittest.mock import MagicMock, patch import pytest from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator from dimos.core.stream import Out -if TYPE_CHECKING: - from pathlib import Path - # -- Fixtures: fake module classes ------------------------------------------- -@dataclass class FakeDockerConfig(DockerModuleConfig): docker_image: str = "fake:latest" docker_file: Path | None = None @@ -95,7 +91,9 @@ def test_deploy_routes_docker_module(self, 
mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule) + mock_docker_module_cls.assert_called_once_with( + FakeDockerModule, global_config=global_config + ) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() assert result is mock_dm @@ -134,7 +132,7 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage result = coordinator.deploy(FakeRegularModule) - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule) + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) assert result is mock_proxy coordinator.stop() diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 52313ca5d4..2b778c433e 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -66,6 +66,7 @@ def deploy_parallel(self, module_specs: Iterable[ModuleSpec]) -> list[RPCClient] if self._closed: raise RuntimeError("WorkerManager is closed") + module_specs = list(module_specs) if len(module_specs) == 0: return [] From 5d994c15136aed54e84f2e884cb079e8bd020fbb Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:43:35 -0700 Subject: [PATCH 50/89] fix(mypy): cover import-not-found for onnxruntime type: ignore Pre-existing mypy errors: onnxruntime is excluded from install (--no-extra cuda) so import-not-found needs to be ignored alongside import-untyped. 
--- dimos/agents_deprecated/memory/image_embedding.py | 2 +- dimos/simulation/mujoco/policy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dimos/agents_deprecated/memory/image_embedding.py b/dimos/agents_deprecated/memory/image_embedding.py index 27e16f1aa8..d6b0967642 100644 --- a/dimos/agents_deprecated/memory/image_embedding.py +++ b/dimos/agents_deprecated/memory/image_embedding.py @@ -63,7 +63,7 @@ def __init__(self, model_name: str = "clip", dimensions: int = 512) -> None: def _initialize_model(self): # type: ignore[no-untyped-def] """Initialize the specified embedding model.""" try: - import onnxruntime as ort # type: ignore[import-untyped] + import onnxruntime as ort # type: ignore[import-untyped,import-not-found] import torch # noqa: F401 from transformers import ( # type: ignore[import-untyped] AutoFeatureExtractor, diff --git a/dimos/simulation/mujoco/policy.py b/dimos/simulation/mujoco/policy.py index 212c7ac60a..1d0598ce46 100644 --- a/dimos/simulation/mujoco/policy.py +++ b/dimos/simulation/mujoco/policy.py @@ -20,7 +20,7 @@ import mujoco import numpy as np -import onnxruntime as ort # type: ignore[import-untyped] +import onnxruntime as ort # type: ignore[import-untyped,import-not-found] from dimos.simulation.mujoco.input_controller import InputController from dimos.utils.logging_config import setup_logger From dd3251e73544f6d303f4b2cad158f27d3947782b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:28 -0700 Subject: [PATCH 51/89] fix: remove section markers from hello_docker.py and untrack .venv - Remove comment section markers (dashed lines) that violate the no-section-markers test policy - Remove .venv symlink from git tracking (already in .gitignore) --- .venv | 1 - examples/docker_hello_world/hello_docker.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) delete mode 120000 .venv diff --git a/.venv b/.venv deleted file mode 120000 index 3c94680097..0000000000 --- a/.venv +++ 
/dev/null @@ -1 +0,0 @@ -/home/dimos/auto/dimos/.venv \ No newline at end of file diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 66e95df316..af3bfc19d3 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -41,10 +41,6 @@ from dimos.core.module import Module from dimos.core.stream import In, Out -# --------------------------------------------------------------------------- -# Docker module (runs inside container) -# --------------------------------------------------------------------------- - @dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): @@ -100,10 +96,6 @@ def get_greeting_prefix(self) -> str: return self.config.greeting_prefix -# --------------------------------------------------------------------------- -# Host-side module (sends prompts and prints greetings) -# --------------------------------------------------------------------------- - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -125,9 +117,7 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- + if __name__ == "__main__": from dimos.core.blueprints import autoconnect From 9830a8e86550c31001d886117ea2deee1860eda7 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 13 Mar 2026 23:46:41 -0700 Subject: [PATCH 52/89] style: fix formatting in hello_docker.py --- examples/docker_hello_world/hello_docker.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index af3bfc19d3..3b8e96e49b 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -96,7 +96,6 @@ def get_greeting_prefix(self) -> str: return 
self.config.greeting_prefix - class PromptModule(Module): """Publishes prompts and listens to greetings.""" @@ -117,8 +116,6 @@ def _on_greeting(self, text: str) -> None: print(f"[PromptModule] Received: {text}") - - if __name__ == "__main__": from dimos.core.blueprints import autoconnect From cbc46178167d32e39d4d14c58a8e7862a0239e55 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 14 Mar 2026 18:49:06 -0700 Subject: [PATCH 53/89] fix: address review comments on hello_docker example - Add proper _disposables cleanup for stream subscriptions - Use subprocess.check_output instead of subprocess.run - Move inline import (autoconnect) to top of file --- examples/docker_hello_world/hello_docker.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 3b8e96e49b..6c30228089 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -36,6 +36,9 @@ import subprocess import time +from reactivex.disposable import Disposable + +from dimos.core.blueprints import autoconnect from dimos.core.core import rpc from dimos.core.docker_runner import DockerModuleConfig from dimos.core.module import Module @@ -67,17 +70,11 @@ class HelloDockerModule(Module["HelloDockerConfig"]): @rpc def start(self) -> None: super().start() - self.prompt.subscribe(self._on_prompt) + self._disposables.add(Disposable(self.prompt.subscribe(self._on_prompt))) def _cowsay(self, text: str) -> str: """Run cowsay inside the container and return the ASCII art.""" - result = subprocess.run( - ["/usr/games/cowsay", text], - capture_output=True, - text=True, - check=True, - ) - return result.stdout + return subprocess.check_output(["/usr/games/cowsay", text], text=True) def _on_prompt(self, text: str) -> None: art = self._cowsay(text) @@ -105,7 +102,7 @@ class PromptModule(Module): @rpc def start(self) -> None: super().start() - 
self.greeting.subscribe(self._on_greeting) + self._disposables.add(Disposable(self.greeting.subscribe(self._on_greeting))) @rpc def send(self, text: str) -> None: @@ -117,8 +114,6 @@ def _on_greeting(self, text: str) -> None: if __name__ == "__main__": - from dimos.core.blueprints import autoconnect - coordinator = autoconnect( PromptModule.blueprint(), HelloDockerModule.blueprint(greeting_prefix="Howdy"), From 780736c7b9095ac441a876010411af86eb777653 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 11:58:23 -0700 Subject: [PATCH 54/89] make timeout not hardcoded --- dimos/core/module.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/core/module.py b/dimos/core/module.py index 6b12843a3a..c400e697f6 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -104,6 +104,7 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): _bound_rpc_calls: dict[str, RpcCall] = {} _module_closed: bool = False _module_closed_lock: threading.Lock + _loop_thread_timeout: float = 2.0 rpc_calls: list[str] = [] @@ -151,7 +152,7 @@ def _close_module(self) -> None: if loop_thread.is_alive(): if loop: loop.call_soon_threadsafe(loop.stop) - loop_thread.join(timeout=2) + loop_thread.join(timeout=self._loop_thread_timeout) self._loop = None self._loop_thread = None From 66a6567a9407677127abc54f3e36572618ee8acc Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 10:38:58 -0700 Subject: [PATCH 55/89] docs: add clarifying comment for deploy_parallel lambda tuple --- dimos/core/worker_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 2b778c433e..3cd836b3ed 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -93,6 +93,7 @@ def _on_errors( return safe_thread_map( assignments, + # item = [worker, module_class, global_config, kwargs] lambda item: RPCClient(item[0].deploy_module(item[1], item[2], item[3]), item[1]), _on_errors, ) 
From 1d3f1230abb7cc71127bdd85f07926e604f882b1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 11:45:10 -0700 Subject: [PATCH 56/89] feat: port rpc_timeouts system from jeff/fix/rosnav3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Module.rpc_timeouts dict allows per-method timeout overrides - RPCClient resolves timeouts from module's rpc_timeouts, with defaults: start=1200s, everything else=120s - RpcCall carries resolved timeout, passes it to call_sync - DockerModule mirrors the same pattern via _resolve_timeout() - call_sync no longer auto-detects 'start' — caller is responsible - Pickle compat: RpcCall supports both old 2-tuple and new 3-tuple state --- dimos/core/docker_runner.py | 30 +++++++++++++++++++++++++----- dimos/core/module.py | 6 ++++++ dimos/core/rpc_client.py | 25 ++++++++++++++++++++++--- dimos/protocol/rpc/spec.py | 13 ++++++++----- 4 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 16727a8dd1..b879d29be1 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall, RPCClient from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -210,6 +210,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} + self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} # Build or pull image, launch container, wait for RPC 
server try: @@ -266,12 +267,19 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non def get_rpc_method_names(self) -> list[str]: return self.rpc_calls + def _resolve_timeout(self, method: str) -> float: + return self._rpc_timeouts.get(method, RPCClient.default_rpc_timeout) + def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable # Forward to container — Module.set_rpc_method unpickles the RpcCall # and wires it with the container's own LCMRPC - self.rpc.call_sync(f"{self.remote_name}/set_rpc_method", ([method, callable], {})) + self.rpc.call_sync( + f"{self.remote_name}/set_rpc_method", + ([method, callable], {}), + rpc_timeout=self._resolve_timeout("set_rpc_method"), + ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: missing = set(methods) - self._bound_rpc_calls.keys() @@ -283,7 +291,9 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: def start(self) -> None: """Invoke the remote module's start() RPC.""" try: - self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) + self.rpc.call_sync( + f"{self.remote_name}/start", ([], {}), rpc_timeout=self._resolve_timeout("start") + ) except Exception: with suppress(Exception): self.stop() @@ -333,7 +343,9 @@ def tail_logs(self, n: int = 200) -> str: def set_transport(self, stream_name: str, transport: Any) -> bool: """Forward to the container's Module.set_transport RPC.""" result, _ = self.rpc.call_sync( - f"{self.remote_name}/set_transport", ([stream_name, transport], {}) + f"{self.remote_name}/set_transport", + ([stream_name, transport], {}), + rpc_timeout=self._resolve_timeout("set_transport"), ) return bool(result) @@ -341,7 +353,15 @@ def __getattr__(self, name: str) -> Any: rpcs = self.__dict__.get("rpcs") if rpcs is not None and name in rpcs: original_method = getattr(self._module_class, name, None) - return RpcCall(original_method, self.rpc, name, 
self.remote_name, self._unsub_fns, None) + return RpcCall( + original_method, + self.rpc, + name, + self.remote_name, + self._unsub_fns, + None, + timeout=self._resolve_timeout(name), + ) raise AttributeError(f"{name} not found on {type(self).__name__}") # Docker command building (split into focused helpers for readability) diff --git a/dimos/core/module.py b/dimos/core/module.py index c400e697f6..bcd61bd435 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -108,6 +108,12 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): rpc_calls: list[str] = [] + # Per-method RPC timeout overrides (seconds). Keys are method names. + # Used by RPCClient when calling methods on this module from the host. + # Example: rpc_timeouts = {"on_system_modules": 600.0} + # Methods not listed here use RPCClient.default_rpc_timeout (120s). + rpc_timeouts: dict[str, float] = {} + def __init__(self, config_args: dict[str, Any]): super().__init__(**config_args) self._module_closed_lock = threading.Lock() diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 13add06a02..4877a2acd9 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -39,12 +39,14 @@ def __init__( remote_name: str, unsub_fns: list, # type: ignore[type-arg] stop_client: Callable[[], None] | None = None, + timeout: float = 0, ) -> None: self._rpc = rpc self._name = name self._remote_name = remote_name self._unsub_fns = unsub_fns self._stop_rpc_client = stop_client + self._timeout = timeout if original_method: self.__doc__ = original_method.__doc__ @@ -67,15 +69,24 @@ def __call__(self, *args, **kwargs): # type: ignore[no-untyped-def] self._stop_rpc_client() return None - result, unsub_fn = self._rpc.call_sync(f"{self._remote_name}/{self._name}", (args, kwargs)) # type: ignore[arg-type] + result, unsub_fn = self._rpc.call_sync( + f"{self._remote_name}/{self._name}", + (args, kwargs), # type: ignore[arg-type] + rpc_timeout=self._timeout, + ) self._unsub_fns.append(unsub_fn) 
return result def __getstate__(self): # type: ignore[no-untyped-def] - return (self._name, self._remote_name) + return (self._name, self._remote_name, self._timeout) def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] - self._name, self._remote_name = state + # Support both old 2-tuple and new 3-tuple state for pickle compat. + if len(state) == 2: + self._name, self._remote_name = state + self._timeout = 0 + else: + self._name, self._remote_name, self._timeout = state self._unsub_fns = [] self._rpc = None self._stop_rpc_client = None @@ -93,6 +104,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: + # Default timeout for all RPC calls (seconds). Override per-method via + # the module's rpc_timeouts dict. + default_rpc_timeout: float = 120.0 + def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] self.rpc = LCMRPC() self.actor_class = actor_class @@ -101,6 +116,8 @@ def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-unty self.rpcs = actor_class.rpcs.keys() self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] + # Merge module-level rpc_timeouts over the defaults from RPCSpec. 
+ self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: @@ -139,6 +156,7 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] if name in self.rpcs: original_method = getattr(self.actor_class, name, None) + timeout = self._rpc_timeouts.get(name, self.default_rpc_timeout) return RpcCall( original_method, self.rpc, @@ -146,6 +164,7 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] self.remote_name, self._unsub_fns, self.stop_rpc_client, + timeout=timeout, ) # return super().__getattr__(name) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 47ad77e825..d311e45c6a 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -43,13 +43,16 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # we expect to crash if we don't get a return value after 10 seconds - # but callers can override this timeout for extra long functions + # Default RPC timeout. Callers (RpcCall, DockerModule) resolve via + # rpc_timeouts dict; raw call_sync uses this as fallback. 
+ default_rpc_timeout: float = 120.0 + rpc_timeouts: dict[str, float] = {"start": 1200.0} + def call_sync( - self, name: str, arguments: Args, rpc_timeout: float | None = 120.0 + self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: - if name == "start": - rpc_timeout = 1200.0 # starting modules can take longer + if rpc_timeout is None: + rpc_timeout = self.rpc_timeouts.get(name, self.default_rpc_timeout) event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] From 747bbe2e897b515437705377c6a76693dfdab0d1 Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Sun, 15 Mar 2026 19:20:34 +0000 Subject: [PATCH 57/89] CI code cleanup --- dimos/core/docker_runner.py | 5 ++++- dimos/core/rpc_client.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index b879d29be1..ee98b59705 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -210,7 +210,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} + self._rpc_timeouts: dict[str, float] = { + **self.rpc.rpc_timeouts, + **getattr(module_class, "rpc_timeouts", {}), + } # Build or pull image, launch container, wait for RPC server try: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 4877a2acd9..417830a49c 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -117,7 +117,10 @@ def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-unty self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] # Merge module-level rpc_timeouts over the 
defaults from RPCSpec. - self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} + self._rpc_timeouts: dict[str, float] = { + **self.rpc.rpc_timeouts, + **getattr(actor_class, "rpc_timeouts", {}), + } def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: From c2d264350480a7f2ba8db4c8b3b782d0d13c511b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:40:33 -0700 Subject: [PATCH 58/89] fixup rpc timeouts, cause they matter for docker --- dimos/core/docker_runner.py | 15 +++------------ dimos/core/module.py | 11 +++-------- dimos/core/rpc_client.py | 30 +++++++++++------------------- dimos/protocol/rpc/pubsubrpc.py | 17 +++++++++-------- dimos/protocol/rpc/spec.py | 16 +++++++++++----- dimos/protocol/rpc/test_lcmrpc.py | 2 +- dimos/protocol/rpc/test_spec.py | 8 ++++---- 7 files changed, 42 insertions(+), 57 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index b879d29be1..cebb7fb49b 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -25,7 +25,7 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleConfig -from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall, RPCClient +from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT @@ -205,12 +205,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) - self.rpc = LCMRPC() + self.rpc = LCMRPC(rpc_timeouts=self.config.rpc_timeouts) self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - 
self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(module_class, "rpc_timeouts", {})} # Build or pull image, launch container, wait for RPC server try: @@ -267,9 +266,6 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non def get_rpc_method_names(self) -> list[str]: return self.rpc_calls - def _resolve_timeout(self, method: str) -> float: - return self._rpc_timeouts.get(method, RPCClient.default_rpc_timeout) - def set_rpc_method(self, method: str, callable: RpcCall) -> None: callable.set_rpc(self.rpc) self._bound_rpc_calls[method] = callable @@ -278,7 +274,6 @@ def set_rpc_method(self, method: str, callable: RpcCall) -> None: self.rpc.call_sync( f"{self.remote_name}/set_rpc_method", ([method, callable], {}), - rpc_timeout=self._resolve_timeout("set_rpc_method"), ) def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: @@ -291,9 +286,7 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: def start(self) -> None: """Invoke the remote module's start() RPC.""" try: - self.rpc.call_sync( - f"{self.remote_name}/start", ([], {}), rpc_timeout=self._resolve_timeout("start") - ) + self.rpc.call_sync(f"{self.remote_name}/start", ([], {})) except Exception: with suppress(Exception): self.stop() @@ -345,7 +338,6 @@ def set_transport(self, stream_name: str, transport: Any) -> bool: result, _ = self.rpc.call_sync( f"{self.remote_name}/set_transport", ([stream_name, transport], {}), - rpc_timeout=self._resolve_timeout("set_transport"), ) return bool(result) @@ -360,7 +352,6 @@ def __getattr__(self, name: str) -> Any: self.remote_name, self._unsub_fns, None, - timeout=self._resolve_timeout(name), ) raise AttributeError(f"{name} not found on {type(self).__name__}") diff --git a/dimos/core/module.py b/dimos/core/module.py index bcd61bd435..c6c557b825 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,7 +40,7 @@ from dimos.core.rpc_client import RpcCall from 
dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec from dimos.utils import colors @@ -79,6 +79,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC + rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -108,19 +109,13 @@ class ModuleBase(Configurable[ModuleConfigT], Resource): rpc_calls: list[str] = [] - # Per-method RPC timeout overrides (seconds). Keys are method names. - # Used by RPCClient when calling methods on this module from the host. - # Example: rpc_timeouts = {"on_system_modules": 600.0} - # Methods not listed here use RPCClient.default_rpc_timeout (120s). 
- rpc_timeouts: dict[str, float] = {} - def __init__(self, config_args: dict[str, Any]): super().__init__(**config_args) self._module_closed_lock = threading.Lock() self._loop, self._loop_thread = get_loop() self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport() + self.rpc = self.config.rpc_transport(rpc_timeouts=self.config.rpc_timeouts) self.rpc.serve_module_rpc(self) self.rpc.start() # type: ignore[attr-defined] except ValueError: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 4877a2acd9..3fd120a1fc 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -18,7 +18,7 @@ from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -39,14 +39,12 @@ def __init__( remote_name: str, unsub_fns: list, # type: ignore[type-arg] stop_client: Callable[[], None] | None = None, - timeout: float = 0, ) -> None: self._rpc = rpc self._name = name self._remote_name = remote_name self._unsub_fns = unsub_fns self._stop_rpc_client = stop_client - self._timeout = timeout if original_method: self.__doc__ = original_method.__doc__ @@ -72,21 +70,19 @@ def __call__(self, *args, **kwargs): # type: ignore[no-untyped-def] result, unsub_fn = self._rpc.call_sync( f"{self._remote_name}/{self._name}", (args, kwargs), # type: ignore[arg-type] - rpc_timeout=self._timeout, ) self._unsub_fns.append(unsub_fn) return result def __getstate__(self): # type: ignore[no-untyped-def] - return (self._name, self._remote_name, self._timeout) + return (self._name, self._remote_name) def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] - # Support both old 2-tuple and new 3-tuple state for pickle compat. 
- if len(state) == 2: - self._name, self._remote_name = state - self._timeout = 0 + # Support both old 2-tuple and new 3-tuple (legacy) state for pickle compat. + if len(state) == 3: + self._name, self._remote_name, _ = state else: - self._name, self._remote_name, self._timeout = state + self._name, self._remote_name = state self._unsub_fns = [] self._rpc = None self._stop_rpc_client = None @@ -95,6 +91,8 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" + rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS + def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -104,20 +102,16 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: - # Default timeout for all RPC calls (seconds). Override per-method via - # the module's rpc_timeouts dict. - default_rpc_timeout: float = 120.0 - def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] - self.rpc = LCMRPC() + default_config = getattr(actor_class, "default_config", None) + self.rpc_timeouts: dict[str, float] = getattr(default_config, "rpc_timeouts", DEFAULT_RPC_TIMEOUTS) + self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) self.actor_class = actor_class self.remote_name = actor_class.__name__ self.actor_instance = actor_instance self.rpcs = actor_class.rpcs.keys() self.rpc.start() self._unsub_fns = [] # type: ignore[var-annotated] - # Merge module-level rpc_timeouts over the defaults from RPCSpec. 
- self._rpc_timeouts: dict[str, float] = {**self.rpc.rpc_timeouts, **getattr(actor_class, "rpc_timeouts", {})} def stop_rpc_client(self) -> None: for unsub in self._unsub_fns: @@ -156,7 +150,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] if name in self.rpcs: original_method = getattr(self.actor_class, name, None) - timeout = self._rpc_timeouts.get(name, self.default_rpc_timeout) return RpcCall( original_method, self.rpc, @@ -164,7 +157,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] self.remote_name, self._unsub_fns, self.stop_rpc_client, - timeout=timeout, ) # return super().__getattr__(name) diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 3b77227218..c440710e5f 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -32,7 +32,7 @@ from dimos.protocol.pubsub.impl.shmpubsub import PickleSharedMemory from dimos.protocol.pubsub.spec import PubSub from dimos.protocol.rpc.rpc_utils import deserialize_exception, serialize_exception -from dimos.protocol.rpc.spec import Args, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, Args, RPCSpec from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -62,8 +62,9 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): - def __init__(self, *args: Any, **kwargs: Any) -> None: + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: super().__init__(*args, **kwargs) + self.rpc_timeouts = {**DEFAULT_RPC_TIMEOUTS, **rpc_timeouts} # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() @@ -290,12 +291,12 @@ def execute_and_respond() -> None: class LCMRPC(PubSubRPCMixin[Topic, Any], PickleLCM): - def __init__(self, **kwargs: Any) -> None: + def 
__init__(self, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: # Need to ensure PickleLCM gets initialized properly # This is due to the diamond inheritance pattern with multiple base classes PickleLCM.__init__(self, **kwargs) - # Initialize PubSubRPCMixin's thread pool - PubSubRPCMixin.__init__(self, **kwargs) + # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) + PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) def topicgen(self, name: str, req_or_res: bool) -> Topic: suffix = "res" if req_or_res else "req" @@ -306,12 +307,12 @@ def topicgen(self, name: str, req_or_res: bool) -> Topic: class ShmRPC(PubSubRPCMixin[str, Any], PickleSharedMemory): - def __init__(self, prefer: str = "cpu", **kwargs: Any) -> None: + def __init__(self, rpc_timeouts: dict[str, float], prefer: str = "cpu", **kwargs: Any) -> None: # Need to ensure SharedMemory gets initialized properly # This is due to the diamond inheritance pattern with multiple base classes PickleSharedMemory.__init__(self, prefer=prefer, **kwargs) - # Initialize PubSubRPCMixin's thread pool - PubSubRPCMixin.__init__(self, **kwargs) + # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) + PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) def topicgen(self, name: str, req_or_res: bool) -> str: suffix = "res" if req_or_res else "req" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index d311e45c6a..3d17d65948 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -30,6 +30,10 @@ class RPCInspectable(Protocol): def rpcs(self) -> dict[str, Callable]: ... 
# type: ignore[type-arg] +DEFAULT_RPC_TIMEOUT: float = 120.0 +DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} + + class RPCClient(Protocol): # if we don't provide callback, we don't get a return unsub f @overload @@ -43,16 +47,18 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # Default RPC timeout. Callers (RpcCall, DockerModule) resolve via - # rpc_timeouts dict; raw call_sync uses this as fallback. - default_rpc_timeout: float = 120.0 - rpc_timeouts: dict[str, float] = {"start": 1200.0} + # call_sync resolves per-method overrides from rpc_timeouts, + # falling back to default_rpc_timeout. + default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT + rpc_timeouts: dict[str, float] def call_sync( self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: if rpc_timeout is None: - rpc_timeout = self.rpc_timeouts.get(name, self.default_rpc_timeout) + # Try full topic name first, then bare method name (after last "/"). 
+ method = name.rsplit("/", 1)[-1] + rpc_timeout = self.rpc_timeouts.get(name, self.rpc_timeouts.get(method, self.default_rpc_timeout)) event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] diff --git a/dimos/protocol/rpc/test_lcmrpc.py b/dimos/protocol/rpc/test_lcmrpc.py index 5baa5ac40c..700618ab72 100644 --- a/dimos/protocol/rpc/test_lcmrpc.py +++ b/dimos/protocol/rpc/test_lcmrpc.py @@ -22,7 +22,7 @@ @pytest.fixture def lcmrpc() -> Generator[LCMRPC, None, None]: - ret = LCMRPC() + ret = LCMRPC(rpc_timeouts={}) ret.start() yield ret ret.stop() diff --git a/dimos/protocol/rpc/test_spec.py b/dimos/protocol/rpc/test_spec.py index cfee044548..12bdc98c85 100644 --- a/dimos/protocol/rpc/test_spec.py +++ b/dimos/protocol/rpc/test_spec.py @@ -46,8 +46,8 @@ def lcm_rpc_context(): from dimos.protocol.service.lcmservice import autoconf autoconf() - server = LCMRPC() - client = LCMRPC() + server = LCMRPC(rpc_timeouts={}) + client = LCMRPC(rpc_timeouts={}) server.start() client.start() @@ -65,8 +65,8 @@ def lcm_rpc_context(): def shm_rpc_context(): """Context manager for Shared Memory RPC implementation.""" # Create two separate instances that communicate through shared memory segments - server = ShmRPC(prefer="cpu") - client = ShmRPC(prefer="cpu") + server = ShmRPC(rpc_timeouts={}, prefer="cpu") + client = ShmRPC(rpc_timeouts={}, prefer="cpu") server.start() client.start() From 54d45920c7b161250f1d6b52a5a91a0e2327cf2c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:53:10 -0700 Subject: [PATCH 59/89] better matching logic for rpc_timeouts --- dimos/protocol/rpc/spec.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 3d17d65948..6b344d0719 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -57,8 +57,13 @@ def call_sync( ) -> tuple[Any, Callable[[], None]]: if rpc_timeout is None: # Try full topic name first, 
then bare method name (after last "/"). - method = name.rsplit("/", 1)[-1] - rpc_timeout = self.rpc_timeouts.get(name, self.rpc_timeouts.get(method, self.default_rpc_timeout)) + rpc_timeout = self.rpc_timeouts.get(name) + if rpc_timeout is None: + method = name.rsplit("/", 1)[-1] + if method is not name: + rpc_timeout = self.rpc_timeouts.get(method, self.default_rpc_timeout) + else: + rpc_timeout = self.default_rpc_timeout event = threading.Event() def receive_value(val) -> None: # type: ignore[no-untyped-def] From 159854568e70ed14e1140ecf4a49d9afd95bd007 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 12:54:08 -0700 Subject: [PATCH 60/89] enforce RPCSpec's to have rpc_timeouts in constructor --- dimos/protocol/rpc/spec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 6b344d0719..0f48cab05e 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -115,4 +115,5 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] self.serve_rpc(override_f, topic) -class RPCSpec(RPCServer, RPCClient): ... +class RPCSpec(RPCServer, RPCClient): + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: ... From 8a3684389163ea6a14ee994b4f85b77a3e08d9c5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:29:40 -0700 Subject: [PATCH 61/89] Remove pr-name-check from this branch Not related to rosnav feature work. 
--- bin/pr-name-check | 69 --------------------------------- dimos/core/docker_runner.py | 2 - dimos/core/rpc_client.py | 7 ++-- dimos/protocol/rpc/pubsubrpc.py | 5 +-- dimos/protocol/rpc/spec.py | 14 +++++-- 5 files changed, 17 insertions(+), 80 deletions(-) delete mode 100755 bin/pr-name-check diff --git a/bin/pr-name-check b/bin/pr-name-check deleted file mode 100755 index 0f67e6172a..0000000000 --- a/bin/pr-name-check +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -branch="$(git rev-parse --abbrev-ref HEAD)" - -# based on: https://github.com/dimensionalOS/wiki/wiki -allowed_types="feat fix chore refactor docs" -allowed_names="stash ivan paul alexl mustafa miguel christie ruthwik jalaj yashas yash matt jing juan jeff unknown" - -if [[ "$branch" != */*/* ]]; then - echo "Invalid branch name: '$branch'" - echo "Expected format: //" - echo "Allowed names: $allowed_names" - echo "Allowed types: $allowed_types" - exit 1 -fi - -branch_name="${branch%%/*}" -rest="${branch#*/}" -branch_type="${rest%%/*}" -branch_description="${branch#*/*/}" - -if [[ -z "$branch_description" || "$branch_description" == "$branch" ]]; then - echo "Invalid branch name: '$branch'" - echo "Expected format: //" - exit 1 -fi - -name_ok=0 -for n in $allowed_names; do - if [[ "$branch_name" == "$n" ]]; then - name_ok=1 - break - fi -done - -type_ok=0 -for t in $allowed_types; do - if [[ "$branch_type" == "$t" ]]; then - type_ok=1 - break - fi -done - -if [[ "$name_ok" -ne 1 || "$type_ok" -ne 1 ]]; then - echo - echo - echo - echo - echo - echo "Invalid branch name: '$branch'" - echo - echo " Expected format: //" - echo " Example: jeff/fix/ci-divergence" - echo " Parsed name: $branch_name" - echo " Allowed names: $allowed_names" - echo " Parsed type: $branch_type" - echo " Allowed types: $allowed_types" - echo - echo "Wait 4 seconds if you want to ignore this error" - sleep 1; echo 4 - sleep 1; echo 3 - sleep 1; echo 2 - sleep 1; echo 1 - exit 1 -else - echo "Branch 
naming check passed: $branch" -fi diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index cebb7fb49b..c5e1a929f0 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -43,7 +43,6 @@ DOCKER_CMD_TIMEOUT = 20 # Timeout for quick Docker commands (inspect, rm, logs) DOCKER_STATUS_TIMEOUT = 10 # Timeout for container status checks DOCKER_STOP_TIMEOUT = 30 # Timeout for `docker stop` command (graceful shutdown) -RPC_READY_TIMEOUT = 3.0 # Timeout for RPC readiness probe during container startup LOG_TAIL_LINES = 200 # Number of log lines to include in error messages @@ -529,7 +528,6 @@ def _wait_for_rpc(self) -> None: self.rpc.call_sync( f"{self.remote_name}/get_rpc_method_names", ([], {}), - rpc_timeout=RPC_READY_TIMEOUT, ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 3fd120a1fc..46354dd257 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -91,8 +91,6 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" - rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS - def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -104,7 +102,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... 
class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] default_config = getattr(actor_class, "default_config", None) - self.rpc_timeouts: dict[str, float] = getattr(default_config, "rpc_timeouts", DEFAULT_RPC_TIMEOUTS) + self.rpc_timeouts: dict[str, float] = { + **DEFAULT_RPC_TIMEOUTS, + **getattr(default_config, "rpc_timeouts", {}), + } self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) self.actor_class = actor_class self.remote_name = actor_class.__name__ diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index c440710e5f..628c5b0a0b 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -32,7 +32,7 @@ from dimos.protocol.pubsub.impl.shmpubsub import PickleSharedMemory from dimos.protocol.pubsub.spec import PubSub from dimos.protocol.rpc.rpc_utils import deserialize_exception, serialize_exception -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, Args, RPCSpec +from dimos.protocol.rpc.spec import Args, RPCSpec from dimos.utils.generic import short_id from dimos.utils.logging_config import setup_logger @@ -63,8 +63,7 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.rpc_timeouts = {**DEFAULT_RPC_TIMEOUTS, **rpc_timeouts} + super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 0f48cab05e..a4d7e614e8 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -29,12 +29,19 @@ class RPCInspectable(Protocol): @property def rpcs(self) -> dict[str, Callable]: ... 
# type: ignore[type-arg] - DEFAULT_RPC_TIMEOUT: float = 120.0 DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} - class RPCClient(Protocol): + # call_sync resolves per-method overrides from rpc_timeouts, + # falling back to default_rpc_timeout. + rpc_timeouts: dict[str, float] + default_rpc_timeout: float + + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.rpc_timeouts = dict(rpc_timeouts) + # if we don't provide callback, we don't get a return unsub f @overload def call(self, name: str, arguments: Args, cb: None) -> None: ... @@ -116,4 +123,5 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: ... + def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) From 7ad090fcd4eadb7b0f8d55c91c17f0b3c26305ad Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:38:40 -0700 Subject: [PATCH 62/89] fixup rpc timeouts --- dimos/core/docker_runner.py | 5 ++++- dimos/core/module.py | 10 ++++++--- dimos/core/rpc_client.py | 11 +++++----- dimos/protocol/rpc/pubsubrpc.py | 34 ++++++++++++++++++++----------- dimos/protocol/rpc/spec.py | 27 +++++++++++++++--------- dimos/protocol/rpc/test_lcmrpc.py | 3 ++- dimos/protocol/rpc/test_spec.py | 9 ++++---- 7 files changed, 62 insertions(+), 37 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index c5e1a929f0..30468bccd5 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -204,7 +204,10 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non or f"dimos_{module_class.__name__.lower()}_{image_ref.replace(':', '_')}" ) - self.rpc = LCMRPC(rpc_timeouts=self.config.rpc_timeouts) + self.rpc = LCMRPC( + 
rpc_timeouts=self.config.rpc_timeouts, + default_rpc_timeout=self.config.default_rpc_timeout, + ) self.rpcs = set(module_class.rpcs.keys()) # type: ignore[attr-defined] self.rpc_calls: list[str] = getattr(module_class, "rpc_calls", []) self._unsub_fns: list[Callable[[], None]] = [] diff --git a/dimos/core/module.py b/dimos/core/module.py index c6c557b825..64f7dd65cf 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,7 +40,7 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec from dimos.utils import colors @@ -79,7 +79,8 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC - rpc_timeouts: dict[str, float] = DEFAULT_RPC_TIMEOUTS + default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT + rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -115,7 +116,10 @@ def __init__(self, config_args: dict[str, Any]): self._loop, self._loop_thread = get_loop() self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport(rpc_timeouts=self.config.rpc_timeouts) + self.rpc = self.config.rpc_transport( + rpc_timeouts=self.config.rpc_timeouts, + default_rpc_timeout=self.config.default_rpc_timeout, + ) self.rpc.serve_module_rpc(self) self.rpc.start() # type: ignore[attr-defined] except ValueError: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 46354dd257..7ac34bb645 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -18,7 
+18,7 @@ from dimos.core.stream import RemoteStream from dimos.core.worker import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC -from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUTS, RPCSpec +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -102,11 +102,10 @@ def get_rpc_calls(self, *methods: str) -> RpcCall | tuple[RpcCall, ...]: ... class RPCClient: def __init__(self, actor_instance, actor_class) -> None: # type: ignore[no-untyped-def] default_config = getattr(actor_class, "default_config", None) - self.rpc_timeouts: dict[str, float] = { - **DEFAULT_RPC_TIMEOUTS, - **getattr(default_config, "rpc_timeouts", {}), - } - self.rpc = LCMRPC(rpc_timeouts=self.rpc_timeouts) + self.rpc = LCMRPC( + rpc_timeouts=getattr(default_config, "rpc_timeouts", dict(DEFAULT_RPC_TIMEOUTS)), + default_rpc_timeout=getattr(default_config, "default_rpc_timeout", DEFAULT_RPC_TIMEOUT), + ) self.actor_class = actor_class self.remote_name = actor_class.__name__ self.actor_instance = actor_instance diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 628c5b0a0b..565a9af227 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -62,8 +62,12 @@ class RPCRes(TypedDict, total=False): class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: + super().__init__( + *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = 
threading.RLock() @@ -290,12 +294,13 @@ def execute_and_respond() -> None: class LCMRPC(PubSubRPCMixin[Topic, Any], PickleLCM): - def __init__(self, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - # Need to ensure PickleLCM gets initialized properly - # This is due to the diamond inheritance pattern with multiple base classes + def __init__( + self, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: PickleLCM.__init__(self, **kwargs) - # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) - PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) + PubSubRPCMixin.__init__( + self, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) def topicgen(self, name: str, req_or_res: bool) -> Topic: suffix = "res" if req_or_res else "req" @@ -306,12 +311,17 @@ def topicgen(self, name: str, req_or_res: bool) -> Topic: class ShmRPC(PubSubRPCMixin[str, Any], PickleSharedMemory): - def __init__(self, rpc_timeouts: dict[str, float], prefer: str = "cpu", **kwargs: Any) -> None: - # Need to ensure SharedMemory gets initialized properly - # This is due to the diamond inheritance pattern with multiple base classes + def __init__( + self, + rpc_timeouts: dict[str, float], + default_rpc_timeout: float, + prefer: str = "cpu", + **kwargs: Any, + ) -> None: PickleSharedMemory.__init__(self, prefer=prefer, **kwargs) - # Initialize PubSubRPCMixin's thread pool (merges rpc_timeouts with defaults) - PubSubRPCMixin.__init__(self, rpc_timeouts=rpc_timeouts, **kwargs) + PubSubRPCMixin.__init__( + self, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) def topicgen(self, name: str, req_or_res: bool) -> str: suffix = "res" if req_or_res else "req" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index a4d7e614e8..f80f77bf3a 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -15,6 +15,7 @@ import asyncio from 
collections.abc import Callable import threading +from types import MappingProxyType from typing import Any, Protocol, overload @@ -29,18 +30,25 @@ class RPCInspectable(Protocol): @property def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] + +# module.py and other places imports these constants and choose what to give RPCClient +# the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: dict[str, float] = {"start": 1200.0} +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({"start": 1200.0}) + class RPCClient(Protocol): # call_sync resolves per-method overrides from rpc_timeouts, # falling back to default_rpc_timeout. rpc_timeouts: dict[str, float] default_rpc_timeout: float - - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: + + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: super().__init__(*args, **kwargs) self.rpc_timeouts = dict(rpc_timeouts) + self.default_rpc_timeout = default_rpc_timeout # if we don't provide callback, we don't get a return unsub f @overload @@ -54,11 +62,6 @@ def call(self, name: str, arguments: Args, cb: Callable | None) -> Callable[[], def call_nowait(self, name: str, arguments: Args) -> None: ... - # call_sync resolves per-method overrides from rpc_timeouts, - # falling back to default_rpc_timeout. 
- default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] - def call_sync( self, name: str, arguments: Args, rpc_timeout: float | None = None ) -> tuple[Any, Callable[[], None]]: @@ -123,5 +126,9 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__(self, *args: Any, rpc_timeouts: dict[str, float], **kwargs: Any) -> None: - super().__init__(*args, rpc_timeouts=rpc_timeouts, **kwargs) + def __init__( + self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any + ) -> None: + super().__init__( + *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs + ) diff --git a/dimos/protocol/rpc/test_lcmrpc.py b/dimos/protocol/rpc/test_lcmrpc.py index 700618ab72..3c2b87761d 100644 --- a/dimos/protocol/rpc/test_lcmrpc.py +++ b/dimos/protocol/rpc/test_lcmrpc.py @@ -18,11 +18,12 @@ from dimos.constants import LCM_MAX_CHANNEL_NAME_LENGTH from dimos.protocol.rpc.pubsubrpc import LCMRPC +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT @pytest.fixture def lcmrpc() -> Generator[LCMRPC, None, None]: - ret = LCMRPC(rpc_timeouts={}) + ret = LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) ret.start() yield ret ret.stop() diff --git a/dimos/protocol/rpc/test_spec.py b/dimos/protocol/rpc/test_spec.py index 12bdc98c85..0b374f7d6c 100644 --- a/dimos/protocol/rpc/test_spec.py +++ b/dimos/protocol/rpc/test_spec.py @@ -27,6 +27,7 @@ from dimos.protocol.rpc.pubsubrpc import LCMRPC, ShmRPC from dimos.protocol.rpc.rpc_utils import RemoteError +from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT class CustomTestError(Exception): @@ -46,8 +47,8 @@ def lcm_rpc_context(): from dimos.protocol.service.lcmservice import autoconf autoconf() - server = LCMRPC(rpc_timeouts={}) - client = LCMRPC(rpc_timeouts={}) + server = LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) + client = 
LCMRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT) server.start() client.start() @@ -65,8 +66,8 @@ def lcm_rpc_context(): def shm_rpc_context(): """Context manager for Shared Memory RPC implementation.""" # Create two separate instances that communicate through shared memory segments - server = ShmRPC(rpc_timeouts={}, prefer="cpu") - client = ShmRPC(rpc_timeouts={}, prefer="cpu") + server = ShmRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT, prefer="cpu") + client = ShmRPC(rpc_timeouts={}, default_rpc_timeout=DEFAULT_RPC_TIMEOUT, prefer="cpu") server.start() client.start() From d0563a89f6e6ee5382ebec4007c4fad3420c11b4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:40:21 -0700 Subject: [PATCH 63/89] mypy issue on dev --- dimos/core/resource.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dimos/core/resource.py b/dimos/core/resource.py index 63b1eec4f0..a4c008b806 100644 --- a/dimos/core/resource.py +++ b/dimos/core/resource.py @@ -15,7 +15,13 @@ from __future__ import annotations from abc import abstractmethod -from typing import TYPE_CHECKING, Self +import sys +from typing import TYPE_CHECKING + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self if TYPE_CHECKING: from types import TracebackType From 639e90c9caa030779de5071c3a9dd3e509ea7ca4 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 13:40:34 -0700 Subject: [PATCH 64/89] equality --- dimos/protocol/rpc/spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index f80f77bf3a..f833b032ad 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -70,7 +70,7 @@ def call_sync( rpc_timeout = self.rpc_timeouts.get(name) if rpc_timeout is None: method = name.rsplit("/", 1)[-1] - if method is not name: + if method != name: rpc_timeout = self.rpc_timeouts.get(method, 
self.default_rpc_timeout) else: rpc_timeout = self.default_rpc_timeout From 5c85dc20e0c00d0fa6f7e04b8e3d9357f3d9e7c1 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 14:26:15 -0700 Subject: [PATCH 65/89] fix: docker module init + rpc timeout bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove @dataclass(kw_only=True) from HelloDockerConfig (conflicts with Pydantic) - Pop global_config from kwargs before passing to config class - Store rpc_timeouts/default_rpc_timeout in PubSubRPCMixin (not Protocol) - Remove __init__ from RPCClient Protocol and RPCSpec (structural typing only) - Use short 3s timeout for readiness probe polling (was using 120s default) - Extract NavigationStrategy/VlModelName into lightweight types.py files (same fix as jeff/fix/help — prevents torch import in Docker containers) --- dimos/core/docker_runner.py | 4 ++++ dimos/core/global_config.py | 4 ++-- dimos/mapping/occupancy/path_map.py | 3 +-- dimos/mapping/occupancy/types.py | 3 +++ dimos/models/vl/create.py | 4 ++-- dimos/models/vl/types.py | 3 +++ dimos/protocol/rpc/pubsubrpc.py | 6 +++--- dimos/protocol/rpc/spec.py | 17 +++-------------- examples/docker_hello_world/hello_docker.py | 3 +-- 9 files changed, 22 insertions(+), 25 deletions(-) create mode 100644 dimos/mapping/occupancy/types.py create mode 100644 dimos/models/vl/types.py diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 30468bccd5..3efc05f316 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -183,6 +183,9 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non image_exists, ) + # global_config is passed by deploy pipeline but isn't a config field + kwargs.pop("global_config", None) + config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): raise TypeError( @@ -531,6 +534,7 @@ def _wait_for_rpc(self) -> None: 
self.rpc.call_sync( f"{self.remote_name}/get_rpc_method_names", ([], {}), + rpc_timeout=3.0, # short timeout for polling readiness ) elapsed = time.time() - start_time logger.info(f"{self.remote_name} ready ({elapsed:.1f}s)") diff --git a/dimos/core/global_config.py b/dimos/core/global_config.py index 60072ae7fd..49f4d4f325 100644 --- a/dimos/core/global_config.py +++ b/dimos/core/global_config.py @@ -17,8 +17,8 @@ from pydantic_settings import BaseSettings, SettingsConfigDict -from dimos.mapping.occupancy.path_map import NavigationStrategy -from dimos.models.vl.create import VlModelName +from dimos.mapping.occupancy.types import NavigationStrategy +from dimos.models.vl.types import VlModelName ViewerBackend: TypeAlias = Literal["rerun", "rerun-web", "rerun-connect", "foxglove", "none"] diff --git a/dimos/mapping/occupancy/path_map.py b/dimos/mapping/occupancy/path_map.py index a99a423de8..7392030298 100644 --- a/dimos/mapping/occupancy/path_map.py +++ b/dimos/mapping/occupancy/path_map.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Literal, TypeAlias +from dimos.mapping.occupancy.types import NavigationStrategy from dimos.mapping.occupancy.gradient import voronoi_gradient from dimos.mapping.occupancy.inflation import simple_inflate from dimos.mapping.occupancy.operations import overlay_occupied, smooth_occupied from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid -NavigationStrategy: TypeAlias = Literal["simple", "mixed"] def make_navigation_map( diff --git a/dimos/mapping/occupancy/types.py b/dimos/mapping/occupancy/types.py new file mode 100644 index 0000000000..e6b7d5bd6b --- /dev/null +++ b/dimos/mapping/occupancy/types.py @@ -0,0 +1,3 @@ +from typing import Literal, TypeAlias + +NavigationStrategy: TypeAlias = Literal["simple", "mixed"] diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index 6c778d4104..bb14758bcb 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -1,8 +1,8 @@ -from typing import Any, Literal +from typing import Any from dimos.models.vl.base import VlModel -VlModelName = Literal["qwen", "moondream"] +from dimos.models.vl.types import VlModelName def create(name: VlModelName) -> VlModel[Any]: diff --git a/dimos/models/vl/types.py b/dimos/models/vl/types.py new file mode 100644 index 0000000000..ac8b0f024d --- /dev/null +++ b/dimos/models/vl/types.py @@ -0,0 +1,3 @@ +from typing import Literal + +VlModelName = Literal["qwen", "moondream"] diff --git a/dimos/protocol/rpc/pubsubrpc.py b/dimos/protocol/rpc/pubsubrpc.py index 565a9af227..52cb89a199 100644 --- a/dimos/protocol/rpc/pubsubrpc.py +++ b/dimos/protocol/rpc/pubsubrpc.py @@ -65,9 +65,9 @@ class PubSubRPCMixin(RPCSpec, PubSub[TopicT, MsgT], Generic[TopicT, MsgT]): def __init__( self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any ) -> None: - super().__init__( - *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs - ) + super().__init__(*args, **kwargs) + self.rpc_timeouts = 
dict(rpc_timeouts) + self.default_rpc_timeout = default_rpc_timeout # Thread pool for RPC handler execution (prevents deadlock in nested calls) self._call_thread_pool: ThreadPoolExecutor | None = None self._call_thread_pool_lock = threading.RLock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index f833b032ad..993f6044bb 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -39,17 +39,11 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] class RPCClient(Protocol): # call_sync resolves per-method overrides from rpc_timeouts, - # falling back to default_rpc_timeout. + # falling back to default_rpc_timeout. These are set by + # PubSubRPCMixin.__init__ at runtime. rpc_timeouts: dict[str, float] default_rpc_timeout: float - def __init__( - self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any - ) -> None: - super().__init__(*args, **kwargs) - self.rpc_timeouts = dict(rpc_timeouts) - self.default_rpc_timeout = default_rpc_timeout - # if we don't provide callback, we don't get a return unsub f @overload def call(self, name: str, arguments: Args, cb: None) -> None: ... 
@@ -126,9 +120,4 @@ def override_f(*args, fname=fname, **kwargs): # type: ignore[no-untyped-def] class RPCSpec(RPCServer, RPCClient): - def __init__( - self, *args: Any, rpc_timeouts: dict[str, float], default_rpc_timeout: float, **kwargs: Any - ) -> None: - super().__init__( - *args, rpc_timeouts=rpc_timeouts, default_rpc_timeout=default_rpc_timeout, **kwargs - ) + pass diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 6c30228089..0fb56959ea 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -31,7 +31,7 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import field from pathlib import Path import subprocess import time @@ -45,7 +45,6 @@ from dimos.core.stream import In, Out -@dataclass(kw_only=True) class HelloDockerConfig(DockerModuleConfig): docker_image: str = "dimos-hello-docker:latest" docker_file: Path | None = Path(__file__).parent / "Dockerfile" From 9668e3afda98ab9af0643505e8fdaabe7ca1d068 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sun, 15 Mar 2026 14:54:51 -0700 Subject: [PATCH 66/89] fix(example): use 'cowsay' not '/usr/games/cowsay' per review Address Paul's review comment to use check_output with plain 'cowsay'. 
--- examples/docker_hello_world/hello_docker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/docker_hello_world/hello_docker.py b/examples/docker_hello_world/hello_docker.py index 0fb56959ea..a9913d770b 100644 --- a/examples/docker_hello_world/hello_docker.py +++ b/examples/docker_hello_world/hello_docker.py @@ -73,7 +73,7 @@ def start(self) -> None: def _cowsay(self, text: str) -> str: """Run cowsay inside the container and return the ASCII art.""" - return subprocess.check_output(["/usr/games/cowsay", text], text=True) + return subprocess.check_output(["cowsay", text], text=True) def _on_prompt(self, text: str) -> None: art = self._cowsay(text) From fba0a7128ad99f7656e58bf409930067c482d85b Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Mon, 16 Mar 2026 17:51:48 +0000 Subject: [PATCH 67/89] CI code cleanup --- dimos/mapping/occupancy/path_map.py | 4 +--- dimos/mapping/occupancy/types.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/dimos/mapping/occupancy/path_map.py b/dimos/mapping/occupancy/path_map.py index 7392030298..a1a4640007 100644 --- a/dimos/mapping/occupancy/path_map.py +++ b/dimos/mapping/occupancy/path_map.py @@ -12,15 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dimos.mapping.occupancy.types import NavigationStrategy - from dimos.mapping.occupancy.gradient import voronoi_gradient from dimos.mapping.occupancy.inflation import simple_inflate from dimos.mapping.occupancy.operations import overlay_occupied, smooth_occupied +from dimos.mapping.occupancy.types import NavigationStrategy from dimos.msgs.nav_msgs.OccupancyGrid import OccupancyGrid - def make_navigation_map( occupancy_grid: OccupancyGrid, robot_width: float, strategy: NavigationStrategy ) -> OccupancyGrid: diff --git a/dimos/mapping/occupancy/types.py b/dimos/mapping/occupancy/types.py index e6b7d5bd6b..87f2084698 100644 --- a/dimos/mapping/occupancy/types.py +++ b/dimos/mapping/occupancy/types.py @@ -1,3 +1,17 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import Literal, TypeAlias NavigationStrategy: TypeAlias = Literal["simple", "mixed"] From 593c4180c17c3a857e1ef023e9a5ac264915731d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 17 Mar 2026 10:27:10 -0700 Subject: [PATCH 68/89] fix: address Paul's PR review comments - Use strict=True instead of strict=False in zip() calls (module_coordinator.py) - Fix mutable default dict for rpc_timeouts using Field(default_factory=...) 
(module.py) - Remove unnecessary getattr() for _unsub_fns in _cleanup() (docker_runner.py) - Use threading.Event instead of bool for _running flag (docker_runner.py) - Rename global_config kwarg to g to match ModuleConfig field name (docker_runner.py, module_coordinator.py, docker_worker_manager.py) - Move inline test imports to top of file (test_docker_deployment.py) - Sort imports in hello_docker.py example --- dimos/core/docker_runner.py | 21 +++++++++--------- dimos/core/docker_worker_manager.py | 2 +- dimos/core/module.py | 3 ++- dimos/core/module_coordinator.py | 8 +++---- dimos/core/tests/test_docker_deployment.py | 25 +++++++--------------- 5 files changed, 25 insertions(+), 34 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3efc05f316..fb5770325b 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -183,8 +183,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non image_exists, ) - # global_config is passed by deploy pipeline but isn't a config field - kwargs.pop("global_config", None) + # g (GlobalConfig) is passed by deploy pipeline but handled by the base config + kwargs.pop("g", None) config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): @@ -198,7 +198,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.config = config self._args = args self._kwargs = kwargs - self._running = False + self._running = threading.Event() self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -259,7 +259,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) self.rpc.start() - self._running = True + self._running.set() # docker run 
-d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. self._wait_for_rpc() @@ -299,9 +299,9 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: + if not self._running.is_set(): return - self._running = False # claim shutdown before any side-effects + self._running.clear() # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() @@ -310,11 +310,10 @@ def _cleanup(self) -> None: """Release all resources. Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in getattr(self, "_unsub_fns", []): + for unsub in self._unsub_fns: with suppress(Exception): unsub() - with suppress(Exception): - self._unsub_fns.clear() + self._unsub_fns.clear() if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( @@ -323,7 +322,7 @@ def _cleanup(self) -> None: ) with suppress(Exception): _remove_container(self.config, self._container_name) - self._running = False + self._running.clear() logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: @@ -332,7 +331,7 @@ def status(self) -> dict[str, Any]: "module": self.remote_name, "container_name": self._container_name, "image": cfg.docker_image, - "running": bool(self._running and _is_container_running(cfg, self._container_name)), + "running": self._running.is_set() and _is_container_running(cfg, self._container_name), } def tail_logs(self, n: int = 200) -> str: diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 520468182f..94a5793c3d 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -47,6 +47,6 @@ def _on_errors( return safe_thread_map( specs, - 
lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] + lambda spec: DockerModule(spec[0], g=spec[1], **spec[2]), # type: ignore[arg-type] _on_errors, ) diff --git a/dimos/core/module.py b/dimos/core/module.py index 64f7dd65cf..2e03e2484e 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -30,6 +30,7 @@ ) from langchain_core.tools import tool +from pydantic import Field from reactivex.disposable import CompositeDisposable from dimos.core.core import T, rpc @@ -80,7 +81,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) + rpc_timeouts: dict[str, float] = Field(default_factory=lambda: dict(DEFAULT_RPC_TIMEOUTS)) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 43e3e44f0a..d2d1db67be 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -131,7 +131,7 @@ def deploy( deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, global_config=global_config, **kwargs) # type: ignore[arg-type] + deployed_module = DockerModule(module_class, g=global_config, **kwargs) # type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] @@ -165,7 +165,7 @@ def _deploy_workers() -> None: return assert self._client is not None for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=False + worker_indices, self._client.deploy_parallel(worker_specs), strict=True ): results[index] = module @@ -173,12 +173,12 @@ def 
_deploy_docker() -> None: if not docker_specs: return for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True ): results[index] = module def _register() -> None: - for (module_class, _, _), module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=True): if module is not None: self._deployed_modules[module_class] = module diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a3bb0b716d..3dfb9242c6 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -22,14 +22,16 @@ from __future__ import annotations from pathlib import Path +import threading from unittest.mock import MagicMock, patch import pytest -from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.rpc_client import RpcCall from dimos.core.stream import Out # -- Fixtures: fake module classes ------------------------------------------- @@ -91,9 +93,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with( - FakeDockerModule, global_config=global_config - ) + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() assert result is mock_dm @@ -198,7 +198,6 @@ class 
TestDockerModuleGetattr: def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure @@ -207,7 +206,6 @@ def test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse @@ -216,9 +214,6 @@ def test_getattr_no_recursion_on_cleanup_attrs(self): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - from dimos.core.docker_runner import DockerModule - from dimos.core.rpc_client import RpcCall - dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -235,8 +230,6 @@ def do_thing(self) -> None: ... 
assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - from dimos.core.docker_runner import DockerModule - dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -248,11 +241,10 @@ class TestDockerModuleCleanupReconnect: """Tests for DockerModule._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - from dimos.core.docker_runner import DockerModule - with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event() + dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() @@ -269,11 +261,10 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - from dimos.core.docker_runner import DockerModule - with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event() + dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() From 427816618a935917e71ddaf11258fbc8229a8016 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 02:42:22 -0700 Subject: [PATCH 69/89] fix(ci): fix _DummyRPC init and mypy type-ignore for rpc_transport kwargs - Add __init__(**kwargs) to _DummyRPC in test_sim_module.py to accept rpc_timeouts/default_rpc_timeout kwargs passed by Module.__init__ - Add type: ignore[call-arg] for RPCSpec Protocol constructor call --- dimos/core/module.py | 2 +- dimos/simulation/manipulators/test_sim_module.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dimos/core/module.py b/dimos/core/module.py index 2e03e2484e..59c8833ea8 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -117,7 +117,7 @@ def __init__(self, config_args: dict[str, Any]): self._loop, self._loop_thread = get_loop() 
self._disposables = CompositeDisposable() try: - self.rpc = self.config.rpc_transport( + self.rpc = self.config.rpc_transport( # type: ignore[call-arg] rpc_timeouts=self.config.rpc_timeouts, default_rpc_timeout=self.config.default_rpc_timeout, ) diff --git a/dimos/simulation/manipulators/test_sim_module.py b/dimos/simulation/manipulators/test_sim_module.py index 951d4790e3..54d8f21da3 100644 --- a/dimos/simulation/manipulators/test_sim_module.py +++ b/dimos/simulation/manipulators/test_sim_module.py @@ -22,6 +22,9 @@ class _DummyRPC(RPCSpec): + def __init__(self, **kwargs: object) -> None: # type: ignore[no-untyped-def] + pass + def serve_module_rpc(self, _module) -> None: # type: ignore[no-untyped-def] return None From 47737b09eec1fec17bdce898a6a74c4c5d48fcdf Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 03:15:27 -0700 Subject: [PATCH 70/89] fix(mypy): add __all__ to vl/create.py for explicit VlModelName export --- dimos/models/vl/create.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dimos/models/vl/create.py b/dimos/models/vl/create.py index bb14758bcb..9d2a908532 100644 --- a/dimos/models/vl/create.py +++ b/dimos/models/vl/create.py @@ -1,9 +1,10 @@ from typing import Any from dimos.models.vl.base import VlModel - from dimos.models.vl.types import VlModelName +__all__ = ["VlModelName", "create"] + def create(name: VlModelName) -> VlModel[Any]: # This uses inline imports to only import what's needed. From 157ce93717695bb9145f5260cc561bc408a3a6ce Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 04:27:47 -0700 Subject: [PATCH 71/89] fix(test): wrap coordinator in try/finally for proper cleanup on test failure Address Paul's review comment: if an assertion fails before coordinator.stop(), cleanup won't run. Use try/finally to ensure stop() is always called, even when tests fail. 
--- dimos/core/tests/test_docker_deployment.py | 92 +++++++++++----------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 3dfb9242c6..d8eb9448ff 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -87,19 +87,19 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ coordinator = ModuleCoordinator() coordinator.start() - - result = coordinator.deploy(FakeDockerModule) - - # Should NOT go through worker manager - mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModule (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() - assert result is mock_dm - assert coordinator.get_instance(FakeDockerModule) is mock_dm - - coordinator.stop() + try: + result = coordinator.deploy(FakeDockerModule) + + # Should NOT go through worker manager + mock_worker_mgr.deploy.assert_not_called() + # Should construct a DockerModule (container launch happens inside __init__) + mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() + assert result is mock_dm + assert coordinator.get_instance(FakeDockerModule) is mock_dm + finally: + coordinator.stop() @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") @@ -114,11 +114,11 @@ def test_deploy_docker_propagates_constructor_failure( coordinator = ModuleCoordinator() coordinator.start() - - with pytest.raises(RuntimeError, match="launch failed"): - coordinator.deploy(FakeDockerModule) - - coordinator.stop() + try: + with pytest.raises(RuntimeError, match="launch 
failed"): + coordinator.deploy(FakeDockerModule) + finally: + coordinator.stop() @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): @@ -129,13 +129,13 @@ def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manage coordinator = ModuleCoordinator() coordinator.start() + try: + result = coordinator.deploy(FakeRegularModule) - result = coordinator.deploy(FakeRegularModule) - - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) - assert result is mock_proxy - - coordinator.stop() + mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) + assert result is mock_proxy + finally: + coordinator.stop() @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") @patch("dimos.core.module_coordinator.WorkerManager") @@ -153,25 +153,25 @@ def test_deploy_parallel_separates_docker_and_regular( coordinator = ModuleCoordinator() coordinator.start() - - specs = [ - (FakeRegularModule, (), {}), - (FakeDockerModule, (), {}), - ] - results = coordinator.deploy_parallel(specs) - - # Regular module goes through worker manager - mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker specs go through DockerWorkerManager - mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() - - # Results preserve input order - assert results[0] is regular_proxy - assert results[1] is mock_dm - - coordinator.stop() + try: + specs = [ + (FakeRegularModule, (), {}), + (FakeDockerModule, (), {}), + ] + results = coordinator.deploy_parallel(specs) + + # Regular module goes through worker manager + mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + # Docker specs go through DockerWorkerManager + 
mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) + # start() is NOT called during deploy — it's called in start_all_modules + mock_dm.start.assert_not_called() + + # Results preserve input order + assert results[0] is regular_proxy + assert results[1] is mock_dm + finally: + coordinator.stop() @patch("dimos.core.docker_runner.DockerModule") @patch("dimos.core.module_coordinator.WorkerManager") @@ -184,8 +184,10 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke coordinator = ModuleCoordinator() coordinator.start() - coordinator.deploy(FakeDockerModule) - coordinator.stop() + try: + coordinator.deploy(FakeDockerModule) + finally: + coordinator.stop() # stop() called exactly once (no double cleanup) assert mock_dm.stop.call_count == 1 From 07b33dd8fd67fd7899bc8a3bf0f6a96c0afda61b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 19 Mar 2026 16:49:01 -0700 Subject: [PATCH 72/89] add build --- dimos/core/blueprints.py | 1 + dimos/core/docker_runner.py | 48 ++++++++++++++-------- dimos/core/docker_worker_manager.py | 11 ++--- dimos/core/module.py | 13 +++++- dimos/core/module_coordinator.py | 24 +++++++++-- dimos/core/rpc_client.py | 2 + dimos/core/tests/test_docker_deployment.py | 16 +++----- dimos/protocol/rpc/spec.py | 5 ++- 8 files changed, 82 insertions(+), 38 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index cac8507881..823488c611 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -494,6 +494,7 @@ def build( self._connect_rpc_methods(module_coordinator) self._connect_module_refs(module_coordinator) + module_coordinator.build_all_modules() module_coordinator.start_all_modules() return module_coordinator diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 3efc05f316..3e7376a66d 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -167,7 +167,9 @@ class DockerModule(ModuleProxyProtocol): Host-side handle 
for a module running inside Docker. Lifecycle: - - start(): builds the image if needed, launches the container, waits for readiness, calls the remote module's start() RPC (after streams are wired) + - __init__(): lightweight setup — config, names, RPC client, no side-effects + - build(): heavy work — docker build/pull image, launch container, wait for RPC readiness + - start(): invoke remote module's start() RPC (after streams are wired) - stop(): stops the container and cleans up Communication: All RPC happens via LCM multicast (requires --network=host). @@ -176,13 +178,6 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - from dimos.core.docker_build import ( - _compute_build_hash, - _get_image_build_hash, - build_image, - image_exists, - ) - # global_config is passed by deploy pipeline but isn't a config field kwargs.pop("global_config", None) @@ -198,7 +193,8 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self.config = config self._args = args self._kwargs = kwargs - self._running = False + self._running = threading.Event() + self._is_built = False self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -216,7 +212,23 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._unsub_fns: list[Callable[[], None]] = [] self._bound_rpc_calls: dict[str, RpcCall] = {} - # Build or pull image, launch container, wait for RPC server + def build(self) -> None: + """Build/pull docker image, launch container, wait for RPC readiness. + + Idempotent — safe to call multiple times. Has no RPC timeout since + this runs host-side (not via RPC to a worker process). 
+ """ + if self._is_built: + return + + from dimos.core.docker_build import ( + _compute_build_hash, + _get_image_build_hash, + build_image, + image_exists, + ) + + config = self.config try: if config.docker_file is not None: current_hash = _compute_build_hash(config) @@ -259,10 +271,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non f"Failed to start container.\nSTDOUT:\n{r.stdout}\nSTDERR:\n{r.stderr}" ) self.rpc.start() - self._running = True + self._running.set() # docker run -d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. self._wait_for_rpc() + self._is_built = True except Exception: with suppress(Exception): self._cleanup() @@ -299,9 +312,9 @@ def start(self) -> None: def stop(self) -> None: """Gracefully stop the Docker container and clean up resources.""" - if not self._running: + if not self._running.is_set(): return - self._running = False # claim shutdown before any side-effects + self._running.clear() # claim shutdown before any side-effects with suppress(Exception): self.rpc.call_nowait(f"{self.remote_name}/stop", ([], {})) self._cleanup() @@ -310,11 +323,10 @@ def _cleanup(self) -> None: """Release all resources. 
Idempotent — safe to call from partial init or after stop().""" with suppress(Exception): self.rpc.stop() - for unsub in getattr(self, "_unsub_fns", []): + for unsub in self._unsub_fns: with suppress(Exception): unsub() - with suppress(Exception): - self._unsub_fns.clear() + self._unsub_fns.clear() if not getattr(getattr(self, "config", None), "docker_reconnect_container", False): with suppress(Exception): _run( @@ -323,7 +335,7 @@ def _cleanup(self) -> None: ) with suppress(Exception): _remove_container(self.config, self._container_name) - self._running = False + self._running.clear() logger.info(f"Cleaned up container handle: {self._container_name}") def status(self) -> dict[str, Any]: @@ -332,7 +344,7 @@ def status(self) -> dict[str, Any]: "module": self.remote_name, "container_name": self._container_name, "image": cfg.docker_image, - "running": bool(self._running and _is_container_running(cfg, self._container_name)), + "running": self._running.is_set() and _is_container_running(cfg, self._container_name), } def tail_logs(self, n: int = 200) -> str: diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 520468182f..824bccdaed 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -45,8 +45,9 @@ def _on_errors( mod.stop() raise ExceptionGroup("docker deploy_parallel failed", errors) - return safe_thread_map( - specs, - lambda spec: DockerModule(spec[0], global_config=spec[1], **spec[2]), # type: ignore[arg-type] - _on_errors, - ) + def _deploy_one(spec: ModuleSpec) -> DockerModule: + mod = DockerModule(spec[0], global_config=spec[1], **spec[2]) # type: ignore[arg-type] + mod.build() + return mod + + return safe_thread_map(specs, _deploy_one, _on_errors) diff --git a/dimos/core/module.py b/dimos/core/module.py index 64f7dd65cf..4d7ad37719 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -40,6 +40,8 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import 
In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC +from types import MappingProxyType + from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec @@ -80,7 +82,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: dict[str, float] = dict(DEFAULT_RPC_TIMEOUTS) + rpc_timeouts: MappingProxyType[str, float] = DEFAULT_RPC_TIMEOUTS tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None @@ -132,6 +134,15 @@ def frame_id(self) -> str: return f"{self.config.frame_id_prefix}/{base}" return base + @rpc + def build(self) -> None: + """Optional build step for heavy one-time work (docker builds, LFS downloads, etc.). + + Called after deploy and stream wiring but before start(). + Has a very long timeout (24h) so long-running builds don't fail. + Default is a no-op — override in subclasses that need a build step. 
+ """ + @rpc def start(self) -> None: pass diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 43e3e44f0a..f5fd340f02 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -165,7 +165,7 @@ def _deploy_workers() -> None: return assert self._client is not None for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=False + worker_indices, self._client.deploy_parallel(worker_specs), strict=True ): results[index] = module @@ -173,12 +173,12 @@ def _deploy_docker() -> None: if not docker_specs: return for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=False + docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True ): results[index] = module def _register() -> None: - for (module_class, _, _), module in zip(module_specs, results, strict=False): + for (module_class, _, _), module in zip(module_specs, results, strict=True): if module is not None: self._deployed_modules[module_class] = module @@ -192,6 +192,24 @@ def _on_errors( _register() return results + def build_all_modules(self) -> None: + """Call build() on all deployed modules in parallel. + + build() handles heavy one-time work (docker builds, LFS downloads, etc.) + with a very long timeout. Must be called after deploy and stream wiring + but before start_all_modules(). + """ + modules = list(self._deployed_modules.values()) + if not modules: + raise ValueError("No modules deployed. 
Call deploy() before build_all_modules().") + + def _on_build_errors( + _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + ) -> None: + raise ExceptionGroup("build_all_modules failed", errors) + + safe_thread_map(modules, lambda m: m.build(), _on_build_errors) + def start_all_modules(self) -> None: modules = list(self._deployed_modules.values()) if not modules: diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 7ac34bb645..46182b7556 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -91,6 +91,7 @@ def __setstate__(self, state) -> None: # type: ignore[no-untyped-def] class ModuleProxyProtocol(Protocol): """Protocol for host-side handles to remote modules (worker or Docker).""" + def build(self) -> None: ... def start(self) -> None: ... def stop(self) -> None: ... def set_transport(self, stream_name: str, transport: Any) -> bool: ... @@ -179,5 +180,6 @@ def __getattr__(self, name: str): # type: ignore[no-untyped-def] # why? because the RPCClient instance is going to have all the methods of a Module # but those methods/attributes are super dynamic, so the type hints can't figure that out class ModuleProxy(RPCClient, Module): # type: ignore[misc] + def build(self) -> None: ... def start(self) -> None: ... def stop(self) -> None: ... 
diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a3bb0b716d..d4c0d579d4 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -26,10 +26,13 @@ import pytest -from dimos.core.docker_runner import DockerModuleConfig, is_docker_module +import threading + +from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator +from dimos.core.rpc_client import RpcCall from dimos.core.stream import Out # -- Fixtures: fake module classes ------------------------------------------- @@ -198,7 +201,6 @@ class TestDockerModuleGetattr: def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure @@ -207,7 +209,6 @@ def test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse @@ -216,8 +217,6 @@ def test_getattr_no_recursion_on_cleanup_attrs(self): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - from dimos.core.docker_runner import DockerModule - from dimos.core.rpc_client import RpcCall dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -235,7 +234,6 @@ def do_thing(self) -> None: ... 
assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - from dimos.core.docker_runner import DockerModule dm = DockerModule.__new__(DockerModule) dm.rpcs = {"do_thing"} @@ -248,11 +246,10 @@ class TestDockerModuleCleanupReconnect: """Tests for DockerModule._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - from dimos.core.docker_runner import DockerModule with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event(); dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() @@ -269,11 +266,10 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - from dimos.core.docker_runner import DockerModule with patch.object(DockerModule, "__init__", lambda self: None): dm = DockerModule.__new__(DockerModule) - dm._running = True + dm._running = threading.Event(); dm._running.set() dm._container_name = "test_container" dm._unsub_fns = [] dm.rpc = MagicMock() diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 993f6044bb..5b1b8bcb67 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -34,7 +34,10 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] # module.py and other places imports these constants and choose what to give RPCClient # the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({"start": 1200.0}) +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({ + "build": 86400.0, # 24h — docker builds, LFS downloads, etc. 
+ "start": 1200.0, +}) class RPCClient(Protocol): From 97b7e0df0296160a018fec6660afd9b5bd221980 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Fri, 20 Mar 2026 13:58:28 -0700 Subject: [PATCH 73/89] fix: thread leak in native module test + show docker pull output - test_process_crash_triggers_stop: call mod.stop() after watchdog cleanup to release LCM transport and event loop threads (fixes CI thread leak error) - docker pull: remove stderr=subprocess.PIPE so both stdout and stderr are visible during pulls (progress bars, layer downloads) --- dimos/core/docker_runner.py | 3 +-- dimos/core/test_native_module.py | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index fb5770325b..d76845bb1a 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -229,12 +229,11 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non r = subprocess.run( [config.docker_bin, "pull", config.docker_image], text=True, - stderr=subprocess.PIPE, timeout=config.docker_pull_timeout, ) if r.returncode != 0: raise RuntimeError( - f"Failed to pull image '{config.docker_image}'.\nSTDERR:\n{r.stderr}" + f"Failed to pull image '{config.docker_image}'." 
) reconnect = False diff --git a/dimos/core/test_native_module.py b/dimos/core/test_native_module.py index e77b8f9a53..31d6050818 100644 --- a/dimos/core/test_native_module.py +++ b/dimos/core/test_native_module.py @@ -107,6 +107,9 @@ def test_process_crash_triggers_stop() -> None: assert mod._process is None, f"Watchdog did not clean up after process {pid} died" + # Ensure all threads (LCM transport, event loop) are cleaned up + mod.stop() + @pytest.mark.slow def test_manual(dimos_cluster: ModuleCoordinator, args_file: str) -> None: From fbc146ac97fad63d49f7d54cde17fe02738dbba2 Mon Sep 17 00:00:00 2001 From: jeff-hykin <17692058+jeff-hykin@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:07:00 +0000 Subject: [PATCH 74/89] CI code cleanup --- dimos/core/docker_runner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index d76845bb1a..10a194a920 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -232,9 +232,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non timeout=config.docker_pull_timeout, ) if r.returncode != 0: - raise RuntimeError( - f"Failed to pull image '{config.docker_image}'." 
- ) + raise RuntimeError(f"Failed to pull image '{config.docker_image}'.") reconnect = False if _is_container_running(config, self._container_name): From f09875c3a507d31cff0b12ae44194379c4b29184 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 21:59:10 -0700 Subject: [PATCH 75/89] chore: regenerate uv.lock after merge with dev --- uv.lock | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/uv.lock b/uv.lock index 529842294b..5d1272f673 100644 --- a/uv.lock +++ b/uv.lock @@ -1859,7 +1859,9 @@ dev = [ ] docker = [ { name = "dimos-lcm" }, + { name = "langchain-core" }, { name = "lcm" }, + { name = "matplotlib" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "open3d", marker = "platform_machine != 'aarch64' or sys_platform != 'linux'" }, @@ -1877,6 +1879,7 @@ docker = [ { name = "sortedcontainers" }, { name = "structlog" }, { name = "typer" }, + { name = "typing-extensions" }, ] drone = [ { name = "pymavlink" }, @@ -2020,6 +2023,7 @@ requires-dist = [ { name = "langchain", marker = "extra == 'agents'", specifier = "==1.2.3" }, { name = "langchain-chroma", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-core", marker = "extra == 'agents'", specifier = "==1.2.3" }, + { name = "langchain-core", marker = "extra == 'docker'" }, { name = "langchain-huggingface", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-ollama", marker = "extra == 'agents'", specifier = ">=1,<2" }, { name = "langchain-openai", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -2031,6 +2035,7 @@ requires-dist = [ { name = "llvmlite", specifier = ">=0.42.0" }, { name = "lxml-stubs", marker = "extra == 'dev'", specifier = ">=0.5.1,<1" }, { name = "lz4", specifier = ">=4.4.5" }, + { name = "matplotlib", 
marker = "extra == 'docker'" }, { name = "matplotlib", marker = "extra == 'manipulation'", specifier = ">=3.7.1" }, { name = "md-babel-py", marker = "extra == 'dev'", specifier = "==1.1.1" }, { name = "moondream", marker = "extra == 'perception'" }, @@ -2142,6 +2147,7 @@ requires-dist = [ { name = "types-tensorflow", marker = "extra == 'dev'", specifier = ">=2.18.0.20251008,<3" }, { name = "types-tqdm", marker = "extra == 'dev'", specifier = ">=4.67.0.20250809,<5" }, { name = "typing-extensions", marker = "python_full_version < '3.11'", specifier = ">=4.0" }, + { name = "typing-extensions", marker = "extra == 'docker'" }, { name = "ultralytics", marker = "extra == 'perception'", specifier = ">=8.3.70" }, { name = "unitree-webrtc-connect-leshy", marker = "extra == 'unitree'", specifier = ">=2.0.7" }, { name = "uvicorn", marker = "extra == 'web'", specifier = ">=0.34.0" }, From 317c487a2f05543059cabc159b422d9d9f79cb39 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:21:53 -0700 Subject: [PATCH 76/89] fix(docker): include stdout/stderr in pull error message When docker pull fails, the error message now includes the actual output to help diagnose auth/network/registry issues. 
Revert: git revert HEAD --- dimos/core/docker_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 10a194a920..06b12c7512 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -229,10 +229,14 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non r = subprocess.run( [config.docker_bin, "pull", config.docker_image], text=True, + capture_output=True, timeout=config.docker_pull_timeout, ) if r.returncode != 0: - raise RuntimeError(f"Failed to pull image '{config.docker_image}'.") + raise RuntimeError( + f"Failed to pull image '{config.docker_image}'.\n" + f"stdout: {r.stdout}\nstderr: {r.stderr}" + ) reconnect = False if _is_container_running(config, self._container_name): From 91a13f1e7251fbab1657cc290da75e0c1976fe3c Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:22:30 -0700 Subject: [PATCH 77/89] fix(tests): import ExceptionGroup in test_parallel_deploy_cleanup Test file used ExceptionGroup without importing it, causing NameError on Python < 3.11. Import from safe_thread_map where it's polyfilled. 
Revert: git revert HEAD --- dimos/core/tests/test_parallel_deploy_cleanup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 1987fa4be7..ef6bf4b879 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -24,6 +24,8 @@ import pytest +from dimos.utils.safe_thread_map import ExceptionGroup + class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" From 42f3797fe6c7eb9e13c046be26c02ed72a947c2d Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Sat, 21 Mar 2026 23:24:45 -0700 Subject: [PATCH 78/89] docs: add changes.md with fix descriptions and revert instructions --- changes.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 changes.md diff --git a/changes.md b/changes.md new file mode 100644 index 0000000000..f5982fce13 --- /dev/null +++ b/changes.md @@ -0,0 +1,24 @@ +# PR #1431 (Docker Restoration) — Paul Review Fixes + +## Commits (local, not pushed) + +### 1. `317c487a2` — Include stdout/stderr in docker pull error +- Pull failures were silent — no diagnostic output +- Now includes both stdout and stderr in exception +- **Revert:** `git revert 317c487a2` + +### 2. `91a13f1e7` — Import ExceptionGroup in test file +- Test used ExceptionGroup without import → NameError on Python < 3.11 +- Now imports from safe_thread_map polyfill +- **Revert:** `git revert 91a13f1e7` + +## Reviewer was wrong on +- `rpc_timeouts` class-level mutable dict — it's in ModuleConfig (pydantic) with `Field(default_factory=...)`, which is correct + +## Not addressed (need Jeff's input / bigger refactor) +- Container launch in `__init__` vs `start()` — lifecycle redesign +- Deterministic container naming (removed PID+timestamp) — collision risk +- `docker_gpus` default None (was "all") — intentional breaking change? 
+- `docker_restart_policy` default "no" (was "on-failure:3") — same +- Build hash includes original Dockerfile, not converted (with footer) +- `getattr(default_config, "rpc_timeouts", ...)` returns FieldInfo on class From 30d87a6c30d589102315fbdf22c2ba2d6deb74e8 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 14:57:04 -0700 Subject: [PATCH 79/89] cleanup g passing --- dimos/core/docker_runner.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 005b55fb3b..9b9c658bcb 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -178,9 +178,6 @@ class DockerModule(ModuleProxyProtocol): config: DockerModuleConfig def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> None: - # g (GlobalConfig) is passed by deploy pipeline but isn't a config field - kwargs.pop("g", None) - config_class = getattr(module_class, "default_config", DockerModuleConfig) if not issubclass(config_class, DockerModuleConfig): raise TypeError( From 8f23d0996912e2d50e4a5e5532b9b94e5e7629f5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:23:39 -0700 Subject: [PATCH 80/89] cleanup --- dimos/core/docker_runner.py | 3 ++- dimos/core/module.py | 5 ++--- dimos/core/module_coordinator.py | 6 +++++- dimos/core/test_core.py | 2 +- dimos/protocol/rpc/spec.py | 10 ++++++---- dimos/utils/safe_thread_map.py | 6 ++---- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 9b9c658bcb..8b3e39995a 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -155,7 +155,8 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: json.dumps(v) out[k] = v except (TypeError, ValueError): - logger.debug(f"Config field '{k}' not JSON-serializable, skipping") + level = "debug" if k.startswith("_") else "warning" + getattr(logger, level)(f"Config field '{k}' not 
JSON-serializable, skipping") return out diff --git a/dimos/core/module.py b/dimos/core/module.py index 28971f0e4a..ebe1879681 100644 --- a/dimos/core/module.py +++ b/dimos/core/module.py @@ -30,6 +30,7 @@ ) from langchain_core.tools import tool +from pydantic import Field from reactivex.disposable import CompositeDisposable from dimos.core.core import T, rpc @@ -40,8 +41,6 @@ from dimos.core.rpc_client import RpcCall from dimos.core.stream import In, Out, RemoteOut, Transport from dimos.protocol.rpc.pubsubrpc import LCMRPC -from types import MappingProxyType - from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.protocol.service.spec import BaseConfig, Configurable from dimos.protocol.tf.tf import LCMTF, TFSpec @@ -82,7 +81,7 @@ def get_loop() -> tuple[asyncio.AbstractEventLoop, threading.Thread | None]: class ModuleConfig(BaseConfig): rpc_transport: type[RPCSpec] = LCMRPC default_rpc_timeout: float = DEFAULT_RPC_TIMEOUT - rpc_timeouts: MappingProxyType[str, float] = DEFAULT_RPC_TIMEOUTS + rpc_timeouts: dict[str, float] = Field(default_factory=lambda: dict(DEFAULT_RPC_TIMEOUTS)) tf_transport: type[TFSpec] = LCMTF # type: ignore[type-arg] frame_id_prefix: str | None = None frame_id: str | None = None diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index a8b6ec0922..d4778a5c0d 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -14,6 +14,7 @@ from __future__ import annotations +from contextlib import suppress import threading from typing import TYPE_CHECKING, Any @@ -204,8 +205,11 @@ def build_all_modules(self) -> None: raise ValueError("No modules deployed. 
Call deploy() before build_all_modules().") def _on_build_errors( - _outcomes: list[Any], _successes: list[Any], errors: list[Exception] + _outcomes: list[Any], successes: list[Any], errors: list[Exception] ) -> None: + for mod in successes: + with suppress(Exception): + mod.stop() raise ExceptionGroup("build_all_modules failed", errors) safe_thread_map(modules, lambda m: m.build(), _on_build_errors) diff --git a/dimos/core/test_core.py b/dimos/core/test_core.py index 7cd0f89b36..f9a89829d5 100644 --- a/dimos/core/test_core.py +++ b/dimos/core/test_core.py @@ -77,7 +77,7 @@ def test_classmethods() -> None: # Check that we have the expected RPC methods assert "navigate_to" in class_rpcs, "navigate_to should be in rpcs" assert "start" in class_rpcs, "start should be in rpcs" - assert len(class_rpcs) == 8 + assert len(class_rpcs) == 9 # Check that the values are callable assert callable(class_rpcs["navigate_to"]), "navigate_to should be callable" diff --git a/dimos/protocol/rpc/spec.py b/dimos/protocol/rpc/spec.py index 5b1b8bcb67..cefd89f449 100644 --- a/dimos/protocol/rpc/spec.py +++ b/dimos/protocol/rpc/spec.py @@ -34,10 +34,12 @@ def rpcs(self) -> dict[str, Callable]: ... # type: ignore[type-arg] # module.py and other places imports these constants and choose what to give RPCClient # the RPCClient below does not use these constants directly (by design) DEFAULT_RPC_TIMEOUT: float = 120.0 -DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType({ - "build": 86400.0, # 24h — docker builds, LFS downloads, etc. - "start": 1200.0, -}) +DEFAULT_RPC_TIMEOUTS: MappingProxyType[str, float] = MappingProxyType( + { + "build": 86400.0, # 24h — docker builds, LFS downloads, etc. 
+ "start": 1200.0, + } +) class RPCClient(Protocol): diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py index f480f2c97d..514fac2026 100644 --- a/dimos/utils/safe_thread_map.py +++ b/dimos/utils/safe_thread_map.py @@ -13,9 +13,10 @@ # limitations under the License. from __future__ import annotations +from collections.abc import Callable, Sequence from concurrent.futures import Future, ThreadPoolExecutor, as_completed import sys -from typing import TYPE_CHECKING, Any, TypeVar +from typing import Any, TypeVar if sys.version_info < (3, 11): @@ -32,9 +33,6 @@ def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] -if TYPE_CHECKING: - from collections.abc import Callable, Sequence - T = TypeVar("T") R = TypeVar("R") From d37a9229cf9f5368725cf6177739484b1078bf77 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:24:04 -0700 Subject: [PATCH 81/89] combine docker_build and runner --- dimos/core/docker_build.py | 150 ------------------ dimos/core/docker_runner.py | 149 +++++++++++++++-- dimos/core/docker_worker_manager.py | 12 +- dimos/core/module_coordinator.py | 4 +- dimos/core/tests/test_docker_deployment.py | 38 ++--- .../tests/test_parallel_deploy_cleanup.py | 8 +- dimos/manipulation/pick_and_place_module.py | 2 +- dimos/test_no_sections.py | 2 +- 8 files changed, 168 insertions(+), 197 deletions(-) delete mode 100644 dimos/core/docker_build.py diff --git a/dimos/core/docker_build.py b/dimos/core/docker_build.py deleted file mode 100644 index 24fd2b3e44..0000000000 --- a/dimos/core/docker_build.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Docker image building and Dockerfile conversion utilities. -Converts any Dockerfile into a DimOS module container by appending a footer -that installs DimOS and creates the module entrypoint. -""" - -from __future__ import annotations - -import hashlib -import subprocess -from typing import TYPE_CHECKING - -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from pathlib import Path - - from dimos.core.docker_runner import DockerModuleConfig - -logger = setup_logger() - -_BUILD_HASH_LABEL = "dimos.build.hash" - -DOCKER_CMD_TIMEOUT = 20 - -# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) -DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" - -# Footer appended to Dockerfiles for DimOS module conversion -DIMOS_FOOTER = f""" -# ==== {DIMOS_SENTINEL} ==== -# Copy DimOS source from build context -COPY dimos /dimos/source/dimos/ -COPY pyproject.toml /dimos/source/ -COPY docker/python/module-install.sh /tmp/module-install.sh - -# Install DimOS and create entrypoint -RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh - -ENTRYPOINT ["/dimos/entrypoint.sh"] -""" - - -def _convert_dockerfile(dockerfile: Path) -> Path: - """Append DimOS footer to Dockerfile. Returns path to converted file.""" - content = dockerfile.read_text() - - # Already converted? 
- if DIMOS_SENTINEL in content: - return dockerfile - - logger.info(f"Converting {dockerfile.name} to DimOS format") - - converted = dockerfile.parent / f".{dockerfile.name}.ignore" - converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) - return converted - - -def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents and build args.""" - assert cfg.docker_file is not None - digest = hashlib.sha256() - digest.update(cfg.docker_file.read_bytes()) - for key, val in sorted(cfg.docker_build_args.items()): - digest.update(f"{key}={val}".encode()) - for arg in cfg.docker_build_extra_args: - digest.update(arg.encode()) - return digest.hexdigest() - - -def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: - """Read the build hash label from an existing Docker image.""" - r = subprocess.run( - [ - cfg.docker_bin, - "image", - "inspect", - "-f", - '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - cfg.docker_image, - ], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - if r.returncode != 0: - return None - value = r.stdout.strip() - # docker prints "" when the label is missing - return value if value and value != "" else None - - -def build_image(cfg: DockerModuleConfig) -> None: - """Build Docker image using footer mode conversion.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for building Docker images") - - build_hash = _compute_build_hash(cfg) - dockerfile = _convert_dockerfile(cfg.docker_file) - - context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] - cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - for k, v in cfg.docker_build_args.items(): - cmd.extend(["--build-arg", f"{k}={v}"]) - cmd.extend(cfg.docker_build_extra_args) - cmd.append(str(context)) - - logger.info(f"Building Docker image: {cfg.docker_image}") - # Stream stdout to 
terminal so the user sees build progress, but capture - # stderr separately so we can include it in the error message on failure. - result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) - if result.returncode != 0: - raise RuntimeError( - f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" - ) - - -def image_exists(cfg: DockerModuleConfig) -> bool: - """Check if the configured Docker image exists locally.""" - r = subprocess.run( - [cfg.docker_bin, "image", "inspect", cfg.docker_image], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - return r.returncode == 0 - - -__all__ = [ - "DIMOS_FOOTER", - "build_image", - "image_exists", -] diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_runner.py index 8b3e39995a..61a050e2f5 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_runner.py @@ -11,11 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +""" +Docker module support: image building, Dockerfile conversion, host-side +proxy (DockerModuleOuter), and container-side runner (DockerModuleInner). +""" + from __future__ import annotations import argparse from contextlib import suppress from dataclasses import field +import hashlib import importlib import json import signal @@ -53,7 +60,7 @@ class DockerModuleConfig(ModuleConfig): For advanced Docker options not listed here, use docker_extra_args. 
Example: docker_extra_args=["--cap-add=SYS_ADMIN", "--read-only"] - NOTE: a DockerModule will rebuild automatically if the Dockerfile or build args change + NOTE: a DockerModuleOuter will rebuild automatically if the Dockerfile or build args change """ # Build / image @@ -160,10 +167,122 @@ def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: return out +# Image building and Dockerfile conversion + + +_BUILD_HASH_LABEL = "dimos.build.hash" + +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) +DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" + +# Footer appended to Dockerfiles for DimOS module conversion +DIMOS_FOOTER = f""" +# ==== {DIMOS_SENTINEL} ==== +# Copy DimOS source from build context +COPY dimos /dimos/source/dimos/ +COPY pyproject.toml /dimos/source/ +COPY docker/python/module-install.sh /tmp/module-install.sh + +# Install DimOS and create entrypoint +RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh + +ENTRYPOINT ["/dimos/entrypoint.sh"] +""" + + +def _convert_dockerfile(dockerfile: Path) -> Path: + """Append DimOS footer to Dockerfile. Returns path to converted file.""" + content = dockerfile.read_text() + + # Already converted? 
+ if DIMOS_SENTINEL in content: + return dockerfile + + logger.info(f"Converting {dockerfile.name} to DimOS format") + + converted = dockerfile.parent / f".{dockerfile.name}.ignore" + converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) + return converted + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents and build args.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for computing build hash") + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) + return digest.hexdigest() + + +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = subprocess.run( + [ + cfg.docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + cfg.docker_image, + ], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + +def build_image(cfg: DockerModuleConfig) -> None: + """Build Docker image using footer mode conversion.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) + dockerfile = _convert_dockerfile(cfg.docker_file) + + context = cfg.docker_build_context or cfg.docker_file.parent + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + for k, v in cfg.docker_build_args.items(): + cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) + cmd.append(str(context)) + + 
logger.info(f"Building Docker image: {cfg.docker_image}") + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. + result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) + if result.returncode != 0: + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) + + +def image_exists(cfg: DockerModuleConfig) -> bool: + """Check if the configured Docker image exists locally.""" + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 + + # Host-side Docker-backed Module handle -class DockerModule(ModuleProxyProtocol): +class DockerModuleOuter(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -219,13 +338,6 @@ def build(self) -> None: if self._is_built: return - from dimos.core.docker_build import ( - _compute_build_hash, - _get_image_build_hash, - build_image, - image_exists, - ) - config = self.config try: if config.docker_file is not None: @@ -401,7 +513,7 @@ def _validate_config(self, cfg: DockerModuleConfig) -> None: using_host_network = cfg.docker_network is None and cfg.docker_network_mode == "host" if not using_host_network: logger.warning( - "DockerModule not using host network. LCM multicast requires --network=host. " + "DockerModuleOuter not using host network. LCM multicast requires --network=host. " "RPC communication may not work with bridge/custom networks." 
) @@ -523,7 +635,7 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: payload_json = json.dumps(payload, separators=(",", ":")) except TypeError as e: raise TypeError( - f"Cannot serialize DockerModule payload to JSON: {e}\n" + f"Cannot serialize DockerModuleOuter payload to JSON: {e}\n" f"Ensure all constructor args/kwargs for {self._module_class.__name__} are " f"JSON-serializable, or use docker_command to bypass automatic payload generation." ) from e @@ -559,10 +671,14 @@ def _wait_for_rpc(self) -> None: ) +# Backwards compatibility alias +DockerModule = DockerModuleOuter + + # Container-side runner -class StandaloneModuleRunner: +class DockerModuleInner: """Runs a module inside Docker container. Blocks until SIGTERM/SIGINT.""" def __init__(self, module_path: str, args: list[Any], kwargs: dict[str, Any]) -> None: @@ -597,7 +713,7 @@ def wait(self) -> None: self._shutdown.wait() -def _install_signal_handlers(runner: StandaloneModuleRunner) -> None: +def _install_signal_handlers(runner: DockerModuleInner) -> None: def shutdown(_sig: int, _frame: Any) -> None: runner.stop() @@ -607,7 +723,7 @@ def shutdown(_sig: int, _frame: Any) -> None: def _cli_run(payload_json: str) -> None: payload = json.loads(payload_json) - runner = StandaloneModuleRunner( + runner = DockerModuleInner( payload["module_path"], payload.get("args", []), payload.get("kwargs", {}), @@ -640,5 +756,10 @@ def main(argv: list[str] | None = None) -> None: __all__ = [ "DockerModule", "DockerModuleConfig", + "DockerModuleInner", + "DockerModuleOuter", + "DIMOS_FOOTER", + "build_image", + "image_exists", "is_docker_module", ] diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 08ea7e3958..4a85bd59f9 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -20,7 +20,7 @@ from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: - from dimos.core.docker_runner import 
DockerModule + from dimos.core.docker_runner import DockerModuleOuter class DockerWorkerManager: @@ -29,24 +29,24 @@ class DockerWorkerManager: @staticmethod def deploy_parallel( specs: list[ModuleSpec], - ) -> list[DockerModule]: + ) -> list[DockerModuleOuter]: """Deploy multiple DockerModules in parallel. If any deployment fails, all successfully-started containers are stopped before an ExceptionGroup is raised. """ - from dimos.core.docker_runner import DockerModule + from dimos.core.docker_runner import DockerModuleOuter def _on_errors( - _outcomes: list[Any], successes: list[DockerModule], errors: list[Exception] + _outcomes: list[Any], successes: list[DockerModuleOuter], errors: list[Exception] ) -> None: for mod in successes: with suppress(Exception): mod.stop() raise ExceptionGroup("docker deploy_parallel failed", errors) - def _deploy_one(spec: ModuleSpec) -> DockerModule: - mod = DockerModule(spec[0], g=spec[1], **spec[2]) # type: ignore[arg-type] + def _deploy_one(spec: ModuleSpec) -> DockerModuleOuter: + mod = DockerModuleOuter(spec[0], g=spec[1], **spec[2]) # type: ignore[arg-type] mod.build() return mod diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d4778a5c0d..5d7b76db78 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -125,14 +125,14 @@ def deploy( **kwargs: Any, ) -> ModuleProxy: # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import DockerModule, is_docker_module + from dimos.core.docker_runner import DockerModuleOuter, is_docker_module if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") deployed_module: ModuleProxyProtocol if is_docker_module(module_class): - deployed_module = DockerModule(module_class, g=global_config, **kwargs) # type: ignore[arg-type] + deployed_module = DockerModuleOuter(module_class, g=global_config, **kwargs) 
# type: ignore[arg-type] else: deployed_module = self._client.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index d8eb9448ff..a528e07ad9 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -16,7 +16,7 @@ Smoke tests for Docker module deployment routing. These tests verify that the ModuleCoordinator correctly detects and routes -docker modules to DockerModule WITHOUT actually running Docker. +docker modules to DockerModuleOuter WITHOUT actually running Docker. """ from __future__ import annotations @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_runner import DockerModule, DockerModuleConfig, is_docker_module +from dimos.core.docker_runner import DockerModuleOuter, DockerModuleConfig, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator @@ -76,7 +76,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -92,7 +92,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ # Should NOT go through worker manager mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModule (container launch happens inside __init__) + # Should construct a DockerModuleOuter (container launch happens inside __init__) mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) # start() is NOT called during deploy — it's called in start_all_modules mock_dm.start.assert_not_called() 
@@ -101,7 +101,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -109,7 +109,7 @@ def test_deploy_docker_propagates_constructor_failure( mock_worker_mgr = MagicMock() mock_worker_manager_cls.return_value = mock_worker_mgr - # Container launch fails inside __init__; DockerModule handles its own cleanup + # Container launch fails inside __init__; DockerModuleOuter handles its own cleanup mock_docker_module_cls.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() @@ -173,7 +173,7 @@ def test_deploy_parallel_separates_docker_and_regular( finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -195,13 +195,13 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke mock_worker_mgr.close_all.assert_called_once() -class TestDockerModuleGetattr: - """Tests for DockerModule.__getattr__ avoiding infinite recursion.""" +class TestDockerModuleOuterGetattr: + """Tests for DockerModuleOuter.__getattr__ avoiding infinite recursion.""" def test_getattr_no_recursion_when_rpcs_not_set(self): """If __init__ fails before self.rpcs is assigned, __getattr__ must not recurse.""" - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) # Don't set rpcs, _module_class, or any instance attrs — simulates early __init__ failure with pytest.raises(AttributeError): _ = dm.some_method @@ -209,14 +209,14 @@ def 
test_getattr_no_recursion_when_rpcs_not_set(self): def test_getattr_no_recursion_on_cleanup_attrs(self): """Accessing cleanup-related attrs before they exist must raise, not recurse.""" - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) # These are accessed during _cleanup() — if rpcs isn't set, they must not recurse for attr in ("rpc", "config", "_container_name", "_unsub_fns"): with pytest.raises(AttributeError): getattr(dm, attr) def test_getattr_delegates_to_rpc_when_rpcs_set(self): - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm.rpcs = {"do_thing"} # _module_class needs a real method with __name__ for RpcCall @@ -232,19 +232,19 @@ def do_thing(self) -> None: ... assert isinstance(result, RpcCall) def test_getattr_raises_for_unknown_method(self): - dm = DockerModule.__new__(DockerModule) + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm.rpcs = {"do_thing"} with pytest.raises(AttributeError, match="not found"): _ = dm.nonexistent -class TestDockerModuleCleanupReconnect: - """Tests for DockerModule._cleanup with docker_reconnect_container.""" +class TestDockerModuleOuterCleanupReconnect: + """Tests for DockerModuleOuter._cleanup with docker_reconnect_container.""" def test_cleanup_skips_stop_when_reconnect(self): - with patch.object(DockerModule, "__init__", lambda self: None): - dm = DockerModule.__new__(DockerModule) + with patch.object(DockerModuleOuter, "__init__", lambda self: None): + dm = DockerModuleOuter.__new__(DockerModuleOuter) dm._running = threading.Event() dm._running.set() dm._container_name = "test_container" @@ -263,8 +263,8 @@ def test_cleanup_skips_stop_when_reconnect(self): mock_rm.assert_not_called() def test_cleanup_stops_container_when_not_reconnect(self): - with patch.object(DockerModule, "__init__", lambda self: None): - dm = DockerModule.__new__(DockerModule) + with patch.object(DockerModuleOuter, "__init__", lambda self: None): + dm = 
DockerModuleOuter.__new__(DockerModuleOuter) dm._running = threading.Event() dm._running.set() dm._container_name = "test_container" diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index ef6bf4b879..adfd1f7a36 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -30,7 +30,7 @@ class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -69,7 +69,7 @@ def fake_constructor(cls, *args, **kwargs): mod_a.stop.assert_called_once() mod_c.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -110,7 +110,7 @@ def fake_constructor(cls, *args, **kwargs): # The one successful module must have been stopped mod_a.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -138,7 +138,7 @@ def fake_constructor(cls, *args, **kwargs): for m in mocks: m.stop.assert_not_called() - @patch("dimos.core.docker_runner.DockerModule") + @patch("dimos.core.docker_runner.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" from dimos.core.docker_worker_manager import DockerWorkerManager diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index 81e7bcf2d3..e519d82c87 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -30,7 +30,7 @@ from dimos.agents.annotation import skill from dimos.constants import DIMOS_PROJECT_ROOT from dimos.core.core import rpc -from dimos.core.docker_runner import DockerModule as DockerRunner +from dimos.core.docker_runner import DockerModuleOuter as DockerRunner from dimos.core.stream import In from dimos.manipulation.grasping.graspgen_module import GraspGenModule from dimos.manipulation.manipulation_module import ( diff --git a/dimos/test_no_sections.py b/dimos/test_no_sections.py index 9523c0aae2..63c6c42c81 100644 --- a/dimos/test_no_sections.py +++ b/dimos/test_no_sections.py @@ -58,7 +58,7 @@ # Each entry is (relative_path, line_substring) — if both match, the line is skipped. 
WHITELIST = [ # Sentinel marker used at runtime to detect already-converted Dockerfiles - ("dimos/core/docker_build.py", "DIMOS_SENTINEL"), + ("dimos/core/docker_runner.py", "DIMOS_SENTINEL"), ] From 79d7817fc242b0f1ce2e731408b8f0039918cc9b Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 15:28:25 -0700 Subject: [PATCH 82/89] rename docker_runner to module --- .../core/{docker_runner.py => docker_module.py} | 4 ++-- dimos/core/docker_worker_manager.py | 4 ++-- dimos/core/module_coordinator.py | 8 ++++---- dimos/core/tests/test_docker_deployment.py | 16 ++++++++-------- dimos/core/tests/test_parallel_deploy_cleanup.py | 8 ++++---- dimos/manipulation/grasping/graspgen_module.py | 2 +- dimos/manipulation/pick_and_place_module.py | 2 +- dimos/test_no_sections.py | 2 +- docker/python/module-install.sh | 2 +- examples/docker_hello_world/hello_docker.py | 2 +- 10 files changed, 25 insertions(+), 25 deletions(-) rename dimos/core/{docker_runner.py => docker_module.py} (99%) diff --git a/dimos/core/docker_runner.py b/dimos/core/docker_module.py similarity index 99% rename from dimos/core/docker_runner.py rename to dimos/core/docker_module.py index 61a050e2f5..1880aa0dbd 100644 --- a/dimos/core/docker_runner.py +++ b/dimos/core/docker_module.py @@ -630,7 +630,7 @@ def _build_container_command(self, cfg: DockerModuleConfig) -> list[str]: # Filter out docker-specific kwargs (paths, etc.) 
- only pass module config kwargs = {"config": _extract_module_config(cfg)} payload = {"module_path": module_path, "args": list(self._args), "kwargs": kwargs} - # DimOS base image entrypoint already runs "dimos.core.docker_runner run" + # DimOS base image entrypoint already runs "dimos.core.docker_module run" try: payload_json = json.dumps(payload, separators=(",", ":")) except TypeError as e: @@ -734,7 +734,7 @@ def _cli_run(payload_json: str) -> None: def main(argv: list[str] | None = None) -> None: - parser = argparse.ArgumentParser(prog="dimos.core.docker_runner") + parser = argparse.ArgumentParser(prog="dimos.core.docker_module") sub = parser.add_subparsers(dest="cmd", required=True) runp = sub.add_parser("run", help="Run a module inside a container") diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/docker_worker_manager.py index 4a85bd59f9..94b4e973c8 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/docker_worker_manager.py @@ -20,7 +20,7 @@ from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map if TYPE_CHECKING: - from dimos.core.docker_runner import DockerModuleOuter + from dimos.core.docker_module import DockerModuleOuter class DockerWorkerManager: @@ -35,7 +35,7 @@ def deploy_parallel( If any deployment fails, all successfully-started containers are stopped before an ExceptionGroup is raised. 
""" - from dimos.core.docker_runner import DockerModuleOuter + from dimos.core.docker_module import DockerModuleOuter def _on_errors( _outcomes: list[Any], successes: list[DockerModuleOuter], errors: list[Exception] diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 5d7b76db78..4937a2e121 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -124,8 +124,8 @@ def deploy( global_config: GlobalConfig = global_config, **kwargs: Any, ) -> ModuleProxy: - # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import DockerModuleOuter, is_docker_module + # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator + from dimos.core.docker_module import DockerModuleOuter, is_docker_module if not self._client: raise ValueError("Trying to dimos.deploy before the client has started") @@ -139,8 +139,8 @@ def deploy( return deployed_module # type: ignore[return-value] def deploy_parallel(self, module_specs: list[ModuleSpec]) -> list[ModuleProxy]: - # Inline to avoid circular import: module_coordinator → docker_runner → module → blueprints → module_coordinator - from dimos.core.docker_runner import is_docker_module + # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator + from dimos.core.docker_module import is_docker_module if not self._client: raise ValueError("Not started") diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index a528e07ad9..55e96d3b72 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_runner import DockerModuleOuter, DockerModuleConfig, is_docker_module +from dimos.core.docker_module import DockerModuleOuter, DockerModuleConfig, 
is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator @@ -76,7 +76,7 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -101,7 +101,7 @@ def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_ finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_deploy_docker_propagates_constructor_failure( self, mock_worker_manager_cls, mock_docker_module_cls @@ -173,7 +173,7 @@ def test_deploy_parallel_separates_docker_and_regular( finally: coordinator.stop() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") @patch("dimos.core.module_coordinator.WorkerManager") def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): mock_worker_mgr = MagicMock() @@ -255,8 +255,8 @@ def test_cleanup_skips_stop_when_reconnect(self): # reconnect mode: should NOT stop/rm the container dm.config = FakeDockerConfig(docker_reconnect_container=True) with ( - patch("dimos.core.docker_runner._run") as mock_run, - patch("dimos.core.docker_runner._remove_container") as mock_rm, + patch("dimos.core.docker_module._run") as mock_run, + patch("dimos.core.docker_module._remove_container") as mock_rm, ): dm._cleanup() mock_run.assert_not_called() @@ -275,8 +275,8 @@ def test_cleanup_stops_container_when_not_reconnect(self): # normal mode: should stop and rm the container dm.config = FakeDockerConfig(docker_reconnect_container=False) with ( 
- patch("dimos.core.docker_runner._run") as mock_run, - patch("dimos.core.docker_runner._remove_container") as mock_rm, + patch("dimos.core.docker_module._run") as mock_run, + patch("dimos.core.docker_module._remove_container") as mock_rm, ): dm._cleanup() mock_run.assert_called_once() # docker stop diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index adfd1f7a36..212daa9a49 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -30,7 +30,7 @@ class TestDockerWorkerManagerPartialFailure: """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -69,7 +69,7 @@ def fake_constructor(cls, *args, **kwargs): mod_a.stop.assert_called_once() mod_c.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. 
Should raise ExceptionGroup with both errors.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -110,7 +110,7 @@ def fake_constructor(cls, *args, **kwargs): # The one successful module must have been stopped mod_a.stop.assert_called_once() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" from dimos.core.docker_worker_manager import DockerWorkerManager @@ -138,7 +138,7 @@ def fake_constructor(cls, *args, **kwargs): for m in mocks: m.stop.assert_not_called() - @patch("dimos.core.docker_runner.DockerModuleOuter") + @patch("dimos.core.docker_module.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" from dimos.core.docker_worker_manager import DockerWorkerManager diff --git a/dimos/manipulation/grasping/graspgen_module.py b/dimos/manipulation/grasping/graspgen_module.py index ae2d59512a..3cca54dc2f 100644 --- a/dimos/manipulation/grasping/graspgen_module.py +++ b/dimos/manipulation/grasping/graspgen_module.py @@ -22,7 +22,7 @@ import numpy as np from dimos.core.core import rpc -from dimos.core.docker_runner import DockerModuleConfig +from dimos.core.docker_module import DockerModuleConfig from dimos.core.module import Module from dimos.core.stream import Out from dimos.msgs.geometry_msgs.PoseArray import PoseArray diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index e519d82c87..2d8bcd1584 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -30,7 +30,7 @@ from dimos.agents.annotation import skill from dimos.constants import DIMOS_PROJECT_ROOT from dimos.core.core import rpc -from dimos.core.docker_runner import 
DockerModuleOuter as DockerRunner +from dimos.core.docker_module import DockerModuleOuter as DockerRunner from dimos.core.stream import In from dimos.manipulation.grasping.graspgen_module import GraspGenModule from dimos.manipulation.manipulation_module import ( diff --git a/dimos/test_no_sections.py b/dimos/test_no_sections.py index 63c6c42c81..902288b2e6 100644 --- a/dimos/test_no_sections.py +++ b/dimos/test_no_sections.py @@ -58,7 +58,7 @@ # Each entry is (relative_path, line_substring) — if both match, the line is skipped. WHITELIST = [ # Sentinel marker used at runtime to detect already-converted Dockerfiles - ("dimos/core/docker_runner.py", "DIMOS_SENTINEL"), + ("dimos/core/docker_module.py", "DIMOS_SENTINEL"), ] diff --git a/docker/python/module-install.sh b/docker/python/module-install.sh index ab0aea1032..7c0c54b5f8 100644 --- a/docker/python/module-install.sh +++ b/docker/python/module-install.sh @@ -66,7 +66,7 @@ cat > /dimos/entrypoint.sh < Date: Wed, 25 Mar 2026 17:38:19 -0700 Subject: [PATCH 83/89] add ModuleCoordinator docstring --- dimos/core/module_coordinator.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 4937a2e121..d70c10035c 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -35,6 +35,15 @@ class ModuleCoordinator(Resource): # type: ignore[misc] + """ + There should only ever be one module coordinator instance (this is a singleton) + - Module (classes) should be able to be deployed, stopped, and re-deployed in on one instance of ModuleCoordinator + - Arguably ModuleCoordinator could be called the "DimosRuntime" + - ModuleCoordinator is responsible for all global "addresses". 
+ Ex: it should make sure all modules are using the same LCM url, the same rerun port, etc + (it may not do all of that at time of writing but that is the intention/job of this class) + - Modules shouldn't be deployed on their own (except for testing) + """ _client: WorkerManager | None = None _global_config: GlobalConfig _n: int | None = None From 2d321e3d808f98ada0a2766ca8c8a0d71da21808 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 18:51:44 -0700 Subject: [PATCH 84/89] use threading utils --- dimos/core/docker_module.py | 326 ++++--- dimos/core/module_coordinator.py | 4 +- dimos/core/tests/test_docker_deployment.py | 2 +- .../tests/test_parallel_deploy_cleanup.py | 2 +- dimos/core/worker_manager.py | 3 +- ...er_manager.py => worker_manager_docker.py} | 3 +- dimos/utils/safe_thread_map.py | 108 --- dimos/utils/test_thread_utils.py | 888 ++++++++++++++++++ dimos/utils/thread_utils.py | 550 +++++++++++ dimos/utils/typing_utils.py | 45 + 10 files changed, 1654 insertions(+), 277 deletions(-) rename dimos/core/{docker_worker_manager.py => worker_manager_docker.py} (94%) delete mode 100644 dimos/utils/safe_thread_map.py create mode 100644 dimos/utils/test_thread_utils.py create mode 100644 dimos/utils/thread_utils.py create mode 100644 dimos/utils/typing_utils.py diff --git a/dimos/core/docker_module.py b/dimos/core/docker_module.py index 1880aa0dbd..dc0ffd533f 100644 --- a/dimos/core/docker_module.py +++ b/dimos/core/docker_module.py @@ -35,6 +35,7 @@ from dimos.core.rpc_client import ModuleProxyProtocol, RpcCall from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.utils.logging_config import setup_logger +from dimos.utils.thread_utils import ThreadSafeVal from dimos.visualization.rerun.bridge import RERUN_GRPC_PORT, RERUN_WEB_PORT if TYPE_CHECKING: @@ -125,163 +126,6 @@ def is_docker_module(module_class: type) -> bool: ) -# Docker helpers - - -def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: - 
logger.debug(f"exec: {' '.join(cmd)}") - return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) - - -def _remove_container(cfg: DockerModuleConfig, name: str) -> None: - _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) - - -def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: - r = _run( - [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], - timeout=DOCKER_STATUS_TIMEOUT, - ) - return r.returncode == 0 and r.stdout.strip() == "true" - - -def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: - r = _run([cfg.docker_bin, "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) - out = (r.stdout or "").rstrip() - err = (r.stderr or "").rstrip() - return out + ("\n" + err if err else "") - - -def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: - """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" - out: dict[str, Any] = {} - for k, v in cfg.__dict__.items(): - if k.startswith("docker_") or isinstance(v, type) or callable(v): - continue - try: - json.dumps(v) - out[k] = v - except (TypeError, ValueError): - level = "debug" if k.startswith("_") else "warning" - getattr(logger, level)(f"Config field '{k}' not JSON-serializable, skipping") - return out - - -# Image building and Dockerfile conversion - - -_BUILD_HASH_LABEL = "dimos.build.hash" - -# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) -DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" - -# Footer appended to Dockerfiles for DimOS module conversion -DIMOS_FOOTER = f""" -# ==== {DIMOS_SENTINEL} ==== -# Copy DimOS source from build context -COPY dimos /dimos/source/dimos/ -COPY pyproject.toml /dimos/source/ -COPY docker/python/module-install.sh /tmp/module-install.sh - -# Install DimOS and create entrypoint -RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh - 
-ENTRYPOINT ["/dimos/entrypoint.sh"] -""" - - -def _convert_dockerfile(dockerfile: Path) -> Path: - """Append DimOS footer to Dockerfile. Returns path to converted file.""" - content = dockerfile.read_text() - - # Already converted? - if DIMOS_SENTINEL in content: - return dockerfile - - logger.info(f"Converting {dockerfile.name} to DimOS format") - - converted = dockerfile.parent / f".{dockerfile.name}.ignore" - converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) - return converted - - -def _compute_build_hash(cfg: DockerModuleConfig) -> str: - """Hash Dockerfile contents and build args.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for computing build hash") - digest = hashlib.sha256() - digest.update(cfg.docker_file.read_bytes()) - for key, val in sorted(cfg.docker_build_args.items()): - digest.update(f"{key}={val}".encode()) - for arg in cfg.docker_build_extra_args: - digest.update(arg.encode()) - return digest.hexdigest() - - -def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: - """Read the build hash label from an existing Docker image.""" - r = subprocess.run( - [ - cfg.docker_bin, - "image", - "inspect", - "-f", - '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', - cfg.docker_image, - ], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - if r.returncode != 0: - return None - value = r.stdout.strip() - # docker prints "" when the label is missing - return value if value and value != "" else None - - -def build_image(cfg: DockerModuleConfig) -> None: - """Build Docker image using footer mode conversion.""" - if cfg.docker_file is None: - raise ValueError("docker_file is required for building Docker images") - - build_hash = _compute_build_hash(cfg) - dockerfile = _convert_dockerfile(cfg.docker_file) - - context = cfg.docker_build_context or cfg.docker_file.parent - cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] - 
cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) - for k, v in cfg.docker_build_args.items(): - cmd.extend(["--build-arg", f"{k}={v}"]) - cmd.extend(cfg.docker_build_extra_args) - cmd.append(str(context)) - - logger.info(f"Building Docker image: {cfg.docker_image}") - # Stream stdout to terminal so the user sees build progress, but capture - # stderr separately so we can include it in the error message on failure. - result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) - if result.returncode != 0: - raise RuntimeError( - f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" - ) - - -def image_exists(cfg: DockerModuleConfig) -> bool: - """Check if the configured Docker image exists locally.""" - r = subprocess.run( - [cfg.docker_bin, "image", "inspect", cfg.docker_image], - capture_output=True, - text=True, - timeout=DOCKER_CMD_TIMEOUT, - check=False, - ) - return r.returncode == 0 - - -# Host-side Docker-backed Module handle - - class DockerModuleOuter(ModuleProxyProtocol): """ Host-side handle for a module running inside Docker. @@ -311,7 +155,7 @@ def __init__(self, module_class: type[Module], *args: Any, **kwargs: Any) -> Non self._args = args self._kwargs = kwargs self._running = threading.Event() - self._is_built = False + self._is_built = ThreadSafeVal(False) self.remote_name = module_class.__name__ # Derive container name from image + class name: "my-registry/foo:v2" → "dimos_myclass_foo_v2" image_ref = config.docker_image.rsplit("/", 1)[-1] @@ -335,7 +179,7 @@ def build(self) -> None: Idempotent — safe to call multiple times. Has no RPC timeout since this runs host-side (not via RPC to a worker process). """ - if self._is_built: + if self._is_built.get(): return config = self.config @@ -386,7 +230,7 @@ def build(self) -> None: # docker run -d returns before Module.__init__ finishes in the container, # so we poll until the RPC server is reachable before returning. 
self._wait_for_rpc() - self._is_built = True + self._is_built.set(True) except Exception: with suppress(Exception): self._cleanup() @@ -675,9 +519,6 @@ def _wait_for_rpc(self) -> None: DockerModule = DockerModuleOuter -# Container-side runner - - class DockerModuleInner: """Runs a module inside Docker container. Blocks until SIGTERM/SIGINT.""" @@ -713,6 +554,159 @@ def wait(self) -> None: self._shutdown.wait() +# --------------------------------------------------------------------------- +# Helpers (private — used by the classes above) +# --------------------------------------------------------------------------- + + +def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: + logger.debug(f"exec: {' '.join(cmd)}") + return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) + + +def _remove_container(cfg: DockerModuleConfig, name: str) -> None: + _run([cfg.docker_bin, "rm", "-f", name], timeout=DOCKER_CMD_TIMEOUT) + + +def _is_container_running(cfg: DockerModuleConfig, name: str) -> bool: + r = _run( + [cfg.docker_bin, "inspect", "-f", "{{.State.Running}}", name], + timeout=DOCKER_STATUS_TIMEOUT, + ) + return r.returncode == 0 and r.stdout.strip() == "true" + + +def _tail_logs(cfg: DockerModuleConfig, name: str, n: int = LOG_TAIL_LINES) -> str: + r = _run([cfg.docker_bin, "logs", "--tail", str(n), name], timeout=DOCKER_CMD_TIMEOUT) + out = (r.stdout or "").rstrip() + err = (r.stderr or "").rstrip() + return out + ("\n" + err if err else "") + + +def _extract_module_config(cfg: DockerModuleConfig) -> dict[str, Any]: + """Extract JSON-serializable config fields for the container (excludes docker_* fields).""" + out: dict[str, Any] = {} + for k, v in cfg.__dict__.items(): + if k.startswith("docker_") or isinstance(v, type) or callable(v): + continue + try: + json.dumps(v) + out[k] = v + except (TypeError, ValueError): + level = "debug" if k.startswith("_") else "warning" + getattr(logger, 
level)(f"Config field '{k}' not JSON-serializable, skipping") + return out + + +_BUILD_HASH_LABEL = "dimos.build.hash" + +# the way of detecting already-converted Dockerfiles (UUID ensures uniqueness) +DIMOS_SENTINEL = "DIMOS-MODULE-CONVERSION-427593ae-c6e8-4cf1-9b2d-ee81a420a5dc" + +# Footer appended to Dockerfiles for DimOS module conversion +DIMOS_FOOTER = f""" +# ==== {DIMOS_SENTINEL} ==== +# Copy DimOS source from build context +COPY dimos /dimos/source/dimos/ +COPY pyproject.toml /dimos/source/ +COPY docker/python/module-install.sh /tmp/module-install.sh + +# Install DimOS and create entrypoint +RUN bash /tmp/module-install.sh /dimos/source && rm /tmp/module-install.sh + +ENTRYPOINT ["/dimos/entrypoint.sh"] +""" + + +def _convert_dockerfile(dockerfile: Path) -> Path: + """Append DimOS footer to Dockerfile. Returns path to converted file.""" + content = dockerfile.read_text() + + # Already converted? + if DIMOS_SENTINEL in content: + return dockerfile + + logger.info(f"Converting {dockerfile.name} to DimOS format") + + converted = dockerfile.parent / f".{dockerfile.name}.ignore" + converted.write_text(content.rstrip() + "\n" + DIMOS_FOOTER.lstrip("\n")) + return converted + + +def _compute_build_hash(cfg: DockerModuleConfig) -> str: + """Hash Dockerfile contents and build args.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for computing build hash") + digest = hashlib.sha256() + digest.update(cfg.docker_file.read_bytes()) + for key, val in sorted(cfg.docker_build_args.items()): + digest.update(f"{key}={val}".encode()) + for arg in cfg.docker_build_extra_args: + digest.update(arg.encode()) + return digest.hexdigest() + + +def _get_image_build_hash(cfg: DockerModuleConfig) -> str | None: + """Read the build hash label from an existing Docker image.""" + r = subprocess.run( + [ + cfg.docker_bin, + "image", + "inspect", + "-f", + '{{index .Config.Labels "' + _BUILD_HASH_LABEL + '"}}', + cfg.docker_image, + ], + capture_output=True, + 
text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + if r.returncode != 0: + return None + value = r.stdout.strip() + # docker prints "" when the label is missing + return value if value and value != "" else None + + +def build_image(cfg: DockerModuleConfig) -> None: + """Build Docker image using footer mode conversion.""" + if cfg.docker_file is None: + raise ValueError("docker_file is required for building Docker images") + + build_hash = _compute_build_hash(cfg) + dockerfile = _convert_dockerfile(cfg.docker_file) + + context = cfg.docker_build_context or cfg.docker_file.parent + cmd = [cfg.docker_bin, "build", "-t", cfg.docker_image, "-f", str(dockerfile)] + cmd.extend(["--label", f"{_BUILD_HASH_LABEL}={build_hash}"]) + for k, v in cfg.docker_build_args.items(): + cmd.extend(["--build-arg", f"{k}={v}"]) + cmd.extend(cfg.docker_build_extra_args) + cmd.append(str(context)) + + logger.info(f"Building Docker image: {cfg.docker_image}") + # Stream stdout to terminal so the user sees build progress, but capture + # stderr separately so we can include it in the error message on failure. 
+ result = subprocess.run(cmd, text=True, stderr=subprocess.PIPE) + if result.returncode != 0: + raise RuntimeError( + f"Docker build failed with exit code {result.returncode}\nSTDERR:\n{result.stderr}" + ) + + +def image_exists(cfg: DockerModuleConfig) -> bool: + """Check if the configured Docker image exists locally.""" + r = subprocess.run( + [cfg.docker_bin, "image", "inspect", cfg.docker_image], + capture_output=True, + text=True, + timeout=DOCKER_CMD_TIMEOUT, + check=False, + ) + return r.returncode == 0 + + def _install_signal_handlers(runner: DockerModuleInner) -> None: def shutdown(_sig: int, _frame: Any) -> None: runner.stop() @@ -733,6 +727,10 @@ def _cli_run(payload_json: str) -> None: runner.wait() +# Container-side entrypoint: invoked as `python -m dimos.core.docker_module run --payload '...'` +# by the generated entrypoint.sh inside Docker containers (see docker/python/module-install.sh). +# This is what makes `DockerModuleInner` actually run — without it, containers would have no +# way to bootstrap the module from the JSON payload that `DockerModuleOuter` passes via `docker run`. 
def main(argv: list[str] | None = None) -> None: parser = argparse.ArgumentParser(prog="dimos.core.docker_module") sub = parser.add_subparsers(dest="cmd", required=True) @@ -754,11 +752,11 @@ def main(argv: list[str] | None = None) -> None: __all__ = [ + "DIMOS_FOOTER", "DockerModule", "DockerModuleConfig", "DockerModuleInner", "DockerModuleOuter", - "DIMOS_FOOTER", "build_image", "image_exists", "is_docker_module", diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d70c10035c..7902072570 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -24,7 +24,8 @@ from dimos.core.resource import Resource from dimos.core.worker_manager import WorkerManager from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.resource_monitor.monitor import StatsMonitor @@ -44,6 +45,7 @@ class ModuleCoordinator(Resource): # type: ignore[misc] (it may not do all of that at time of writing but that is the intention/job of this class) - Modules shouldn't be deployed on their own (except for testing) """ + _client: WorkerManager | None = None _global_config: GlobalConfig _n: int | None = None diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 55e96d3b72..982bc656b4 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -27,7 +27,7 @@ import pytest -from dimos.core.docker_module import DockerModuleOuter, DockerModuleConfig, is_docker_module +from dimos.core.docker_module import DockerModuleConfig, DockerModuleOuter, is_docker_module from dimos.core.global_config import global_config from dimos.core.module import Module from dimos.core.module_coordinator import ModuleCoordinator diff --git 
a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 212daa9a49..795401d80e 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -24,7 +24,7 @@ import pytest -from dimos.utils.safe_thread_map import ExceptionGroup +from dimos.utils.typing_utils import ExceptionGroup class TestDockerWorkerManagerPartialFailure: diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager.py index 3cd836b3ed..f12bffac66 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager.py @@ -23,7 +23,8 @@ from dimos.core.rpc_client import RPCClient from dimos.core.worker import Worker from dimos.utils.logging_config import setup_logger -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup logger = setup_logger() diff --git a/dimos/core/docker_worker_manager.py b/dimos/core/worker_manager_docker.py similarity index 94% rename from dimos/core/docker_worker_manager.py rename to dimos/core/worker_manager_docker.py index 94b4e973c8..78bc9928c4 100644 --- a/dimos/core/docker_worker_manager.py +++ b/dimos/core/worker_manager_docker.py @@ -17,7 +17,8 @@ from typing import TYPE_CHECKING, Any from dimos.core.module import ModuleSpec -from dimos.utils.safe_thread_map import ExceptionGroup, safe_thread_map +from dimos.utils.thread_utils import safe_thread_map +from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.docker_module import DockerModuleOuter diff --git a/dimos/utils/safe_thread_map.py b/dimos/utils/safe_thread_map.py deleted file mode 100644 index 514fac2026..0000000000 --- a/dimos/utils/safe_thread_map.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2025-2026 Dimensional Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -from collections.abc import Callable, Sequence -from concurrent.futures import Future, ThreadPoolExecutor, as_completed -import sys -from typing import Any, TypeVar - -if sys.version_info < (3, 11): - - class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 - """Minimal ExceptionGroup polyfill for Python 3.10.""" - - exceptions: tuple[BaseException, ...] - - def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: - super().__init__(message) - self.exceptions = tuple(exceptions) -else: - import builtins - - ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] - -T = TypeVar("T") -R = TypeVar("R") - - -def safe_thread_map( - items: Sequence[T], - fn: Callable[[T], R], - on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] - | None = None, -) -> list[R]: - """Thread-pool map that waits for all items to finish before raising and a cleanup handler - - - Empty *items* → returns ``[]`` immediately. - - All succeed → returns results in input order. - - Any fail → calls ``on_errors(outcomes, successes, errors)`` where - *outcomes* is a list of ``(input, result_or_exception)`` pairs in input - order, *successes* is the list of successful results, and *errors* is - the list of exceptions. If *on_errors* raises, that exception propagates. 
- If *on_errors* returns normally, its return value is returned from - ``safe_thread_map``. If *on_errors* is ``None``, raises an - ``ExceptionGroup``. - - Example:: - - def start_service(name: str) -> Connection: - return connect(name) - - def cleanup( - outcomes: list[tuple[str, Connection | Exception]], - successes: list[Connection], - errors: list[Exception], - ) -> None: - for conn in successes: - conn.close() - raise ExceptionGroup("failed to start services", errors) - - connections = safe_thread_map( - ["db", "cache", "queue"], - start_service, - cleanup, # called only if any start_service() raises - ) - """ - if not items: - return [] - - outcomes: dict[int, R | Exception] = {} - - with ThreadPoolExecutor(max_workers=len(items)) as pool: - futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)} - for fut in as_completed(futures): - idx = futures[fut] - try: - outcomes[idx] = fut.result() - except Exception as e: - outcomes[idx] = e - - # Note: successes/errors are in completion order, not input order. - # This is fine — on_errors only needs them for cleanup, not ordering. - successes: list[R] = [] - errors: list[Exception] = [] - for v in outcomes.values(): - if isinstance(v, Exception): - errors.append(v) - else: - successes.append(v) - - if errors: - if on_errors is not None: - zipped = [(items[i], outcomes[i]) for i in range(len(items))] - return on_errors(zipped, successes, errors) # type: ignore[return-value, no-any-return] - raise ExceptionGroup("safe_thread_map failed", errors) - - return [outcomes[i] for i in range(len(items))] # type: ignore[misc] diff --git a/dimos/utils/test_thread_utils.py b/dimos/utils/test_thread_utils.py new file mode 100644 index 0000000000..07047c6d92 --- /dev/null +++ b/dimos/utils/test_thread_utils.py @@ -0,0 +1,888 @@ +# Copyright 2026 Dimensional Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Exhaustive tests for dimos/utils/thread_utils.py + +Covers: ThreadSafeVal, ModuleThread, AsyncModuleThread, ModuleProcess, safe_thread_map. +Focuses on deadlocks, race conditions, idempotency, and edge cases under load. +""" + +from __future__ import annotations + +import asyncio +import os +import pickle +import sys +import threading +import time +from unittest import mock + +import pytest +from reactivex.disposable import CompositeDisposable + +from dimos.utils.thread_utils import ( + AsyncModuleThread, + ModuleProcess, + ModuleThread, + ThreadSafeVal, + safe_thread_map, +) + +# Helpers: fake ModuleBase for testing ModuleThread / AsyncModuleThread / ModuleProcess + + +class FakeModule: + """Minimal stand-in for ModuleBase — just needs _disposables.""" + + def __init__(self) -> None: + self._disposables = CompositeDisposable() + + def dispose(self) -> None: + self._disposables.dispose() + + +# ThreadSafeVal Tests + + +class TestThreadSafeVal: + def test_basic_get_set(self) -> None: + v = ThreadSafeVal(42) + assert v.get() == 42 + v.set(99) + assert v.get() == 99 + + def test_bool_truthy(self) -> None: + v = ThreadSafeVal(True) + assert bool(v) is True + v.set(False) + assert bool(v) is False + + def test_bool_zero(self) -> None: + v = ThreadSafeVal(0) + assert bool(v) is False + v.set(1) + assert bool(v) is True + + def test_context_manager_returns_value(self) -> None: + v = ThreadSafeVal("hello") + with v as val: 
+ assert val == "hello" + + def test_set_inside_context_manager_no_deadlock(self) -> None: + """The critical test: set() inside a with block must NOT deadlock. + + This was a confirmed bug when using threading.Lock (non-reentrant). + Fixed by using threading.RLock. + """ + v = ThreadSafeVal(0) + result = threading.Event() + + def do_it() -> None: + with v as val: + v.set(val + 1) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! set() inside with block hung" + assert v.get() == 1 + + def test_get_inside_context_manager_no_deadlock(self) -> None: + v = ThreadSafeVal(10) + result = threading.Event() + + def do_it() -> None: + with v: + _ = v.get() + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! get() inside with block hung" + + def test_bool_inside_context_manager_no_deadlock(self) -> None: + v = ThreadSafeVal(True) + result = threading.Event() + + def do_it() -> None: + with v: + _ = bool(v) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Deadlocked! 
bool() inside with block hung" + + def test_context_manager_blocks_other_threads(self) -> None: + """While one thread holds the lock via `with`, others should block on set().""" + v = ThreadSafeVal(0) + gate = threading.Event() + other_started = threading.Event() + other_finished = threading.Event() + + def holder() -> None: + with v: + gate.wait(timeout=5) # hold the lock until signaled + + def setter() -> None: + other_started.set() + v.set(42) # should block until holder releases + other_finished.set() + + t1 = threading.Thread(target=holder) + t2 = threading.Thread(target=setter) + t1.start() + time.sleep(0.05) # let holder acquire lock + t2.start() + other_started.wait(timeout=2) + time.sleep(0.1) + # setter should be blocked + assert not other_finished.is_set(), "set() did not block while lock was held" + gate.set() # release holder + t1.join(timeout=2) + t2.join(timeout=2) + assert other_finished.is_set() + assert v.get() == 42 + + def test_concurrent_increments(self) -> None: + """Many threads doing atomic read-modify-write should not lose updates.""" + v = ThreadSafeVal(0) + n_threads = 50 + n_increments = 100 + + def incrementer() -> None: + for _ in range(n_increments): + with v as val: + v.set(val + 1) + + threads = [threading.Thread(target=incrementer) for _ in range(n_threads)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=10) + assert v.get() == n_threads * n_increments + + def test_concurrent_increments_stress(self) -> None: + """Run the concurrent increment test multiple times to catch races.""" + for _ in range(10): + self.test_concurrent_increments() + + def test_pickle_roundtrip(self) -> None: + v = ThreadSafeVal({"key": [1, 2, 3]}) + data = pickle.dumps(v) + v2 = pickle.loads(data) + assert v2.get() == {"key": [1, 2, 3]} + # Verify the new instance has a working lock + with v2 as val: + v2.set({**val, "new": True}) + assert v2.get()["new"] is True + + def test_repr(self) -> None: + v = ThreadSafeVal("test") + assert 
repr(v) == "ThreadSafeVal('test')" + + def test_dict_type(self) -> None: + v = ThreadSafeVal({"running": False, "count": 0}) + with v as s: + v.set({**s, "running": True}) + assert v.get() == {"running": True, "count": 0} + + def test_string_literal_type(self) -> None: + """Simulates the ModState pattern from module.py.""" + v = ThreadSafeVal("init") + with v as state: + if state == "init": + v.set("started") + assert v.get() == "started" + + with v as state: + if state == "stopped": + pass # no-op + else: + v.set("stopped") + assert v.get() == "stopped" + + def test_nested_with_no_deadlock(self) -> None: + """RLock should allow the same thread to nest with blocks.""" + v = ThreadSafeVal(0) + result = threading.Event() + + def do_it() -> None: + with v: + with v as val2: + v.set(val2 + 1) + result.set() + + t = threading.Thread(target=do_it) + t.start() + t.join(timeout=2) + assert result.is_set(), "Nested with blocks deadlocked!" + + +# ModuleThread Tests + + +class TestModuleThread: + def test_basic_lifecycle(self) -> None: + mod = FakeModule() + ran = threading.Event() + + def target() -> None: + ran.set() + + mt = ModuleThread(module=mod, target=target, name="test-basic") + ran.wait(timeout=2) + assert ran.is_set() + mt.stop() + assert not mt.is_alive + + def test_auto_start(self) -> None: + mod = FakeModule() + started = threading.Event() + mt = ModuleThread(module=mod, target=started.set, name="test-autostart") + started.wait(timeout=2) + assert started.is_set() + mt.stop() + + def test_deferred_start(self) -> None: + mod = FakeModule() + started = threading.Event() + mt = ModuleThread(module=mod, target=started.set, name="test-deferred", start=False) + time.sleep(0.1) + assert not started.is_set() + mt.start() + started.wait(timeout=2) + assert started.is_set() + mt.stop() + + def test_stopping_property(self) -> None: + mod = FakeModule() + saw_stopping = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + while not 
holder[0].stopping: + time.sleep(0.01) + saw_stopping.set() + + mt = ModuleThread(module=mod, target=target, name="test-stopping", start=False) + holder.append(mt) + mt.start() + time.sleep(0.05) + mt.stop() + saw_stopping.wait(timeout=2) + assert saw_stopping.is_set() + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + mt = ModuleThread(module=mod, target=lambda: time.sleep(0.01), name="test-idem") + time.sleep(0.05) + mt.stop() + mt.stop() # second call should not raise + mt.stop() # third call should not raise + + def test_stop_from_managed_thread_no_deadlock(self) -> None: + """The thread calling stop() on itself should not deadlock.""" + mod = FakeModule() + result = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + holder[0].stop() # stop ourselves — should not deadlock + result.set() + + mt = ModuleThread(module=mod, target=target, name="test-self-stop", start=False) + holder.append(mt) + mt.start() + result.wait(timeout=3) + assert result.is_set(), "Deadlocked when thread called stop() on itself" + + def test_dispose_stops_thread(self) -> None: + """Module dispose should stop the thread via the registered Disposable.""" + mod = FakeModule() + running = threading.Event() + holder: list[ModuleThread] = [] + + def target() -> None: + running.set() + while not holder[0].stopping: + time.sleep(0.01) + + mt = ModuleThread(module=mod, target=target, name="test-dispose", start=False) + holder.append(mt) + mt.start() + running.wait(timeout=2) + mod.dispose() + time.sleep(0.1) + assert not mt.is_alive + + def test_concurrent_stop_calls(self) -> None: + """Multiple threads calling stop() concurrently should not crash.""" + mod = FakeModule() + holder: list[ModuleThread] = [] + + def target() -> None: + while not holder[0].stopping: + time.sleep(0.01) + + mt = ModuleThread(module=mod, target=target, name="test-concurrent-stop", start=False) + holder.append(mt) + mt.start() + time.sleep(0.05) + + errors = [] + + def stop_it() 
-> None: + try: + mt.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=5) + assert not errors, f"Concurrent stop() raised: {errors}" + + def test_close_timeout_respected(self) -> None: + """If the thread ignores the stop signal, stop() should return after close_timeout.""" + mod = FakeModule() + bail = threading.Event() + + def stubborn_target() -> None: + bail.wait(timeout=10) # ignores stopping signal, but we can bail it out + + mt = ModuleThread( + module=mod, target=stubborn_target, name="test-timeout", close_timeout=0.2 + ) + start = time.monotonic() + mt.stop() + elapsed = time.monotonic() - start + assert elapsed < 1.0, f"stop() took {elapsed}s, expected ~0.2s" + bail.set() # let the thread exit so conftest thread-leak detector is happy + mt.join(timeout=2) + + def test_stop_concurrent_with_dispose(self) -> None: + """Calling stop() and dispose() concurrently should not crash.""" + for _ in range(20): + mod = FakeModule() + holder: list[ModuleThread] = [] + + def target(h: list[ModuleThread] = holder) -> None: + while not h[0].stopping: + time.sleep(0.001) + + mt = ModuleThread(module=mod, target=target, name="test-stop-dispose", start=False) + holder.append(mt) + mt.start() + time.sleep(0.02) + # Race: stop and dispose from different threads + t1 = threading.Thread(target=mt.stop) + t2 = threading.Thread(target=mod.dispose) + t1.start() + t2.start() + t1.join(timeout=3) + t2.join(timeout=3) + + +# AsyncModuleThread Tests + + +class TestAsyncModuleThread: + def test_creates_loop_and_thread(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + assert amt.loop is not None + assert amt.loop.is_running() + assert amt.is_alive + amt.stop() + assert not amt.is_alive + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + amt.stop() + amt.stop() # should not 
raise + amt.stop() + + def test_dispose_stops_loop(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + assert amt.is_alive + mod.dispose() + time.sleep(0.1) + assert not amt.is_alive + + def test_can_schedule_coroutine(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + result = [] + + async def coro() -> None: + result.append(42) + + future = asyncio.run_coroutine_threadsafe(coro(), amt.loop) + future.result(timeout=2) + assert result == [42] + amt.stop() + + def test_stop_with_pending_work(self) -> None: + """Stop should succeed even with long-running tasks on the loop.""" + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + started = threading.Event() + + async def slow_coro() -> None: + started.set() + await asyncio.sleep(10) + + asyncio.run_coroutine_threadsafe(slow_coro(), amt.loop) + started.wait(timeout=2) + # stop() should not hang waiting for the coroutine + start = time.monotonic() + amt.stop() + elapsed = time.monotonic() - start + assert elapsed < 5.0, f"stop() hung for {elapsed}s with pending coroutine" + + def test_concurrent_stop(self) -> None: + mod = FakeModule() + amt = AsyncModuleThread(module=mod) + errors = [] + + def stop_it() -> None: + try: + amt.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=5) + assert not errors + + +# ModuleProcess Tests + + +# Helper: path to a python that sleeps or echoes +PYTHON = sys.executable + + +class TestModuleProcess: + def test_basic_lifecycle(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + assert mp.is_alive + assert mp.pid is not None + mp.stop() + assert not mp.is_alive + assert mp.pid is None + + def test_stop_idempotent(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + 
args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=1.0, + ) + mp.stop() + mp.stop() # should not raise + mp.stop() + + def test_dispose_stops_process(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + mod.dispose() + time.sleep(0.5) + assert not mp.is_alive + + def test_on_exit_fires_on_natural_exit(self) -> None: + """on_exit should fire when the process exits on its own.""" + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('done')"], + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + assert exit_called.is_set(), "on_exit was not called after natural process exit" + + def test_on_exit_fires_on_crash(self) -> None: + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import sys; sys.exit(1)"], + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + assert exit_called.is_set(), "on_exit was not called after process crash" + + def test_on_exit_not_fired_on_stop(self) -> None: + """on_exit should NOT fire when stop() kills the process.""" + mod = FakeModule() + exit_called = threading.Event() + + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + on_exit=exit_called.set, + shutdown_timeout=2.0, + ) + time.sleep(0.2) # let watchdog start + mp.stop() + time.sleep(1.0) # give watchdog time to potentially fire + assert not exit_called.is_set(), "on_exit fired after intentional stop()" + + def test_stdout_logged(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('hello from subprocess')"], + ) + time.sleep(1.0) # let output be read + mp.stop() + + def test_stderr_logged(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import sys; sys.stderr.write('error msg\\n')"], + ) + 
time.sleep(1.0) + mp.stop() + + def test_log_json_mode(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + """import json; print(json.dumps({"event": "test", "key": "val"}))""", + ], + log_json=True, + ) + time.sleep(1.0) + mp.stop() + + def test_log_json_malformed(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "print('not json')"], + log_json=True, + ) + time.sleep(1.0) + mp.stop() + + def test_stop_process_that_ignores_sigterm(self) -> None: + """Process that ignores SIGTERM should be killed with SIGKILL.""" + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + "import signal, time; signal.signal(signal.SIGTERM, signal.SIG_IGN); time.sleep(60)", + ], + shutdown_timeout=0.5, + kill_timeout=2.0, + ) + time.sleep(0.2) + start = time.monotonic() + mp.stop() + elapsed = time.monotonic() - start + assert not mp.is_alive + # Should take roughly shutdown_timeout (0.5) + a bit for SIGKILL + assert elapsed < 5.0 + + def test_stop_already_dead_process(self) -> None: + """stop() on a process that already exited should not raise.""" + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], # exits immediately + ) + time.sleep(1.0) # let it die + mp.stop() # should not raise + + def test_concurrent_stop(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + shutdown_timeout=2.0, + ) + errors = [] + + def stop_it() -> None: + try: + mp.stop() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=stop_it) for _ in range(20)] + for t in threads: + t.start() + for t in threads: + t.join(timeout=10) + assert not errors, f"Concurrent stop() raised: {errors}" + + def test_on_exit_calls_module_stop_no_deadlock(self) -> None: + """Simulate the real pattern: on_exit=module.stop, which disposes the + ModuleProcess, which tries to 
stop its watchdog from inside the watchdog. + Must not deadlock. + """ + mod = FakeModule() + stop_called = threading.Event() + + def fake_module_stop() -> None: + """Simulates module.stop() -> _stop() -> dispose()""" + mod.dispose() + stop_called.set() + + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], # exits immediately + on_exit=fake_module_stop, + ) + stop_called.wait(timeout=5) + assert stop_called.is_set(), "Deadlocked! on_exit -> dispose -> stop chain hung" + + def test_on_exit_calls_module_stop_no_deadlock_stress(self) -> None: + """Run the deadlock test multiple times under load.""" + for _i in range(10): + self.test_on_exit_calls_module_stop_no_deadlock() + + def test_deferred_start(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(30)"], + start=False, + ) + assert not mp.is_alive + mp.start() + assert mp.is_alive + mp.stop() + + def test_env_passed(self) -> None: + mod = FakeModule() + exit_called = threading.Event() + + ModuleProcess( + module=mod, + args=[ + PYTHON, + "-c", + "import os, sys; sys.exit(0 if os.environ.get('MY_VAR') == '42' else 1)", + ], + env={**os.environ, "MY_VAR": "42"}, + on_exit=exit_called.set, + ) + exit_called.wait(timeout=5) + # Process should have exited with 0 (our on_exit fires for all unmanaged exits) + assert exit_called.is_set() + + def test_cwd_passed(self) -> None: + mod = FakeModule() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import os; print(os.getcwd())"], + cwd="/tmp", + ) + time.sleep(1.0) + mp.stop() + + +# safe_thread_map Tests + + +class TestSafeThreadMap: + def test_empty_input(self) -> None: + assert safe_thread_map([], lambda x: x) == [] + + def test_all_succeed(self) -> None: + result = safe_thread_map([1, 2, 3], lambda x: x * 2) + assert result == [2, 4, 6] + + def test_preserves_order(self) -> None: + def slow(x: int) -> int: + time.sleep(0.01 * (10 - x)) + return x + + result = 
safe_thread_map(list(range(10)), slow) + assert result == list(range(10)) + + def test_all_fail_raises_exception_group(self) -> None: + def fail(x: int) -> int: + raise ValueError(f"fail-{x}") + + with pytest.raises(ExceptionGroup) as exc_info: + safe_thread_map([1, 2, 3], fail) + assert len(exc_info.value.exceptions) == 3 + + def test_partial_failure(self) -> None: + def maybe_fail(x: int) -> int: + if x == 2: + raise ValueError("fail") + return x + + with pytest.raises(ExceptionGroup) as exc_info: + safe_thread_map([1, 2, 3], maybe_fail) + assert len(exc_info.value.exceptions) == 1 + + def test_on_errors_callback(self) -> None: + def fail(x: int) -> int: + if x == 2: + raise ValueError("boom") + return x * 10 + + cleanup_called = False + + def on_errors(outcomes, successes, errors): + nonlocal cleanup_called + cleanup_called = True + assert len(errors) == 1 + assert len(successes) == 2 + return successes # return successful results + + result = safe_thread_map([1, 2, 3], fail, on_errors) + assert cleanup_called + assert sorted(result) == [10, 30] + + def test_on_errors_can_raise(self) -> None: + def fail(x: int) -> int: + raise ValueError("boom") + + def on_errors(outcomes, successes, errors): + raise RuntimeError("custom error") + + with pytest.raises(RuntimeError, match="custom error"): + safe_thread_map([1], fail, on_errors) + + def test_waits_for_all_before_raising(self) -> None: + """Even if one fails fast, all others should complete.""" + completed = [] + + def work(x: int) -> int: + if x == 0: + raise ValueError("fast fail") + time.sleep(0.2) + completed.append(x) + return x + + with pytest.raises(ExceptionGroup): + safe_thread_map([0, 1, 2, 3], work) + # All non-failing items should have completed + assert sorted(completed) == [1, 2, 3] + + +# Integration: ModuleProcess on_exit -> dispose chain (the CI bug scenario) + + +class TestModuleProcessDisposeChain: + """Tests the exact pattern that caused the CI bug: + process exits -> watchdog fires on_exit -> 
module.stop() -> dispose -> + ModuleProcess.stop() -> tries to stop watchdog from inside watchdog thread. + """ + + @staticmethod + def _make_fake_stop(mod: FakeModule, done: threading.Event) -> Callable: + def fake_stop() -> None: + mod.dispose() + done.set() + + return fake_stop + + def test_chain_no_deadlock_fast_exit(self) -> None: + """Process exits immediately.""" + for _ in range(20): + mod = FakeModule() + done = threading.Event() + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], + on_exit=self._make_fake_stop(mod, done), + ) + assert done.wait(timeout=5), "Deadlock in dispose chain (fast exit)" + + def test_chain_no_deadlock_slow_exit(self) -> None: + """Process runs briefly then exits.""" + for _ in range(10): + mod = FakeModule() + done = threading.Event() + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(0.1)"], + on_exit=self._make_fake_stop(mod, done), + ) + assert done.wait(timeout=5), "Deadlock in dispose chain (slow exit)" + + def test_chain_concurrent_with_external_stop(self) -> None: + """Process exits naturally while external code calls stop().""" + for _ in range(20): + mod = FakeModule() + done = threading.Event() + mp = ModuleProcess( + module=mod, + args=[PYTHON, "-c", "import time; time.sleep(0.05)"], + on_exit=self._make_fake_stop(mod, done), + shutdown_timeout=1.0, + ) + # Race: the process might exit naturally or we might stop it + time.sleep(0.03) + mp.stop() + # Either way, should not deadlock + time.sleep(1.0) + + def test_dispose_with_artificial_delay(self) -> None: + """Add artificial delay near cleanup to simulate heavy CPU load.""" + original_stop = ModuleThread.stop + + def slow_stop(self_mt: ModuleThread) -> None: + time.sleep(0.05) # simulate load + original_stop(self_mt) + + for _ in range(10): + mod = FakeModule() + done = threading.Event() + with mock.patch.object(ModuleThread, "stop", slow_stop): + ModuleProcess( + module=mod, + args=[PYTHON, "-c", "pass"], + 
# Copyright 2025-2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Thread utilities: safe values, managed threads, safe parallel map."""

from __future__ import annotations

import asyncio
import collections
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
import json
import signal
import subprocess
import threading
from typing import IO, TYPE_CHECKING, Any, Generic

from reactivex.disposable import Disposable

from dimos.utils.logging_config import setup_logger
from dimos.utils.typing_utils import ExceptionGroup, TypeVar

logger = setup_logger()

if TYPE_CHECKING:
    from collections.abc import Callable, Sequence

    from dimos.core.module import ModuleBase

T = TypeVar("T")
R = TypeVar("R")


# ThreadSafeVal: a lock-protected value with context-manager support


class ThreadSafeVal(Generic[T]):
    """A thread-safe value wrapper.

    Wraps any value with a lock and provides atomic read-modify-write
    via a context manager::

        counter = ThreadSafeVal(0)

        # Simple get/set (each acquires the lock briefly):
        counter.set(10)
        print(counter.get())  # 10

        # Atomic read-modify-write:
        with counter as value:
            # Lock is held for the entire block.
            # Other threads block on get/set/with until this exits.
            if value < 100:
                counter.set(value + 1)

        # Works with any type:
        status = ThreadSafeVal({"running": False, "count": 0})
        with status as s:
            status.set({**s, "running": True})

        # Bool check (for flag-like usage):
        stopping = ThreadSafeVal(False)
        stopping.set(True)
        if stopping:
            print("stopping!")
    """

    def __init__(self, initial: T) -> None:
        # RLock (not Lock): set()/get() must be callable from inside a
        # `with self` block on the same thread without deadlocking.
        self._lock = threading.RLock()
        self._value = initial

    def get(self) -> T:
        """Return the current value (acquires the lock briefly)."""
        with self._lock:
            return self._value

    def set(self, value: T) -> None:
        """Replace the value (acquires the lock briefly)."""
        with self._lock:
            self._value = value

    def __bool__(self) -> bool:
        with self._lock:
            return bool(self._value)

    def __enter__(self) -> T:
        # Hold the lock for the whole `with` block; released in __exit__.
        self._lock.acquire()
        return self._value

    def __exit__(self, *exc: object) -> None:
        self._lock.release()

    def __getstate__(self) -> dict[str, Any]:
        # Locks are not picklable; persist only the value.
        return {"_value": self._value}

    def __setstate__(self, state: dict[str, Any]) -> None:
        # Recreate a fresh lock on unpickle.
        self._lock = threading.RLock()
        self._value = state["_value"]

    def __repr__(self) -> str:
        # Fix: take the lock like every other accessor — an unlocked read
        # could observe a torn/mid-update value from another thread.
        with self._lock:
            return f"ThreadSafeVal({self._value!r})"


# ModuleThread: a thread that auto-registers with a module's disposables


class ModuleThread:
    """A thread that registers cleanup with a module's disposables.

    Passes most kwargs through to ``threading.Thread``. On construction,
    registers a disposable with the module so that when the module stops,
    the thread is automatically joined. Cleanup is idempotent — safe to
    call ``stop()`` manually even if the module also disposes it.

    Example::

        class MyModule(Module):
            @rpc
            def start(self) -> None:
                self._worker = ModuleThread(
                    module=self,
                    target=self._run_loop,
                    name="my-worker",
                )

            def _run_loop(self) -> None:
                while not self._worker.stopping:
                    do_work()
    """

    def __init__(
        self,
        module: ModuleBase[Any],
        *,
        start: bool = True,
        close_timeout: float = 2.0,
        **thread_kwargs: Any,
    ) -> None:
        # Daemon by default so a leaked thread never blocks interpreter exit.
        thread_kwargs.setdefault("daemon", True)
        self._thread = threading.Thread(**thread_kwargs)
        self._stop_event = threading.Event()
        self._close_timeout = close_timeout
        self._stopped = False
        self._stop_lock = threading.Lock()
        # Module dispose -> Disposable fires -> self.stop() (idempotent).
        module._disposables.add(Disposable(self.stop))
        if start:
            self.start()

    @property
    def stopping(self) -> bool:
        """True after ``stop()`` has been called."""
        return self._stop_event.is_set()

    def start(self) -> None:
        """Start the underlying thread."""
        self._stop_event.clear()
        self._thread.start()

    def stop(self) -> None:
        """Signal the thread to stop and join it.

        Safe to call multiple times, from any thread (including the
        managed thread itself — it will skip the join in that case).
        """
        # Only the lock guards the idempotency flag; the join below runs
        # outside the lock so concurrent stop() callers don't serialize on it.
        with self._stop_lock:
            if self._stopped:
                return
            self._stopped = True

        self._stop_event.set()
        # Joining the current thread would deadlock — skip the join when the
        # managed thread stops itself.
        if self._thread.is_alive() and self._thread is not threading.current_thread():
            self._thread.join(timeout=self._close_timeout)

    def join(self, timeout: float | None = None) -> None:
        """Join the underlying thread."""
        self._thread.join(timeout=timeout)

    @property
    def is_alive(self) -> bool:
        return self._thread.is_alive()
Otherwise creates a new loop and drives it in a daemon thread. + + On stop (or module dispose), the loop is shut down gracefully and the + thread is joined. Idempotent — safe to call ``stop()`` multiple times. + + Example:: + + class MyModule(Module): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._async = AsyncModuleThread(module=self) + + @rpc + def start(self) -> None: + future = asyncio.run_coroutine_threadsafe( + self._do_work(), self._async.loop + ) + + async def _do_work(self) -> None: + ... + """ + + def __init__( + self, + module: ModuleBase[Any], + *, + close_timeout: float = 2.0, + ) -> None: + self._close_timeout = close_timeout + self._stopped = False + self._stop_lock = threading.Lock() + self._owns_loop = False + self._thread: threading.Thread | None = None + + try: + self._loop = asyncio.get_running_loop() + except RuntimeError: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + self._owns_loop = True + self._thread = threading.Thread( + target=self._loop.run_forever, + daemon=True, + name=f"{type(module).__name__}-event-loop", + ) + self._thread.start() + + module._disposables.add(Disposable(self.stop)) + + @property + def loop(self) -> asyncio.AbstractEventLoop: + """The managed event loop.""" + return self._loop + + @property + def is_alive(self) -> bool: + return self._thread is not None and self._thread.is_alive() + + def stop(self) -> None: + """Stop the event loop and join the thread. + + No-op if the loop was not created by this instance (reused an + existing running loop). Safe to call multiple times. 
+ """ + with self._stop_lock: + if self._stopped: + return + self._stopped = True + + if self._owns_loop and self._loop.is_running(): + self._loop.call_soon_threadsafe(self._loop.stop) + + if self._thread is not None and self._thread.is_alive(): + self._thread.join(timeout=self._close_timeout) + + +# ModuleProcess: managed subprocess with log piping, auto-registered cleanup + + +class ModuleProcess: + """A managed subprocess that pipes stdout/stderr through the logger. + + Registers with a module's disposables so the process is automatically + stopped on module teardown. A watchdog thread monitors the process and + calls ``on_exit`` if the process exits on its own (i.e. not via + ``ModuleProcess.stop()``). + + Most constructor kwargs mirror ``subprocess.Popen``. ``stdout`` and + ``stderr`` are always captured (set to ``PIPE`` internally). + + Example:: + + class MyModule(Module): + @rpc + def start(self) -> None: + self._proc = ModuleProcess( + module=self, + args=["./my_binary", "--flag"], + cwd="/opt/bin", + on_exit=self.stop, # stops the whole module if process exits on its own + ) + + @rpc + def stop(self) -> None: + super().stop() + """ + + def __init__( + self, + module: ModuleBase[Any], + args: list[str] | str, + *, + env: dict[str, str] | None = None, + cwd: str | None = None, + shell: bool = False, + on_exit: Callable[[], Any] | None = None, + shutdown_timeout: float = 10.0, + kill_timeout: float = 5.0, + log_json: bool = False, + log_tail_lines: int = 50, + start: bool = True, + **popen_kwargs: Any, + ) -> None: + self._args = args + self._env = env + self._cwd = cwd + self._shell = shell + self._on_exit = on_exit + self._shutdown_timeout = shutdown_timeout + self._kill_timeout = kill_timeout + self._log_json = log_json + self._log_tail_lines = log_tail_lines + self._popen_kwargs = popen_kwargs + self._process: subprocess.Popen[bytes] | None = None + self._watchdog: ModuleThread | None = None + self._module = module + self._stopped = False + 
self._stop_lock = threading.Lock() + self.last_stdout: collections.deque[str] = collections.deque(maxlen=log_tail_lines) + self.last_stderr: collections.deque[str] = collections.deque(maxlen=log_tail_lines) + + module._disposables.add(Disposable(self.stop)) + if start: + self.start() + + @property + def pid(self) -> int | None: + return self._process.pid if self._process is not None else None + + @property + def returncode(self) -> int | None: + if self._process is None: + return None + return self._process.poll() + + @property + def is_alive(self) -> bool: + return self._process is not None and self._process.poll() is None + + def start(self) -> None: + """Launch the subprocess and start the watchdog.""" + if self._process is not None and self._process.poll() is None: + logger.warning("Process already running", pid=self._process.pid) + return + + with self._stop_lock: + self._stopped = False + + self.last_stdout = collections.deque(maxlen=self._log_tail_lines) + self.last_stderr = collections.deque(maxlen=self._log_tail_lines) + + logger.info( + "Starting process", + cmd=self._args if isinstance(self._args, str) else " ".join(self._args), + cwd=self._cwd, + ) + self._process = subprocess.Popen( + self._args, + env=self._env, + cwd=self._cwd, + shell=self._shell, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **self._popen_kwargs, + ) + logger.info("Process started", pid=self._process.pid) + + self._watchdog = ModuleThread( + module=self._module, + target=self._watch, + name=f"proc-{self._process.pid}-watchdog", + ) + + def stop(self) -> None: + """Send SIGTERM, wait, escalate to SIGKILL if needed. 
Idempotent.""" + with self._stop_lock: + if self._stopped: + return + self._stopped = True + + if self._process is not None and self._process.poll() is None: + logger.info("Stopping process", pid=self._process.pid) + try: + self._process.send_signal(signal.SIGTERM) + except OSError: + pass # process already dead (PID recycled or exited between poll and signal) + else: + try: + self._process.wait(timeout=self._shutdown_timeout) + except subprocess.TimeoutExpired: + logger.warning( + "Process did not exit, sending SIGKILL", + pid=self._process.pid, + ) + self._process.kill() + try: + self._process.wait(timeout=self._kill_timeout) + except subprocess.TimeoutExpired: + logger.error( + "Process did not exit after SIGKILL", + pid=self._process.pid, + ) + self._process = None + + def _watch(self) -> None: + """Watchdog: pipe logs, detect crashes.""" + proc = self._process + if proc is None: + return + + stdout_t = self._start_reader(proc.stdout, "info") + stderr_t = self._start_reader(proc.stderr, "warning") + rc = proc.wait() + stdout_t.join(timeout=2) + stderr_t.join(timeout=2) + + with self._stop_lock: + if self._stopped: + return + + last_stdout = "\n".join(self.last_stdout) or None + last_stderr = "\n".join(self.last_stderr) or None + logger.error( + "Process died unexpectedly", + pid=proc.pid, + returncode=rc, + last_stdout=last_stdout, + last_stderr=last_stderr, + ) + if self._on_exit is not None: + self._on_exit() + + def _start_reader(self, stream: IO[bytes] | None, level: str) -> threading.Thread: + t = threading.Thread(target=self._read_stream, args=(stream, level), daemon=True) + t.start() + return t + + def _read_stream(self, stream: IO[bytes] | None, level: str) -> None: + if stream is None: + return + log_fn = getattr(logger, level) + is_stderr = level == "warning" + buf = self.last_stderr if is_stderr else self.last_stdout + for raw in stream: + line = raw.decode("utf-8", errors="replace").rstrip() + if not line: + continue + buf.append(line) + if 
self._log_json: + try: + data = json.loads(line) + event = data.pop("event", line) + log_fn(event, **data) + continue + except (json.JSONDecodeError, TypeError): + logger.warning("malformed JSON from process", raw=line) + proc = self._process + log_fn(line, pid=proc.pid if proc else None) + stream.close() + + +# safe_thread_map: parallel map that collects all results before raising + + +def safe_thread_map( + items: Sequence[T], + fn: Callable[[T], R], + on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any] + | None = None, +) -> list[R]: + """Thread-pool map that waits for all items to finish before raising and a cleanup handler + + - Empty *items* → returns ``[]`` immediately. + - All succeed → returns results in input order. + - Any fail → calls ``on_errors(outcomes, successes, errors)`` where + *outcomes* is a list of ``(input, result_or_exception)`` pairs in input + order, *successes* is the list of successful results, and *errors* is + the list of exceptions. If *on_errors* raises, that exception propagates. + If *on_errors* returns normally, its return value is returned from + ``safe_thread_map``. If *on_errors* is ``None``, raises an + ``ExceptionGroup``. 
def safe_thread_map(
    items: Sequence[T],
    fn: Callable[[T], R],
    on_errors: Callable[[list[tuple[T, R | Exception]], list[R], list[Exception]], Any]
    | None = None,
) -> list[R]:
    """Thread-pool map that waits for all items to finish before raising,
    with an optional cleanup handler.

    - Empty *items* → returns ``[]`` immediately.
    - All succeed → returns results in input order.
    - Any fail → calls ``on_errors(outcomes, successes, errors)`` where
      *outcomes* is a list of ``(input, result_or_exception)`` pairs in input
      order, *successes* is the list of successful results, and *errors* is
      the list of exceptions (both also in input order). If *on_errors*
      raises, that exception propagates. If *on_errors* returns normally,
      its return value is returned from ``safe_thread_map``. If *on_errors*
      is ``None``, raises an ``ExceptionGroup``.

    Example::

        def start_service(name: str) -> Connection:
            return connect(name)

        def cleanup(
            outcomes: list[tuple[str, Connection | Exception]],
            successes: list[Connection],
            errors: list[Exception],
        ) -> None:
            for conn in successes:
                conn.close()
            raise ExceptionGroup("failed to start services", errors)

        connections = safe_thread_map(
            ["db", "cache", "queue"],
            start_service,
            cleanup,  # called only if any start_service() raises
        )
    """
    if not items:
        return []

    outcomes: dict[int, R | Exception] = {}

    # One worker per item: every fn() runs concurrently and we always wait
    # for all of them, even when some fail early.
    with ThreadPoolExecutor(max_workers=len(items)) as pool:
        futures: dict[Future[R], int] = {pool.submit(fn, item): i for i, item in enumerate(items)}
        for fut in as_completed(futures):
            idx = futures[fut]
            try:
                outcomes[idx] = fut.result()
            except Exception as e:
                outcomes[idx] = e

    # Classify in input order. (Iterating outcomes.values() here would yield
    # completion order, making successes/errors nondeterministic across runs.)
    successes: list[R] = []
    errors: list[Exception] = []
    for i in range(len(items)):
        v = outcomes[i]
        if isinstance(v, Exception):
            errors.append(v)
        else:
            successes.append(v)

    if errors:
        if on_errors is not None:
            zipped = [(items[i], outcomes[i]) for i in range(len(items))]
            return on_errors(zipped, successes, errors)  # type: ignore[return-value, no-any-return]
        raise ExceptionGroup("safe_thread_map failed", errors)

    return [outcomes[i] for i in range(len(items))]  # type: ignore[misc]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unify typing compatibility across multiple Python versions.""" + +from __future__ import annotations + +from collections.abc import Sequence +import sys + +if sys.version_info < (3, 13): + from typing_extensions import TypeVar +else: + from typing import TypeVar + +if sys.version_info < (3, 11): + + class ExceptionGroup(Exception): # type: ignore[no-redef] # noqa: N818 + """Minimal ExceptionGroup polyfill for Python 3.10.""" + + exceptions: tuple[BaseException, ...] + + def __init__(self, message: str, exceptions: Sequence[BaseException]) -> None: + super().__init__(message) + self.exceptions = tuple(exceptions) +else: + import builtins + + ExceptionGroup = builtins.ExceptionGroup # type: ignore[misc] + +__all__ = [ + "ExceptionGroup", + "TypeVar", +] From 0bff0cf0303dba93d1c7a8c1456406d3e38c133e Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 18:59:28 -0700 Subject: [PATCH 85/89] proper design of WorkerManagers --- dimos/core/blueprints.py | 2 +- dimos/core/docker_module.py | 5 - dimos/core/global_config.py | 1 + dimos/core/module_coordinator.py | 157 ++++++------------ dimos/core/rpc_client.py | 2 +- dimos/core/test_daemon.py | 49 +++--- dimos/core/test_e2e_daemon.py | 24 ++- dimos/core/test_worker.py | 9 +- dimos/core/tests/test_docker_deployment.py | 112 +++++++------ .../tests/test_parallel_deploy_cleanup.py | 35 ++-- dimos/core/worker_manager_docker.py | 64 +++++-- ...er_manager.py => worker_manager_python.py} | 78 +++++++-- dimos/core/{worker.py => worker_python.py} | 0 .../sensors/camera/realsense/camera.py | 2 +- 
dimos/hardware/sensors/camera/zed/camera.py | 2 +- dimos/robot/cli/dimos.py | 3 +- dimos/robot/unitree/b1/unitree_b1.py | 2 +- dimos/utils/demo_image_encoding.py | 2 +- 18 files changed, 287 insertions(+), 262 deletions(-) rename dimos/core/{worker_manager.py => worker_manager_python.py} (62%) rename dimos/core/{worker.py => worker_python.py} (100%) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index 8f9d59182d..314724386d 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -485,7 +485,7 @@ def build( self._verify_no_name_conflicts() logger.info("Starting the modules") - module_coordinator = ModuleCoordinator(cfg=global_config) + module_coordinator = ModuleCoordinator(g=global_config) module_coordinator.start() # all module constructors are called here (each of them setup their own) diff --git a/dimos/core/docker_module.py b/dimos/core/docker_module.py index dc0ffd533f..8cf01c41af 100644 --- a/dimos/core/docker_module.py +++ b/dimos/core/docker_module.py @@ -554,11 +554,6 @@ def wait(self) -> None: self._shutdown.wait() -# --------------------------------------------------------------------------- -# Helpers (private — used by the classes above) -# --------------------------------------------------------------------------- - - def _run(cmd: list[str], *, timeout: float | None = None) -> subprocess.CompletedProcess[str]: logger.debug(f"exec: {' '.join(cmd)}") return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, check=False) diff --git a/dimos/core/global_config.py b/dimos/core/global_config.py index 90461932a2..5a5f7ba7bc 100644 --- a/dimos/core/global_config.py +++ b/dimos/core/global_config.py @@ -38,6 +38,7 @@ class GlobalConfig(BaseSettings): new_memory: bool = False viewer: ViewerBackend = "rerun" n_workers: int = 2 + worker_to_module_ratio: float = 1.0 memory_limit: str = "auto" mujoco_camera_position: str | None = None mujoco_room: str | None = None diff --git a/dimos/core/module_coordinator.py 
b/dimos/core/module_coordinator.py index 7902072570..d1020e61bd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -18,19 +18,17 @@ import threading from typing import TYPE_CHECKING, Any -from dimos.core.docker_worker_manager import DockerWorkerManager from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.resource import Resource -from dimos.core.worker_manager import WorkerManager +from dimos.core.worker_manager_docker import WorkerManagerDocker +from dimos.core.worker_manager_python import WorkerManagerPython from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: - from dimos.core.resource_monitor.monitor import StatsMonitor from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol - from dimos.core.worker import Worker logger = setup_logger() @@ -46,88 +44,59 @@ class ModuleCoordinator(Resource): # type: ignore[misc] - Modules shouldn't be deployed on their own (except for testing) """ - _client: WorkerManager | None = None + _managers: list[WorkerManagerDocker | WorkerManagerPython] _global_config: GlobalConfig - _n: int | None = None - _memory_limit: str = "auto" _deployed_modules: dict[type[ModuleBase], ModuleProxyProtocol] - _stats_monitor: StatsMonitor | None = None def __init__( self, - n: int | None = None, - cfg: GlobalConfig = global_config, + g: GlobalConfig = global_config, ) -> None: - self._n = n if n is not None else cfg.n_workers - self._memory_limit = cfg.memory_limit - self._global_config = cfg + self._global_config = g + self._managers = [] self._deployed_modules = {} - @property - def workers(self) -> list[Worker]: - """Active worker processes.""" - if self._client is None: - return [] - return self._client.workers - - @property - def n_workers(self) -> int: - """Number of active workers.""" - return 
len(self.workers) + def start(self) -> None: + self._managers = [ + WorkerManagerDocker(g=self._global_config), + WorkerManagerPython(g=self._global_config), + ] + for m in self._managers: + m.start() + + def _find_manager( + self, module_class: type[ModuleBase[Any]] + ) -> WorkerManagerDocker | WorkerManagerPython: + for m in self._managers: + if m.should_manage(module_class): + return m + raise ValueError(f"No manager found for {module_class.__name__}") def health_check(self) -> bool: - """Verify all workers are alive after build. - - Since ``blueprint.build()`` is synchronous, every module should be - started by the time this runs. We just confirm no worker has died. - """ - if self.n_workers == 0: - logger.error("health_check: no workers found") - return False - - for w in self.workers: - if w.pid is None: - logger.error("health_check: worker died", worker_id=w.worker_id) - return False - - return True + return all(m.health_check() for m in self._managers) @property def n_modules(self) -> int: - """Number of deployed modules.""" return len(self._deployed_modules) def suppress_console(self) -> None: - """Silence console output in all worker processes.""" - if self._client is not None: - self._client.suppress_console() - - def start(self) -> None: - n = self._n if self._n is not None else 2 - self._client = WorkerManager(n_workers=n) - self._client.start() - - if self._global_config.dtop: - from dimos.core.resource_monitor.monitor import StatsMonitor - - self._stats_monitor = StatsMonitor(self._client) - self._stats_monitor.start() + for m in self._managers: + m.suppress_console() def stop(self) -> None: - if self._stats_monitor is not None: - self._stats_monitor.stop() - self._stats_monitor = None - for module_class, module in reversed(self._deployed_modules.items()): logger.info("Stopping module...", module=module_class.__name__) - try: + with suppress(Exception): module.stop() - except Exception: - logger.error("Error stopping module", 
module=module_class.__name__, exc_info=True) logger.info("Module stopped.", module=module_class.__name__) - if self._client is not None: - self._client.close_all() + def _stop_manager(m: WorkerManagerDocker | WorkerManagerPython) -> None: + try: + m.stop() + except Exception: + logger.error("Error stopping manager", manager=type(m).__name__, exc_info=True) + + safe_thread_map(self._managers, _stop_manager) def deploy( self, @@ -135,58 +104,34 @@ def deploy( global_config: GlobalConfig = global_config, **kwargs: Any, ) -> ModuleProxy: - # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator - from dimos.core.docker_module import DockerModuleOuter, is_docker_module - - if not self._client: + if not self._managers: raise ValueError("Trying to dimos.deploy before the client has started") - deployed_module: ModuleProxyProtocol - if is_docker_module(module_class): - deployed_module = DockerModuleOuter(module_class, g=global_config, **kwargs) # type: ignore[arg-type] - else: - deployed_module = self._client.deploy(module_class, global_config, kwargs) + manager = self._find_manager(module_class) + deployed_module = manager.deploy(module_class, global_config, kwargs) self._deployed_modules[module_class] = deployed_module # type: ignore[assignment] return deployed_module # type: ignore[return-value] def deploy_parallel(self, module_specs: list[ModuleSpec]) -> list[ModuleProxy]: - # Inline to avoid circular import: module_coordinator → docker_module → module → blueprints → module_coordinator - from dimos.core.docker_module import is_docker_module - - if not self._client: + if not self._managers: raise ValueError("Not started") - # Split by type, tracking original indices for reassembly - docker_indices: list[int] = [] - worker_indices: list[int] = [] - docker_specs: list[ModuleSpec] = [] - worker_specs: list[ModuleSpec] = [] - for i, spec in enumerate(module_specs): - if is_docker_module(spec[0]): - 
docker_indices.append(i) - docker_specs.append(spec) - else: - worker_indices.append(i) - worker_specs.append(spec) - - # Deploy worker and docker modules in parallel. - results: list[Any] = [None] * len(module_specs) + # Group specs by manager, tracking original indices for reassembly + groups: dict[int, WorkerManagerDocker | WorkerManagerPython] = {} + indices_by_manager: dict[int, list[int]] = {} + specs_by_manager: dict[int, list[ModuleSpec]] = {} + for index, spec in enumerate(module_specs): + manager = self._find_manager(spec[0]) + mid = id(manager) + groups.setdefault(mid, manager) + indices_by_manager.setdefault(mid, []).append(index) + specs_by_manager.setdefault(mid, []).append(spec) - def _deploy_workers() -> None: - if not worker_specs: - return - assert self._client is not None - for index, module in zip( - worker_indices, self._client.deploy_parallel(worker_specs), strict=True - ): - results[index] = module + results: list[Any] = [None] * len(module_specs) - def _deploy_docker() -> None: - if not docker_specs: - return - for index, module in zip( - docker_indices, DockerWorkerManager.deploy_parallel(docker_specs), strict=True - ): + def _deploy_group(mid: int) -> None: + deployed = groups[mid].deploy_parallel(specs_by_manager[mid]) + for index, module in zip(indices_by_manager[mid], deployed, strict=True): results[index] = module def _register() -> None: @@ -200,7 +145,7 @@ def _on_errors( _register() raise ExceptionGroup("deploy_parallel failed", errors) - safe_thread_map([_deploy_workers, _deploy_docker], lambda fn: fn(), _on_errors) + safe_thread_map(list(groups.keys()), _deploy_group, _on_errors) _register() return results diff --git a/dimos/core/rpc_client.py b/dimos/core/rpc_client.py index 46182b7556..f051cbfdb1 100644 --- a/dimos/core/rpc_client.py +++ b/dimos/core/rpc_client.py @@ -16,7 +16,7 @@ from typing import TYPE_CHECKING, Any, Protocol from dimos.core.stream import RemoteStream -from dimos.core.worker import MethodCallProxy +from 
dimos.core.worker_python import MethodCallProxy from dimos.protocol.rpc.pubsubrpc import LCMRPC from dimos.protocol.rpc.spec import DEFAULT_RPC_TIMEOUT, DEFAULT_RPC_TIMEOUTS, RPCSpec from dimos.utils.logging_config import setup_logger diff --git a/dimos/core/test_daemon.py b/dimos/core/test_daemon.py index f6dae51433..821e2378de 100644 --- a/dimos/core/test_daemon.py +++ b/dimos/core/test_daemon.py @@ -158,50 +158,41 @@ def test_port_conflict_no_false_positive(self, tmp_registry: Path): from dimos.core.module_coordinator import ModuleCoordinator -def _mock_worker(pid: int | None = 1234, worker_id: int = 0): - """Create a mock Worker with a controllable pid.""" - w = mock.MagicMock() - w.worker_id = worker_id - w.pid = pid - return w - - -def _mock_coordinator(workers: list | None = None) -> ModuleCoordinator: - """Create a ModuleCoordinator with mocked internals and controllable workers.""" +def _mock_coordinator(manager_health: list[bool] | None = None) -> ModuleCoordinator: + """Create a ModuleCoordinator with mocked managers and controllable health.""" coord = mock.MagicMock(spec=ModuleCoordinator) # Bind the real health_check method so it runs actual logic coord.health_check = ModuleCoordinator.health_check.__get__(coord) - if workers is not None: - coord.workers = workers - coord.n_workers = len(workers) + if manager_health is not None: + managers = [] + for healthy in manager_health: + m = mock.MagicMock() + m.health_check.return_value = healthy + managers.append(m) + coord._managers = managers else: - coord.workers = [] - coord.n_workers = 0 + coord._managers = [] return coord class TestHealthCheck: - """health_check verifies all workers are alive after synchronous build.""" + """health_check delegates to managers and returns all() of their results.""" def test_all_healthy(self): - workers = [_mock_worker(pid=os.getpid(), worker_id=i) for i in range(3)] - coord = _mock_coordinator(workers) + coord = _mock_coordinator([True, True]) assert coord.health_check() 
is True - def test_dead_worker(self): - dead = _mock_worker(pid=None, worker_id=0) - coord = _mock_coordinator([dead]) + def test_one_unhealthy(self): + coord = _mock_coordinator([True, False]) assert coord.health_check() is False - def test_no_workers(self): - coord = _mock_coordinator(workers=[]) - assert coord.health_check() is False + def test_no_managers(self): + coord = _mock_coordinator([]) + # all([]) is True — no managers means nothing to fail + assert coord.health_check() is True - def test_partial_death(self): - w1 = _mock_worker(pid=os.getpid(), worker_id=0) - w2 = _mock_worker(pid=os.getpid(), worker_id=1) - w3 = _mock_worker(pid=None, worker_id=2) - coord = _mock_coordinator([w1, w2, w3]) + def test_all_unhealthy(self): + coord = _mock_coordinator([False, False]) assert coord.health_check() is False diff --git a/dimos/core/test_e2e_daemon.py b/dimos/core/test_e2e_daemon.py index d8ac016faa..b52bf14ea6 100644 --- a/dimos/core/test_e2e_daemon.py +++ b/dimos/core/test_e2e_daemon.py @@ -111,7 +111,6 @@ class TestDaemonE2E: def test_single_worker_lifecycle(self, coordinator, registry_entry): """Build -> health check -> registry -> status (1 worker).""" - assert len(coordinator.workers) == 1 assert coordinator.n_modules == 2 assert coordinator.health_check(), "Health check should pass" @@ -126,15 +125,14 @@ def test_single_worker_lifecycle(self, coordinator, registry_entry): def test_multiple_workers(self, coordinator_2w): """Build with 2 workers — both should be alive.""" - assert len(coordinator_2w.workers) == 2 - for w in coordinator_2w.workers: - assert w.pid is not None, f"Worker {w.worker_id} has no PID" - assert coordinator_2w.health_check(), "Health check should pass" def test_health_check_detects_dead_worker(self, coordinator): """Kill a worker process — health check should fail.""" - worker = coordinator.workers[0] + from dimos.core.worker_manager_python import WorkerManagerPython + + py_mgr = next(m for m in coordinator._managers if isinstance(m, 
WorkerManagerPython)) + worker = py_mgr.workers[0] worker_pid = worker.pid assert worker_pid is not None @@ -237,21 +235,19 @@ def test_status_shows_live_blueprint(self, live_blueprint): assert "ping-pong" in result.output assert str(os.getpid()) in result.output - def test_status_shows_worker_count_via_registry(self, live_blueprint): - coord, entry = live_blueprint - - assert len(coord.workers) >= 1 - for w in coord.workers: - assert w.pid is not None + def test_status_shows_live_entry_via_registry(self, live_blueprint): + _coord, entry = live_blueprint runs = list_runs(alive_only=True) matching = [r for r in runs if r.run_id == entry.run_id] assert len(matching) == 1 def test_stop_kills_real_workers(self, live_blueprint): - coord, _entry = live_blueprint + from dimos.core.worker_manager_python import WorkerManagerPython - worker_pids = [w.pid for w in coord.workers if w.pid] + coord, _entry = live_blueprint + py_mgr = next(m for m in coord._managers if isinstance(m, WorkerManagerPython)) + worker_pids = [w.pid for w in py_mgr.workers if w.pid] assert len(worker_pids) >= 1 coord.stop() diff --git a/dimos/core/test_worker.py b/dimos/core/test_worker.py index 021b2e21c4..ced51dfa76 100644 --- a/dimos/core/test_worker.py +++ b/dimos/core/test_worker.py @@ -17,10 +17,10 @@ import pytest from dimos.core.core import rpc -from dimos.core.global_config import global_config +from dimos.core.global_config import GlobalConfig, global_config from dimos.core.module import Module from dimos.core.stream import In, Out -from dimos.core.worker_manager import WorkerManager +from dimos.core.worker_manager_python import WorkerManagerPython from dimos.msgs.geometry_msgs.Vector3 import Vector3 if TYPE_CHECKING: @@ -87,14 +87,15 @@ def create_worker_manager(): def _create(n_workers): nonlocal manager - manager = WorkerManager(n_workers=n_workers) + g = GlobalConfig(n_workers=n_workers) + manager = WorkerManagerPython(g=g) manager.start() return manager yield _create if manager is not 
None: - manager.close_all() + manager.stop() @pytest.mark.slow diff --git a/dimos/core/tests/test_docker_deployment.py b/dimos/core/tests/test_docker_deployment.py index 982bc656b4..5bb18d4a24 100644 --- a/dimos/core/tests/test_docker_deployment.py +++ b/dimos/core/tests/test_docker_deployment.py @@ -76,41 +76,38 @@ class Bare(Module): class TestModuleCoordinatorDockerRouting: - @patch("dimos.core.docker_module.DockerModuleOuter") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_docker_module(self, mock_worker_manager_cls, mock_docker_module_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_routes_docker_module(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py + + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_module_cls.return_value = mock_dm + mock_docker.deploy.return_value = mock_dm coordinator = ModuleCoordinator() coordinator.start() try: result = coordinator.deploy(FakeDockerModule) - # Should NOT go through worker manager - mock_worker_mgr.deploy.assert_not_called() - # Should construct a DockerModuleOuter (container launch happens inside __init__) - mock_docker_module_cls.assert_called_once_with(FakeDockerModule, g=global_config) - # start() is NOT called during deploy — it's called in start_all_modules - mock_dm.start.assert_not_called() + # Docker manager should handle it + mock_docker.deploy.assert_called_once_with(FakeDockerModule, global_config, {}) + # Python manager should NOT be used + mock_py.deploy.assert_not_called() assert result is mock_dm assert coordinator.get_instance(FakeDockerModule) is mock_dm finally: coordinator.stop() - @patch("dimos.core.docker_module.DockerModuleOuter") - 
@patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_docker_propagates_constructor_failure( - self, mock_worker_manager_cls, mock_docker_module_cls - ): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - - # Container launch fails inside __init__; DockerModuleOuter handles its own cleanup - mock_docker_module_cls.side_effect = RuntimeError("launch failed") + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_docker_propagates_failure(self, mock_py_cls, mock_docker_cls): + mock_py_cls.return_value = MagicMock() + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker + mock_docker.deploy.side_effect = RuntimeError("launch failed") coordinator = ModuleCoordinator() coordinator.start() @@ -120,36 +117,43 @@ def test_deploy_docker_propagates_constructor_failure( finally: coordinator.stop() - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_routes_regular_module_to_worker_manager(self, mock_worker_manager_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_routes_regular_module_to_python_manager(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py mock_proxy = MagicMock() - mock_worker_mgr.deploy.return_value = mock_proxy + mock_py.deploy.return_value = mock_proxy + + # Docker manager rejects regular modules + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker + mock_docker.should_manage.return_value = False coordinator = ModuleCoordinator() coordinator.start() try: result = coordinator.deploy(FakeRegularModule) - mock_worker_mgr.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) + 
mock_py.deploy.assert_called_once_with(FakeRegularModule, global_config, {}) assert result is mock_proxy finally: coordinator.stop() - @patch("dimos.core.docker_worker_manager.DockerWorkerManager.deploy_parallel") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_deploy_parallel_separates_docker_and_regular( - self, mock_worker_manager_cls, mock_docker_deploy - ): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_deploy_parallel_separates_docker_and_regular(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py regular_proxy = MagicMock() - mock_worker_mgr.deploy_parallel.return_value = [regular_proxy] + mock_py.deploy_parallel.return_value = [regular_proxy] + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_deploy.return_value = [mock_dm] + mock_docker.deploy_parallel.return_value = [mock_dm] + # Docker manager only claims FakeDockerModule + mock_docker.should_manage.side_effect = lambda cls: cls is FakeDockerModule coordinator = ModuleCoordinator() coordinator.start() @@ -160,27 +164,24 @@ def test_deploy_parallel_separates_docker_and_regular( ] results = coordinator.deploy_parallel(specs) - # Regular module goes through worker manager - mock_worker_mgr.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) - # Docker specs go through DockerWorkerManager - mock_docker_deploy.assert_called_once_with([(FakeDockerModule, (), {})]) - # start() is NOT called during deploy — it's called in start_all_modules + mock_py.deploy_parallel.assert_called_once_with([(FakeRegularModule, (), {})]) + mock_docker.deploy_parallel.assert_called_once_with([(FakeDockerModule, (), {})]) mock_dm.start.assert_not_called() - # Results preserve input order assert results[0] is 
regular_proxy assert results[1] is mock_dm finally: coordinator.stop() - @patch("dimos.core.docker_module.DockerModuleOuter") - @patch("dimos.core.module_coordinator.WorkerManager") - def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docker_module_cls): - mock_worker_mgr = MagicMock() - mock_worker_manager_cls.return_value = mock_worker_mgr - + @patch("dimos.core.module_coordinator.WorkerManagerDocker") + @patch("dimos.core.module_coordinator.WorkerManagerPython") + def test_stop_cleans_up_all_managers(self, mock_py_cls, mock_docker_cls): + mock_py = MagicMock() + mock_py_cls.return_value = mock_py + mock_docker = MagicMock() + mock_docker_cls.return_value = mock_docker mock_dm = MagicMock() - mock_docker_module_cls.return_value = mock_dm + mock_docker.deploy.return_value = mock_dm coordinator = ModuleCoordinator() coordinator.start() @@ -189,10 +190,11 @@ def test_stop_cleans_up_docker_modules(self, mock_worker_manager_cls, mock_docke finally: coordinator.stop() - # stop() called exactly once (no double cleanup) + # Module stop() called assert mock_dm.stop.call_count == 1 - # Worker manager also closed - mock_worker_mgr.close_all.assert_called_once() + # Both managers stopped + mock_py.stop.assert_called_once() + mock_docker.stop.assert_called_once() class TestDockerModuleOuterGetattr: diff --git a/dimos/core/tests/test_parallel_deploy_cleanup.py b/dimos/core/tests/test_parallel_deploy_cleanup.py index 795401d80e..bf6e7d1ed4 100644 --- a/dimos/core/tests/test_parallel_deploy_cleanup.py +++ b/dimos/core/tests/test_parallel_deploy_cleanup.py @@ -27,13 +27,14 @@ from dimos.utils.typing_utils import ExceptionGroup -class TestDockerWorkerManagerPartialFailure: - """DockerWorkerManager.deploy_parallel must stop successful containers when one fails.""" +class TestWorkerManagerDockerPartialFailure: + """WorkerManagerDocker.deploy_parallel must stop successful containers when one fails.""" @patch("dimos.core.docker_module.DockerModuleOuter") def 
test_middle_module_fails_stops_siblings(self, mock_docker_module_cls): """Deploy 3 modules where the middle one fails. The other two must be stopped.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") mod_c = MagicMock(name="ModuleC") @@ -54,7 +55,7 @@ def fake_constructor(cls, *args, **kwargs): FakeC = type("C", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: - DockerWorkerManager.deploy_parallel( + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -72,7 +73,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_multiple_failures_raises_exception_group(self, mock_docker_module_cls): """Deploy 3 modules where two fail. Should raise ExceptionGroup with both errors.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") @@ -94,7 +96,7 @@ def fake_constructor(cls, *args, **kwargs): FakeC = type("C", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed") as exc_info: - DockerWorkerManager.deploy_parallel( + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -113,7 +115,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_all_succeed_no_stops(self, mock_docker_module_cls): """When all deployments succeed, no modules should be stopped.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mocks = [MagicMock(name=f"Mod{i}") for i in 
range(3)] @@ -126,7 +129,7 @@ def fake_constructor(cls, *args, **kwargs): FakeB = type("B", (), {}) FakeC = type("C", (), {}) - results = DockerWorkerManager.deploy_parallel( + results = WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( [ (FakeA, (), {}), (FakeB, (), {}), @@ -141,7 +144,8 @@ def fake_constructor(cls, *args, **kwargs): @patch("dimos.core.docker_module.DockerModuleOuter") def test_stop_failure_does_not_mask_deploy_error(self, mock_docker_module_cls): """If stop() itself raises during cleanup, the original deploy error still propagates.""" - from dimos.core.docker_worker_manager import DockerWorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_docker import WorkerManagerDocker mod_a = MagicMock(name="ModuleA") mod_a.stop.side_effect = OSError("stop failed") @@ -160,19 +164,22 @@ def fake_constructor(cls, *args, **kwargs): FakeB = type("B", (), {}) with pytest.raises(ExceptionGroup, match="docker deploy_parallel failed"): - DockerWorkerManager.deploy_parallel([(FakeA, (), {}), (FakeB, (), {})]) + WorkerManagerDocker(g=GlobalConfig()).deploy_parallel( + [(FakeA, (), {}), (FakeB, (), {})] + ) # stop was attempted despite it raising mod_a.stop.assert_called_once() class TestWorkerManagerPartialFailure: - """WorkerManager.deploy_parallel must clean up successful RPCClients when one fails.""" + """WorkerManagerPython.deploy_parallel must clean up successful RPCClients when one fails.""" def test_middle_module_fails_cleans_up_siblings(self): - from dimos.core.worker_manager import WorkerManager + from dimos.core.global_config import GlobalConfig + from dimos.core.worker_manager_python import WorkerManagerPython - manager = WorkerManager(n_workers=2) + manager = WorkerManagerPython(g=GlobalConfig(n_workers=2)) mock_workers = [MagicMock(name=f"Worker{i}") for i in range(2)] for w in mock_workers: @@ -198,7 +205,7 @@ def fake_deploy_module(module_class, args=(), kwargs=None): rpc_clients_created: 
list[MagicMock] = [] - with patch("dimos.core.worker_manager.RPCClient") as mock_rpc_cls: + with patch("dimos.core.worker_manager_python.RPCClient") as mock_rpc_cls: def make_rpc(actor, cls): client = MagicMock(name=f"rpc_{cls.__name__}") diff --git a/dimos/core/worker_manager_docker.py b/dimos/core/worker_manager_docker.py index 78bc9928c4..b35a7f000d 100644 --- a/dimos/core/worker_manager_docker.py +++ b/dimos/core/worker_manager_docker.py @@ -16,26 +16,51 @@ from contextlib import suppress from typing import TYPE_CHECKING, Any -from dimos.core.module import ModuleSpec +from dimos.core.global_config import GlobalConfig +from dimos.core.module import ModuleBase, ModuleSpec +from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: from dimos.core.docker_module import DockerModuleOuter + from dimos.core.rpc_client import ModuleProxyProtocol +logger = setup_logger() -class DockerWorkerManager: - """Parallel deployment of Docker-backed modules.""" - @staticmethod - def deploy_parallel( - specs: list[ModuleSpec], - ) -> list[DockerModuleOuter]: - """Deploy multiple DockerModules in parallel. +class WorkerManagerDocker: + """Manages deployment of Docker-backed modules.""" - If any deployment fails, all successfully-started containers are - stopped before an ExceptionGroup is raised. 
- """ + def __init__(self, g: GlobalConfig) -> None: + self._cfg = g + self._deployed: list[DockerModuleOuter] = [] + + def should_manage(self, module_class: type) -> bool: + # inlined to prevent circular dependency + from dimos.core.docker_module import is_docker_module + + return is_docker_module(module_class) + + def start(self) -> None: + """No-op — Docker manager has no persistent workers.""" + + def deploy( + self, + module_class: type[ModuleBase], + global_config: GlobalConfig, + kwargs: dict[str, Any], + ) -> ModuleProxyProtocol: + # inlined to prevent circular dependency + from dimos.core.docker_module import DockerModuleOuter + + mod = DockerModuleOuter(module_class, g=global_config, **kwargs) # type: ignore[arg-type] + mod.build() + self._deployed.append(mod) + return mod + + def deploy_parallel(self, specs: list[ModuleSpec]) -> list[ModuleProxyProtocol]: + # inlined to prevent circular dependency from dimos.core.docker_module import DockerModuleOuter def _on_errors( @@ -51,4 +76,19 @@ def _deploy_one(spec: ModuleSpec) -> DockerModuleOuter: mod.build() return mod - return safe_thread_map(specs, _deploy_one, _on_errors) + results = safe_thread_map(specs, _deploy_one, _on_errors) + self._deployed.extend(results) + return results # type: ignore[return-value] + + def stop(self) -> None: + for mod in reversed(self._deployed): + with suppress(Exception): + mod.stop() + self._deployed.clear() + + def health_check(self) -> bool: + # TODO: in the future decide on what a meaninful health check would be + return True + + def suppress_console(self) -> None: + """No-op — Docker containers manage their own stdio.""" diff --git a/dimos/core/worker_manager.py b/dimos/core/worker_manager_python.py similarity index 62% rename from dimos/core/worker_manager.py rename to dimos/core/worker_manager_python.py index f12bffac66..12a0d11f68 100644 --- a/dimos/core/worker_manager.py +++ b/dimos/core/worker_manager_python.py @@ -16,35 +16,61 @@ from collections.abc import Iterable 
from contextlib import suppress -from typing import Any +from typing import TYPE_CHECKING, Any from dimos.core.global_config import GlobalConfig from dimos.core.module import ModuleBase, ModuleSpec from dimos.core.rpc_client import RPCClient -from dimos.core.worker import Worker +from dimos.core.worker_python import Worker from dimos.utils.logging_config import setup_logger from dimos.utils.thread_utils import safe_thread_map from dimos.utils.typing_utils import ExceptionGroup +if TYPE_CHECKING: + from dimos.core.resource_monitor.monitor import StatsMonitor + logger = setup_logger() -class WorkerManager: - def __init__(self, n_workers: int = 2) -> None: - self._n_workers = n_workers +_MIN_WORKERS = 2 + + +class WorkerManagerPython: + def __init__(self, g: GlobalConfig) -> None: + self._cfg = g + self._max_workers = g.n_workers + self._worker_to_module_ratio = g.worker_to_module_ratio self._workers: list[Worker] = [] + self._n_modules = 0 self._closed = False self._started = False + self._stats_monitor: StatsMonitor | None = None + + def _desired_workers(self, n_modules: int) -> int: + """Target worker count: ratio * modules, clamped to [_MIN_WORKERS, max_workers].""" + from_ratio = int(n_modules * self._worker_to_module_ratio + 0.5) + return max(_MIN_WORKERS, min(from_ratio, self._max_workers)) + + def _ensure_workers(self, n_modules: int) -> None: + """Grow the worker pool to match the desired count for *n_modules*.""" + target = self._desired_workers(n_modules) + while len(self._workers) < target: + worker = Worker() + worker.start_process() + self._workers.append(worker) def start(self) -> None: if self._started: return self._started = True - for _ in range(self._n_workers): - worker = Worker() - worker.start_process() - self._workers.append(worker) - logger.info("Worker pool started.", n_workers=self._n_workers) + self._ensure_workers(self._n_modules) + logger.info("Worker pool started.", n_workers=len(self._workers)) + + if self._cfg.dtop: + from 
dimos.core.resource_monitor.monitor import StatsMonitor + + self._stats_monitor = StatsMonitor(self) + self._stats_monitor.start() def _select_worker(self) -> Worker: return min(self._workers, key=lambda w: w.module_count) @@ -53,28 +79,31 @@ def deploy( self, module_class: type[ModuleBase], global_config: GlobalConfig, kwargs: dict[str, Any] ) -> RPCClient: if self._closed: - raise RuntimeError("WorkerManager is closed") + raise RuntimeError("WorkerManagerPython is closed") - # Auto-start for backward compatibility if not self._started: self.start() + self._n_modules += 1 + self._ensure_workers(self._n_modules) worker = self._select_worker() actor = worker.deploy_module(module_class, global_config, kwargs=kwargs) return RPCClient(actor, module_class) def deploy_parallel(self, module_specs: Iterable[ModuleSpec]) -> list[RPCClient]: if self._closed: - raise RuntimeError("WorkerManager is closed") + raise RuntimeError("WorkerManagerPython is closed") module_specs = list(module_specs) if len(module_specs) == 0: return [] - # Auto-start for backward compatibility if not self._started: self.start() + self._n_modules += len(module_specs) + self._ensure_workers(self._n_modules) + # Pre-assign workers sequentially (so least-loaded accounting is # correct), then deploy concurrently via threads. The per-worker lock # serializes deploys that land on the same worker process. 
@@ -99,6 +128,21 @@ def _on_errors( _on_errors, ) + def should_manage(self, module_class: type) -> bool: + """Catch-all — accepts any module not claimed by another manager.""" + return True + + def health_check(self) -> bool: + """Verify all worker processes are alive.""" + if len(self._workers) == 0: + logger.error("health_check: no workers found") + return False + for w in self._workers: + if w.pid is None: + logger.error("health_check: worker died", worker_id=w.worker_id) + return False + return True + def suppress_console(self) -> None: """Tell all workers to redirect stdout/stderr to /dev/null.""" for worker in self._workers: @@ -108,11 +152,15 @@ def suppress_console(self) -> None: def workers(self) -> list[Worker]: return list(self._workers) - def close_all(self) -> None: + def stop(self) -> None: if self._closed: return self._closed = True + if self._stats_monitor is not None: + self._stats_monitor.stop() + self._stats_monitor = None + logger.info("Shutting down all workers...") for worker in reversed(self._workers): diff --git a/dimos/core/worker.py b/dimos/core/worker_python.py similarity index 100% rename from dimos/core/worker.py rename to dimos/core/worker_python.py diff --git a/dimos/hardware/sensors/camera/realsense/camera.py b/dimos/hardware/sensors/camera/realsense/camera.py index 821982981d..ca87ec3c1b 100644 --- a/dimos/hardware/sensors/camera/realsense/camera.py +++ b/dimos/hardware/sensors/camera/realsense/camera.py @@ -445,7 +445,7 @@ def get_depth_scale(self) -> float: def main() -> None: - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() camera = dimos.deploy(RealSenseCamera, enable_pointcloud=True, pointcloud_fps=5.0) # type: ignore[type-var] diff --git a/dimos/hardware/sensors/camera/zed/camera.py b/dimos/hardware/sensors/camera/zed/camera.py index dd429c29cf..d39a37f82f 100644 --- a/dimos/hardware/sensors/camera/zed/camera.py +++ b/dimos/hardware/sensors/camera/zed/camera.py @@ -491,7 +491,7 @@ def 
get_depth_scale(self) -> float: def main() -> None: - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() camera = dimos.deploy(ZEDCamera, enable_pointcloud=True, pointcloud_fps=5.0) # type: ignore[type-var] diff --git a/dimos/robot/cli/dimos.py b/dimos/robot/cli/dimos.py index 1137a612f3..8a2be16668 100644 --- a/dimos/robot/cli/dimos.py +++ b/dimos/robot/cli/dimos.py @@ -177,9 +177,8 @@ def run( coordinator.stop() raise typer.Exit(1) - n_workers = coordinator.n_workers n_modules = coordinator.n_modules - typer.echo(f"✓ All modules started ({n_modules} modules, {n_workers} workers)") + typer.echo(f"✓ All modules started ({n_modules} modules)") typer.echo("✓ Health check passed") typer.echo("✓ DimOS running in background\n") typer.echo(f" Run ID: {run_id}") diff --git a/dimos/robot/unitree/b1/unitree_b1.py b/dimos/robot/unitree/b1/unitree_b1.py index 9a6d04a7ff..ab36850643 100644 --- a/dimos/robot/unitree/b1/unitree_b1.py +++ b/dimos/robot/unitree/b1/unitree_b1.py @@ -80,7 +80,7 @@ def __init__( self.capabilities = [RobotCapability.LOCOMOTION] self.connection = None self.joystick = None - self._dimos = ModuleCoordinator(n=2) + self._dimos = ModuleCoordinator() os.makedirs(self.output_dir, exist_ok=True) logger.info(f"Robot outputs will be saved to: {self.output_dir}") diff --git a/dimos/utils/demo_image_encoding.py b/dimos/utils/demo_image_encoding.py index 84b91acf79..148b5e842d 100644 --- a/dimos/utils/demo_image_encoding.py +++ b/dimos/utils/demo_image_encoding.py @@ -97,7 +97,7 @@ def main() -> None: ) args = parser.parse_args() - dimos = ModuleCoordinator(n=2) + dimos = ModuleCoordinator() dimos.start() emitter = dimos.deploy(EmitterModule) receiver = dimos.deploy(ReceiverModule) From 434c321962f8271a09a23c9f2d2905e838dcb648 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Wed, 25 Mar 2026 21:22:57 -0700 Subject: [PATCH 86/89] refactor blueprint build --- dimos/core/blueprints.py | 271 ++++++++++++++++--------------- 
dimos/core/module_coordinator.py | 35 ++++ 2 files changed, 176 insertions(+), 130 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index 314724386d..b1c855d4d7 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -55,6 +55,36 @@ class ModuleRef: spec: type[Spec] | type[ModuleBase] +@dataclass(frozen=True) +class StreamWiring: + """Compiled instruction: set a transport on a module's stream.""" + + module_class: type[ModuleBase] + stream_name: str + transport: PubSubTransport[Any] + + +@dataclass(frozen=True) +class ModuleRefWiring: + """Compiled instruction: link base_module.ref_name → target_module.""" + + base_module: type[ModuleBase] + ref_name: str + target_module: type[ModuleBase] + + +@dataclass(frozen=True) +class RpcWiringPlan: + """Compiled RPC wiring: registry of methods + per-module binding requests.""" + + # rpc_key -> (module_class, method_name) — the full callable registry + registry: dict[str, tuple[type[ModuleBase], str]] + # (module_class, set_method_name, linked_rpc_key) — for set_X pattern + set_methods: tuple[tuple[type[ModuleBase], str, str], ...] + # (module_class, requested_name, rpc_key) — for rpc_calls pattern + rpc_call_bindings: tuple[tuple[type[ModuleBase], str, str], ...] 
+ + @dataclass(frozen=True) class _BlueprintAtom: kwargs: dict[str, Any] @@ -166,7 +196,7 @@ def _active_blueprints(self) -> tuple[_BlueprintAtom, ...]: def _check_ambiguity( self, requested_method_name: str, - interface_methods: Mapping[str, list[tuple[type[ModuleBase], Callable[..., Any]]]], + interface_methods: Mapping[str, list[tuple[type[ModuleBase], str]]], requesting_module: type[ModuleBase], ) -> None: if ( @@ -189,17 +219,13 @@ def _get_transport_for(self, name: str, stream_type: type) -> PubSubTransport[An use_pickled = getattr(stream_type, "lcm_encode", None) is None topic = f"/{name}" if self._is_name_unique(name) else f"/{short_id()}" - transport = pLCMTransport(topic) if use_pickled else LCMTransport(topic, stream_type) - - return transport + return pLCMTransport(topic) if use_pickled else LCMTransport(topic, stream_type) @cached_property def _all_name_types(self) -> set[tuple[str, type]]: - # Apply remappings to get the actual names that will be used result = set() for blueprint in self._active_blueprints: for conn in blueprint.streams: - # Check if this stream should be remapped remapped_name = self.remapping_map.get((blueprint.module, conn.name), conn.name) if isinstance(remapped_name, str): result.add((remapped_name, conn.type)) @@ -274,65 +300,69 @@ def _verify_no_name_conflicts(self) -> None: raise ValueError("\n".join(error_lines)) - def _deploy_all_modules( - self, module_coordinator: ModuleCoordinator, global_config: GlobalConfig - ) -> None: - module_specs: list[ModuleSpec] = [] + def _compile_module_specs(self, g: GlobalConfig) -> list[ModuleSpec]: + """Compile the list of module deployment specs (pure — no side effects).""" + specs: list[ModuleSpec] = [] for blueprint in self._active_blueprints: - module_specs.append((blueprint.module, global_config, blueprint.kwargs)) - - module_coordinator.deploy_parallel(module_specs) + specs.append((blueprint.module, g, blueprint.kwargs)) + return specs - def _connect_streams(self, module_coordinator: 
ModuleCoordinator) -> None: - # dict when given (final/remapped) stream name+type, provides a list of modules + original (non-remapped) stream names - streams = defaultdict(list) + def _compile_stream_wiring(self) -> list[StreamWiring]: + """Compile stream transport assignments (pure — no side effects).""" + # Group streams by (remapped_name, type) -> [(module_class, original_name)] + streams: dict[ + tuple[str | type[ModuleBase] | type[Spec], type], list[tuple[type[ModuleBase], str]] + ] = defaultdict(list) for blueprint in self._active_blueprints: for conn in blueprint.streams: - # Check if this stream should be remapped remapped_name = self.remapping_map.get((blueprint.module, conn.name), conn.name) if isinstance(remapped_name, str): - # Group by remapped name and type streams[remapped_name, conn.type].append((blueprint.module, conn.name)) - # Connect all In/Out streams by remapped name and type. - for remapped_name, stream_type in streams.keys(): + wiring: list[StreamWiring] = [] + for (remapped_name, stream_type), module_streams in streams.items(): + assert isinstance(remapped_name, str) transport = self._get_transport_for(remapped_name, stream_type) - for module, original_name in streams[(remapped_name, stream_type)]: - instance = module_coordinator.get_instance(module) # type: ignore[assignment] - instance.set_transport(original_name, transport) # type: ignore[union-attr] + for module_class, original_name in module_streams: + wiring.append( + StreamWiring( + module_class=module_class, + stream_name=original_name, + transport=transport, + ) + ) logger.info( "Transport", name=remapped_name, original_name=original_name, topic=str(getattr(transport, "topic", None)), type=f"{stream_type.__module__}.{stream_type.__qualname__}", - module=module.__name__, + module=module_class.__name__, transport=transport.__class__.__name__, ) + return wiring + + def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: + """Resolve module references and return wiring plan 
(pure — no side effects).""" + mod_and_mod_ref_to_target: dict[tuple[type[ModuleBase], str], type[ModuleBase]] = {} - def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: - # partly fill out the mod_and_mod_ref_to_proxy - mod_and_mod_ref_to_proxy = { - (module, name): replacement - for (module, name), replacement in self.remapping_map.items() - if is_spec(replacement) or is_module_type(replacement) - } + # Seed with explicit remappings that point to modules/specs + for (module, name), replacement in self.remapping_map.items(): + if is_module_type(replacement): + mod_and_mod_ref_to_target[module, name] = replacement # type: ignore[assignment] - # after this loop we should have an exact module for every module_ref on every blueprint for blueprint in self._active_blueprints: for each_module_ref in blueprint.module_refs: - # we've got to find a another module that implements this spec - spec = mod_and_mod_ref_to_proxy.get( - (blueprint.module, each_module_ref.name), each_module_ref.spec - ) + key = (blueprint.module, each_module_ref.name) + if key in mod_and_mod_ref_to_target: + continue - # if the spec is actually module, use that (basically a user override) + spec = self.remapping_map.get(key, each_module_ref.spec) if is_module_type(spec): - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = spec + mod_and_mod_ref_to_target[key] = spec # type: ignore[assignment] continue - # find all available candidates possible_module_candidates = [ each_other_blueprint.module for each_other_blueprint in self._active_blueprints @@ -341,33 +371,26 @@ def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: and spec_structural_compliance(each_other_blueprint.module, spec) ) ] - # we keep valid separate from invalid to provide a better error message for "almost" valid cases valid_module_candidates = [ each_candidate for each_candidate in possible_module_candidates if spec_annotation_compliance(each_candidate, spec) ] - # 
none + if len(possible_module_candidates) == 0: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I couldn't find a module that met that spec.\n""" ) - # exactly one structurally valid candidate elif len(possible_module_candidates) == 1: if len(valid_module_candidates) == 0: logger.warning( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. I found a module ({possible_module_candidates[0].__name__}) that met that spec structurally, but it had a mismatch in type annotations.\nPlease either change the {each_module_ref.spec.__name__} spec or the {possible_module_candidates[0].__name__} module.\n""" ) - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = ( - possible_module_candidates[0] - ) - continue - # more than one + mod_and_mod_ref_to_target[key] = possible_module_candidates[0] elif len(valid_module_candidates) > 1: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I found multiple modules that met that spec: {possible_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" ) - # structural candidates, but no valid candidates elif len(valid_module_candidates) == 0: possible_module_candidates_str = ", ".join( [each_candidate.__name__ for each_candidate in possible_module_candidates] @@ -375,129 +398,118 @@ def _connect_module_refs(self, module_coordinator: ModuleCoordinator) -> None: raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. 
Some modules ({possible_module_candidates_str}) met the spec structurally but had a mismatch in type annotations\n""" ) - # one valid candidate (and more than one structurally valid candidate) else: - mod_and_mod_ref_to_proxy[blueprint.module, each_module_ref.name] = ( - valid_module_candidates[0] - ) - - # now that we know the streams, we mutate the RPCClient objects - for (base_module, module_ref_name), target_module in mod_and_mod_ref_to_proxy.items(): - base_module_proxy = module_coordinator.get_instance(base_module) - target_module_proxy = module_coordinator.get_instance(target_module) # type: ignore[type-var,arg-type] - setattr( - base_module_proxy, - module_ref_name, - target_module_proxy, - ) - # Ensure the remote module instance can use the module ref inside its own RPC handlers. - base_module_proxy.set_module_ref(module_ref_name, target_module_proxy) - - def _connect_rpc_methods(self, module_coordinator: ModuleCoordinator) -> None: - # Gather all RPC methods. - rpc_methods = {} - rpc_methods_dot = {} - - # Track interface methods to detect ambiguity. 
- interface_methods: defaultdict[str, list[tuple[type[ModuleBase], Callable[..., Any]]]] = ( - defaultdict(list) - ) # interface_name_method -> [(module_class, method)] - interface_methods_dot: defaultdict[ - str, list[tuple[type[ModuleBase], Callable[..., Any]]] - ] = defaultdict(list) # interface_name.method -> [(module_class, method)] + mod_and_mod_ref_to_target[key] = valid_module_candidates[0] + + return [ + ModuleRefWiring(base_module=base_module, ref_name=ref_name, target_module=target) + for (base_module, ref_name), target in mod_and_mod_ref_to_target.items() + ] + + def _compile_rpc_wiring(self) -> RpcWiringPlan: + """Compile the RPC method registry and binding requests (pure — no side effects).""" + # registry: rpc_key -> (module_class, method_name) + registry: dict[str, tuple[type[ModuleBase], str]] = {} + + # Track interface methods to detect ambiguity + interface_methods: defaultdict[str, list[tuple[type[ModuleBase], str]]] = defaultdict(list) + interface_methods_dot: defaultdict[str, list[tuple[type[ModuleBase], str]]] = defaultdict( + list + ) for blueprint in self._active_blueprints: for method_name in blueprint.module.rpcs.keys(): # type: ignore[attr-defined] - module_proxy = module_coordinator.get_instance(blueprint.module) # type: ignore[assignment] - method_for_rpc_client = getattr(module_proxy, method_name) - # Register under concrete class name (backward compatibility) - rpc_methods[f"{blueprint.module.__name__}_{method_name}"] = method_for_rpc_client - rpc_methods_dot[f"{blueprint.module.__name__}.{method_name}"] = ( - method_for_rpc_client + registry[f"{blueprint.module.__name__}_{method_name}"] = ( + blueprint.module, + method_name, + ) + registry[f"{blueprint.module.__name__}.{method_name}"] = ( + blueprint.module, + method_name, ) - # Also register under any interface names for base in blueprint.module.mro(): - # Check if this base is an abstract interface with the method if ( base is not Module and issubclass(base, ABC) and hasattr(base, 
method_name) and getattr(base, method_name, None) is not None ): - interface_key = f"{base.__name__}.{method_name}" - interface_methods_dot[interface_key].append( - (blueprint.module, method_for_rpc_client) + interface_methods_dot[f"{base.__name__}.{method_name}"].append( + (blueprint.module, method_name) ) - interface_key_underscore = f"{base.__name__}_{method_name}" - interface_methods[interface_key_underscore].append( - (blueprint.module, method_for_rpc_client) + interface_methods[f"{base.__name__}_{method_name}"].append( + (blueprint.module, method_name) ) - # Check for ambiguity in interface methods and add non-ambiguous ones - for interface_key, implementations in interface_methods_dot.items(): + # Add non-ambiguous interface methods to registry + for key, implementations in interface_methods_dot.items(): if len(implementations) == 1: - rpc_methods_dot[interface_key] = implementations[0][1] - for interface_key, implementations in interface_methods.items(): + registry[key] = implementations[0] + for key, implementations in interface_methods.items(): if len(implementations) == 1: - rpc_methods[interface_key] = implementations[0][1] + registry[key] = implementations[0] - # Fulfil method requests (so modules can call each other). 
+ # Compile set_ method bindings + set_methods: list[tuple[type[ModuleBase], str, str]] = [] for blueprint in self._active_blueprints: - instance = module_coordinator.get_instance(blueprint.module) # type: ignore[assignment] - for method_name in blueprint.module.rpcs.keys(): # type: ignore[attr-defined] if not method_name.startswith("set_"): continue - linked_name = method_name.removeprefix("set_") - self._check_ambiguity(linked_name, interface_methods, blueprint.module) + if linked_name in registry: + set_methods.append((blueprint.module, method_name, linked_name)) - if linked_name not in rpc_methods: - continue - - getattr(instance, method_name)(rpc_methods[linked_name]) - - for requested_method_name in instance.get_rpc_method_names(): # type: ignore[union-attr] - self._check_ambiguity( - requested_method_name, interface_methods_dot, blueprint.module - ) - - if requested_method_name not in rpc_methods_dot: - continue - - instance.set_rpc_method( # type: ignore[union-attr] - requested_method_name, rpc_methods_dot[requested_method_name] - ) + # Compile rpc_call bindings (uses rpc_calls list from module) + rpc_call_bindings: list[tuple[type[ModuleBase], str, str]] = [] + for blueprint in self._active_blueprints: + rpc_call_names: list[str] = getattr(blueprint.module, "rpc_calls", []) + for requested_name in rpc_call_names: + self._check_ambiguity(requested_name, interface_methods_dot, blueprint.module) + if requested_name in registry: + rpc_call_bindings.append((blueprint.module, requested_name, requested_name)) + + return RpcWiringPlan( + registry=registry, + set_methods=tuple(set_methods), + rpc_call_bindings=tuple(rpc_call_bindings), + ) def build( self, cli_config_overrides: Mapping[str, Any] | None = None, ) -> ModuleCoordinator: logger.info("Building the blueprint") + + # Phase 1: Configuration global_config.update(**dict(self.global_config_overrides)) if cli_config_overrides: global_config.update(**dict(cli_config_overrides)) + # Phase 2: Validation 
self._run_configurators() self._check_requirements() self._verify_no_name_conflicts() - logger.info("Starting the modules") - module_coordinator = ModuleCoordinator(g=global_config) - module_coordinator.start() + # Phase 3: Compile wiring plans (pure — no side effects) + module_specs = self._compile_module_specs(global_config) + stream_wiring = self._compile_stream_wiring() + module_ref_wiring = self._compile_module_ref_wiring() + rpc_wiring = self._compile_rpc_wiring() - # all module constructors are called here (each of them setup their own) - self._deploy_all_modules(module_coordinator, global_config) - self._connect_streams(module_coordinator) - self._connect_rpc_methods(module_coordinator) - self._connect_module_refs(module_coordinator) - - module_coordinator.build_all_modules() - module_coordinator.start_all_modules() + # Phase 4: Execute (all mutations go through coordinator) + logger.info("Starting the modules") + coordinator = ModuleCoordinator(g=global_config) + coordinator.start() + coordinator.deploy_parallel(module_specs) + coordinator.wire_streams(stream_wiring) + coordinator.wire_rpc_methods(rpc_wiring) + coordinator.wire_module_refs(module_ref_wiring) + coordinator.build_all_modules() + coordinator.start_all_modules() - return module_coordinator + return coordinator def autoconnect(*blueprints: Blueprint) -> Blueprint: @@ -528,7 +540,6 @@ def autoconnect(*blueprints: Blueprint) -> Blueprint: def _eliminate_duplicates(blueprints: list[_BlueprintAtom]) -> list[_BlueprintAtom]: - # The duplicates are eliminated in reverse so that newer blueprints override older ones. 
seen = set() unique_blueprints = [] for bp in reversed(blueprints): diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index d1020e61bd..bf828eecda 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -28,6 +28,7 @@ from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: + from dimos.core.blueprints import ModuleRefWiring, RpcWiringPlan, StreamWiring from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = setup_logger() @@ -149,6 +150,40 @@ def _on_errors( _register() return results + def wire_streams(self, wiring: list[StreamWiring]) -> None: + """Apply stream transports to deployed modules.""" + for w in wiring: + instance = self.get_instance(w.module_class) + instance.set_transport(w.stream_name, w.transport) # type: ignore[union-attr] + + def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: + """Wire RPC methods between modules using the compiled plan.""" + # Build callable registry from deployed instances + callables: dict[str, Any] = {} + for rpc_key, (module_class, method_name) in plan.registry.items(): + proxy = self.get_instance(module_class) + callables[rpc_key] = getattr(proxy, method_name) + + # Apply set_ methods + for module_class, set_method, linked_key in plan.set_methods: + if linked_key in callables: + instance = self.get_instance(module_class) + getattr(instance, set_method)(callables[linked_key]) + + # Apply rpc_call bindings + for module_class, requested_name, rpc_key in plan.rpc_call_bindings: + if rpc_key in callables: + instance = self.get_instance(module_class) + instance.set_rpc_method(requested_name, callables[rpc_key]) # type: ignore[union-attr] + + def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: + """Set module references between deployed modules.""" + for w in wiring: + base_proxy = self.get_instance(w.base_module) + target_proxy = self.get_instance(w.target_module) + setattr(base_proxy, w.ref_name, target_proxy) + 
base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] + def build_all_modules(self) -> None: """Call build() on all deployed modules in parallel. From 97c3ab6255499448cb571227be7ee9f042e0dcab Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Thu, 26 Mar 2026 00:03:31 -0700 Subject: [PATCH 87/89] split ModuleCoordinator from Blueprint --- dimos/core/blueprints.py | 33 ++++++++++++++++++-------------- dimos/core/module_coordinator.py | 24 +++++++++++++++++------ 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index b1c855d4d7..a91714bd9f 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -85,6 +85,16 @@ class RpcWiringPlan: rpc_call_bindings: tuple[tuple[type[ModuleBase], str, str], ...] +@dataclass(frozen=True) +class DeploySpec: + """Complete deployment specification compiled by Blueprint.build().""" + + module_specs: list[ModuleSpec] + stream_wiring: list[StreamWiring] + rpc_wiring: RpcWiringPlan + module_ref_wiring: list[ModuleRefWiring] + + @dataclass(frozen=True) class _BlueprintAtom: kwargs: dict[str, Any] @@ -389,7 +399,7 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: mod_and_mod_ref_to_target[key] = possible_module_candidates[0] elif len(valid_module_candidates) > 1: raise Exception( - f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I found multiple modules that met that spec: {possible_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" + f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. 
But I found multiple modules that met that spec: {valid_module_candidates}.\nTo fix this use .remappings, for example:\n autoconnect(...).remappings([ ({blueprint.module.__name__}, {each_module_ref.name!r}, ) ])\n""" ) elif len(valid_module_candidates) == 0: possible_module_candidates_str = ", ".join( @@ -492,23 +502,18 @@ def build( self._check_requirements() self._verify_no_name_conflicts() - # Phase 3: Compile wiring plans (pure — no side effects) - module_specs = self._compile_module_specs(global_config) - stream_wiring = self._compile_stream_wiring() - module_ref_wiring = self._compile_module_ref_wiring() - rpc_wiring = self._compile_rpc_wiring() + # Phase 3: Compile deploy spec (pure — no side effects) + deploy_spec = DeploySpec( + module_specs=self._compile_module_specs(global_config), + stream_wiring=self._compile_stream_wiring(), + module_ref_wiring=self._compile_module_ref_wiring(), + rpc_wiring=self._compile_rpc_wiring(), + ) # Phase 4: Execute (all mutations go through coordinator) logger.info("Starting the modules") - coordinator = ModuleCoordinator(g=global_config) + coordinator = ModuleCoordinator(g=global_config, deploy_spec=deploy_spec) coordinator.start() - coordinator.deploy_parallel(module_specs) - coordinator.wire_streams(stream_wiring) - coordinator.wire_rpc_methods(rpc_wiring) - coordinator.wire_module_refs(module_ref_wiring) - coordinator.build_all_modules() - coordinator.start_all_modules() - return coordinator diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index bf828eecda..6af0bf78dd 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -28,7 +28,7 @@ from dimos.utils.typing_utils import ExceptionGroup if TYPE_CHECKING: - from dimos.core.blueprints import ModuleRefWiring, RpcWiringPlan, StreamWiring + from dimos.core.blueprints import DeploySpec, ModuleRefWiring, RpcWiringPlan, StreamWiring from dimos.core.rpc_client import ModuleProxy, ModuleProxyProtocol logger = 
setup_logger() @@ -47,13 +47,16 @@ class ModuleCoordinator(Resource): # type: ignore[misc] _managers: list[WorkerManagerDocker | WorkerManagerPython] _global_config: GlobalConfig + _deploy_spec: DeploySpec | None _deployed_modules: dict[type[ModuleBase], ModuleProxyProtocol] def __init__( self, g: GlobalConfig = global_config, + deploy_spec: DeploySpec | None = None, ) -> None: self._global_config = g + self._deploy_spec = deploy_spec self._managers = [] self._deployed_modules = {} @@ -65,6 +68,15 @@ def start(self) -> None: for m in self._managers: m.start() + if self._deploy_spec is not None: + spec = self._deploy_spec + self.deploy_parallel(spec.module_specs) + self._wire_streams(spec.stream_wiring) + self._wire_rpc_methods(spec.rpc_wiring) + self._wire_module_refs(spec.module_ref_wiring) + self._build_all_modules() + self.start_all_modules() + def _find_manager( self, module_class: type[ModuleBase[Any]] ) -> WorkerManagerDocker | WorkerManagerPython: @@ -143,20 +155,20 @@ def _register() -> None: def _on_errors( _outcomes: list[Any], _successes: list[Any], errors: list[Exception] ) -> None: - _register() + # Don't register partially-deployed modules — managers handle their own cleanup. 
raise ExceptionGroup("deploy_parallel failed", errors) safe_thread_map(list(groups.keys()), _deploy_group, _on_errors) _register() return results - def wire_streams(self, wiring: list[StreamWiring]) -> None: + def _wire_streams(self, wiring: list[StreamWiring]) -> None: """Apply stream transports to deployed modules.""" for w in wiring: instance = self.get_instance(w.module_class) instance.set_transport(w.stream_name, w.transport) # type: ignore[union-attr] - def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: + def _wire_rpc_methods(self, plan: RpcWiringPlan) -> None: """Wire RPC methods between modules using the compiled plan.""" # Build callable registry from deployed instances callables: dict[str, Any] = {} @@ -176,7 +188,7 @@ def wire_rpc_methods(self, plan: RpcWiringPlan) -> None: instance = self.get_instance(module_class) instance.set_rpc_method(requested_name, callables[rpc_key]) # type: ignore[union-attr] - def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: + def _wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: """Set module references between deployed modules.""" for w in wiring: base_proxy = self.get_instance(w.base_module) @@ -184,7 +196,7 @@ def wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: setattr(base_proxy, w.ref_name, target_proxy) base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] - def build_all_modules(self) -> None: + def _build_all_modules(self) -> None: """Call build() on all deployed modules in parallel. build() handles heavy one-time work (docker builds, LFS downloads, etc.) 
From 7ded86939fbfbdb5b96b9229d20683cf063f52a5 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 31 Mar 2026 17:01:41 -0700 Subject: [PATCH 88/89] docs: remove outdated singleton claim from ModuleCoordinator --- dimos/core/module_coordinator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 8ec88920bf..d331f3c385 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -36,7 +36,6 @@ class ModuleCoordinator(Resource): # type: ignore[misc] """ - There should only ever be one module coordinator instance (this is a singleton) - Module (classes) should be able to be deployed, stopped, and re-deployed in on one instance of ModuleCoordinator - Arguably ModuleCoordinator could be called the "DimosRuntime" - ModuleCoordinator is responsible for all global "addresses". From e3b508686162bfd96245892d2d97d1773df5a1f3 Mon Sep 17 00:00:00 2001 From: Jeff Hykin Date: Tue, 31 Mar 2026 17:05:34 -0700 Subject: [PATCH 89/89] feat: restore _DisabledModuleProxy for disabled module ref wiring - Added _DisabledModuleProxy class back to blueprints.py - Added optional field to ModuleRef - _compile_module_ref_wiring now detects disabled providers and creates no-op proxies - DeploySpec carries disabled_ref_proxies - ModuleCoordinator._wire_disabled_ref_proxies wires them after module refs --- dimos/core/blueprints.py | 60 ++++++++++++++++++++++++++++++-- dimos/core/module_coordinator.py | 8 +++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/dimos/core/blueprints.py b/dimos/core/blueprints.py index a91714bd9f..6f6f0cd793 100644 --- a/dimos/core/blueprints.py +++ b/dimos/core/blueprints.py @@ -42,6 +42,30 @@ logger = setup_logger() +class _DisabledModuleProxy: + def __init__(self, spec_name: str) -> None: + object.__setattr__(self, "_spec_name", spec_name) + + def __getattr__(self, name: str) -> Any: + spec = object.__getattribute__(self, "_spec_name") + + def 
_noop(*_args: Any, **_kwargs: Any) -> None: + logger.warning( + "Called on disabled module (no-op)", + method=name, + spec=spec, + ) + return None + + return _noop + + def __reduce__(self) -> tuple[type, tuple[str]]: + return (_DisabledModuleProxy, (self._spec_name,)) + + def __repr__(self) -> str: + return f"<_DisabledModuleProxy spec={self._spec_name}>" + + + @dataclass(frozen=True) class StreamRef: name: str @@ -53,6 +77,7 @@ class ModuleRef: name: str spec: type[Spec] | type[ModuleBase] + optional: bool = False @dataclass(frozen=True) @@ -93,6 +118,7 @@ class DeploySpec: stream_wiring: list[StreamWiring] rpc_wiring: RpcWiringPlan module_ref_wiring: list[ModuleRefWiring] + disabled_ref_proxies: dict[tuple[type[ModuleBase], str], _DisabledModuleProxy] = field(default_factory=dict) @dataclass(frozen=True) @@ -356,6 +382,8 @@ def _compile_stream_wiring(self) -> list[StreamWiring]: def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: """Resolve module references and return wiring plan (pure — no side effects).""" mod_and_mod_ref_to_target: dict[tuple[type[ModuleBase], str], type[ModuleBase]] = {} + disabled_ref_proxies: dict[tuple[type[ModuleBase], str], _DisabledModuleProxy] = {} + disabled_set = set(self.disabled_modules_tuple) # Seed with explicit remappings that point to modules/specs for (module, name), replacement in self.remapping_map.items(): @@ -388,6 +416,31 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: ] if len(possible_module_candidates) == 0: + if each_module_ref.optional: + continue + # Check whether a *disabled* module would have satisfied this ref. 
+ disabled_candidate = next( + ( + bp.module + for bp in self.blueprints + if bp.module in disabled_set + and spec_structural_compliance(bp.module, spec) + ), + None, + ) + if disabled_candidate is not None: + logger.warning( + "Module ref unsatisfied because provider is disabled; " + "installing no-op proxy", + ref=each_module_ref.name, + consumer=blueprint.module.__name__, + disabled_provider=disabled_candidate.__name__, + spec=each_module_ref.spec.__name__, + ) + disabled_ref_proxies[blueprint.module, each_module_ref.name] = ( + _DisabledModuleProxy(each_module_ref.spec.__name__) + ) + continue raise Exception( f"""The {blueprint.module.__name__} has a module reference ({each_module_ref}) which requested a module that fills out the {each_module_ref.spec.__name__} spec. But I couldn't find a module that met that spec.\n""" ) @@ -411,10 +464,11 @@ def _compile_module_ref_wiring(self) -> list[ModuleRefWiring]: else: mod_and_mod_ref_to_target[key] = valid_module_candidates[0] - return [ + wiring = [ ModuleRefWiring(base_module=base_module, ref_name=ref_name, target_module=target) for (base_module, ref_name), target in mod_and_mod_ref_to_target.items() ] + return wiring, disabled_ref_proxies def _compile_rpc_wiring(self) -> RpcWiringPlan: """Compile the RPC method registry and binding requests (pure — no side effects).""" @@ -503,11 +557,13 @@ def build( self._verify_no_name_conflicts() # Phase 3: Compile deploy spec (pure — no side effects) + module_ref_wiring, disabled_ref_proxies = self._compile_module_ref_wiring() deploy_spec = DeploySpec( module_specs=self._compile_module_specs(global_config), stream_wiring=self._compile_stream_wiring(), - module_ref_wiring=self._compile_module_ref_wiring(), + module_ref_wiring=module_ref_wiring, rpc_wiring=self._compile_rpc_wiring(), + disabled_ref_proxies=disabled_ref_proxies, ) # Phase 4: Execute (all mutations go through coordinator) diff --git a/dimos/core/module_coordinator.py b/dimos/core/module_coordinator.py index 
d331f3c385..63154541db 100644 --- a/dimos/core/module_coordinator.py +++ b/dimos/core/module_coordinator.py @@ -75,6 +75,7 @@ def start(self) -> None: self._wire_streams(spec.stream_wiring) self._wire_rpc_methods(spec.rpc_wiring) self._wire_module_refs(spec.module_ref_wiring) + self._wire_disabled_ref_proxies(spec.disabled_ref_proxies) self._build_all_modules() self.start_all_modules() @@ -189,6 +190,13 @@ def _wire_module_refs(self, wiring: list[ModuleRefWiring]) -> None: setattr(base_proxy, w.ref_name, target_proxy) base_proxy.set_module_ref(w.ref_name, target_proxy) # type: ignore[union-attr] + def _wire_disabled_ref_proxies(self, proxies: dict[tuple[type[ModuleBase], str], Any]) -> None: + """Wire up no-op proxies for refs whose providers were disabled.""" + for (base_module, module_ref_name), proxy in proxies.items(): + base_module_proxy = self.get_instance(base_module) + setattr(base_module_proxy, module_ref_name, proxy) + base_module_proxy.set_module_ref(module_ref_name, proxy) # type: ignore[union-attr] + def _build_all_modules(self) -> None: """Call build() on all deployed modules in parallel.