From 14d1cffbd36c54c18685bfed278dfb84dd2e266c Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:22:37 -0500 Subject: [PATCH 01/32] initial commit for gen_on_manager -> gen_on_worker --- docs/FAQ.rst | 5 --- docs/data_structures/libE_specs.rst | 12 ++---- docs/platforms/aurora.rst | 22 +---------- docs/platforms/perlmutter.rst | 20 ---------- docs/platforms/platforms_index.rst | 22 ++--------- docs/running_libE.rst | 30 -------------- docs/tutorials/executor_forces_tutorial.rst | 38 ------------------ docs/tutorials/gpcam_tutorial.rst | 39 ++++++++++++------- libensemble/manager.py | 16 ++++---- libensemble/specs.py | 6 +-- .../test_GPU_gen_resources.py | 8 ++-- .../test_evaluate_existing_plus_gen.py | 3 +- .../test_persistent_uniform_sampling.py | 4 +- .../test_evaluate_mixed_sample.py | 1 - .../test_persistent_gp_multitask_ax.py | 5 +-- 15 files changed, 50 insertions(+), 181 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index b8a3ea2ce5..0339dbb681 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -13,11 +13,6 @@ We recommend using the following options to help debug workflows:: logger.set_level("DEBUG") libE_specs["safe_mode"] = True -To make it easier to debug a generator try setting the **libE_specs** option ``gen_on_manager``. -To do so, add the following to your calling script:: - - libE_specs["gen_on_manager"] = True - With this, ``pdb`` breakpoints can be set as usual in the generator. For more debugging options see "How can I debug specific libEnsemble processes?" below. diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index c0ca141403..6ecac6e582 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -9,12 +9,7 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl from libensemble.specs import LibeSpecs - specs = LibeSpecs( - gen_on_manager=True, - save_every_k_gens=100, - sim_dirs_make=True, - nworkers=4 - ) + specs = LibeSpecs(save_every_k_gens=100, sim_dirs_make=True, nworkers=4) .. dropdown:: Settings by Category :open: @@ -31,9 +26,8 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl **nworkers** [int]: Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. - **gen_on_manager** [bool] = False - Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. + **gen_on_worker** [bool] = False + Instructs Worker process to run generator instead of Manager. **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. diff --git a/docs/platforms/aurora.rst b/docs/platforms/aurora.rst index af2e7cc160..4865ba0c18 100644 --- a/docs/platforms/aurora.rst +++ b/docs/platforms/aurora.rst @@ -57,7 +57,7 @@ simulations for each worker: .. code-block:: python # Instruct libEnsemble to exit after this many simulations - ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers*2) + ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers * 2) Now grab an interactive session on two nodes (or use the batch script at ``../submission_scripts/submit_pbs_aurora.sh``):: @@ -115,26 +115,6 @@ will use one GPU tile):: python run_libe_forces.py -n 25 -Running generator on the manager --------------------------------- - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows: - -.. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -then we can run with 12 (instead of 13) workers:: - - python run_libe_forces.py -n 12 - Dynamic resource assignment --------------------------- diff --git a/docs/platforms/perlmutter.rst b/docs/platforms/perlmutter.rst index 88d3f808b2..a2768e1d26 100644 --- a/docs/platforms/perlmutter.rst +++ b/docs/platforms/perlmutter.rst @@ -105,26 +105,6 @@ To see GPU usage, ssh into the node you are on in another window and run:: watch -n 0.1 nvidia-smi -Running generator on the manager -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows. - - .. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -and run with:: - - python run_libe_forces.py -n 4 - To watch video ^^^^^^^^^^^^^^ diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index c06cdbe6fd..eca8ab9d58 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,10 +24,6 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -The generator will run on a worker by default, but if running a single generator, -the :ref:`libE_specs` option **gen_on_manager** is recommended, -which runs the generator on the manager (using a thread) as below. - .. list-table:: :widths: 60 40 @@ -35,15 +31,6 @@ which runs the generator on the manager (using a thread) as below. :alt: centralized :scale: 55 - - In calling script: - - .. code-block:: python - :linenos: - - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - ) - A SLURM batch script may include: .. code-block:: bash @@ -52,7 +39,9 @@ which runs the generator on the manager (using a thread) as below. python run_libe_forces.py --nworkers 3 -When using **gen_on_manager**, set ``nworkers`` to the number of workers desired for running simulations. +If running multiple generator processes instead, then set the +:ref:`libE_specs` option **gen_on_worker** so that multiple +worker processes can run multiple generator instances. Dedicated Mode ^^^^^^^^^^^^^^ @@ -87,8 +76,6 @@ remaining nodes in the allocation. python run_libe_forces.py --nworkers 3 -Note that **gen_on_manager** is not set in the above example. - Distributed Running ------------------- @@ -137,8 +124,7 @@ Zero-resource workers --------------------- Users with persistent ``gen_f`` functions may notice that the persistent workers -are still automatically assigned system resources. This can be resolved by using -the ``gen_on_manager`` option or by +are still automatically assigned system resources. This can be resolved by :ref:`fixing the number of resource sets`. Assigning GPUs diff --git a/docs/running_libE.rst b/docs/running_libE.rst index ae658e31c6..80af301f3f 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -12,13 +12,6 @@ determine the parameters/inputs for simulations. Simulator functions run and manage simulations, which often involve running a user application (see :doc:`Executor`). -.. note:: - As of version 1.3.0, the generator can be run as a thread on the manager, - using the :ref:`libE_specs` option **gen_on_manager**. - When using this option, set the number of workers desired for running - simulations. See :ref:`Running generator on the manager` - for more details. - To use libEnsemble, you will need a calling script, which in turn will specify generator and simulator functions. Many :doc:`examples` are available. @@ -162,29 +155,6 @@ If this example was run as:: No simulations will be able to run. -.. _gen-on-manager: - -Running generator on the manager --------------------------------- - -The majority of libEnsemble use cases run a single generator. The -:ref:`libE_specs` option **gen_on_manager** will cause -the generator function to run on a thread on the manager. This can run -persistent user functions, sharing data structures with the manager, and avoids -additional communication to a generator running on a worker. When using this -option, the number of workers specified should be the (maximum) number of -concurrent simulations. - -If modifying a workflow to use ``gen_on_manager`` consider the following. - -* Set ``nworkers`` to the number of workers desired for running simulations. -* If using :meth:`add_unique_random_streams()` - to seed random streams, the default generator seed will be zero. -* If you have a line like ``libE_specs["nresource_sets"] = nworkers -1``, this - line should be removed. -* If the generator does use resources, ``nresource_sets`` can be increased as needed - so that the generator and all simulations are resourced. - Environment Variables --------------------- diff --git a/docs/tutorials/executor_forces_tutorial.rst b/docs/tutorials/executor_forces_tutorial.rst index e01496734b..a083aa2a82 100644 --- a/docs/tutorials/executor_forces_tutorial.rst +++ b/docs/tutorials/executor_forces_tutorial.rst @@ -336,44 +336,6 @@ These may require additional browsing of the documentation to complete. ... -Running the generator on the manager ------------------------------------- - -As of version 1.3.0, the generator can be run on a thread on the manager, -using the :ref:`libE_specs` option **gen_on_manager**. - -Change the libE_specs as follows. - - .. code-block:: python - :linenos: - :lineno-start: 28 - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - sim_dirs_make=True, - ensemble_dir_path="./test_executor_forces_tutorial", - ) - -When running set ``nworkers`` to the number of workers desired for running simulations. -E.g., Instead of: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 5 - -use: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 4 - -Note that as the generator random number seed will be zero instead of one, the checksum will change. - -For more information see :ref:`Running generator on the manager`. - Running forces application with input file ------------------------------------------ diff --git a/docs/tutorials/gpcam_tutorial.rst b/docs/tutorials/gpcam_tutorial.rst index a013c1b67e..096d5584ca 100644 --- a/docs/tutorials/gpcam_tutorial.rst +++ b/docs/tutorials/gpcam_tutorial.rst @@ -30,6 +30,7 @@ This version (and others) of the gpCAM generator can be found at `libensemble/ge from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG from libensemble.tools.persistent_support import PersistentSupport + def persistent_gpCAM(H_in, persis_info, gen_specs, libE_info): """Run a batched gpCAM model to create a surrogate""" @@ -156,6 +157,7 @@ For running applications using parallel resources in the simulator see the `forc # Define our simulation function import numpy as np + def six_hump_camel(H, persis_info, sim_specs, _): """Six-Hump Camel sim_f.""" @@ -189,6 +191,8 @@ First we will create a cleanup script so we can easily re-run. # To rerun this notebook, we need to delete the ensemble directory. import shutil + + def cleanup(): try: shutil.rmtree("ensemble") @@ -218,31 +222,30 @@ If you wish to make your own functions based on the above, those can be imported nworkers = 4 - # When using gen_on_manager, nworkers is number of concurrent sims. # final_gen_send means the last evaluated points are returned to the generator to update the model. - libE_specs = LibeSpecs(nworkers=nworkers, gen_on_manager=True, final_gen_send=True) + libE_specs = LibeSpecs(nworkers=nworkers, final_gen_send=True) n = 2 # Input dimensions batch_size = 4 num_batches = 6 gen_specs = GenSpecs( - gen_f=persistent_gpCAM, # Generator function - persis_in=["f"], # Objective, defined in sim, is returned to gen + gen_f=persistent_gpCAM, # Generator function + persis_in=["f"], # Objective, defined in sim, is returned to gen outputs=[("x", float, (n,))], # Parameters (name, type, size) user={ "batch_size": batch_size, "lb": np.array([-2, -1]), # lower boundaries for n dimensions - "ub": np.array([2, 1]), # upper boundaries for n dimensions - "ask_max_iter": 5, # Number of iterations for ask (default 20) + "ub": np.array([2, 1]), # upper boundaries for n dimensions + "ask_max_iter": 5, # Number of iterations for ask (default 20) "rng_seed": 0, }, ) sim_specs = SimSpecs( - sim_f=six_hump_camel, # Simulator function - inputs=["x"], # Input field names. "x" defined in gen - outputs=[("f", float)], # Objective + sim_f=six_hump_camel, # Simulator function + inputs=["x"], # Input field names. "x" defined in gen + outputs=[("f", float)], # Objective ) # Starts one persistent generator. Simulated values are returned in batch. @@ -251,7 +254,7 @@ If you wish to make your own functions based on the above, those can be imported user={"async_return": False}, # False = batch returns ) - exit_criteria = ExitCriteria(sim_max=num_batches*batch_size) + exit_criteria = ExitCriteria(sim_max=num_batches * batch_size) # Initialize and run the ensemble. ensemble = Ensemble( @@ -272,7 +275,7 @@ At the end of our calling script we run the ensemble. H, persis_info, flag = ensemble.run() # Start the ensemble. Blocks until completion. ensemble.save_output("H_array", append_attrs=False) # Save H (history of all evaluated points) to file - pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results + pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results Rerun and test model at known points ------------------------------------ @@ -312,15 +315,21 @@ values at the test points. markersize = 10 plt.figure(figsize=(10, 5)) plt.plot( - num_sims, mse, marker="^", markeredgecolor="black", markeredgewidth=2, - markersize=markersize, linewidth=2, label="Mean squared error" + num_sims, + mse, + marker="^", + markeredgecolor="black", + markeredgewidth=2, + markersize=markersize, + linewidth=2, + label="Mean squared error", ) plt.xticks(num_sims) # Labeling the axes and the legend - plt.title('Mean Squared Error at test points') + plt.title("Mean Squared Error at test points") plt.xlabel("Number of simulations") - plt.ylabel('Mean squared error (rad$^2$)') + plt.ylabel("Mean squared error (rad$^2$)") legend = plt.legend(framealpha=1, edgecolor="black") # Increase edge width here plt.grid(True) plt.show() diff --git a/libensemble/manager.py b/libensemble/manager.py index c0cf02500e..149ab819b6 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -231,19 +231,19 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - gen_on_manager = self.libE_specs.get("gen_on_manager", False) + gen_on_worker = self.libE_specs.get("gen_on_worker", False) - self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) - if gen_on_manager: + self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + if gen_on_worker: + self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] + else: self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] self.W[0]["gen_worker"] = True local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - else: - self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] - self.W = _WorkerIndexer(self.W, gen_on_manager) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_manager) + self.W = _WorkerIndexer(self.W, gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -639,7 +639,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "gen_on_manager": self.libE_specs.get("gen_on_manager", False), + "gen_on_worker": self.libE_specs.get("gen_on_worker", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: diff --git a/libensemble/specs.py b/libensemble/specs.py index aa70018362..e2cc525636 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -182,10 +182,8 @@ class LibeSpecs(BaseModel): nworkers: Optional[int] = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - gen_on_manager: Optional[bool] = False - """ Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. - """ + gen_on_worker: Optional[bool] = False + """ Instructs Worker process to run generator instead of Manager.""" mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. Default: ``MPI.COMM_WORLD``""" diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index d77088d7e4..48b7fee0d2 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -100,18 +100,18 @@ libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() - for gen_on_manager in [False, True]: + for gen_on_worker in [False, True]: for run in range(5): # reset libE_specs = base_libE_specs.copy() - libE_specs["gen_on_manager"] = gen_on_manager + libE_specs["gen_on_worker"] = gen_on_worker persis_info = add_unique_random_streams({}, nworkers + 1) if run == 0: libE_specs["gen_num_procs"] = 2 elif run == 1: - if gen_on_manager: - print("SECOND LIBE CALL WITH GEN ON MANAGER") + if gen_on_worker: + print("SECOND LIBE CALL WITH GEN ON WORKER INSTEAD OF MANAGER") libE_specs["gen_num_gpus"] = 1 elif run == 2: persis_info["gen_num_gpus"] = 1 diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py index fe3d8dad8e..7a16d70736 100644 --- a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py +++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py @@ -1,6 +1,6 @@ """ Test libEnsemble's capability to evaluate existing points and then generate -new samples via gen_on_manager. +new samples. Execute via one of the following commands (e.g. 3 workers): mpiexec -np 4 python test_evaluate_existing_sample.py @@ -43,7 +43,6 @@ def create_H0(persis_info, gen_specs, H0_size): if __name__ == "__main__": sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) gen_specs = { diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py index 81a18a5285..643b4723d7 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py @@ -87,9 +87,9 @@ sim_specs["in"] = ["x", "obj_component"] # sim_specs["out"] = [("f", float), ("grad", float, n)] elif run == 3: - libE_specs["gen_on_manager"] = True + libE_specs["gen_on_worker"] = True elif run == 4: - libE_specs["gen_on_manager"] = False + libE_specs["gen_on_worker"] = False libE_specs["gen_workers"] = [2] # Perform the run diff --git a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py index 481db84191..60e43fa57e 100644 --- a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py +++ b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py @@ -44,7 +44,6 @@ H0["sim_ended"][:500] = True sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.H0 = H0 sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) sampling.alloc_specs = AllocSpecs(alloc_f=alloc_f) diff --git a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py index 8c589161ad..f88db4fe05 100644 --- a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py +++ b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py @@ -2,8 +2,6 @@ Example of multi-fidelity optimization using a persistent GP gen_func (calling Ax). -This test uses the gen_on_manager option (persistent generator runs on -a thread). Therefore nworkers is the number of simulation workers. Execute via one of the following commands: mpiexec -np 4 python test_persistent_gp_multitask_ax.py @@ -50,7 +48,7 @@ def run_simulation(H, persis_info, sim_specs, libE_info): z = 8 elif task == "cheap_model": z = 1 - print('in sim', task) + print("in sim", task) libE_output = np.zeros(1, dtype=sim_specs["out"]) calc_status = WORKER_DONE @@ -64,7 +62,6 @@ def run_simulation(H, persis_info, sim_specs, libE_info): # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_manager"] = True mt_params = { "name_hifi": "expensive_model", From ae40691b0bdc89b0a39747837917a062cf20d1f9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:24:35 -0500 Subject: [PATCH 02/32] fix --- docs/platforms/platforms_index.rst | 47 +++++++++++++----------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index eca8ab9d58..843e65e5e6 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,20 +24,17 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -.. list-table:: - :widths: 60 40 +.. image:: ../images/centralized_gen_on_manager.png + :alt: centralized + :scale: 55 - * - .. image:: ../images/centralized_gen_on_manager.png - :alt: centralized - :scale: 55 +A SLURM batch script may include: - A SLURM batch script may include: +.. code-block:: bash - .. code-block:: bash + #SBATCH --nodes 3 - #SBATCH --nodes 3 - - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 If running multiple generator processes instead, then set the :ref:`libE_specs` option **gen_on_worker** so that multiple @@ -51,30 +48,28 @@ True, the MPI executor will not launch applications on nodes where libEnsemble P processes (manager and workers) are running. Workers launch applications onto the remaining nodes in the allocation. -.. list-table:: - :widths: 60 40 - * - .. image:: ../images/centralized_dedicated.png - :alt: centralized dedicated mode - :scale: 30 +.. image:: ../images/centralized_dedicated.png + :alt: centralized dedicated mode + :scale: 30 - - In calling script: +In calling script: - .. code-block:: python - :linenos: +.. code-block:: python + :linenos: - ensemble.libE_specs = LibeSpecs( - num_resource_sets=2, - dedicated_mode=True, - ) + ensemble.libE_specs = LibeSpecs( + num_resource_sets=2, + dedicated_mode=True, + ) - A SLURM batch script may include: +A SLURM batch script may include: - .. code-block:: bash +.. code-block:: bash - #SBATCH --nodes 3 + #SBATCH --nodes 3 - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 Distributed Running ------------------- From d6cf1d3052408edaa0d7e6ae236566d88fc6f194 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:04:31 -0500 Subject: [PATCH 03/32] additional note --- docs/overview_usecases.rst | 3 +++ docs/platforms/platforms_index.rst | 1 + 2 files changed, 4 insertions(+) diff --git a/docs/overview_usecases.rst b/docs/overview_usecases.rst index 56ad05b6c9..6d63ce8ff8 100644 --- a/docs/overview_usecases.rst +++ b/docs/overview_usecases.rst @@ -20,6 +20,9 @@ which perform computations via **user functions**: | +As of **v2.0** the **Manager** by default runs **a single generator**. This +is configurable. + The default allocator (``alloc_f``) instructs workers to run the simulator on the highest priority work from the generator. If a worker is idle and there is no work, that worker is instructed to call the generator. diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 843e65e5e6..6daa319b94 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -59,6 +59,7 @@ In calling script: :linenos: ensemble.libE_specs = LibeSpecs( + gen_on_worker=True, num_resource_sets=2, dedicated_mode=True, ) From c54adb3d5063a6913fd9d85d3f5fe3e456f6ea19 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:29:51 -0500 Subject: [PATCH 04/32] fixes, plus for test_manager_main we don't want to run the default worker0 during unit tests --- libensemble/manager.py | 7 ++++--- libensemble/tests/unit_tests/test_manager_main.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 149ab819b6..49ddebc8b3 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,8 +232,9 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) + len_W = len(self.wcomms) + 1 - gen_on_worker - self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] else: @@ -242,8 +243,8 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, gen_on_worker) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources diff --git a/libensemble/tests/unit_tests/test_manager_main.py b/libensemble/tests/unit_tests/test_manager_main.py index 4e246eb570..e34bc76301 100644 --- a/libensemble/tests/unit_tests/test_manager_main.py +++ b/libensemble/tests/unit_tests/test_manager_main.py @@ -6,7 +6,7 @@ import libensemble.manager as man import libensemble.tests.unit_tests.setup as setup -libE_specs = {"comms": "local"} +libE_specs = {"comms": "local", "gen_on_worker": True} def test_term_test_1(): From dc2570d81c63a7d319f05e0ccd02ca09dfb9821d Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:11:59 -0500 Subject: [PATCH 05/32] small fixes, comments, additional unit test for affirming alloc behavior with additional worker zero --- libensemble/manager.py | 4 +- .../test_allocation_funcs_and_support.py | 55 +++++++++++++++++++ libensemble/worker.py | 2 +- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index de36dc1319..fe6a4bd710 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,7 +232,7 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) - len_W = len(self.wcomms) + 1 - gen_on_worker + len_W = len(self.wcomms) + 1 - gen_on_worker # if gen_on_worker, len_W = len(self.wcomms) self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: @@ -243,7 +243,7 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) # if gen on worker, then no additional worker self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 6d056b1e01..9d1d45fece 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -34,6 +34,25 @@ ], ) +W_gen_mgr = np.array( + [ + (0, True, 0, 0, False, False), + (1, False, 0, 0, False, False), + (2, False, 0, 0, False, False), + (3, False, 0, 0, False, False), + (4, False, 0, 0, False, False), + ], + dtype=[ + ("worker_id", " Date: Wed, 13 Aug 2025 15:27:02 -0500 Subject: [PATCH 06/32] check worker zero for run_order for aposmm-ibcdfo-pounders --- .../regression_tests/test_persistent_aposmm_ibcdfo_pounders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py index 7523704a0b..356b2017a9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py @@ -135,7 +135,7 @@ def synthetic_beamline_mapping(H, _, sim_specs): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 save_libE_output(H, persis_info, __file__, nworkers) From a6fd67ce71a10ebcf6fd31cab65c12339652c368 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:43:01 -0500 Subject: [PATCH 07/32] test_zero_resource_workers probably needs adjustment? --- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index c8f0786d06..4739c5bfb6 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -36,7 +36,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True From 3ce9ed4aeb40b7ef5f389b2776ec4720637119cd Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:20:15 -0500 Subject: [PATCH 08/32] fixes to accomodate the zeroth worker being a zero-resource worker --- libensemble/sim_funcs/run_line_check.py | 2 +- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- libensemble/tools/parse_args.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 530b1e8d9b..96ef13a5da 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (1 + persis_gens) + offset = workerID - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index 4739c5bfb6..d286aa7128 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -101,7 +101,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index 5e302c0ce0..c52b6f7c8f 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + nworkers = args.nworkers + 1 # for manager if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 9c38da2e9ae9f3308456a23faef75a3d32068da9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:21:52 -0500 Subject: [PATCH 09/32] fix the specified zero-resource worker --- .../functionality_tests/test_zero_resource_workers_subnode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 69ea2b559c..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -39,7 +39,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True @@ -100,7 +100,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options From dc477724c8670af1b9491a1d2d23167584b5b0ce Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 12:01:38 -0500 Subject: [PATCH 10/32] tiny fixes --- ...daptive_workers_persistent_oversubscribe_rsets.py | 8 ++++---- .../functionality_tests/test_sim_dirs_per_worker.py | 12 ++++++------ .../test_zero_resource_workers_subnode.py | 2 +- libensemble/tools/parse_args.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py index fb730b966b..94bf1a387f 100644 --- a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py +++ b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py @@ -31,9 +31,9 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - nsim_workers = nworkers - 1 + nsim_workers = nworkers - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] rsets = nsim_workers * 2 libE_specs["num_resource_sets"] = rsets @@ -64,7 +64,7 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n), ("x_on_cube", float, n)], "user": { - "initial_batch_size": nworkers - 1, + "initial_batch_size": nworkers, "max_resource_sets": max_rsets, "lb": np.array([-3, -2]), "ub": np.array([3, 2]), @@ -91,7 +91,7 @@ "node_file": node_file, } # Name of file containing a node-list - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": 40, "wallclock_max": 300} # Perform the run diff --git a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py index 69bb34ab84..a15ede9e92 100644 --- a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py +++ b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py @@ -23,14 +23,14 @@ from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.tools import add_unique_random_streams, parse_args -nworkers, is_manager, libE_specs, _ = parse_args() +n_simworkers, is_manager, libE_specs, _ = parse_args() # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": sim_input_dir = "./sim_input_dir" dir_to_copy = sim_input_dir + "/copy_this" dir_to_symlink = sim_input_dir + "/symlink_this" - w_ensemble = "./ensemble_workdirs_w" + str(nworkers) + "_" + libE_specs.get("comms") + w_ensemble = "./ensemble_workdirs_w" + str(n_simworkers) + "_" + libE_specs.get("comms") print("creating ensemble dir: ", w_ensemble, flush=True) for dir in [sim_input_dir, dir_to_copy, dir_to_symlink]: @@ -60,7 +60,7 @@ }, } - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, n_simworkers) exit_criteria = {"sim_max": 21} @@ -69,9 +69,9 @@ if is_manager: assert os.path.isdir(w_ensemble), f"Ensemble directory {w_ensemble} not created." worker_dir_sum = sum(["worker" in i for i in os.listdir(w_ensemble)]) - assert worker_dir_sum == nworkers, "Number of worker dirs ({}) does not match nworkers ({}).".format( - worker_dir_sum, nworkers - ) + assert ( + worker_dir_sum == n_simworkers + 1 + ), "Number of worker dirs ({}) does not match n_simworkers ({}).".format(worker_dir_sum, n_simworkers) input_copied = [] sim_dir_sum = 0 diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index a7a57b584a..446142f5db 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 +# TESTSUITE_NPROCS: 3 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index c52b6f7c8f..5e302c0ce0 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + 1 # for manager + nworkers = args.nworkers if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 46708391e8faac92faf1c086c1e7dbfe4b6ae8ad Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:08:11 -0500 Subject: [PATCH 11/32] fix run_order key in persis_info for handful of aposmm tests --- .../tests/regression_tests/test_persistent_aposmm_dfols.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_periodic.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_timeout.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 6e19930691..322f3d6633 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -102,7 +102,7 @@ def combine_component(x): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 assert np.min(H["f"][H["sim_ended"]]) <= 3000, "Didn't find a value below 3000" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py index d99e8802a0..8cc215800d 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py @@ -89,7 +89,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) # The minima are known on this test problem. If the above [lb, ub] domain is diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py index e61843fd71..e6014cbee3 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py @@ -87,6 +87,6 @@ if is_manager: assert flag == 2, "Test should have timed out" - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index f2d2f09cc0..75b3a582d9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -121,7 +121,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0=H0) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." From befcecc2e9821df661f7f08c8248543bf1cb34fc Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:13:25 -0500 Subject: [PATCH 12/32] ditto --- .../test_persistent_uniform_sampling_nonblocking.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- .../tests/regression_tests/test_persistent_fd_param_finder.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py index 5425578849..b62cbe6b64 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py @@ -68,4 +68,4 @@ assert len(np.unique(H["gen_ended_time"])) == 2 save_libE_output(H, persis_info, __file__, nworkers) - assert persis_info[1]["spin_count"] > 0, "This should have been a nonblocking receive" + assert persis_info[0]["spin_count"] > 0, "This should have been a nonblocking receive" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index 75b3a582d9..48b1ae2ff6 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -123,7 +123,7 @@ if is_manager: assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( - len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] + len(persis_info[0]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." assert len(H) < exit_criteria["sim_max"], "Test should have stopped early due to 'stop_after_k_minima'" diff --git a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py index ac01d5683b..de97470dc2 100644 --- a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py +++ b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py @@ -70,6 +70,6 @@ if fd_test.is_manager: assert len(H) < fd_test.exit_criteria.gen_max, "Problem didn't stop early, which should have been the case." - assert np.all(persis_info[1]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." + assert np.all(persis_info[0]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." fd_test.save_output(__file__) From 30403c70cfdbe6b366900235feebd367f0f784d3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:36:44 -0500 Subject: [PATCH 13/32] fix workercount? --- .../functionality_tests/test_zero_resource_workers_subnode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 446142f5db..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 +# TESTSUITE_NPROCS: 4 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": From 519930d37887c6069e1acfcb898e3b4ee1cf7b0f Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:55:26 -0500 Subject: [PATCH 14/32] trying to narrow down issue with this test... --- .../test_persistent_uniform_gen_decides_stop.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index 68c8aaaa05..f6a1e5d57f 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -13,7 +13,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 5 7 +# TESTSUITE_NPROCS: 3 5 # TESTSUITE_OS_SKIP: WIN import sys @@ -82,9 +82,7 @@ assert ( sum(counts == init_batch_size) >= ngens ), "The initial batch of each gen should be common among initial_batch_size number of points" - assert ( - len(counts) > 1 - ), "All gen_ended_times are the same; they should be different for the async case" + assert len(counts) > 1, "All gen_ended_times are the same; they should be different for the async case" gen_workers = np.unique(H["gen_worker"]) print("Generators that issued points", gen_workers) From 015cc4a90446a4759829a484eb64f1c52d8b340b Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 28 Aug 2025 11:11:47 -0500 Subject: [PATCH 15/32] still narrowing down issues with the zrw tests and having a zeroth worker... --- libensemble/sim_funcs/run_line_check.py | 6 +++--- .../test_mpi_runners_zrw_subnode_uneven.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 96ef13a5da..44827d3c90 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (persis_gens) + offset = workerID # - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) @@ -80,7 +80,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): exctr = Executor.executor test = sim_specs["user"]["tests"][0] exp_list = sim_specs["user"]["expect"] - p_gens = sim_specs["user"].get("persis_gens", 0) + # p_gens = sim_specs["user"].get("persis_gens", 0) task = exctr.submit( calc_type="sim", @@ -107,7 +107,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): else: wid_mod = wid - new_exp_list = exp_list[wid_mod - 1 - p_gens] + new_exp_list = exp_list[wid_mod - 1] # - p_gens] if outline != new_exp_list: print(f"Worker {wid}:\n outline is: {outline}\n exp is: {new_exp_list}", flush=True) diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index cc73d0e427..9c9f936edb 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -44,6 +44,7 @@ comms = libE_specs["comms"] libE_specs["dedicated_mode"] = True + libE_specs["zero_resource_workers"] = [0] libE_specs["enforce_worker_core_bounds"] = True # To allow visual checking - log file not used in test @@ -52,7 +53,7 @@ # For varying size test - relate node count to nworkers n_gens = 1 - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens if nsim_workers % 2 == 0: sys.exit( From 8c04b3a3212483165c31c3a10b4b222e5b23da29 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 5 Sep 2025 11:35:10 -0500 Subject: [PATCH 16/32] various fixes throughout the codebase to try making the number of workers (with gen_on_worker defaulting to False) clear to resources. other temporary debug adjusts --- libensemble/libE.py | 4 +++- libensemble/resources/worker_resources.py | 2 +- libensemble/sim_funcs/run_line_check.py | 2 +- .../functionality_tests/test_zero_resource_workers.py | 10 ++++------ libensemble/worker.py | 6 +++--- pyproject.toml | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index af302d13c8..665553fe75 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -489,8 +489,10 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li wcomms = start_proc_team(libE_specs["nworkers"], sim_specs, gen_specs, libE_specs) # Set manager resources after the forkpoint. + # if libE_specs["gen_on_worker"] == True, -n reflects the exact number of workers + # if libE_specs["gen_on_worker"] == False: nworkers internally is the number of workers + 1 if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"]) + resources.set_resource_manager(libE_specs["nworkers"] + (1 - libE_specs["gen_on_worker"])) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 5033b2aeee..8df45929cc 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -106,7 +106,7 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in """Map WorkerID to index into a nodelist""" index = 0 index_list = [] - for i in range(1, num_workers + 1): + for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) else: diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 44827d3c90..9d16205d28 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID # - (persis_gens) + offset = workerID new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index d286aa7128..93c4350786 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -9,8 +9,6 @@ The number of concurrent evaluations of the objective function will be 4-1=3. """ -import sys - import numpy as np from libensemble import logger @@ -23,7 +21,7 @@ from libensemble.tools import add_unique_random_streams, parse_args # logger.set_level("DEBUG") # For testing the test -logger.set_level("INFO") +logger.set_level("DEBUG") # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local @@ -49,7 +47,7 @@ # For varying size test - relate node count to nworkers in_place = libE_specs["zero_resource_workers"] n_gens = len(in_place) - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens comms = libE_specs["comms"] nodes_per_worker = 2 @@ -79,8 +77,8 @@ exctr = MPIExecutor(custom_info=mpi_customizer) exctr.register_app(full_path=sim_app, calc_type="sim") - if nworkers < 2: - sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") + # if nworkers < 2: + # sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") n = 2 sim_specs = { diff --git a/libensemble/worker.py b/libensemble/worker.py index 5f574a3336..e543e93bcc 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -177,7 +177,7 @@ def __init__( self.runners = {EVAL_SIM_TAG: self.sim_runner.run, EVAL_GEN_TAG: self.gen_runner.run} self.calc_iter = {EVAL_SIM_TAG: 0, EVAL_GEN_TAG: 0} Worker._set_executor(self.workerID, self.comm) - Worker._set_resources(self.workerID, self.comm) + Worker._set_resources(self.workerID, self.comm, self.libE_specs) self.EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) @staticmethod @@ -215,11 +215,11 @@ def _set_executor(workerID: int, comm: Comm) -> bool: return False @staticmethod - def _set_resources(workerID, comm: Comm) -> bool: + def _set_resources(workerID, comm: Comm, libE_specs) -> bool: """Sets worker ID in the resources, return True if set""" resources = Resources.resources if isinstance(resources, Resources): - resources.set_worker_resources(comm.get_num_workers(), workerID) + resources.set_worker_resources(comm.get_num_workers() + (1 - libE_specs["gen_on_worker"]), workerID) return True else: logger.debug(f"No resources set on worker {workerID}") diff --git a/pyproject.toml b/pyproject.toml index 68d5654da3..231afa5bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,4 +142,4 @@ extend-exclude = ["*.bib", "*.xml", "docs/nitpicky"] disable_error_code = ["import-not-found", "import-untyped"] [dependency-groups] -dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4"] +dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4", "wat>=0.7.0,<0.8"] From fafff0ea5b0ab16809f00ad9f7cfe6a2f09fc3bf Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 12 Dec 2025 16:25:13 -0600 Subject: [PATCH 17/32] adjust map_workerid_to_index unit test for starting at workerID index zero --- .../tests/unit_tests/test_resources.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index b87583f377..15db28ea07 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -694,23 +694,23 @@ def test_map_workerid_to_index(): zero_resource_list = [] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(1, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 1, "index incorrect. Received: " + str(index) + for workerID in range(0, num_workers): + index = index_list[workerID] + assert index == workerID, "index incorrect. Received: " + str(index) - zero_resource_list = [1] + zero_resource_list = [0] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(2, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 2, "index incorrect. Received: " + str(index) + for workerID in range(1, num_workers): + index = index_list[workerID] + assert index == workerID - 1, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 2] + zero_resource_list = [0, 1] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(3, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 3, "index incorrect. Received: " + str(index) + for workerID in range(2, num_workers): + index = index_list[workerID] + assert index == workerID - 2, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 3] + zero_resource_list = [0, 2] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) workerID = 2 From f4370683209460ccbdf97c90a564d1040273e0dd Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 18 Dec 2025 10:46:36 -0600 Subject: [PATCH 18/32] perhaps the default zero-resource-worker should be worker 0 - but does this actually solve anything? --- libensemble/specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/specs.py b/libensemble/specs.py index 168bbcc380..16347084c5 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -469,7 +469,7 @@ class LibeSpecs(BaseModel): libEnsemble processes (manager and workers) are running. """ - zero_resource_workers: list[int] | None = [] + zero_resource_workers: list[int] | None = [0] """ list of workers that require no resources. For when a fixed mapping of workers to resources is required. Otherwise, use ``num_resource_sets``. From 39798a393218e7f508eadf2bd26050f33c1f11d8 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 13 Mar 2026 12:40:43 -0500 Subject: [PATCH 19/32] various nworkers indexing fixes for outstanding bugs and indexing errors, regarding the default presence of the zeroth worker. assistance from gemini/claude for fixing resource math in various tests. --- libensemble/libE.py | 3 +-- libensemble/manager.py | 2 +- libensemble/sim_funcs/run_line_check.py | 2 +- .../test_GPU_gen_resources.py | 27 +++++++++++++------ .../test_asktell_sampling.py | 2 +- .../test_asktell_sampling_external_gen.py | 2 +- .../test_zero_resource_workers.py | 21 ++++++++------- .../test_zero_resource_workers_subnode.py | 19 ++++++------- .../test_asktell_aposmm_nlopt.py | 2 +- .../regression_tests/test_optimas_ax_mf.py | 2 +- .../test_optimas_ax_multitask.py | 2 +- .../regression_tests/test_optimas_ax_sf.py | 2 +- .../test_optimas_grid_sample.py | 2 +- .../tests/regression_tests/test_xopt_EI.py | 2 +- .../regression_tests/test_xopt_EI_xopt_sim.py | 2 +- .../regression_tests/test_xopt_nelder_mead.py | 2 +- .../forces_simple_xopt/run_libe_forces.py | 1 - libensemble/worker.py | 2 +- 18 files changed, 54 insertions(+), 43 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index c3d5df2917..7bd84658dc 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -495,9 +495,8 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li # Set manager resources after the forkpoint. # if libE_specs["gen_on_worker"] == True, -n reflects the exact number of workers - # if libE_specs["gen_on_worker"] == False: nworkers internally is the number of workers + 1 if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"] + (1 - libE_specs["gen_on_worker"])) + resources.set_resource_manager(libE_specs["nworkers"] + 1) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/manager.py b/libensemble/manager.py index 208c7058a7..5d6e1cf79b 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -401,7 +401,7 @@ def _set_resources(self, Work: dict, w: int) -> None: if rset_req is None: rset_team = [] - default_rset = resource_manager.index_list[w - 1] + default_rset = resource_manager.index_list[w] if default_rset is not None: rset_team.append(default_rset) Work["libE_info"]["rset_team"] = rset_team diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 9d16205d28..98b5fa29e8 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID + offset = workerID - 1 new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index 48b7fee0d2..9ea37ff9a6 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -50,8 +50,6 @@ if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers # Persistent gen DOES need resources - # Mock GPU system / uncomment to detect GPUs libE_specs["sim_dirs_make"] = True # Will only contain files if dry_run is False libE_specs["gen_dirs_make"] = True # Will only contain files if dry_run is False @@ -80,8 +78,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("num_procs", int), ("num_gpus", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_procs": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": "set_in_loop", + "max_procs": "set_in_loop", # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), "dry_run": dry_run, @@ -97,7 +95,6 @@ } exit_criteria = {"sim_max": 20} - libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() for gen_on_worker in [False, True]: @@ -105,7 +102,21 @@ # reset libE_specs = base_libE_specs.copy() libE_specs["gen_on_worker"] = gen_on_worker - persis_info = add_unique_random_streams({}, nworkers + 1) + libE_specs["zero_resource_workers"] = [] + + active_workers = nworkers if gen_on_worker else nworkers + 1 + sim_workers = nworkers - 1 if gen_on_worker else nworkers + + gen_specs["user"]["initial_batch_size"] = sim_workers + gen_specs["user"]["max_procs"] = sim_workers + + libE_specs["num_resource_sets"] = active_workers + libE_specs["resource_info"] = { + "cores_on_node": (active_workers * 2, active_workers * 4), + "gpus_on_node": active_workers, + } + + persis_info = add_unique_random_streams({}, active_workers) if run == 0: libE_specs["gen_num_procs"] = 2 @@ -117,13 +128,13 @@ persis_info["gen_num_gpus"] = 1 elif run == 3: # Two GPUs per resource set - libE_specs["resource_info"]["gpus_on_node"] = nworkers * 2 + libE_specs["resource_info"]["gpus_on_node"] = active_workers * 2 persis_info["gen_num_gpus"] = 1 elif run == 4: # Two GPUs requested for gen persis_info["gen_num_procs"] = 2 persis_info["gen_num_gpus"] = 2 - gen_specs["user"]["max_procs"] = max(nworkers - 2, 1) + gen_specs["user"]["max_procs"] = max(sim_workers - 1, 1) # Perform the run H, persis_info, flag = libE( diff --git a/libensemble/tests/functionality_tests/test_asktell_sampling.py b/libensemble/tests/functionality_tests/test_asktell_sampling.py index cebb858df2..7da9f1c668 100644 --- a/libensemble/tests/functionality_tests/test_asktell_sampling.py +++ b/libensemble/tests/functionality_tests/test_asktell_sampling.py @@ -34,7 +34,7 @@ def sim_f(In): if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_manager"] = True + libE_specs["gen_on_worker"] = False sim_specs = { "sim_f": sim_f, diff --git a/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py b/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py index 8578da720c..1bbbf696f4 100644 --- a/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py +++ b/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py @@ -44,7 +44,7 @@ def sim_f_scalar(In): if __name__ == "__main__": - libE_specs = LibeSpecs(gen_on_manager=True) + libE_specs = LibeSpecs() for test in range(1): # 2 diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index 93c4350786..bc03ebbb7a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -1,12 +1,12 @@ """ Runs libEnsemble testing the zero_resource_workers argument. -Execute via one of the following commands (e.g. 3 workers): - mpiexec -np 4 python test_zero_resource_workers.py - python test_zero_resource_workers.py --nworkers 3 - python test_zero_resource_workers.py --nworkers 3 --comms tcp +Execute via one of the following commands (e.g. 4 workers): + mpiexec -np 5 python test_zero_resource_workers.py + python test_zero_resource_workers.py --nworkers 4 + python test_zero_resource_workers.py --nworkers 4 --comms tcp -The number of concurrent evaluations of the objective function will be 4-1=3. +The number of concurrent evaluations of the objective function will be 4. """ import numpy as np @@ -25,7 +25,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 4 +# TESTSUITE_NPROCS: 3 5 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": @@ -45,9 +45,10 @@ nodes_per_worker = 2 # For varying size test - relate node count to nworkers - in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) - nsim_workers = nworkers # - n_gens + # With gen-on-manager (default), all user workers are sim workers. + # Worker 0 (gen) is hidden and in zero_resource_workers. + n_gens = 0 + nsim_workers = nworkers comms = libE_specs["comms"] nodes_per_worker = 2 @@ -99,7 +100,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers) + persis_info = add_unique_random_streams({}, nworkers + 1) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index a7a57b584a..19986097c7 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -2,13 +2,13 @@ Runs libEnsemble testing the zero_resource_workers argument with 2 workers per node. -This test must be run on an odd number of workers >= 3 (e.g. even number of +This test must be run on an even number of workers >= 2 (e.g. odd number of procs when using mpi4py). Execute via one of the following commands (e.g. 3 workers): - mpiexec -np 4 python test_zero_resource_workers_subnode.py - python test_zero_resource_workers_subnode.py --nworkers 3 - python test_zero_resource_workers_subnode.py --nworkers 3 --comms tcp + mpiexec -np 5 python test_zero_resource_workers_subnode.py + python test_zero_resource_workers_subnode.py --nworkers 4 + python test_zero_resource_workers_subnode.py --nworkers 4 --comms tcp """ import sys @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 +# TESTSUITE_NPROCS: 5 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": @@ -50,9 +50,10 @@ nodes_per_worker = 0.5 # For varying size test - relate node count to nworkers - in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) - nsim_workers = nworkers - n_gens + # With gen-on-manager (default), all user workers are sim workers. + # Worker 0 (gen) is hidden and in zero_resource_workers. + n_gens = 0 + nsim_workers = nworkers if not (nsim_workers * nodes_per_worker).is_integer(): sys.exit(f"Sim workers ({nsim_workers}) must divide evenly into nodes") @@ -100,7 +101,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers) + persis_info = add_unique_random_streams({}, nworkers + 1) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py b/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py index a85771d7dd..5a36e2e1ec 100644 --- a/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py +++ b/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py @@ -64,7 +64,7 @@ def six_hump_camel_func(x): n = 2 workflow.alloc_specs = AllocSpecs(alloc_f=alloc_f) - workflow.libE_specs.gen_on_manager = True + workflow.libE_specs.gen_on_worker = False vocs = VOCS( variables={"core": [-3, 3], "edge": [-2, 2], "core_on_cube": [-3, 3], "edge_on_cube": [-2, 2]}, diff --git a/libensemble/tests/regression_tests/test_optimas_ax_mf.py b/libensemble/tests/regression_tests/test_optimas_ax_mf.py index 758aa1fc2c..d99bc518a9 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_mf.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_mf.py @@ -42,7 +42,7 @@ def eval_func_mf(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={"x0": [-50.0, 5.0], "x1": [-5.0, 15.0], "res": [1.0, 8.0]}, diff --git a/libensemble/tests/regression_tests/test_optimas_ax_multitask.py b/libensemble/tests/regression_tests/test_optimas_ax_multitask.py index 43c6c444eb..65c81a373f 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_multitask.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_multitask.py @@ -57,7 +57,7 @@ def eval_func_multitask(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={ diff --git a/libensemble/tests/regression_tests/test_optimas_ax_sf.py b/libensemble/tests/regression_tests/test_optimas_ax_sf.py index e4ee9e8a79..b04753490f 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_sf.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_sf.py @@ -41,7 +41,7 @@ def eval_func_sf(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={ diff --git a/libensemble/tests/regression_tests/test_optimas_grid_sample.py b/libensemble/tests/regression_tests/test_optimas_grid_sample.py index 57c6c8fedf..555ef071f7 100644 --- a/libensemble/tests/regression_tests/test_optimas_grid_sample.py +++ b/libensemble/tests/regression_tests/test_optimas_grid_sample.py @@ -42,7 +42,7 @@ def eval_func(input_params: dict): n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) # Create varying parameters. lower_bounds = [-3.0, 2.0] diff --git a/libensemble/tests/regression_tests/test_xopt_EI.py b/libensemble/tests/regression_tests/test_xopt_EI.py index a78aee60a5..3c902ee9d7 100644 --- a/libensemble/tests/regression_tests/test_xopt_EI.py +++ b/libensemble/tests/regression_tests/test_xopt_EI.py @@ -53,7 +53,7 @@ def xtest_sim(H, persis_info, sim_specs, _): n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py b/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py index 07ace47fa9..8082a67fac 100644 --- a/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py +++ b/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py @@ -47,7 +47,7 @@ def xtest_callable(input_dict: dict, a=0) -> dict: n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/regression_tests/test_xopt_nelder_mead.py b/libensemble/tests/regression_tests/test_xopt_nelder_mead.py index fe8039b95c..d8bfdb4597 100644 --- a/libensemble/tests/regression_tests/test_xopt_nelder_mead.py +++ b/libensemble/tests/regression_tests/test_xopt_nelder_mead.py @@ -38,7 +38,7 @@ def rosenbrock_callable(input_dict: dict) -> dict: batch_size = 1 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py index 2a7f8f7018..2d23c83a88 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py @@ -32,7 +32,6 @@ # Persistent gen does not need resources ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, sim_dirs_make=True, ) diff --git a/libensemble/worker.py b/libensemble/worker.py index 3ac46b9760..3a472899e6 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -223,7 +223,7 @@ def _set_resources(workerID, comm: Comm, libE_specs) -> bool: """Sets worker ID in the resources, return True if set""" resources = Resources.resources if isinstance(resources, Resources): - resources.set_worker_resources(comm.get_num_workers() + (1 - libE_specs["gen_on_worker"]), workerID) + resources.set_worker_resources(comm.get_num_workers() + 1, workerID) return True else: logger.debug(f"No resources set on worker {workerID}") From 836e52514784ea7a22da85a5764c68f6d594dd60 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 13 Mar 2026 13:59:39 -0500 Subject: [PATCH 20/32] some clarifying renames and comments --- .../test_GPU_gen_resources.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index 9ea37ff9a6..d45abaa4bf 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -102,21 +102,23 @@ # reset libE_specs = base_libE_specs.copy() libE_specs["gen_on_worker"] = gen_on_worker - libE_specs["zero_resource_workers"] = [] + libE_specs["zero_resource_workers"] = [] # perhaps the generator needs GPUs - active_workers = nworkers if gen_on_worker else nworkers + 1 + resourced_workers = ( + nworkers if gen_on_worker else nworkers + 1.0 + ) # note this "nworkers" decided before the extra worker starts sim_workers = nworkers - 1 if gen_on_worker else nworkers gen_specs["user"]["initial_batch_size"] = sim_workers gen_specs["user"]["max_procs"] = sim_workers - libE_specs["num_resource_sets"] = active_workers + libE_specs["num_resource_sets"] = resourced_workers libE_specs["resource_info"] = { - "cores_on_node": (active_workers * 2, active_workers * 4), - "gpus_on_node": active_workers, + "cores_on_node": (resourced_workers * 2, resourced_workers * 4), + "gpus_on_node": resourced_workers, } - persis_info = add_unique_random_streams({}, active_workers) + persis_info = add_unique_random_streams({}, resourced_workers) if run == 0: libE_specs["gen_num_procs"] = 2 @@ -128,7 +130,7 @@ persis_info["gen_num_gpus"] = 1 elif run == 3: # Two GPUs per resource set - libE_specs["resource_info"]["gpus_on_node"] = active_workers * 2 + libE_specs["resource_info"]["gpus_on_node"] = resourced_workers * 2 persis_info["gen_num_gpus"] = 1 elif run == 4: # Two GPUs requested for gen From 9a8425e4531a088c338591ff11f7c1dcef2046d1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 09:27:53 -0500 Subject: [PATCH 21/32] ditto --- .../tests/regression_tests/test_persistent_aposmm_dfols.py | 1 + .../tests/regression_tests/test_persistent_aposmm_exception.py | 1 + .../regression_tests/test_persistent_aposmm_external_localopt.py | 1 + .../tests/regression_tests/test_persistent_aposmm_nlopt.py | 1 + .../tests/regression_tests/test_persistent_aposmm_tao_blmvm.py | 1 + 5 files changed, 5 insertions(+) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 6e19930691..c810db561e 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -69,6 +69,7 @@ def combine_component(x): "gen_f": gen_f, "persis_in": ["f", "fvec"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "localopt_method": "dfols", diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py b/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py index b197dc3f07..09544e51d1 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py @@ -69,6 +69,7 @@ def assertion(passed): "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "localopt_method": "LN_BOBYQA", diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py index dd01d1069e..22cf47325f 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py @@ -75,6 +75,7 @@ "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py index 9d42784439..bf5914573f 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py @@ -64,6 +64,7 @@ "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py index 39ff3b79fd..8a3c762871 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py @@ -67,6 +67,7 @@ "gen_f": gen_f, "persis_in": ["f", "grad"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), From e0d4cbaee883813193b759cbabd1dc1cd6a67bc8 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 13:30:50 -0500 Subject: [PATCH 22/32] 0 is always a zero-resource-worker, no matter what... plus mypy fixes --- libensemble/resources/resources.py | 8 +++++--- libensemble/sim_funcs/run_line_check.py | 4 ++-- .../tests/functionality_tests/test_GPU_gen_resources.py | 2 +- .../test_mpi_runners_subnode_uneven.py | 2 +- .../test_mpi_runners_zrw_supernode_uneven.py | 5 +++-- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 105f8a836e..90765e9490 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -63,10 +63,10 @@ def init_resources(cls, libE_specs: dict, platform_info: dict = {}) -> None: libE_specs=libE_specs, platform_info=platform_info, top_level_dir=top_level_dir ) - def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = None) -> None: + def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = "") -> None: """Initiate a new resources object""" self.top_level_dir = top_level_dir or os.getcwd() - self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir=None) + self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir="") self.resource_manager = None # For Manager self.worker_resources = None # For Workers @@ -101,7 +101,7 @@ class GlobalResources: :ivar int num_resource_sets: Number of resource sets, if supplied by the user. """ - def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = None) -> None: + def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = "") -> None: """Initializes a new Resources instance Determines the compute resources available for current allocation, including @@ -167,6 +167,8 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) + if 0 not in self.zero_resource_workers: + self.zero_resource_workers.append(0) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 98b5fa29e8..d30d10ec38 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -80,7 +80,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): exctr = Executor.executor test = sim_specs["user"]["tests"][0] exp_list = sim_specs["user"]["expect"] - # p_gens = sim_specs["user"].get("persis_gens", 0) + p_gens = sim_specs["user"].get("persis_gens", 0) task = exctr.submit( calc_type="sim", @@ -107,7 +107,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): else: wid_mod = wid - new_exp_list = exp_list[wid_mod - 1] # - p_gens] + new_exp_list = exp_list[wid_mod - 1 - p_gens] if outline != new_exp_list: print(f"Worker {wid}:\n outline is: {outline}\n exp is: {new_exp_list}", flush=True) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index d45abaa4bf..5308f12c7f 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -105,7 +105,7 @@ libE_specs["zero_resource_workers"] = [] # perhaps the generator needs GPUs resourced_workers = ( - nworkers if gen_on_worker else nworkers + 1.0 + nworkers if gen_on_worker else nworkers + 1 ) # note this "nworkers" decided before the extra worker starts sim_workers = nworkers - 1 if gen_on_worker else nworkers diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py index a5145965b9..9312c83b49 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py @@ -26,7 +26,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 6 +# TESTSUITE_NPROCS: 5 7 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py index 640d613bff..77ce05006f 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py @@ -33,9 +33,10 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0, 1] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True + libE_specs["gen_on_worker"] = True # To allow visual checking - log file not used in test log_file = "ensemble_mpi_runners_zrw_supernode_uneven_comms_" + str(comms) + "_wrks_" + str(nworkers) + ".log" @@ -45,7 +46,7 @@ # For varying size test - relate node count to nworkers in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) + n_gens = len([w for w in in_place if w != 0]) nsim_workers = nworkers - n_gens comms = libE_specs["comms"] node_file = "nodelist_mpi_runners_zrw_supernode_uneven_comms_" + str(comms) + "_wrks_" + str(nworkers) From 6505cd8438dbb2e64e962b39e981a281adcd9afa Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 15:51:34 -0500 Subject: [PATCH 23/32] various zrw / gen-worker indexing fixes. defensive programming around worker indexes and zrws. mypy types. vibe-coded new test cases for test_mpi_runners_zrw_subnode_uneven --- libensemble/libE.py | 2 +- libensemble/resources/resources.py | 6 ++---- libensemble/resources/worker_resources.py | 9 ++++++-- .../test_mpi_gpu_settings.py | 6 +++--- .../test_mpi_gpu_settings_env.py | 6 +++--- .../test_mpi_runners_zrw_subnode_uneven.py | 21 +++++++++++++++---- ...est_persistent_uniform_gen_decides_stop.py | 2 ++ 7 files changed, 35 insertions(+), 17 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index f768b1e11f..71cc654e8c 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -385,7 +385,7 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE # Run manager or worker code, depending if is_manager: if resources is not None: - resources.set_resource_manager(nworkers) + resources.set_resource_manager(nworkers + 1) return libE_mpi_manager( mpi_comm, sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0 ) diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 90765e9490..5ba54cc09b 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -67,8 +67,8 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st """Initiate a new resources object""" self.top_level_dir = top_level_dir or os.getcwd() self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir="") - self.resource_manager = None # For Manager - self.worker_resources = None # For Workers + self.resource_manager: ResourceManager | None = None # For Manager + self.worker_resources: WorkerResources | None = None # For Workers def set_worker_resources(self, num_workers: int, workerid: int) -> None: """Initiate the worker resources component of resources""" @@ -167,8 +167,6 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) - if 0 not in self.zero_resource_workers: - self.zero_resource_workers.append(0) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 8df45929cc..df89625ad6 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -105,7 +105,7 @@ def free_rsets(self, worker=None): def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[int | Any]) -> list[int | None]: """Map WorkerID to index into a nodelist""" index = 0 - index_list = [] + index_list: list[int | None] = [] for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) @@ -116,6 +116,11 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in else: index_list.append(index) index += 1 + + for i in zero_resource_list: + if i >= num_workers: + logger.warning(f"Worker index {i} from zero_resource_workers is out of range (0-{num_workers - 1})") + return index_list @@ -364,7 +369,7 @@ def get_local_nodelist( local_nodelist = list(OrderedDict.fromkeys(team_list)) # Maintain order of nodes logger.debug(f"Worker's local_nodelist is {local_nodelist}") - slots = {} + slots: dict[str, list[int]] = {} for node in local_nodelist: slots[node] = [] diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py index 31e537a31d..121bb08cb1 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py @@ -66,7 +66,7 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + libE_specs["num_resource_sets"] = nworkers # Persistent gen does not need resources libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles if libE_specs["comms"] == "tcp": @@ -88,8 +88,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nworkers, + "max_resource_sets": nworkers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py index 814f5086cb..4cfc431c10 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py @@ -43,7 +43,7 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + libE_specs["num_resource_sets"] = nworkers # Persistent gen does not need resources libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles # Optional for organization of output scripts @@ -70,8 +70,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nworkers, + "max_resource_sets": nworkers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index 9c9f936edb..68e9c30cd3 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -139,10 +139,17 @@ exp_srun.append(srun_p1 + str(nodename) + srun_p2 + str(ntasks) + srun_p3 + str(ntasks) + srun_p4) test_list = test_list_base - exp_list = exp_srun + exp_list_dynamic = exp_srun.copy() + if nworkers == 5: + # Iteration 0 (Dynamic): Workers 1, 2 -> node-2; 3, 4, 5 -> node-1 + n1 = srun_p1 + "node-1" + srun_p2 + "5" + srun_p3 + "5" + srun_p4 + n2 = srun_p1 + "node-2" + srun_p2 + "8" + srun_p3 + "8" + srun_p4 + exp_list_dynamic = [n2, n2, n1, n1, n1] + # Iteration 1 (Static): Worker 1 is gen. Workers 2, 3 -> node-1; 4, 5 -> node-2 + exp_list_static = [n1, n1, n2, n2] + sim_specs["user"] = { "tests": test_list, - "expect": exp_list, "persis_gens": n_gens, } @@ -150,15 +157,21 @@ for prob_id in range(iterations): if prob_id == 0: # Uses dynamic scheduler - will find node 2 slots first (as fewer) - libE_specs["num_resource_sets"] = nworkers - 1 # Any worker can be the gen - sim_specs["user"]["offset_for_scheduler"] = True # Changes expected values + libE_specs["gen_on_worker"] = False + libE_specs["num_resource_sets"] = nworkers + sim_specs["user"]["expect"] = exp_list_dynamic + sim_specs["user"]["offset_for_scheduler"] = False + sim_specs["user"]["persis_gens"] = 0 persis_info = add_unique_random_streams({}, nworkers + 1) else: # Uses static scheduler - will find node 1 slots first + libE_specs["gen_on_worker"] = True + sim_specs["user"]["expect"] = exp_list_static del libE_specs["num_resource_sets"] libE_specs["zero_resource_workers"] = [1] # Gen must be worker 1 sim_specs["user"]["offset_for_scheduler"] = False + sim_specs["user"]["persis_gens"] = 1 persis_info = add_unique_random_streams({}, nworkers + 1) # Perform the run diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index f6a1e5d57f..7d6389ad33 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -36,6 +36,8 @@ n = 2 init_batch_size = nworkers - ngens + libE_specs["gen_on_worker"] = True + if ngens >= nworkers: sys.exit("The number of generators must be less than the number of workers -- aborting...") From 94fc628227fd77fb7bc0c91210c380fe76ae2085 Mon Sep 17 00:00:00 2001 From: jlnav Date: Mon, 23 Mar 2026 15:20:19 -0500 Subject: [PATCH 24/32] vibe coded approach lets see how this works --- libensemble/libE.py | 26 ++++++++++++------- libensemble/manager.py | 4 +-- libensemble/resources/resources.py | 2 +- libensemble/resources/worker_resources.py | 11 +++++--- libensemble/specs.py | 9 ++++--- .../test_mpi_gpu_settings.py | 14 +++++++--- .../test_mpi_gpu_settings_env.py | 14 +++++++--- ...est_persistent_uniform_gen_decides_stop.py | 13 +++++++--- .../tests/unit_tests/test_manager_main.py | 2 +- .../tests/unit_tests/test_resources.py | 24 ++++++++--------- libensemble/tools/alloc_support.py | 2 +- 11 files changed, 78 insertions(+), 43 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index abde5423c0..64abad2abd 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -155,7 +155,7 @@ def libE( exit_criteria: ExitCriteria, persis_info: dict = {}, alloc_specs: AllocSpecs = AllocSpecs(), - libE_specs: LibeSpecs = {}, + libE_specs: LibeSpecs | dict = {}, H0=None, ) -> (np.ndarray, dict, int): """ @@ -242,11 +242,16 @@ def libE( ] exit_criteria = specs_dump(ensemble.exit_criteria, by_alias=True, exclude_none=True) - # Restore objects that don't survive serialization via model_dump - if hasattr(ensemble.gen_specs, "generator") and ensemble.gen_specs.generator is not None: - gen_specs["generator"] = ensemble.gen_specs.generator - if hasattr(ensemble.gen_specs, "vocs") and ensemble.gen_specs.vocs is not None: - gen_specs["vocs"] = ensemble.gen_specs.vocs + if hasattr(ensemble.sim_specs, "simulator") and ensemble.sim_specs.simulator is not None: + sim_specs["simulator"] = ensemble.sim_specs.simulator + if hasattr(ensemble.sim_specs, "vocs") and ensemble.sim_specs.vocs is not None: + sim_specs["vocs"] = ensemble.sim_specs.vocs + + if ensemble.gen_specs is not None: + if hasattr(ensemble.gen_specs, "generator") and ensemble.gen_specs.generator is not None: + gen_specs["generator"] = ensemble.gen_specs.generator + if hasattr(ensemble.gen_specs, "vocs") and ensemble.gen_specs.vocs is not None: + gen_specs["vocs"] = ensemble.gen_specs.vocs # Extract platform info from settings or environment platform_info = get_platform(libE_specs) @@ -358,7 +363,7 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE logger.manager_warning("*WARNING* libEnsemble detected a NULL communicator") return [], persis_info, 3 # Process not in mpi_comm - assert libE_specs["mpi_comm"].Get_size() > 1, "Manager only - must be at least one worker (2 MPI tasks)" + assert libE_specs["mpi_comm"].Get_size() >= 1, "Manager only - must be at least one MPI task" with DupComm(libE_specs["mpi_comm"]) as mpi_comm: is_manager = mpi_comm.Get_rank() == 0 @@ -368,7 +373,6 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE local_host = socket.gethostname() libE_nodes = list(set(mpi_comm.allgather(local_host))) resources.add_comm_info(libE_nodes=libE_nodes) - nworkers = mpi_comm.Get_size() - 1 exctr = Executor.executor if exctr is not None: @@ -379,7 +383,8 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE # Run manager or worker code, depending if is_manager: if resources is not None: - resources.set_resource_manager(nworkers) + n_resource_workers = mpi_comm.Get_size() + resources.set_resource_manager(n_resource_workers) return libE_mpi_manager( mpi_comm, sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0 ) @@ -497,7 +502,8 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li # Set manager resources after the forkpoint. if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"]) + n_resource_workers = libE_specs["nworkers"] + (not libE_specs.get("gen_on_worker", False)) + resources.set_resource_manager(n_resource_workers) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/manager.py b/libensemble/manager.py index c2763b7177..cf7aebeab4 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,7 +232,7 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - gen_on_manager = self.libE_specs.get("gen_on_manager", False) + gen_on_manager = not self.libE_specs.get("gen_on_worker", False) self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) if gen_on_manager: @@ -662,7 +662,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "gen_on_manager": self.libE_specs.get("gen_on_manager", False), + "gen_on_worker": self.libE_specs.get("gen_on_worker", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 105f8a836e..636265f3c4 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -166,7 +166,7 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st """ self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) - self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) + self.zero_resource_workers = libE_specs.get("zero_resource_workers", [0]) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 5033b2aeee..df89625ad6 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -105,8 +105,8 @@ def free_rsets(self, worker=None): def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[int | Any]) -> list[int | None]: """Map WorkerID to index into a nodelist""" index = 0 - index_list = [] - for i in range(1, num_workers + 1): + index_list: list[int | None] = [] + for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) else: @@ -116,6 +116,11 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in else: index_list.append(index) index += 1 + + for i in zero_resource_list: + if i >= num_workers: + logger.warning(f"Worker index {i} from zero_resource_workers is out of range (0-{num_workers - 1})") + return index_list @@ -364,7 +369,7 @@ def get_local_nodelist( local_nodelist = list(OrderedDict.fromkeys(team_list)) # Maintain order of nodes logger.debug(f"Worker's local_nodelist is {local_nodelist}") - slots = {} + slots: dict[str, list[int]] = {} for node in local_nodelist: slots[node] = [] diff --git a/libensemble/specs.py b/libensemble/specs.py index 718e927343..9b7fbd8b70 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -344,9 +344,9 @@ class LibeSpecs(BaseModel): nworkers: int | None = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - gen_on_manager: bool | None = False - """ Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. + gen_on_worker: bool = False + """ Instructs libEnsemble to run generator functions on a worker rank. + By default, the generator runs on the manager process as a thread (Worker 0). """ mpi_comm: object | None = None @@ -635,10 +635,11 @@ class LibeSpecs(BaseModel): libEnsemble processes (manager and workers) are running. """ - zero_resource_workers: list[int] | None = [] + zero_resource_workers: list[int] | None = [0] """ list of workers that require no resources. For when a fixed mapping of workers to resources is required. Otherwise, use ``num_resource_sets``. + By default, Worker 0 (manager thread) is a zero-resource worker. For use with supported allocation functions. """ diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py index 31e537a31d..0213be4f6f 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py @@ -66,7 +66,15 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + + # If gen_on_worker is False (default), then all nworkers are available for sims. + # Worker 0 is the generator (and it is a zero_resource_worker by default). + if not libE_specs.get("gen_on_worker", False): + nsim_workers = nworkers + else: + nsim_workers = nworkers - 1 + + libE_specs["num_resource_sets"] = nsim_workers libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles if libE_specs["comms"] == "tcp": @@ -88,8 +96,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nsim_workers, + "max_resource_sets": nsim_workers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py index 814f5086cb..55dfc01e03 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py @@ -43,7 +43,15 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + + # If gen_on_worker is False (default), then all nworkers are available for sims. + # Worker 0 is the generator (and it is a zero_resource_worker by default). + if not libE_specs.get("gen_on_worker", False): + nsim_workers = nworkers + else: + nsim_workers = nworkers - 1 + + libE_specs["num_resource_sets"] = nsim_workers libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles # Optional for organization of output scripts @@ -70,8 +78,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nsim_workers, + "max_resource_sets": nsim_workers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index d9b9465080..5374ef828f 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -33,11 +33,18 @@ nworkers, is_manager, libE_specs, _ = parse_args() for ngens in range(1, 3): + # If gen_on_worker is False (default), the first gen is on the manager (Worker 0). + # Subsequent gens (if ngens > 1) move to worker ranks. + if not libE_specs.get("gen_on_worker", False): + nsim_workers = nworkers - (ngens - 1) + else: + nsim_workers = nworkers - ngens + n = 2 - init_batch_size = nworkers - ngens + init_batch_size = nsim_workers - if ngens >= nworkers: - sys.exit("The number of generators must be less than the number of workers -- aborting...") + if nsim_workers <= 0: + sys.exit("The number of generators must be less than the available workers -- aborting...") sim_specs = { "sim_f": sim_f, diff --git a/libensemble/tests/unit_tests/test_manager_main.py b/libensemble/tests/unit_tests/test_manager_main.py index 4e246eb570..e34bc76301 100644 --- a/libensemble/tests/unit_tests/test_manager_main.py +++ b/libensemble/tests/unit_tests/test_manager_main.py @@ -6,7 +6,7 @@ import libensemble.manager as man import libensemble.tests.unit_tests.setup as setup -libE_specs = {"comms": "local"} +libE_specs = {"comms": "local", "gen_on_worker": True} def test_term_test_1(): diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index b87583f377..15db28ea07 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -694,23 +694,23 @@ def test_map_workerid_to_index(): zero_resource_list = [] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(1, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 1, "index incorrect. Received: " + str(index) + for workerID in range(0, num_workers): + index = index_list[workerID] + assert index == workerID, "index incorrect. Received: " + str(index) - zero_resource_list = [1] + zero_resource_list = [0] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(2, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 2, "index incorrect. Received: " + str(index) + for workerID in range(1, num_workers): + index = index_list[workerID] + assert index == workerID - 1, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 2] + zero_resource_list = [0, 1] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(3, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 3, "index incorrect. Received: " + str(index) + for workerID in range(2, num_workers): + index = index_list[workerID] + assert index == workerID - 2, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 3] + zero_resource_list = [0, 2] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) workerID = 2 diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index a8c9a65c86..1a47fe5608 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -132,7 +132,7 @@ def fltr_gen_workers(): wrks = [] for wrk in self.W: - if fltr_recving() and fltr_persis() and fltr_zrw() and fltr_gen_workers(): + if all((fltr_recving(), fltr_persis(), fltr_zrw(), fltr_gen_workers())): wrks.append(wrk["worker_id"]) return wrks From f2c77a565d3feadf30b315dbbe4309eac60a2c37 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 24 Mar 2026 08:35:51 -0500 Subject: [PATCH 25/32] the default is already covered by LibeSpecs, so changing the default for self.zero_resource_workers.get breaks too many tests for the benefit --- libensemble/resources/resources.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 9072b371f1..5ba54cc09b 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -166,7 +166,7 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st """ self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) - self.zero_resource_workers = libE_specs.get("zero_resource_workers", [0]) + self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") From 23814474cbe15b3d42c8d9c8bf672c74968432f9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Tue, 24 Mar 2026 11:55:34 -0500 Subject: [PATCH 26/32] if gen_on_worker is True, then [0] needs to be removed as the default zero-resource-worker. [0] shouldn't factor into the math for get_workers2assign2 --- libensemble/resources/rset_resources.py | 2 +- libensemble/tests/functionality_tests/test_1d_splitcomm.py | 1 + libensemble/utils/pydantic_bindings.py | 2 ++ libensemble/utils/specs_checkers.py | 7 +++++++ libensemble/utils/validators.py | 6 ++++++ 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/libensemble/resources/rset_resources.py b/libensemble/resources/rset_resources.py index d35cdaee8b..1ef4cc60ba 100644 --- a/libensemble/resources/rset_resources.py +++ b/libensemble/resources/rset_resources.py @@ -130,7 +130,7 @@ def get_rsets_on_a_node(num_rsets, resources): def get_workers2assign2(num_workers, resources): """Returns workers to assign resources to""" zero_resource_list = resources.zero_resource_workers - return num_workers - len(zero_resource_list) + return num_workers - len(zero_resource_list) if resources.zero_resource_workers != [0] else num_workers @staticmethod def even_assignment(nnodes, nworkers): diff --git a/libensemble/tests/functionality_tests/test_1d_splitcomm.py b/libensemble/tests/functionality_tests/test_1d_splitcomm.py index de73660d70..bc53730b96 100644 --- a/libensemble/tests/functionality_tests/test_1d_splitcomm.py +++ b/libensemble/tests/functionality_tests/test_1d_splitcomm.py @@ -34,6 +34,7 @@ libE_specs["H_file_prefix"] = "splitcomm_" + str(sub_comm_number) libE_specs["safe_mode"] = False libE_specs["disable_log_files"] = True + libE_specs["gen_on_worker"] = True sim_specs = { "sim_f": sim_f, diff --git a/libensemble/utils/pydantic_bindings.py b/libensemble/utils/pydantic_bindings.py index 6ae28efe8b..2177c197e6 100644 --- a/libensemble/utils/pydantic_bindings.py +++ b/libensemble/utils/pydantic_bindings.py @@ -7,6 +7,7 @@ from libensemble import specs from libensemble.resources import platforms from libensemble.utils.validators import ( + check_adjust_zrw_on_gen_on_worker, check_any_workers_and_disable_rm_if_tcp, check_exit_criteria, check_gpu_setting_type, @@ -95,6 +96,7 @@ "set_default_comms": set_default_comms, "set_workflow_dir": set_workflow_dir, "set_calc_dirs_on_input_dir": set_calc_dirs_on_input_dir, + "check_adjust_zrw_on_gen_on_worker": check_adjust_zrw_on_gen_on_worker, }, ) diff --git a/libensemble/utils/specs_checkers.py b/libensemble/utils/specs_checkers.py index b8e793fa51..ff73118855 100644 --- a/libensemble/utils/specs_checkers.py +++ b/libensemble/utils/specs_checkers.py @@ -100,3 +100,10 @@ def _check_logical_cores(values): scg(values, "logical_cores_per_node") % scg(values, "cores_per_node") == 0 ), "Logical cores doesn't divide evenly into cores" return values + + +def _check_adjust_zrw_on_gen_on_worker(values): + """When gen_on_worker is set the default zero_resource_worker value complicates resources""" + if scg(values, "gen_on_worker") and scg(values, "zero_resource_workers") == [0]: + scs(values, "zero_resource_workers", []) + return values diff --git a/libensemble/utils/validators.py b/libensemble/utils/validators.py index 2164bf2f40..e4aef42810 100644 --- a/libensemble/utils/validators.py +++ b/libensemble/utils/validators.py @@ -7,6 +7,7 @@ from libensemble.resources.platforms import Platform from libensemble.utils.specs_checkers import ( + _check_adjust_zrw_on_gen_on_worker, _check_any_workers_and_disable_rm_if_tcp, _check_exit_criteria, _check_H0, @@ -117,6 +118,11 @@ def check_mpi_runner_type(cls, value): check_mpi_runner_type = field_validator("mpi_runner")(classmethod(check_mpi_runner_type)) +@model_validator(mode="after") +def check_adjust_zrw_on_gen_on_worker(self): + return _check_adjust_zrw_on_gen_on_worker(self) + + @model_validator(mode="after") def check_any_workers_and_disable_rm_if_tcp(self): return _check_any_workers_and_disable_rm_if_tcp(self) From 1ee2402a0eedc8fb8a4cb9e1d5883891d226ca21 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 26 Mar 2026 15:03:42 -0500 Subject: [PATCH 27/32] update osx clang --- pixi.lock | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pixi.lock b/pixi.lock index 3357902824..f1064024fb 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61c5432ee07721317765d0bd57cc8802f96ec9376005b4bedc1bb26f39dc116f -size 1020189 +oid sha256:ad5161d3c3e969864ae0a701c652d954eb4a09ffc452ed7d713caf21ff295d57 +size 1018393 diff --git a/pyproject.toml b/pyproject.toml index d81f934fb4..cf65092a70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -187,7 +187,7 @@ gest-api = ">=0.1,<0.2" # macOS dependencies [tool.pixi.target.osx-arm64.dependencies] -clang_osx-arm64 = ">=21.1.7,<22" +clang_osx-arm64 = ">=22.1.0,<23" # Linux dependencies [tool.pixi.target.linux-64.dependencies] From 86ea5cd047cf64509be7c3a7169b7fb8bf6be114 Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 26 Mar 2026 16:23:12 -0500 Subject: [PATCH 28/32] pixi messed up my linker, but this should help prevent such problems again --- .../tests/scaling_tests/forces/forces_app/build_forces.sh | 8 ++++++-- pixi.lock | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh b/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh index b8b379e0ee..8dd599ffed 100755 --- a/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh +++ b/libensemble/tests/scaling_tests/forces/forces_app/build_forces.sh @@ -4,8 +4,12 @@ # Building flat MPI # ------------------------------------------------- -# GCC -mpicc -O3 -o forces.x forces.c -lm +# macOS (Apple Silicon with pixi) / GCC +if [[ "$OSTYPE" == "darwin"* ]]; then + mpicc -cc=clang -O3 -o forces.x forces.c -lm +else + mpicc -O3 -o forces.x forces.c -lm +fi # Intel # mpiicc -O3 -o forces.x forces.c diff --git a/pixi.lock b/pixi.lock index f1064024fb..0647ac6202 100644 --- a/pixi.lock +++ b/pixi.lock @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad5161d3c3e969864ae0a701c652d954eb4a09ffc452ed7d713caf21ff295d57 -size 1018393 +oid sha256:af6db44054dd36ecf322829eafdcde16863284fb381111b03c9d7712ca7fe62b +size 1018583 From 1ddebbb2e4ab4fe046f7a5d9baaf438dcd3a70cd Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 26 Mar 2026 16:39:04 -0500 Subject: [PATCH 29/32] likewise adjusting other buildstrings --- .../tests/functionality_tests/test_asktell_sampling.py | 1 - libensemble/tests/regression_tests/common.py | 3 +++ libensemble/tests/run_tests.py | 5 ++++- libensemble/tests/standalone_tests/kill_test/build.sh | 9 +++++++-- libensemble/tests/unit_tests/test_executor.py | 4 ++++ libensemble/tests/unit_tests/test_executor_gpus.py | 4 ++++ 6 files changed, 22 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_asktell_sampling.py b/libensemble/tests/functionality_tests/test_asktell_sampling.py index c1e97edc78..83db331c8d 100644 --- a/libensemble/tests/functionality_tests/test_asktell_sampling.py +++ b/libensemble/tests/functionality_tests/test_asktell_sampling.py @@ -33,7 +33,6 @@ def sim_f(In): if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_worker"] = False sim_specs = { "sim_f": sim_f, diff --git a/libensemble/tests/regression_tests/common.py b/libensemble/tests/regression_tests/common.py index cb174d09c9..ebf45cdaac 100644 --- a/libensemble/tests/regression_tests/common.py +++ b/libensemble/tests/regression_tests/common.py @@ -5,6 +5,7 @@ import glob import os import os.path +import sys import time @@ -72,6 +73,8 @@ def build_simfunc(): # Build simfunc # buildstring='mpif90 -o my_simtask.x my_simtask.f90' # On cray need to use ftn buildstring = "mpicc -o my_simtask.x ../unit_tests/simdir/my_simtask.c" + if sys.platform == "darwin": + buildstring = "mpicc -cc=clang -o my_simtask.x ../unit_tests/simdir/my_simtask.c" # subprocess.run(buildstring.split(),check=True) #Python3.5+ subprocess.check_call(buildstring.split()) diff --git a/libensemble/tests/run_tests.py b/libensemble/tests/run_tests.py index 1168261fdb..72ef5633ee 100755 --- a/libensemble/tests/run_tests.py +++ b/libensemble/tests/run_tests.py @@ -264,7 +264,10 @@ def build_forces(root_dir): """Build forces.x using mpicc.""" cprint("Building forces.x before running regression tests...", style="yellow", newline=True) forces_app_dir = Path(root_dir) / "libensemble/tests/scaling_tests/forces/forces_app" - subprocess.run(["mpicc", "-O3", "-o", "forces.x", "forces.c", "-lm"], cwd=forces_app_dir, check=True) + build_cmd = ["mpicc", "-O3", "-o", "forces.x", "forces.c", "-lm"] + if platform.system() == "Darwin": + build_cmd = ["mpicc", "-cc=clang", "-O3", "-o", "forces.x", "forces.c", "-lm"] + subprocess.run(build_cmd, cwd=forces_app_dir, check=True) destination_dir = Path(root_dir) / "libensemble/tests/forces_app" os.makedirs(destination_dir, exist_ok=True) shutil.copy(forces_app_dir / "forces.x", destination_dir) diff --git a/libensemble/tests/standalone_tests/kill_test/build.sh b/libensemble/tests/standalone_tests/kill_test/build.sh index 8e47571e8c..0094221036 100755 --- a/libensemble/tests/standalone_tests/kill_test/build.sh +++ b/libensemble/tests/standalone_tests/kill_test/build.sh @@ -1,2 +1,7 @@ -mpicc -g -o burn_time.x burn_time.c -mpicc -g -o sleep_and_print.x sleep_and_print.c +if [[ "$OSTYPE" == "darwin"* ]]; then + mpicc -cc=clang -g -o burn_time.x burn_time.c + mpicc -cc=clang -g -o sleep_and_print.x sleep_and_print.c +else + mpicc -g -o burn_time.x burn_time.c + mpicc -g -o sleep_and_print.x sleep_and_print.c +fi diff --git a/libensemble/tests/unit_tests/test_executor.py b/libensemble/tests/unit_tests/test_executor.py index 95e60b2868..bf90464e84 100644 --- a/libensemble/tests/unit_tests/test_executor.py +++ b/libensemble/tests/unit_tests/test_executor.py @@ -84,6 +84,10 @@ def build_simfuncs(): app_name = ".".join([sim.split(".")[0], "x"]) if not os.path.isfile(app_name): buildstring = "mpicc -o " + os.path.join("simdir", app_name) + " " + os.path.join("simdir", sim) + if sys.platform == "darwin": + buildstring = ( + "mpicc -cc=clang -o " + os.path.join("simdir", app_name) + " " + os.path.join("simdir", sim) + ) subprocess.check_call(buildstring.split()) diff --git a/libensemble/tests/unit_tests/test_executor_gpus.py b/libensemble/tests/unit_tests/test_executor_gpus.py index 239344ecd2..8a1e02700e 100644 --- a/libensemble/tests/unit_tests/test_executor_gpus.py +++ b/libensemble/tests/unit_tests/test_executor_gpus.py @@ -48,6 +48,10 @@ def build_simfuncs(): app_name = ".".join([sim.split(".")[0], "x"]) if not os.path.isfile(app_name): buildstring = "mpicc -o " + os.path.join("simdir", app_name) + " " + os.path.join("simdir", sim) + if sys.platform == "darwin": + buildstring = ( + "mpicc -cc=clang -o " + os.path.join("simdir", app_name) + " " + os.path.join("simdir", sim) + ) subprocess.check_call(buildstring.split()) From b2356148e918c535c3c95b46cff2eed39ae91116 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 27 Mar 2026 10:23:50 -0500 Subject: [PATCH 30/32] index-list shouldn't consider worker0 - revert unit test changes --- .pre-commit-config.yaml | 2 +- libensemble/resources/worker_resources.py | 2 +- .../tests/unit_tests/test_resources.py | 24 +++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f8b6d9cea..87d87ef593 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,4 +37,4 @@ repos: rev: v1.19.1 hooks: - id: mypy - exclude: ^libensemble/utils/(launcher|loc_stack|runners|pydantic|output_directory)\.py$|^libensemble/tests/regression_tests/support\.py$|^libensemble/tests/functionality_tests/ + exclude: ^libensemble/utils/(launcher|loc_stack|runners|pydantic|output_directory)\.py$|^libensemble/tests/regression_tests/support\.py$|^libensemble/tests/functionality_tests/|^libensemble/tests/unit_tests/ diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index df89625ad6..4faa674e93 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -106,7 +106,7 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in """Map WorkerID to index into a nodelist""" index = 0 index_list: list[int | None] = [] - for i in range(0, num_workers): + for i in range(1, num_workers + 1): if i in zero_resource_list: index_list.append(None) else: diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index 15db28ea07..b87583f377 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -694,23 +694,23 @@ def test_map_workerid_to_index(): zero_resource_list = [] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(0, num_workers): - index = index_list[workerID] - assert index == workerID, "index incorrect. Received: " + str(index) - - zero_resource_list = [0] - index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(1, num_workers): - index = index_list[workerID] + for workerID in range(1, num_workers + 1): + index = index_list[workerID - 1] assert index == workerID - 1, "index incorrect. Received: " + str(index) - zero_resource_list = [0, 1] + zero_resource_list = [1] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(2, num_workers): - index = index_list[workerID] + for workerID in range(2, num_workers + 1): + index = index_list[workerID - 1] assert index == workerID - 2, "index incorrect. Received: " + str(index) - zero_resource_list = [0, 2] + zero_resource_list = [1, 2] + index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) + for workerID in range(3, num_workers + 1): + index = index_list[workerID - 1] + assert index == workerID - 3, "index incorrect. Received: " + str(index) + + zero_resource_list = [1, 3] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) workerID = 2 From 49f4ce32e581ebba3d20c77cd7a3d1e892b925e9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 27 Mar 2026 16:47:12 -0500 Subject: [PATCH 31/32] "use_gpus" is never set, so remove. So since the sims explicitly want procs matched to GPUs we really ought to assign sims to those resource sets first. Only if no GPU rsets exist do we fallback to non-gpu sets, producing a correct wait upon allocation. Thanks Claude. --- .../test_mpi_gpu_settings.py | 4 ++-- libensemble/tools/alloc_support.py | 17 ++++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py index dd4919d8aa..7985ff0fff 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py @@ -39,7 +39,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 6 +# TESTSUITE_NPROCS: 4 7 import os import sys @@ -88,7 +88,7 @@ "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "give_all_with_same_priority": False, "async_return": False, - "initial_batch_size": nworkers - 1, + "initial_batch_size": nworkers, "user": { "max_resource_sets": nworkers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index 1a47fe5608..ef3bee4965 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -4,7 +4,7 @@ from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.resources.resources import Resources -from libensemble.resources.scheduler import InsufficientFreeResources, InsufficientResourcesError, ResourceScheduler +from libensemble.resources.scheduler import InsufficientResourcesError, ResourceScheduler from libensemble.utils.misc import extract_H_ranges logger = logging.getLogger(__name__) @@ -76,13 +76,13 @@ def assign_resources(self, rsets_req, use_gpus=None, user_params=[]): """ rset_team = None if self.resources is not None: - # Try schedule to non-gpu rsets first - if use_gpus is None: + # When GPUs exist and use_gpus not explicitly set, try GPU rsets first + if use_gpus is None and self.sched.resources.total_num_gpu_rsets > 0: try: - rset_team = self.sched.assign_resources(rsets_req, use_gpus=False, user_params=user_params) + rset_team = self.sched.assign_resources(rsets_req, use_gpus=True, user_params=user_params) return rset_team - except (InsufficientFreeResources, InsufficientResourcesError): - pass + except InsufficientResourcesError: + pass # More rsets requested than GPU rsets exist - fall back to any rset_team = self.sched.assign_resources(rsets_req, use_gpus, user_params) return rset_team @@ -160,11 +160,6 @@ def _req_resources_sim(self, libE_info, user_params, H, H_rows): ) else: num_rsets_req = 1 - if "use_gpus" in H.dtype.names: - if np.any(H[H_rows]["use_gpus"]): - use_gpus = True - else: - use_gpus = False if "num_gpus" in H.dtype.names: gpus_per_rset = self.resources.resource_manager.gpus_per_rset num_rsets_req_for_gpus = AllocSupport._convert_rows_to_rsets( From 01f57a6e5d6d5f1f7ee0a0a35929bb3063a9083d Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 27 Mar 2026 16:56:33 -0500 Subject: [PATCH 32/32] still need to import this error for the tests --- libensemble/tools/alloc_support.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libensemble/tools/alloc_support.py b/libensemble/tools/alloc_support.py index ef3bee4965..96d622977b 100644 --- a/libensemble/tools/alloc_support.py +++ b/libensemble/tools/alloc_support.py @@ -4,7 +4,11 @@ from libensemble.message_numbers import EVAL_GEN_TAG, EVAL_SIM_TAG from libensemble.resources.resources import Resources -from libensemble.resources.scheduler import InsufficientResourcesError, ResourceScheduler +from libensemble.resources.scheduler import ( # noqa + InsufficientFreeResources, + InsufficientResourcesError, + ResourceScheduler, +) from libensemble.utils.misc import extract_H_ranges logger = logging.getLogger(__name__)