From 14d1cffbd36c54c18685bfed278dfb84dd2e266c Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:22:37 -0500 Subject: [PATCH 01/23] initial commit for gen_on_manager -> gen_on_worker --- docs/FAQ.rst | 5 --- docs/data_structures/libE_specs.rst | 12 ++---- docs/platforms/aurora.rst | 22 +---------- docs/platforms/perlmutter.rst | 20 ---------- docs/platforms/platforms_index.rst | 22 ++--------- docs/running_libE.rst | 30 -------------- docs/tutorials/executor_forces_tutorial.rst | 38 ------------------ docs/tutorials/gpcam_tutorial.rst | 39 ++++++++++++------- libensemble/manager.py | 16 ++++---- libensemble/specs.py | 6 +-- .../test_GPU_gen_resources.py | 8 ++-- .../test_evaluate_existing_plus_gen.py | 3 +- .../test_persistent_uniform_sampling.py | 4 +- .../test_evaluate_mixed_sample.py | 1 - .../test_persistent_gp_multitask_ax.py | 5 +-- 15 files changed, 50 insertions(+), 181 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index b8a3ea2ce5..0339dbb681 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -13,11 +13,6 @@ We recommend using the following options to help debug workflows:: logger.set_level("DEBUG") libE_specs["safe_mode"] = True -To make it easier to debug a generator try setting the **libE_specs** option ``gen_on_manager``. -To do so, add the following to your calling script:: - - libE_specs["gen_on_manager"] = True - With this, ``pdb`` breakpoints can be set as usual in the generator. For more debugging options see "How can I debug specific libEnsemble processes?" below. diff --git a/docs/data_structures/libE_specs.rst b/docs/data_structures/libE_specs.rst index c0ca141403..6ecac6e582 100644 --- a/docs/data_structures/libE_specs.rst +++ b/docs/data_structures/libE_specs.rst @@ -9,12 +9,7 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl from libensemble.specs import LibeSpecs - specs = LibeSpecs( - gen_on_manager=True, - save_every_k_gens=100, - sim_dirs_make=True, - nworkers=4 - ) + specs = LibeSpecs(save_every_k_gens=100, sim_dirs_make=True, nworkers=4) .. dropdown:: Settings by Category :open: @@ -31,9 +26,8 @@ libEnsemble is primarily customized by setting options within a ``LibeSpecs`` cl **nworkers** [int]: Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``. - **gen_on_manager** [bool] = False - Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. + **gen_on_worker** [bool] = False + Instructs Worker process to run generator instead of Manager. **mpi_comm** [MPI communicator] = ``MPI.COMM_WORLD``: libEnsemble MPI communicator. diff --git a/docs/platforms/aurora.rst b/docs/platforms/aurora.rst index af2e7cc160..4865ba0c18 100644 --- a/docs/platforms/aurora.rst +++ b/docs/platforms/aurora.rst @@ -57,7 +57,7 @@ simulations for each worker: .. code-block:: python # Instruct libEnsemble to exit after this many simulations - ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers*2) + ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers * 2) Now grab an interactive session on two nodes (or use the batch script at ``../submission_scripts/submit_pbs_aurora.sh``):: @@ -115,26 +115,6 @@ will use one GPU tile):: python run_libe_forces.py -n 25 -Running generator on the manager --------------------------------- - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows: - -.. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -then we can run with 12 (instead of 13) workers:: - - python run_libe_forces.py -n 12 - Dynamic resource assignment --------------------------- diff --git a/docs/platforms/perlmutter.rst b/docs/platforms/perlmutter.rst index 88d3f808b2..a2768e1d26 100644 --- a/docs/platforms/perlmutter.rst +++ b/docs/platforms/perlmutter.rst @@ -105,26 +105,6 @@ To see GPU usage, ssh into the node you are on in another window and run:: watch -n 0.1 nvidia-smi -Running generator on the manager -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An alternative is to run the generator on a thread on the manager. The -number of workers can then be set to the number of simulation workers. - -Change the ``libE_specs`` in **run_libe_forces.py** as follows. - - .. code-block:: python - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - -and run with:: - - python run_libe_forces.py -n 4 - To watch video ^^^^^^^^^^^^^^ diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index c06cdbe6fd..eca8ab9d58 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,10 +24,6 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -The generator will run on a worker by default, but if running a single generator, -the :ref:`libE_specs` option **gen_on_manager** is recommended, -which runs the generator on the manager (using a thread) as below. - .. list-table:: :widths: 60 40 @@ -35,15 +31,6 @@ which runs the generator on the manager (using a thread) as below. :alt: centralized :scale: 55 - - In calling script: - - .. code-block:: python - :linenos: - - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - ) - A SLURM batch script may include: .. code-block:: bash @@ -52,7 +39,9 @@ which runs the generator on the manager (using a thread) as below. python run_libe_forces.py --nworkers 3 -When using **gen_on_manager**, set ``nworkers`` to the number of workers desired for running simulations. +If running multiple generator processes instead, then set the +:ref:`libE_specs` option **gen_on_worker** so that multiple +worker processes can run multiple generator instances. Dedicated Mode ^^^^^^^^^^^^^^ @@ -87,8 +76,6 @@ remaining nodes in the allocation. python run_libe_forces.py --nworkers 3 -Note that **gen_on_manager** is not set in the above example. - Distributed Running ------------------- @@ -137,8 +124,7 @@ Zero-resource workers --------------------- Users with persistent ``gen_f`` functions may notice that the persistent workers -are still automatically assigned system resources. This can be resolved by using -the ``gen_on_manager`` option or by +are still automatically assigned system resources. This can be resolved by :ref:`fixing the number of resource sets`. Assigning GPUs diff --git a/docs/running_libE.rst b/docs/running_libE.rst index ae658e31c6..80af301f3f 100644 --- a/docs/running_libE.rst +++ b/docs/running_libE.rst @@ -12,13 +12,6 @@ determine the parameters/inputs for simulations. Simulator functions run and manage simulations, which often involve running a user application (see :doc:`Executor`). -.. note:: - As of version 1.3.0, the generator can be run as a thread on the manager, - using the :ref:`libE_specs` option **gen_on_manager**. - When using this option, set the number of workers desired for running - simulations. See :ref:`Running generator on the manager` - for more details. - To use libEnsemble, you will need a calling script, which in turn will specify generator and simulator functions. Many :doc:`examples` are available. @@ -162,29 +155,6 @@ If this example was run as:: No simulations will be able to run. -.. _gen-on-manager: - -Running generator on the manager --------------------------------- - -The majority of libEnsemble use cases run a single generator. The -:ref:`libE_specs` option **gen_on_manager** will cause -the generator function to run on a thread on the manager. This can run -persistent user functions, sharing data structures with the manager, and avoids -additional communication to a generator running on a worker. When using this -option, the number of workers specified should be the (maximum) number of -concurrent simulations. - -If modifying a workflow to use ``gen_on_manager`` consider the following. - -* Set ``nworkers`` to the number of workers desired for running simulations. -* If using :meth:`add_unique_random_streams()` - to seed random streams, the default generator seed will be zero. -* If you have a line like ``libE_specs["nresource_sets"] = nworkers -1``, this - line should be removed. -* If the generator does use resources, ``nresource_sets`` can be increased as needed - so that the generator and all simulations are resourced. - Environment Variables --------------------- diff --git a/docs/tutorials/executor_forces_tutorial.rst b/docs/tutorials/executor_forces_tutorial.rst index e01496734b..a083aa2a82 100644 --- a/docs/tutorials/executor_forces_tutorial.rst +++ b/docs/tutorials/executor_forces_tutorial.rst @@ -336,44 +336,6 @@ These may require additional browsing of the documentation to complete. ... -Running the generator on the manager ------------------------------------- - -As of version 1.3.0, the generator can be run on a thread on the manager, -using the :ref:`libE_specs` option **gen_on_manager**. - -Change the libE_specs as follows. - - .. code-block:: python - :linenos: - :lineno-start: 28 - - nsim_workers = ensemble.nworkers - - # Persistent gen does not need resources - ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, - sim_dirs_make=True, - ensemble_dir_path="./test_executor_forces_tutorial", - ) - -When running set ``nworkers`` to the number of workers desired for running simulations. -E.g., Instead of: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 5 - -use: - -.. code-block:: bash - - python run_libe_forces.py --nworkers 4 - -Note that as the generator random number seed will be zero instead of one, the checksum will change. - -For more information see :ref:`Running generator on the manager`. - Running forces application with input file ------------------------------------------ diff --git a/docs/tutorials/gpcam_tutorial.rst b/docs/tutorials/gpcam_tutorial.rst index a013c1b67e..096d5584ca 100644 --- a/docs/tutorials/gpcam_tutorial.rst +++ b/docs/tutorials/gpcam_tutorial.rst @@ -30,6 +30,7 @@ This version (and others) of the gpCAM generator can be found at `libensemble/ge from libensemble.message_numbers import EVAL_GEN_TAG, FINISHED_PERSISTENT_GEN_TAG, PERSIS_STOP, STOP_TAG from libensemble.tools.persistent_support import PersistentSupport + def persistent_gpCAM(H_in, persis_info, gen_specs, libE_info): """Run a batched gpCAM model to create a surrogate""" @@ -156,6 +157,7 @@ For running applications using parallel resources in the simulator see the `forc # Define our simulation function import numpy as np + def six_hump_camel(H, persis_info, sim_specs, _): """Six-Hump Camel sim_f.""" @@ -189,6 +191,8 @@ First we will create a cleanup script so we can easily re-run. # To rerun this notebook, we need to delete the ensemble directory. import shutil + + def cleanup(): try: shutil.rmtree("ensemble") @@ -218,31 +222,30 @@ If you wish to make your own functions based on the above, those can be imported nworkers = 4 - # When using gen_on_manager, nworkers is number of concurrent sims. # final_gen_send means the last evaluated points are returned to the generator to update the model. - libE_specs = LibeSpecs(nworkers=nworkers, gen_on_manager=True, final_gen_send=True) + libE_specs = LibeSpecs(nworkers=nworkers, final_gen_send=True) n = 2 # Input dimensions batch_size = 4 num_batches = 6 gen_specs = GenSpecs( - gen_f=persistent_gpCAM, # Generator function - persis_in=["f"], # Objective, defined in sim, is returned to gen + gen_f=persistent_gpCAM, # Generator function + persis_in=["f"], # Objective, defined in sim, is returned to gen outputs=[("x", float, (n,))], # Parameters (name, type, size) user={ "batch_size": batch_size, "lb": np.array([-2, -1]), # lower boundaries for n dimensions - "ub": np.array([2, 1]), # upper boundaries for n dimensions - "ask_max_iter": 5, # Number of iterations for ask (default 20) + "ub": np.array([2, 1]), # upper boundaries for n dimensions + "ask_max_iter": 5, # Number of iterations for ask (default 20) "rng_seed": 0, }, ) sim_specs = SimSpecs( - sim_f=six_hump_camel, # Simulator function - inputs=["x"], # Input field names. "x" defined in gen - outputs=[("f", float)], # Objective + sim_f=six_hump_camel, # Simulator function + inputs=["x"], # Input field names. "x" defined in gen + outputs=[("f", float)], # Objective ) # Starts one persistent generator. Simulated values are returned in batch. @@ -251,7 +254,7 @@ If you wish to make your own functions based on the above, those can be imported user={"async_return": False}, # False = batch returns ) - exit_criteria = ExitCriteria(sim_max=num_batches*batch_size) + exit_criteria = ExitCriteria(sim_max=num_batches * batch_size) # Initialize and run the ensemble. ensemble = Ensemble( @@ -272,7 +275,7 @@ At the end of our calling script we run the ensemble. H, persis_info, flag = ensemble.run() # Start the ensemble. Blocks until completion. ensemble.save_output("H_array", append_attrs=False) # Save H (history of all evaluated points) to file - pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results + pprint(H[["sim_id", "x", "f"]][:16]) # See first 16 results Rerun and test model at known points ------------------------------------ @@ -312,15 +315,21 @@ values at the test points. markersize = 10 plt.figure(figsize=(10, 5)) plt.plot( - num_sims, mse, marker="^", markeredgecolor="black", markeredgewidth=2, - markersize=markersize, linewidth=2, label="Mean squared error" + num_sims, + mse, + marker="^", + markeredgecolor="black", + markeredgewidth=2, + markersize=markersize, + linewidth=2, + label="Mean squared error", ) plt.xticks(num_sims) # Labeling the axes and the legend - plt.title('Mean Squared Error at test points') + plt.title("Mean Squared Error at test points") plt.xlabel("Number of simulations") - plt.ylabel('Mean squared error (rad$^2$)') + plt.ylabel("Mean squared error (rad$^2$)") legend = plt.legend(framealpha=1, edgecolor="black") # Increase edge width here plt.grid(True) plt.show() diff --git a/libensemble/manager.py b/libensemble/manager.py index c0cf02500e..149ab819b6 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -231,19 +231,19 @@ def __init__( (1, "stop_val", self.term_test_stop_val), ] - gen_on_manager = self.libE_specs.get("gen_on_manager", False) + gen_on_worker = self.libE_specs.get("gen_on_worker", False) - self.W = np.zeros(len(self.wcomms) + gen_on_manager, dtype=Manager.worker_dtype) - if gen_on_manager: + self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + if gen_on_worker: + self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] + else: self.W["worker_id"] = np.arange(len(self.wcomms) + 1) # [0, 1, 2, ...] self.W[0]["gen_worker"] = True local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - else: - self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] - self.W = _WorkerIndexer(self.W, gen_on_manager) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_manager) + self.W = _WorkerIndexer(self.W, gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources @@ -639,7 +639,7 @@ def _get_alloc_libE_info(self) -> dict: "use_resource_sets": self.use_resource_sets, "gen_num_procs": self.gen_num_procs, "gen_num_gpus": self.gen_num_gpus, - "gen_on_manager": self.libE_specs.get("gen_on_manager", False), + "gen_on_worker": self.libE_specs.get("gen_on_worker", False), } def _alloc_work(self, H: npt.NDArray, persis_info: dict) -> dict: diff --git a/libensemble/specs.py b/libensemble/specs.py index aa70018362..e2cc525636 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -182,10 +182,8 @@ class LibeSpecs(BaseModel): nworkers: Optional[int] = 0 """ Number of worker processes in ``"local"``, ``"threads"``, or ``"tcp"``.""" - gen_on_manager: Optional[bool] = False - """ Instructs Manager process to run generator functions. - This generator function can access/modify user objects by reference. - """ + gen_on_worker: Optional[bool] = False + """ Instructs Worker process to run generator instead of Manager.""" mpi_comm: Optional[Any] = None """ libEnsemble MPI communicator. Default: ``MPI.COMM_WORLD``""" diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index d77088d7e4..48b7fee0d2 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -100,18 +100,18 @@ libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() - for gen_on_manager in [False, True]: + for gen_on_worker in [False, True]: for run in range(5): # reset libE_specs = base_libE_specs.copy() - libE_specs["gen_on_manager"] = gen_on_manager + libE_specs["gen_on_worker"] = gen_on_worker persis_info = add_unique_random_streams({}, nworkers + 1) if run == 0: libE_specs["gen_num_procs"] = 2 elif run == 1: - if gen_on_manager: - print("SECOND LIBE CALL WITH GEN ON MANAGER") + if gen_on_worker: + print("SECOND LIBE CALL WITH GEN ON WORKER INSTEAD OF MANAGER") libE_specs["gen_num_gpus"] = 1 elif run == 2: persis_info["gen_num_gpus"] = 1 diff --git a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py index fe3d8dad8e..7a16d70736 100644 --- a/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py +++ b/libensemble/tests/functionality_tests/test_evaluate_existing_plus_gen.py @@ -1,6 +1,6 @@ """ Test libEnsemble's capability to evaluate existing points and then generate -new samples via gen_on_manager. +new samples. Execute via one of the following commands (e.g. 3 workers): mpiexec -np 4 python test_evaluate_existing_sample.py @@ -43,7 +43,6 @@ def create_H0(persis_info, gen_specs, H0_size): if __name__ == "__main__": sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) gen_specs = { diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py index 81a18a5285..643b4723d7 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling.py @@ -87,9 +87,9 @@ sim_specs["in"] = ["x", "obj_component"] # sim_specs["out"] = [("f", float), ("grad", float, n)] elif run == 3: - libE_specs["gen_on_manager"] = True + libE_specs["gen_on_worker"] = True elif run == 4: - libE_specs["gen_on_manager"] = False + libE_specs["gen_on_worker"] = False libE_specs["gen_workers"] = [2] # Perform the run diff --git a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py index 481db84191..60e43fa57e 100644 --- a/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py +++ b/libensemble/tests/regression_tests/test_evaluate_mixed_sample.py @@ -44,7 +44,6 @@ H0["sim_ended"][:500] = True sampling = Ensemble(parse_args=True) - sampling.libE_specs.gen_on_manager = True sampling.H0 = H0 sampling.sim_specs = SimSpecs(sim_f=sim_f, inputs=["x"], out=[("f", float)]) sampling.alloc_specs = AllocSpecs(alloc_f=alloc_f) diff --git a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py index 8c589161ad..f88db4fe05 100644 --- a/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py +++ b/libensemble/tests/regression_tests/test_persistent_gp_multitask_ax.py @@ -2,8 +2,6 @@ Example of multi-fidelity optimization using a persistent GP gen_func (calling Ax). -This test uses the gen_on_manager option (persistent generator runs on -a thread). Therefore nworkers is the number of simulation workers. Execute via one of the following commands: mpiexec -np 4 python test_persistent_gp_multitask_ax.py @@ -50,7 +48,7 @@ def run_simulation(H, persis_info, sim_specs, libE_info): z = 8 elif task == "cheap_model": z = 1 - print('in sim', task) + print("in sim", task) libE_output = np.zeros(1, dtype=sim_specs["out"]) calc_status = WORKER_DONE @@ -64,7 +62,6 @@ def run_simulation(H, persis_info, sim_specs, libE_info): # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_manager"] = True mt_params = { "name_hifi": "expensive_model", From ae40691b0bdc89b0a39747837917a062cf20d1f9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 09:24:35 -0500 Subject: [PATCH 02/23] fix --- docs/platforms/platforms_index.rst | 47 +++++++++++++----------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index eca8ab9d58..843e65e5e6 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -24,20 +24,17 @@ simulation worker, and libEnsemble will distribute user applications across the node allocation. This is the **most common approach** where each simulation runs an MPI application. -.. list-table:: - :widths: 60 40 +.. image:: ../images/centralized_gen_on_manager.png + :alt: centralized + :scale: 55 - * - .. image:: ../images/centralized_gen_on_manager.png - :alt: centralized - :scale: 55 +A SLURM batch script may include: - A SLURM batch script may include: +.. code-block:: bash - .. code-block:: bash + #SBATCH --nodes 3 - #SBATCH --nodes 3 - - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 If running multiple generator processes instead, then set the :ref:`libE_specs` option **gen_on_worker** so that multiple @@ -51,30 +48,28 @@ True, the MPI executor will not launch applications on nodes where libEnsemble P processes (manager and workers) are running. Workers launch applications onto the remaining nodes in the allocation. -.. list-table:: - :widths: 60 40 - * - .. image:: ../images/centralized_dedicated.png - :alt: centralized dedicated mode - :scale: 30 +.. image:: ../images/centralized_dedicated.png + :alt: centralized dedicated mode + :scale: 30 - - In calling script: +In calling script: - .. code-block:: python - :linenos: +.. code-block:: python + :linenos: - ensemble.libE_specs = LibeSpecs( - num_resource_sets=2, - dedicated_mode=True, - ) + ensemble.libE_specs = LibeSpecs( + num_resource_sets=2, + dedicated_mode=True, + ) - A SLURM batch script may include: +A SLURM batch script may include: - .. code-block:: bash +.. code-block:: bash - #SBATCH --nodes 3 + #SBATCH --nodes 3 - python run_libe_forces.py --nworkers 3 + python run_libe_forces.py --nworkers 3 Distributed Running ------------------- From d6cf1d3052408edaa0d7e6ae236566d88fc6f194 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:04:31 -0500 Subject: [PATCH 03/23] additional note --- docs/overview_usecases.rst | 3 +++ docs/platforms/platforms_index.rst | 1 + 2 files changed, 4 insertions(+) diff --git a/docs/overview_usecases.rst b/docs/overview_usecases.rst index 56ad05b6c9..6d63ce8ff8 100644 --- a/docs/overview_usecases.rst +++ b/docs/overview_usecases.rst @@ -20,6 +20,9 @@ which perform computations via **user functions**: | +As of **v2.0** the **Manager** by default runs **a single generator**. This +is configurable. + The default allocator (``alloc_f``) instructs workers to run the simulator on the highest priority work from the generator. If a worker is idle and there is no work, that worker is instructed to call the generator. diff --git a/docs/platforms/platforms_index.rst b/docs/platforms/platforms_index.rst index 843e65e5e6..6daa319b94 100644 --- a/docs/platforms/platforms_index.rst +++ b/docs/platforms/platforms_index.rst @@ -59,6 +59,7 @@ In calling script: :linenos: ensemble.libE_specs = LibeSpecs( + gen_on_worker=True, num_resource_sets=2, dedicated_mode=True, ) From c54adb3d5063a6913fd9d85d3f5fe3e456f6ea19 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 2 May 2025 10:29:51 -0500 Subject: [PATCH 04/23] fixes, plus for test_manager_main we don't want to run the default worker0 during unit tests --- libensemble/manager.py | 7 ++++--- libensemble/tests/unit_tests/test_manager_main.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index 149ab819b6..49ddebc8b3 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,8 +232,9 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) + len_W = len(self.wcomms) + 1 - gen_on_worker - self.W = np.zeros(len(self.wcomms) + gen_on_worker, dtype=Manager.worker_dtype) + self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: self.W["worker_id"] = np.arange(len(self.wcomms)) + 1 # [1, 2, 3, ...] else: @@ -242,8 +243,8 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, gen_on_worker) - self.wcomms = _WorkerIndexer(self.wcomms, gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) self.resources = Resources.resources diff --git a/libensemble/tests/unit_tests/test_manager_main.py b/libensemble/tests/unit_tests/test_manager_main.py index 4e246eb570..e34bc76301 100644 --- a/libensemble/tests/unit_tests/test_manager_main.py +++ b/libensemble/tests/unit_tests/test_manager_main.py @@ -6,7 +6,7 @@ import libensemble.manager as man import libensemble.tests.unit_tests.setup as setup -libE_specs = {"comms": "local"} +libE_specs = {"comms": "local", "gen_on_worker": True} def test_term_test_1(): From dc2570d81c63a7d319f05e0ccd02ca09dfb9821d Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:11:59 -0500 Subject: [PATCH 05/23] small fixes, comments, additional unit test for affirming alloc behavior with additional worker zero --- libensemble/manager.py | 4 +- .../test_allocation_funcs_and_support.py | 55 +++++++++++++++++++ libensemble/worker.py | 2 +- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/libensemble/manager.py b/libensemble/manager.py index de36dc1319..fe6a4bd710 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -232,7 +232,7 @@ def __init__( ] gen_on_worker = self.libE_specs.get("gen_on_worker", False) - len_W = len(self.wcomms) + 1 - gen_on_worker + len_W = len(self.wcomms) + 1 - gen_on_worker # if gen_on_worker, len_W = len(self.wcomms) self.W = np.zeros(len_W, dtype=Manager.worker_dtype) if gen_on_worker: @@ -243,7 +243,7 @@ def __init__( local_worker_comm = self._run_additional_worker(hist, sim_specs, gen_specs, libE_specs) self.wcomms = [local_worker_comm] + self.wcomms - self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) + self.W = _WorkerIndexer(self.W, 1 - gen_on_worker) # if gen on worker, then no additional worker self.wcomms = _WorkerIndexer(self.wcomms, 1 - gen_on_worker) temp_EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) diff --git a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py index 6d056b1e01..9d1d45fece 100644 --- a/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py +++ b/libensemble/tests/unit_tests/test_allocation_funcs_and_support.py @@ -34,6 +34,25 @@ ], ) +W_gen_mgr = np.array( + [ + (0, True, 0, 0, False, False), + (1, False, 0, 0, False, False), + (2, False, 0, 0, False, False), + (3, False, 0, 0, False, False), + (4, False, 0, 0, False, False), + ], + dtype=[ + ("worker_id", " Date: Wed, 13 Aug 2025 15:27:02 -0500 Subject: [PATCH 06/23] check worker zero for run_order for aposmm-ibcdfo-pounders --- .../regression_tests/test_persistent_aposmm_ibcdfo_pounders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py index 7523704a0b..356b2017a9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_ibcdfo_pounders.py @@ -135,7 +135,7 @@ def synthetic_beamline_mapping(H, _, sim_specs): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 save_libE_output(H, persis_info, __file__, nworkers) From a6fd67ce71a10ebcf6fd31cab65c12339652c368 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 13 Aug 2025 15:43:01 -0500 Subject: [PATCH 07/23] test_zero_resource_workers probably needs adjustment? --- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index c8f0786d06..4739c5bfb6 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -36,7 +36,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True From 3ce9ed4aeb40b7ef5f389b2776ec4720637119cd Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:20:15 -0500 Subject: [PATCH 08/23] fixes to accomodate the zeroth worker being a zero-resource worker --- libensemble/sim_funcs/run_line_check.py | 2 +- .../tests/functionality_tests/test_zero_resource_workers.py | 2 +- libensemble/tools/parse_args.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 530b1e8d9b..96ef13a5da 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (1 + persis_gens) + offset = workerID - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index 4739c5bfb6..d286aa7128 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -101,7 +101,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index 5e302c0ce0..c52b6f7c8f 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + nworkers = args.nworkers + 1 # for manager if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 9c38da2e9ae9f3308456a23faef75a3d32068da9 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 11:21:52 -0500 Subject: [PATCH 09/23] fix the specified zero-resource worker --- .../functionality_tests/test_zero_resource_workers_subnode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 69ea2b559c..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -39,7 +39,7 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True @@ -100,7 +100,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options From dc477724c8670af1b9491a1d2d23167584b5b0ce Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 12:01:38 -0500 Subject: [PATCH 10/23] tiny fixes --- ...daptive_workers_persistent_oversubscribe_rsets.py | 8 ++++---- .../functionality_tests/test_sim_dirs_per_worker.py | 12 ++++++------ .../test_zero_resource_workers_subnode.py | 2 +- libensemble/tools/parse_args.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py index fb730b966b..94bf1a387f 100644 --- a/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py +++ b/libensemble/tests/functionality_tests/test_runlines_adaptive_workers_persistent_oversubscribe_rsets.py @@ -31,9 +31,9 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - nsim_workers = nworkers - 1 + nsim_workers = nworkers - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0] rsets = nsim_workers * 2 libE_specs["num_resource_sets"] = rsets @@ -64,7 +64,7 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n), ("x_on_cube", float, n)], "user": { - "initial_batch_size": nworkers - 1, + "initial_batch_size": nworkers, "max_resource_sets": max_rsets, "lb": np.array([-3, -2]), "ub": np.array([3, 2]), @@ -91,7 +91,7 @@ "node_file": node_file, } # Name of file containing a node-list - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, nworkers) exit_criteria = {"sim_max": 40, "wallclock_max": 300} # Perform the run diff --git a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py index 69bb34ab84..a15ede9e92 100644 --- a/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py +++ b/libensemble/tests/functionality_tests/test_sim_dirs_per_worker.py @@ -23,14 +23,14 @@ from libensemble.tests.regression_tests.support import write_sim_func as sim_f from libensemble.tools import add_unique_random_streams, parse_args -nworkers, is_manager, libE_specs, _ = parse_args() +n_simworkers, is_manager, libE_specs, _ = parse_args() # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": sim_input_dir = "./sim_input_dir" dir_to_copy = sim_input_dir + "/copy_this" dir_to_symlink = sim_input_dir + "/symlink_this" - w_ensemble = "./ensemble_workdirs_w" + str(nworkers) + "_" + libE_specs.get("comms") + w_ensemble = "./ensemble_workdirs_w" + str(n_simworkers) + "_" + libE_specs.get("comms") print("creating ensemble dir: ", w_ensemble, flush=True) for dir in [sim_input_dir, dir_to_copy, dir_to_symlink]: @@ -60,7 +60,7 @@ }, } - persis_info = add_unique_random_streams({}, nworkers + 1) + persis_info = add_unique_random_streams({}, n_simworkers) exit_criteria = {"sim_max": 21} @@ -69,9 +69,9 @@ if is_manager: assert os.path.isdir(w_ensemble), f"Ensemble directory {w_ensemble} not created." worker_dir_sum = sum(["worker" in i for i in os.listdir(w_ensemble)]) - assert worker_dir_sum == nworkers, "Number of worker dirs ({}) does not match nworkers ({}).".format( - worker_dir_sum, nworkers - ) + assert ( + worker_dir_sum == n_simworkers + 1 + ), "Number of worker dirs ({}) does not match n_simworkers ({}).".format(worker_dir_sum, n_simworkers) input_copied = [] sim_dir_sum = 0 diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index a7a57b584a..446142f5db 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 +# TESTSUITE_NPROCS: 3 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": diff --git a/libensemble/tools/parse_args.py b/libensemble/tools/parse_args.py index c52b6f7c8f..5e302c0ce0 100644 --- a/libensemble/tools/parse_args.py +++ b/libensemble/tools/parse_args.py @@ -73,7 +73,7 @@ def _mpi_parse_args(args): def _local_parse_args(args): """Parses arguments for forked processes using multiprocessing.""" libE_specs = {"comms": args.comms} - nworkers = args.nworkers + 1 # for manager + nworkers = args.nworkers if args.nresource_sets is not None: libE_specs["num_resource_sets"] = args.nresource_sets From 46708391e8faac92faf1c086c1e7dbfe4b6ae8ad Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:08:11 -0500 Subject: [PATCH 11/23] fix run_order key in persis_info for handful of aposmm tests --- .../tests/regression_tests/test_persistent_aposmm_dfols.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_periodic.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_timeout.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 6e19930691..322f3d6633 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -102,7 +102,7 @@ def combine_component(x): H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert flag == 0 assert np.min(H["f"][H["sim_ended"]]) <= 3000, "Didn't find a value below 3000" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py index d99e8802a0..8cc215800d 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_periodic.py @@ -89,7 +89,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) # The minima are known on this test problem. If the above [lb, ub] domain is diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py index e61843fd71..e6014cbee3 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_timeout.py @@ -87,6 +87,6 @@ if is_manager: assert flag == 2, "Test should have timed out" - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" min_ids = np.where(H["local_min"]) save_libE_output(H, persis_info, __file__, nworkers) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index f2d2f09cc0..75b3a582d9 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -121,7 +121,7 @@ H, persis_info, flag = libE(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0=H0) if is_manager: - assert persis_info[1].get("run_order"), "Run_order should have been given back" + assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." From befcecc2e9821df661f7f08c8248543bf1cb34fc Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 14:13:25 -0500 Subject: [PATCH 12/23] ditto --- .../test_persistent_uniform_sampling_nonblocking.py | 2 +- .../tests/regression_tests/test_persistent_aposmm_with_grad.py | 2 +- .../tests/regression_tests/test_persistent_fd_param_finder.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py index 5425578849..b62cbe6b64 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_sampling_nonblocking.py @@ -68,4 +68,4 @@ assert len(np.unique(H["gen_ended_time"])) == 2 save_libE_output(H, persis_info, __file__, nworkers) - assert persis_info[1]["spin_count"] > 0, "This should have been a nonblocking receive" + assert persis_info[0]["spin_count"] > 0, "This should have been a nonblocking receive" diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py index 75b3a582d9..48b1ae2ff6 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_with_grad.py @@ -123,7 +123,7 @@ if is_manager: assert persis_info[0].get("run_order"), "Run_order should have been given back" assert ( - len(persis_info[1]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] + len(persis_info[0]["run_order"]) >= gen_specs["user"]["stop_after_k_minima"] ), "This test should have many runs started." assert len(H) < exit_criteria["sim_max"], "Test should have stopped early due to 'stop_after_k_minima'" diff --git a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py index ac01d5683b..de97470dc2 100644 --- a/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py +++ b/libensemble/tests/regression_tests/test_persistent_fd_param_finder.py @@ -70,6 +70,6 @@ if fd_test.is_manager: assert len(H) < fd_test.exit_criteria.gen_max, "Problem didn't stop early, which should have been the case." - assert np.all(persis_info[1]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." + assert np.all(persis_info[0]["Fnoise"] > 0), "gen_f didn't find noise for all F_i components." fd_test.save_output(__file__) From 30403c70cfdbe6b366900235feebd367f0f784d3 Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:36:44 -0500 Subject: [PATCH 13/23] fix workercount? --- .../functionality_tests/test_zero_resource_workers_subnode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index 446142f5db..a7a57b584a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 +# TESTSUITE_NPROCS: 4 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": From 519930d37887c6069e1acfcb898e3b4ee1cf7b0f Mon Sep 17 00:00:00 2001 From: jlnav Date: Wed, 27 Aug 2025 15:55:26 -0500 Subject: [PATCH 14/23] trying to narrow down issue with this test... --- .../test_persistent_uniform_gen_decides_stop.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index 68c8aaaa05..f6a1e5d57f 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -13,7 +13,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 5 7 +# TESTSUITE_NPROCS: 3 5 # TESTSUITE_OS_SKIP: WIN import sys @@ -82,9 +82,7 @@ assert ( sum(counts == init_batch_size) >= ngens ), "The initial batch of each gen should be common among initial_batch_size number of points" - assert ( - len(counts) > 1 - ), "All gen_ended_times are the same; they should be different for the async case" + assert len(counts) > 1, "All gen_ended_times are the same; they should be different for the async case" gen_workers = np.unique(H["gen_worker"]) print("Generators that issued points", gen_workers) From 015cc4a90446a4759829a484eb64f1c52d8b340b Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 28 Aug 2025 11:11:47 -0500 Subject: [PATCH 15/23] still narrowing down issues with the zrw tests and having a zeroth worker... --- libensemble/sim_funcs/run_line_check.py | 6 +++--- .../test_mpi_runners_zrw_subnode_uneven.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 96ef13a5da..44827d3c90 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID - (persis_gens) + offset = workerID # - (persis_gens) new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) @@ -80,7 +80,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): exctr = Executor.executor test = sim_specs["user"]["tests"][0] exp_list = sim_specs["user"]["expect"] - p_gens = sim_specs["user"].get("persis_gens", 0) + # p_gens = sim_specs["user"].get("persis_gens", 0) task = exctr.submit( calc_type="sim", @@ -107,7 +107,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): else: wid_mod = wid - new_exp_list = exp_list[wid_mod - 1 - p_gens] + new_exp_list = exp_list[wid_mod - 1] # - p_gens] if outline != new_exp_list: print(f"Worker {wid}:\n outline is: {outline}\n exp is: {new_exp_list}", flush=True) diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index cc73d0e427..9c9f936edb 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -44,6 +44,7 @@ comms = libE_specs["comms"] libE_specs["dedicated_mode"] = True + libE_specs["zero_resource_workers"] = [0] libE_specs["enforce_worker_core_bounds"] = True # To allow visual checking - log file not used in test @@ -52,7 +53,7 @@ # For varying size test - relate node count to nworkers n_gens = 1 - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens if nsim_workers % 2 == 0: sys.exit( From 8c04b3a3212483165c31c3a10b4b222e5b23da29 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 5 Sep 2025 11:35:10 -0500 Subject: [PATCH 16/23] various fixes throughout the codebase to try making the number of workers (with gen_on_worker defaulting to False) clear to resources. other temporary debug adjusts --- libensemble/libE.py | 4 +++- libensemble/resources/worker_resources.py | 2 +- libensemble/sim_funcs/run_line_check.py | 2 +- .../functionality_tests/test_zero_resource_workers.py | 10 ++++------ libensemble/worker.py | 6 +++--- pyproject.toml | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index af302d13c8..665553fe75 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -489,8 +489,10 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li wcomms = start_proc_team(libE_specs["nworkers"], sim_specs, gen_specs, libE_specs) # Set manager resources after the forkpoint. + # if libE_specs["gen_on_worker"] == True, -n reflects the exact number of workers + # if libE_specs["gen_on_worker"] == False: nworkers internally is the number of workers + 1 if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"]) + resources.set_resource_manager(libE_specs["nworkers"] + (1 - libE_specs["gen_on_worker"])) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 5033b2aeee..8df45929cc 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -106,7 +106,7 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in """Map WorkerID to index into a nodelist""" index = 0 index_list = [] - for i in range(1, num_workers + 1): + for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) else: diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 44827d3c90..9d16205d28 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID # - (persis_gens) + offset = workerID new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index d286aa7128..93c4350786 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -9,8 +9,6 @@ The number of concurrent evaluations of the objective function will be 4-1=3. """ -import sys - import numpy as np from libensemble import logger @@ -23,7 +21,7 @@ from libensemble.tools import add_unique_random_streams, parse_args # logger.set_level("DEBUG") # For testing the test -logger.set_level("INFO") +logger.set_level("DEBUG") # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local @@ -49,7 +47,7 @@ # For varying size test - relate node count to nworkers in_place = libE_specs["zero_resource_workers"] n_gens = len(in_place) - nsim_workers = nworkers - n_gens + nsim_workers = nworkers # - n_gens comms = libE_specs["comms"] nodes_per_worker = 2 @@ -79,8 +77,8 @@ exctr = MPIExecutor(custom_info=mpi_customizer) exctr.register_app(full_path=sim_app, calc_type="sim") - if nworkers < 2: - sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") + # if nworkers < 2: + # sys.exit("Cannot run with a persistent worker if only one worker -- aborting...") n = 2 sim_specs = { diff --git a/libensemble/worker.py b/libensemble/worker.py index 5f574a3336..e543e93bcc 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -177,7 +177,7 @@ def __init__( self.runners = {EVAL_SIM_TAG: self.sim_runner.run, EVAL_GEN_TAG: self.gen_runner.run} self.calc_iter = {EVAL_SIM_TAG: 0, EVAL_GEN_TAG: 0} Worker._set_executor(self.workerID, self.comm) - Worker._set_resources(self.workerID, self.comm) + Worker._set_resources(self.workerID, self.comm, self.libE_specs) self.EnsembleDirectory = EnsembleDirectory(libE_specs=libE_specs) @staticmethod @@ -215,11 +215,11 @@ def _set_executor(workerID: int, comm: Comm) -> bool: return False @staticmethod - def _set_resources(workerID, comm: Comm) -> bool: + def _set_resources(workerID, comm: Comm, libE_specs) -> bool: """Sets worker ID in the resources, return True if set""" resources = Resources.resources if isinstance(resources, Resources): - resources.set_worker_resources(comm.get_num_workers(), workerID) + resources.set_worker_resources(comm.get_num_workers() + (1 - libE_specs["gen_on_worker"]), workerID) return True else: logger.debug(f"No resources set on worker {workerID}") diff --git a/pyproject.toml b/pyproject.toml index 68d5654da3..231afa5bbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -142,4 +142,4 @@ extend-exclude = ["*.bib", "*.xml", "docs/nitpicky"] disable_error_code = ["import-not-found", "import-untyped"] [dependency-groups] -dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4"] +dev = ["pyenchant", "enchant>=0.0.1,<0.0.2", "flake8-modern-annotations>=1.6.0,<2", "flake8-type-checking>=3.0.0,<4", "wat>=0.7.0,<0.8"] From fafff0ea5b0ab16809f00ad9f7cfe6a2f09fc3bf Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 12 Dec 2025 16:25:13 -0600 Subject: [PATCH 17/23] adjust map_workerid_to_index unit test for starting at workerID index zero --- .../tests/unit_tests/test_resources.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libensemble/tests/unit_tests/test_resources.py b/libensemble/tests/unit_tests/test_resources.py index b87583f377..15db28ea07 100644 --- a/libensemble/tests/unit_tests/test_resources.py +++ b/libensemble/tests/unit_tests/test_resources.py @@ -694,23 +694,23 @@ def test_map_workerid_to_index(): zero_resource_list = [] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(1, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 1, "index incorrect. Received: " + str(index) + for workerID in range(0, num_workers): + index = index_list[workerID] + assert index == workerID, "index incorrect. Received: " + str(index) - zero_resource_list = [1] + zero_resource_list = [0] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(2, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 2, "index incorrect. Received: " + str(index) + for workerID in range(1, num_workers): + index = index_list[workerID] + assert index == workerID - 1, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 2] + zero_resource_list = [0, 1] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) - for workerID in range(3, num_workers + 1): - index = index_list[workerID - 1] - assert index == workerID - 3, "index incorrect. Received: " + str(index) + for workerID in range(2, num_workers): + index = index_list[workerID] + assert index == workerID - 2, "index incorrect. Received: " + str(index) - zero_resource_list = [1, 3] + zero_resource_list = [0, 2] index_list = ResourceManager.get_index_list(num_workers, num_rsets, zero_resource_list) workerID = 2 From f4370683209460ccbdf97c90a564d1040273e0dd Mon Sep 17 00:00:00 2001 From: jlnav Date: Thu, 18 Dec 2025 10:46:36 -0600 Subject: [PATCH 18/23] perhaps the default zero-resource-worker should be worker 0 - but does this actually solve anything? --- libensemble/specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libensemble/specs.py b/libensemble/specs.py index 168bbcc380..16347084c5 100644 --- a/libensemble/specs.py +++ b/libensemble/specs.py @@ -469,7 +469,7 @@ class LibeSpecs(BaseModel): libEnsemble processes (manager and workers) are running. """ - zero_resource_workers: list[int] | None = [] + zero_resource_workers: list[int] | None = [0] """ list of workers that require no resources. For when a fixed mapping of workers to resources is required. Otherwise, use ``num_resource_sets``. From 39798a393218e7f508eadf2bd26050f33c1f11d8 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 13 Mar 2026 12:40:43 -0500 Subject: [PATCH 19/23] various nworkers indexing fixes for outstanding bugs and indexing errors, regarding the default presence of the zeroth worker. assistance from gemini/claude for fixing resource math in various tests. --- libensemble/libE.py | 3 +-- libensemble/manager.py | 2 +- libensemble/sim_funcs/run_line_check.py | 2 +- .../test_GPU_gen_resources.py | 27 +++++++++++++------ .../test_asktell_sampling.py | 2 +- .../test_asktell_sampling_external_gen.py | 2 +- .../test_zero_resource_workers.py | 21 ++++++++------- .../test_zero_resource_workers_subnode.py | 19 ++++++------- .../test_asktell_aposmm_nlopt.py | 2 +- .../regression_tests/test_optimas_ax_mf.py | 2 +- .../test_optimas_ax_multitask.py | 2 +- .../regression_tests/test_optimas_ax_sf.py | 2 +- .../test_optimas_grid_sample.py | 2 +- .../tests/regression_tests/test_xopt_EI.py | 2 +- .../regression_tests/test_xopt_EI_xopt_sim.py | 2 +- .../regression_tests/test_xopt_nelder_mead.py | 2 +- .../forces_simple_xopt/run_libe_forces.py | 1 - libensemble/worker.py | 2 +- 18 files changed, 54 insertions(+), 43 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index c3d5df2917..7bd84658dc 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -495,9 +495,8 @@ def libE_local(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, li # Set manager resources after the forkpoint. # if libE_specs["gen_on_worker"] == True, -n reflects the exact number of workers - # if libE_specs["gen_on_worker"] == False: nworkers internally is the number of workers + 1 if resources is not None: - resources.set_resource_manager(libE_specs["nworkers"] + (1 - libE_specs["gen_on_worker"])) + resources.set_resource_manager(libE_specs["nworkers"] + 1) if not libE_specs["disable_log_files"]: exit_logger = manager_logging_config(specs=libE_specs) diff --git a/libensemble/manager.py b/libensemble/manager.py index 208c7058a7..5d6e1cf79b 100644 --- a/libensemble/manager.py +++ b/libensemble/manager.py @@ -401,7 +401,7 @@ def _set_resources(self, Work: dict, w: int) -> None: if rset_req is None: rset_team = [] - default_rset = resource_manager.index_list[w - 1] + default_rset = resource_manager.index_list[w] if default_rset is not None: rset_team.append(default_rset) Work["libE_info"]["rset_team"] = rset_team diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 9d16205d28..98b5fa29e8 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -22,7 +22,7 @@ def exp_nodelist_for_worker(exp_list, workerID, nodes_per_worker, persis_gens): node_list = comp.split(",") for node in node_list: node_name, node_num = node.split("-") - offset = workerID + offset = workerID - 1 new_num = int(node_num) + int(nodes_per_worker * offset) new_node = "-".join([node_name, str(new_num)]) new_node_list.append(new_node) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index 48b7fee0d2..9ea37ff9a6 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -50,8 +50,6 @@ if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers # Persistent gen DOES need resources - # Mock GPU system / uncomment to detect GPUs libE_specs["sim_dirs_make"] = True # Will only contain files if dry_run is False libE_specs["gen_dirs_make"] = True # Will only contain files if dry_run is False @@ -80,8 +78,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("num_procs", int), ("num_gpus", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_procs": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": "set_in_loop", + "max_procs": "set_in_loop", # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), "dry_run": dry_run, @@ -97,7 +95,6 @@ } exit_criteria = {"sim_max": 20} - libE_specs["resource_info"] = {"cores_on_node": (nworkers * 2, nworkers * 4), "gpus_on_node": nworkers} base_libE_specs = libE_specs.copy() for gen_on_worker in [False, True]: @@ -105,7 +102,21 @@ # reset libE_specs = base_libE_specs.copy() libE_specs["gen_on_worker"] = gen_on_worker - persis_info = add_unique_random_streams({}, nworkers + 1) + libE_specs["zero_resource_workers"] = [] + + active_workers = nworkers if gen_on_worker else nworkers + 1 + sim_workers = nworkers - 1 if gen_on_worker else nworkers + + gen_specs["user"]["initial_batch_size"] = sim_workers + gen_specs["user"]["max_procs"] = sim_workers + + libE_specs["num_resource_sets"] = active_workers + libE_specs["resource_info"] = { + "cores_on_node": (active_workers * 2, active_workers * 4), + "gpus_on_node": active_workers, + } + + persis_info = add_unique_random_streams({}, active_workers) if run == 0: libE_specs["gen_num_procs"] = 2 @@ -117,13 +128,13 @@ persis_info["gen_num_gpus"] = 1 elif run == 3: # Two GPUs per resource set - libE_specs["resource_info"]["gpus_on_node"] = nworkers * 2 + libE_specs["resource_info"]["gpus_on_node"] = active_workers * 2 persis_info["gen_num_gpus"] = 1 elif run == 4: # Two GPUs requested for gen persis_info["gen_num_procs"] = 2 persis_info["gen_num_gpus"] = 2 - gen_specs["user"]["max_procs"] = max(nworkers - 2, 1) + gen_specs["user"]["max_procs"] = max(sim_workers - 1, 1) # Perform the run H, persis_info, flag = libE( diff --git a/libensemble/tests/functionality_tests/test_asktell_sampling.py b/libensemble/tests/functionality_tests/test_asktell_sampling.py index cebb858df2..7da9f1c668 100644 --- a/libensemble/tests/functionality_tests/test_asktell_sampling.py +++ b/libensemble/tests/functionality_tests/test_asktell_sampling.py @@ -34,7 +34,7 @@ def sim_f(In): if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["gen_on_manager"] = True + libE_specs["gen_on_worker"] = False sim_specs = { "sim_f": sim_f, diff --git a/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py b/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py index 8578da720c..1bbbf696f4 100644 --- a/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py +++ b/libensemble/tests/functionality_tests/test_asktell_sampling_external_gen.py @@ -44,7 +44,7 @@ def sim_f_scalar(In): if __name__ == "__main__": - libE_specs = LibeSpecs(gen_on_manager=True) + libE_specs = LibeSpecs() for test in range(1): # 2 diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers.py b/libensemble/tests/functionality_tests/test_zero_resource_workers.py index 93c4350786..bc03ebbb7a 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers.py @@ -1,12 +1,12 @@ """ Runs libEnsemble testing the zero_resource_workers argument. -Execute via one of the following commands (e.g. 3 workers): - mpiexec -np 4 python test_zero_resource_workers.py - python test_zero_resource_workers.py --nworkers 3 - python test_zero_resource_workers.py --nworkers 3 --comms tcp +Execute via one of the following commands (e.g. 4 workers): + mpiexec -np 5 python test_zero_resource_workers.py + python test_zero_resource_workers.py --nworkers 4 + python test_zero_resource_workers.py --nworkers 4 --comms tcp -The number of concurrent evaluations of the objective function will be 4-1=3. +The number of concurrent evaluations of the objective function will be 4. """ import numpy as np @@ -25,7 +25,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 3 4 +# TESTSUITE_NPROCS: 3 5 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": @@ -45,9 +45,10 @@ nodes_per_worker = 2 # For varying size test - relate node count to nworkers - in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) - nsim_workers = nworkers # - n_gens + # With gen-on-manager (default), all user workers are sim workers. + # Worker 0 (gen) is hidden and in zero_resource_workers. + n_gens = 0 + nsim_workers = nworkers comms = libE_specs["comms"] nodes_per_worker = 2 @@ -99,7 +100,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers) + persis_info = add_unique_random_streams({}, nworkers + 1) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py index a7a57b584a..19986097c7 100644 --- a/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py +++ b/libensemble/tests/functionality_tests/test_zero_resource_workers_subnode.py @@ -2,13 +2,13 @@ Runs libEnsemble testing the zero_resource_workers argument with 2 workers per node. -This test must be run on an odd number of workers >= 3 (e.g. even number of +This test must be run on an even number of workers >= 2 (e.g. odd number of procs when using mpi4py). Execute via one of the following commands (e.g. 3 workers): - mpiexec -np 4 python test_zero_resource_workers_subnode.py - python test_zero_resource_workers_subnode.py --nworkers 3 - python test_zero_resource_workers_subnode.py --nworkers 3 --comms tcp + mpiexec -np 5 python test_zero_resource_workers_subnode.py + python test_zero_resource_workers_subnode.py --nworkers 4 + python test_zero_resource_workers_subnode.py --nworkers 4 --comms tcp """ import sys @@ -29,7 +29,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 +# TESTSUITE_NPROCS: 5 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": @@ -50,9 +50,10 @@ nodes_per_worker = 0.5 # For varying size test - relate node count to nworkers - in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) - nsim_workers = nworkers - n_gens + # With gen-on-manager (default), all user workers are sim workers. + # Worker 0 (gen) is hidden and in zero_resource_workers. + n_gens = 0 + nsim_workers = nworkers if not (nsim_workers * nodes_per_worker).is_integer(): sys.exit(f"Sim workers ({nsim_workers}) must divide evenly into nodes") @@ -100,7 +101,7 @@ } alloc_specs = {"alloc_f": alloc_f} - persis_info = add_unique_random_streams({}, nworkers) + persis_info = add_unique_random_streams({}, nworkers + 1) exit_criteria = {"sim_max": (nsim_workers) * rounds} # Each worker has 2 nodes. Basic test list for portable options diff --git a/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py b/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py index a85771d7dd..5a36e2e1ec 100644 --- a/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py +++ b/libensemble/tests/regression_tests/test_asktell_aposmm_nlopt.py @@ -64,7 +64,7 @@ def six_hump_camel_func(x): n = 2 workflow.alloc_specs = AllocSpecs(alloc_f=alloc_f) - workflow.libE_specs.gen_on_manager = True + workflow.libE_specs.gen_on_worker = False vocs = VOCS( variables={"core": [-3, 3], "edge": [-2, 2], "core_on_cube": [-3, 3], "edge_on_cube": [-2, 2]}, diff --git a/libensemble/tests/regression_tests/test_optimas_ax_mf.py b/libensemble/tests/regression_tests/test_optimas_ax_mf.py index 758aa1fc2c..d99bc518a9 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_mf.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_mf.py @@ -42,7 +42,7 @@ def eval_func_mf(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={"x0": [-50.0, 5.0], "x1": [-5.0, 15.0], "res": [1.0, 8.0]}, diff --git a/libensemble/tests/regression_tests/test_optimas_ax_multitask.py b/libensemble/tests/regression_tests/test_optimas_ax_multitask.py index 43c6c444eb..65c81a373f 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_multitask.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_multitask.py @@ -57,7 +57,7 @@ def eval_func_multitask(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={ diff --git a/libensemble/tests/regression_tests/test_optimas_ax_sf.py b/libensemble/tests/regression_tests/test_optimas_ax_sf.py index e4ee9e8a79..b04753490f 100644 --- a/libensemble/tests/regression_tests/test_optimas_ax_sf.py +++ b/libensemble/tests/regression_tests/test_optimas_ax_sf.py @@ -41,7 +41,7 @@ def eval_func_sf(input_params): n = 2 batch_size = 2 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) vocs = VOCS( variables={ diff --git a/libensemble/tests/regression_tests/test_optimas_grid_sample.py b/libensemble/tests/regression_tests/test_optimas_grid_sample.py index 57c6c8fedf..555ef071f7 100644 --- a/libensemble/tests/regression_tests/test_optimas_grid_sample.py +++ b/libensemble/tests/regression_tests/test_optimas_grid_sample.py @@ -42,7 +42,7 @@ def eval_func(input_params: dict): n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) # Create varying parameters. lower_bounds = [-3.0, 2.0] diff --git a/libensemble/tests/regression_tests/test_xopt_EI.py b/libensemble/tests/regression_tests/test_xopt_EI.py index a78aee60a5..3c902ee9d7 100644 --- a/libensemble/tests/regression_tests/test_xopt_EI.py +++ b/libensemble/tests/regression_tests/test_xopt_EI.py @@ -53,7 +53,7 @@ def xtest_sim(H, persis_info, sim_specs, _): n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py b/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py index 07ace47fa9..8082a67fac 100644 --- a/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py +++ b/libensemble/tests/regression_tests/test_xopt_EI_xopt_sim.py @@ -47,7 +47,7 @@ def xtest_callable(input_dict: dict, a=0) -> dict: n = 2 batch_size = 4 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/regression_tests/test_xopt_nelder_mead.py b/libensemble/tests/regression_tests/test_xopt_nelder_mead.py index fe8039b95c..d8bfdb4597 100644 --- a/libensemble/tests/regression_tests/test_xopt_nelder_mead.py +++ b/libensemble/tests/regression_tests/test_xopt_nelder_mead.py @@ -38,7 +38,7 @@ def rosenbrock_callable(input_dict: dict) -> dict: batch_size = 1 - libE_specs = LibeSpecs(gen_on_manager=True, nworkers=batch_size) + libE_specs = LibeSpecs(nworkers=batch_size) libE_specs.reuse_output_dir = True vocs = VOCS( diff --git a/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py b/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py index 2a7f8f7018..2d23c83a88 100644 --- a/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py +++ b/libensemble/tests/scaling_tests/forces/forces_simple_xopt/run_libe_forces.py @@ -32,7 +32,6 @@ # Persistent gen does not need resources ensemble.libE_specs = LibeSpecs( - gen_on_manager=True, sim_dirs_make=True, ) diff --git a/libensemble/worker.py b/libensemble/worker.py index 3ac46b9760..3a472899e6 100644 --- a/libensemble/worker.py +++ b/libensemble/worker.py @@ -223,7 +223,7 @@ def _set_resources(workerID, comm: Comm, libE_specs) -> bool: """Sets worker ID in the resources, return True if set""" resources = Resources.resources if isinstance(resources, Resources): - resources.set_worker_resources(comm.get_num_workers() + (1 - libE_specs["gen_on_worker"]), workerID) + resources.set_worker_resources(comm.get_num_workers() + 1, workerID) return True else: logger.debug(f"No resources set on worker {workerID}") From 836e52514784ea7a22da85a5764c68f6d594dd60 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 13 Mar 2026 13:59:39 -0500 Subject: [PATCH 20/23] some clarifying renames and comments --- .../test_GPU_gen_resources.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index 9ea37ff9a6..d45abaa4bf 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -102,21 +102,23 @@ # reset libE_specs = base_libE_specs.copy() libE_specs["gen_on_worker"] = gen_on_worker - libE_specs["zero_resource_workers"] = [] + libE_specs["zero_resource_workers"] = [] # perhaps the generator needs GPUs - active_workers = nworkers if gen_on_worker else nworkers + 1 + resourced_workers = ( + nworkers if gen_on_worker else nworkers + 1.0 + ) # note this "nworkers" decided before the extra worker starts sim_workers = nworkers - 1 if gen_on_worker else nworkers gen_specs["user"]["initial_batch_size"] = sim_workers gen_specs["user"]["max_procs"] = sim_workers - libE_specs["num_resource_sets"] = active_workers + libE_specs["num_resource_sets"] = resourced_workers libE_specs["resource_info"] = { - "cores_on_node": (active_workers * 2, active_workers * 4), - "gpus_on_node": active_workers, + "cores_on_node": (resourced_workers * 2, resourced_workers * 4), + "gpus_on_node": resourced_workers, } - persis_info = add_unique_random_streams({}, active_workers) + persis_info = add_unique_random_streams({}, resourced_workers) if run == 0: libE_specs["gen_num_procs"] = 2 @@ -128,7 +130,7 @@ persis_info["gen_num_gpus"] = 1 elif run == 3: # Two GPUs per resource set - libE_specs["resource_info"]["gpus_on_node"] = active_workers * 2 + libE_specs["resource_info"]["gpus_on_node"] = resourced_workers * 2 persis_info["gen_num_gpus"] = 1 elif run == 4: # Two GPUs requested for gen From 9a8425e4531a088c338591ff11f7c1dcef2046d1 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 09:27:53 -0500 Subject: [PATCH 21/23] ditto --- .../tests/regression_tests/test_persistent_aposmm_dfols.py | 1 + .../tests/regression_tests/test_persistent_aposmm_exception.py | 1 + .../regression_tests/test_persistent_aposmm_external_localopt.py | 1 + .../tests/regression_tests/test_persistent_aposmm_nlopt.py | 1 + .../tests/regression_tests/test_persistent_aposmm_tao_blmvm.py | 1 + 5 files changed, 5 insertions(+) diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py index 6e19930691..c810db561e 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_dfols.py @@ -69,6 +69,7 @@ def combine_component(x): "gen_f": gen_f, "persis_in": ["f", "fvec"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "localopt_method": "dfols", diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py b/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py index b197dc3f07..09544e51d1 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_exception.py @@ -69,6 +69,7 @@ def assertion(passed): "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "localopt_method": "LN_BOBYQA", diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py index dd01d1069e..22cf47325f 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py @@ -75,6 +75,7 @@ "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py index 9d42784439..bf5914573f 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_nlopt.py @@ -64,6 +64,7 @@ "gen_f": gen_f, "persis_in": ["f"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), diff --git a/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py index 39ff3b79fd..8a3c762871 100644 --- a/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py +++ b/libensemble/tests/regression_tests/test_persistent_aposmm_tao_blmvm.py @@ -67,6 +67,7 @@ "gen_f": gen_f, "persis_in": ["f", "grad"] + [n[0] for n in gen_out], "out": gen_out, + "initial_batch_size": 100, "user": { "initial_sample_size": 100, "sample_points": np.round(minima, 1), From e0d4cbaee883813193b759cbabd1dc1cd6a67bc8 Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 13:30:50 -0500 Subject: [PATCH 22/23] 0 is always a zero-resource-worker, no matter what... plus mypy fixes --- libensemble/resources/resources.py | 8 +++++--- libensemble/sim_funcs/run_line_check.py | 4 ++-- .../tests/functionality_tests/test_GPU_gen_resources.py | 2 +- .../test_mpi_runners_subnode_uneven.py | 2 +- .../test_mpi_runners_zrw_supernode_uneven.py | 5 +++-- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 105f8a836e..90765e9490 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -63,10 +63,10 @@ def init_resources(cls, libE_specs: dict, platform_info: dict = {}) -> None: libE_specs=libE_specs, platform_info=platform_info, top_level_dir=top_level_dir ) - def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = None) -> None: + def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = "") -> None: """Initiate a new resources object""" self.top_level_dir = top_level_dir or os.getcwd() - self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir=None) + self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir="") self.resource_manager = None # For Manager self.worker_resources = None # For Workers @@ -101,7 +101,7 @@ class GlobalResources: :ivar int num_resource_sets: Number of resource sets, if supplied by the user. """ - def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = None) -> None: + def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: str = "") -> None: """Initializes a new Resources instance Determines the compute resources available for current allocation, including @@ -167,6 +167,8 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) + if 0 not in self.zero_resource_workers: + self.zero_resource_workers.append(0) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") diff --git a/libensemble/sim_funcs/run_line_check.py b/libensemble/sim_funcs/run_line_check.py index 98b5fa29e8..d30d10ec38 100644 --- a/libensemble/sim_funcs/run_line_check.py +++ b/libensemble/sim_funcs/run_line_check.py @@ -80,7 +80,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): exctr = Executor.executor test = sim_specs["user"]["tests"][0] exp_list = sim_specs["user"]["expect"] - # p_gens = sim_specs["user"].get("persis_gens", 0) + p_gens = sim_specs["user"].get("persis_gens", 0) task = exctr.submit( calc_type="sim", @@ -107,7 +107,7 @@ def runline_check_by_worker(H, persis_info, sim_specs, libE_info): else: wid_mod = wid - new_exp_list = exp_list[wid_mod - 1] # - p_gens] + new_exp_list = exp_list[wid_mod - 1 - p_gens] if outline != new_exp_list: print(f"Worker {wid}:\n outline is: {outline}\n exp is: {new_exp_list}", flush=True) diff --git a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py index d45abaa4bf..5308f12c7f 100644 --- a/libensemble/tests/functionality_tests/test_GPU_gen_resources.py +++ b/libensemble/tests/functionality_tests/test_GPU_gen_resources.py @@ -105,7 +105,7 @@ libE_specs["zero_resource_workers"] = [] # perhaps the generator needs GPUs resourced_workers = ( - nworkers if gen_on_worker else nworkers + 1.0 + nworkers if gen_on_worker else nworkers + 1 ) # note this "nworkers" decided before the extra worker starts sim_workers = nworkers - 1 if gen_on_worker else nworkers diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py index a5145965b9..9312c83b49 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_subnode_uneven.py @@ -26,7 +26,7 @@ # Do not change these lines - they are parsed by run-tests.sh # TESTSUITE_COMMS: mpi local -# TESTSUITE_NPROCS: 4 6 +# TESTSUITE_NPROCS: 5 7 # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py index 640d613bff..77ce05006f 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_supernode_uneven.py @@ -33,9 +33,10 @@ sim_app = "/path/to/fakeapp.x" comms = libE_specs["comms"] - libE_specs["zero_resource_workers"] = [1] + libE_specs["zero_resource_workers"] = [0, 1] libE_specs["dedicated_mode"] = True libE_specs["enforce_worker_core_bounds"] = True + libE_specs["gen_on_worker"] = True # To allow visual checking - log file not used in test log_file = "ensemble_mpi_runners_zrw_supernode_uneven_comms_" + str(comms) + "_wrks_" + str(nworkers) + ".log" @@ -45,7 +46,7 @@ # For varying size test - relate node count to nworkers in_place = libE_specs["zero_resource_workers"] - n_gens = len(in_place) + n_gens = len([w for w in in_place if w != 0]) nsim_workers = nworkers - n_gens comms = libE_specs["comms"] node_file = "nodelist_mpi_runners_zrw_supernode_uneven_comms_" + str(comms) + "_wrks_" + str(nworkers) From 6505cd8438dbb2e64e962b39e981a281adcd9afa Mon Sep 17 00:00:00 2001 From: jlnav Date: Fri, 20 Mar 2026 15:51:34 -0500 Subject: [PATCH 23/23] various zrw / gen-worker indexing fixes. defensive programming around worker indexes and zrws. mypy types. vibe-coded new test cases for test_mpi_runners_zrw_subnode_uneven --- libensemble/libE.py | 2 +- libensemble/resources/resources.py | 6 ++---- libensemble/resources/worker_resources.py | 9 ++++++-- .../test_mpi_gpu_settings.py | 6 +++--- .../test_mpi_gpu_settings_env.py | 6 +++--- .../test_mpi_runners_zrw_subnode_uneven.py | 21 +++++++++++++++---- ...est_persistent_uniform_gen_decides_stop.py | 2 ++ 7 files changed, 35 insertions(+), 17 deletions(-) diff --git a/libensemble/libE.py b/libensemble/libE.py index f768b1e11f..71cc654e8c 100644 --- a/libensemble/libE.py +++ b/libensemble/libE.py @@ -385,7 +385,7 @@ def libE_mpi(sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE # Run manager or worker code, depending if is_manager: if resources is not None: - resources.set_resource_manager(nworkers) + resources.set_resource_manager(nworkers + 1) return libE_mpi_manager( mpi_comm, sim_specs, gen_specs, exit_criteria, persis_info, alloc_specs, libE_specs, H0 ) diff --git a/libensemble/resources/resources.py b/libensemble/resources/resources.py index 90765e9490..5ba54cc09b 100644 --- a/libensemble/resources/resources.py +++ b/libensemble/resources/resources.py @@ -67,8 +67,8 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st """Initiate a new resources object""" self.top_level_dir = top_level_dir or os.getcwd() self.glob_resources = GlobalResources(libE_specs=libE_specs, platform_info=platform_info, top_level_dir="") - self.resource_manager = None # For Manager - self.worker_resources = None # For Workers + self.resource_manager: ResourceManager | None = None # For Manager + self.worker_resources: WorkerResources | None = None # For Workers def set_worker_resources(self, num_workers: int, workerid: int) -> None: """Initiate the worker resources component of resources""" @@ -167,8 +167,6 @@ def __init__(self, libE_specs: dict, platform_info: dict = {}, top_level_dir: st self.top_level_dir = top_level_dir self.dedicated_mode = libE_specs.get("dedicated_mode", False) self.zero_resource_workers = libE_specs.get("zero_resource_workers", []) - if 0 not in self.zero_resource_workers: - self.zero_resource_workers.append(0) self.num_resource_sets = libE_specs.get("num_resource_sets", None) self.enforce_worker_core_bounds = libE_specs.get("enforce_worker_core_bounds", False) self.gpus_per_group = libE_specs.get("gpus_per_group") diff --git a/libensemble/resources/worker_resources.py b/libensemble/resources/worker_resources.py index 8df45929cc..df89625ad6 100644 --- a/libensemble/resources/worker_resources.py +++ b/libensemble/resources/worker_resources.py @@ -105,7 +105,7 @@ def free_rsets(self, worker=None): def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[int | Any]) -> list[int | None]: """Map WorkerID to index into a nodelist""" index = 0 - index_list = [] + index_list: list[int | None] = [] for i in range(0, num_workers): if i in zero_resource_list: index_list.append(None) @@ -116,6 +116,11 @@ def get_index_list(num_workers: int, num_rsets: int, zero_resource_list: list[in else: index_list.append(index) index += 1 + + for i in zero_resource_list: + if i >= num_workers: + logger.warning(f"Worker index {i} from zero_resource_workers is out of range (0-{num_workers - 1})") + return index_list @@ -364,7 +369,7 @@ def get_local_nodelist( local_nodelist = list(OrderedDict.fromkeys(team_list)) # Maintain order of nodes logger.debug(f"Worker's local_nodelist is {local_nodelist}") - slots = {} + slots: dict[str, list[int]] = {} for node in local_nodelist: slots[node] = [] diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py index 31e537a31d..121bb08cb1 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings.py @@ -66,7 +66,7 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + libE_specs["num_resource_sets"] = nworkers # Persistent gen does not need resources libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles if libE_specs["comms"] == "tcp": @@ -88,8 +88,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nworkers, + "max_resource_sets": nworkers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py index 814f5086cb..4cfc431c10 100644 --- a/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py +++ b/libensemble/tests/functionality_tests/test_mpi_gpu_settings_env.py @@ -43,7 +43,7 @@ # Main block is necessary only when using local comms with spawn start method (default on macOS and Windows). if __name__ == "__main__": nworkers, is_manager, libE_specs, _ = parse_args() - libE_specs["num_resource_sets"] = nworkers - 1 # Persistent gen does not need resources + libE_specs["num_resource_sets"] = nworkers # Persistent gen does not need resources libE_specs["use_workflow_dir"] = True # Only a place for Open MPI machinefiles # Optional for organization of output scripts @@ -70,8 +70,8 @@ "persis_in": ["f", "x", "sim_id"], "out": [("priority", float), ("resource_sets", int), ("x", float, n)], "user": { - "initial_batch_size": nworkers - 1, - "max_resource_sets": nworkers - 1, # Any sim created can req. 1 worker up to all. + "initial_batch_size": nworkers, + "max_resource_sets": nworkers, # Any sim created can req. 1 worker up to all. "lb": np.array([-3, -2]), "ub": np.array([3, 2]), }, diff --git a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py index 9c9f936edb..68e9c30cd3 100644 --- a/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py +++ b/libensemble/tests/functionality_tests/test_mpi_runners_zrw_subnode_uneven.py @@ -139,10 +139,17 @@ exp_srun.append(srun_p1 + str(nodename) + srun_p2 + str(ntasks) + srun_p3 + str(ntasks) + srun_p4) test_list = test_list_base - exp_list = exp_srun + exp_list_dynamic = exp_srun.copy() + if nworkers == 5: + # Iteration 0 (Dynamic): Workers 1, 2 -> node-2; 3, 4, 5 -> node-1 + n1 = srun_p1 + "node-1" + srun_p2 + "5" + srun_p3 + "5" + srun_p4 + n2 = srun_p1 + "node-2" + srun_p2 + "8" + srun_p3 + "8" + srun_p4 + exp_list_dynamic = [n2, n2, n1, n1, n1] + # Iteration 1 (Static): Worker 1 is gen. Workers 2, 3 -> node-1; 4, 5 -> node-2 + exp_list_static = [n1, n1, n2, n2] + sim_specs["user"] = { "tests": test_list, - "expect": exp_list, "persis_gens": n_gens, } @@ -150,15 +157,21 @@ for prob_id in range(iterations): if prob_id == 0: # Uses dynamic scheduler - will find node 2 slots first (as fewer) - libE_specs["num_resource_sets"] = nworkers - 1 # Any worker can be the gen - sim_specs["user"]["offset_for_scheduler"] = True # Changes expected values + libE_specs["gen_on_worker"] = False + libE_specs["num_resource_sets"] = nworkers + sim_specs["user"]["expect"] = exp_list_dynamic + sim_specs["user"]["offset_for_scheduler"] = False + sim_specs["user"]["persis_gens"] = 0 persis_info = add_unique_random_streams({}, nworkers + 1) else: # Uses static scheduler - will find node 1 slots first + libE_specs["gen_on_worker"] = True + sim_specs["user"]["expect"] = exp_list_static del libE_specs["num_resource_sets"] libE_specs["zero_resource_workers"] = [1] # Gen must be worker 1 sim_specs["user"]["offset_for_scheduler"] = False + sim_specs["user"]["persis_gens"] = 1 persis_info = add_unique_random_streams({}, nworkers + 1) # Perform the run diff --git a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py index f6a1e5d57f..7d6389ad33 100644 --- a/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py +++ b/libensemble/tests/functionality_tests/test_persistent_uniform_gen_decides_stop.py @@ -36,6 +36,8 @@ n = 2 init_batch_size = nworkers - ngens + libE_specs["gen_on_worker"] = True + if ngens >= nworkers: sys.exit("The number of generators must be less than the number of workers -- aborting...")