From dd50e824434833b06fc1d96d47938f258ff38bf4 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Wed, 25 Feb 2026 19:13:37 +0000 Subject: [PATCH 1/7] escape ge and le Signed-off-by: SumanthRH --- docs/content/docs/configuration/config.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/content/docs/configuration/config.mdx b/docs/content/docs/configuration/config.mdx index 89bd1a2cc..16d080003 100644 --- a/docs/content/docs/configuration/config.mdx +++ b/docs/content/docs/configuration/config.mdx @@ -586,8 +586,8 @@ fully_async: ``` -- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i <= max_staleness_steps`. Larger values increase throughput but also off-policy-ness. -- `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be >= `policy_mini_batch_size` and <= `policy_mini_batch_size * (max_staleness_steps + 1)`. +- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i \<= max_staleness_steps`. Larger values increase throughput but also off-policy-ness. +- `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be \>= `policy_mini_batch_size` and \<= `policy_mini_batch_size * (max_staleness_steps + 1)`. 
## Generator Configuration From 778169480efc4deb701f1aa24449c9a80ce0bc1a Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Thu, 26 Feb 2026 07:05:27 +0000 Subject: [PATCH 2/7] fix ci oom with sleep + wake up Signed-off-by: SumanthRH --- tests/backends/skyrl_train/gpu/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/backends/skyrl_train/gpu/utils.py b/tests/backends/skyrl_train/gpu/utils.py index 8802e219a..a237c9824 100644 --- a/tests/backends/skyrl_train/gpu/utils.py +++ b/tests/backends/skyrl_train/gpu/utils.py @@ -489,6 +489,8 @@ def create( client = RemoteInferenceClient( proxy_url=proxy_url, server_urls=server_urls, model_name=cfg.trainer.policy.model.path ) + asyncio.run(client.sleep(level=sleep_level)) + asyncio.run(client.wake_up()) else: eps = create_ray_wrapped_inference_engines( num_inference_engines=ie_cfg.num_engines, From d8c418febb033a0eddd053229616968c558f6d54 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Thu, 26 Feb 2026 18:19:09 +0000 Subject: [PATCH 3/7] switch to reducing gpu memory utilization Signed-off-by: SumanthRH --- .../skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py | 3 +++ tests/backends/skyrl_train/gpu/utils.py | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py b/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py index 0bcb14db6..d89b8fdfd 100644 --- a/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py +++ b/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py @@ -32,6 +32,9 @@ def get_test_actor_config() -> SkyRLTrainConfig: cfg.generator.inference_engine.async_engine = True cfg.generator.inference_engine.num_engines = 1 cfg.generator.inference_engine.run_engines_locally = True + # NOTE: We reduce the gpu memory used by vLLM because of the colocated tests + # that can OOM on L4s. 
For more details, see: https://github.com/NovaSky-AI/SkyRL/pull/1221 + cfg.generator.inference_engine.gpu_memory_utilization = 0.7 return cfg diff --git a/tests/backends/skyrl_train/gpu/utils.py b/tests/backends/skyrl_train/gpu/utils.py index a237c9824..8802e219a 100644 --- a/tests/backends/skyrl_train/gpu/utils.py +++ b/tests/backends/skyrl_train/gpu/utils.py @@ -489,8 +489,6 @@ def create( client = RemoteInferenceClient( proxy_url=proxy_url, server_urls=server_urls, model_name=cfg.trainer.policy.model.path ) - asyncio.run(client.sleep(level=sleep_level)) - asyncio.run(client.wake_up()) else: eps = create_ray_wrapped_inference_engines( num_inference_engines=ie_cfg.num_engines, From 10d59fbbfa39ef938a8580daebb25d3bf7517234 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Thu, 26 Feb 2026 23:49:59 +0000 Subject: [PATCH 4/7] strip trailing periods before stop-tag validation in search env Signed-off-by: SumanthRH --- skyrl-gym/skyrl_gym/envs/search/env.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py index afc4741f6..80a656460 100644 --- a/skyrl-gym/skyrl_gym/envs/search/env.py +++ b/skyrl-gym/skyrl_gym/envs/search/env.py @@ -67,7 +67,9 @@ def _is_done(self, action: str) -> bool: def _validate_action(self, action: str): stop_tags = ["", ""] - action = action.rstrip("\n") # strip out any trailing newlines + # TODO (sumanthrh): This assertion should really be that the *last token* generated contains . # The last token generated can have additional punctuation characters like periods, etc. 
+ action = action.rstrip("\n").rstrip(".") # strip out any trailing newlines and periods for tag in stop_tags: if tag in action: assert action.split(tag, 1)[1] == "", ( From c543022c55da08df17a2e5f449b83d7fd0b4c876 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Fri, 27 Feb 2026 00:15:59 +0000 Subject: [PATCH 5/7] revert stripping of trailing periods in search env validation Signed-off-by: SumanthRH --- skyrl-gym/skyrl_gym/envs/search/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py index 80a656460..d46d5f8ed 100644 --- a/skyrl-gym/skyrl_gym/envs/search/env.py +++ b/skyrl-gym/skyrl_gym/envs/search/env.py @@ -69,7 +69,7 @@ def _validate_action(self, action: str): stop_tags = ["", ""] # TODO (sumanthrh): This assertion should really be that the *last token* generated contains . # The last token generated can have additional punctuation characters like periods, etc. - action = action.rstrip("\n").rstrip(".") # strip out any trailing newlines and periods + action = action.rstrip("\n") # strip out any trailing newlines and periods for tag in stop_tags: if tag in action: assert action.split(tag, 1)[1] == "", ( From 7af61186eeb22b57213d09a3863a6e401f3df964 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Fri, 27 Feb 2026 00:26:14 +0000 Subject: [PATCH 6/7] re-apply stripping of trailing periods in search env validation Signed-off-by: SumanthRH --- skyrl-gym/skyrl_gym/envs/search/env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py index d46d5f8ed..80a656460 100644 --- a/skyrl-gym/skyrl_gym/envs/search/env.py +++ b/skyrl-gym/skyrl_gym/envs/search/env.py @@ -69,7 +69,7 @@ def _validate_action(self, action: str): stop_tags = ["", ""] # TODO (sumanthrh): This assertion should really be that the *last token* generated contains . # The last token generated can have additional punctuation characters like periods, etc. 
- action = action.rstrip("\n") # strip out any trailing newlines and periods + action = action.rstrip("\n").rstrip(".") # strip out any trailing newlines and periods for tag in stop_tags: if tag in action: assert action.split(tag, 1)[1] == "", ( From f42ff622b59ade4b8a8aafb58a3add91d4da9f28 Mon Sep 17 00:00:00 2001 From: SumanthRH Date: Fri, 27 Feb 2026 01:41:59 +0000 Subject: [PATCH 7/7] do not escape <= inside inline code in config docs Signed-off-by: SumanthRH --- docs/content/docs/configuration/config.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/docs/configuration/config.mdx b/docs/content/docs/configuration/config.mdx index 56b61723f..10c152834 100644 --- a/docs/content/docs/configuration/config.mdx +++ b/docs/content/docs/configuration/config.mdx @@ -586,7 +586,7 @@ fully_async: ``` -- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i \<= max_staleness_steps`. Larger values increase throughput but also off-policy-ness. +- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i <= max_staleness_steps`. Larger values increase throughput but also off-policy-ness. - `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be \>= `policy_mini_batch_size` and \<= `policy_mini_batch_size * (max_staleness_steps + 1)`. ## Generator Configuration