From dd50e824434833b06fc1d96d47938f258ff38bf4 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Wed, 25 Feb 2026 19:13:37 +0000
Subject: [PATCH 1/7] escape ge and le

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 docs/content/docs/configuration/config.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/content/docs/configuration/config.mdx b/docs/content/docs/configuration/config.mdx
index 89bd1a2cc..16d080003 100644
--- a/docs/content/docs/configuration/config.mdx
+++ b/docs/content/docs/configuration/config.mdx
@@ -586,8 +586,8 @@ fully_async:
 
 ```
 
-- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i <= max_staleness_steps`. Larger values increase throughput but also off-policy-ness.
-- `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be >= `policy_mini_batch_size` and <= `policy_mini_batch_size * (max_staleness_steps + 1)`.
+- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i \<= max_staleness_steps`. Larger values increase throughput but also off-policy-ness.
+- `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be \>= `policy_mini_batch_size` and \<= `policy_mini_batch_size * (max_staleness_steps + 1)`.
 
 ## Generator Configuration
 

From 778169480efc4deb701f1aa24449c9a80ce0bc1a Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Thu, 26 Feb 2026 07:05:27 +0000
Subject: [PATCH 2/7] fix ci oom with sleep + wake up

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 tests/backends/skyrl_train/gpu/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/backends/skyrl_train/gpu/utils.py b/tests/backends/skyrl_train/gpu/utils.py
index 8802e219a..a237c9824 100644
--- a/tests/backends/skyrl_train/gpu/utils.py
+++ b/tests/backends/skyrl_train/gpu/utils.py
@@ -489,6 +489,8 @@ def create(
             client = RemoteInferenceClient(
                 proxy_url=proxy_url, server_urls=server_urls, model_name=cfg.trainer.policy.model.path
             )
+            asyncio.run(client.sleep(level=sleep_level))
+            asyncio.run(client.wake_up())
         else:
             eps = create_ray_wrapped_inference_engines(
                 num_inference_engines=ie_cfg.num_engines,

From d8c418febb033a0eddd053229616968c558f6d54 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Thu, 26 Feb 2026 18:19:09 +0000
Subject: [PATCH 3/7] switch to reducing gpu memory utilization

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 .../skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py    | 3 +++
 tests/backends/skyrl_train/gpu/utils.py                        | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py b/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py
index 0bcb14db6..d89b8fdfd 100644
--- a/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py
+++ b/tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py
@@ -32,6 +32,9 @@ def get_test_actor_config() -> SkyRLTrainConfig:
     cfg.generator.inference_engine.async_engine = True
     cfg.generator.inference_engine.num_engines = 1
     cfg.generator.inference_engine.run_engines_locally = True
+    # NOTE: We reduce the gpu memory used by vLLM because of the colocated tests
+    # that can OOM on L4s. For more details, see: https://github.com/NovaSky-AI/SkyRL/pull/1221
+    cfg.generator.inference_engine.gpu_memory_utilization = 0.7
     return cfg
 
 
diff --git a/tests/backends/skyrl_train/gpu/utils.py b/tests/backends/skyrl_train/gpu/utils.py
index a237c9824..8802e219a 100644
--- a/tests/backends/skyrl_train/gpu/utils.py
+++ b/tests/backends/skyrl_train/gpu/utils.py
@@ -489,8 +489,6 @@ def create(
             client = RemoteInferenceClient(
                 proxy_url=proxy_url, server_urls=server_urls, model_name=cfg.trainer.policy.model.path
             )
-            asyncio.run(client.sleep(level=sleep_level))
-            asyncio.run(client.wake_up())
         else:
             eps = create_ray_wrapped_inference_engines(
                 num_inference_engines=ie_cfg.num_engines,

From 10d59fbbfa39ef938a8580daebb25d3bf7517234 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Thu, 26 Feb 2026 23:49:59 +0000
Subject: [PATCH 4/7] search env: strip trailing periods before validating stop tags

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 skyrl-gym/skyrl_gym/envs/search/env.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py
index afc4741f6..80a656460 100644
--- a/skyrl-gym/skyrl_gym/envs/search/env.py
+++ b/skyrl-gym/skyrl_gym/envs/search/env.py
@@ -67,7 +67,9 @@ def _is_done(self, action: str) -> bool:
 
     def _validate_action(self, action: str):
         stop_tags = ["</search>", "</answer>"]
-        action = action.rstrip("\n")  # strip out any trailing newlines
+        # TODO (sumanthrh): This assertion should really be that the *last token* generated contains <answer>.
+        # The last token generated can have additional punctuation characters like periods, etc.
+        action = action.rstrip("\n").rstrip(".")  # strip out any trailing newlines and periods
         for tag in stop_tags:
             if tag in action:
                 assert action.split(tag, 1)[1] == "", (

From c543022c55da08df17a2e5f449b83d7fd0b4c876 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Fri, 27 Feb 2026 00:15:59 +0000
Subject: [PATCH 5/7] search env: stop stripping trailing periods in _validate_action

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 skyrl-gym/skyrl_gym/envs/search/env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py
index 80a656460..d46d5f8ed 100644
--- a/skyrl-gym/skyrl_gym/envs/search/env.py
+++ b/skyrl-gym/skyrl_gym/envs/search/env.py
@@ -69,7 +69,7 @@ def _validate_action(self, action: str):
         stop_tags = ["</search>", "</answer>"]
         # TODO (sumanthrh): This assertion should really be that the *last token* generated contains <answer>.
         # The last token generated can have additional punctuation characters like periods, etc.
-        action = action.rstrip("\n").rstrip(".")  # strip out any trailing newlines and periods
+        action = action.rstrip("\n")  # strip out any trailing newlines and periods
         for tag in stop_tags:
             if tag in action:
                 assert action.split(tag, 1)[1] == "", (

From 7af61186eeb22b57213d09a3863a6e401f3df964 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Fri, 27 Feb 2026 00:26:14 +0000
Subject: [PATCH 6/7] search env: strip trailing periods in _validate_action again

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 skyrl-gym/skyrl_gym/envs/search/env.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skyrl-gym/skyrl_gym/envs/search/env.py b/skyrl-gym/skyrl_gym/envs/search/env.py
index d46d5f8ed..80a656460 100644
--- a/skyrl-gym/skyrl_gym/envs/search/env.py
+++ b/skyrl-gym/skyrl_gym/envs/search/env.py
@@ -69,7 +69,7 @@ def _validate_action(self, action: str):
         stop_tags = ["</search>", "</answer>"]
         # TODO (sumanthrh): This assertion should really be that the *last token* generated contains <answer>.
         # The last token generated can have additional punctuation characters like periods, etc.
-        action = action.rstrip("\n")  # strip out any trailing newlines and periods
+        action = action.rstrip("\n").rstrip(".")  # strip out any trailing newlines and periods
         for tag in stop_tags:
             if tag in action:
                 assert action.split(tag, 1)[1] == "", (

From f42ff622b59ade4b8a8aafb58a3add91d4da9f28 Mon Sep 17 00:00:00 2001
From: SumanthRH <sumanthrh99@gmail.com>
Date: Fri, 27 Feb 2026 01:41:59 +0000
Subject: [PATCH 7/7] docs: unescape <= inside inline code span in config.mdx

Signed-off-by: SumanthRH <sumanthrh99@gmail.com>
---
 docs/content/docs/configuration/config.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/content/docs/configuration/config.mdx b/docs/content/docs/configuration/config.mdx
index 56b61723f..10c152834 100644
--- a/docs/content/docs/configuration/config.mdx
+++ b/docs/content/docs/configuration/config.mdx
@@ -586,7 +586,7 @@ fully_async:
 
 ```
 
-- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i \<= max_staleness_steps`. Larger values increase throughput but also off-policy-ness.
+- `fully_async.max_staleness_steps`: Maximum off-policy steps allowed. If a trajectory group is scheduled at step *i* and trained at step *j*, then `j - i <= max_staleness_steps`. Larger values increase throughput but also off-policy-ness.
 - `fully_async.num_parallel_generation_workers`: Number of generation workers to spawn. Should be \>= `policy_mini_batch_size` and \<= `policy_mini_batch_size * (max_staleness_steps + 1)`.
 
 ## Generator Configuration