From a842f67c30ad1cb10885df8b07e786034b261f60 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Tue, 28 Apr 2026 17:22:23 +0100
Subject: [PATCH] run: cap Redis maxmemory to N% of total_system_memory before
 init_commands

Adds an opt-in mechanism to set Redis `maxmemory` proportionally to
the bench-server's total memory before any spec-defined init_commands
run, so workloads that previously OOM-killed the OS now hit
Redis-level OOM errors instead.

Default behaviour is unchanged: nothing is sent unless a spec field
or CLI flag asks for it. RediSearchEnterprise (and any other
downstream consumer) keeps its existing semantics.

Sources, in precedence order:
  --no-maxmemory-cap        force-disable for this run
  --maxmemory-pct N         CLI override
  dbconfig.maxmemory_pct N  spec-level opt-in (per-spec)

Cap formula: min(pct% * total_system_memory, total - 4 GiB), where the
4 GiB OS-reserve floor is only applied when total_system_memory is
greater than 4 GiB. On tiny instances the floor is bypassed and only
the pct cap applies. Existing non-zero `maxmemory` values are
overwritten with a WARNING log line.

Implementation:
  - new helpers in redisbench_admin/run/common.py:
      _get_maxmemory_pct_from_dbconfig (handles both dict and list
                                        dbconfig formats)
      _resolve_maxmemory_pct           (precedence resolver)
      _compute_maxmemory_cap_bytes     (cap arithmetic + 4 GiB floor)
      apply_maxmemory_cap              (orchestrator: reads INFO
                                        memory, logs, applies)
  - execute_init_commands and run_redis_pre_steps gain an optional
    `args` kwarg (default None). When set, CLI overrides are honoured;
    when not, only the spec field is consulted, so the spec-driven
    path works regardless of caller plumbing.
  - run_local/local_db.py threads `args` through the two
    run_redis_pre_steps callsites. The remote_db path keeps `args=None`
    for now (spec field still fires); CLI override there is a small
    follow-up.

Resolves the bench-server unresponsiveness observed when the
`search-expire-doc-10-milliseconds` spec is run on m7g.8xlarge /
m7i.8xlarge: instead of the kernel OOM-killer leaving sshd starved
and the box unreachable (terraform destroy still works, but no
result is captured), Redis now refuses writes once the cap is hit
and the bench completes with a partial-result warning.

Tests:
  - 13 new unit tests in tests/test_common.py covering precedence,
    cap arithmetic at 8/128 GiB and tiny instances, no-op paths,
    overwrite-with-warn, and INFO memory failure modes. All pass.
  - 3 pre-existing tests still need RTS_PORT (live Redis) and were
    already failing on origin/master without it; not affected.
---
 redisbench_admin/run/args.py           |  14 ++
 redisbench_admin/run/common.py         | 120 +++++++++++++-
 redisbench_admin/run_local/local_db.py |   4 +-
 tests/test_common.py                   | 207 +++++++++++++++++++++++++
 4 files changed, 340 insertions(+), 5 deletions(-)

diff --git a/redisbench_admin/run/args.py b/redisbench_admin/run/args.py
index 1dafde7..6af7254 100644
--- a/redisbench_admin/run/args.py
+++ b/redisbench_admin/run/args.py
@@ -122,6 +122,20 @@ def common_run_args(parser):
         default=BENCHMARK_REGEX,
         help="specify a test regex pattern to use on the tests directory. by default uses '.*'. If --test is defined this options has no effect.",
     )
+    parser.add_argument(
+        "--maxmemory-pct",
+        type=int,
+        required=False,
+        default=None,
+        help="If set, before running each spec's init_commands send `CONFIG SET maxmemory <pct%% of total_system_memory>` so OS-level OOM is replaced with Redis-level OOM. Floor: leaves >=4 GiB for the OS when total > 4 GiB. Overrides `dbconfig.maxmemory_pct` from the spec. Range: (0, 100].",
+    )
+    parser.add_argument(
+        "--no-maxmemory-cap",
+        required=False,
+        default=False,
+        action="store_true",
+        help="Force-disable the maxmemory cap for this run, even if --maxmemory-pct or `dbconfig.maxmemory_pct` is set.",
+    )
     parser.add_argument(
         "--runner-group-member-id",
         type=str,
diff --git a/redisbench_admin/run/common.py b/redisbench_admin/run/common.py
index de2c20d..a4ef93f 100644
--- a/redisbench_admin/run/common.py
+++ b/redisbench_admin/run/common.py
@@ -570,7 +570,121 @@ def check_dbconfig_keyspacelen_requirement(
     return required, keyspacelen, keyspacelen_min
 
 
-def execute_init_commands(benchmark_config, r, dbconfig_keyname="dbconfig"):
+# --- maxmemory cap helpers --------------------------------------------------
+
+GIB = 1024**3
+MAXMEMORY_OS_RESERVE_BYTES = 4 * GIB
+
+
+def _get_maxmemory_pct_from_dbconfig(benchmark_config, dbconfig_keyname="dbconfig"):
+    """Return spec-level `dbconfig.maxmemory_pct` (int), or None if not set.
+
+    Handles both spec formats: `dbconfig` as a dict, and `dbconfig` as a list
+    of single-key dicts.
+    """
+    if dbconfig_keyname not in benchmark_config:
+        return None
+    dbconfig = benchmark_config[dbconfig_keyname]
+    if isinstance(dbconfig, dict):
+        v = dbconfig.get("maxmemory_pct")
+        return int(v) if v is not None else None
+    if isinstance(dbconfig, list):
+        for k in dbconfig:
+            if isinstance(k, dict) and "maxmemory_pct" in k:
+                return int(k["maxmemory_pct"])
+    return None
+
+
+def _resolve_maxmemory_pct(args, benchmark_config, dbconfig_keyname="dbconfig"):
+    """CLI `--no-maxmemory-cap` > CLI `--maxmemory-pct` > spec `dbconfig.maxmemory_pct`.
+
+    Returns the resolved pct (int in (0, 100]) or None to skip the cap.
+    """
+    if getattr(args, "no_maxmemory_cap", False):
+        return None
+    cli_pct = getattr(args, "maxmemory_pct", None)
+    if cli_pct is not None:
+        return cli_pct
+    return _get_maxmemory_pct_from_dbconfig(benchmark_config, dbconfig_keyname)
+
+
+def _compute_maxmemory_cap_bytes(total_system_memory, pct):
+    """Apply pct% cap with a floor that always leaves
+    MAXMEMORY_OS_RESERVE_BYTES (4 GiB) for the OS, when total exceeds it.
+
+    For tiny instances (total <= 4 GiB), only the pct cap applies.
+    """
+    pct_cap = int(total_system_memory * pct / 100)
+    if total_system_memory > MAXMEMORY_OS_RESERVE_BYTES:
+        return min(pct_cap, total_system_memory - MAXMEMORY_OS_RESERVE_BYTES)
+    return pct_cap
+
+
+def apply_maxmemory_cap(r, benchmark_config, args=None, dbconfig_keyname="dbconfig"):
+    """Set Redis maxmemory based on resolved pct of total_system_memory, BEFORE
+    spec init_commands run. Overwrite-with-warn if Redis already has a non-zero
+    maxmemory configured.
+
+    No-op when no source asks for a cap.
+
+    Returns (applied_pct, applied_bytes) or (None, None).
+    """
+    pct = _resolve_maxmemory_pct(args, benchmark_config, dbconfig_keyname)
+    if pct is None:
+        return None, None
+    if not (0 < pct <= 100):
+        logging.warning(
+            "Invalid maxmemory_pct=%s; must be in (0, 100]. Skipping cap.", pct
+        )
+        return None, None
+    try:
+        info = r.info("memory")
+    except Exception as e:
+        logging.warning("Failed to read INFO memory; skipping cap: %s", e)
+        return None, None
+    total = int(info.get("total_system_memory", 0))
+    if total <= 0:
+        logging.warning(
+            "INFO memory.total_system_memory missing/zero; skipping cap"
+        )
+        return None, None
+    cap = _compute_maxmemory_cap_bytes(total, pct)
+    floor_active = total > MAXMEMORY_OS_RESERVE_BYTES
+    try:
+        existing = int(r.config_get("maxmemory").get("maxmemory", 0))
+        if existing > 0 and existing != cap:
+            logging.warning(
+                "Overwriting existing Redis maxmemory=%s with %s (%d%% of "
+                "total_system_memory=%s, 4GiB OS-reserve floor active=%s)",
+                existing,
+                cap,
+                pct,
+                total,
+                floor_active,
+            )
+    except Exception:
+        pass
+    logging.info(
+        "Setting Redis maxmemory=%s (%d%% of total_system_memory=%s, "
+        "4GiB OS-reserve floor active=%s)",
+        cap,
+        pct,
+        total,
+        floor_active,
+    )
+    r.config_set("maxmemory", cap)
+    return pct, cap
+
+
+# ---------------------------------------------------------------------------
+
+
+def execute_init_commands(benchmark_config, r, dbconfig_keyname="dbconfig", args=None):
+    # Apply the optional maxmemory cap BEFORE any spec-defined init_commands
+    # run, so that subsequent FT.CREATE / data load / benchmark all happen
+    # against the capped Redis. No-op unless --maxmemory-pct or
+    # dbconfig.maxmemory_pct is set (and --no-maxmemory-cap isn't).
+    apply_maxmemory_cap(r, benchmark_config, args, dbconfig_keyname)
     cmds = None
     res = 0
     if dbconfig_keyname in benchmark_config:
@@ -751,14 +865,14 @@ def merge_default_and_config_metrics(
     return exporter_timemetric_path, metrics
 
 
-def run_redis_pre_steps(benchmark_config, r, required_modules):
+def run_redis_pre_steps(benchmark_config, r, required_modules, args=None):
     # In case we have modules we use it's artifact version
     # otherwise we use redis version as artifact version
     version = "N/A"
     # run initialization commands before benchmark starts
     logging.info("Running initialization commands before benchmark starts.")
     execute_init_commands_start_time = datetime.datetime.now()
-    execute_init_commands(benchmark_config, r)
+    execute_init_commands(benchmark_config, r, args=args)
     execute_asm_commands(benchmark_config, r)
     execute_init_commands_duration_seconds = (
         datetime.datetime.now() - execute_init_commands_start_time
diff --git a/redisbench_admin/run_local/local_db.py b/redisbench_admin/run_local/local_db.py
index 7bcaab1..35a3d94 100644
--- a/redisbench_admin/run_local/local_db.py
+++ b/redisbench_admin/run_local/local_db.py
@@ -182,7 +182,7 @@ def local_db_spin(
         )
         for conn in redis_conns:
             artifact_version = run_redis_pre_steps(
-                benchmark_config, conn, required_modules
+                benchmark_config, conn, required_modules, args=args
             )
 
     if dataset is None:
@@ -249,7 +249,7 @@ def local_db_spin(
     # Only run pre_steps here if SEARCH_CLUSTERSET is not set (otherwise it was already run before data loading)
     if "SEARCH_CLUSTERSET" not in os.environ:
         artifact_version = run_redis_pre_steps(
-            benchmark_config, redis_conns[0], required_modules
+            benchmark_config, redis_conns[0], required_modules, args=args
         )
 
     return result, artifact_version, cluster_api_enabled, redis_conns, redis_processes
diff --git a/tests/test_common.py b/tests/test_common.py
index 2812c43..e7718e5 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -26,6 +26,12 @@
     common_properties_log,
     execute_init_commands,
     extract_input_file_url_from_parameters,
+    _get_maxmemory_pct_from_dbconfig,
+    _resolve_maxmemory_pct,
+    _compute_maxmemory_cap_bytes,
+    apply_maxmemory_cap,
+    GIB,
+    MAXMEMORY_OS_RESERVE_BYTES,
 )
 from redisbench_admin.run_remote.args import create_run_remote_arguments
 
@@ -754,3 +760,204 @@ def test_extract_input_file_url_from_parameters():
         entry_missing_param_list, "ftsb_redisearch"
     )
     assert result is None
+
+
+# --- maxmemory cap helpers --------------------------------------------------
+
+class _Args:
+    """Minimal stand-in for argparse.Namespace, with optional attrs."""
+
+    def __init__(self, maxmemory_pct=None, no_maxmemory_cap=False):
+        self.maxmemory_pct = maxmemory_pct
+        self.no_maxmemory_cap = no_maxmemory_cap
+
+
+def test_get_maxmemory_pct_from_dbconfig_dict():
+    assert _get_maxmemory_pct_from_dbconfig({}) is None
+    assert _get_maxmemory_pct_from_dbconfig({"dbconfig": {}}) is None
+    assert _get_maxmemory_pct_from_dbconfig({"dbconfig": {"maxmemory_pct": 80}}) == 80
+    # int coercion from yaml-string
+    assert (
+        _get_maxmemory_pct_from_dbconfig({"dbconfig": {"maxmemory_pct": "75"}}) == 75
+    )
+
+
+def test_get_maxmemory_pct_from_dbconfig_list():
+    cfg = {
+        "dbconfig": [
+            {"dataset_name": "x"},
+            {"maxmemory_pct": 80},
+            {"init_commands": []},
+        ]
+    }
+    assert _get_maxmemory_pct_from_dbconfig(cfg) == 80
+    # absent
+    assert (
+        _get_maxmemory_pct_from_dbconfig({"dbconfig": [{"dataset_name": "x"}]}) is None
+    )
+
+
+def test_resolve_maxmemory_pct_precedence():
+    spec = {"dbconfig": {"maxmemory_pct": 50}}
+    # no flags -> spec wins
+    assert _resolve_maxmemory_pct(_Args(), spec) == 50
+    # CLI override -> CLI wins
+    assert _resolve_maxmemory_pct(_Args(maxmemory_pct=80), spec) == 80
+    # CLI no_cap -> None even when spec asks for one
+    assert _resolve_maxmemory_pct(_Args(no_maxmemory_cap=True), spec) is None
+    # CLI no_cap dominates CLI pct too
+    assert (
+        _resolve_maxmemory_pct(_Args(maxmemory_pct=80, no_maxmemory_cap=True), spec)
+        is None
+    )
+    # neither CLI nor spec -> None
+    assert _resolve_maxmemory_pct(_Args(), {}) is None
+
+
+def test_compute_maxmemory_cap_bytes_floor_kicks_in_on_large_instances():
+    # 128 GiB instance, 80% pct
+    total = 128 * GIB
+    pct = 80
+    pct_cap = int(total * 0.8)  # 102.4 GiB
+    floor_cap = total - MAXMEMORY_OS_RESERVE_BYTES  # 124 GiB
+    cap = _compute_maxmemory_cap_bytes(total, pct)
+    # floor doesn't bite here -> use pct cap
+    assert cap == min(pct_cap, floor_cap)
+    assert cap == pct_cap
+
+
+def test_compute_maxmemory_cap_bytes_floor_caps_high_pct_on_smaller_box():
+    # 8 GiB instance, 90% pct
+    total = 8 * GIB
+    pct = 90
+    pct_cap = int(total * 0.9)  # 7.2 GiB
+    floor_cap = total - MAXMEMORY_OS_RESERVE_BYTES  # 4 GiB
+    cap = _compute_maxmemory_cap_bytes(total, pct)
+    # 7.2 GiB > 4 GiB floor -> floor wins
+    assert cap == floor_cap
+    assert cap == 4 * GIB
+
+
+def test_compute_maxmemory_cap_bytes_no_floor_for_tiny_instances():
+    # 2 GiB instance, 50% pct -> floor bypassed (would yield negative)
+    total = 2 * GIB
+    pct = 50
+    cap = _compute_maxmemory_cap_bytes(total, pct)
+    assert cap == int(total * 0.5)
+
+
+class _MockRedis:
+    """Just enough of a redis client for apply_maxmemory_cap."""
+
+    def __init__(self, total_system_memory=None, existing_maxmemory=0, info_raises=False):
+        self._total = total_system_memory
+        self._existing = existing_maxmemory
+        self._info_raises = info_raises
+        self.config_set_calls = []
+
+    def info(self, _section):
+        if self._info_raises:
+            raise RuntimeError("boom")
+        if self._total is None:
+            return {}
+        return {"total_system_memory": self._total}
+
+    def config_get(self, _key):
+        return {"maxmemory": self._existing}
+
+    def config_set(self, key, value):
+        self.config_set_calls.append((key, value))
+
+
+def test_apply_maxmemory_cap_no_op_without_opt_in():
+    r = _MockRedis(total_system_memory=128 * GIB)
+    # No CLI flags, no spec field
+    pct, cap = apply_maxmemory_cap(r, {}, args=_Args())
+    assert pct is None and cap is None
+    assert r.config_set_calls == []
+
+
+def test_apply_maxmemory_cap_no_op_when_cli_disables():
+    spec = {"dbconfig": {"maxmemory_pct": 80}}
+    r = _MockRedis(total_system_memory=128 * GIB)
+    pct, cap = apply_maxmemory_cap(r, spec, args=_Args(no_maxmemory_cap=True))
+    assert pct is None and cap is None
+    assert r.config_set_calls == []
+
+
+def test_apply_maxmemory_cap_sets_from_spec_field():
+    spec = {"dbconfig": {"maxmemory_pct": 80}}
+    r = _MockRedis(total_system_memory=128 * GIB)
+    pct, cap = apply_maxmemory_cap(r, spec, args=_Args())
+    assert pct == 80
+    assert cap == int(128 * GIB * 0.8)
+    assert r.config_set_calls == [("maxmemory", cap)]
+
+
+def test_apply_maxmemory_cap_floor_wins_on_smaller_box():
+    spec = {"dbconfig": {"maxmemory_pct": 95}}
+    r = _MockRedis(total_system_memory=8 * GIB)
+    pct, cap = apply_maxmemory_cap(r, spec, args=_Args())
+    assert pct == 95
+    # 95% of 8 GiB > 8 GiB - 4 GiB -> floor wins
+    assert cap == 4 * GIB
+
+
+def test_apply_maxmemory_cap_skips_on_invalid_pct():
+    spec = {"dbconfig": {"maxmemory_pct": 0}}  # invalid, must be > 0
+    r = _MockRedis(total_system_memory=128 * GIB)
+    pct, cap = apply_maxmemory_cap(r, spec, args=_Args())
+    assert pct is None and cap is None
+    assert r.config_set_calls == []
+
+
+def test_apply_maxmemory_cap_skips_when_info_unavailable():
+    spec = {"dbconfig": {"maxmemory_pct": 80}}
+    r = _MockRedis(info_raises=True)
+    pct, cap = apply_maxmemory_cap(r, spec, args=_Args())
+    assert pct is None and cap is None
+    assert r.config_set_calls == []
+
+
+def test_apply_maxmemory_cap_overwrites_existing_with_warn(caplog=None):
+    """When Redis already has a non-zero maxmemory, a WARNING is logged but
+    the new cap is applied regardless."""
+    import logging as _logging
+
+    spec = {"dbconfig": {"maxmemory_pct": 80}}
+    r = _MockRedis(total_system_memory=128 * GIB, existing_maxmemory=10 * GIB)
+    with _MockLogHandler() as h:
+        apply_maxmemory_cap(r, spec, args=_Args())
+    new_cap = int(128 * GIB * 0.8)
+    assert r.config_set_calls == [("maxmemory", new_cap)]
+    assert any("Overwriting existing Redis maxmemory" in m for m in h.messages)
+
+
+class _MockLogHandler:
+    """Capture log records into self.messages."""
+
+    def __init__(self):
+        import logging as _logging
+
+        self._logging = _logging
+        self.messages = []
+
+    def __enter__(self):
+        class _Handler(self._logging.Handler):
+            def __init__(_self_inner, outer):  # noqa: N804
+                self._logging.Handler.__init__(_self_inner)
+                _self_inner.outer = outer
+
+            def emit(_self_inner, record):  # noqa: N804
+                _self_inner.outer.messages.append(record.getMessage())
+
+        self._handler = _Handler(self)
+        self._logger = self._logging.getLogger()
+        self._previous_level = self._logger.level
+        self._logger.setLevel(self._logging.DEBUG)
+        self._logger.addHandler(self._handler)
+        return self
+
+    def __exit__(self, *_):
+        self._logger.removeHandler(self._handler)
+        self._logger.setLevel(self._previous_level)