From 611c6bde0ef9b8381591a87d8edec985f0bf428f Mon Sep 17 00:00:00 2001
From: lerman25 <lerman25@gmail.com>
Date: Mon, 18 May 2026 13:33:46 +0300
Subject: [PATCH] Make RedisTimeSeries metrics export resilient to transient
 errors

The post-benchmark push to the RedisTimeSeries sidecar has been failing
intermittently with 'Connection reset by peer' (Errno 104) and similar
transport-level errors. The benchmarks themselves complete successfully,
but the metrics export raises and the whole run is marked as failed.

This makes the export resilient in three coordinated ways:

1. run_remote/run_remote.py - the 'rts' client used for the post-test
   metrics push now configures an explicit redis-py Retry policy with
   exponential backoff (5 retries, ExponentialBackoff(cap=10, base=1))
   plus explicit socket settings (socket_timeout=30,
   socket_connect_timeout=10, health_check_interval=30), retrying on
   ConnectionError and TimeoutError. The previous
   'retry_on_timeout=True' on its own only covers TimeoutError (at most
   one immediate retry, no backoff) and never ConnectionError, so for
   the 'Connection reset by peer' failures seen here the client
   effectively had zero retries.

2. run_remote/run_remote.py - the post-test exception handler now
   catches TimeoutError in addition to ConnectionError (either can
   surface once the retry layer is exhausted) and gates the fatal
   'raise Exception(...)' behind a new opt-in flag
   '--continue-on-redistimeseries-export-error' (also settable via
   CONTINUE_ON_RTS_EXPORT_ERROR=1; the value is parsed as an integer,
   so any non-zero integer enables it). Default behaviour is
   unchanged - the call still raises - so this is backward
   compatible. With or without the flag, the handler still sets
   'return_code |= 1'.

3. utils/remote.py - push_data_to_redistimeseries already swallowed
   TimeoutError per timeseries; it now also swallows ConnectionError so
   a single transient hiccup mid-batch only increments the error
   counter (datapoint_errors) and moves on to the next timeseries
   instead of aborting the push of the whole dict.

The added CLI flag lives on common_run_args so it is accepted by all
sub-commands that already accept --push_results_redistimeseries; in
this patch only run_remote.py acts on it.
---
 redisbench_admin/run/args.py              | 11 +++++++
 redisbench_admin/run_remote/run_remote.py | 40 ++++++++++++++++++-----
 redisbench_admin/utils/remote.py          |  6 ++++
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/redisbench_admin/run/args.py b/redisbench_admin/run/args.py
index 1dafde7..8c4fd0a 100644
--- a/redisbench_admin/run/args.py
+++ b/redisbench_admin/run/args.py
@@ -208,6 +208,17 @@ def common_run_args(parser):
         action="store_true",
         help="uploads the results to RedisTimeSeries. Proper credentials are required",
     )
+    parser.add_argument(
+        "--continue-on-redistimeseries-export-error",
+        default=bool(int(os.getenv("CONTINUE_ON_RTS_EXPORT_ERROR", "0"))),
+        action="store_true",
+        help=(
+            "Treat transient RedisTimeSeries metrics-export failures (connection "
+            "reset, timeout) as warnings instead of fatal errors. The benchmark "
+            "itself is unaffected; only the post-run metrics push is best-effort. "
+            "Can also be enabled via CONTINUE_ON_RTS_EXPORT_ERROR=1."
+        ),
+    )
     parser.add_argument(
         "--collect_commandstats",
         type=bool,
diff --git a/redisbench_admin/run_remote/run_remote.py b/redisbench_admin/run_remote/run_remote.py
index 5887db0..bae7a0f 100644
--- a/redisbench_admin/run_remote/run_remote.py
+++ b/redisbench_admin/run_remote/run_remote.py
@@ -8,6 +8,8 @@
 import sys
 import traceback
 import redis
+from redis.backoff import ExponentialBackoff
+from redis.retry import Retry
 import pytablewriter
 from pytablewriter import MarkdownTableWriter
 import redisbench_admin.run.metrics
@@ -374,11 +376,21 @@ def run_remote_command_logic(args, project_name, project_version):
                 args.redistimeseries_host, args.redistimeseries_port
             )
         )
+        # Resilient client: redis-py transparently retries transient
+        # connection/timeout errors (e.g. TCP RST from the RTS sidecar).
+        rts_retry = Retry(ExponentialBackoff(cap=10, base=1), retries=5)
         rts = redis.Redis(
             host=args.redistimeseries_host,
             port=args.redistimeseries_port,
             password=args.redistimeseries_pass,
-            retry_on_timeout=True,
+            socket_timeout=30,
+            socket_connect_timeout=10,
+            health_check_interval=30,
+            retry=rts_retry,
+            retry_on_error=[
+                redis.exceptions.ConnectionError,
+                redis.exceptions.TimeoutError,
+            ],
         )
         rts.ping()
 
@@ -1343,11 +1355,12 @@ def run_remote_command_logic(args, project_name, project_version):
                                                         expire_ms,
                                                     )
                                             except (
-                                                redis.exceptions.ConnectionError
+                                                redis.exceptions.ConnectionError,
+                                                redis.exceptions.TimeoutError,
                                             ) as e:
                                                 logging.error(
-                                                    "RedisTimeSeries connection error while pushing metrics for test '%s' "
-                                                    "(setup: '%s', branch: '%s'). The benchmark itself completed successfully, "
+                                                    "RedisTimeSeries error while pushing metrics for test '%s' "
+                                                    "(setup: '%s', branch: '%s') after client-side retries exhausted. The benchmark itself completed successfully, "
                                                     "but the metrics export to RedisTimeSeries failed: %s",
                                                     test_name,
                                                     setup_name,
@@ -1368,12 +1381,21 @@ def run_remote_command_logic(args, project_name, project_version):
                                                     username,
                                                 )
                                                 return_code |= 1
-                                                raise Exception(
-                                                    "Failed to push metrics to RedisTimeSeries for test '{}'. "
-                                                    "The benchmark ran successfully but the post-benchmark metrics export failed: {}".format(
-                                                        test_name, e
+                                                if getattr(
+                                                    args,
+                                                    "continue_on_redistimeseries_export_error",
+                                                    False,
+                                                ):
+                                                    logging.warning(
+                                                        "--continue-on-redistimeseries-export-error is set; continuing with next test."
+                                                    )
+                                                else:
+                                                    raise Exception(
+                                                        "Failed to push metrics to RedisTimeSeries for test '{}'. "
+                                                        "The benchmark ran successfully but the post-benchmark metrics export failed: {}".format(
+                                                            test_name, e
+                                                        )
                                                     )
-                                                )
 
                                         # run post commands after benchmark completes and
                                         # end-of-benchmark metrics have been collected and
diff --git a/redisbench_admin/utils/remote.py b/redisbench_admin/utils/remote.py
index 6fab71c..d14041e 100644
--- a/redisbench_admin/utils/remote.py
+++ b/redisbench_admin/utils/remote.py
@@ -838,6 +838,12 @@ def push_data_to_redistimeseries(rts, time_series_dict: dict, expire_msecs=0):
                     f"Error while working in timeseries named {timeseries_name}. "
                 )
                 datapoint_errors += 1
+            except redis.exceptions.ConnectionError as e:
+                logging.error(
+                    "Connection error while working on timeseries named %s after client-side retries: %s",
+                    timeseries_name, e,
+                )
+                datapoint_errors += 1
             progress.update()
     return datapoint_errors, datapoint_inserts