From 611c6bde0ef9b8381591a87d8edec985f0bf428f Mon Sep 17 00:00:00 2001 From: lerman25 Date: Mon, 18 May 2026 13:33:46 +0300 Subject: [PATCH] Make RedisTimeSeries metrics export resilient to transient errors The post-benchmark push to the RedisTimeSeries sidecar has been failing intermittently with 'Connection reset by peer' (Errno 104) and similar transport-level errors. The benchmarks themselves complete successfully, but the metrics export raises and the whole run is marked as failed. This makes the export resilient in three coordinated ways: 1. run_remote/run_remote.py - the 'rts' client used for the post-test metrics push now configures an explicit redis-py Retry policy with exponential backoff (5 retries, ExponentialBackoff(cap=10, base=1)) and reasonable socket timeouts, retrying on ConnectionError and TimeoutError. The previous 'retry_on_timeout=True' alone is a no-op without a Retry object attached, so the client effectively had zero retries. 2. run_remote/run_remote.py - the post-test exception handler now catches TimeoutError in addition to ConnectionError (both are surfaced once the retry layer is exhausted) and gates the fatal 'raise Exception(...)' behind a new opt-in flag '--continue-on-redistimeseries-export-error' (also settable via CONTINUE_ON_RTS_EXPORT_ERROR=1). Default behaviour is unchanged - the call still raises - so this is backward compatible. 3. utils/remote.py - push_data_to_redistimeseries already swallowed TimeoutError per timeseries; it now also swallows ConnectionError so a single transient hiccup mid-batch only increments the error counter instead of aborting the push of the remaining timeseries in the dict. The added CLI flag lives on common_run_args so it is inherited by all sub-commands that already accept --push_results_redistimeseries. 
--- redisbench_admin/run/args.py | 11 +++++++ redisbench_admin/run_remote/run_remote.py | 40 ++++++++++++++++++----- redisbench_admin/utils/remote.py | 6 ++++ 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/redisbench_admin/run/args.py b/redisbench_admin/run/args.py index 1dafde7..8c4fd0a 100644 --- a/redisbench_admin/run/args.py +++ b/redisbench_admin/run/args.py @@ -208,6 +208,17 @@ def common_run_args(parser): action="store_true", help="uploads the results to RedisTimeSeries. Proper credentials are required", ) + parser.add_argument( + "--continue-on-redistimeseries-export-error", + default=bool(int(os.getenv("CONTINUE_ON_RTS_EXPORT_ERROR", "0"))), + action="store_true", + help=( + "Treat transient RedisTimeSeries metrics-export failures (connection " + "reset, timeout) as warnings instead of fatal errors. The benchmark " + "itself is unaffected; only the post-run metrics push is best-effort. " + "Can also be enabled via CONTINUE_ON_RTS_EXPORT_ERROR=1." + ), + ) parser.add_argument( "--collect_commandstats", type=bool, diff --git a/redisbench_admin/run_remote/run_remote.py b/redisbench_admin/run_remote/run_remote.py index 5887db0..bae7a0f 100644 --- a/redisbench_admin/run_remote/run_remote.py +++ b/redisbench_admin/run_remote/run_remote.py @@ -8,6 +8,8 @@ import sys import traceback import redis +from redis.backoff import ExponentialBackoff +from redis.retry import Retry import pytablewriter from pytablewriter import MarkdownTableWriter import redisbench_admin.run.metrics @@ -374,11 +376,21 @@ def run_remote_command_logic(args, project_name, project_version): args.redistimeseries_host, args.redistimeseries_port ) ) + # Resilient client: redis-py transparently retries transient + # connection/timeout errors (e.g. TCP RST from the RTS sidecar). 
+ rts_retry = Retry(ExponentialBackoff(cap=10, base=1), retries=5) rts = redis.Redis( host=args.redistimeseries_host, port=args.redistimeseries_port, password=args.redistimeseries_pass, - retry_on_timeout=True, + socket_timeout=30, + socket_connect_timeout=10, + health_check_interval=30, + retry=rts_retry, + retry_on_error=[ + redis.exceptions.ConnectionError, + redis.exceptions.TimeoutError, + ], ) rts.ping() @@ -1343,11 +1355,12 @@ def run_remote_command_logic(args, project_name, project_version): expire_ms, ) except ( - redis.exceptions.ConnectionError + redis.exceptions.ConnectionError, + redis.exceptions.TimeoutError, ) as e: logging.error( - "RedisTimeSeries connection error while pushing metrics for test '%s' " - "(setup: '%s', branch: '%s'). The benchmark itself completed successfully, " + "RedisTimeSeries error while pushing metrics for test '%s' " + "(setup: '%s', branch: '%s') after client-side retries exhausted. The benchmark itself completed successfully, " "but the metrics export to RedisTimeSeries failed: %s", test_name, setup_name, @@ -1368,12 +1381,21 @@ def run_remote_command_logic(args, project_name, project_version): username, ) return_code |= 1 - raise Exception( - "Failed to push metrics to RedisTimeSeries for test '{}'. " - "The benchmark ran successfully but the post-benchmark metrics export failed: {}".format( - test_name, e + if getattr( + args, + "continue_on_redistimeseries_export_error", + False, + ): + logging.warning( + "--continue-on-redistimeseries-export-error is set; continuing with next test." + ) + else: + raise Exception( + "Failed to push metrics to RedisTimeSeries for test '{}'. 
" + "The benchmark ran successfully but the post-benchmark metrics export failed: {}".format( + test_name, e + ) ) - ) # run post commands after benchmark completes and # end-of-benchmark metrics have been collected and diff --git a/redisbench_admin/utils/remote.py b/redisbench_admin/utils/remote.py index 6fab71c..d14041e 100644 --- a/redisbench_admin/utils/remote.py +++ b/redisbench_admin/utils/remote.py @@ -838,6 +838,12 @@ def push_data_to_redistimeseries(rts, time_series_dict: dict, expire_msecs=0): f"Error while working in timeseries named {timeseries_name}. " ) datapoint_errors += 1 + except redis.exceptions.ConnectionError as e: + logging.error( + "Connection error while working on timeseries named %s after client-side retries: %s", + timeseries_name, e, + ) + datapoint_errors += 1 progress.update() return datapoint_errors, datapoint_inserts