Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions redisbench_admin/run/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,17 @@ def common_run_args(parser):
action="store_true",
help="uploads the results to RedisTimeSeries. Proper credentials are required",
)
parser.add_argument(
"--continue-on-redistimeseries-export-error",
default=bool(int(os.getenv("CONTINUE_ON_RTS_EXPORT_ERROR", "0"))),
action="store_true",
help=(
"Treat transient RedisTimeSeries metrics-export failures (connection "
"reset, timeout) as warnings instead of fatal errors. The benchmark "
"itself is unaffected; only the post-run metrics push is best-effort. "
"Can also be enabled via CONTINUE_ON_RTS_EXPORT_ERROR=1."
),
)
parser.add_argument(
"--collect_commandstats",
type=bool,
Expand Down
40 changes: 31 additions & 9 deletions redisbench_admin/run_remote/run_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import sys
import traceback
import redis
from redis.backoff import ExponentialBackoff
from redis.retry import Retry
import pytablewriter
from pytablewriter import MarkdownTableWriter
import redisbench_admin.run.metrics
Expand Down Expand Up @@ -374,11 +376,21 @@ def run_remote_command_logic(args, project_name, project_version):
args.redistimeseries_host, args.redistimeseries_port
)
)
# Resilient client: redis-py transparently retries transient
# connection/timeout errors (e.g. a TCP reset (RST) sent by the
# RedisTimeSeries sidecar).
rts_retry = Retry(ExponentialBackoff(cap=10, base=1), retries=5)
rts = redis.Redis(
host=args.redistimeseries_host,
port=args.redistimeseries_port,
password=args.redistimeseries_pass,
retry_on_timeout=True,
socket_timeout=30,
socket_connect_timeout=10,
health_check_interval=30,
retry=rts_retry,
retry_on_error=[
redis.exceptions.ConnectionError,
redis.exceptions.TimeoutError,
],
)
rts.ping()

Expand Down Expand Up @@ -1343,11 +1355,12 @@ def run_remote_command_logic(args, project_name, project_version):
expire_ms,
)
except (
redis.exceptions.ConnectionError
redis.exceptions.ConnectionError,
redis.exceptions.TimeoutError,
) as e:
logging.error(
"RedisTimeSeries connection error while pushing metrics for test '%s' "
"(setup: '%s', branch: '%s'). The benchmark itself completed successfully, "
"RedisTimeSeries error while pushing metrics for test '%s' "
"(setup: '%s', branch: '%s') after client-side retries exhausted. The benchmark itself completed successfully, "
"but the metrics export to RedisTimeSeries failed: %s",
test_name,
setup_name,
Expand All @@ -1368,12 +1381,21 @@ def run_remote_command_logic(args, project_name, project_version):
username,
)
return_code |= 1
raise Exception(
"Failed to push metrics to RedisTimeSeries for test '{}'. "
"The benchmark ran successfully but the post-benchmark metrics export failed: {}".format(
test_name, e
if getattr(
args,
"continue_on_redistimeseries_export_error",
False,
):
logging.warning(
"--continue-on-redistimeseries-export-error is set; continuing with next test."
)
else:
raise Exception(
"Failed to push metrics to RedisTimeSeries for test '{}'. "
"The benchmark ran successfully but the post-benchmark metrics export failed: {}".format(
test_name, e
)
)
)

# run post commands after benchmark completes and
# end-of-benchmark metrics have been collected and
Expand Down
6 changes: 6 additions & 0 deletions redisbench_admin/utils/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,12 @@ def push_data_to_redistimeseries(rts, time_series_dict: dict, expire_msecs=0):
f"Error while working in timeseries named {timeseries_name}. "
)
datapoint_errors += 1
except redis.exceptions.ConnectionError as e:
logging.error(
"Connection error while working on timeseries named %s after client-side retries: %s",
timeseries_name, e,
)
datapoint_errors += 1
progress.update()
return datapoint_errors, datapoint_inserts

Expand Down