From bfbba15928aa6a91b3e4b8943e0cad16199d9d48 Mon Sep 17 00:00:00 2001 From: Pedro Cunha Date: Sun, 24 May 2026 17:21:53 +0100 Subject: [PATCH 1/2] drivers/libusb1.c: use explicit context; drop per-close libusb_exit() [#598] The original code called libusb_init(NULL) on every nut_libusb_open() and libusb_exit(NULL) on every nut_libusb_close(), bumping and decrementing the default context's refcount. That violates the documented contract for libusb_exit, which "should be called after closing all open devices and before your application terminates", and on certain firmware it wedges indefinitely: when libusb_exit hits the 1->0 transition it tears the context down, which waits on libusb's internal sync primitives for outstanding URBs to drain. URBs orphaned by libusb_reset_device or by an unexpected device disconnect never drain, so the wait never returns. The deadlock is reachable from any reconnect path. It was first recognized by 4f84b7f92 ("don't libusb_exit() when closing a previously opened device"), which dropped the call in nut_libusb_open()'s rematch loop. The fallthrough reconnect path in qx_command and the rest of the driver lifecycle still hit it. With the companion change in nutdrv_qx.c (a4813f868) that escalates persistent LIBUSB_ERROR_OVERFLOW to libusb_reset_device, the deadlock window opens on every escalation and was hit reproducibly on the 0665:5161 Cypress USB-serial bridge family (Salicru SPS, Ippon, ViewPower, Voltronic Power UPSes; see #598, #993, #2453). Switch to an explicit libusb context owned end-to-end: - Initialize once on the first nut_libusb_open() and register a matching one-shot atexit handler. - Remove libusb_exit() from nut_libusb_close(); the close path now only calls libusb_close(), which is non-blocking. - Replace libusb_get_device_list(NULL, ...) with the explicit context so the default context is never touched. - Bound the atexit libusb_exit() with a SIGALRM-based 2s timeout (POSIX) so a deadlock-prone teardown at shutdown can't wedge supervisor stop sequences (e.g. systemctl stop). Sub-millisecond on a healthy context. Bump USB_DRIVER_VERSION from 0.53 to 0.54. Validated on a Salicru SPS 1500 ONE BL (0665:5161) by authorize-toggle stress reproduction (8 cycles, clean reconnect each time, same driver PID throughout) and by a natural OVERFLOW event captured in soak (counter-gated logic at 1/3, next poll succeeded, no escalation; driver PID stable for 22h+). Signed-off-by: Pedro Cunha --- drivers/libusb1.c | 103 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 7 deletions(-) diff --git a/drivers/libusb1.c b/drivers/libusb1.c index 955ada91bd..b02da58a2f 100644 --- a/drivers/libusb1.c +++ b/drivers/libusb1.c @@ -40,9 +40,14 @@ #include "usb-common.h" #include "nut_libusb.h" #include "nut_stdint.h" +#include "nut_bool.h" + +#ifndef WIN32 +# include /* sigaction(), SIGALRM for the atexit watchdog */ +#endif #define USB_DRIVER_NAME "USB communication driver (libusb 1.0)" -#define USB_DRIVER_VERSION "0.53" +#define USB_DRIVER_VERSION "0.54" /* driver description structure */ upsdrv_info_t comm_upsdrv_info = { @@ -55,6 +60,77 @@ upsdrv_info_t comm_upsdrv_info = { #define MAX_REPORT_SIZE 0x1800 +/* Explicit libusb context, initialized once on the first call to + * nut_libusb_open() and released exactly once at process shutdown via + * the atexit handler below. + * + * Background: the original code called libusb_init(NULL) on every open and + * libusb_exit(NULL) on every close, bumping/decrementing the default + * context's refcount. That pattern violates the documented contract for + * libusb_exit (which "should be called after closing all open devices and + * before your application terminates"), and on certain firmware can wedge + * indefinitely: when libusb_exit hits the 1->0 transition it tears the + * context down, which waits on its internal sync primitives for URBs to + * drain -- but URBs orphaned by libusb_reset_device or by an unexpected + * USB device disconnect never drain, so the wait never returns. That + * deadlock has been observed on the 0665:5161 Cypress USB-serial bridge + * family (Salicru SPS, Ippon, ViewPower, various Voltronic Power UPSes; + * see NUT issues #598, #993, #2453) but the underlying bug class is + * generic and reachable via any reconnect path. + * + * The fix is to use an explicit context that we own end-to-end: + * initialize it once at first open, never call libusb_exit during runtime + * (close_dev no longer does), and release it exactly once at process + * shutdown via atexit. A SIGALRM-bounded timeout on the atexit libusb_exit + * keeps a deadlock-prone teardown from wedging supervisor shutdowns + * (e.g. systemctl stop) if it ever happens at exit time. Generalizes the + * 2018 partial fix in commit 4f84b7f92 ("don't libusb_exit() when closing + * a previously opened device"). + */ +static libusb_context *nut_usb_ctx = NULL; +static nut_bool_t nut_usb_ctx_initialized = false; + +#ifndef WIN32 +__attribute__((noreturn)) +static void nut_libusb_cleanup_alarm(int sig) +{ + NUT_UNUSED_VARIABLE(sig); + /* libusb_exit is taking too long, almost certainly waiting on URBs + * orphaned by a recent device reset / disconnect that will never + * drain. Force the process down so supervisor shutdown isn't blocked. + */ + _exit(0); +} +#endif /* !WIN32 */ + +/* Release the libusb context that nut_libusb_open() initialized. + * Bounded by a SIGALRM (POSIX) so deadlock-prone teardown can't wedge + * process shutdown indefinitely. + */ +static void nut_libusb_cleanup_atexit(void) +{ + if (!nut_usb_ctx_initialized || !nut_usb_ctx) { + return; + } +#ifndef WIN32 + { + struct sigaction sa, prev; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = nut_libusb_cleanup_alarm; + sigemptyset(&sa.sa_mask); + sigaction(SIGALRM, &sa, &prev); + alarm(2); /* sub-ms on healthy context; 2s is generous */ + libusb_exit(nut_usb_ctx); + alarm(0); + sigaction(SIGALRM, &prev, NULL); + } +#else /* WIN32 */ + libusb_exit(nut_usb_ctx); +#endif /* !WIN32 */ + nut_usb_ctx = NULL; + nut_usb_ctx_initialized = false; +} + static void nut_libusb_close(libusb_device_handle *udev); /*! Add USB-related driver variables with addvar() and dstate_setinfo(). @@ -291,10 +367,18 @@ static int nut_libusb_open(libusb_device_handle **udevp, usb_hid_number_opts_parsed = 1; } - /* libusb base init */ - if (libusb_init(NULL) < 0) { - libusb_exit(NULL); - fatal_with_errno(EXIT_FAILURE, "Failed to init libusb 1.0"); + /* libusb base init: initialize our explicit context on the first call + * here, and register the matching one-shot teardown via atexit. See the + * comment near the file-scope declaration of nut_usb_ctx for the design + * rationale (avoids the libusb_exit-per-reconnect misuse and the + * teardown-deadlock it enables on some firmware). + */ + if (!nut_usb_ctx_initialized) { + if (libusb_init(&nut_usb_ctx) < 0) { + fatal_with_errno(EXIT_FAILURE, "Failed to init libusb 1.0"); + } + nut_usb_ctx_initialized = true; + atexit(nut_libusb_cleanup_atexit); } /* TODO: Find a place for this, from Windows branch made for libusb0.c */ @@ -312,7 +396,7 @@ static int nut_libusb_open(libusb_device_handle **udevp, libusb_close(*udevp); #endif - devcount = libusb_get_device_list(NULL, &devlist); + devcount = libusb_get_device_list(nut_usb_ctx, &devlist); /* devcount may be < 0, loop will get skipped; * its SSIZE_MAX < SIZE_MAX for devnum */ @@ -1166,7 +1250,12 @@ static void nut_libusb_close(libusb_device_handle *udev) */ /* libusb_release_interface(udev, usb_subdriver.hid_rep_index); */ libusb_close(udev); - libusb_exit(NULL); + /* libusb_exit() is no longer called here. The libusb context is owned + * by nut_libusb_open() and released exactly once at process shutdown + * via the atexit handler; calling it per-close was a longstanding + * misuse that deadlocked on orphaned URBs after a device reset or + * disconnect (NUT #598). See the comment near nut_usb_ctx above. + */ } usb_communication_subdriver_t usb_subdriver = { From 9eb4ed461a3f5fa8c946fbebe0563296977ea0f9 Mon Sep 17 00:00:00 2001 From: Pedro Cunha Date: Wed, 20 May 2026 23:05:57 +0100 Subject: [PATCH 2/2] drivers/nutdrv_qx.c: qx_command(): reset USB device after persistent LIBUSB_ERROR_OVERFLOW [#598] LIBUSB_ERROR_OVERFLOW has been handled as a benign, retry-on-next-poll condition (grouped with LIBUSB_ERROR_TIMEOUT and the default case) ever since the original blazer import. For most devices a one-off oversized interrupt-IN frame is indeed transient. But some Cypress USB-serial bridge firmware (VID:PID 0665:5161; Salicru SPS, Ippon, ViewPower and various Voltronic Power UPSes) wedges the endpoint once it overruns: every subsequent interrupt read returns OVERFLOW and the driver spins in the stale-data loop until an external USB-level reset. This is the chronic hang reported in issue #598. Distinguish the two cases with a small consecutive-overflow counter: the first QX_USB_OVERFLOW_RESET_TRIES-1 overflows are retried on the next poll (so genuine transients cost nothing but a skipped cycle), and a sustained run escalates to usb_reset() + reconnect, reusing the recovery path already used for PIPE/ETIME/IO errors. Any clean read resets the counter. Captured on a Salicru SPS 1500 ONE BL: an isolated overflow recovers on the next poll, while a sustained lockup (dozens of consecutive overflows within seconds) is cleared only by the device reset, matching the behaviour of an external `usb_resetter --reset-device`. Signed-off-by: Pedro Cunha --- drivers/nutdrv_qx.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/nutdrv_qx.c b/drivers/nutdrv_qx.c index c036866647..0676f4d6dc 100644 --- a/drivers/nutdrv_qx.c +++ b/drivers/nutdrv_qx.c @@ -3881,10 +3881,23 @@ void upsdrv_cleanup(void) /* Generic command processing function: send a command and read a reply. * Returns < 0 on error, 0 on timeout and the number of bytes read on success. */ +/* Consecutive LIBUSB_ERROR_OVERFLOW results on the interrupt-IN endpoint + * tolerated before escalating from "retry on the next poll" to a USB-level + * device reset. A one-off oversized frame is transient; a sustained run means + * the bridge firmware has wedged the endpoint and only re-enumeration clears + * it (e.g. the 0665:5161 Cypress USB-serial family: Salicru SPS, Ippon, + * ViewPower, various Voltronic Power). See NUT issue #598. */ +#define QX_USB_OVERFLOW_RESET_TRIES 3 + static ssize_t qx_command(const char *cmd, size_t cmdlen, char *buf, size_t buflen) { #ifndef TESTING ssize_t ret = -1; +# ifdef QX_USB + /* Persists across calls; only consecutive overflows accumulate (any clean + * read zeroes it, see the switch on `ret` below). */ + static int overflow_tries = 0; +# endif #endif /* NOTE: Could not find in which ifdef-ed codepath, but clang complained @@ -3918,6 +3931,7 @@ static ssize_t qx_command(const char *cmd, size_t cmdlen, char *buf, size_t bufl ret = (*subdriver_command)(cmd, cmdlen, buf, buflen); if (ret >= 0) { + overflow_tries = 0; /* clean read: forget any overflow streak */ return ret; } @@ -3979,8 +3993,26 @@ static ssize_t qx_command(const char *cmd, size_t cmdlen, char *buf, size_t bufl udev = NULL; break; + case LIBUSB_ERROR_OVERFLOW: /* Value too large for defined data type: + * an oversized interrupt-IN frame. A one-off is + * transient (retry on the next poll); a sustained + * run means the bridge firmware has wedged the + * endpoint and only a USB-level reset recovers it. + * See NUT issue #598. */ + if (++overflow_tries < QX_USB_OVERFLOW_RESET_TRIES) { + upsdebugx(2, "Got OVERFLOW on EP 0x81 (%d/%d), retrying on next poll", + overflow_tries, QX_USB_OVERFLOW_RESET_TRIES); + break; + } + upsdebugx(1, "OVERFLOW on EP 0x81 persisted for %d polls; resetting device", + overflow_tries); + overflow_tries = 0; + if (usb_reset(udev) == 0) { + upsdebugx(1, "Device reset handled"); + } + goto fallthrough_case_reconnect; + case LIBUSB_ERROR_TIMEOUT: /* Connection timed out */ - case LIBUSB_ERROR_OVERFLOW: /* Value too large for defined data type */ #if EPROTO && WITH_LIBUSB_0_1 /* limit to libusb 0.1 implementation */ case -EPROTO: /* Protocol error */ #endif