From bfbba15928aa6a91b3e4b8943e0cad16199d9d48 Mon Sep 17 00:00:00 2001
From: Pedro Cunha <pedroagracio+nut@gmail.com>
Date: Sun, 24 May 2026 17:21:53 +0100
Subject: [PATCH 1/2] drivers/libusb1.c: use explicit context; drop per-close
 libusb_exit() [#598]

The original code called libusb_init(NULL) on every nut_libusb_open() and
libusb_exit(NULL) on every nut_libusb_close(), bumping and decrementing
the default context's refcount. That violates the documented contract for
libusb_exit, which "should be called after closing all open devices and
before your application terminates", and on certain firmware it wedges
indefinitely: when libusb_exit hits the 1->0 transition it tears the
context down, which waits on libusb's internal sync primitives for
outstanding URBs to drain. URBs orphaned by libusb_reset_device or by an
unexpected device disconnect never drain, so the wait never returns.

The deadlock is reachable from any reconnect path. It was first
recognized by 4f84b7f92 ("don't libusb_exit() when closing a previously
opened device"), which dropped the call in nut_libusb_open()'s rematch
loop. The fallthrough reconnect path in qx_command and the rest of the
driver lifecycle still hit it. With the companion change in nutdrv_qx.c
(a4813f868) that escalates persistent LIBUSB_ERROR_OVERFLOW to
libusb_reset_device, the deadlock window opens on every escalation and
was hit reproducibly on the 0665:5161 Cypress USB-serial bridge family
(Salicru SPS, Ippon, ViewPower, Voltronic Power UPSes; see #598, #993,
#2453).

Switch to an explicit libusb context owned end-to-end:
  - Initialize once on the first nut_libusb_open() and register a
    matching one-shot atexit handler.
  - Remove libusb_exit() from nut_libusb_close(); the close path now
    only calls libusb_close(), which is non-blocking.
  - Replace libusb_get_device_list(NULL, ...) with the explicit context
    so the default context is never touched.
  - Bound the atexit libusb_exit() with a SIGALRM-based 2s timeout
    (POSIX) so a deadlock-prone teardown at shutdown can't wedge
    supervisor stop sequences (e.g. systemctl stop). Sub-millisecond on
    a healthy context.

Bump USB_DRIVER_VERSION from 0.53 to 0.54.

Validated on a Salicru SPS 1500 ONE BL (0665:5161) by authorize-toggle
stress reproduction (8 cycles, clean reconnect each time, same driver
PID throughout) and by a natural OVERFLOW event captured in soak
(counter-gated logic at 1/3, next poll succeeded, no escalation; driver
PID stable for 22h+).

Signed-off-by: Pedro Cunha <pedroagracio+nut@gmail.com>
---
 drivers/libusb1.c | 103 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 7 deletions(-)

diff --git a/drivers/libusb1.c b/drivers/libusb1.c
index 955ada91bd..b02da58a2f 100644
--- a/drivers/libusb1.c
+++ b/drivers/libusb1.c
@@ -40,9 +40,14 @@
 #include "usb-common.h"
 #include "nut_libusb.h"
 #include "nut_stdint.h"
+#include "nut_bool.h"
+
+#ifndef WIN32
+# include <signal.h>	/* sigaction(), SIGALRM for the atexit watchdog */
+#endif
 
 #define USB_DRIVER_NAME		"USB communication driver (libusb 1.0)"
-#define USB_DRIVER_VERSION	"0.53"
+#define USB_DRIVER_VERSION	"0.54"
 
 /* driver description structure */
 upsdrv_info_t comm_upsdrv_info = {
@@ -55,6 +60,77 @@ upsdrv_info_t comm_upsdrv_info = {
 
 #define MAX_REPORT_SIZE         0x1800
 
+/* Explicit libusb context, initialized once on the first call to
+ * nut_libusb_open() and released exactly once at process shutdown via
+ * the atexit handler below.
+ *
+ * Background: the original code called libusb_init(NULL) on every open and
+ * libusb_exit(NULL) on every close, bumping/decrementing the default
+ * context's refcount. That pattern violates the documented contract for
+ * libusb_exit (which "should be called after closing all open devices and
+ * before your application terminates"), and on certain firmware can wedge
+ * indefinitely: when libusb_exit hits the 1->0 transition it tears the
+ * context down, which waits on its internal sync primitives for URBs to
+ * drain -- but URBs orphaned by libusb_reset_device or by an unexpected
+ * USB device disconnect never drain, so the wait never returns. That
+ * deadlock has been observed on the 0665:5161 Cypress USB-serial bridge
+ * family (Salicru SPS, Ippon, ViewPower, various Voltronic Power UPSes;
+ * see NUT issues #598, #993, #2453) but the underlying bug class is
+ * generic and reachable via any reconnect path.
+ *
+ * The fix is to use an explicit context that we own end-to-end:
+ * initialize it once at first open, never call libusb_exit during runtime
+ * (close_dev no longer does), and release it exactly once at process
+ * shutdown via atexit. A SIGALRM-bounded timeout on the atexit libusb_exit
+ * keeps a deadlock-prone teardown from wedging supervisor shutdowns
+ * (e.g. systemctl stop) if it ever happens at exit time. Generalizes the
+ * 2018 partial fix in commit 4f84b7f92 ("don't libusb_exit() when closing
+ * a previously opened device").
+ */
+static libusb_context	*nut_usb_ctx = NULL;
+static nut_bool_t	 nut_usb_ctx_initialized = false;
+
+#ifndef WIN32
+__attribute__((noreturn))
+static void nut_libusb_cleanup_alarm(int sig)
+{
+	NUT_UNUSED_VARIABLE(sig);
+	/* libusb_exit is taking too long, almost certainly waiting on URBs
+	 * orphaned by a recent device reset / disconnect that will never
+	 * drain. Force the process down so supervisor shutdown isn't blocked.
+	 */
+	_exit(0);
+}
+#endif	/* !WIN32 */
+
+/* Release the libusb context that nut_libusb_open() initialized.
+ * Bounded by a SIGALRM (POSIX) so deadlock-prone teardown can't wedge
+ * process shutdown indefinitely.
+ */
+static void nut_libusb_cleanup_atexit(void)
+{
+	if (!nut_usb_ctx_initialized || !nut_usb_ctx) {
+		return;
+	}
+#ifndef WIN32
+	{
+		struct sigaction	sa, prev;
+		memset(&sa, 0, sizeof(sa));
+		sa.sa_handler = nut_libusb_cleanup_alarm;
+		sigemptyset(&sa.sa_mask);
+		sigaction(SIGALRM, &sa, &prev);
+		alarm(2);	/* sub-ms on healthy context; 2s is generous */
+		libusb_exit(nut_usb_ctx);
+		alarm(0);
+		sigaction(SIGALRM, &prev, NULL);
+	}
+#else	/* WIN32 */
+	libusb_exit(nut_usb_ctx);
+#endif	/* !WIN32 */
+	nut_usb_ctx = NULL;
+	nut_usb_ctx_initialized = false;
+}
+
 static void nut_libusb_close(libusb_device_handle *udev);
 
 /*! Add USB-related driver variables with addvar() and dstate_setinfo().
@@ -291,10 +367,18 @@ static int nut_libusb_open(libusb_device_handle **udevp,
 		usb_hid_number_opts_parsed = 1;
 	}
 
-	/* libusb base init */
-	if (libusb_init(NULL) < 0) {
-		libusb_exit(NULL);
-		fatal_with_errno(EXIT_FAILURE, "Failed to init libusb 1.0");
+	/* libusb base init: initialize our explicit context on the first call
+	 * here, and register the matching one-shot teardown via atexit. See the
+	 * comment near the file-scope declaration of nut_usb_ctx for the design
+	 * rationale (avoids the libusb_exit-per-reconnect misuse and the
+	 * teardown-deadlock it enables on some firmware).
+	 */
+	if (!nut_usb_ctx_initialized) {
+		if (libusb_init(&nut_usb_ctx) < 0) {
+			fatal_with_errno(EXIT_FAILURE, "Failed to init libusb 1.0");
+		}
+		nut_usb_ctx_initialized = true;
+		atexit(nut_libusb_cleanup_atexit);
 	}
 
 /* TODO: Find a place for this, from Windows branch made for libusb0.c */
@@ -312,7 +396,7 @@ static int nut_libusb_open(libusb_device_handle **udevp,
 		libusb_close(*udevp);
 #endif
 
-	devcount = libusb_get_device_list(NULL, &devlist);
+	devcount = libusb_get_device_list(nut_usb_ctx, &devlist);
 
 	/* devcount may be < 0, loop will get skipped;
 	 * its SSIZE_MAX < SIZE_MAX for devnum */
@@ -1166,7 +1250,12 @@ static void nut_libusb_close(libusb_device_handle *udev)
 	 */
 	/* libusb_release_interface(udev, usb_subdriver.hid_rep_index); */
 	libusb_close(udev);
-	libusb_exit(NULL);
+	/* libusb_exit() is no longer called here. The libusb context is owned
+	 * by nut_libusb_open() and released exactly once at process shutdown
+	 * via the atexit handler; calling it per-close was a longstanding
+	 * misuse that deadlocked on orphaned URBs after a device reset or
+	 * disconnect (NUT #598). See the comment near nut_usb_ctx above.
+	 */
 }
 
 usb_communication_subdriver_t usb_subdriver = {

From 9eb4ed461a3f5fa8c946fbebe0563296977ea0f9 Mon Sep 17 00:00:00 2001
From: Pedro Cunha <pedroagracio+nut@gmail.com>
Date: Wed, 20 May 2026 23:05:57 +0100
Subject: [PATCH 2/2] drivers/nutdrv_qx.c: qx_command(): reset USB device after
 persistent LIBUSB_ERROR_OVERFLOW [#598]

LIBUSB_ERROR_OVERFLOW has been handled as a benign, retry-on-next-poll
condition (grouped with LIBUSB_ERROR_TIMEOUT and the default case) ever
since the original blazer import. For most devices a one-off oversized
interrupt-IN frame is indeed transient. But some Cypress USB-serial
bridge firmware (VID:PID 0665:5161; Salicru SPS, Ippon, ViewPower and
various Voltronic Power UPSes) wedges the endpoint once it overruns:
every subsequent interrupt read returns OVERFLOW and the driver spins in
the stale-data loop until an external USB-level reset. This is the
chronic hang reported in issue #598.

Distinguish the two cases with a small consecutive-overflow counter: the
first QX_USB_OVERFLOW_RESET_TRIES-1 overflows are retried on the next
poll (so genuine transients cost nothing but a skipped cycle), and a
sustained run escalates to usb_reset() + reconnect, reusing the recovery
path already used for PIPE/ETIME/IO errors. Any clean read resets the
counter.

Captured on a Salicru SPS 1500 ONE BL: an isolated overflow recovers on
the next poll, while a sustained lockup (dozens of consecutive overflows
within seconds) is cleared only by the device reset, matching the
behaviour of an external `usb_resetter --reset-device`.

Signed-off-by: Pedro Cunha <pedroagracio+nut@gmail.com>
---
 drivers/nutdrv_qx.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/nutdrv_qx.c b/drivers/nutdrv_qx.c
index c036866647..0676f4d6dc 100644
--- a/drivers/nutdrv_qx.c
+++ b/drivers/nutdrv_qx.c
@@ -3881,10 +3881,23 @@ void	upsdrv_cleanup(void)
 
 /* Generic command processing function: send a command and read a reply.
  * Returns < 0 on error, 0 on timeout and the number of bytes read on success. */
+/* Consecutive LIBUSB_ERROR_OVERFLOW results on the interrupt-IN endpoint
+ * tolerated before escalating from "retry on the next poll" to a USB-level
+ * device reset. A one-off oversized frame is transient; a sustained run means
+ * the bridge firmware has wedged the endpoint and only re-enumeration clears
+ * it (e.g. the 0665:5161 Cypress USB-serial family: Salicru SPS, Ippon,
+ * ViewPower, various Voltronic Power). See NUT issue #598. */
+#define QX_USB_OVERFLOW_RESET_TRIES	3
+
 static ssize_t	qx_command(const char *cmd, size_t cmdlen, char *buf, size_t buflen)
 {
 #ifndef TESTING
 	ssize_t	ret = -1;
+# ifdef QX_USB
+	/* Persists across calls; only consecutive overflows accumulate (any clean
+	 * read zeroes it, see the switch on `ret` below). */
+	static int	overflow_tries = 0;
+# endif
 #endif
 
 /* NOTE: Could not find in which ifdef-ed codepath, but clang complained
@@ -3918,6 +3931,7 @@ static ssize_t	qx_command(const char *cmd, size_t cmdlen, char *buf, size_t bufl
 		ret = (*subdriver_command)(cmd, cmdlen, buf, buflen);
 
 		if (ret >= 0) {
+			overflow_tries = 0;	/* clean read: forget any overflow streak */
 			return ret;
 		}
 
@@ -3979,8 +3993,26 @@ static ssize_t	qx_command(const char *cmd, size_t cmdlen, char *buf, size_t bufl
 			udev = NULL;
 			break;
 
+		case LIBUSB_ERROR_OVERFLOW:	/* Value too large for defined data type:
+						 * an oversized interrupt-IN frame. A one-off is
+						 * transient (retry on the next poll); a sustained
+						 * run means the bridge firmware has wedged the
+						 * endpoint and only a USB-level reset recovers it.
+						 * See NUT issue #598. */
+			if (++overflow_tries < QX_USB_OVERFLOW_RESET_TRIES) {
+				upsdebugx(2, "Got OVERFLOW on EP 0x81 (%d/%d), retrying on next poll",
+					overflow_tries, QX_USB_OVERFLOW_RESET_TRIES);
+				break;
+			}
+			upsdebugx(1, "OVERFLOW on EP 0x81 persisted for %d polls; resetting device",
+				overflow_tries);
+			overflow_tries = 0;
+			if (usb_reset(udev) == 0) {
+				upsdebugx(1, "Device reset handled");
+			}
+			goto fallthrough_case_reconnect;
+
 		case LIBUSB_ERROR_TIMEOUT:	/* Connection timed out */
-		case LIBUSB_ERROR_OVERFLOW:	/* Value too large for defined data type */
 #if EPROTO && WITH_LIBUSB_0_1		/* limit to libusb 0.1 implementation */
 		case -EPROTO:		/* Protocol error */
 #endif