From 798539150a114ee4709b80ed179e465ac215c3d3 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 25 Feb 2026 01:32:07 +0200 Subject: [PATCH 1/6] virtio/net/worker: Stop tx loop on NothingWritten If write_frame() returns NothingWritten, propagate the error to process_tx_loop() and return to the event loop. The event loop will wake us when the socket's available space changes. Previously, when the backend could not write anything, we enabled notifications and retried process_tx(), creating a busy loop that ended only when the write completed. Signed-off-by: Nir Soffer --- src/devices/src/virtio/net/worker.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/devices/src/virtio/net/worker.rs b/src/devices/src/virtio/net/worker.rs index 05341df3c..87a00d30d 100644 --- a/src/devices/src/virtio/net/worker.rs +++ b/src/devices/src/virtio/net/worker.rs @@ -259,11 +259,17 @@ impl NetWorker { loop { self.tx_q.queue.disable_notification(&self.mem).unwrap(); - if let Err(e) = self.process_tx() { - log::error!("Failed to process rx: {e:?} (triggered by backend socket readable)"); + let retry_later = match self.process_tx() { + Err(TxError::Backend(WriteError::NothingWritten)) => true, + Err(e) => { + log::error!("Failed to process tx: {e:?}"); + false + } + _ => false, }; - if !self.tx_q.queue.enable_notification(&self.mem).unwrap() { + let has_new_entries = self.tx_q.queue.enable_notification(&self.mem).unwrap(); + if retry_later || !has_new_entries { break; } } @@ -283,6 +289,7 @@ impl NetWorker { } let mut raise_irq = false; + let mut result = Ok(()); while let Some(head) = tx_queue.pop(&self.mem) { let head_index = head.index; @@ -332,6 +339,7 @@ impl NetWorker { } Err(WriteError::NothingWritten) => { tx_queue.undo_pop(); + result = Err(TxError::Backend(WriteError::NothingWritten)); break; } Err(WriteError::PartialWrite) => { @@ -365,7 +373,7 @@ impl NetWorker { .map_err(TxError::DeviceError)?; } - Ok(()) + result } // Copies a single frame from 
`self.rx_frame_buf` into the guest. From d851950c2a7fa80395413aed3b604b0a81fb501c Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Sat, 7 Mar 2026 13:42:55 +0200 Subject: [PATCH 2/6] utils/macos/epoll: add one-shot timer support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add add_oneshot_timer() to register a one-shot EVFILT_TIMER with microsecond precision. The caller provides a udata value to identify the timer event, since epoll has no timer concept — timer events are reported with empty event bits and the caller matches on udata. This will be used to implement delayed TX retries after ENOBUFS, where stress testing showed that edge-triggered EVFILT_WRITE does not re-fire reliably after the error. Assisted-by: Cursor/Claude Opus 4.6 Signed-off-by: Nir Soffer --- src/utils/src/macos/epoll.rs | 41 +++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/src/utils/src/macos/epoll.rs b/src/utils/src/macos/epoll.rs index af3f86a57..2e15c5bae 100644 --- a/src/utils/src/macos/epoll.rs +++ b/src/utils/src/macos/epoll.rs @@ -13,11 +13,23 @@ use bitflags::bitflags; use log::debug; fn event_name(filter: i16, flags: u16) -> &'static str { - match (filter, flags & libc::EV_EOF != 0) { - (libc::EVFILT_READ, false) => "READ", - (libc::EVFILT_READ, true) => "READ+EOF", - (libc::EVFILT_WRITE, false) => "WRITE", - (libc::EVFILT_WRITE, true) => "WRITE+EOF", + let eof = flags & libc::EV_EOF != 0; + match filter { + libc::EVFILT_READ => { + if eof { + "READ+EOF" + } else { + "READ" + } + } + libc::EVFILT_WRITE => { + if eof { + "WRITE+EOF" + } else { + "WRITE" + } + } + libc::EVFILT_TIMER => "TIMER", _ => "UNKNOWN", } } @@ -295,6 +307,9 @@ impl Epoll { if kevs[i].0.flags & libc::EV_EOF != 0 { events[i].events |= EventSet::HANG_UP.bits(); } + } else if kevs[i].0.filter == libc::EVFILT_TIMER { + // No epoll equivalent; caller identifies timer by udata. 
+ events[i].events = EventSet::empty().bits(); } events[i].u64 = kevs[i].udata(); @@ -308,6 +323,22 @@ impl Epoll { Ok(nevents) } + + /// Register a one-shot timer that fires after `delay_us` microseconds. + /// The resulting event will have `data` set to `udata`. + pub fn add_oneshot_timer(&self, delay_us: u64, udata: u64) { + let kev = libc::kevent { + ident: 0, + filter: libc::EVFILT_TIMER, + flags: libc::EV_ADD | libc::EV_ONESHOT, + fflags: libc::NOTE_USECONDS, + data: delay_us as isize, + udata: udata as *mut libc::c_void, + }; + unsafe { + libc::kevent(self.queue, &kev, 1, ptr::null_mut(), 0, ptr::null()); + } + } } impl AsRawFd for Epoll { From ac95f4fa0f74276d2d5337052dc005a092b7a773 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Sat, 7 Mar 2026 13:51:19 +0200 Subject: [PATCH 3/6] virtio/net/worker: add timer-based TX retry When process_tx() fails with NothingWritten, set tx_has_deferred_frame and schedule a retry using the epoll shim's one-shot timer. The retry delay is provided by the backend via write_retry_delay_us(), which defaults to 0 (no timer). Backends opt in by overriding it. Assisted-by: Cursor/Claude Opus 4.6 Signed-off-by: Nir Soffer --- src/devices/src/virtio/net/backend.rs | 8 ++++++++ src/devices/src/virtio/net/worker.rs | 25 +++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/devices/src/virtio/net/backend.rs b/src/devices/src/virtio/net/backend.rs index b73b910b9..dab01c943 100644 --- a/src/devices/src/virtio/net/backend.rs +++ b/src/devices/src/virtio/net/backend.rs @@ -43,4 +43,12 @@ pub trait NetBackend { fn has_unfinished_write(&self) -> bool; fn try_finish_write(&mut self, hdr_len: usize, buf: &[u8]) -> Result<(), WriteError>; fn raw_socket_fd(&self) -> RawFd; + + /// Delay in microseconds before retrying after NothingWritten. + /// Returns 0 if no delay-based retry is needed (e.g. on Linux where + /// EAGAIN + EPOLLET handles retries via writable events). 
+ #[allow(dead_code)] + fn write_retry_delay_us(&self) -> u64 { + 0 + } } diff --git a/src/devices/src/virtio/net/worker.rs b/src/devices/src/virtio/net/worker.rs index 87a00d30d..133e23b46 100644 --- a/src/devices/src/virtio/net/worker.rs +++ b/src/devices/src/virtio/net/worker.rs @@ -10,6 +10,8 @@ use super::backend::{NetBackend, ReadError, WriteError}; use super::device::{FrontendError, RxError, TxError, VirtioNetBackend}; use super::VNET_HDR_LEN; +#[cfg(target_os = "macos")] +use std::os::fd::RawFd; use std::os::fd::{AsRawFd, FromRawFd, OwnedFd}; use std::thread; use std::{cmp, result}; @@ -31,6 +33,7 @@ pub struct NetWorker { tx_iovec: Vec<(GuestAddress, usize)>, tx_frame_buf: [u8; MAX_BUFFER_SIZE], tx_frame_len: usize, + tx_has_deferred_frame: bool, } impl NetWorker { @@ -82,6 +85,7 @@ impl NetWorker { tx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_len: 0, tx_iovec: Vec::with_capacity(QUEUE_SIZE as usize), + tx_has_deferred_frame: false, }) } @@ -93,6 +97,9 @@ impl NetWorker { } fn work(mut self) { + #[cfg(target_os = "macos")] + const TX_TIMER_FD: RawFd = -2; + let virtq_rx_ev_fd = self.rx_q.event.as_raw_fd(); let virtq_tx_ev_fd = self.tx_q.event.as_raw_fd(); let backend_socket = self.backend.raw_socket_fd(); @@ -148,6 +155,10 @@ impl NetWorker { } } } + #[cfg(target_os = "macos")] + _ if event_set.is_empty() && source == TX_TIMER_FD => { + self.process_tx_loop(); + } _ => { log::warn!( "Received unknown event: {event_set:?} from fd: {source:?}" @@ -155,6 +166,16 @@ impl NetWorker { } } } + + // Arm the retry timer after processing all events, so it + // reflects the final state of tx_has_deferred_frame. 
+ #[cfg(target_os = "macos")] + if self.tx_has_deferred_frame { + let delay = self.backend.write_retry_delay_us(); + if delay > 0 { + epoll.add_oneshot_timer(delay, TX_TIMER_FD as u64); + } + } } Err(e) => { debug!("vsock: failed to consume muxer epoll event: {e}"); @@ -259,7 +280,7 @@ impl NetWorker { loop { self.tx_q.queue.disable_notification(&self.mem).unwrap(); - let retry_later = match self.process_tx() { + self.tx_has_deferred_frame = match self.process_tx() { Err(TxError::Backend(WriteError::NothingWritten)) => true, Err(e) => { log::error!("Failed to process tx: {e:?}"); @@ -269,7 +290,7 @@ impl NetWorker { }; let has_new_entries = self.tx_q.queue.enable_notification(&self.mem).unwrap(); - if retry_later || !has_new_entries { + if self.tx_has_deferred_frame || !has_new_entries { break; } } From 66bdb82d0d1db00f5414441ff7d295138928cc63 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Thu, 19 Feb 2026 05:25:14 +0200 Subject: [PATCH 4/6] virtio/net/unixgram: Retry ENOBUFS after delay When running iperf3 with gvproxy or vmnet-helper, krunkit breaks randomly with: [2026-02-19T02:53:41Z ERROR devices::virtio::net::worker] Failed to process rx: Backend(Internal(ENOBUFS)) (triggered by backend socket readable) macOS returns ENOBUFS when the kernel socket buffer is full, rather than blocking or returning EAGAIN on non-blocking sockets. In vmnet-helper this is handled by retrying the write after a 50 microsecond sleep. We use non-blocking I/O, and we get WRITE events when the socket becomes writeable, but because we need to write a complete frame, the write can fail with ENOBUFS again after we get a WRITE event. Testing shows that we typically get several WRITE events and the ENOBUFS error is resolved after a few retries. Stress testing shows that sometimes we don't get a WRITE event after the network proxy has read all data from the socket. 
The network proxy is blocked on read(), libkrun is blocked on kevent(), never getting any event, and the guest is blocked waiting for an interrupt. The only reliable way to recover from this is to retry the write after a delay. Map ENOBUFS to WriteError::NothingWritten so the write is retried instead of treating it as a fatal error. We also emit a debug log to make it easy to understand the events. Override write_retry_delay_us() to return a 50 microsecond delay. The worker uses this to retry the write, ensuring that we make progress when a WRITE event does not fire. Assisted-by: Cursor/Claude Opus 4.6 Signed-off-by: Nir Soffer --- src/devices/src/virtio/net/unixgram.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/devices/src/virtio/net/unixgram.rs b/src/devices/src/virtio/net/unixgram.rs index 1ceb0722e..da586eca4 100644 --- a/src/devices/src/virtio/net/unixgram.rs +++ b/src/devices/src/virtio/net/unixgram.rs @@ -128,8 +128,16 @@ impl NetBackend for Unixgram { /// Try to write a frame to the proxy. fn write_frame(&mut self, hdr_len: usize, buf: &mut [u8]) -> Result<(), WriteError> { - let ret = send(self.fd.as_raw_fd(), &buf[hdr_len..], MsgFlags::empty()) - .map_err(WriteError::Internal)?; + let ret = match send(self.fd.as_raw_fd(), &buf[hdr_len..], MsgFlags::empty()) { + Ok(ret) => ret, + // macOS returns ENOBUFS when the kernel socket buffer is full, + // rather than blocking or returning EAGAIN on non-blocking sockets. 
+ Err(nix::Error::ENOBUFS) => { + debug!("write_frame: ENOBUFS"); + return Err(WriteError::NothingWritten); + } + Err(e) => return Err(WriteError::Internal(e)), + }; debug!( "Written frame size={}, written={}", buf.len() - hdr_len, @@ -150,4 +158,9 @@ impl NetBackend for Unixgram { fn raw_socket_fd(&self) -> RawFd { self.fd.as_raw_fd() } + + #[cfg(target_os = "macos")] + fn write_retry_delay_us(&self) -> u64 { + 50 + } } From fdc3e71bd1fb806138f89d25ea274b69582b0f27 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Thu, 26 Feb 2026 18:05:02 +0200 Subject: [PATCH 5/6] virtio/net/unixgram: Track ENOBUFS retry count ENOBUFS on macOS does not always resolve on the first writable event from kevent. The writable event fires when any buffer space is available, but for datagram sockets the entire message must fit atomically. When writing large frames (e.g. 65226 bytes), the socket may report writable with insufficient space (e.g. data=17996), causing send() to return ENOBUFS until enough buffer drains. Add a retry counter to log the first ENOBUFS event and the number of retries on recovery: [2026-02-26T16:36:24.440890Z INFO devices::virtio::net::unixgram] write_frame: ENOBUFS [2026-02-26T16:36:24.441544Z INFO devices::virtio::net::unixgram] write_frame: ENOBUFS resolved after 1 retries We use info level since ENOBUFS events are rare and testing at debug level is about 3x slower, distorting the results. ENOBUFS events are rare when testing with vmnet-helper, and common when testing with gvproxy. The following stats are from a 10-minute stress test using `iperf3 -c vm-address -P 8 --bidir -t 600`. 
vmnet-helper stats: Total ENOBUFS events: 293 Retries Count % Min Avg P99 Max 1 280 95.6% 17us 425us 1274us 1470us 2 12 4.1% 517us 656us 1047us 1047us 3 1 0.3% 624us 624us 624us 624us Overall: min=17us avg=436us p50=532us p99=1274us max=1470us Retry count histogram: 1 retries: ################################################## 280 2 retries: ## 12 3 retries: # 1 gvproxy stats: Total ENOBUFS events: 18346 Retries Count % Min Avg P99 Max 1 17291 94.2% 7us 85us 394us 1566us 2 884 4.8% 14us 188us 650us 923us 3 151 0.8% 29us 271us 1095us 2092us 4 16 0.1% 80us 319us 959us 959us 5 4 0.0% 42us 280us 691us 691us Overall: min=7us avg=92us p50=66us p99=452us max=2092us Retry count histogram: 1 retries: ################################################## 17291 2 retries: ## 884 3 retries: # 151 4 retries: # 16 5 retries: # 4 Assisted-by: Cursor/Claude Opus 4.6 Signed-off-by: Nir Soffer --- src/devices/src/virtio/net/unixgram.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/devices/src/virtio/net/unixgram.rs b/src/devices/src/virtio/net/unixgram.rs index da586eca4..e30269516 100644 --- a/src/devices/src/virtio/net/unixgram.rs +++ b/src/devices/src/virtio/net/unixgram.rs @@ -28,6 +28,7 @@ const SOCKET_RCVBUF: usize = DEFAULT_SOCKET_BUF_SIZE; pub struct Unixgram { fd: OwnedFd, + retries: u64, } impl Unixgram { @@ -61,7 +62,7 @@ impl Unixgram { }; } - Self { fd } + Self { fd, retries: 0 } } /// Create the backend opening a connection to the userspace network proxy. @@ -133,11 +134,21 @@ impl NetBackend for Unixgram { // macOS returns ENOBUFS when the kernel socket buffer is full, // rather than blocking or returning EAGAIN on non-blocking sockets. 
Err(nix::Error::ENOBUFS) => { - debug!("write_frame: ENOBUFS"); + if self.retries == 0 { + info!("write_frame: ENOBUFS"); + } + self.retries += 1; return Err(WriteError::NothingWritten); } Err(e) => return Err(WriteError::Internal(e)), }; + if self.retries > 0 { + info!( + "write_frame: ENOBUFS resolved after {} retries", + self.retries + ); + self.retries = 0; + } debug!( "Written frame size={}, written={}", buf.len() - hdr_len, From 990ca3022b3f7e709dec50692f30a7ce9cc6be06 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Sun, 1 Mar 2026 16:24:02 +0200 Subject: [PATCH 6/6] virtio/net/unixgram: Unify read/write logs In read_frame we log: Read eth frame from proxy: 65550 bytes But in write_frame we log: Written frame size=65226, written=65226 The frame size is always equal to the return value. There is no partial write with datagram socket. Unify the log to: Written eth frame to proxy: 65226 bytes Signed-off-by: Nir Soffer --- src/devices/src/virtio/net/unixgram.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/devices/src/virtio/net/unixgram.rs b/src/devices/src/virtio/net/unixgram.rs index e30269516..600b6b41b 100644 --- a/src/devices/src/virtio/net/unixgram.rs +++ b/src/devices/src/virtio/net/unixgram.rs @@ -149,11 +149,7 @@ impl NetBackend for Unixgram { ); self.retries = 0; } - debug!( - "Written frame size={}, written={}", - buf.len() - hdr_len, - ret - ); + debug!("Written eth frame to proxy: {ret} bytes"); Ok(()) }