From 594ca189bcf59cd481a14ef08e6e784b97647453 Mon Sep 17 00:00:00 2001
From: Andrew Jeffery <andrew@codeconstruct.com.au>
Date: Tue, 15 Apr 2025 10:17:46 +0930
Subject: [PATCH 1/2] mctpd: Clean up unused ev_state

From at least systemd 252, sd_event_source_get_enabled() is documented
as allowing NULL for the event parameter. We weren't using the output,
so drop the variable.

Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
---
 src/mctpd.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/mctpd.c b/src/mctpd.c
index e9d5ccdd..f6c4b17a 100644
--- a/src/mctpd.c
+++ b/src/mctpd.c
@@ -2564,7 +2564,6 @@ static int method_endpoint_remove(sd_bus_message *call, void *data,
 static int
 peer_endpoint_recover(sd_event_source *s, uint64_t usec, void *userdata)
 {
-	int ev_state __attribute__((unused));
 	peer *peer = userdata;
 	ctx *ctx = peer->ctx;
 	char *peer_path;
@@ -2624,7 +2623,7 @@ peer_endpoint_recover(sd_event_source *s, uint64_t usec, void *userdata)
 			/* It's not known to be the same device, allocate a new EID */
 			dest_phys phys = peer->phys;
 
-			assert(sd_event_source_get_enabled(peer->recovery.source, &ev_state) == 0);
+			assert(sd_event_source_get_enabled(peer->recovery.source, NULL) == 0);
 			remove_peer(peer);
 			/*
 			 * The representation of the old peer is now gone. Set up the new peer,
@@ -2662,7 +2661,7 @@ peer_endpoint_recover(sd_event_source *s, uint64_t usec, void *userdata)
 		goto reschedule;
 	}
 
-	assert(sd_event_source_get_enabled(peer->recovery.source, &ev_state) == 0);
+	assert(sd_event_source_get_enabled(peer->recovery.source, NULL) == 0);
 	sd_event_source_unref(peer->recovery.source);
 	peer->recovery.delay = 0;
 	peer->recovery.source = NULL;
@@ -2681,7 +2680,7 @@ peer_endpoint_recover(sd_event_source *s, uint64_t usec, void *userdata)
 	if (rc < 0) {
 reclaim:
 		/* Recovery unsuccessful, clean up the peer */
-		assert(sd_event_source_get_enabled(peer->recovery.source, &ev_state) == 0);
+		assert(sd_event_source_get_enabled(peer->recovery.source, NULL) == 0);
 		remove_peer(peer);
 	}
 	return rc < 0 ? rc : 0;

From 498f73e85beaeb235a49aa86407af0ef60cca6ec Mon Sep 17 00:00:00 2001
From: Andrew Jeffery <andrew@codeconstruct.com.au>
Date: Tue, 15 Apr 2025 10:04:38 +0930
Subject: [PATCH 2/2] mctpd: add_peer: Update userdata pointer for active event
 sources

Mitigate the issue reported via [1] and highlighted by ASAN:

    ==179005==ERROR: AddressSanitizer: heap-use-after-free on address 0x61f000000c38 at pc 0x55dfaa7fa308 bp 0x7ffe10264420 sp 0x7ffe10264418
    READ of size 8 at 0x61f000000c38 thread T0
        0 0x55dfaa7fa307 in peer_endpoint_recover ../src/mctpd.c:2570
        1 0x7f9a43dadae3  (/lib/x86_64-linux-gnu/libsystemd.so.0+0x78ae3)
        2 0x7f9a43dade04 in sd_event_dispatch (/lib/x86_64-linux-gnu/libsystemd.so.0+0x78e04)
        3 0x7f9a43daf2e7 in sd_event_run (/lib/x86_64-linux-gnu/libsystemd.so.0+0x7a2e7)
        4 0x7f9a43daf506 in sd_event_loop (/lib/x86_64-linux-gnu/libsystemd.so.0+0x7a506)
        5 0x55dfaa80a609 in main ../src/mctpd.c:4547
        6 0x7f9a42c46249 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58
        7 0x7f9a42c46304 in __libc_start_main_impl ../csu/libc-start.c:360
        8 0x55dfaa7e38d0 in _start (mctp/build/test-mctpd+0x688d0)

    0x61f000000c38 is located 3000 bytes inside of 3040-byte region [0x61f000000080,0x61f000000c60)
    freed by thread T0 here:
        0 0x7f9a436b78d5 in __interceptor_realloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:85
        1 0x55dfaa7ef028 in add_peer ../src/mctpd.c:1419
        2 0x55dfaa7f1587 in endpoint_assign_eid ../src/mctpd.c:1601
        3 0x55dfaa7f55a0 in method_setup_endpoint ../src/mctpd.c:2038
        4 0x7f9a43d650ad  (/lib/x86_64-linux-gnu/libsystemd.so.0+0x300ad)

    previously allocated by thread T0 here:
        0 0x7f9a436b78d5 in __interceptor_realloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:85
        1 0x55dfaa7ef028 in add_peer ../src/mctpd.c:1419
        2 0x55dfaa805741 in add_local_eid ../src/mctpd.c:4052
        3 0x55dfaa80627f in add_interface_local ../src/mctpd.c:4114
        4 0x55dfaa806ffa in setup_nets ../src/mctpd.c:4200
        5 0x55dfaa80a380 in main ../src/mctpd.c:4525
        6 0x7f9a42c46249 in __libc_start_call_main ../sysdeps/nptl/libc_start_call_main.h:58

    SUMMARY: AddressSanitizer: heap-use-after-free ../src/mctpd.c:2570 in peer_endpoint_recover
    Shadow bytes around the buggy address:
      0x0c3e7fff8130: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
      0x0c3e7fff8140: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
      0x0c3e7fff8150: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
      0x0c3e7fff8160: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
      0x0c3e7fff8170: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
    =>0x0c3e7fff8180: fd fd fd fd fd fd fd[fd]fd fd fd fd fa fa fa fa
      0x0c3e7fff8190: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
      0x0c3e7fff81a0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
      0x0c3e7fff81b0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
      0x0c3e7fff81c0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
      0x0c3e7fff81d0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
    Shadow byte legend (one shadow byte represents 8 application bytes):
      Addressable:           00
      Partially addressable: 01 02 03 04 05 06 07
      Heap left redzone:       fa
      Freed heap region:       fd
      Stack left redzone:      f1
      Stack mid redzone:       f2
      Stack right redzone:     f3
      Stack after return:      f5
      Stack use after scope:   f8
      Global redzone:          f9
      Global init order:       f6
      Poisoned by user:        f7
      Container overflow:      fc
      Array cookie:            ac
      Intra object redzone:    bb
      ASan internal:           fe
      Left alloca redzone:     ca
      Right alloca redzone:    cb
    ==179005==ABORTING

Link: https://github.com/codeconstruct/mctp/issues/69 [1]
Fixes: 7ec2f8daa3a8 ("mctpd: Add support for endpoint recovery")
Signed-off-by: Andrew Jeffery <andrew@codeconstruct.com.au>
---
 src/mctpd.c         |  6 +++++
 tests/test_mctpd.py | 65 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/src/mctpd.c b/src/mctpd.c
index f6c4b17a..46c8dca9 100644
--- a/src/mctpd.c
+++ b/src/mctpd.c
@@ -1423,6 +1423,12 @@ static int add_peer(ctx *ctx, const dest_phys *dest, mctp_eid_t eid,
 		memset(&ctx->peers[ctx->size_peers], 0x0,
 			sizeof(*ctx->peers) * (new_size - ctx->size_peers));
 		ctx->size_peers = new_size;
+		// Update user data for recovery tasks
+		for (size_t ridx = 0; ridx < ctx->size_peers; ridx++) {
+			peer = &ctx->peers[ridx];
+			if (sd_event_source_get_enabled(peer->recovery.source, NULL) > 0)
+				sd_event_source_set_userdata(peer->recovery.source, peer);
+		}
 	}
 
 	// Populate it
diff --git a/tests/test_mctpd.py b/tests/test_mctpd.py
index 673bc12a..2ee7b2ae 100644
--- a/tests/test_mctpd.py
+++ b/tests/test_mctpd.py
@@ -483,3 +483,68 @@ async def test_network_local_eids_none(dbus, mctpd):
     eids = list(await net.get_local_eids())
 
     assert eids == []
+
+async def test_concurrent_recovery_setup(dbus, mctpd):
+    iface = mctpd.system.interfaces[0]
+    mctp_i = await mctpd_mctp_iface_obj(dbus, iface)
+
+    # mctpd context tracks 20 peer objects by default, add and set up 19 so we
+    # reach the allocation boundary.
+    split = 19
+    for i in range(split):
+        pep = Endpoint(iface, bytes([0x1e + i]))
+        mctpd.network.add_endpoint(pep)
+        (_, _, path, _) = await mctp_i.call_setup_endpoint(pep.lladdr)
+
+    # Grab the DBus path for an endpoint that we will cause to be removed from
+    # the network through the recovery path. Arbitrarily use the most recent
+    # one added
+    ep = await dbus.get_proxy_object(MCTPD_C, path)
+    ep_props = await ep.get_interface(DBUS_PROPERTIES_I)
+
+    # Set up a match for Connectivity transitioning to Degraded on the endpoint
+    # for which we request recovery
+    degraded = trio.Semaphore(initial_value = 0)
+    def ep_connectivity_changed(iface, changed, invalidated):
+        if iface == MCTPD_ENDPOINT_I and 'Connectivity' in changed:
+            if 'Degraded' == changed['Connectivity'].value:
+                degraded.release()
+    await ep_props.on_properties_changed(ep_connectivity_changed)
+
+    # Set up a match for the recovery endpoint object being removed from DBus
+    mctp_p = await dbus.get_proxy_object(MCTPD_C, MCTPD_MCTP_P)
+    mctp_objmgr = await mctp_p.get_interface(DBUS_OBJECT_MANAGER_I)
+    removed = trio.Semaphore(initial_value = 0)
+    def ep_removed(ep_path, interfaces):
+        if ep_path == path and MCTPD_ENDPOINT_I in interfaces:
+            removed.release()
+
+    await mctp_objmgr.on_interfaces_removed(ep_removed)
+
+    # Delete the endpoint from the network so its recovery will fail after
+    # timeout. Note we delete at the split index as the network was already
+    # populated with the default endpoint
+    del mctpd.network.endpoints[split]
+
+    # Begin recovery for the endpoint ...
+    ep_ep = await ep.get_interface(MCTPD_ENDPOINT_I)
+    await ep_ep.call_recover()
+
+    # ... and wait until we're notified the recovery process has begun
+    with trio.move_on_after(1) as expected:
+        await degraded.acquire()
+    assert not expected.cancelled_caught
+
+    # Now that we're asynchronously waiting for the endpoint recovery process
+    # to complete, force a realloc() of the peer object array by adding a new
+    # peer, which will invalidate the recovering peer's pointer
+    pep = Endpoint(iface, bytes([0x1e + split]))
+    mctpd.network.add_endpoint(pep)
+    (_, _, _, new) = await mctp_i.call_setup_endpoint(pep.lladdr)
+    assert new
+
+    # Verify the recovery process completed gracefully with removal of the
+    # endpoint's DBus object
+    with trio.move_on_after(2 * MCTPD_TRECLAIM) as expected:
+        await removed.acquire()
+    assert not expected.cancelled_caught