From 38e4c521e96d3f709535e5bf32061853ce157d25 Mon Sep 17 00:00:00 2001 From: wangzihao122 Date: Fri, 8 May 2026 19:38:39 +0800 Subject: [PATCH] Fix: add AICore task execution timeout mechanism Three-layer timeout chain to detect and recover from stuck AICore ops: - Layer 1 (STARS): call aclrtSetOpExecuteTimeOutV2 in attach_current_thread() to enable hardware-level AICore op execution monitoring (1s requested) - Layer 2 (AICPU): add 1s timeout to platform_deinit_aicore_regs() to prevent infinite wait when AICore is unresponsive (e.g. STARS-killed op); shutdown() logs and continues deiniting remaining cores - Layer 3 (Host): replace rtStreamSynchronize with aclrtSynchronizeStreamWithTimeout (2s) for both AICPU and AICore streams; return ACL_ERROR_RT_STREAM_SYNC_TIMEOUT (507046) on timeout Timeout budget: scheduler idle (~200ms) + per-core deinit (1s each). Single-core case completes within host 2s timeout; multi-core case (block_dim > 1, all cores stuck) is backstopped by host timeout, with aclrtResetDevice handling final hardware cleanup. 
--- .../platform/include/aicpu/platform_regs.h | 2 +- .../platform/include/common/platform_config.h | 23 ++++++++++ .../platform/onboard/host/device_runner.cpp | 45 ++++++++++++++++--- .../platform/onboard/host/device_runner.h | 10 +++++ src/a2a3/platform/src/aicpu/platform_regs.cpp | 15 +++++-- .../runtime/scheduler/scheduler_cold_path.cpp | 5 ++- 6 files changed, 89 insertions(+), 11 deletions(-) diff --git a/src/a2a3/platform/include/aicpu/platform_regs.h b/src/a2a3/platform/include/aicpu/platform_regs.h index 707f5d218..3aa6370bb 100644 --- a/src/a2a3/platform/include/aicpu/platform_regs.h +++ b/src/a2a3/platform/include/aicpu/platform_regs.h @@ -106,7 +106,7 @@ void platform_init_aicore_regs(uint64_t reg_addr); * * @param reg_addr Register base address of the AICore */ -void platform_deinit_aicore_regs(uint64_t reg_addr); +int32_t platform_deinit_aicore_regs(uint64_t reg_addr); /** * Get physical core count for current platform diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h index e733a9b58..bd2945dcb 100644 --- a/src/a2a3/platform/include/common/platform_config.h +++ b/src/a2a3/platform/include/common/platform_config.h @@ -59,6 +59,21 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 4; */ constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6; +/** + * AICore op execution timeout (microseconds). + * Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors + * AICore task execution and kills ops that exceed this threshold. + */ +constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 1000000; // 1s + +/** + * Host-side stream synchronization timeout (milliseconds). + * Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs. + * Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US to allow STARS + * enough time to kill the timed-out op and propagate the notification. 
+ */ +constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 2000; // 2s (> op timeout 1s) + // ============================================================================= // Derived Platform Limits // ============================================================================= @@ -138,6 +153,14 @@ constexpr int PLATFORM_PROF_READYQUEUE_SIZE = */ constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz +/** + * AICore deinit wait timeout (ticks at PLATFORM_PROF_SYS_CNT_FREQ). + * platform_deinit_aicore_regs waits for AICore to acknowledge the exit + * signal. If AICore is stuck (STARS-killed op, hardware fault), waiting + * forever blocks the AICPU scheduling thread. This timeout bounds the wait. + */ +constexpr uint64_t PLATFORM_DEINIT_TIMEOUT_TICKS = PLATFORM_PROF_SYS_CNT_FREQ; // 1s + /** * Timeout duration for performance data collection (seconds) */ diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index b43e5c5d7..a6d5de667 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -268,10 +268,29 @@ int DeviceRunner::attach_current_thread(int device_id) { return rc; } + if (device_id_ == -1) { + configure_aicore_op_timeout(); + } + device_id_ = device_id; return 0; } +void DeviceRunner::configure_aicore_op_timeout() { + uint64_t actual_timeout = 0; + int rc = aclrtSetOpExecuteTimeOutV2(PLATFORM_OP_EXECUTE_TIMEOUT_US, &actual_timeout); + if (rc != 0) { + LOG_ERROR( + "aclrtSetOpExecuteTimeOutV2(%llu us) failed: %d", (unsigned long long)PLATFORM_OP_EXECUTE_TIMEOUT_US, rc + ); + } else { + LOG_INFO_V0( + "aclrtSetOpExecuteTimeOutV2: requested=%llu us, actual=%llu us", + (unsigned long long)PLATFORM_OP_EXECUTE_TIMEOUT_US, (unsigned long long)actual_timeout + ); + } +} + int DeviceRunner::ensure_acl_ready(int device_id) { if (device_id < 0) { LOG_ERROR("ensure_acl_ready: invalid device_id %d", device_id); @@ -684,17 +703,31 @@ int 
DeviceRunner::run( return rc; } - LOG_INFO_V0("=== rtStreamSynchronize stream_aicpu_ ==="); - rc = rtStreamSynchronize(stream_aicpu_); + LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicpu_ ==="); + rc = aclrtSynchronizeStreamWithTimeout(stream_aicpu_, PLATFORM_STREAM_SYNC_TIMEOUT_MS); + if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) { + LOG_ERROR( + "Stream sync timeout: stream=AICPU timeout_ms=%d device_id=%d block_dim=%d", + PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_ + ); + return rc; + } if (rc != 0) { - LOG_ERROR("rtStreamSynchronize (AICPU) failed: %d", rc); + LOG_ERROR("aclrtSynchronizeStreamWithTimeout (AICPU) failed: %d", rc); return rc; } - LOG_INFO_V0("=== rtStreamSynchronize stream_aicore_ ==="); - rc = rtStreamSynchronize(stream_aicore_); + LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicore_ ==="); + rc = aclrtSynchronizeStreamWithTimeout(stream_aicore_, PLATFORM_STREAM_SYNC_TIMEOUT_MS); + if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) { + LOG_ERROR( + "Stream sync timeout: stream=AICore timeout_ms=%d device_id=%d block_dim=%d", + PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_ + ); + return rc; + } if (rc != 0) { - LOG_ERROR("rtStreamSynchronize (AICore) failed: %d", rc); + LOG_ERROR("aclrtSynchronizeStreamWithTimeout (AICore) failed: %d", rc); return rc; } diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 8dd4dc816..3cf0e9ba2 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -512,6 +512,16 @@ class DeviceRunner { */ int prepare_orch_so(Runtime &runtime); + /** + * Configure STARS op execution timeout (once per DeviceRunner lifetime). + * + * Called on first device attach to set the hardware-level AICore op + * execution timeout via aclrtSetOpExecuteTimeOutV2. The actual + * timeout may differ from the requested value due to hardware timer + * granularity. 
+ */ + void configure_aicore_op_timeout(); + /** * Initialize performance profiling shared memory * diff --git a/src/a2a3/platform/src/aicpu/platform_regs.cpp b/src/a2a3/platform/src/aicpu/platform_regs.cpp index 580ae95cc..e926fa585 100644 --- a/src/a2a3/platform/src/aicpu/platform_regs.cpp +++ b/src/a2a3/platform/src/aicpu/platform_regs.cpp @@ -28,6 +28,7 @@ #include #include "aicpu/platform_regs.h" +#include "aicpu/device_time.h" #include "common/platform_config.h" static uint64_t g_platform_regs = 0; @@ -73,17 +74,25 @@ void platform_init_aicore_regs(uint64_t reg_addr) { write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICPU_IDLE_TASK_ID); } -void platform_deinit_aicore_regs(uint64_t reg_addr) { +int32_t platform_deinit_aicore_regs(uint64_t reg_addr) { // Send exit signal to AICore write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICORE_EXIT_SIGNAL); - // Wait for AICore to acknowledge exit by writing AICORE_EXITED_VALUE to COND - while (read_reg(reg_addr, RegId::COND) != AICORE_EXITED_VALUE) {} + // Wait for AICore to acknowledge exit, with timeout. + // On timeout, skip register cleanup (AICore is unresponsive; host will + // aclrtResetDevice to clear all hardware state). 
+ uint64_t t0 = get_sys_cnt_aicpu(); + while (read_reg(reg_addr, RegId::COND) != AICORE_EXITED_VALUE) { + if (get_sys_cnt_aicpu() - t0 > PLATFORM_DEINIT_TIMEOUT_TICKS) { + return -1; + } + } // Initialize task dispatch register to idle state write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICPU_IDLE_TASK_ID); // Close fast path control write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_CLOSE); + return 0; } uint32_t platform_get_physical_cores_count() { diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index a0c664c14..96db91148 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -361,7 +361,10 @@ int32_t SchedulerContext::shutdown(int32_t thread_idx) { int32_t core_id = cores[i]; uint64_t reg_addr = core_exec_states_[core_id].reg_addr; if (reg_addr != 0) { - platform_deinit_aicore_regs(reg_addr); + // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores. + if (platform_deinit_aicore_regs(reg_addr) != 0) { + LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id); + } } else { LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id); }