Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/a2a3/platform/include/aicpu/platform_regs.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void platform_init_aicore_regs(uint64_t reg_addr);
*
* @param reg_addr Register base address of the AICore
*/
void platform_deinit_aicore_regs(uint64_t reg_addr);
int32_t platform_deinit_aicore_regs(uint64_t reg_addr);

/**
* Get physical core count for current platform
Expand Down
23 changes: 23 additions & 0 deletions src/a2a3/platform/include/common/platform_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ constexpr int PLATFORM_MAX_AICPU_THREADS = 4;
*/
constexpr int PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH = 6;

/**
 * AICore op execution timeout (microseconds).
 * Passed to aclrtSetOpExecuteTimeOutV2 so that STARS actively monitors
 * AICore task execution and kills ops that exceed this threshold.
 */
constexpr uint64_t PLATFORM_OP_EXECUTE_TIMEOUT_US = 1000000;  // 1s

/**
 * Host-side stream synchronization timeout (milliseconds).
 * Passed to aclrtSynchronizeStreamWithTimeout to detect stream sync hangs.
 * Must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US to allow STARS
 * enough time to kill the timed-out op and propagate the notification.
 *
 * Note the unit difference: this value is in milliseconds while the op
 * timeout is in microseconds. The static_assert below compares them in
 * microseconds so the ordering cannot be broken by a later edit.
 */
constexpr int PLATFORM_STREAM_SYNC_TIMEOUT_MS = 2000;  // 2s (> op timeout 1s)

// Compile-time check of the documented ordering between the two timeouts.
// The positivity test comes first so the unsigned conversion cannot turn a
// negative value into a huge one that would pass the comparison.
static_assert(
    PLATFORM_STREAM_SYNC_TIMEOUT_MS > 0 &&
        static_cast<uint64_t>(PLATFORM_STREAM_SYNC_TIMEOUT_MS) * 1000 > PLATFORM_OP_EXECUTE_TIMEOUT_US,
    "PLATFORM_STREAM_SYNC_TIMEOUT_MS must be longer than PLATFORM_OP_EXECUTE_TIMEOUT_US"
);
Comment thread
indigo1973 marked this conversation as resolved.

// =============================================================================
// Derived Platform Limits
// =============================================================================
Expand Down Expand Up @@ -138,6 +153,14 @@ constexpr int PLATFORM_PROF_READYQUEUE_SIZE =
*/
constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz

/**
 * AICore deinit wait timeout (ticks at PLATFORM_PROF_SYS_CNT_FREQ).
 * platform_deinit_aicore_regs waits for AICore to acknowledge the exit
 * signal. If AICore is stuck (STARS-killed op, hardware fault), waiting
 * forever blocks the AICPU scheduling thread. This timeout bounds the wait.
 *
 * The value is set to the counter frequency itself, i.e. the number of
 * ticks counted in one second. The bound applies to each
 * platform_deinit_aicore_regs call separately, so a caller that
 * deinitializes several unresponsive cores one after another can wait up
 * to this long per core.
 */
constexpr uint64_t PLATFORM_DEINIT_TIMEOUT_TICKS = PLATFORM_PROF_SYS_CNT_FREQ; // 1s

/**
* Timeout duration for performance data collection (seconds)
*/
Expand Down
45 changes: 39 additions & 6 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,10 +268,29 @@ int DeviceRunner::attach_current_thread(int device_id) {
return rc;
}

if (device_id_ == -1) {
configure_aicore_op_timeout();
}
Comment thread
indigo1973 marked this conversation as resolved.

device_id_ = device_id;
return 0;
}

/**
 * Request the AICore op execution timeout from the ACL runtime.
 *
 * Calls aclrtSetOpExecuteTimeOutV2 with PLATFORM_OP_EXECUTE_TIMEOUT_US and
 * logs either the failure code or the requested/actual timeout pair.
 * Errors are not propagated: the function returns void, so a failed call
 * leaves only a log entry and the caller continues.
 */
void DeviceRunner::configure_aicore_op_timeout() {
    // %llu expects unsigned long long, which uint64_t is not guaranteed to
    // be on every ABI; convert once with a named cast and reuse the value.
    const auto requested_us = static_cast<unsigned long long>(PLATFORM_OP_EXECUTE_TIMEOUT_US);

    uint64_t actual_timeout = 0;
    const int rc = aclrtSetOpExecuteTimeOutV2(PLATFORM_OP_EXECUTE_TIMEOUT_US, &actual_timeout);
    if (rc != 0) {
        LOG_ERROR("aclrtSetOpExecuteTimeOutV2(%llu us) failed: %d", requested_us, rc);
        return;
    }

    // The runtime reports the timeout it actually applied through the out-parameter.
    LOG_INFO_V0(
        "aclrtSetOpExecuteTimeOutV2: requested=%llu us, actual=%llu us", requested_us,
        static_cast<unsigned long long>(actual_timeout)
    );
}

int DeviceRunner::ensure_acl_ready(int device_id) {
if (device_id < 0) {
LOG_ERROR("ensure_acl_ready: invalid device_id %d", device_id);
Expand Down Expand Up @@ -684,17 +703,31 @@ int DeviceRunner::run(
return rc;
}

LOG_INFO_V0("=== rtStreamSynchronize stream_aicpu_ ===");
rc = rtStreamSynchronize(stream_aicpu_);
LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicpu_ ===");
rc = aclrtSynchronizeStreamWithTimeout(stream_aicpu_, PLATFORM_STREAM_SYNC_TIMEOUT_MS);
if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) {
LOG_ERROR(
"Stream sync timeout: stream=AICPU timeout_ms=%d device_id=%d block_dim=%d",
PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_
);
return rc;
}
if (rc != 0) {
LOG_ERROR("rtStreamSynchronize (AICPU) failed: %d", rc);
LOG_ERROR("aclrtSynchronizeStreamWithTimeout (AICPU) failed: %d", rc);
return rc;
}

LOG_INFO_V0("=== rtStreamSynchronize stream_aicore_ ===");
rc = rtStreamSynchronize(stream_aicore_);
LOG_INFO_V0("=== aclrtSynchronizeStreamWithTimeout stream_aicore_ ===");
rc = aclrtSynchronizeStreamWithTimeout(stream_aicore_, PLATFORM_STREAM_SYNC_TIMEOUT_MS);
if (rc == ACL_ERROR_RT_STREAM_SYNC_TIMEOUT) {
LOG_ERROR(
"Stream sync timeout: stream=AICore timeout_ms=%d device_id=%d block_dim=%d",
PLATFORM_STREAM_SYNC_TIMEOUT_MS, device_id_, block_dim_
);
return rc;
}
if (rc != 0) {
LOG_ERROR("rtStreamSynchronize (AICore) failed: %d", rc);
LOG_ERROR("aclrtSynchronizeStreamWithTimeout (AICore) failed: %d", rc);
return rc;
}

Expand Down
10 changes: 10 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,16 @@ class DeviceRunner {
*/
int prepare_orch_so(Runtime &runtime);

/**
 * Configure STARS op execution timeout (once per DeviceRunner lifetime).
 *
 * Called on first device attach to set the hardware-level AICore op
 * execution timeout via aclrtSetOpExecuteTimeOutV2. The actual
 * timeout may differ from the requested value due to hardware timer
 * granularity.
 *
 * Returns nothing: if aclrtSetOpExecuteTimeOutV2 fails, the error code is
 * logged and the caller is not notified. On success the requested and
 * actual timeout values are logged.
 */
void configure_aicore_op_timeout();

/**
* Initialize performance profiling shared memory
*
Expand Down
15 changes: 12 additions & 3 deletions src/a2a3/platform/src/aicpu/platform_regs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include <cstdint>
#include "aicpu/platform_regs.h"
#include "aicpu/device_time.h"
#include "common/platform_config.h"

static uint64_t g_platform_regs = 0;
Expand Down Expand Up @@ -73,17 +74,25 @@ void platform_init_aicore_regs(uint64_t reg_addr) {
write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICPU_IDLE_TASK_ID);
}

void platform_deinit_aicore_regs(uint64_t reg_addr) {
int32_t platform_deinit_aicore_regs(uint64_t reg_addr) {
// Send exit signal to AICore
write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICORE_EXIT_SIGNAL);

// Wait for AICore to acknowledge exit by writing AICORE_EXITED_VALUE to COND
while (read_reg(reg_addr, RegId::COND) != AICORE_EXITED_VALUE) {}
// Wait for AICore to acknowledge exit, with timeout.
// On timeout, skip register cleanup (AICore is unresponsive; host will
// aclrtResetDevice to clear all hardware state).
uint64_t t0 = get_sys_cnt_aicpu();
while (read_reg(reg_addr, RegId::COND) != AICORE_EXITED_VALUE) {
if (get_sys_cnt_aicpu() - t0 > PLATFORM_DEINIT_TIMEOUT_TICKS) {
return -1;
}
}
Comment thread
indigo1973 marked this conversation as resolved.

// Initialize task dispatch register to idle state
write_reg(reg_addr, RegId::DATA_MAIN_BASE, AICPU_IDLE_TASK_ID);
// Close fast path control
write_reg(reg_addr, RegId::FAST_PATH_ENABLE, REG_SPR_FAST_PATH_CLOSE);
return 0;
}

uint32_t platform_get_physical_cores_count() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,10 @@ int32_t SchedulerContext::shutdown(int32_t thread_idx) {
int32_t core_id = cores[i];
uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
if (reg_addr != 0) {
platform_deinit_aicore_regs(reg_addr);
// Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
if (platform_deinit_aicore_regs(reg_addr) != 0) {
LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
}
Comment thread
indigo1973 marked this conversation as resolved.
} else {
LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
}
Expand Down
Loading