From eaa826bcc89b269f1b203a65908fc786a3acf1ad Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 15:27:19 -0300 Subject: [PATCH 1/5] Replace heavyweight ExceptionInfo with single atomic UInt64 The old exception handling inlined ~20 flat_store_byte instructions at every error site (bounds checks, div-by-zero, etc.), writing a 56-byte ExceptionInfo struct byte-by-byte through flat memory. This bloated register usage by ~15 VGPRs per error site, reducing occupancy even though the error paths are never taken at runtime. Replace with a single UInt64 packed with workgroup IDs (16 bits each) and an error code (8 bits), written via one atomic CAS. Each error site now needs ~3 VGPRs instead of ~15. Trade-offs: - Lost: per-workitem IDs, human-readable reason strings on device - Kept: workgroup IDs, error category (BoundsError, DomainError, etc.) - Gained: significantly lower register pressure on error paths Co-Authored-By: Claude Opus 4.6 (1M context) --- src/AMDGPU.jl | 2 +- src/device/exceptions.jl | 173 ++++++++++++++++++--------------------- src/device/runtime.jl | 5 +- src/exception_handler.jl | 30 ++----- 4 files changed, 88 insertions(+), 122 deletions(-) diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 0401c29c9..0b918e385 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -100,7 +100,7 @@ export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim export sync_workgroup, sync_workgroup_count, sync_workgroup_and, sync_workgroup_or struct KernelState - exception_info::Ptr{Device.ExceptionInfo} + exception_info::Ptr{UInt64} malloc_hc::Ptr{Cvoid} free_hc::Ptr{Cvoid} output_context::Ptr{Cvoid} diff --git a/src/device/exceptions.jl b/src/device/exceptions.jl index 0a651de21..453fef565 100644 --- a/src/device/exceptions.jl +++ b/src/device/exceptions.jl @@ -1,114 +1,97 @@ -""" -- `status::Int32`: whether exception has been thrown (0 - no, 1 - yes). 
-""" -struct ExceptionInfo - status::Int32 - output_lock::Int32 +# Exception reason codes — encoded in bits [7:0] of the packed exception UInt64 +# 0 means no exception +module ExceptionCode + const NONE = UInt8(0) + const UNKNOWN = UInt8(1) + const BOUNDS_ERROR = UInt8(2) + const DOMAIN_ERROR = UInt8(3) + const OVERFLOW_ERROR = UInt8(4) + const INEXACT_ERROR = UInt8(5) + const ARGUMENT_ERROR = UInt8(6) + const DIVIDE_ERROR = UInt8(7) + const DIM_MISMATCH = UInt8(8) +end - thread::@NamedTuple{x::UInt32, y::UInt32, z::UInt32} - block::@NamedTuple{x::UInt32, y::UInt32, z::UInt32} +const EXCEPTION_REASON_STRINGS = Dict{UInt8, String}( + ExceptionCode.NONE => "No exception", + ExceptionCode.UNKNOWN => "Unknown exception", + ExceptionCode.BOUNDS_ERROR => "BoundsError: Out-of-bounds array access", + ExceptionCode.DOMAIN_ERROR => "DomainError", + ExceptionCode.OVERFLOW_ERROR => "OverflowError", + ExceptionCode.INEXACT_ERROR => "InexactError: Inexact conversion", + ExceptionCode.ARGUMENT_ERROR => "ArgumentError", + ExceptionCode.DIVIDE_ERROR => "DivideError: Integer division error", + ExceptionCode.DIM_MISMATCH => "DimensionMismatch", +) - reason::LLVMPtr{UInt8, AS.Global} - reason_length::Int64 +# Packed exception format (UInt64): +# [63:48] workgroup_x (16 bits) +# [47:32] workgroup_y (16 bits) +# [31:16] workgroup_z (16 bits) +# [15:8] reserved +# [7:0] error code (non-zero = exception occurred) - ExceptionInfo() = new( - Int32(0), Int32(0), - (; x=UInt32(0), y=UInt32(0), z=UInt32(0)), - (; x=UInt32(0), y=UInt32(0), z=UInt32(0)), - LLVMPtr{UInt8, AS.Global}(), 0) +@inline function pack_exception(code::UInt8) + wg = workgroupIdx() + wg_x = UInt64(wg.x % UInt16) << 48 + wg_y = UInt64(wg.y % UInt16) << 32 + wg_z = UInt64(wg.z % UInt16) << 16 + return wg_x | wg_y | wg_z | UInt64(code) end -@inline function Base.getproperty(ei::Ptr{ExceptionInfo}, field::Symbol) - if field == :status - unsafe_load(convert(Ptr{Int32}, ei)) - elseif field == :output_lock - 
unsafe_load(convert(Ptr{Int32}, ei + sizeof(Int32))) - elseif field == :output_lock_ptr - reinterpret(LLVMPtr{Int32, AS.Generic}, ei + sizeof(Int32)) - elseif field == :thread - offset = 2 * sizeof(Int32) - unsafe_load(convert(Ptr{@NamedTuple{x::UInt32, y::UInt32, z::UInt32}}, ei + offset)) - elseif field == :block - offset = 2 * sizeof(Int32) + sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) - unsafe_load(convert(Ptr{@NamedTuple{x::UInt32, y::UInt32, z::UInt32}}, ei + offset)) - elseif field == :reason - offset = - 2 * sizeof(Int32) + - 2 * sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) - unsafe_load(convert(Ptr{LLVMPtr{UInt8, AS.Global}}, ei + offset)) - elseif field == :reason_length - offset = - 2 * sizeof(Int32) + - 2 * sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) + - sizeof(LLVMPtr{UInt8, AS.Global}) - unsafe_load(convert(Ptr{Int64}, ei + offset)) - else - getfield(ei, field) - end +@inline function unpack_exception(packed::UInt64) + wg_x = UInt16((packed >> 48) & 0xFFFF) + wg_y = UInt16((packed >> 32) & 0xFFFF) + wg_z = UInt16((packed >> 16) & 0xFFFF) + code = UInt8(packed & 0xFF) + return (; wg_x, wg_y, wg_z, code) end -@inline function Base.setproperty!(ei::Ptr{ExceptionInfo}, field::Symbol, value) - if field == :status - unsafe_store!(convert(Ptr{Int32}, ei), value) - elseif field == :output_lock - unsafe_store!(convert(Ptr{Int32}, ei + sizeof(Int32)), value) - elseif field == :thread - offset = 2 * sizeof(Int32) - unsafe_store!(convert(Ptr{@NamedTuple{x::UInt32, y::UInt32, z::UInt32}}, ei + offset), value) - elseif field == :block - offset = 2 * sizeof(Int32) + sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) - unsafe_store!(convert(Ptr{@NamedTuple{x::UInt32, y::UInt32, z::UInt32}}, ei + offset), value) - elseif field == :reason - offset = - 2 * sizeof(Int32) + - 2 * sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) - unsafe_store!(convert(Ptr{LLVMPtr{UInt8, AS.Global}}, ei + offset), value) - elseif field == :reason_length - 
offset = - 2 * sizeof(Int32) + - 2 * sizeof(@NamedTuple{x::UInt32, y::UInt32, z::UInt32}) + - sizeof(LLVMPtr{UInt8, AS.Global}) - unsafe_store!(convert(Ptr{Int64}, ei + offset), value) - else - setfield!(ei, field, value) - end -end +# Legacy compat — ExceptionInfo is now just a UInt64 +const ExceptionInfo = UInt64 function alloc_exception_info() - ei_ptr = Mem.HostBuffer(sizeof(ExceptionInfo), HIP.hipHostAllocDefault) - unsafe_store!(convert(Ptr{ExceptionInfo}, ei_ptr), ExceptionInfo()) + ei_ptr = Mem.HostBuffer(sizeof(UInt64), HIP.hipHostAllocDefault) + unsafe_store!(convert(Ptr{UInt64}, ei_ptr), UInt64(0)) return ei_ptr end -@inline function lock_output!(ei::Ptr{ExceptionInfo}) - # if llvm_atomic_cas(ei.output_lock_ptr, zero(Int32), one(Int32)) == zero(Int32) - if llvm_atomic_cas(ei.output_lock_ptr, Int32(0x0), Int32(0x1)) == Int32(0x0) - # Take the lock & write thread info. - ei.thread = workitemIdx() - ei.block = workgroupIdx() - sync_workgroup() - return true - elseif ( - ei.output_lock == Int32(0x1) && - ei.thread == workitemIdx() && - ei.block == workgroupIdx() - ) - # Thread already has the lock. - return true - else - # Other thread has the lock. - return false - end +@inline function signal_exception!(ei::Ptr{UInt64}, code::UInt8) + packed = pack_exception(code) + # First writer wins via atomic CAS; losers are no-ops. 
+ ei_llvm = reinterpret(LLVMPtr{UInt64, AS.Generic}, ei) + llvm_atomic_cas(ei_llvm, UInt64(0), packed) + endpgm() + return end macro gpu_throw(reason) + code = _reason_to_code(reason) quote ei = kernel_state().exception_info - if lock_output!(ei) - reason_ptr, reason_length = @strptr $reason - ei.reason = reason_ptr - ei.reason_length = reason_length - end - throw(nothing) + signal_exception!(ei, $code) + throw(nothing) # unreachable, but keeps Julia's type system happy + end +end + +# Map reason strings to error codes at macro expansion time +function _reason_to_code(reason::String) + if startswith(reason, "BoundsError") + ExceptionCode.BOUNDS_ERROR + elseif startswith(reason, "DomainError") + ExceptionCode.DOMAIN_ERROR + elseif startswith(reason, "OverflowError") + ExceptionCode.OVERFLOW_ERROR + elseif startswith(reason, "InexactError") + ExceptionCode.INEXACT_ERROR + elseif startswith(reason, "ArgumentError") + ExceptionCode.ARGUMENT_ERROR + elseif startswith(reason, "DivideError") + ExceptionCode.DIVIDE_ERROR + elseif startswith(reason, "DimensionMismatch") + ExceptionCode.DIM_MISMATCH + else + ExceptionCode.UNKNOWN end end +_reason_to_code(reason) = ExceptionCode.UNKNOWN diff --git a/src/device/runtime.jl b/src/device/runtime.jl index acd43c0e2..dcabf27cb 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -77,10 +77,7 @@ end function signal_exception() ei = kernel_state().exception_info - ei.status = Int32(0x1) - # Lock in case it was not locked before, to get workitem and workgroup info. - lock_output!(ei) - endpgm() # Without endpgm we'll get hardware exception. + signal_exception!(ei, ExceptionCode.UNKNOWN) return end diff --git a/src/exception_handler.jl b/src/exception_handler.jl index 6bcdcc4f2..0e5edc716 100644 --- a/src/exception_handler.jl +++ b/src/exception_handler.jl @@ -2,44 +2,30 @@ const GLOBAL_EXCEPTION_INFO = Dict{UInt, Mem.HostBuffer}() # TODO RT_LOCK? 
-function exception_info(dev::HIPDevice)::Ptr{Device.ExceptionInfo} +function exception_info(dev::HIPDevice)::Ptr{UInt64} ei = get!( () -> Device.alloc_exception_info(), GLOBAL_EXCEPTION_INFO, hash(dev)) - return convert(Ptr{Device.ExceptionInfo}, Mem.device_ptr(ei)) + return convert(Ptr{UInt64}, Mem.device_ptr(ei)) end function has_exception(dev::HIPDevice)::Bool - return exception_info(dev).status != 0 + return unsafe_load(exception_info(dev)) != UInt64(0) end function reset_exception_info!(dev::HIPDevice) - unsafe_store!(exception_info(dev), Device.ExceptionInfo()) + unsafe_store!(exception_info(dev), UInt64(0)) return end -function device_str_to_host(str_ptr, str_length) - str_length == 0 && return "" - - buf = Vector{UInt8}(undef, str_length) - HSA.memory_copy( - convert(Ptr{Cvoid}, pointer(buf)), - reinterpret(Ptr{Cvoid}, str_ptr), str_length) |> Runtime.check - return String(buf) -end - function get_exception_info_string(dev::HIPDevice) - ei = exception_info(dev) - reason = device_str_to_host(ei.reason, ei.reason_length) - - workitemIdx = ei.thread - workgroupIdx = ei.block + packed = unsafe_load(exception_info(dev)) + info = Device.unpack_exception(packed) + reason = get(Device.EXCEPTION_REASON_STRINGS, info.code, "Unknown exception (code=$(info.code))") - isempty(reason) && (reason = "Unknown reason";) return """GPU Kernel Exception: $reason - workitemIdx: $workitemIdx - workgroupIdx: $workgroupIdx""" + workgroupIdx: ($(info.wg_x), $(info.wg_y), $(info.wg_z))""" end function throw_if_exception(dev::HIPDevice) From fafcac8668509d24d026bce9534135a75ea82eb5 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 15:30:17 -0300 Subject: [PATCH 2/5] Add codegen test for lightweight exception handling Verify that the new packed UInt64 exception path does not generate flat_store_byte instructions (from the old byte-by-byte ExceptionInfo writes) and uses atomic cmpswap instead. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- test/device/exceptions.jl | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/test/device/exceptions.jl b/test/device/exceptions.jl index 9d4f9a414..5026dd472 100644 --- a/test/device/exceptions.jl +++ b/test/device/exceptions.jl @@ -16,9 +16,35 @@ using StaticArrays AMDGPU.synchronize() catch err @test err isa ErrorException + @test occursin("GPU Kernel Exception", err.msg) end - # TODO check exception message - # TODO check specific exception type +end + +@testset "Exception codegen" begin + # Kernel with multiple div() calls — each generates an error path + function div_kernel(X, a, b, c, d) + i = workitemIdx().x + x = div(a, b) + y = div(c, d) + X[i] = x + y + return + end + + iob = IOBuffer() + AMDGPU.code_native(iob, div_kernel, Tuple{ + Device.ROCDeviceArray{Int64, 1, 1}, + Int64, Int64, Int64, Int64, + }) + asm = String(take!(iob)) + + # The new lightweight exception path should NOT generate flat_store_byte + # instructions for writing ExceptionInfo fields. Previously each div check + # inlined ~20 flat_store_byte for the 56-byte ExceptionInfo struct. + n_flat_store_byte = count("flat_store_byte", asm) + @test n_flat_store_byte == 0 + + # Should use global_atomic_cmpswap for the exception flag instead + @test occursin("global_atomic_cmpswap", asm) || occursin("flat_atomic_cmpswap", asm) end if VERSION ≥ v"1.11-" From 51358c578afb7cf2c495ab695297469a8f39844c Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 15:39:16 -0300 Subject: [PATCH 3/5] Add bounds check codegen test for lightweight exceptions Verify that bounds-checked array access also generates zero flat_store_byte and uses atomic cmpswap for exception signaling. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- test/device/exceptions.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/device/exceptions.jl b/test/device/exceptions.jl index 5026dd472..933309f2a 100644 --- a/test/device/exceptions.jl +++ b/test/device/exceptions.jl @@ -45,6 +45,23 @@ end # Should use global_atomic_cmpswap for the exception flag instead @test occursin("global_atomic_cmpswap", asm) || occursin("flat_atomic_cmpswap", asm) + + # Kernel with bounds-checked array access + function boundscheck_kernel(X, Y) + i = workitemIdx().x + X[i] = Y[i] + Y[i+1] + return + end + + iob2 = IOBuffer() + AMDGPU.code_native(iob2, boundscheck_kernel, Tuple{ + Device.ROCDeviceArray{Float64, 1, 1}, + Device.ROCDeviceArray{Float64, 1, 1}, + }) + asm2 = String(take!(iob2)) + + @test count("flat_store_byte", asm2) == 0 + @test occursin("global_atomic_cmpswap", asm2) || occursin("flat_atomic_cmpswap", asm2) end if VERSION ≥ v"1.11-" From 74b56a42e4742026007c01e19f25a809ed1d4f35 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 18 Mar 2026 15:54:56 -0300 Subject: [PATCH 4/5] Use kernel=true in codegen tests to test actual kernel assembly Without kernel=true, code_native compiles as a device function and the AddKernelStatePass is skipped, leaving julia.gpu.state_getter as a real function call. This masks the true codegen improvement (10 VGPRs vs 41) and causes false test results. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- test/device/exceptions.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/device/exceptions.jl b/test/device/exceptions.jl index 933309f2a..777f6a838 100644 --- a/test/device/exceptions.jl +++ b/test/device/exceptions.jl @@ -34,7 +34,7 @@ end AMDGPU.code_native(iob, div_kernel, Tuple{ Device.ROCDeviceArray{Int64, 1, 1}, Int64, Int64, Int64, Int64, - }) + }; kernel=true) asm = String(take!(iob)) # The new lightweight exception path should NOT generate flat_store_byte @@ -57,7 +57,7 @@ end AMDGPU.code_native(iob2, boundscheck_kernel, Tuple{ Device.ROCDeviceArray{Float64, 1, 1}, Device.ROCDeviceArray{Float64, 1, 1}, - }) + }; kernel=true) asm2 = String(take!(iob2)) @test count("flat_store_byte", asm2) == 0 From 635d8847d23de443290002a23fc885804b66bc8d Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 19 Mar 2026 09:06:05 -0300 Subject: [PATCH 5/5] Update src/exception_handler.jl Co-authored-by: Valentin Churavy --- src/exception_handler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/exception_handler.jl b/src/exception_handler.jl index 0e5edc716..24bca68f4 100644 --- a/src/exception_handler.jl +++ b/src/exception_handler.jl @@ -21,7 +21,7 @@ end function get_exception_info_string(dev::HIPDevice) packed = unsafe_load(exception_info(dev)) info = Device.unpack_exception(packed) - reason = get(Device.EXCEPTION_REASON_STRINGS, info.code, "Unknown exception (code=$(info.code))") + reason = get(Device.EXCEPTION_REASON_STRINGS, info.code, "Unknown error code $(info.code)") return """GPU Kernel Exception: $reason