diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 9c480bf8..7a11f56a 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -30,69 +30,81 @@ function memory_scope_to_scope(scope::Int) end end +""" + atomic_tfunc(𝕃, ptrs, args...) -> Union{Type, Nothing} + +Shared tfunc for atomic operations (add, xchg, cas). +Returns Tile{T, S} (S = Tuple{} for 0D), or `nothing` when `ptrs` is not a tile of pointers. +""" +function atomic_tfunc(𝕃, @nospecialize(ptrs), @nospecialize args...) + ptrs_type = CC.widenconst(ptrs) + ptrs_type isa DataType && ptrs_type <: Tile || return nothing + ptr_type = eltype(ptrs_type) + ptr_type <: Ptr || return nothing + T = eltype(ptr_type) + S = ptrs_type.parameters[2] + return Tile{T, S} +end + # cuda_tile.atomic_cas_tko -@intrinsic atomic_cas(array, index, expected, desired, - memory_order, memory_scope) -tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +@intrinsic atomic_cas(ptr_tile, expected, desired, mask, memory_order, memory_scope) +function tfunc(𝕃, ::typeof(Intrinsics.atomic_cas), @nospecialize(ptrs), @nospecialize args...) + atomic_tfunc(𝕃, ptrs, args...) 
+end efunc(::typeof(Intrinsics.atomic_cas), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas), args) cb = ctx.cb tt = ctx.tt - # args: (array, index, expected, desired, memory_order, memory_scope) - array_arg = args[1] + # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("atomic CAS requires ptr_tile")) + expected_tv = emit_value!(ctx, args[2]) + expected_tv === nothing && throw(IRError("atomic CAS requires expected value")) + desired_tv = emit_value!(ctx, args[3]) + desired_tv === nothing && throw(IRError("atomic CAS requires desired value")) - # Get array info - arg_idx = extract_argument_index(array_arg) - is_tilearray = arg_idx !== nothing && is_destructured_arg(ctx, arg_idx) + # Check if mask is provided (ghost Nothing = no mask). NOTE(review): get_constant appears to yield `nothing` both for the ghost Nothing constant and for a non-constant mask tile (cf. the @something fallbacks below), so this flag may never be true — confirm, or detect the mask via the argument's static type (=== Nothing) instead. + has_mask = get_constant(ctx, args[4]) !== nothing - if !is_tilearray - throw(IRError("atomic_cas requires a TileArray argument")) - end + memory_order = @something get_constant(ctx, args[5]) throw(IRError("atomic CAS requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[6]) throw(IRError("atomic CAS requires constant memory_scope")) - ptr_vals = get_arg_flat_values(ctx, arg_idx, :ptr) - isempty(ptr_vals) && throw(IRError("Cannot get ptr from TileArray argument")) - array_val = ptr_vals[1] - tilearray_type = get_arg_type(ctx, arg_idx) - elem_type = eltype(tilearray_type) + shape = ptr_tv.shape - # Get expected and desired values - expected_tv = emit_value!(ctx, args[3]) - expected_tv === nothing && throw(IRError("atomic_cas requires expected value")) - desired_tv = emit_value!(ctx, args[4]) - desired_tv === nothing && throw(IRError("atomic_cas requires desired value")) + # Get element type from pointer tile: Tile{Ptr{T}, S} -> T + ptrs_type = CC.widenconst(ptr_tv.jltype) + ptr_type = eltype(ptrs_type) + elem_type = 
eltype(ptr_type) - # Get memory order and scope from args - memory_order = @something get_constant(ctx, args[5]) throw(IRError("atomic_cas requires constant memory_order")) - memory_scope = @something get_constant(ctx, args[6]) throw(IRError("atomic_cas requires constant memory_scope")) - - # Create result type (0D tile of element type) dtype = julia_to_tile_dtype!(tt, elem_type) - result_tile_type = tile_type!(tt, dtype, Int[]) + result_tile_type = tile_type!(tt, dtype, collect(shape)) token_type = Token(tt) - # Get index and create pointer type - index_tv = emit_value!(ctx, args[2]) - ptr_type = pointer_type!(tt, dtype) - ptr_tile_type = tile_type!(tt, ptr_type, Int[]) - - # Compute pointer using OffsetOp (handles any integer index type) - pointers = encode_OffsetOp!(cb, ptr_tile_type, array_val, index_tv.v) - # Emit atomic CAS mem_ordering = memory_order_to_semantics(memory_order) mem_scope = memory_scope_to_scope(memory_scope) - old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, pointers, - expected_tv.v, desired_tv.v; - token=ctx.token, - memory_ordering=mem_ordering, - memory_scope=mem_scope) + old_val, new_token = if has_mask + mask_tv = emit_value!(ctx, args[4]) + mask_tv === nothing && throw(IRError("atomic CAS: cannot resolve mask")) + encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + else + encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + end ctx.token = new_token - # Return scalar type (not Tile) to match the intrinsic signature - CGVal(old_val, result_tile_type, elem_type, Int[]) + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) end # cuda_tile.atomic_rmw_tko (shared helper for atomic RMW operations) @@ -100,44 +112,31 @@ 
function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) cb = ctx.cb tt = ctx.tt - # args: (array, index, val, memory_order, memory_scope) - array_arg = args[1] + # args: (ptr_tile, val, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && throw(IRError("atomic RMW requires ptr_tile")) + val_tv = emit_value!(ctx, args[2]) + val_tv === nothing && throw(IRError("atomic RMW requires value")) - # Get array info - arg_idx = extract_argument_index(array_arg) - is_tilearray = arg_idx !== nothing && is_destructured_arg(ctx, arg_idx) + # Check if mask is provided (ghost Nothing = no mask). NOTE(review): get_constant appears to yield `nothing` both for the ghost Nothing constant and for a non-constant mask tile (cf. the @something fallbacks below), so this flag may never be true — confirm, or detect the mask via the argument's static type (=== Nothing) instead. + has_mask = get_constant(ctx, args[3]) !== nothing - if !is_tilearray - throw(IRError("atomic operations require a TileArray argument")) - end + # Get memory order and scope from args + memory_order = @something get_constant(ctx, args[4]) throw(IRError("atomic RMW requires constant memory_order")) + memory_scope = @something get_constant(ctx, args[5]) throw(IRError("atomic RMW requires constant memory_scope")) - ptr_vals = get_arg_flat_values(ctx, arg_idx, :ptr) - isempty(ptr_vals) && throw(IRError("Cannot get ptr from TileArray argument")) - array_val = ptr_vals[1] - tilearray_type = get_arg_type(ctx, arg_idx) - elem_type = eltype(tilearray_type) + shape = ptr_tv.shape - # Get update value - val_tv = emit_value!(ctx, args[3]) - val_tv === nothing && throw(IRError("atomic operation requires value")) + # Get element type from pointer tile: Tile{Ptr{T}, S} -> T + ptrs_type = CC.widenconst(ptr_tv.jltype) + ptr_type = eltype(ptrs_type) + elem_type = eltype(ptr_type) - # Get memory order and scope from args - memory_order = @something get_constant(ctx, args[4]) throw(IRError("atomic operation requires constant memory_order")) - memory_scope = @something get_constant(ctx, args[5]) throw(IRError("atomic operation requires constant memory_scope")) - - # Create result type (0D tile of element type) + # Create result type dtype = 
julia_to_tile_dtype!(tt, elem_type) - result_tile_type = tile_type!(tt, dtype, Int[]) + result_tile_type = tile_type!(tt, dtype, collect(shape)) token_type = Token(tt) - # Get index and create pointer type - index_tv = emit_value!(ctx, args[2]) - ptr_type = pointer_type!(tt, dtype) - ptr_tile_type = tile_type!(tt, ptr_type, Int[]) - - # Compute pointer using OffsetOp (handles any integer index type) - pointers = encode_OffsetOp!(cb, ptr_tile_type, array_val, index_tv.v) - # Use float add mode for floating point types actual_mode = mode if mode == AtomicADD && elem_type <: AbstractFloat @@ -148,20 +147,30 @@ function emit_atomic_rmw!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) mem_ordering = memory_order_to_semantics(memory_order) mem_scope = memory_scope_to_scope(memory_scope) - old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, pointers, - val_tv.v, actual_mode; - token=ctx.token, - memory_ordering=mem_ordering, - memory_scope=mem_scope) + old_val, new_token = if has_mask + mask_tv = emit_value!(ctx, args[3]) + mask_tv === nothing && throw(IRError("atomic RMW: cannot resolve mask")) + encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + else + encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + end ctx.token = new_token - # Return scalar type (not Tile) to match the intrinsic signature - CGVal(old_val, result_tile_type, elem_type, Int[]) + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape)) end # cuda_tile.atomic_rmw_tko with XCHG -@intrinsic atomic_xchg(array, index, val, memory_order, memory_scope) -tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize(array), @nospecialize args...) 
= eltype(CC.widenconst(array)) +@intrinsic atomic_xchg(ptr_tile, val, mask, memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg), @nospecialize args...) = atomic_tfunc(𝕃, args...) efunc(::typeof(Intrinsics.atomic_xchg), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) @@ -169,9 +178,8 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg), args) end # cuda_tile.atomic_rmw_tko with ADD -@intrinsic atomic_add(array, index, val, - memory_order, memory_scope) -tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize(array), @nospecialize args...) = eltype(CC.widenconst(array)) +@intrinsic atomic_add(ptr_tile, val, mask, memory_order, memory_scope) +tfunc(𝕃, ::typeof(Intrinsics.atomic_add), @nospecialize args...) = atomic_tfunc(𝕃, args...) efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) = CC.Effects(effects; effect_free=CC.ALWAYS_FALSE) function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) diff --git a/src/language/atomics.jl b/src/language/atomics.jl index 5405449a..535d1114 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -25,6 +25,61 @@ module MemScope const System = 2 end +# ============================================================================ +# Pointer/mask helpers +# +# Both scalar and tile-indexed paths compute (ptr_tile, mask, shape) here, +# then pass to a single set of intrinsics. 
+# ============================================================================ + +# Scalar index -> 0D pointer tile, no mask +@inline function _atomic_ptr_and_mask(array::TileArray{T}, index::Integer) where {T} + idx_0 = Tile(Int32(index - One())) + ptr_tile = Intrinsics.offset(array.ptr, idx_0) + (ptr_tile, nothing, ()) +end + +# N-D tile indices -> N-D pointer tile with bounds mask +@inline function _atomic_ptr_and_mask(array::TileArray{T, N}, + indices::NTuple{N, Tile{<:Integer}}) where {T, N} + # Convert each index to 0-indexed + indices_0 = ntuple(Val(N)) do d + indices[d] .- one(eltype(indices[d])) + end + + # Broadcast all index tiles to a common shape + S = reduce(broadcast_shape, ntuple(d -> size(indices[d]), Val(N))) + + # Broadcast and convert to Int32 + indices_i32 = ntuple(Val(N)) do d + convert(Tile{Int32}, broadcast_to(indices_0[d], S)) + end + + # Linear index: sum(idx[d] * stride[d]) + linear_idx = reduce(.+, ntuple(Val(N)) do d + indices_i32[d] .* broadcast_to(Tile(array.strides[d]), S) + end) + + ptr_tile = Intrinsics.offset(array.ptr, linear_idx) + + # Bounds mask: 0 <= idx[d] < size[d] for all d + zero_bc = broadcast_to(Tile(Int32(0)), S) + mask = reduce(.&, ntuple(Val(N)) do d + (indices_i32[d] .>= zero_bc) .& (indices_i32[d] .< broadcast_to(Tile(size(array, d)), S)) + end) + + (ptr_tile, mask, S) +end + +# 1D convenience: single Tile -> 1-tuple +@inline function _atomic_ptr_and_mask(array::TileArray{T, 1}, indices::Tile{<:Integer}) where {T} + _atomic_ptr_and_mask(array, (indices,)) +end + +# ============================================================================ +# Atomic CAS +# ============================================================================ + """ atomic_cas(array::TileArray, index, expected, desired; memory_order, memory_scope) -> T @@ -40,43 +95,59 @@ while ct.atomic_cas(locks, idx, Int32(0), Int32(1); memory_order=ct.MemoryOrder. 
end ``` """ -@inline function atomic_cas(array::TileArray{T}, index, expected::T, desired::T; +@inline function atomic_cas(array::TileArray{T}, indices, + expected::TileOrScalar{T}, desired::TileOrScalar{T}; memory_order::Int=MemoryOrder.AcqRel, memory_scope::Int=MemScope.Device) where {T} - Intrinsics.atomic_cas(array, index - One(), expected, desired, memory_order, memory_scope) + ptr_tile, mask, S = _atomic_ptr_and_mask(array, indices) + expected_bc = S === () ? Tile(expected) : broadcast_to(Tile(expected), S) + desired_bc = S === () ? Tile(desired) : broadcast_to(Tile(desired), S) + result = Intrinsics.atomic_cas(ptr_tile, expected_bc, desired_bc, mask, + memory_order, memory_scope) + S === () ? Intrinsics.to_scalar(result) : result end +# ============================================================================ +# Atomic RMW operations (atomic_add, atomic_xchg) +# ============================================================================ + """ - atomic_xchg(array::TileArray, index, val; memory_order, memory_scope) -> T + atomic_add(array::TileArray, index, val; memory_order, memory_scope) -> T -Atomic exchange. Atomically replaces the value at `index` with `val` and returns +Atomic addition. Atomically adds `val` to the value at `index` and returns the original value. Index is 1-indexed. # Example ```julia -# Spin-lock release -ct.atomic_xchg(locks, idx, Int32(0); memory_order=ct.MemoryOrder.Release) +old_val = ct.atomic_add(counters, idx, Int32(1)) ``` """ -@inline function atomic_xchg(array::TileArray{T}, index, val::T; - memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T} - Intrinsics.atomic_xchg(array, index - One(), val, memory_order, memory_scope) -end +function atomic_add end """ - atomic_add(array::TileArray, index, val; memory_order, memory_scope) -> T + atomic_xchg(array::TileArray, index, val; memory_order, memory_scope) -> T -Atomic addition. 
Atomically adds `val` to the value at `index` and returns +Atomic exchange. Atomically replaces the value at `index` with `val` and returns the original value. Index is 1-indexed. # Example ```julia -old_val = ct.atomic_add(counters, idx, Int32(1)) +# Spin-lock release +ct.atomic_xchg(locks, idx, Int32(0); memory_order=ct.MemoryOrder.Release) ``` """ -@inline function atomic_add(array::TileArray{T}, index, val::T; - memory_order::Int=MemoryOrder.AcqRel, - memory_scope::Int=MemScope.Device) where {T} - Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope) +function atomic_xchg end + +for op in (:add, :xchg) + fname = Symbol(:atomic_, op) + intrinsic = Symbol(:atomic_, op) + + @eval @inline function $fname(array::TileArray{T}, indices, val::TileOrScalar{T}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T} + ptr_tile, mask, S = _atomic_ptr_and_mask(array, indices) + val_bc = S === () ? Tile(val) : broadcast_to(Tile(val), S) + result = Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + S === () ? Intrinsics.to_scalar(result) : result + end end diff --git a/src/language/types.jl b/src/language/types.jl index 66397cac..edc9069e 100644 --- a/src/language/types.jl +++ b/src/language/types.jl @@ -1,5 +1,6 @@ public TileArray, Tile, Constant, TFloat32, similar_type, - ScalarInt, ScalarFloat, TileInt, TileFloat, ScalarOrTileInt, ScalarOrTileFloat + ScalarInt, ScalarFloat, IntTile, FloatTile, TileOrInt, TileOrFloat, + TileOrScalar """ ArraySpec{N} @@ -250,6 +251,9 @@ In kernel code, this is compiled to a ConstantOp. 
Tile{T, Tuple{}}() end +# No-op: pass-through for values already wrapped as Tile +@inline Tile(tile::Tile) = tile + #============================================================================= View Types =============================================================================# @@ -367,16 +371,19 @@ const ScalarInt = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64 const ScalarFloat = Union{Float16, BFloat16, Float32, Float64, TFloat32} """Integer tile types.""" -const TileInt{S} = Tile{T, S} where {T <: ScalarInt} +const IntTile{S} = Tile{T, S} where {T <: ScalarInt} """Floating-point tile types.""" -const TileFloat{S} = Tile{T, S} where {T <: ScalarFloat} +const FloatTile{S} = Tile{T, S} where {T <: ScalarFloat} + +"""Scalar or tile of element type T.""" +const TileOrScalar{T} = Union{T, Tile{T}} """Integer values (scalar or tile).""" -const ScalarOrTileInt = Union{ScalarInt, TileInt} +const TileOrInt = TileOrScalar{<:ScalarInt} """Floating-point values (scalar or tile).""" -const ScalarOrTileFloat = Union{ScalarFloat, TileFloat} +const TileOrFloat = TileOrScalar{<:ScalarFloat} #============================================================================= diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl index 8da55a90..90f2f3b4 100644 --- a/test/codegen/operations.jl +++ b/test/codegen/operations.jl @@ -1384,6 +1384,7 @@ @check_label "entry" code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do locks bid = ct.bid(1) + @check "offset" @check "atomic_cas_tko" old = ct.atomic_cas(locks, bid, Int32(0), Int32(1); memory_order=ct.MemoryOrder.Acquire) @@ -1399,6 +1400,7 @@ @check_label "entry" code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do locks bid = ct.bid(1) + @check "offset" @check "atomic_rmw_tko" ct.atomic_xchg(locks, bid, Int32(0); memory_order=ct.MemoryOrder.Release) @@ -1412,12 +1414,88 @@ @check_label "entry" code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do counters bid = ct.bid(1) + @check "offset" @check 
"atomic_rmw_tko" ct.atomic_add(counters, bid, 1.0f0) return end end end + + @testset "tile-indexed atomic_cas_tko" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_cas_tko" + ct.atomic_cas(arr, indices, Int32(0), Int32(1)) + return + end + end + end + + @testset "tile-indexed 3D atomic_add" begin + spec3d = ct.ArraySpec{3}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,3,spec3d}}) do arr + @check "iota" + i = ct.arange((4,), Int) + j = ct.arange((4,), Int) + k = ct.arange((4,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, (i, j, k), Int32(1)) + return + end + end + end + + @testset "tile-indexed atomic_rmw_tko" begin + spec = ct.ArraySpec{1}(16, true) + # xchg + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_xchg(arr, indices, Int32(42)) + return + end + end + + # add (integer) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, Int32(1)) + return + end + end + + # add (float) + spec_f32 = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, 1.5f0) + return + end + end + end end #========================================================================= diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl index 81ffe193..e757ed87 100644 --- a/test/execution/atomics.jl +++ 
b/test/execution/atomics.jl @@ -166,6 +166,172 @@ end @test result == n_blocks end +# ============================================================================ +# Tile-indexed atomic operations (scatter-gather style indexing) +# ============================================================================ + +@testset "atomic_add tile-indexed 1D" begin + function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::Int) + bid = ct.bid(1) + base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Int, n) + + ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed returns old values" begin + function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_return_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed Float32" begin + function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::Int) + bid = ct.bid(1) + base = (bid - 1) * TILE + indices = base .+ ct.arange((TILE,), Int) + ct.atomic_add(arr, indices, 1.5f0; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Float32, n) + + ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(isapprox.(Array(arr), 1.5f0)) +end + +@testset "atomic_add tile-indexed with tile values" begin + function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1}, + vals::ct.TileArray{Int,1}) + indices = 
ct.arange((16,), Int) + val_tile = ct.gather(vals, indices) + ct.atomic_add(arr, indices, val_tile; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + vals = CuArray(collect(Int, 1:16)) + + ct.launch(atomic_add_tile_val_kernel, 1, arr, vals) + + @test Array(arr) == collect(1:16) +end + +@testset "atomic_xchg tile-indexed" begin + function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + ct.atomic_xchg(arr, indices, 42; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_kernel, 1, arr) + + @test all(Array(arr) .== 42) +end + +@testset "atomic_cas tile-indexed success" begin + function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_tile_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-indexed failure" begin + function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 2; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.fill(Int(1), 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_fail_kernel, 1, arr, out) + + @test all(Array(out) .== 1) # old values returned + @test all(Array(arr) .== 1) # unchanged (CAS failed) +end + +@testset "atomic_add tile-indexed out-of-bounds" begin + function atomic_add_oob_kernel(arr::ct.TileArray{Int,1}) + # Index tile is larger than array — OOB elements should be masked + indices = ct.arange((16,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr 
= CUDA.zeros(Int, 8) + + ct.launch(atomic_add_oob_kernel, 1, arr) + + # Only first 8 elements should be updated + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-indexed 3D" begin + function atomic_add_3d_kernel(arr::ct.TileArray{Int,3}) + # 3D index tiles — each is length 4, will broadcast to (4,4,4) = 64 elements + i = ct.reshape(ct.arange((4,), Int), (4, 1, 1)) + j = ct.reshape(ct.arange((4,), Int), (1, 4, 1)) + k = ct.reshape(ct.arange((4,), Int), (1, 1, 4)) + ct.atomic_add(arr, (i, j, k), 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_add_3d_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1})