From 51358de3bf9a1093c6958d2906626ff3d4ed43c2 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Mon, 23 Feb 2026 18:12:18 +0000 Subject: [PATCH 1/9] Adds better memory clean-up. --- src/AMDGPU.jl | 4 + src/memory.jl | 239 +++++++++++++++++++++++++++++++++--- src/runtime/memory/utils.jl | 4 + 3 files changed, 229 insertions(+), 18 deletions(-) diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 52059b8f1..2964332ec 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -184,6 +184,10 @@ function __init__() if functional(:hip) HIP.devices() + if isinteractive() + _pool_cleanup_task[] = errormonitor( + Threads.@spawn pool_cleanup()) + end else @warn "HIP library is unavailable, HIP integration will be disabled." if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0")) diff --git a/src/memory.jl b/src/memory.jl index 6da3b2f0f..8bdc3309c 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -97,6 +97,37 @@ function soft_memory_limit() SOFT_MEMORY_LIMIT[] = soft_limit end + +## allocation statistics + +mutable struct AllocStats + Base.@atomic alloc_count::Int + Base.@atomic alloc_bytes::Int + + Base.@atomic free_count::Int + Base.@atomic free_bytes::Int + + Base.@atomic total_time::Float64 +end + +AllocStats() = AllocStats(0, 0, 0, 0, 0.0) + +Base.copy(s::AllocStats) = + AllocStats(s.alloc_count, s.alloc_bytes, + s.free_count, s.free_bytes, s.total_time) + +Base.:(-)(a::AllocStats, b::AllocStats) = (; + alloc_count = a.alloc_count - b.alloc_count, + alloc_bytes = a.alloc_bytes - b.alloc_bytes, + free_count = a.free_count - b.free_count, + free_bytes = a.free_bytes - b.free_bytes, + total_time = a.total_time - b.total_time) + +const alloc_stats = AllocStats() + + +## memory accounting + mutable struct MemoryStats # Maximum size of the heap. # Estimated during `maybe_collect` stage. @@ -134,7 +165,7 @@ function account!(stats::MemoryStats, bytes::Integer) Base.@atomic stats.live += bytes end -const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", false)) +const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", true)) function eager_gc!(flag::Bool) global EAGER_GC[] = flag @@ -178,38 +209,191 @@ function maybe_collect(; blocking::Bool = false) min_pressure = blocking ? 0.5 : 0.75 pressure < min_pressure && return - # TODO take allocations into account - # if pressure is high but we didn't allocate - don't collect - # otherwise try hard - # Check that we don't collect too often. gc_rate = stats.last_gc_time / (current_time - stats.last_time) - # Tolerate 5% GC time. max_gc_rate = 0.05 - # If freed a lot of memory last time, double max GC rate. (stats.last_freed > 0.1 * stats.size) && (max_gc_rate *= 2;) - # Be more aggressive if we are going to block. blocking && (max_gc_rate *= 2;) - # And even more if the pressure is high. pressure > 0.9 && (max_gc_rate *= 2;) pressure > 0.95 && (max_gc_rate *= 2;) gc_rate > max_gc_rate && return Base.@atomic stats.last_time = current_time - # Call the GC. pre_gc_live = stats.live gc_time = Base.@elapsed GC.gc(false) post_gc_live = stats.live - # Update stats. freed = pre_gc_live - post_gc_live Base.@atomic stats.last_freed = freed Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time return end + +## pool activity tracking + +const POOL_STATUS = AMDGPU.LockedObject(Dict{Int, Ref{Bool}}()) + +function pool_mark(dev::HIPDevice) + ps = POOL_STATUS.payload + did = HIP.device_id(dev) + status = get(ps, did, nothing) + status === nothing && return nothing + return status[] +end + +function pool_mark!(dev::HIPDevice, val::Bool) + ps = POOL_STATUS.payload + did = HIP.device_id(dev) + box = get(ps, did, nothing) + if box === nothing + Base.@lock POOL_STATUS.lock begin + box = get!(ps, did) do + Ref{Bool}(val) + end + end + end + box[] = val + return +end + + +## reclaim hooks + +""" + reclaim_hooks + +A list of callables that are invoked when memory needs to be reclaimed. +Downstream packages can push functions into this list to free cached resources +(e.g., workspace buffers, FFT plans, etc.) when GPU memory is scarce. +""" +const reclaim_hooks = Any[] + + +## pool cleanup + +const _pool_cleanup_task = Ref{Task}() + +function pool_cleanup() + idle_counters = Dict{Int, Int}() + while true + try + sleep(60) + catch ex + if ex isa EOFError + break + else + rethrow() + end + end + + for dev in HIP.devices() + did = HIP.device_id(dev) + status = pool_mark(dev) + status === nothing && continue + + if status + idle_counters[did] = 0 + else + idle_counters[did] = get(idle_counters, did, 0) + 1 + end + pool_mark!(dev, false) + + if get(idle_counters, did, 0) >= 5 + HIP.device!(dev) do + reclaim() + end + end + end + end +end + + +## reclaim + +""" + reclaim([sz=typemax(Int)]) + +Reclaims `sz` bytes of cached memory. Use this to free GPU memory before +calling into functionality that does not use the memory pool. Returns the +number of bytes actually reclaimed. +""" +function reclaim(sz::Int=typemax(Int)) + dev = AMDGPU.device() + for hook in reclaim_hooks + hook() + end + HIP.device_synchronize() + pool = Mem.pool_create(dev) + before = HIP.reserved_memory(pool) + HIP.trim(pool) + after = HIP.reserved_memory(pool) + return Int(before - after) +end + + +## pool status & queries + +""" + used_memory() + +Returns the amount of memory from the HIP memory pool that is currently +in use by the application. +""" +function used_memory() + pool = Mem.pool_create(AMDGPU.device()) + Int(HIP.used_memory(pool)) +end + +""" + cached_memory() + +Returns the amount of backing memory currently allocated (reserved) for the +HIP memory pool. +""" +function cached_memory() + pool = Mem.pool_create(AMDGPU.device()) + Int(HIP.reserved_memory(pool)) +end + +""" + pool_status([io=stdout]) + +Report to `io` on the memory status of the current GPU and the active memory pool. +""" +function pool_status(io::IO=stdout) + free_bytes, total_bytes = info() + used_bytes = total_bytes - free_bytes + used_ratio = used_bytes / total_bytes + @printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n", + 100*used_ratio, Base.format_bytes(used_bytes), + Base.format_bytes(total_bytes)) + + pool = Mem.pool_create(AMDGPU.device()) + pool_used = HIP.used_memory(pool) + pool_reserved = HIP.reserved_memory(pool) + @printf(io, "Memory pool usage: %s (%s reserved)\n", + Base.format_bytes(pool_used), + Base.format_bytes(pool_reserved)) + + hard_limit = hard_memory_limit() + soft_limit = soft_memory_limit() + if hard_limit != typemax(UInt64) || soft_limit != typemax(UInt64) + print(io, "Memory limit: ") + parts = String[] + if soft_limit != typemax(UInt64) + push!(parts, "soft = $(Base.format_bytes(soft_limit))") + end + if hard_limit != typemax(UInt64) + push!(parts, "hard = $(Base.format_bytes(hard_limit))") + end + println(io, join(parts, ", ")) + end +end + + # TODO handle stream capturing when we support HIP graphs mutable struct Managed{M} const mem::M @@ -275,16 +459,35 @@ function Base.convert(::Type{Mem.AbstractAMDBuffer}, managed::Managed{M}) where end function pool_alloc(::Type{B}, bytesize) where B - s = AMDGPU.stream() - # @info "[pool_alloc] $(Base.format_bytes(bytesize))" - # display(stacktrace()); println() - # println() - # println() - Managed(B(bytesize; stream=s); stream=s) + maybe_collect() + time = Base.@elapsed begin + s = AMDGPU.stream() + managed = Managed(B(bytesize; stream=s); stream=s) + end + + Base.@atomic alloc_stats.alloc_count += 1 + Base.@atomic alloc_stats.alloc_bytes += bytesize + Base.@atomic alloc_stats.total_time += time + + pool_mark!(AMDGPU.device(), true) + return managed end function pool_free(managed::Managed{M}) where M - _pool_free(managed.mem, managed.stream) + sz = sizeof(managed.mem) + sz == 0 && return + + try + time = Base.@elapsed _pool_free(managed.mem, managed.stream) + Base.@atomic alloc_stats.free_count += 1 + Base.@atomic alloc_stats.free_bytes += sz + Base.@atomic alloc_stats.total_time += time + catch ex + Base.showerror_nostdio(ex, "WARNING: Error while freeing $(managed.mem)") + Base.show_backtrace(Core.stdout, catch_backtrace()) + Core.println() + end + return end function _pool_free(buf, stream::HIPStream) diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl index f87d94329..1ce65a7ab 100644 --- a/src/runtime/memory/utils.jl +++ b/src/runtime/memory/utils.jl @@ -16,6 +16,10 @@ function alloc_or_retry!(f, isfailed; stream::HIP.HIPStream) HIP.device_synchronize() elseif phase == 5 HIP.trim(HIP.memory_pool(stream.device)) + elseif phase == 6 + for hook in AMDGPU.reclaim_hooks + hook() + end else break end From 46ae045b74e7d5d58dc5297cebdf1a6b4d71a2d6 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Mon, 23 Feb 2026 18:27:06 +0000 Subject: [PATCH 2/9] ... --- src/memory.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/memory.jl b/src/memory.jl index 8bdc3309c..04da5220e 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -474,7 +474,7 @@ function pool_alloc(::Type{B}, bytesize) where B end function pool_free(managed::Managed{M}) where M - sz = sizeof(managed.mem) + sz = Int(sizeof(managed.mem)) sz == 0 && return try @@ -483,7 +483,8 @@ function pool_free(managed::Managed{M}) where M Base.@atomic alloc_stats.free_bytes += sz Base.@atomic alloc_stats.total_time += time catch ex - Base.showerror_nostdio(ex, "WARNING: Error while freeing $(managed.mem)") + Base.showerror_nostdio(ex, + "WARNING: Error while freeing $(Base.format_bytes(sz)) of GPU memory") Base.show_backtrace(Core.stdout, catch_backtrace()) Core.println() end From 93fe6c6783708cbb3c2d95be726bd40d94139df7 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Mon, 23 Feb 2026 18:31:41 +0000 Subject: [PATCH 3/9] Properly parse the arch string. --- test/wmma_tests.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/wmma_tests.jl b/test/wmma_tests.jl index b26d484e2..1e1a01fcc 100644 --- a/test/wmma_tests.jl +++ b/test/wmma_tests.jl @@ -8,7 +8,8 @@ using AMDGPU.Device: WMMA_M, WMMA_N, WMMA_K, workitemIdx, workgroupIdx AMDGPU.allowscalar(false) # Only run WMMA tests on RDNA3+ (gfx1100+) -is_rdna3 = parse(Int, AMDGPU.HIP.gcn_arch(AMDGPU.device())[4:end]) >= 1100 +_arch_str = first(split(AMDGPU.HIP.gcn_arch(AMDGPU.device()), ':')) +is_rdna3 = parse(Int, _arch_str[4:end]) >= 1100 if !is_rdna3 @info "Skipping WMMA tests (requires RDNA3+)" else From 0cf7f7d87cdc049fbb392a544ad91695dcfd0228 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Tue, 24 Feb 2026 12:48:25 +0000 Subject: [PATCH 4/9] Restore comments. --- src/memory.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/memory.jl b/src/memory.jl index 04da5220e..4bcd07da5 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -209,22 +209,32 @@ function maybe_collect(; blocking::Bool = false) min_pressure = blocking ? 0.5 : 0.75 pressure < min_pressure && return + # TODO take allocations into account + # if pressure is high but we didn't allocate - don't collect + # otherwise try hard + # Check that we don't collect too often. gc_rate = stats.last_gc_time / (current_time - stats.last_time) + # Tolerate 5% GC time. max_gc_rate = 0.05 + # If freed a lot of memory last time, double max GC rate. (stats.last_freed > 0.1 * stats.size) && (max_gc_rate *= 2;) + # Be more aggressive if we are going to block. blocking && (max_gc_rate *= 2;) + # And even more if the pressure is high. pressure > 0.9 && (max_gc_rate *= 2;) pressure > 0.95 && (max_gc_rate *= 2;) gc_rate > max_gc_rate && return Base.@atomic stats.last_time = current_time + # Call the GC. pre_gc_live = stats.live gc_time = Base.@elapsed GC.gc(false) post_gc_live = stats.live + # Update stats. freed = pre_gc_live - post_gc_live Base.@atomic stats.last_freed = freed Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time From fc74b77b78a2c14150cbd9d66b9726665b00e86f Mon Sep 17 00:00:00 2001 From: neoblizz Date: Tue, 24 Feb 2026 12:53:20 +0000 Subject: [PATCH 5/9] Only start when memory is being used, and non-interactive batch jobs skip it (same as CUDA.jl) --- src/AMDGPU.jl | 4 ---- src/memory.jl | 5 +++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 2964332ec..52059b8f1 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -184,10 +184,6 @@ function __init__() if functional(:hip) HIP.devices() - if isinteractive() - _pool_cleanup_task[] = errormonitor( - Threads.@spawn pool_cleanup()) - end else @warn "HIP library is unavailable, HIP integration will be disabled." if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0")) diff --git a/src/memory.jl b/src/memory.jl index 4bcd07da5..8a40d1b0e 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -480,6 +480,11 @@ function pool_alloc(::Type{B}, bytesize) where B Base.@atomic alloc_stats.total_time += time pool_mark!(AMDGPU.device(), true) + + if isinteractive() && !isassigned(_pool_cleanup_task) + _pool_cleanup_task[] = errormonitor(Threads.@spawn pool_cleanup()) + end + return managed end From 2bfbd1e0b6ee75ae6255c4aca2efcc38aac37f54 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Tue, 24 Feb 2026 13:00:05 +0000 Subject: [PATCH 6/9] Moving arch-string fix to a separate PR. --- test/wmma_tests.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/wmma_tests.jl b/test/wmma_tests.jl index 1e1a01fcc..b26d484e2 100644 --- a/test/wmma_tests.jl +++ b/test/wmma_tests.jl @@ -8,8 +8,7 @@ using AMDGPU.Device: WMMA_M, WMMA_N, WMMA_K, workitemIdx, workgroupIdx AMDGPU.allowscalar(false) # Only run WMMA tests on RDNA3+ (gfx1100+) -_arch_str = first(split(AMDGPU.HIP.gcn_arch(AMDGPU.device()), ':')) -is_rdna3 = parse(Int, _arch_str[4:end]) >= 1100 +is_rdna3 = parse(Int, AMDGPU.HIP.gcn_arch(AMDGPU.device())[4:end]) >= 1100 if !is_rdna3 @info "Skipping WMMA tests (requires RDNA3+)" else From 544eb7c5594e6812a6c0de7be8631ac1e76658c7 Mon Sep 17 00:00:00 2001 From: neoblizz Date: Wed, 25 Feb 2026 00:05:05 +0000 Subject: [PATCH 7/9] Major change: Use MallocFromPool, Separate out register/unregister and free/alloc to avoid memory leaks. --- src/runtime/memory/hip.jl | 135 ++++++++++++++++++++++++++++---------- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index c444d2e34..88abc4be8 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -39,22 +39,23 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream) dev, ctx = stream.device, stream.ctx bytesize == 0 && return HIPBuffer(dev, ctx, C_NULL, 0, true) - AMDGPU.maybe_collect() pool = pool_create(dev) ptr_ref = Ref{Ptr{Cvoid}}() ptr = alloc_or_retry!(isnothing; stream) do try - # Try to allocate. - HIP.hipMallocAsync(ptr_ref, bytesize, stream) - # HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) - + HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) ptr = ptr_ref[] ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) return ptr catch err - # TODO rethrow if not out of memory error - @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace()) + # Only retry for OOM. Any other error (e.g. hipErrorLaunchFailure + # from a prior kernel exception surfacing here) must propagate + # immediately so the caller sees the real cause. + err isa HIP.HIPError || rethrow(err) + err.code in (HIP.hipErrorOutOfMemory, HIP.hipErrorMemoryAllocation) || + rethrow(err) + @debug "hipMallocFromPoolAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace()) return nothing end end @@ -105,6 +106,12 @@ function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HI return end +## Host memory pinning state + +const __pin_lock = ReentrantLock() +const __pinned_memory = Dict{Ptr{Cvoid}, Int}() # ptr => bytesize +const __pin_count = Dict{Ptr{Cvoid}, Int}() # ptr => refcount + struct HostBuffer <: AbstractAMDBuffer device::HIPDevice ctx::HIPContext @@ -119,6 +126,10 @@ function HostBuffer() HostBuffer(s.device, s.ctx, C_NULL, C_NULL, 0, true) end +""" +Allocate host-pinned memory via `hipHostMalloc`. +Freed by `hipHostFree` through [`free`](@ref). +""" function HostBuffer( bytesize::Integer, flags = 0; stream::HIP.HIPStream = AMDGPU.stream(), ) @@ -131,11 +142,15 @@ function HostBuffer( HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, bytesize, true) end +""" +Register (pin) an existing host pointer via `hipHostRegister`. +Freed by `hipHostUnregister` through [`free`](@ref). +""" function HostBuffer( ptr::Ptr{Cvoid}, sz::Integer; stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false, ) - pin(ptr, sz) + register(ptr, sz) dev_ptr = get_device_ptr(ptr) HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, sz, own) end @@ -180,10 +195,11 @@ Base.convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr) function free(buf::HostBuffer; kwargs...) buf.own || return buf.ptr == C_NULL && return - unpin(buf.ptr) - # TODO - # call HIP.hipHostFree(buf) if memory was allocated via hipHostMalloc - # or is unpinning enough? + if is_registered(buf.ptr) + unregister(buf.ptr) + else + HIP.hipHostFree(buf.ptr) + end return end @@ -194,46 +210,95 @@ function get_device_ptr(ptr::Ptr{Cvoid}) ptr_ref[] end -function pin(ptr, sz) - ptr == C_NULL && error("Cannot pin `NULL` pointer.") +function attributes(ptr) + data = Ref{HIP.hipPointerAttribute_t}() + HIP.hipPointerGetAttributes(data, ptr) + return data[] +end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) - elseif memtype == HIP.hipMemoryTypeHost - # Already pinned. - else - error("Cannot pin pointer with memory type `$memtype`.") + +""" + register(ptr::Ptr{Cvoid}, sz::Integer) + +Page-lock host memory at `ptr` via `hipHostRegister` with refcounting. +Subsequent calls on the same pointer increment the refcount; the +actual `hipHostUnregister` only happens when the count drops to zero +via [`unregister`](@ref). +""" +function register(ptr::Ptr{Cvoid}, sz::Integer) + ptr == C_NULL && error("Cannot register `NULL` pointer.") + + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + if count > 0 + __pin_count[ptr] = count + 1 + return + end + + memtype = attributes(ptr).type + if memtype == HIP.hipMemoryTypeUnregistered + HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) + __pinned_memory[ptr] = Int(sz) + __pin_count[ptr] = 1 + elseif memtype == HIP.hipMemoryTypeHost + # Already pinned externally (e.g. hipHostMalloc); nothing to track. + else + error("Cannot register pointer with memory type `$memtype`.") + end end return end -function unpin(ptr) - ptr == C_NULL && error("Cannot unpin `NULL` pointer.") +""" + unregister(ptr::Ptr{Cvoid}) + +Decrement the refcount for `ptr`. When it reaches zero the underlying +`hipHostUnregister` call is issued and tracking state is cleaned up. +""" +function unregister(ptr::Ptr{Cvoid}) + ptr == C_NULL && error("Cannot unregister `NULL` pointer.") + + do_unregister = false + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + count == 0 && error("Cannot unregister untracked pointer $ptr.") + + if count == 1 + delete!(__pinned_memory, ptr) + delete!(__pin_count, ptr) + do_unregister = true + else + __pin_count[ptr] = count - 1 + end + end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - # Already unpinned. - elseif memtype == HIP.hipMemoryTypeHost + if do_unregister HIP.hipHostUnregister(ptr) - else - error("Cannot unpin pointer with memory type `$memtype`.") end return end +""" + is_registered(ptr::Ptr{Cvoid}) -> Bool + +Return `true` if `ptr` is tracked as registered (pinned) memory that +should be freed via `hipHostUnregister` rather than `hipHostFree`. +""" +function is_registered(ptr::Ptr{Cvoid}) + Base.@lock __pin_lock begin + haskey(__pin_count, ptr) + end +end + +pin(ptr, sz) = register(Ptr{Cvoid}(ptr), sz) +unpin(ptr) = unregister(Ptr{Cvoid}(ptr)) + function is_pinned(ptr) ptr == C_NULL && return false data = attributes(ptr) return data.type == HIP.hipMemoryTypeHost end -function attributes(ptr) - data = Ref{HIP.hipPointerAttribute_t}() - HIP.hipPointerGetAttributes(data, ptr) - return data[] -end - """ Asynchronous 3D array copy. From 9f84735521a0cc7ab385af28eff37a3f132d171c Mon Sep 17 00:00:00 2001 From: neoblizz Date: Wed, 25 Feb 2026 00:05:50 +0000 Subject: [PATCH 8/9] Update with proper unregister. --- test/core/rocarray_base.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/core/rocarray_base.jl b/test/core/rocarray_base.jl index 2cb2d935d..b188df383 100644 --- a/test/core/rocarray_base.jl +++ b/test/core/rocarray_base.jl @@ -142,12 +142,15 @@ end @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd1))) == true @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Refcounted: first free decrements the pin count but memory stays pinned. AMDGPU.unsafe_free!(xd1) @test_throws ArgumentError pointer(xd1) - @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == false + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Second free drops refcount to zero and actually unregisters. AMDGPU.unsafe_free!(xd2) @test_throws ArgumentError pointer(xd2) + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(x))) == false end @testset "Broadcasting different buffer types" begin From 4ba5b7b6d2783c7981067b142fdbb5a8c4d03405 Mon Sep 17 00:00:00 2001 From: Muhammad Osama Date: Sat, 28 Feb 2026 14:30:47 -0600 Subject: [PATCH 9/9] Update src/runtime/memory/hip.jl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Ludovic Räss <61313342+luraess@users.noreply.github.com> --- src/runtime/memory/hip.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index 88abc4be8..5645b50ba 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -110,7 +110,7 @@ end const __pin_lock = ReentrantLock() const __pinned_memory = Dict{Ptr{Cvoid}, Int}() # ptr => bytesize -const __pin_count = Dict{Ptr{Cvoid}, Int}() # ptr => refcount +const __pin_count = Dict{Ptr{Cvoid}, Int}() # ptr => refcount struct HostBuffer <: AbstractAMDBuffer device::HIPDevice