From 51358de3bf9a1093c6958d2906626ff3d4ed43c2 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Mon, 23 Feb 2026 18:12:18 +0000
Subject: [PATCH 1/9] Adds better memory clean-up.

---
 src/AMDGPU.jl               |   4 +
 src/memory.jl               | 239 +++++++++++++++++++++++++++++++++---
 src/runtime/memory/utils.jl |   4 +
 3 files changed, 229 insertions(+), 18 deletions(-)

diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 52059b8f1..2964332ec 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -184,6 +184,10 @@ function __init__()
 
     if functional(:hip)
         HIP.devices()
+        if isinteractive()
+            _pool_cleanup_task[] = errormonitor(
+                Threads.@spawn pool_cleanup())
+        end
     else
         @warn "HIP library is unavailable, HIP integration will be disabled."
         if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0"))
diff --git a/src/memory.jl b/src/memory.jl
index 6da3b2f0f..8bdc3309c 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -97,6 +97,37 @@ function soft_memory_limit()
     SOFT_MEMORY_LIMIT[] = soft_limit
 end
 
+
+## allocation statistics
+
+mutable struct AllocStats
+    Base.@atomic alloc_count::Int
+    Base.@atomic alloc_bytes::Int
+
+    Base.@atomic free_count::Int
+    Base.@atomic free_bytes::Int
+
+    Base.@atomic total_time::Float64
+end
+
+AllocStats() = AllocStats(0, 0, 0, 0, 0.0)
+
+Base.copy(s::AllocStats) =
+    AllocStats(s.alloc_count, s.alloc_bytes,
+               s.free_count, s.free_bytes, s.total_time)
+
+Base.:(-)(a::AllocStats, b::AllocStats) = (;
+    alloc_count = a.alloc_count - b.alloc_count,
+    alloc_bytes = a.alloc_bytes - b.alloc_bytes,
+    free_count  = a.free_count  - b.free_count,
+    free_bytes  = a.free_bytes  - b.free_bytes,
+    total_time  = a.total_time  - b.total_time)
+
+const alloc_stats = AllocStats()
+
+
+## memory accounting
+
 mutable struct MemoryStats
     # Maximum size of the heap.
     # Estimated during `maybe_collect` stage.
@@ -134,7 +165,7 @@ function account!(stats::MemoryStats, bytes::Integer)
     Base.@atomic stats.live += bytes
 end
 
-const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", false))
+const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", true))
 
 function eager_gc!(flag::Bool)
     global EAGER_GC[] = flag
@@ -178,38 +209,191 @@ function maybe_collect(; blocking::Bool = false)
     min_pressure = blocking ? 0.5 : 0.75
     pressure < min_pressure && return
 
-    # TODO take allocations into account
-    #   if pressure is high but we didn't allocate - don't collect
-    #   otherwise try hard
-
     # Check that we don't collect too often.
     gc_rate = stats.last_gc_time / (current_time - stats.last_time)
-    # Tolerate 5% GC time.
     max_gc_rate = 0.05
-    # If freed a lot of memory last time, double max GC rate.
     (stats.last_freed > 0.1 * stats.size) && (max_gc_rate *= 2;)
-    # Be more aggressive if we are going to block.
     blocking && (max_gc_rate *= 2;)
 
-    # And even more if the pressure is high.
     pressure > 0.9 && (max_gc_rate *= 2;)
     pressure > 0.95 && (max_gc_rate *= 2;)
     gc_rate > max_gc_rate && return
 
     Base.@atomic stats.last_time = current_time
 
-    # Call the GC.
     pre_gc_live = stats.live
     gc_time = Base.@elapsed GC.gc(false)
     post_gc_live = stats.live
 
-    # Update stats.
     freed = pre_gc_live - post_gc_live
     Base.@atomic stats.last_freed = freed
     Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time
     return
 end
 
+
+## pool activity tracking
+
+const POOL_STATUS = AMDGPU.LockedObject(Dict{Int, Ref{Bool}}())
+
+function pool_mark(dev::HIPDevice)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    status = get(ps, did, nothing)
+    status === nothing && return nothing
+    return status[]
+end
+
+function pool_mark!(dev::HIPDevice, val::Bool)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    box = get(ps, did, nothing)
+    if box === nothing
+        Base.@lock POOL_STATUS.lock begin
+            box = get!(ps, did) do
+                Ref{Bool}(val)
+            end
+        end
+    end
+    box[] = val
+    return
+end
+
+
+## reclaim hooks
+
+"""
+    reclaim_hooks
+
+A list of callables that are invoked when memory needs to be reclaimed.
+Downstream packages can push functions into this list to free cached resources
+(e.g., workspace buffers, FFT plans, etc.) when GPU memory is scarce.
+"""
+const reclaim_hooks = Any[]
+
+
+## pool cleanup
+
+const _pool_cleanup_task = Ref{Task}()
+
+function pool_cleanup()
+    idle_counters = Dict{Int, Int}()
+    while true
+        try
+            sleep(60)
+        catch ex
+            if ex isa EOFError
+                break
+            else
+                rethrow()
+            end
+        end
+
+        for dev in HIP.devices()
+            did = HIP.device_id(dev)
+            status = pool_mark(dev)
+            status === nothing && continue
+
+            if status
+                idle_counters[did] = 0
+            else
+                idle_counters[did] = get(idle_counters, did, 0) + 1
+            end
+            pool_mark!(dev, false)
+
+            if get(idle_counters, did, 0) >= 5
+                HIP.device!(dev) do
+                    reclaim()
+                end
+            end
+        end
+    end
+end
+
+
+## reclaim
+
+"""
+    reclaim([sz=typemax(Int)])
+
+Reclaims `sz` bytes of cached memory. Use this to free GPU memory before
+calling into functionality that does not use the memory pool. Returns the
+number of bytes actually reclaimed.
+"""
+function reclaim(sz::Int=typemax(Int))
+    dev = AMDGPU.device()
+    for hook in reclaim_hooks
+        hook()
+    end
+    HIP.device_synchronize()
+    pool = Mem.pool_create(dev)
+    before = HIP.reserved_memory(pool)
+    HIP.trim(pool)
+    after = HIP.reserved_memory(pool)
+    return Int(before - after)
+end
+
+
+## pool status & queries
+
+"""
+    used_memory()
+
+Returns the amount of memory from the HIP memory pool that is currently
+in use by the application.
+"""
+function used_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.used_memory(pool))
+end
+
+"""
+    cached_memory()
+
+Returns the amount of backing memory currently allocated (reserved) for the
+HIP memory pool.
+"""
+function cached_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.reserved_memory(pool))
+end
+
+"""
+    pool_status([io=stdout])
+
+Report to `io` on the memory status of the current GPU and the active memory pool.
+"""
+function pool_status(io::IO=stdout)
+    free_bytes, total_bytes = info()
+    used_bytes = total_bytes - free_bytes
+    used_ratio = used_bytes / total_bytes
+    @printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n",
+            100*used_ratio, Base.format_bytes(used_bytes),
+            Base.format_bytes(total_bytes))
+
+    pool = Mem.pool_create(AMDGPU.device())
+    pool_used = HIP.used_memory(pool)
+    pool_reserved = HIP.reserved_memory(pool)
+    @printf(io, "Memory pool usage: %s (%s reserved)\n",
+            Base.format_bytes(pool_used),
+            Base.format_bytes(pool_reserved))
+
+    hard_limit = hard_memory_limit()
+    soft_limit = soft_memory_limit()
+    if hard_limit != typemax(UInt64) || soft_limit != typemax(UInt64)
+        print(io, "Memory limit: ")
+        parts = String[]
+        if soft_limit != typemax(UInt64)
+            push!(parts, "soft = $(Base.format_bytes(soft_limit))")
+        end
+        if hard_limit != typemax(UInt64)
+            push!(parts, "hard = $(Base.format_bytes(hard_limit))")
+        end
+        println(io, join(parts, ", "))
+    end
+end
+
+
 # TODO handle stream capturing when we support HIP graphs
 mutable struct Managed{M}
     const mem::M
@@ -275,16 +459,35 @@ function Base.convert(::Type{Mem.AbstractAMDBuffer}, managed::Managed{M}) where
 end
 
 function pool_alloc(::Type{B}, bytesize) where B
-    s = AMDGPU.stream()
-    # @info "[pool_alloc] $(Base.format_bytes(bytesize))"
-    # display(stacktrace()); println()
-    # println()
-    # println()
-    Managed(B(bytesize; stream=s); stream=s)
+    maybe_collect()
+    time = Base.@elapsed begin
+        s = AMDGPU.stream()
+        managed = Managed(B(bytesize; stream=s); stream=s)
+    end
+
+    Base.@atomic alloc_stats.alloc_count += 1
+    Base.@atomic alloc_stats.alloc_bytes += bytesize
+    Base.@atomic alloc_stats.total_time += time
+
+    pool_mark!(AMDGPU.device(), true)
+    return managed
 end
 
 function pool_free(managed::Managed{M}) where M
-    _pool_free(managed.mem, managed.stream)
+    sz = sizeof(managed.mem)
+    sz == 0 && return
+
+    try
+        time = Base.@elapsed _pool_free(managed.mem, managed.stream)
+        Base.@atomic alloc_stats.free_count += 1
+        Base.@atomic alloc_stats.free_bytes += sz
+        Base.@atomic alloc_stats.total_time += time
+    catch ex
+        Base.showerror_nostdio(ex, "WARNING: Error while freeing $(managed.mem)")
+        Base.show_backtrace(Core.stdout, catch_backtrace())
+        Core.println()
+    end
+    return
 end
 
 function _pool_free(buf, stream::HIPStream)
diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl
index f87d94329..1ce65a7ab 100644
--- a/src/runtime/memory/utils.jl
+++ b/src/runtime/memory/utils.jl
@@ -16,6 +16,10 @@ function alloc_or_retry!(f, isfailed; stream::HIP.HIPStream)
             HIP.device_synchronize()
         elseif phase == 5
             HIP.trim(HIP.memory_pool(stream.device))
+        elseif phase == 6
+            for hook in AMDGPU.reclaim_hooks
+                hook()
+            end
         else
             break
         end

From 46ae045b74e7d5d58dc5297cebdf1a6b4d71a2d6 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Mon, 23 Feb 2026 18:27:06 +0000
Subject: [PATCH 2/9] ...

---
 src/memory.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/memory.jl b/src/memory.jl
index 8bdc3309c..04da5220e 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -474,7 +474,7 @@ function pool_alloc(::Type{B}, bytesize) where B
 end
 
 function pool_free(managed::Managed{M}) where M
-    sz = sizeof(managed.mem)
+    sz = Int(sizeof(managed.mem))
     sz == 0 && return
 
     try
@@ -483,7 +483,8 @@ function pool_free(managed::Managed{M}) where M
         Base.@atomic alloc_stats.free_bytes += sz
         Base.@atomic alloc_stats.total_time += time
     catch ex
-        Base.showerror_nostdio(ex, "WARNING: Error while freeing $(managed.mem)")
+        Base.showerror_nostdio(ex,
+            "WARNING: Error while freeing $(Base.format_bytes(sz)) of GPU memory")
         Base.show_backtrace(Core.stdout, catch_backtrace())
         Core.println()
     end

From 93fe6c6783708cbb3c2d95be726bd40d94139df7 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Mon, 23 Feb 2026 18:31:41 +0000
Subject: [PATCH 3/9] Properly parse the arch string.

---
 test/wmma_tests.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/wmma_tests.jl b/test/wmma_tests.jl
index b26d484e2..1e1a01fcc 100644
--- a/test/wmma_tests.jl
+++ b/test/wmma_tests.jl
@@ -8,7 +8,8 @@ using AMDGPU.Device: WMMA_M, WMMA_N, WMMA_K, workitemIdx, workgroupIdx
 AMDGPU.allowscalar(false)
 
 # Only run WMMA tests on RDNA3+ (gfx1100+)
-is_rdna3 = parse(Int, AMDGPU.HIP.gcn_arch(AMDGPU.device())[4:end]) >= 1100
+_arch_str = first(split(AMDGPU.HIP.gcn_arch(AMDGPU.device()), ':'))
+is_rdna3 = parse(Int, _arch_str[4:end]) >= 1100
 if !is_rdna3
     @info "Skipping WMMA tests (requires RDNA3+)"
 else

From 0cf7f7d87cdc049fbb392a544ad91695dcfd0228 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Tue, 24 Feb 2026 12:48:25 +0000
Subject: [PATCH 4/9] Restore comments.

---
 src/memory.jl | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/memory.jl b/src/memory.jl
index 04da5220e..4bcd07da5 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -209,22 +209,32 @@ function maybe_collect(; blocking::Bool = false)
     min_pressure = blocking ? 0.5 : 0.75
     pressure < min_pressure && return
 
+    # TODO take allocations into account
+    #   if pressure is high but we didn't allocate - don't collect
+    #   otherwise try hard
+
     # Check that we don't collect too often.
     gc_rate = stats.last_gc_time / (current_time - stats.last_time)
+    # Tolerate 5% GC time.
     max_gc_rate = 0.05
+    # If freed a lot of memory last time, double max GC rate.
     (stats.last_freed > 0.1 * stats.size) && (max_gc_rate *= 2;)
+    # Be more aggressive if we are going to block.
     blocking && (max_gc_rate *= 2;)
 
+    # And even more if the pressure is high.
     pressure > 0.9 && (max_gc_rate *= 2;)
     pressure > 0.95 && (max_gc_rate *= 2;)
     gc_rate > max_gc_rate && return
 
     Base.@atomic stats.last_time = current_time
 
+    # Call the GC.
     pre_gc_live = stats.live
     gc_time = Base.@elapsed GC.gc(false)
     post_gc_live = stats.live
 
+    # Update stats.
     freed = pre_gc_live - post_gc_live
     Base.@atomic stats.last_freed = freed
     Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time

From fc74b77b78a2c14150cbd9d66b9726665b00e86f Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Tue, 24 Feb 2026 12:53:20 +0000
Subject: [PATCH 5/9] Only start when memory is being used, and non-interactive
 batch jobs skip it (same as CUDA.jl)

---
 src/AMDGPU.jl | 4 ----
 src/memory.jl | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 2964332ec..52059b8f1 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -184,10 +184,6 @@ function __init__()
 
     if functional(:hip)
         HIP.devices()
-        if isinteractive()
-            _pool_cleanup_task[] = errormonitor(
-                Threads.@spawn pool_cleanup())
-        end
     else
         @warn "HIP library is unavailable, HIP integration will be disabled."
         if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0"))
diff --git a/src/memory.jl b/src/memory.jl
index 4bcd07da5..8a40d1b0e 100644
--- a/src/memory.jl
+++ b/src/memory.jl
@@ -480,6 +480,11 @@ function pool_alloc(::Type{B}, bytesize) where B
     Base.@atomic alloc_stats.total_time += time
 
     pool_mark!(AMDGPU.device(), true)
+
+    if isinteractive() && !isassigned(_pool_cleanup_task)
+        _pool_cleanup_task[] = errormonitor(Threads.@spawn pool_cleanup())
+    end
+
     return managed
 end
 

From 2bfbd1e0b6ee75ae6255c4aca2efcc38aac37f54 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Tue, 24 Feb 2026 13:00:05 +0000
Subject: [PATCH 6/9] Moving arch-string fix to a separate PR.

---
 test/wmma_tests.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/wmma_tests.jl b/test/wmma_tests.jl
index 1e1a01fcc..b26d484e2 100644
--- a/test/wmma_tests.jl
+++ b/test/wmma_tests.jl
@@ -8,8 +8,7 @@ using AMDGPU.Device: WMMA_M, WMMA_N, WMMA_K, workitemIdx, workgroupIdx
 AMDGPU.allowscalar(false)
 
 # Only run WMMA tests on RDNA3+ (gfx1100+)
-_arch_str = first(split(AMDGPU.HIP.gcn_arch(AMDGPU.device()), ':'))
-is_rdna3 = parse(Int, _arch_str[4:end]) >= 1100
+is_rdna3 = parse(Int, AMDGPU.HIP.gcn_arch(AMDGPU.device())[4:end]) >= 1100
 if !is_rdna3
     @info "Skipping WMMA tests (requires RDNA3+)"
 else

From 544eb7c5594e6812a6c0de7be8631ac1e76658c7 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Wed, 25 Feb 2026 00:05:05 +0000
Subject: [PATCH 7/9] Major change: Use MallocFromPool, Separate out
 register/unregister and free/alloc to avoid memory leaks.

---
 src/runtime/memory/hip.jl | 135 ++++++++++++++++++++++++++++----------
 1 file changed, 100 insertions(+), 35 deletions(-)

diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
index c444d2e34..88abc4be8 100644
--- a/src/runtime/memory/hip.jl
+++ b/src/runtime/memory/hip.jl
@@ -39,22 +39,23 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream)
     dev, ctx = stream.device, stream.ctx
     bytesize == 0 && return HIPBuffer(dev, ctx, C_NULL, 0, true)
 
-    AMDGPU.maybe_collect()
     pool = pool_create(dev)
 
     ptr_ref = Ref{Ptr{Cvoid}}()
     ptr = alloc_or_retry!(isnothing; stream) do
         try
-            # Try to allocate.
-            HIP.hipMallocAsync(ptr_ref, bytesize, stream)
-            # HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream)
-
+            HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream)
             ptr = ptr_ref[]
             ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
             return ptr
         catch err
-            # TODO rethrow if not out of memory error
-            @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace())
+            # Only retry for OOM. Any other error (e.g. hipErrorLaunchFailure
+            # from a prior kernel exception surfacing here) must propagate
+            # immediately so the caller sees the real cause.
+            err isa HIP.HIPError || rethrow(err)
+            err.code in (HIP.hipErrorOutOfMemory, HIP.hipErrorMemoryAllocation) ||
+                rethrow(err)
+            @debug "hipMallocFromPoolAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace())
             return nothing
         end
     end
@@ -105,6 +106,12 @@ function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HI
     return
 end
 
+## Host memory pinning state
+
+const __pin_lock = ReentrantLock()
+const __pinned_memory = Dict{Ptr{Cvoid}, Int}()   # ptr => bytesize
+const __pin_count    = Dict{Ptr{Cvoid}, Int}()     # ptr => refcount
+
 struct HostBuffer <: AbstractAMDBuffer
     device::HIPDevice
     ctx::HIPContext
@@ -119,6 +126,10 @@ function HostBuffer()
     HostBuffer(s.device, s.ctx, C_NULL, C_NULL, 0, true)
 end
 
+"""
+Allocate host-pinned memory via `hipHostMalloc`.
+Freed by `hipHostFree` through [`free`](@ref).
+"""
 function HostBuffer(
     bytesize::Integer, flags = 0; stream::HIP.HIPStream = AMDGPU.stream(),
 )
@@ -131,11 +142,15 @@ function HostBuffer(
     HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, bytesize, true)
 end
 
+"""
+Register (pin) an existing host pointer via `hipHostRegister`.
+Freed by `hipHostUnregister` through [`free`](@ref).
+"""
 function HostBuffer(
     ptr::Ptr{Cvoid}, sz::Integer;
     stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false,
 )
-    pin(ptr, sz)
+    register(ptr, sz)
     dev_ptr = get_device_ptr(ptr)
     HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, sz, own)
 end
@@ -180,10 +195,11 @@ Base.convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr)
 function free(buf::HostBuffer; kwargs...)
     buf.own || return
     buf.ptr == C_NULL && return
-    unpin(buf.ptr)
-    # TODO
-    # call HIP.hipHostFree(buf) if memory was allocated via hipHostMalloc
-    # or is unpinning enough?
+    if is_registered(buf.ptr)
+        unregister(buf.ptr)
+    else
+        HIP.hipHostFree(buf.ptr)
+    end
     return
 end
 
@@ -194,46 +210,95 @@ function get_device_ptr(ptr::Ptr{Cvoid})
     ptr_ref[]
 end
 
-function pin(ptr, sz)
-    ptr == C_NULL && error("Cannot pin `NULL` pointer.")
+function attributes(ptr)
+    data = Ref{HIP.hipPointerAttribute_t}()
+    HIP.hipPointerGetAttributes(data, ptr)
+    return data[]
+end
 
-    memtype = attributes(ptr).type
-    if memtype == HIP.hipMemoryTypeUnregistered
-        HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped)
-    elseif memtype == HIP.hipMemoryTypeHost
-        # Already pinned.
-    else
-        error("Cannot pin pointer with memory type `$memtype`.")
+
+"""
+    register(ptr::Ptr{Cvoid}, sz::Integer)
+
+Page-lock host memory at `ptr` via `hipHostRegister` with refcounting.
+Subsequent calls on the same pointer increment the refcount; the
+actual `hipHostUnregister` only happens when the count drops to zero
+via [`unregister`](@ref).
+"""
+function register(ptr::Ptr{Cvoid}, sz::Integer)
+    ptr == C_NULL && error("Cannot register `NULL` pointer.")
+
+    Base.@lock __pin_lock begin
+        count = get(__pin_count, ptr, 0)
+        if count > 0
+            __pin_count[ptr] = count + 1
+            return
+        end
+
+        memtype = attributes(ptr).type
+        if memtype == HIP.hipMemoryTypeUnregistered
+            HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped)
+            __pinned_memory[ptr] = Int(sz)
+            __pin_count[ptr] = 1
+        elseif memtype == HIP.hipMemoryTypeHost
+            # Already pinned externally (e.g. hipHostMalloc); nothing to track.
+        else
+            error("Cannot register pointer with memory type `$memtype`.")
+        end
     end
     return
 end
 
-function unpin(ptr)
-    ptr == C_NULL && error("Cannot unpin `NULL` pointer.")
+"""
+    unregister(ptr::Ptr{Cvoid})
+
+Decrement the refcount for `ptr`. When it reaches zero the underlying
+`hipHostUnregister` call is issued and tracking state is cleaned up.
+"""
+function unregister(ptr::Ptr{Cvoid})
+    ptr == C_NULL && error("Cannot unregister `NULL` pointer.")
+
+    do_unregister = false
+    Base.@lock __pin_lock begin
+        count = get(__pin_count, ptr, 0)
+        count == 0 && error("Cannot unregister untracked pointer $ptr.")
+
+        if count == 1
+            delete!(__pinned_memory, ptr)
+            delete!(__pin_count, ptr)
+            do_unregister = true
+        else
+            __pin_count[ptr] = count - 1
+        end
+    end
 
-    memtype = attributes(ptr).type
-    if memtype == HIP.hipMemoryTypeUnregistered
-        # Already unpinned.
-    elseif memtype == HIP.hipMemoryTypeHost
+    if do_unregister
         HIP.hipHostUnregister(ptr)
-    else
-        error("Cannot unpin pointer with memory type `$memtype`.")
     end
     return
 end
 
+"""
+    is_registered(ptr::Ptr{Cvoid}) -> Bool
+
+Return `true` if `ptr` is tracked as registered (pinned) memory that
+should be freed via `hipHostUnregister` rather than `hipHostFree`.
+"""
+function is_registered(ptr::Ptr{Cvoid})
+    Base.@lock __pin_lock begin
+        haskey(__pin_count, ptr)
+    end
+end
+
+pin(ptr, sz) = register(Ptr{Cvoid}(ptr), sz)
+unpin(ptr) = unregister(Ptr{Cvoid}(ptr))
+
 function is_pinned(ptr)
     ptr == C_NULL && return false
     data = attributes(ptr)
     return data.type == HIP.hipMemoryTypeHost
 end
 
-function attributes(ptr)
-    data = Ref{HIP.hipPointerAttribute_t}()
-    HIP.hipPointerGetAttributes(data, ptr)
-    return data[]
-end
-
 """
 Asynchronous 3D array copy.
 

From 9f84735521a0cc7ab385af28eff37a3f132d171c Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Wed, 25 Feb 2026 00:05:50 +0000
Subject: [PATCH 8/9] Update with proper unregister.

---
 test/core/rocarray_base.jl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/core/rocarray_base.jl b/test/core/rocarray_base.jl
index 2cb2d935d..b188df383 100644
--- a/test/core/rocarray_base.jl
+++ b/test/core/rocarray_base.jl
@@ -142,12 +142,15 @@ end
         @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd1))) == true
         @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true
 
+        # Refcounted: first free decrements the pin count but memory stays pinned.
         AMDGPU.unsafe_free!(xd1)
         @test_throws ArgumentError pointer(xd1)
-        @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == false
+        @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true
 
+        # Second free drops refcount to zero and actually unregisters.
         AMDGPU.unsafe_free!(xd2)
         @test_throws ArgumentError pointer(xd2)
+        @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(x))) == false
     end
 
     @testset "Broadcasting different buffer types" begin

From 4ba5b7b6d2783c7981067b142fdbb5a8c4d03405 Mon Sep 17 00:00:00 2001
From: Muhammad Osama <osama94@gmail.com>
Date: Sat, 28 Feb 2026 14:30:47 -0600
Subject: [PATCH 9/9] Update src/runtime/memory/hip.jl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Ludovic Räss <61313342+luraess@users.noreply.github.com>
---
 src/runtime/memory/hip.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
index 88abc4be8..5645b50ba 100644
--- a/src/runtime/memory/hip.jl
+++ b/src/runtime/memory/hip.jl
@@ -110,7 +110,7 @@ end
 
 const __pin_lock = ReentrantLock()
 const __pinned_memory = Dict{Ptr{Cvoid}, Int}()   # ptr => bytesize
-const __pin_count    = Dict{Ptr{Cvoid}, Int}()     # ptr => refcount
+const __pin_count = Dict{Ptr{Cvoid}, Int}()       # ptr => refcount
 
 struct HostBuffer <: AbstractAMDBuffer
     device::HIPDevice