diff --git a/src/memory.jl b/src/memory.jl index 6da3b2f0f..8a40d1b0e 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -97,6 +97,37 @@ function soft_memory_limit() SOFT_MEMORY_LIMIT[] = soft_limit end + +## allocation statistics + +mutable struct AllocStats + Base.@atomic alloc_count::Int + Base.@atomic alloc_bytes::Int + + Base.@atomic free_count::Int + Base.@atomic free_bytes::Int + + Base.@atomic total_time::Float64 +end + +AllocStats() = AllocStats(0, 0, 0, 0, 0.0) + +Base.copy(s::AllocStats) = + AllocStats(s.alloc_count, s.alloc_bytes, + s.free_count, s.free_bytes, s.total_time) + +Base.:(-)(a::AllocStats, b::AllocStats) = (; + alloc_count = a.alloc_count - b.alloc_count, + alloc_bytes = a.alloc_bytes - b.alloc_bytes, + free_count = a.free_count - b.free_count, + free_bytes = a.free_bytes - b.free_bytes, + total_time = a.total_time - b.total_time) + +const alloc_stats = AllocStats() + + +## memory accounting + mutable struct MemoryStats # Maximum size of the heap. # Estimated during `maybe_collect` stage. 
@@ -134,7 +165,7 @@ function account!(stats::MemoryStats, bytes::Integer)
     Base.@atomic stats.live += bytes
 end
 
-const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", false))
+const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", true))
 
 function eager_gc!(flag::Bool)
     global EAGER_GC[] = flag
@@ -210,6 +241,169 @@ function maybe_collect(; blocking::Bool = false)
     return
 end
 
+
+## pool activity tracking
+
+const POOL_STATUS = AMDGPU.LockedObject(Dict{Int, Ref{Bool}}())
+
+function pool_mark(dev::HIPDevice)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    status = Base.@lock POOL_STATUS.lock get(ps, did, nothing)
+    status === nothing && return nothing
+    return status[]
+end
+
+function pool_mark!(dev::HIPDevice, val::Bool)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    box = Base.@lock POOL_STATUS.lock get(ps, did, nothing)
+    if box === nothing
+        Base.@lock POOL_STATUS.lock begin
+            box = get!(ps, did) do
+                Ref{Bool}(val)
+            end
+        end
+    end
+    box[] = val
+    return
+end
+
+
+## reclaim hooks
+
+"""
+    reclaim_hooks
+
+A list of callables that are invoked when memory needs to be reclaimed.
+Downstream packages can push functions into this list to free cached resources
+(e.g., workspace buffers, FFT plans, etc.) when GPU memory is scarce.
+"""
+const reclaim_hooks = Any[]
+
+
+## pool cleanup
+
+const _pool_cleanup_task = Ref{Task}()
+
+function pool_cleanup()
+    idle_counters = Dict{Int, Int}()
+    while true
+        try
+            sleep(60)
+        catch ex
+            if ex isa EOFError
+                break
+            else
+                rethrow()
+            end
+        end
+
+        for dev in HIP.devices()
+            did = HIP.device_id(dev)
+            status = pool_mark(dev)
+            status === nothing && continue
+
+            if status
+                idle_counters[did] = 0
+            else
+                idle_counters[did] = get(idle_counters, did, 0) + 1
+            end
+            pool_mark!(dev, false)
+
+            if get(idle_counters, did, 0) == 5
+                HIP.device!(dev) do
+                    reclaim()
+                end
+            end
+        end
+    end
+end
+
+
+## reclaim
+
+"""
+    reclaim([sz=typemax(Int)])
+
+Reclaims `sz` bytes of cached memory. Use this to free GPU memory before
+calling into functionality that does not use the memory pool. Returns the
+number of bytes actually reclaimed.
+"""
+function reclaim(sz::Int=typemax(Int))
+    dev = AMDGPU.device()
+    for hook in reclaim_hooks
+        hook()
+    end
+    HIP.device_synchronize()
+    pool = Mem.pool_create(dev)
+    before = HIP.reserved_memory(pool)
+    HIP.trim(pool)
+    after = HIP.reserved_memory(pool)
+    return Int(before - after)
+end
+
+
+## pool status & queries
+
+"""
+    used_memory()
+
+Returns the amount of memory from the HIP memory pool that is currently
+in use by the application.
+"""
+function used_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.used_memory(pool))
+end
+
+"""
+    cached_memory()
+
+Returns the amount of backing memory currently allocated (reserved) for the
+HIP memory pool.
+"""
+function cached_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.reserved_memory(pool))
+end
+
+"""
+    pool_status([io=stdout])
+
+Report to `io` on the memory status of the current GPU and the active memory pool.
+""" +function pool_status(io::IO=stdout) + free_bytes, total_bytes = info() + used_bytes = total_bytes - free_bytes + used_ratio = used_bytes / total_bytes + @printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n", + 100*used_ratio, Base.format_bytes(used_bytes), + Base.format_bytes(total_bytes)) + + pool = Mem.pool_create(AMDGPU.device()) + pool_used = HIP.used_memory(pool) + pool_reserved = HIP.reserved_memory(pool) + @printf(io, "Memory pool usage: %s (%s reserved)\n", + Base.format_bytes(pool_used), + Base.format_bytes(pool_reserved)) + + hard_limit = hard_memory_limit() + soft_limit = soft_memory_limit() + if hard_limit != typemax(UInt64) || soft_limit != typemax(UInt64) + print(io, "Memory limit: ") + parts = String[] + if soft_limit != typemax(UInt64) + push!(parts, "soft = $(Base.format_bytes(soft_limit))") + end + if hard_limit != typemax(UInt64) + push!(parts, "hard = $(Base.format_bytes(hard_limit))") + end + println(io, join(parts, ", ")) + end +end + + # TODO handle stream capturing when we support HIP graphs mutable struct Managed{M} const mem::M @@ -275,16 +469,41 @@ function Base.convert(::Type{Mem.AbstractAMDBuffer}, managed::Managed{M}) where end function pool_alloc(::Type{B}, bytesize) where B - s = AMDGPU.stream() - # @info "[pool_alloc] $(Base.format_bytes(bytesize))" - # display(stacktrace()); println() - # println() - # println() - Managed(B(bytesize; stream=s); stream=s) + maybe_collect() + time = Base.@elapsed begin + s = AMDGPU.stream() + managed = Managed(B(bytesize; stream=s); stream=s) + end + + Base.@atomic alloc_stats.alloc_count += 1 + Base.@atomic alloc_stats.alloc_bytes += bytesize + Base.@atomic alloc_stats.total_time += time + + pool_mark!(AMDGPU.device(), true) + + if isinteractive() && !isassigned(_pool_cleanup_task) + _pool_cleanup_task[] = errormonitor(Threads.@spawn pool_cleanup()) + end + + return managed end function pool_free(managed::Managed{M}) where M - _pool_free(managed.mem, managed.stream) + sz = 
Int(sizeof(managed.mem)) + sz == 0 && return + + try + time = Base.@elapsed _pool_free(managed.mem, managed.stream) + Base.@atomic alloc_stats.free_count += 1 + Base.@atomic alloc_stats.free_bytes += sz + Base.@atomic alloc_stats.total_time += time + catch ex + Base.showerror_nostdio(ex, + "WARNING: Error while freeing $(Base.format_bytes(sz)) of GPU memory") + Base.show_backtrace(Core.stdout, catch_backtrace()) + Core.println() + end + return end function _pool_free(buf, stream::HIPStream) diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index c444d2e34..5645b50ba 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -39,22 +39,23 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream) dev, ctx = stream.device, stream.ctx bytesize == 0 && return HIPBuffer(dev, ctx, C_NULL, 0, true) - AMDGPU.maybe_collect() pool = pool_create(dev) ptr_ref = Ref{Ptr{Cvoid}}() ptr = alloc_or_retry!(isnothing; stream) do try - # Try to allocate. - HIP.hipMallocAsync(ptr_ref, bytesize, stream) - # HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) - + HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) ptr = ptr_ref[] ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) return ptr catch err - # TODO rethrow if not out of memory error - @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace()) + # Only retry for OOM. Any other error (e.g. hipErrorLaunchFailure + # from a prior kernel exception surfacing here) must propagate + # immediately so the caller sees the real cause. + err isa HIP.HIPError || rethrow(err) + err.code in (HIP.hipErrorOutOfMemory, HIP.hipErrorMemoryAllocation) || + rethrow(err) + @debug "hipMallocFromPoolAsync exception. Requested $(Base.format_bytes(bytesize))." 
exception=(err, catch_backtrace()) return nothing end end @@ -105,6 +106,12 @@ function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HI return end +## Host memory pinning state + +const __pin_lock = ReentrantLock() +const __pinned_memory = Dict{Ptr{Cvoid}, Int}() # ptr => bytesize +const __pin_count = Dict{Ptr{Cvoid}, Int}() # ptr => refcount + struct HostBuffer <: AbstractAMDBuffer device::HIPDevice ctx::HIPContext @@ -119,6 +126,10 @@ function HostBuffer() HostBuffer(s.device, s.ctx, C_NULL, C_NULL, 0, true) end +""" +Allocate host-pinned memory via `hipHostMalloc`. +Freed by `hipHostFree` through [`free`](@ref). +""" function HostBuffer( bytesize::Integer, flags = 0; stream::HIP.HIPStream = AMDGPU.stream(), ) @@ -131,11 +142,15 @@ function HostBuffer( HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, bytesize, true) end +""" +Register (pin) an existing host pointer via `hipHostRegister`. +Freed by `hipHostUnregister` through [`free`](@ref). +""" function HostBuffer( ptr::Ptr{Cvoid}, sz::Integer; stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false, ) - pin(ptr, sz) + register(ptr, sz) dev_ptr = get_device_ptr(ptr) HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, sz, own) end @@ -180,10 +195,11 @@ Base.convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr) function free(buf::HostBuffer; kwargs...) buf.own || return buf.ptr == C_NULL && return - unpin(buf.ptr) - # TODO - # call HIP.hipHostFree(buf) if memory was allocated via hipHostMalloc - # or is unpinning enough? 
+ if is_registered(buf.ptr) + unregister(buf.ptr) + else + HIP.hipHostFree(buf.ptr) + end return end @@ -194,46 +210,95 @@ function get_device_ptr(ptr::Ptr{Cvoid}) ptr_ref[] end -function pin(ptr, sz) - ptr == C_NULL && error("Cannot pin `NULL` pointer.") +function attributes(ptr) + data = Ref{HIP.hipPointerAttribute_t}() + HIP.hipPointerGetAttributes(data, ptr) + return data[] +end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) - elseif memtype == HIP.hipMemoryTypeHost - # Already pinned. - else - error("Cannot pin pointer with memory type `$memtype`.") + +""" + register(ptr::Ptr{Cvoid}, sz::Integer) + +Page-lock host memory at `ptr` via `hipHostRegister` with refcounting. +Subsequent calls on the same pointer increment the refcount; the +actual `hipHostUnregister` only happens when the count drops to zero +via [`unregister`](@ref). +""" +function register(ptr::Ptr{Cvoid}, sz::Integer) + ptr == C_NULL && error("Cannot register `NULL` pointer.") + + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + if count > 0 + __pin_count[ptr] = count + 1 + return + end + + memtype = attributes(ptr).type + if memtype == HIP.hipMemoryTypeUnregistered + HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) + __pinned_memory[ptr] = Int(sz) + __pin_count[ptr] = 1 + elseif memtype == HIP.hipMemoryTypeHost + # Already pinned externally (e.g. hipHostMalloc); nothing to track. + else + error("Cannot register pointer with memory type `$memtype`.") + end end return end -function unpin(ptr) - ptr == C_NULL && error("Cannot unpin `NULL` pointer.") +""" + unregister(ptr::Ptr{Cvoid}) + +Decrement the refcount for `ptr`. When it reaches zero the underlying +`hipHostUnregister` call is issued and tracking state is cleaned up. 
+""" +function unregister(ptr::Ptr{Cvoid}) + ptr == C_NULL && error("Cannot unregister `NULL` pointer.") + + do_unregister = false + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + count == 0 && error("Cannot unregister untracked pointer $ptr.") + + if count == 1 + delete!(__pinned_memory, ptr) + delete!(__pin_count, ptr) + do_unregister = true + else + __pin_count[ptr] = count - 1 + end + end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - # Already unpinned. - elseif memtype == HIP.hipMemoryTypeHost + if do_unregister HIP.hipHostUnregister(ptr) - else - error("Cannot unpin pointer with memory type `$memtype`.") end return end +""" + is_registered(ptr::Ptr{Cvoid}) -> Bool + +Return `true` if `ptr` is tracked as registered (pinned) memory that +should be freed via `hipHostUnregister` rather than `hipHostFree`. +""" +function is_registered(ptr::Ptr{Cvoid}) + Base.@lock __pin_lock begin + haskey(__pin_count, ptr) + end +end + +pin(ptr, sz) = register(Ptr{Cvoid}(ptr), sz) +unpin(ptr) = unregister(Ptr{Cvoid}(ptr)) + function is_pinned(ptr) ptr == C_NULL && return false data = attributes(ptr) return data.type == HIP.hipMemoryTypeHost end -function attributes(ptr) - data = Ref{HIP.hipPointerAttribute_t}() - HIP.hipPointerGetAttributes(data, ptr) - return data[] -end - """ Asynchronous 3D array copy. 
diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl index f87d94329..1ce65a7ab 100644 --- a/src/runtime/memory/utils.jl +++ b/src/runtime/memory/utils.jl @@ -16,6 +16,10 @@ function alloc_or_retry!(f, isfailed; stream::HIP.HIPStream) HIP.device_synchronize() elseif phase == 5 HIP.trim(HIP.memory_pool(stream.device)) + elseif phase == 6 + for hook in AMDGPU.reclaim_hooks + hook() + end else break end diff --git a/test/core/rocarray_base.jl b/test/core/rocarray_base.jl index 2cb2d935d..b188df383 100644 --- a/test/core/rocarray_base.jl +++ b/test/core/rocarray_base.jl @@ -142,12 +142,15 @@ end @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd1))) == true @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Refcounted: first free decrements the pin count but memory stays pinned. AMDGPU.unsafe_free!(xd1) @test_throws ArgumentError pointer(xd1) - @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == false + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Second free drops refcount to zero and actually unregisters. AMDGPU.unsafe_free!(xd2) @test_throws ArgumentError pointer(xd2) + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(x))) == false end @testset "Broadcasting different buffer types" begin