diff --git a/src/memory.jl b/src/memory.jl index 6da3b2f0f..8a40d1b0e 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -97,6 +97,37 @@ function soft_memory_limit() SOFT_MEMORY_LIMIT[] = soft_limit end + +## allocation statistics + +mutable struct AllocStats + Base.@atomic alloc_count::Int + Base.@atomic alloc_bytes::Int + + Base.@atomic free_count::Int + Base.@atomic free_bytes::Int + + Base.@atomic total_time::Float64 +end + +AllocStats() = AllocStats(0, 0, 0, 0, 0.0) + +Base.copy(s::AllocStats) = + AllocStats(s.alloc_count, s.alloc_bytes, + s.free_count, s.free_bytes, s.total_time) + +Base.:(-)(a::AllocStats, b::AllocStats) = (; + alloc_count = a.alloc_count - b.alloc_count, + alloc_bytes = a.alloc_bytes - b.alloc_bytes, + free_count = a.free_count - b.free_count, + free_bytes = a.free_bytes - b.free_bytes, + total_time = a.total_time - b.total_time) + +const alloc_stats = AllocStats() + + +## memory accounting + mutable struct MemoryStats # Maximum size of the heap. # Estimated during `maybe_collect` stage. 
@@ -134,7 +165,7 @@ function account!(stats::MemoryStats, bytes::Integer)
     Base.@atomic stats.live += bytes
 end
 
-const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", false))
+const EAGER_GC::Ref{Bool} = Ref{Bool}(@load_preference("eager_gc", true))
 
 function eager_gc!(flag::Bool)
     global EAGER_GC[] = flag
@@ -210,6 +241,169 @@ function maybe_collect(; blocking::Bool = false)
     return
 end
 
+
+## pool activity tracking
+
+const POOL_STATUS = AMDGPU.LockedObject(Dict{Int, Ref{Bool}}())
+
+function pool_mark(dev::HIPDevice)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    status = Base.@lock POOL_STATUS.lock get(ps, did, nothing)
+    status === nothing && return nothing
+    return status[]
+end
+
+function pool_mark!(dev::HIPDevice, val::Bool)
+    ps = POOL_STATUS.payload
+    did = HIP.device_id(dev)
+    box = Base.@lock POOL_STATUS.lock get(ps, did, nothing)
+    if box === nothing
+        Base.@lock POOL_STATUS.lock begin
+            box = get!(ps, did) do
+                Ref{Bool}(val)
+            end
+        end
+    end
+    box[] = val
+    return
+end
+
+
+## reclaim hooks
+
+"""
+    reclaim_hooks
+
+A list of callables that are invoked when memory needs to be reclaimed.
+Downstream packages can push functions into this list to free cached resources
+(e.g., workspace buffers, FFT plans, etc.) when GPU memory is scarce.
+"""
+const reclaim_hooks = Any[]
+
+
+## pool cleanup
+
+const _pool_cleanup_task = Ref{Task}()
+
+function pool_cleanup()
+    idle_counters = Dict{Int, Int}()
+    while true
+        try
+            sleep(60)
+        catch ex
+            if ex isa EOFError
+                break
+            else
+                rethrow()
+            end
+        end
+
+        for dev in HIP.devices()
+            did = HIP.device_id(dev)
+            status = pool_mark(dev)
+            status === nothing && continue
+
+            if status
+                idle_counters[did] = 0
+            else
+                idle_counters[did] = get(idle_counters, did, 0) + 1
+            end
+            pool_mark!(dev, false)
+
+            if get(idle_counters, did, 0) == 5
+                HIP.device!(dev) do
+                    reclaim()
+                end
+            end
+        end
+    end
+end
+
+
+## reclaim
+
+"""
+    reclaim([sz=typemax(Int)])
+
+Reclaims `sz` bytes of cached memory. Use this to free GPU memory before
+calling into functionality that does not use the memory pool. Returns the
+number of bytes actually reclaimed.
+"""
+function reclaim(sz::Int=typemax(Int))
+    dev = AMDGPU.device()
+    for hook in reclaim_hooks
+        hook()
+    end
+    HIP.device_synchronize()
+    pool = Mem.pool_create(dev)
+    before = HIP.reserved_memory(pool)
+    HIP.trim(pool)
+    after = HIP.reserved_memory(pool)
+    return Int(before - after)
+end
+
+
+## pool status & queries
+
+"""
+    used_memory()
+
+Returns the amount of memory from the HIP memory pool that is currently
+in use by the application.
+"""
+function used_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.used_memory(pool))
+end
+
+"""
+    cached_memory()
+
+Returns the amount of backing memory currently allocated (reserved) for the
+HIP memory pool.
+"""
+function cached_memory()
+    pool = Mem.pool_create(AMDGPU.device())
+    Int(HIP.reserved_memory(pool))
+end
+
+"""
+    pool_status([io=stdout])
+
+Report to `io` on the memory status of the current GPU and the active memory pool.
+""" +function pool_status(io::IO=stdout) + free_bytes, total_bytes = info() + used_bytes = total_bytes - free_bytes + used_ratio = used_bytes / total_bytes + @printf(io, "Effective GPU memory usage: %.2f%% (%s/%s)\n", + 100*used_ratio, Base.format_bytes(used_bytes), + Base.format_bytes(total_bytes)) + + pool = Mem.pool_create(AMDGPU.device()) + pool_used = HIP.used_memory(pool) + pool_reserved = HIP.reserved_memory(pool) + @printf(io, "Memory pool usage: %s (%s reserved)\n", + Base.format_bytes(pool_used), + Base.format_bytes(pool_reserved)) + + hard_limit = hard_memory_limit() + soft_limit = soft_memory_limit() + if hard_limit != typemax(UInt64) || soft_limit != typemax(UInt64) + print(io, "Memory limit: ") + parts = String[] + if soft_limit != typemax(UInt64) + push!(parts, "soft = $(Base.format_bytes(soft_limit))") + end + if hard_limit != typemax(UInt64) + push!(parts, "hard = $(Base.format_bytes(hard_limit))") + end + println(io, join(parts, ", ")) + end +end + + # TODO handle stream capturing when we support HIP graphs mutable struct Managed{M} const mem::M @@ -275,16 +469,41 @@ function Base.convert(::Type{Mem.AbstractAMDBuffer}, managed::Managed{M}) where end function pool_alloc(::Type{B}, bytesize) where B - s = AMDGPU.stream() - # @info "[pool_alloc] $(Base.format_bytes(bytesize))" - # display(stacktrace()); println() - # println() - # println() - Managed(B(bytesize; stream=s); stream=s) + maybe_collect() + time = Base.@elapsed begin + s = AMDGPU.stream() + managed = Managed(B(bytesize; stream=s); stream=s) + end + + Base.@atomic alloc_stats.alloc_count += 1 + Base.@atomic alloc_stats.alloc_bytes += bytesize + Base.@atomic alloc_stats.total_time += time + + pool_mark!(AMDGPU.device(), true) + + if isinteractive() && !isassigned(_pool_cleanup_task) + _pool_cleanup_task[] = errormonitor(Threads.@spawn pool_cleanup()) + end + + return managed end function pool_free(managed::Managed{M}) where M - _pool_free(managed.mem, managed.stream) + sz = 
Int(sizeof(managed.mem)) + sz == 0 && return + + try + time = Base.@elapsed _pool_free(managed.mem, managed.stream) + Base.@atomic alloc_stats.free_count += 1 + Base.@atomic alloc_stats.free_bytes += sz + Base.@atomic alloc_stats.total_time += time + catch ex + Base.showerror_nostdio(ex, + "WARNING: Error while freeing $(Base.format_bytes(sz)) of GPU memory") + Base.show_backtrace(Core.stdout, catch_backtrace()) + Core.println() + end + return end function _pool_free(buf, stream::HIPStream) diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl index c444d2e34..5645b50ba 100644 --- a/src/runtime/memory/hip.jl +++ b/src/runtime/memory/hip.jl @@ -39,22 +39,23 @@ function HIPBuffer(bytesize; stream::HIP.HIPStream) dev, ctx = stream.device, stream.ctx bytesize == 0 && return HIPBuffer(dev, ctx, C_NULL, 0, true) - AMDGPU.maybe_collect() pool = pool_create(dev) ptr_ref = Ref{Ptr{Cvoid}}() ptr = alloc_or_retry!(isnothing; stream) do try - # Try to allocate. - HIP.hipMallocAsync(ptr_ref, bytesize, stream) - # HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) - + HIP.hipMallocFromPoolAsync(ptr_ref, bytesize, pool, stream) ptr = ptr_ref[] ptr == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) return ptr catch err - # TODO rethrow if not out of memory error - @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace()) + # Only retry for OOM. Any other error (e.g. hipErrorLaunchFailure + # from a prior kernel exception surfacing here) must propagate + # immediately so the caller sees the real cause. + err isa HIP.HIPError || rethrow(err) + err.code in (HIP.hipErrorOutOfMemory, HIP.hipErrorMemoryAllocation) || + rethrow(err) + @debug "hipMallocFromPoolAsync exception. Requested $(Base.format_bytes(bytesize))." 
exception=(err, catch_backtrace()) return nothing end end @@ -105,6 +106,12 @@ function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HI return end +## Host memory pinning state + +const __pin_lock = ReentrantLock() +const __pinned_memory = Dict{Ptr{Cvoid}, Int}() # ptr => bytesize +const __pin_count = Dict{Ptr{Cvoid}, Int}() # ptr => refcount + struct HostBuffer <: AbstractAMDBuffer device::HIPDevice ctx::HIPContext @@ -119,6 +126,10 @@ function HostBuffer() HostBuffer(s.device, s.ctx, C_NULL, C_NULL, 0, true) end +""" +Allocate host-pinned memory via `hipHostMalloc`. +Freed by `hipHostFree` through [`free`](@ref). +""" function HostBuffer( bytesize::Integer, flags = 0; stream::HIP.HIPStream = AMDGPU.stream(), ) @@ -131,11 +142,15 @@ function HostBuffer( HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, bytesize, true) end +""" +Register (pin) an existing host pointer via `hipHostRegister`. +Freed by `hipHostUnregister` through [`free`](@ref). +""" function HostBuffer( ptr::Ptr{Cvoid}, sz::Integer; stream::HIP.HIPStream = AMDGPU.stream(), own::Bool = false, ) - pin(ptr, sz) + register(ptr, sz) dev_ptr = get_device_ptr(ptr) HostBuffer(stream.device, stream.ctx, ptr, dev_ptr, sz, own) end @@ -180,10 +195,11 @@ Base.convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr) function free(buf::HostBuffer; kwargs...) buf.own || return buf.ptr == C_NULL && return - unpin(buf.ptr) - # TODO - # call HIP.hipHostFree(buf) if memory was allocated via hipHostMalloc - # or is unpinning enough? 
+ if is_registered(buf.ptr) + unregister(buf.ptr) + else + HIP.hipHostFree(buf.ptr) + end return end @@ -194,46 +210,95 @@ function get_device_ptr(ptr::Ptr{Cvoid}) ptr_ref[] end -function pin(ptr, sz) - ptr == C_NULL && error("Cannot pin `NULL` pointer.") +function attributes(ptr) + data = Ref{HIP.hipPointerAttribute_t}() + HIP.hipPointerGetAttributes(data, ptr) + return data[] +end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) - elseif memtype == HIP.hipMemoryTypeHost - # Already pinned. - else - error("Cannot pin pointer with memory type `$memtype`.") + +""" + register(ptr::Ptr{Cvoid}, sz::Integer) + +Page-lock host memory at `ptr` via `hipHostRegister` with refcounting. +Subsequent calls on the same pointer increment the refcount; the +actual `hipHostUnregister` only happens when the count drops to zero +via [`unregister`](@ref). +""" +function register(ptr::Ptr{Cvoid}, sz::Integer) + ptr == C_NULL && error("Cannot register `NULL` pointer.") + + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + if count > 0 + __pin_count[ptr] = count + 1 + return + end + + memtype = attributes(ptr).type + if memtype == HIP.hipMemoryTypeUnregistered + HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) + __pinned_memory[ptr] = Int(sz) + __pin_count[ptr] = 1 + elseif memtype == HIP.hipMemoryTypeHost + # Already pinned externally (e.g. hipHostMalloc); nothing to track. + else + error("Cannot register pointer with memory type `$memtype`.") + end end return end -function unpin(ptr) - ptr == C_NULL && error("Cannot unpin `NULL` pointer.") +""" + unregister(ptr::Ptr{Cvoid}) + +Decrement the refcount for `ptr`. When it reaches zero the underlying +`hipHostUnregister` call is issued and tracking state is cleaned up. 
+""" +function unregister(ptr::Ptr{Cvoid}) + ptr == C_NULL && error("Cannot unregister `NULL` pointer.") + + do_unregister = false + Base.@lock __pin_lock begin + count = get(__pin_count, ptr, 0) + count == 0 && error("Cannot unregister untracked pointer $ptr.") + + if count == 1 + delete!(__pinned_memory, ptr) + delete!(__pin_count, ptr) + do_unregister = true + else + __pin_count[ptr] = count - 1 + end + end - memtype = attributes(ptr).type - if memtype == HIP.hipMemoryTypeUnregistered - # Already unpinned. - elseif memtype == HIP.hipMemoryTypeHost + if do_unregister HIP.hipHostUnregister(ptr) - else - error("Cannot unpin pointer with memory type `$memtype`.") end return end +""" + is_registered(ptr::Ptr{Cvoid}) -> Bool + +Return `true` if `ptr` is tracked as registered (pinned) memory that +should be freed via `hipHostUnregister` rather than `hipHostFree`. +""" +function is_registered(ptr::Ptr{Cvoid}) + Base.@lock __pin_lock begin + haskey(__pin_count, ptr) + end +end + +pin(ptr, sz) = register(Ptr{Cvoid}(ptr), sz) +unpin(ptr) = unregister(Ptr{Cvoid}(ptr)) + function is_pinned(ptr) ptr == C_NULL && return false data = attributes(ptr) return data.type == HIP.hipMemoryTypeHost end -function attributes(ptr) - data = Ref{HIP.hipPointerAttribute_t}() - HIP.hipPointerGetAttributes(data, ptr) - return data[] -end - """ Asynchronous 3D array copy. 
diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl index f87d94329..1ce65a7ab 100644 --- a/src/runtime/memory/utils.jl +++ b/src/runtime/memory/utils.jl @@ -16,6 +16,10 @@ function alloc_or_retry!(f, isfailed; stream::HIP.HIPStream) HIP.device_synchronize() elseif phase == 5 HIP.trim(HIP.memory_pool(stream.device)) + elseif phase == 6 + for hook in AMDGPU.reclaim_hooks + hook() + end else break end diff --git a/test/core/rocarray_base.jl b/test/core/rocarray_base.jl index 2cb2d935d..b188df383 100644 --- a/test/core/rocarray_base.jl +++ b/test/core/rocarray_base.jl @@ -142,12 +142,15 @@ end @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd1))) == true @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Refcounted: first free decrements the pin count but memory stays pinned. AMDGPU.unsafe_free!(xd1) @test_throws ArgumentError pointer(xd1) - @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == false + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(xd2))) == true + # Second free drops refcount to zero and actually unregisters. AMDGPU.unsafe_free!(xd2) @test_throws ArgumentError pointer(xd2) + @test AMDGPU.Mem.is_pinned(Ptr{Cvoid}(pointer(x))) == false end @testset "Broadcasting different buffer types" begin