From b9b58f38e5920937973815db2140bfe68a679830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 27 May 2026 12:23:06 +0200 Subject: [PATCH 1/2] Estimating size of CuPy pinned memory pool --- httomo/runner/dataset_store_backing.py | 39 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/httomo/runner/dataset_store_backing.py b/httomo/runner/dataset_store_backing.py index b31ce08ce..9c88e529f 100644 --- a/httomo/runner/dataset_store_backing.py +++ b/httomo/runner/dataset_store_backing.py @@ -7,7 +7,12 @@ from httomo.data.hdf._utils.reslice import AllGatherFunc, reslice_memory_estimator from httomo.runner.section import Section, determine_section_padding -from httomo.utils import _get_slicing_dim, make_3d_shape_from_shape +from httomo.utils import ( + _get_slicing_dim, + make_3d_shape_from_shape, + gpu_enabled, + xp, +) def calculate_section_input_chunk_shape( @@ -121,24 +126,30 @@ def estimate_section_memory( ) reslice_bytes += ring_algorithm_bytes + reslice_output_bytes - # TODO: The nature of the pinned memory allocations by cupy is currently under - # investigation, so a more precise calculation for its size is not yet known. - # - # It's known that this can grow quite large via allocations exceeding the current - # allocation being bumped to the next power of 2 (ie, a 16GiB allocation that is exceeded - # by 1 byte will have a 32GiB allocation made in addition to the original 16GiB). - # - # Taking half the input data size seems to be in the ballpark for what has been observed - # with larger datasets (ie, an 84GB dataset being processed took ~520GB of memory, and with - # this arbitrary choice of 0.5 as a multiplicative factor gets the estimated value to - # ~514GB) - CUPY_PINNED_CPU_MEMORY = int(0.5 * np.prod(global_shape) * np.dtype(dtype).itemsize) + # The default CuPy pinned memory pool allocates sizes that are the smallest power of two equal or above + # the requested size. 
These chunks are reused for the subsequent h2d copies, however, a larger chunk is + # not going to be used for a small transfer. E.g. a transfer of 1.5 GiB will allocate a new 2 GiB chunk, + # even if a 4 GiB chunk already sits free in the pool. + # Therefore, preparing for the worst case, we accumulate all significantly large potential pool chunks below: + cupy_pinned_cpu_pool_memory = 0 + cupy_transfer_overhead = 0 + if gpu_enabled: + _, total_mem = xp.cuda.Device().mem_info + mem_allocation = 1 << 27 # 128 MiB + while mem_allocation < total_mem: + cupy_pinned_cpu_pool_memory += mem_allocation + mem_allocation *= 2 + + # CuPy copies the host array before uploading to the device. This is at most the size of the device memory + # See https://github.com/cupy/cupy/issues/9813 + cupy_transfer_overhead = total_mem return ( padded_input_chunk_bytes + output_chunk_bytes + reslice_bytes - + CUPY_PINNED_CPU_MEMORY + + cupy_pinned_cpu_pool_memory + + cupy_transfer_overhead ) From 3a774b4fd94ed9bd552c1783cac9520fbbba3530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?= Date: Wed, 27 May 2026 13:39:11 +0200 Subject: [PATCH 2/2] Incorporate data size to pinned memory estimation --- httomo/runner/dataset_store_backing.py | 7 ++++++- httomo/utils.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/httomo/runner/dataset_store_backing.py b/httomo/runner/dataset_store_backing.py index 9c88e529f..cab49f9ff 100644 --- a/httomo/runner/dataset_store_backing.py +++ b/httomo/runner/dataset_store_backing.py @@ -12,6 +12,7 @@ make_3d_shape_from_shape, gpu_enabled, xp, + clp2, ) @@ -135,8 +136,12 @@ def estimate_section_memory( cupy_transfer_overhead = 0 if gpu_enabled: _, total_mem = xp.cuda.Device().mem_info + # if the max size fits to the device memory, we can use it as a lower bound + max_size = np.prod(global_shape) * np.dtype(dtype).itemsize + size_limit = min(clp2(max_size), total_mem) + mem_allocation = 1 << 27 # 128 MiB - 
while mem_allocation < total_mem:
+        while mem_allocation <= size_limit:
             cupy_pinned_cpu_pool_memory += mem_allocation
             mem_allocation *= 2
 
diff --git a/httomo/utils.py b/httomo/utils.py
index 22ec483d5..222936ea6 100644
--- a/httomo/utils.py
+++ b/httomo/utils.py
@@ -389,3 +389,17 @@ def search_max_slices_iterative(
             slices_low = current_slices
 
     return slices_low
+
+
+def clp2(x: int) -> int:
+    """
+    Round up to the next power of two.
+
+    Accepts Python or NumPy integers and always returns a Python int, so
+    large sizes cannot overflow. Values below 1 yield 1.
+    """
+    x = int(x)
+    if x < 1:
+        return 1
+    # bit_length has no width limit, unlike a fixed cascade of shifts
+    return 1 << (x - 1).bit_length()