From b9b58f38e5920937973815db2140bfe68a679830 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?=
 <lorinc.serfozo@radwaytech.com>
Date: Wed, 27 May 2026 12:23:06 +0200
Subject: [PATCH 1/2] Estimating size of CuPy pinned memory pool

---
 httomo/runner/dataset_store_backing.py | 39 +++++++++++++++++---------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/httomo/runner/dataset_store_backing.py b/httomo/runner/dataset_store_backing.py
index b31ce08ce..9c88e529f 100644
--- a/httomo/runner/dataset_store_backing.py
+++ b/httomo/runner/dataset_store_backing.py
@@ -7,7 +7,12 @@
 
 from httomo.data.hdf._utils.reslice import AllGatherFunc, reslice_memory_estimator
 from httomo.runner.section import Section, determine_section_padding
-from httomo.utils import _get_slicing_dim, make_3d_shape_from_shape
+from httomo.utils import (
+    _get_slicing_dim,
+    make_3d_shape_from_shape,
+    gpu_enabled,
+    xp,
+)
 
 
 def calculate_section_input_chunk_shape(
@@ -121,24 +126,30 @@ def estimate_section_memory(
         )
         reslice_bytes += ring_algorithm_bytes + reslice_output_bytes
 
-    # TODO: The nature of the pinned memory allocations by cupy is currently under
-    # investigation, so a more precise calculation for its size is not yet known.
-    #
-    # It's known that this can grow quite large via allocations exceeding the current
-    # allocation being bumped to the next power of 2 (ie, a 16GiB allocation that is exceeded
-    # by 1 byte will have a 32GiB allocation made in addition to the original 16GiB).
-    #
-    # Taking half the input data size seems to be in the ballpark for what has been observed
-    # with larger datasets (ie, an 84GB dataset being processed took ~520GB of memory, and with
-    # this arbitrary choice of 0.5 as a multiplicative factor gets the estimated value to
-    # ~514GB)
-    CUPY_PINNED_CPU_MEMORY = int(0.5 * np.prod(global_shape) * np.dtype(dtype).itemsize)
+    # The default CuPy pinned memory pool rounds each request up to the smallest power of two that is equal to
+    # or above the requested size. These chunks are reused for subsequent host-to-device copies; however, a
+    # larger chunk is never reused for a smaller transfer. E.g. a 1.5 GiB transfer allocates a new 2 GiB chunk
+    # even if a 4 GiB chunk already sits free in the pool.
+    # Therefore, to prepare for the worst case, we accumulate all significantly large potential pool chunks below:
+    cupy_pinned_cpu_pool_memory = 0
+    cupy_transfer_overhead = 0
+    if gpu_enabled:
+        _, total_mem = xp.cuda.Device().mem_info
+        mem_allocation = 1 << 27  # 128 MiB
+        while mem_allocation < total_mem:
+            cupy_pinned_cpu_pool_memory += mem_allocation
+            mem_allocation *= 2
+
+        # CuPy makes a copy of the host array before uploading it to the device; that copy is at most the size of
+        # the device memory. See https://github.com/cupy/cupy/issues/9813
+        cupy_transfer_overhead = total_mem
 
     return (
         padded_input_chunk_bytes
         + output_chunk_bytes
         + reslice_bytes
-        + CUPY_PINNED_CPU_MEMORY
+        + cupy_pinned_cpu_pool_memory
+        + cupy_transfer_overhead
     )
 
 

From 3a774b4fd94ed9bd552c1783cac9520fbbba3530 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc=20Serf=C5=91z=C5=91?=
 <lorinc.serfozo@radwaytech.com>
Date: Wed, 27 May 2026 13:39:11 +0200
Subject: [PATCH 2/2] Incorporate data size to pinned memory estimation

---
 httomo/runner/dataset_store_backing.py |  7 ++++++-
 httomo/utils.py                        | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/httomo/runner/dataset_store_backing.py b/httomo/runner/dataset_store_backing.py
index 9c88e529f..cab49f9ff 100644
--- a/httomo/runner/dataset_store_backing.py
+++ b/httomo/runner/dataset_store_backing.py
@@ -12,6 +12,7 @@
     make_3d_shape_from_shape,
     gpu_enabled,
     xp,
+    clp2,
 )
 
 
@@ -135,8 +136,12 @@ def estimate_section_memory(
     cupy_transfer_overhead = 0
     if gpu_enabled:
         _, total_mem = xp.cuda.Device().mem_info
+        # If the whole dataset fits in device memory, its size (rounded up to a power of two) is a tighter limit
+        max_size = np.prod(global_shape) * np.dtype(dtype).itemsize
+        size_limit = min(clp2(max_size), total_mem)
+
         mem_allocation = 1 << 27  # 128 MiB
-        while mem_allocation < total_mem:
+        while mem_allocation <= size_limit:
             cupy_pinned_cpu_pool_memory += mem_allocation
             mem_allocation *= 2
 
diff --git a/httomo/utils.py b/httomo/utils.py
index 22ec483d5..222936ea6 100644
--- a/httomo/utils.py
+++ b/httomo/utils.py
@@ -389,3 +389,17 @@ def search_max_slices_iterative(
             slices_low = current_slices
 
     return slices_low
+
+
+def clp2(x: int) -> int:
+    """
+    Return the smallest power of two that is >= ``x`` (``x`` itself if it is one).
+
+    Works for integers of any size, including NumPy integer scalars (converted
+    to ``int`` first); non-positive inputs return 0.
+    """
+    from operator import index
+    x = index(x)  # accepts NumPy integers, rejects floats
+    if x <= 0:
+        return 0
+    return 1 << (x - 1).bit_length()