From 4b13a6c3048bb2a613c95eab2427e55073c282f6 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 31 May 2022 15:08:28 -0600 Subject: [PATCH 1/2] addressing the case when output region for repeat operation is too big --- cunumeric/module.py | 64 ++++++++++++++++++++++++++----- src/cunumeric/index/repeat.cc | 4 ++ src/cunumeric/index/repeat.cu | 2 + src/cunumeric/index/repeat_omp.cc | 4 +- 4 files changed, 64 insertions(+), 10 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 4b9d028ddb..796e8be50c 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -2046,7 +2046,6 @@ def repeat(a, repeats, axis=None): -------- Multiple GPUs, Multiple CPUs """ - # when array is a scalar if np.ndim(a) == 0: if np.ndim(repeats) == 0: @@ -2100,11 +2099,36 @@ def repeat(a, repeats, axis=None): category=UserWarning, ) repeats = np.int64(repeats) - result = array._thunk.repeat( - repeats=repeats, - axis=axis, - scalar_repeats=True, - ) + if repeats < 0: + return ValueError( + "'repeats' should not be negative: {}".format(repeats) + ) + + # check output shape (if it will fit to GPU or not) + out_shape = list(array.shape) + out_shape[axis] *= repeats + out_shape = tuple(out_shape) + size = sum(out_shape) * array.itemsize + # check if size of the output array is less 8GB. 
In this case we can + # use output regions, otherwise we will use statically allocated + # array + if size < 8589934592 / 2: + + result = array._thunk.repeat( + repeats=repeats, axis=axis, scalar_repeats=True + ) + else: + # this implementation is taken from CuPy + result = ndarray(shape=out_shape, dtype=array.dtype) + a_index = [slice(None)] * len(out_shape) + res_index = list(a_index) + offset = 0 + for i in range(a._shape[axis]): + a_index[axis] = slice(i, i + 1) + res_index[axis] = slice(offset, offset + repeats) + result[res_index] = array[a_index] + offset += repeats + return result # repeats is an array else: # repeats should be integer type @@ -2116,9 +2140,31 @@ def repeat(a, repeats, axis=None): repeats = repeats.astype(np.int64) if repeats.shape[0] != array.shape[axis]: return ValueError("incorrect shape of repeats array") - result = array._thunk.repeat( - repeats=repeats._thunk, axis=axis, scalar_repeats=False - ) + + # check output shape (if it will fit to GPU or not) + out_shape = list(array.shape) + n_repeats = sum(repeats) + out_shape[axis] = n_repeats + out_shape = tuple(out_shape) + size = sum(out_shape) * array.itemsize + # check if size of the output array is less 8GB. 
In this case we can + # use output regions, otherwise we will use statically allocated + # array + if size < 8589934592 / 2: + result = array._thunk.repeat( + repeats=repeats._thunk, axis=axis, scalar_repeats=False + ) + else: # this implementation is taken from CuPy + result = ndarray(shape=out_shape, dtype=array.dtype) + a_index = [slice(None)] * len(out_shape) + res_index = list(a_index) + offset = 0 + for i in range(a._shape[axis]): + a_index[axis] = slice(i, i + 1) + res_index[axis] = slice(offset, offset + repeats[i]) + result[res_index] = array[a_index] + offset += repeats[i] + return result return ndarray(shape=result.shape, thunk=result) diff --git a/src/cunumeric/index/repeat.cc b/src/cunumeric/index/repeat.cc index bfd134cb4a..e49ae611bc 100644 --- a/src/cunumeric/index/repeat.cc +++ b/src/cunumeric/index/repeat.cc @@ -69,6 +69,8 @@ struct RepeatImplBody { int64_t out_idx = 0; for (size_t in_idx = 0; in_idx < volume; ++in_idx) { auto p = in_pitches.unflatten(in_idx, in_rect.lo); + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p]; } } @@ -88,6 +90,8 @@ struct RepeatImplBody { for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) { p[axis] = idx; offsets[off_idx++] = sum; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); sum += repeats[p]; } diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu index 09d6c71978..8d0b8c86b3 100644 --- a/src/cunumeric/index/repeat.cu +++ b/src/cunumeric/index/repeat.cu @@ -41,6 +41,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) if (offset < extent) { auto p = origin; p[axis] += offset; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); auto val = repeats[p]; offsets[offset] = val; SumReduction::fold(value, val); diff --git 
a/src/cunumeric/index/repeat_omp.cc b/src/cunumeric/index/repeat_omp.cc index 823a1a16a4..8a1b3e46a7 100644 --- a/src/cunumeric/index/repeat_omp.cc +++ b/src/cunumeric/index/repeat_omp.cc @@ -77,7 +77,9 @@ struct RepeatImplBody { int64_t axis_lo = p[axis]; #pragma omp for schedule(static) private(p) for (int64_t idx = 0; idx < axis_extent; ++idx) { - p[axis] = axis_lo + idx; + p[axis] = axis_lo + idx; + // TODO replace assert with Legate exception handling interface when available + assert(repeats[p] >= 0); auto val = repeats[p]; offsets[idx] = val; local_sums[tid] += val; From 4f02f3910a57a584f1b8bf692f270ad8cee2726f Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 22 Jun 2022 11:47:08 -0600 Subject: [PATCH 2/2] replacing return with raise --- cunumeric/module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 796e8be50c..032921a7be 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -2074,7 +2074,7 @@ def repeat(a, repeats, axis=None): axis = np.int32(axis) if axis >= array.ndim: - return ValueError("axis exceeds dimension of the input array") + raise ValueError("axis exceeds dimension of the input array") # If repeats is on a zero sized axis, then return the array. if array.shape[axis] == 0: @@ -2100,7 +2100,7 @@ def repeat(a, repeats, axis=None): ) repeats = np.int64(repeats) if repeats < 0: - return ValueError( + raise ValueError( "'repeats' should not be negative: {}".format(repeats) ) @@ -2139,7 +2139,7 @@ def repeat(a, repeats, axis=None): ) repeats = repeats.astype(np.int64) if repeats.shape[0] != array.shape[axis]: - return ValueError("incorrect shape of repeats array") + raise ValueError("incorrect shape of repeats array") # check output shape (if it will fit to GPU or not) out_shape = list(array.shape)