diff --git a/core/inc/SOFIE/ROperator_Pad.hxx b/core/inc/SOFIE/ROperator_Pad.hxx index 04365d8..e30b5b6 100644 --- a/core/inc/SOFIE/ROperator_Pad.hxx +++ b/core/inc/SOFIE/ROperator_Pad.hxx @@ -191,9 +191,105 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fOutputShape.empty()) + throw std::runtime_error("SOFIE Pad called to Generate_GPU_Kernel_ALPAKA without being initialized first"); + + const size_t D = fOutputShape.size(); //dimensions + + auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape); + auto outputStrides = UTILITY::ComputeStrideFromShape(fOutputShape); + size_t totalElements = ConvertShapeToLength(fOutputShape); + opName = "op_" + opName; + std::string kname = "PadKernel_" + opName; + + std::stringstream cv; + cv << fConstantValue; + + std::string op; + op = "\n//------ PAD_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fOutputShape[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "bool interior = true;\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string hi = std::to_string(fPads[d].first + static_cast(fInputShape[d])); + op += SP + SP + SP + SP + "interior = interior"; + if (fPads[d].first > 0) + op += " && (out_" + std::to_string(d) + " >= " + std::to_string(fPads[d].first) + "u)"; + op += " && (out_" + std::to_string(d) + " < " + hi + "u);\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "if (interior) {\n"; + op += SP + SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string lo = std::to_string(fPads[d].first); + op += SP + SP + SP + SP + SP + SP + + "(out_" + std::to_string(d) + " - " + lo + "u) * " + + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n"; + } + op += SP + SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + SP + "} else {\n"; + op += SP + SP + SP + SP + SP + "output[elem_idx] = static_cast(" + cv.str() + ");\n"; + op += SP + SP + SP + SP + "}\n"; + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "PadKernel_" + opName; + return SP + kname + " padKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fInputShape.empty() || fOutputShape.empty()) + throw std::runtime_error("SOFIE Pad Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fOutputShape); + std::string kname = "padKernel_" + opName; + + std::stringstream out; + out << "\n//------ PAD_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + + return out.str(); + } + }; }//SOFIE -#endif //SOFIE_ROPERATOR_Swish +#endif //SOFIE_ROPERATOR_Pad diff --git a/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx index fccacbe..3b47472 100644 --- a/test/TestCustomModelsFromONNXForAlpakaCuda.cxx +++ b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -175,6 +175,8 @@ #include "IsNaN_FromONNX_GPU_ALPAKA.hxx" #include "Clip_FromONNX_GPU_ALPAKA.hxx" #include "Not_FromONNX_GPU_ALPAKA.hxx" +#include "Pad_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Pad.ref.hxx" #include "GNN_model_FromONNX_GPU_ALPAKA.hxx" @@ -3161,3 +3163,36 @@ TEST_F(SofieAlpakaTest, Logic_BitwiseNot) for (std::size_t i = 0; i < N; ++i) EXPECT_EQ(res[i], ref[i]) << " index=" << i; } + +TEST_F(SofieAlpakaTest, Pad) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // input shape [1,2,2] -> output shape [2,3,5] (constant pad, before=[1,1,1] after=[0,0,2]) + constexpr Idx inputSize = 4; + constexpr Idx outputSize = 30; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = static_cast(i + 1); // 1,2,3,4 + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + { + SOFIE_Pad::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* ref = Pad_ExpectedOutput::outputs; + for (std::size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res[i] - ref[i]), TOLERANCE) << " index=" << i; +} diff --git a/test/input_models/references/Pad.ref.hxx b/test/input_models/references/Pad.ref.hxx new file mode 100644 index 0000000..cb7d355 --- /dev/null +++ b/test/input_models/references/Pad.ref.hxx @@ -0,0 +1,13 @@ +// Reference output for Pad.onnx ([1,2,2] -> [2,3,5], constant mode, pads before=[1,0,1] after=[0,1,2]) +// Expected computed with numpy.pad +#pragma once +namespace Pad_ExpectedOutput { + static float outputs[30] = { + 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f, + 0.f, 1.f, 2.f, 0.f, 0.f, + 0.f, 3.f, 4.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, 0.f + }; +} // namespace Pad_ExpectedOutput