diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..47452ee --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,158 @@ +name: Benchmark + +on: + pull_request: + branches: [main] + workflow_dispatch: + inputs: + warmup: + description: "Warmup iterations" + default: "5" + iterations: + description: "Timed iterations" + default: "100" + +concurrency: + group: benchmark-${{ github.ref }} + cancel-in-progress: true + +env: + LCG_VIEW: /cvmfs/sft.cern.ch/lcg/views/LCG_106a/x86_64-el9-gcc13-opt + CUDA_CVMFS: /cvmfs/sft.cern.ch/lcg/contrib/cuda/12.4/x86_64-el9 + CUDA_ARCH: "90" + BUILD_TYPE: Release + BENCH_WARMUP: ${{ github.event.inputs.warmup || '5' }} + BENCH_ITERS: ${{ github.event.inputs.iterations || '100' }} + DEPS_CACHE: /tmp/sofie-cmake-deps + +jobs: + benchmark: + name: Benchmark Comparison (H100) + runs-on: ml4ep-h100 + container: registry.cern.ch/ngt/lxplus-like:9 + timeout-minutes: 120 + + steps: + - name: GPU check + run: nvidia-smi + + - name: Setup build environment + run: | + set -euo pipefail + + if [ -f "${{ env.LCG_VIEW }}/setup.sh" ]; then + set +u; source "${{ env.LCG_VIEW }}/setup.sh"; set -u + else + echo "LCG view not found — installing from dnf" + dnf install -y epel-release + dnf install -y cmake ninja-build gcc-c++ python3 git \ + protobuf-devel openblas-devel + fi + + if [ -x "${{ env.CUDA_CVMFS }}/bin/nvcc" ]; then + echo "${{ env.CUDA_CVMFS }}/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=${{ env.CUDA_CVMFS }}" >> "$GITHUB_ENV" + elif [ -x /usr/local/cuda/bin/nvcc ]; then + echo "/usr/local/cuda/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV" + else + dnf config-manager --add-repo \ + https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + dnf install -y cuda-compiler-12-4 cuda-cudart-devel-12-4 cuda-libraries-devel-12-4 + echo "/usr/local/cuda-12.4/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=/usr/local/cuda-12.4" >> 
"$GITHUB_ENV" + fi + + echo "PATH=$PATH" >> "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV" + echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}" >> "$GITHUB_ENV" + + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + path: sofie-pr + + - name: Checkout main branch + if: github.event_name == 'pull_request' + uses: actions/checkout@v4 + with: + ref: main + path: sofie-main + + - name: Cache FetchContent dependencies + uses: actions/cache@v4 + with: + path: ${{ env.DEPS_CACHE }} + key: cmake-deps-bench-${{ hashFiles('sofie-pr/benchmark/CMakeLists.txt') }} + restore-keys: cmake-deps-bench- + + - name: Configure PR build + run: | + cmake -B sofie-pr/build -S sofie-pr \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DSOFIE_WITH_ROOT=OFF \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_BACKEND=CUDA \ + "-DSOFIE_BENCHMARK_CUDA_ARCH=${{ env.CUDA_ARCH }}" \ + "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \ + "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}" + + - name: Build PR benchmark + run: cmake --build sofie-pr/build --target sofie_benchmark -j$(nproc) + + - name: Run PR benchmark + working-directory: sofie-pr/build/benchmark + run: | # pipefail: otherwise tee's exit status masks a failing benchmark; BENCH_* are read from the environment, not spliced into the script + set -o pipefail; ./sofie_benchmark \ + -w "$BENCH_WARMUP" \ + -n "$BENCH_ITERS" \ + | tee benchmark_pr.txt + + - name: Configure main build + if: github.event_name == 'pull_request' + run: | + cmake -B sofie-main/build -S sofie-main \ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DSOFIE_WITH_ROOT=OFF \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_BACKEND=CUDA \ + "-DSOFIE_BENCHMARK_CUDA_ARCH=${{ env.CUDA_ARCH }}" \ + "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \ + "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}" + + - name: Build main benchmark + if: github.event_name == 'pull_request' + run: cmake --build sofie-main/build --target sofie_benchmark -j$(nproc) + + - name: Run main benchmark + if: github.event_name == 'pull_request' + working-directory: sofie-main/build/benchmark + run: | # pipefail: otherwise tee's exit status masks a failing benchmark; BENCH_* are read from the environment, not spliced into the script + 
set -o pipefail; ./sofie_benchmark \ + -w "$BENCH_WARMUP" \ + -n "$BENCH_ITERS" \ + | tee benchmark_main.txt + + - name: Summarise PR vs main + if: github.event_name == 'pull_request' + run: | + echo "### Benchmark comparison: PR vs main" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + echo "── PR ──────────────────────────────────────────────────────────────" \ + >> "$GITHUB_STEP_SUMMARY" + cat sofie-pr/build/benchmark/benchmark_pr.txt >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "── main ────────────────────────────────────────────────────────────" \ + >> "$GITHUB_STEP_SUMMARY" + cat sofie-main/build/benchmark/benchmark_main.txt >> "$GITHUB_STEP_SUMMARY" + echo '```' >> "$GITHUB_STEP_SUMMARY" + + - name: Upload benchmark results + if: always() + uses: actions/upload-artifact@v4 + with: + name: benchmark-results-${{ github.run_id }} + path: | + sofie-pr/build/benchmark/benchmark_pr.txt + sofie-main/build/benchmark/benchmark_main.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..475b782 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,113 @@ +name: Unit Tests + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +concurrency: + group: tests-${{ github.ref }} + cancel-in-progress: true + +env: + LCG_VIEW: /cvmfs/sft.cern.ch/lcg/views/LCG_106a/x86_64-el9-gcc13-opt + CUDA_CVMFS: /cvmfs/sft.cern.ch/lcg/contrib/cuda/12.4/x86_64-el9 + CUDA_ARCH: "90" + BUILD_TYPE: Release + DEPS_CACHE: /tmp/sofie-cmake-deps + +jobs: + gpu-tests: + name: GPU Unit Tests (NVIDIA/H100) + runs-on: ml4ep-h100 + container: registry.cern.ch/ngt/lxplus-like:9 + timeout-minutes: 60 + + steps: + - name: GPU check + run: nvidia-smi + + - name: Setup build environment + run: | + set -euo pipefail + + # LCG view (cmake, gcc-13, protobuf, openblas) + if [ -f "${{ 
env.LCG_VIEW }}/setup.sh"; set -u + else + echo "LCG view not found — installing from dnf" + dnf install -y epel-release + dnf install -y cmake ninja-build gcc-c++ python3 git \ + protobuf-devel openblas-devel + fi + + # CUDA toolkit (nvcc + headers) + if [ -x "${{ env.CUDA_CVMFS }}/bin/nvcc" ]; then + echo "${{ env.CUDA_CVMFS }}/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=${{ env.CUDA_CVMFS }}" >> "$GITHUB_ENV" + elif [ -x /usr/local/cuda/bin/nvcc ]; then + echo "/usr/local/cuda/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV" + else + echo "nvcc not found — installing CUDA toolkit from NVIDIA repo" + dnf config-manager --add-repo \ + https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + dnf install -y cuda-compiler-12-4 cuda-cudart-devel-12-4 cuda-libraries-devel-12-4 + echo "/usr/local/cuda-12.4/bin" >> "$GITHUB_PATH" + echo "CUDA_HOME=/usr/local/cuda-12.4" >> "$GITHUB_ENV" + fi + + # GTest + dnf install -y gtest-devel 2>/dev/null || \ + dnf install -y googletest-devel 2>/dev/null || ( + cd /tmp + git clone --depth 1 -b v1.14.0 https://github.com/google/googletest.git + cmake -B gtest-build -S googletest \ + -DCMAKE_INSTALL_PREFIX=/usr/local -DBUILD_SHARED_LIBS=ON + cmake --build gtest-build -j$(nproc) + cmake --install gtest-build + ) + + echo "PATH=$PATH" >> "$GITHUB_ENV" + echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV" + echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}" >> "$GITHUB_ENV" + + - name: Checkout + uses: actions/checkout@v4 + + - name: Cache FetchContent dependencies + uses: actions/cache@v4 + with: + path: ${{ env.DEPS_CACHE }} + key: cmake-deps-tests-${{ hashFiles('test/CMakeLists.txt') }} + restore-keys: cmake-deps-tests- + + - name: Configure + run: | + cmake -B build -S . 
\ + -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \ + -DSOFIE_WITH_ROOT=OFF \ + -Dtesting=ON \ + -DENABLE_ALPAKA_TESTS=ON \ + -DALPAKA_BACKEND=cuda \ + "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \ + "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}" + + - name: Build tests + run: | + cmake --build build \ + --target TestCustomModelsFromONNXForAlpakaCuda \ + -j$(nproc) + + - name: Run tests + working-directory: build + run: ctest --output-on-failure -j1 + + - name: Upload test log + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-log-${{ github.run_id }} + path: build/Testing/Temporary/LastTest.log diff --git a/CMakeLists.txt b/CMakeLists.txt index c9bd226..16f4782 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,9 +4,16 @@ project(Sofie DESCRIPTION "SOFIE" LANGUAGES CXX) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + find_package(BLAS) if(NOT BLAS_FOUND) - message(WARNING "BLAS not found: TMVA-SOFIE will not be fully tested") + message(WARNING "BLAS not found: sofie will not be fully tested") endif() message(STATUS "Looking for Protobuf") @@ -17,49 +24,101 @@ if(NOT Protobuf_FOUND) endif() if(NOT Protobuf_FOUND) if(fail-on-missing) - message(FATAL_ERROR "Protobuf libraries not found and they are required (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf libraries not found and they are required (sofie option enabled)") else() - message(STATUS "Protobuf not found. Switching off tmva-sofie option") + message(STATUS "Protobuf not found. 
Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() else() if(Protobuf_VERSION LESS 3.0) if(fail-on-missing) - message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (sofie option enabled)") else() - message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off tmva-sofie option") + message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() else() if(NOT TARGET protobuf::protoc) if(fail-on-missing) - message(FATAL_ERROR "Protobuf compiler not found (tmva-sofie option enabled)") + message(FATAL_ERROR "Protobuf compiler not found (sofie option enabled)") else() - message(STATUS "Protobuf compiler not found. Switching off tmva-sofie option") + message(STATUS "Protobuf compiler not found. 
Switching off sofie option") message(FATAL_ERROR "SOFIE cannot be installed without Protobuf") endif() endif() endif() endif() -find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree) -include(${ROOT_USE_FILE}) +option(SOFIE_WITH_ROOT "Enable ROOT support (required for .root weight files and ROOT serialization)" OFF) + +if(SOFIE_WITH_ROOT) + find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree) + if(ROOT_FOUND) + include(${ROOT_USE_FILE}) + message(STATUS "ROOT found: enabling ROOT support in SOFIE") + else() + message(FATAL_ERROR "SOFIE_WITH_ROOT is ON but ROOT was not found") + endif() +else() + message(STATUS "Building SOFIE without ROOT support (SOFIE_WITH_ROOT=OFF)") +endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_INSTALL_BINDIR "bin" CACHE PATH "user executables (bin)") -set(CMAKE_INSTALL_INCLUDEDIR "include" CACHE PATH "header files") -set(CMAKE_INSTALL_LIBDIR "lib" CACHE PATH "libraries") if(ccache) set(CMAKE_C_COMPILER_LAUNCHER ccache) set(CMAKE_CXX_COMPILER_LAUNCHER ccache) endif() +option(testing "Build and run tests" OFF) if(testing) - find_package(GTest REQUIRED) + find_package(GTest REQUIRED) enable_testing() endif() -include(cmake/modules/RoottestMacros.cmake) +option(SOFIE_BENCHMARK "Build the SOFIE CUDA benchmark toolkit" OFF) + +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + include(cmake/modules/RoottestMacros.cmake) +else() + include(cmake/modules/SofieTestMacros.cmake) +endif() + +add_subdirectory(utils) +add_subdirectory(core) +add_subdirectory(parsers) + +if(testing) + add_subdirectory(test) +endif() + +if(SOFIE_BENCHMARK) + add_subdirectory(benchmark) +endif() + +# ── Install cmake package config files ────────────────────────────────────── + +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/SOFIEConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) + +write_basic_package_version_file( + 
${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion +) + +install( + EXPORT SOFIETargets + FILE SOFIETargets.cmake + NAMESPACE SOFIE:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) -add_subdirectory(src) +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE +) diff --git a/README.md b/README.md index 97902f8..5c4042b 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,259 @@ # SOFIE -This is an experimental standalone version of SOFIE - a tool for Fast ML Inference within ROOT - the scientific data analysis framework. -Since SOFIE is a part of ROOT and therefore needs to be built altogether, it takes quite a long time in its development and testing. This standalone version allows you to just build SOFIE with the pre-built binaries of ROOT- making the entire development process way faster. +This is an experimental standalone version of **SOFIE** — a tool for Fast ML Inference +within [ROOT](https://root.cern), the scientific data analysis framework. +This standalone is especially developed for implementing and evaluating inference on +**heterogeneous architectures** (CUDA GPUs, AMD GPUs via HIP/ROCm, CPUs) using the +[Alpaka](https://github.com/alpaka-group/alpaka) portability layer. + +--- ## Installation -1. Getting a ROOT binary. -Download a pre-built binary of ROOT based on your architecture from [here](https://root.cern/install/). +### Prerequisites + +- CMake ≥ 3.16 +- C++20-capable compiler (GCC ≥ 11, Clang ≥ 14) +- [Protocol Buffers](https://protobuf.dev/) ≥ 3.0 (for ONNX model parsing) +- *(Optional)* ROOT ≥ 6.28 — only needed if using `.root` weight files or ROOT-based + serialization (`-DSOFIE_WITH_ROOT=ON`) +- *(Optional for GPU testing/benchmarking)* CUDA Toolkit ≥ 11.8 + +### 1. Clone and build -2. 
Build standalone SOFIE ```bash git clone https://github.com/sanjibansg/SOFIE.git cd SOFIE mkdir build && cd build -cmake -Dtesting=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. -cmake --build . --target install -j10 +cmake -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . --target install -j$(nproc) +``` + +To disable ROOT (build without ROOT dependency): + +```bash +cmake -DSOFIE_WITH_ROOT=OFF -DCMAKE_INSTALL_PREFIX=../install .. ``` -The commands above should build the SOFIE standalone. To include it within the ROOT binary and run altogether, we need to source the shared libraries for `SOFIE_core` and `SOFIE_parsers`. Within the SOFIE repository we may call + +### 2. Source the environment (ROOT-integrated workflow only) + +If you need the SOFIE libraries to be accessible from within a ROOT session: + +```bash +# Example — adjust the ROOT tarball name to match your download +source root_v6.36.02.Linux-ubuntu24.04-x86_64-gcc13.3/root/bin/thisroot.sh +source setup.sh # adds SOFIE_core and SOFIE_parsers to LD_LIBRARY_PATH +``` + +This step is **not required** when building without ROOT +(`-DSOFIE_WITH_ROOT=OFF`). + +--- + +## Testing + +Unit and integration tests are enabled with `-Dtesting=ON` and require +[GoogleTest](https://github.com/google/googletest). + +### CPU / default tests + +```bash +cmake -Dtesting=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . -j$(nproc) +ctest --output-on-failure +``` + +### GPU tests (Alpaka/CUDA) + +Alpaka-based GPU tests compile SOFIE-generated inference code as CUDA and verify +correctness against reference outputs. They require the CUDA Toolkit and a +compatible NVIDIA GPU. + +```bash +cmake -Dtesting=ON \ + -DENABLE_ALPAKA_TESTS=ON \ + -DALPAKA_BACKEND=cuda \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +cmake --build . 
-j$(nproc) +ctest --output-on-failure +``` + +| CMake flag | Default | Description | +|---|---|---| +| `-Dtesting=ON` | `OFF` | Enable the test suite | +| `-DENABLE_ALPAKA_TESTS=ON` | `OFF` | Enable Alpaka GPU tests | +| `-DALPAKA_BACKEND=<backend>` | `cuda` | Alpaka backend: `cuda`, `hip`, `cpu`, `sycl` | + +The test executable is `TestCustomModelsFromONNXForAlpakaCuda`. ONNX model files +used as test inputs are located in `core/test/input_models/`. Models with symbolic +(dynamic) input dimensions are specialised by the emitter before testing. + +--- + +## Benchmarking + +The benchmark toolkit (`benchmark/`) measures **inference latency and throughput** for +ONNX models compiled by SOFIE and executed via Alpaka. It supports an optional +side-by-side comparison with **ONNX Runtime GPU**. + +### Supported backends + +| Backend | CMake value | Status | +|---------|-------------|--------| +| NVIDIA CUDA | `CUDA` (default) | Supported | +| AMD HIP/ROCm | `HIP` | Planned | + +### Quick start + +```bash +# Place .onnx models in benchmark/models/ first; set the CUDA arch for your GPU (e.g. 86 for RTX 30xx, 80 for A100) +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_BACKEND=CUDA \ + -DSOFIE_BENCHMARK_CUDA_ARCH=86 \ + /path/to/SOFIE +cmake --build build --target sofie_benchmark -j$(nproc) +cd build/benchmark && ./sofie_benchmark +``` + +For a full reference of benchmark CMake flags, runtime options, the large-input +cluster benchmark, and instructions for adding new backends, see +[benchmark/README.md](benchmark/README.md). + +### Profiling + +Add `-DSOFIE_BENCHMARK_PROFILE=ON` to enable **per-operator GPU timing** and a +**CPU/GPU memory breakdown** printed after each model's throughput line. 
```bash -source setup.sh +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_PROFILE=ON \ + /path/to/SOFIE +cmake --build build --target sofie_benchmark -j$(nproc) +cd build/benchmark && ./sofie_benchmark +``` + +> Profiling inserts `alpaka::wait(queue)` after each operator, which serialises +> GPU execution. Use a non-profile build for peak-throughput numbers. + +Profiling can also be enabled on a per-model basis outside the benchmark by +passing `Options::kProfile` at code-generation time (see +[Profiling in user code](#profiling-in-user-code) below). + +--- + +## GPU Architecture Support + +SOFIE generates Alpaka-based inference code that is portable across GPU +architectures: + +- **NVIDIA CUDA** — select the SM architecture with + `-DSOFIE_BENCHMARK_CUDA_ARCH=` (e.g. `75` for Turing, `86` for Ampere, + `90` for Hopper). +- **AMD HIP/ROCm** — the Alpaka backend tag (`alpaka::TagGpuHipRt`) and the + `SOFIE_BACKEND_HIP` compile-time define are already wired in + `benchmark/src/BenchmarkBackend.hxx`; full build-system integration is in + progress. +- **CPU** — a serial CPU Alpaka backend (`alpaka::TagCpuSerial`) is available as a + fallback for debugging and portability testing. + +--- +## Project Structure + +``` +SOFIE/ +├── core/ # Core SOFIE library (RModel, operators, code generators) +│ └── test/ # Unit/integration tests +├── parsers/ # ONNX → RModel parser +├── benchmark/ # Latency / throughput benchmark toolkit +│ ├── models/ # Place .onnx benchmark models here +│ └── src/ # CMake-configured source templates +├── utils/ # Utility targets +└── cmake/ # CMake modules and config templates ``` -Now ROOT should also access the SOFIE libraries while it runs. This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. 
+--- + +## Profiling in user code + +Both the CPU and GPU code generators accept `Options::kProfile` to embed +per-operator timing and memory reporting directly in the generated session struct. + +### CPU inference + +```cpp +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +SOFIE::RModelParser_ONNX parser; +SOFIE::RModel model = parser.Parse("my_model.onnx"); + +// Generate with profiling enabled +model.Generate(SOFIE::Options::kProfile); +model.OutputGenerated("MyModel.hxx"); +``` + +The generated `Session` struct gains: + +| Method | Description | +|--------|-------------| +| `PrintProfilingResults(bool order=true)` | Per-operator mean ± stderr (µs), sorted by avg time | +| `ResetProfilingResults()` | Clear accumulated timing data | +| `GetOpAvgTime()` | `std::map` of averages | +| `GetOpVariance()` | `std::map` of variances | + +```cpp +#include "MyModel.hxx" +SOFIE_MyModel::Session session("MyModel.dat"); + +// Warmup +for (int i = 0; i < 10; ++i) session.infer(input); +session.ResetProfilingResults(); + +// Timed runs +for (int i = 0; i < 100; ++i) session.infer(input); +session.PrintProfilingResults(); +``` + +### GPU inference (Alpaka/CUDA) + +```cpp +model.GenerateGPU_ALPAKA(SOFIE::Options::kProfile); +model.OutputGenerated("MyModel_GPU_ALPAKA.hxx"); +``` + +The generated GPU `Session` additionally provides: + +| Method | Description | +|--------|-------------| +| `PrintProfilingResults(bool order=true)` | Per-operator GPU wall-clock time (µs) with `alpaka::wait` sync | +| `ResetProfilingResults()` | Clear accumulated timing data | +| `GetOpAvgTime()` | `std::map` of averages | +| `PrintMemoryInfo()` | CPU/GPU memory breakdown (computed at code-gen time) | + +```cpp +#include "MyModel_GPU_ALPAKA.hxx" +SOFIE_MyModel::Session session("MyModel_GPU_ALPAKA.dat"); + +for (int i = 0; i < 10; ++i) session.infer(input_d); // warmup +session.ResetProfilingResults(); + +for (int i = 0; i < 100; ++i) session.infer(input_d); // timed 
+session.PrintProfilingResults(); +session.PrintMemoryInfo(); +``` + +> **Timing accuracy:** `alpaka::wait(queue)` is called after each operator kernel +> so the wall-clock measurement captures actual GPU execution time. This +> disables kernel pipelining; use a non-profile build for throughput measurement. + +--- - ## Inspiration -The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). + +The standalone version of SOFIE is developed with inspiration from the standalone +version of RooFit developed by Jonas Rembser, which can be found +[here](https://github.com/guitargeek/roofit). diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 0000000..5ba3a9d --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,721 @@ +cmake_minimum_required(VERSION 3.18) +include(FetchContent) + +################################################################################ +# SOFIE Alpaka Benchmark Toolkit +# +# Usage: +# cmake -Bbuild -DSOFIE_BENCHMARK=ON . +# cmake --build build --target sofie_benchmark +# cd build/benchmark && ./sofie_benchmark [options] +# +# To also benchmark with ONNX Runtime GPU: +# cmake -Bbuild -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_ORT=ON \ +# [-DONNXRUNTIME_ROOT=/usr/local/onnxruntime] . +# ./sofie_benchmark --onnxruntime +# +# Place .onnx models in benchmark/models/ and re-run cmake to register them. +################################################################################ + +option(SOFIE_BENCHMARK_ORT + "Also benchmark ONNX Runtime GPU alongside SOFIE (requires ORT ≥ 1.18)" + OFF) + +option(SOFIE_BENCHMARK_PROFILE + "Enable per-operator GPU profiling (per-op timing + memory breakdown). \ +Mutually exclusive with throughput benchmarking — rebuild without this flag \ +to measure throughput." 
+ OFF) + +################################################################################ +# Backend / architecture selection +# +# SOFIE_BENCHMARK_BACKEND applies to both throughput benchmarking and profiling. +# Currently only CUDA is supported; setting any other value is a hard error. +# When AMD GPU (HIP/ROCm) support is added, this option will accept "HIP". +################################################################################ + +set(SOFIE_BENCHMARK_BACKEND "CUDA" CACHE STRING + "Target accelerator backend for SOFIE benchmark and profiling (currently only CUDA)") +set_property(CACHE SOFIE_BENCHMARK_BACKEND PROPERTY STRINGS CUDA) + +string(TOUPPER "${SOFIE_BENCHMARK_BACKEND}" _bench_backend) + +if(NOT _bench_backend STREQUAL "CUDA") + message(FATAL_ERROR + "SOFIE Benchmark: SOFIE_BENCHMARK_BACKEND='${SOFIE_BENCHMARK_BACKEND}' is not " + "supported. Only 'CUDA' is currently implemented. " + "AMD GPU (HIP/ROCm) support is planned for a future release.") +endif() + +# Compile-time defines propagated to every benchmark translation unit. +# These drive BenchmarkBackend.hxx type aliases and SOFIE_BENCH_DEVICE_SYNC(). 
+set(_SOFIE_BENCH_ALPAKA_DEFINE ALPAKA_ACC_GPU_CUDA_ENABLED) +set(_SOFIE_BENCH_BACKEND_DEFINE SOFIE_BACKEND_CUDA) + +if(SOFIE_BENCHMARK_PROFILE) + message(STATUS "SOFIE Benchmark: profiling ENABLED " + "(backend = ${SOFIE_BENCHMARK_BACKEND}, throughput benchmarking disabled)") +else() + message(STATUS "SOFIE Benchmark: backend = ${SOFIE_BENCHMARK_BACKEND}") +endif() + +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/../core/inc + ${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc +) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +################################################################################ +# Discover models +################################################################################ + +file(GLOB BENCHMARK_ONNX_MODELS + "${CMAKE_CURRENT_SOURCE_DIR}/models/*.onnx") + +# Exclude the base/template models used by specialize_models.py to generate +# fixed-size variants (e.g. gnn_h32_k2_n100_e500.onnx). The base models +# have symbolic dim_param input dimensions that SOFIE cannot resolve during +# code generation: +# • GetTensorShape() throws "unspecified dimension parameter" for input +# tensors whose shapes were not concretised by ONNX shape inference. +# • GetTensorShape() throws "is a dynamic tensor" when an operator tries +# to read a concrete size from an intermediate tensor that remained +# dynamic because its input shapes were unknown. +# The concrete specialised variants produced by specialize_models.py are +# benchmarked instead. +list(FILTER BENCHMARK_ONNX_MODELS EXCLUDE REGEX + "/(gnn_h32_k2|gnn_h64_k4|punet_h32_k2_heads4_layers2|punet_h64_k4_heads4_layers2|transformer_d32_h2_L6_ff32)\\.onnx$") + +# Exclude models that exceed GPU memory (cudaErrorMemoryAllocation) or trigger +# a cuBLAS EXECUTION_FAILED error (status 13) on the available hardware +# (RTX 2070 SUPER, 8 GB VRAM). They stay on disk for reference and are +# compiled into sofie_benchmark_large (cluster target) instead. 
+list(FILTER BENCHMARK_ONNX_MODELS EXCLUDE REGEX + "/(punet_h32_k2_heads4_layers2_n10000_e50000\ +|punet_h32_k2_heads4_layers2_n1000_e5000\ +|punet_h32_k2_heads4_layers2_n3000_e15000\ +|punet_h32_k2_heads4_layers2_n30000_e150000\ +|punet_h32_k2_heads4_layers2_n100000_e500000\ +|punet_h64_k4_heads4_layers2_n10000_e50000\ +|punet_h64_k4_heads4_layers2_n100_e500\ +|punet_h64_k4_heads4_layers2_n300_e1500\ +|punet_h64_k4_heads4_layers2_n3000_e15000\ +|punet_h64_k4_heads4_layers2_n30000_e150000\ +|punet_h64_k4_heads4_layers2_n100000_e500000\ +|gnn_h32_k2_n30000_e150000\ +|gnn_h32_k2_n100000_e500000\ +|gnn_h64_k4_n30000_e150000\ +|gnn_h64_k4_n100000_e500000\ +|transformer_L1000_B100\ +|transformer_L100_B100\ +|transformer_L8000_B1\ +|transformer_d32_h2_L6_ff32_n60_s60)\\.onnx$") + +if(NOT BENCHMARK_ONNX_MODELS) + message(STATUS + "SOFIE Benchmark: No .onnx models found in benchmark/models/. " + "Add ONNX models there and re-run cmake to enable benchmarking.") + return() +endif() + +list(LENGTH BENCHMARK_ONNX_MODELS N_MODELS) +message(STATUS "SOFIE Benchmark: Found ${N_MODELS} model(s) in benchmark/models/") + +################################################################################ +# Fetch Alpaka and sofieBLAS (same pinned revisions as the test suite) +################################################################################ + +FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG dev +) +FetchContent_MakeAvailable(sofieBLAS) + +FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d +) +FetchContent_MakeAvailable(alpaka) + +################################################################################ +# Hardware toolkit setup — CUDA (the only supported backend for now) +################################################################################ + +enable_language(CUDA) +find_package(CUDAToolkit REQUIRED) +message(STATUS 
"SOFIE Benchmark: CUDA toolkit ${CUDAToolkit_VERSION}") + +################################################################################ +# Optional: ONNX Runtime GPU backend +################################################################################ + +set(SOFIE_ORT_FOUND FALSE) + +if(SOFIE_BENCHMARK_ORT) + # Prefer manual detection — the installed ORT CMake config may reference + # a wrong lib path (e.g. lib64 vs lib) and raise a hard error even with QUIET. + # If ONNXRUNTIME_ROOT is provided, go straight to the manual path. + # Otherwise attempt the CMake config with NO_DEFAULT_PATH so it only looks + # where we tell it, and fall through to manual on failure. + + set(_ort_search_roots "") + if(DEFINED ONNXRUNTIME_ROOT) + list(APPEND _ort_search_roots "${ONNXRUNTIME_ROOT}") + endif() + list(APPEND _ort_search_roots + /usr/local/onnxruntime /usr/local /usr /opt) + + # Manual header + library search (reliable, no broken cmake-config risk) + find_path(ONNXRUNTIME_INCLUDE_DIR + NAMES onnxruntime_cxx_api.h + PATHS ${_ort_search_roots} + PATH_SUFFIXES include include/onnxruntime + NO_DEFAULT_PATH) + + find_library(ONNXRUNTIME_LIBRARY + NAMES onnxruntime + PATHS ${_ort_search_roots} + PATH_SUFFIXES lib lib64 + NO_DEFAULT_PATH) + + if(ONNXRUNTIME_INCLUDE_DIR AND ONNXRUNTIME_LIBRARY) + set(SOFIE_ORT_FOUND TRUE) + add_library(onnxruntime::onnxruntime SHARED IMPORTED) + set_target_properties(onnxruntime::onnxruntime PROPERTIES + IMPORTED_LOCATION "${ONNXRUNTIME_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${ONNXRUNTIME_INCLUDE_DIR}") + set(SOFIE_ORT_TARGET onnxruntime::onnxruntime) + message(STATUS "SOFIE Benchmark: ONNX Runtime found — ${ONNXRUNTIME_LIBRARY}") + message(STATUS "SOFIE Benchmark: ORT headers — ${ONNXRUNTIME_INCLUDE_DIR}") + else() + message(WARNING + "SOFIE Benchmark: SOFIE_BENCHMARK_ORT=ON but ONNX Runtime " + "not found. Set -DONNXRUNTIME_ROOT= or install ORT. 
" + "ORT benchmarking will be disabled.") + endif() +endif() + +if(SOFIE_BENCHMARK_ORT AND NOT SOFIE_ORT_FOUND) + message(STATUS "SOFIE Benchmark: ORT benchmarking disabled (library not found)") +endif() + +################################################################################ +# Build per-model strings for configure_file +################################################################################ + +set(_EMIT_BLOCK +"try {\n\ + EmitBenchmarkModel(\"@1@\", \"@2@\", outDir);\n\ +} catch (const std::exception &e) {\n\ + std::cerr << \"[ERROR] @2@: \" << e.what() << \"\\n\";\n\ + ++failures;\n\ +} catch (...) {\n\ + std::cerr << \"[ERROR] @2@: unknown exception\\n\";\n\ + ++failures;\n\ +}\n\ +") + +set(BENCHMARK_EMIT_CAPTURES "") +set(BENCHMARK_BENCH_HEADERS "") +set(BENCHMARK_FWD_DECLS "") +set(BENCHMARK_SINGLE_MODEL_CASES "") +set(BENCHMARK_SPAWN_CALLS "") +set(GENERATED_HEADERS "") +set(BENCHMARK_MODEL_CU_SRCS "") + +foreach(ONNX_FILE ${BENCHMARK_ONNX_MODELS}) + get_filename_component(MODEL_NAME "${ONNX_FILE}" NAME_WE) + + string(REGEX REPLACE "[^A-Za-z0-9]" "_" MODEL_CPPNAME "${MODEL_NAME}") + + set(GEN_HXX "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_GPU_ALPAKA.hxx") + set(GEN_BENCH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.hxx") + list(APPEND GENERATED_HEADERS "${GEN_HXX}" "${GEN_BENCH}") + + string(REPLACE "@1@" "${ONNX_FILE}" _emit_cap "${_EMIT_BLOCK}") + string(REPLACE "@2@" "${MODEL_NAME}" _emit_cap "${_emit_cap}") + string(APPEND BENCHMARK_EMIT_CAPTURES "${_emit_cap}") + + string(APPEND BENCHMARK_BENCH_HEADERS + "#include \"${MODEL_NAME}_bench.hxx\"\n") + + # Forward declaration for the main TU (function defined in per-model .cu) + string(APPEND BENCHMARK_FWD_DECLS + "void Benchmark_${MODEL_CPPNAME}(int warmup, int iterations, const std::string& weightsDir);\n") + + # Single-model dispatch: one if-branch per model (used in --single-model mode) + string(APPEND BENCHMARK_SINGLE_MODEL_CASES + " if (model == \"${MODEL_CPPNAME}\") {\n" + " 
Benchmark_${MODEL_CPPNAME}(warmup, iterations, weightsDir);\n" + "#ifdef SOFIE_BENCHMARK_ORT\n" + " if (run_ort) BenchmarkORT_GPU(\"${ONNX_FILE}\", \"${MODEL_NAME}\", warmup, iterations);\n" + "#endif\n" + " return 0;\n" + " }\n" + ) + + # Subprocess spawn: run this binary as a child with --single-model . + # Each child gets a fresh CUDA context so GPU memory from the previous + # model is completely freed before the next one allocates. + string(APPEND BENCHMARK_SPAWN_CALLS + " {\n" + " std::string cmd = std::string(argv[0])\n" + " + \" --single-model ${MODEL_CPPNAME}\"\n" + " + commonArgs;\n" + " int rc = std::system(cmd.c_str());\n" + " if (rc != 0) {\n" + " std::fprintf(stderr, \"[ERROR] ${MODEL_NAME}: subprocess exited %d\\n\", rc);\n" + " ++totalFailed;\n" + " }\n" + " }\n" + ) + + # Per-model compilation unit: include one bench.hxx → one .cu file + set(_model_cu "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.cu") + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/ModelBench.cu.in" + "${_model_cu}" + @ONLY + ) + list(APPEND BENCHMARK_MODEL_CU_SRCS "${_model_cu}") +endforeach() + +################################################################################ +# Configure emitter and runner sources +################################################################################ + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkEmitter.cxx.in" + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx" + @ONLY +) + +# Main runner: only main() + forward declarations (no model headers). +# Each model is compiled in its own _bench.cu TU (see BENCHMARK_MODEL_CU_SRCS). 
+set(RUNNER_SRC "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_main.cpp") +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkRunner.cxx.in" + "${RUNNER_SRC}" + @ONLY +) + +################################################################################ +# Emitter executable (plain C++, generates SOFIE headers at build time) +################################################################################ + +add_executable(sofie_benchmark_emitter + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx" +) + +target_include_directories(sofie_benchmark_emitter PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../core/inc" + "${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc" +) + +target_link_libraries(sofie_benchmark_emitter PRIVATE + SOFIE_core + SOFIE_parsers + protobuf::libprotobuf +) + +target_compile_options(sofie_benchmark_emitter PRIVATE + -Wno-unused-parameter + -Wno-array-bounds +) + +target_compile_definitions(sofie_benchmark_emitter PRIVATE + $<$:SOFIE_BENCHMARK_PROFILE> +) + +################################################################################ +# Custom command: run emitter → generate inference + benchmark headers +################################################################################ + +add_custom_command( + OUTPUT ${GENERATED_HEADERS} + COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND "$" "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS sofie_benchmark_emitter ${BENCHMARK_ONNX_MODELS} + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + COMMENT "SOFIE Benchmark: generating headers for ${N_MODELS} model(s)..." + VERBATIM +) + +add_custom_target(sofie_benchmark_headers + DEPENDS ${GENERATED_HEADERS} +) + +################################################################################ +# Benchmark runner (compiled as .cu, same as the test suite) +################################################################################ + +# Mark every per-model bench.cu as CUDA so nvcc processes the device kernels. 
+set_source_files_properties(${BENCHMARK_MODEL_CU_SRCS} PROPERTIES LANGUAGE CUDA) + +# The main runner is plain C++ (no CUDA kernels, just calls forward-declared fns). +set_source_files_properties("${RUNNER_SRC}" PROPERTIES LANGUAGE CXX) + +add_executable(sofie_benchmark "${RUNNER_SRC}" ${BENCHMARK_MODEL_CU_SRCS}) + +add_dependencies(sofie_benchmark sofie_benchmark_headers) + +target_include_directories(sofie_benchmark PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}" # generated headers live here + "${CMAKE_CURRENT_SOURCE_DIR}/src" # ONNXRuntimeBenchmark.hxx + "${alpaka_SOURCE_DIR}/include" + "${sofieblas_SOURCE_DIR}/include" + "${CUDAToolkit_INCLUDE_DIRS}" +) + +# Default to the native GPU architecture so we compile only one arch (saves +# memory and time). The previous "70;75;80;86;89;90" multi-arch list caused +# nvcc to OOM-kill when processing all models. Override via +# cmake -DSOFIE_BENCHMARK_CUDA_ARCH="75;86" . +if(NOT DEFINED SOFIE_BENCHMARK_CUDA_ARCH OR SOFIE_BENCHMARK_CUDA_ARCH STREQUAL "") + if(CMAKE_CUDA_ARCHITECTURES AND NOT CMAKE_CUDA_ARCHITECTURES STREQUAL "") + set(SOFIE_BENCHMARK_CUDA_ARCH "${CMAKE_CUDA_ARCHITECTURES}") + else() + set(SOFIE_BENCHMARK_CUDA_ARCH "75") # RTX 2070 SUPER default + endif() +endif() +message(STATUS "SOFIE Benchmark: CUDA architectures = ${SOFIE_BENCHMARK_CUDA_ARCH}") + +set_target_properties(sofie_benchmark PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES "${SOFIE_BENCHMARK_CUDA_ARCH}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" +) + +target_compile_definitions(sofie_benchmark PRIVATE + ${_SOFIE_BENCH_ALPAKA_DEFINE} + ${_SOFIE_BENCH_BACKEND_DEFINE} + ALPAKA_HAS_STD_ATOMIC_REF + $<$:SOFIE_BENCHMARK_ORT> + $<$:SOFIE_BENCHMARK_PROFILE> +) + +target_compile_options(sofie_benchmark PRIVATE + $<$: + --extended-lambda + --expt-relaxed-constexpr + --use_fast_math + -O1 + -Wno-deprecated-gpu-targets + # Suppress "variable was declared but never referenced" 
(#177-D). + # Generated Expand/Where kernels compute per-dimension indices for + # broadcast dimensions; when a dimension has size 1 the index variable + # is always 0 and goes unused. Without this suppress the CUDA device + # compiler counts them against its error-budget and stops compilation. + -diag-suppress 177 + # Limit device register usage to reduce per-kernel memory footprint + # during compilation of the large all-models TU. + --maxrregcount=64 + > + $<$: + -O2 + -fPIC + > +) + +target_link_libraries(sofie_benchmark PRIVATE + SOFIE_core + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + $<$:${SOFIE_ORT_TARGET}> +) + +if(SOFIE_ORT_FOUND) + message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured " + "(${N_MODELS} model(s), CUDA backend + ORT-GPU)") +else() + message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured " + "(${N_MODELS} model(s), CUDA backend; " + "re-configure with -DSOFIE_BENCHMARK_ORT=ON for ORT comparison)") +endif() + +# Convenience CTest entry +if(testing) + add_test( + NAME SofieBenchmark + COMMAND sofie_benchmark --warmup 5 --iterations 20 + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) +endif() + +################################################################################ +# Large-input benchmark (cluster GPUs: A100 / H100 / MI300X, ≥40 GB VRAM) +# +# Enables: +# • All models excluded from sofie_benchmark due to OOM on ≤8 GB GPUs +# (punet large, transformer_L1000_B100, transformer_L8000_B1, ...) +# • New very-large GNN / PUNet variants: n=30 000 / n=100 000 nodes +# • transformer_d32_h2_L6_ff32_n60_s60 (max sequence length for this arch) +# +# Usage: +# cmake -Bbuild . 
-DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_LARGE=ON \ +# [-DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=80] # 80=A100, 90=H100 +# cmake --build build --target sofie_benchmark_large +################################################################################ + +option(SOFIE_BENCHMARK_LARGE + "Build sofie_benchmark_large for cluster GPUs (A100/H100, ≥40 GB VRAM)" + OFF) + +if(SOFIE_BENCHMARK_LARGE) + + # CUDA architecture for the cluster. Override on the command line: + # cmake ... -DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=90 # H100 + set(SOFIE_BENCHMARK_LARGE_CUDA_ARCH "80" CACHE STRING + "CUDA SM architecture(s) for sofie_benchmark_large (e.g. 80=A100, 90=H100)") + + # ------------------------------------------------------------------------- + # Exact list of large-input models to include. + # These are either previously OOM-excluded or brand-new (n=30k / n=100k). + # ------------------------------------------------------------------------- + set(LARGE_MODEL_NAMES + # ── PUNet h32/k2 (OOM + new) ────────────────────────────────────── + "punet_h32_k2_heads4_layers2_n1000_e5000" + "punet_h32_k2_heads4_layers2_n3000_e15000" + "punet_h32_k2_heads4_layers2_n10000_e50000" + "punet_h32_k2_heads4_layers2_n30000_e150000" + "punet_h32_k2_heads4_layers2_n100000_e500000" + # ── PUNet h64/k4 (OOM + new) ────────────────────────────────────── + "punet_h64_k4_heads4_layers2_n100_e500" + "punet_h64_k4_heads4_layers2_n300_e1500" + "punet_h64_k4_heads4_layers2_n1000_e5000" + "punet_h64_k4_heads4_layers2_n3000_e15000" + "punet_h64_k4_heads4_layers2_n10000_e50000" + "punet_h64_k4_heads4_layers2_n30000_e150000" + "punet_h64_k4_heads4_layers2_n100000_e500000" + # ── GNN h32/k2 (new large) ──────────────────────────────────────── + "gnn_h32_k2_n30000_e150000" + "gnn_h32_k2_n100000_e500000" + # ── GNN h64/k4 (new large) ──────────────────────────────────────── + "gnn_h64_k4_n30000_e150000" + "gnn_h64_k4_n100000_e500000" + # ── Transformers (OOM on 8 GB) ────────────────────────────────────── + 
"transformer_L100_B100" + "transformer_L1000_B100" + "transformer_L8000_B1" + # ── Transformer d32 (max sequence length for this architecture) ───── + "transformer_d32_h2_L6_ff32_n60_s60" + ) + + # Build a regex that matches any of the large model names. + set(_large_regex "") + foreach(_m ${LARGE_MODEL_NAMES}) + if(_large_regex) + set(_large_regex "${_large_regex}|${_m}") + else() + set(_large_regex "${_m}") + endif() + endforeach() + set(_large_regex "(${_large_regex})\\.onnx$") + + file(GLOB _ALL_LARGE_ONNX "${CMAKE_CURRENT_SOURCE_DIR}/models/*.onnx") + list(FILTER _ALL_LARGE_ONNX INCLUDE REGEX "${_large_regex}") + + if(NOT _ALL_LARGE_ONNX) + message(WARNING + "SOFIE_BENCHMARK_LARGE=ON but none of the expected large-input .onnx " + "files were found in benchmark/models/. Run specialize_models.py " + "(with the extended GNN_VARIANTS list) first, then re-run cmake.") + else() + list(LENGTH _ALL_LARGE_ONNX N_LARGE) + message(STATUS + "SOFIE Benchmark Large: ${N_LARGE} large-input model(s), " + "CUDA arch = ${SOFIE_BENCHMARK_LARGE_CUDA_ARCH}") + + # ------------------------------------------------------------------ + # Build per-model strings (same pattern as the small benchmark above) + # ------------------------------------------------------------------ + set(LARGE_EMIT_CAPTURES "") + set(LARGE_FWD_DECLS "") + set(LARGE_SINGLE_MODEL_CASES "") + set(LARGE_SPAWN_CALLS "") + set(LARGE_GENERATED_HEADERS "") + set(LARGE_MODEL_CU_SRCS "") + + foreach(ONNX_FILE ${_ALL_LARGE_ONNX}) + get_filename_component(MODEL_NAME "${ONNX_FILE}" NAME_WE) + string(REGEX REPLACE "[^A-Za-z0-9]" "_" MODEL_CPPNAME "${MODEL_NAME}") + + set(GEN_HXX "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_GPU_ALPAKA.hxx") + set(GEN_BENCH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.hxx") + list(APPEND LARGE_GENERATED_HEADERS "${GEN_HXX}" "${GEN_BENCH}") + + string(REPLACE "@1@" "${ONNX_FILE}" _ec "${_EMIT_BLOCK}") + string(REPLACE "@2@" "${MODEL_NAME}" _ec "${_ec}") + string(APPEND LARGE_EMIT_CAPTURES 
"${_ec}") + + string(APPEND LARGE_FWD_DECLS + "void Benchmark_${MODEL_CPPNAME}(int warmup, int iterations, const std::string& weightsDir);\n") + + string(APPEND LARGE_SINGLE_MODEL_CASES + " if (model == \"${MODEL_CPPNAME}\") {\n" + " Benchmark_${MODEL_CPPNAME}(warmup, iterations, weightsDir);\n" + "#ifdef SOFIE_BENCHMARK_ORT\n" + " if (run_ort) BenchmarkORT_GPU(\"${ONNX_FILE}\", \"${MODEL_NAME}\", warmup, iterations);\n" + "#endif\n" + " return 0;\n" + " }\n" + ) + + string(APPEND LARGE_SPAWN_CALLS + " {\n" + " std::string cmd = std::string(argv[0])\n" + " + \" --single-model ${MODEL_CPPNAME}\"\n" + " + commonArgs;\n" + " int rc = std::system(cmd.c_str());\n" + " if (rc != 0) {\n" + " std::fprintf(stderr, \"[ERROR] ${MODEL_NAME}: subprocess exited %d\\n\", rc);\n" + " ++totalFailed;\n" + " }\n" + " }\n" + ) + + set(_model_cu "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.cu") + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/ModelBench.cu.in" + "${_model_cu}" + @ONLY + ) + list(APPEND LARGE_MODEL_CU_SRCS "${_model_cu}") + endforeach() + + # ------------------------------------------------------------------ + # Configure emitter + runner sources for the large benchmark + # ------------------------------------------------------------------ + set(BENCHMARK_EMIT_CAPTURES "${LARGE_EMIT_CAPTURES}") + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkEmitter.cxx.in" + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_large.cxx" + @ONLY + ) + + set(BENCHMARK_FWD_DECLS "${LARGE_FWD_DECLS}") + set(BENCHMARK_SINGLE_MODEL_CASES "${LARGE_SINGLE_MODEL_CASES}") + set(BENCHMARK_SPAWN_CALLS "${LARGE_SPAWN_CALLS}") + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkRunner.cxx.in" + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp" + @ONLY + ) + + # ------------------------------------------------------------------ + # sofie_benchmark_large_emitter — generates hxx + dat at build time + # ------------------------------------------------------------------ + 
add_executable(sofie_benchmark_large_emitter + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_large.cxx" + ) + target_include_directories(sofie_benchmark_large_emitter PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/../core/inc" + "${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc" + ) + target_link_libraries(sofie_benchmark_large_emitter PRIVATE + SOFIE_core + SOFIE_parsers + protobuf::libprotobuf + ) + target_compile_options(sofie_benchmark_large_emitter PRIVATE + -Wno-unused-parameter + -Wno-array-bounds + ) + target_compile_definitions(sofie_benchmark_large_emitter PRIVATE + $<$:SOFIE_BENCHMARK_PROFILE> + ) + + # ------------------------------------------------------------------ + # Custom command: run large emitter → large hxx / dat files + # ------------------------------------------------------------------ + add_custom_command( + OUTPUT ${LARGE_GENERATED_HEADERS} + COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND "$" "${CMAKE_CURRENT_BINARY_DIR}" + DEPENDS sofie_benchmark_large_emitter ${_ALL_LARGE_ONNX} + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + COMMENT "SOFIE Benchmark Large: generating headers for ${N_LARGE} large model(s)..." 
+ VERBATIM + ) + + add_custom_target(sofie_benchmark_large_headers + DEPENDS ${LARGE_GENERATED_HEADERS} + ) + + # ------------------------------------------------------------------ + # sofie_benchmark_large — the cluster benchmark binary + # ------------------------------------------------------------------ + set_source_files_properties(${LARGE_MODEL_CU_SRCS} PROPERTIES LANGUAGE CUDA) + set_source_files_properties( + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp" + PROPERTIES LANGUAGE CXX) + + add_executable(sofie_benchmark_large + "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp" + ${LARGE_MODEL_CU_SRCS} + ) + add_dependencies(sofie_benchmark_large sofie_benchmark_large_headers) + + target_include_directories(sofie_benchmark_large PRIVATE + "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${alpaka_SOURCE_DIR}/include" + "${sofieblas_SOURCE_DIR}/include" + "${CUDAToolkit_INCLUDE_DIRS}" + ) + + set_target_properties(sofie_benchmark_large PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES "${SOFIE_BENCHMARK_LARGE_CUDA_ARCH}" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + ) + + target_compile_definitions(sofie_benchmark_large PRIVATE + ${_SOFIE_BENCH_ALPAKA_DEFINE} + ${_SOFIE_BENCH_BACKEND_DEFINE} + ALPAKA_HAS_STD_ATOMIC_REF + $<$:SOFIE_BENCHMARK_PROFILE> + # ORT intentionally excluded: libonnxruntime has no static lib, + # so the binary would fail on pods that lack the CUDA toolkit. 
+ ) + + target_compile_options(sofie_benchmark_large PRIVATE + $<$: + --extended-lambda + --expt-relaxed-constexpr + --use_fast_math + -O2 + -Wno-deprecated-gpu-targets + -diag-suppress 177 + --maxrregcount=64 + > + $<$: + -O2 + -fPIC + > + ) + + target_link_libraries(sofie_benchmark_large PRIVATE + SOFIE_core + CUDA::cudart_static # statically embed libcudart — no .so needed on the pod + CUDA::cublas_static # statically embed libcublas + CUDA::cublasLt_static # statically embed libcublasLt + CUDA::culibos # required companion for static cublas + # ORT intentionally excluded — no static libonnxruntime available + ) + + message(STATUS + "SOFIE Benchmark Large: target 'sofie_benchmark_large' configured " + "(${N_LARGE} large model(s), sm${SOFIE_BENCHMARK_LARGE_CUDA_ARCH})") + endif() # _ALL_LARGE_ONNX +endif() # SOFIE_BENCHMARK_LARGE diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..b06a2bf --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,234 @@ +# SOFIE Benchmark for Inference on Heterogeneous Architectures + +Measures **inference latency and throughput** for ONNX models compiled by SOFIE and +executed via [Alpaka](https://github.com/alpaka-group/alpaka). Optionally runs the +same models through **ONNX Runtime GPU** for a side-by-side comparison. + +--- + +## Supported Backends + +| Backend | CMake value | Status | +|---------|-------------|--------| +| NVIDIA CUDA | `CUDA` (default) | Supported | +| AMD HIP/ROCm | `HIP` | Planned — not yet implemented | + +The target architecture is selected with `-DSOFIE_BENCHMARK_BACKEND=` at +configure time. Specifying any value other than `CUDA` is a **hard CMake error** until +the corresponding backend is implemented. + +The generated inference code and timing harness are backend-agnostic: they use +`sofie_bench::AccTag`, `sofie_bench::Platform`, `sofie_bench::Queue`, and the +`SOFIE_BENCH_DEVICE_SYNC()` macro defined in `src/BenchmarkBackend.hxx`. 
Only the +low-level toolkit (CUDA vs HIP) needs to be swapped to add a new backend. + +--- + +## Quick Start + +### 1. Add your models + +``` +benchmark/models/ + GNN_model.onnx + simple_transformer.onnx + resnet50.onnx + ... +``` + +Re-run CMake after adding or removing files (it globs `models/*.onnx`). + +### 2. Configure + +```bash +# SOFIE inference only — CUDA backend (default) +cmake -B build -DSOFIE_BENCHMARK=ON /path/to/SOFIE + +# Explicitly name the backend (useful for CI or future HIP support) +cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_BACKEND=CUDA /path/to/SOFIE + +# With ONNX Runtime GPU comparison +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_ORT=ON \ + -DONNXRUNTIME_ROOT=/path/to/onnxruntime \ + /path/to/SOFIE + +# Override the CUDA SM architecture (default: native GPU or sm_75) +cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_CUDA_ARCH="86" /path/to/SOFIE +``` + +| CMake flag | Default | Description | +|---|---|---| +| `-DSOFIE_BENCHMARK=ON` | — | Enable the benchmark suite | +| `-DSOFIE_BENCHMARK_BACKEND=` | `CUDA` | Target accelerator backend | +| `-DSOFIE_BENCHMARK_CUDA_ARCH=` | native / `75` | CUDA SM architecture(s), e.g. `86` for RTX 30xx, `80` for A100 | +| `-DSOFIE_BENCHMARK_ORT=ON` | `OFF` | Also benchmark ONNX Runtime GPU | +| `-DONNXRUNTIME_ROOT=` | — | Path for ORT headers/library | +| `-DSOFIE_BENCHMARK_PROFILE=ON` | `OFF` | Enable per-operator GPU profiling instead of throughput benchmarking (see [Profiling](#profiling)) | +| `-DSOFIE_BENCHMARK_LARGE=ON` | `OFF` | Build `sofie_benchmark_large` for cluster GPUs (A100/H100, ≥40 GB VRAM) | +| `-DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=` | `80` | CUDA SM architecture for the large-input benchmark | + +> **Tested with ONNX Runtime 1.22.0 GPU** +> (`onnxruntime-linux-x64-gpu-1.22.0`). The CMake config bundled with some ORT +> installations may reference an incorrect `lib64/` path — this toolkit uses manual +> header/library detection to avoid that. + +### 3. 
Build + +```bash +cmake --build build --target sofie_benchmark -j$(nproc) +``` + +This automatically: +1. Builds **`sofie_benchmark_emitter`** — parses each `.onnx` and emits: + - `<model>_GPU_ALPAKA.hxx` — SOFIE Alpaka inference code + - `<model>_GPU_ALPAKA.dat` — serialized weights + - `<model>_bench.hxx` — timing wrapper `Benchmark_<model>()` +2. Builds **`sofie_benchmark`** — compiles all generated code and links the timing loop. + +### 4. Run + +```bash +cd build/benchmark + +# SOFIE only (no ORT needed at runtime) +./sofie_benchmark + +# SOFIE + ONNX Runtime GPU comparison +LD_LIBRARY_PATH=/path/to/onnxruntime/lib:$LD_LIBRARY_PATH \ +./sofie_benchmark --onnxruntime +``` + +--- + +## Runtime Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--warmup, -w <N>` | 10 | Warm-up iterations (not timed) | +| `--iterations, -n <N>` | 100 | Timed iterations | +| `--weights-dir <dir>` | `.` | Directory containing `.dat` weight files | +| `--onnxruntime, --ort` | off | Run ONNX Runtime GPU benchmark after each SOFIE model | +| `--help, -h` | | Print this help and exit | + +--- + +## Large-input Benchmark (`sofie_benchmark_large`) + +For cluster GPUs (A100/H100/MI300X with ≥40 GB VRAM) a separate target is available +that includes models excluded from the default benchmark due to memory constraints on +consumer cards (≤8 GB): + +```bash +cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_LARGE=ON \ + -DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=80 # 80=A100, 90=H100 +cmake --build build --target sofie_benchmark_large -j$(nproc) +``` + +The large-benchmark binary links CUDA runtime statically so it can run on cluster +nodes where the CUDA toolkit is not installed system-wide. + +--- + +## Profiling + +Profiling and throughput benchmarking are **mutually exclusive** builds. 
Rebuild +with `-DSOFIE_BENCHMARK_PROFILE=ON` to switch the binary into profiling mode: the +timed H2D/inference/D2H loops are replaced by a profiling pass that measures +per-operator GPU time and prints a CPU/GPU memory breakdown. The target backend +and CUDA architecture are controlled by the same `SOFIE_BENCHMARK_BACKEND` and +`SOFIE_BENCHMARK_CUDA_ARCH` flags used for benchmarking. + +```bash +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_PROFILE=ON \ + /path/to/SOFIE +cmake --build build --target sofie_benchmark -j$(nproc) +cd build/benchmark && ./sofie_benchmark +``` + +In a profiling build, each model prints two blocks in place of the throughput table: + +**GPU Profiling Results** — per-operator wall-clock time (microseconds) measured +with `std::chrono` and an `alpaka::wait(queue)` synchronisation point after every +kernel. Results are sorted by average time descending, with ± stderr over all +timed iterations. Warmup iterations are excluded (the session is reset before the +timed runs start). + +``` +============================================================ + GPU PROFILING RESULTS + (wall-clock with alpaka::wait synchronization) +============================================================ + MatMul_3 : 142.718 +/- 0.412 us (100 runs) + MatMul_1 : 138.005 +/- 0.389 us (100 runs) + LayerNorm_5 : 23.441 +/- 0.201 us (100 runs) + ... + Overall_Time : 847.332 +/- 1.104 us (100 runs) +============================================================ +``` + +**Memory Usage Breakdown** — sizes computed at code-generation time from tensor +shapes and types. No runtime measurement is needed; the values are embedded +as constants in the generated session code. 
+ +``` +============================================================ + MEMORY USAGE BREAKDOWN +============================================================ + CPU Memory: + Constant/embedded tensors : 0 bytes (0.0000 MB) + Weight tensors : 12582912 bytes (12.000 MB) + Intermediate memory pool : 0 bytes (0.0000 MB) + Total CPU : 12582912 bytes (12.000 MB) + GPU Memory (device buffers): + Weight device buffers : 12582912 bytes (12.000 MB) + Intermediate device bufs : 4194304 bytes (4.000 MB) + Total GPU : 16777216 bytes (16.000 MB) +============================================================ +``` + +> **Note:** Profiling and benchmarking are mutually exclusive. In a profiling +> build the throughput table is not printed; in a benchmark build +> `PrintProfilingResults` / `PrintMemoryInfo` are not called. Rebuild without +> `-DSOFIE_BENCHMARK_PROFILE=ON` to measure peak throughput. + +The same flag works for the large-input benchmark: + +```bash +cmake -B build \ + -DSOFIE_BENCHMARK=ON \ + -DSOFIE_BENCHMARK_LARGE=ON \ + -DSOFIE_BENCHMARK_PROFILE=ON \ + /path/to/SOFIE +cmake --build build --target sofie_benchmark_large -j$(nproc) +``` + +--- + +## Re-running after adding models + +```bash +cmake build +cmake --build build --target sofie_benchmark -j$(nproc) +``` + +--- + +## Adding a New Backend (HIP/ROCm) + +The benchmark infrastructure is designed so adding a new backend requires changes +in only a few places: + +1. **`CMakeLists.txt`** — add `"HIP"` to the `SOFIE_BENCHMARK_BACKEND` allowed + values, call `enable_language(HIP)`, find `hip::host`, and set + `_SOFIE_BENCH_ALPAKA_DEFINE = ALPAKA_ACC_GPU_HIP_ENABLED` / + `_SOFIE_BENCH_BACKEND_DEFINE = SOFIE_BACKEND_HIP`. +2. **`src/BenchmarkBackend.hxx`** — already contains the `SOFIE_BACKEND_HIP` branch + with `alpaka::TagGpuHipRt` aliases and `hipDeviceSynchronize()` sync macro. +3. **`src/ModelBench.cu.in`** — rename to `.hip.in` (or use a common extension) and + configure the source file language property to `HIP`. 
+4. **`src/ONNXRuntimeBenchmark.hxx`** — swap `OrtCUDAProviderOptions` for the ROCm + execution provider options if ORT comparison is desired on AMD hardware. diff --git a/benchmark/models/.gitkeep b/benchmark/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmark/models/GNN_model.onnx b/benchmark/models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and b/benchmark/models/GNN_model.onnx differ diff --git a/benchmark/models/gnn_h32_k2.onnx b/benchmark/models/gnn_h32_k2.onnx new file mode 100644 index 0000000..03e9f69 Binary files /dev/null and b/benchmark/models/gnn_h32_k2.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n100000_e500000.onnx b/benchmark/models/gnn_h32_k2_n100000_e500000.onnx new file mode 100644 index 0000000..eb43d13 Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n100000_e500000.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n10000_e50000.onnx b/benchmark/models/gnn_h32_k2_n10000_e50000.onnx new file mode 100644 index 0000000..bf795eb Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n10000_e50000.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n1000_e5000.onnx b/benchmark/models/gnn_h32_k2_n1000_e5000.onnx new file mode 100644 index 0000000..4ecab7b Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n1000_e5000.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n100_e500.onnx b/benchmark/models/gnn_h32_k2_n100_e500.onnx new file mode 100644 index 0000000..aa94c75 Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n100_e500.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n30000_e150000.onnx b/benchmark/models/gnn_h32_k2_n30000_e150000.onnx new file mode 100644 index 0000000..c7d2a73 Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n30000_e150000.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n3000_e15000.onnx b/benchmark/models/gnn_h32_k2_n3000_e15000.onnx new file mode 100644 index 0000000..cd3c21b Binary files /dev/null 
and b/benchmark/models/gnn_h32_k2_n3000_e15000.onnx differ diff --git a/benchmark/models/gnn_h32_k2_n300_e1500.onnx b/benchmark/models/gnn_h32_k2_n300_e1500.onnx new file mode 100644 index 0000000..2761c2a Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n300_e1500.onnx differ diff --git a/benchmark/models/gnn_h64_k4.onnx b/benchmark/models/gnn_h64_k4.onnx new file mode 100644 index 0000000..7de5594 Binary files /dev/null and b/benchmark/models/gnn_h64_k4.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n100000_e500000.onnx b/benchmark/models/gnn_h64_k4_n100000_e500000.onnx new file mode 100644 index 0000000..379fe0c Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n100000_e500000.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n10000_e50000.onnx b/benchmark/models/gnn_h64_k4_n10000_e50000.onnx new file mode 100644 index 0000000..177aa1c Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n10000_e50000.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n1000_e5000.onnx b/benchmark/models/gnn_h64_k4_n1000_e5000.onnx new file mode 100644 index 0000000..f9f92ef Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n1000_e5000.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n100_e500.onnx b/benchmark/models/gnn_h64_k4_n100_e500.onnx new file mode 100644 index 0000000..4496d2f Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n100_e500.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n30000_e150000.onnx b/benchmark/models/gnn_h64_k4_n30000_e150000.onnx new file mode 100644 index 0000000..8da0743 Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n30000_e150000.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n3000_e15000.onnx b/benchmark/models/gnn_h64_k4_n3000_e15000.onnx new file mode 100644 index 0000000..16de0ed Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n3000_e15000.onnx differ diff --git a/benchmark/models/gnn_h64_k4_n300_e1500.onnx b/benchmark/models/gnn_h64_k4_n300_e1500.onnx new file 
mode 100644 index 0000000..3e6d355 Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n300_e1500.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2.onnx b/benchmark/models/punet_h32_k2_heads4_layers2.onnx new file mode 100644 index 0000000..a918af9 Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx new file mode 100644 index 0000000..0b6a9b9 Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx new file mode 100644 index 0000000..419a6e8 Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx new file mode 100644 index 0000000..fab0378 Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx new file mode 100644 index 0000000..68f5278 Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx new file mode 100644 index 0000000..be0835a Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx new file mode 100644 index 0000000..a4f0ef3 Binary files /dev/null and 
b/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx differ diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx new file mode 100644 index 0000000..f1be9ad Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2.onnx b/benchmark/models/punet_h64_k4_heads4_layers2.onnx new file mode 100644 index 0000000..398e1a9 Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx new file mode 100644 index 0000000..882cba5 Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx new file mode 100644 index 0000000..007f479 Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx new file mode 100644 index 0000000..a6e583f Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx new file mode 100644 index 0000000..640d9fc Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx new file mode 100644 index 0000000..d8ef5ab Binary files /dev/null and 
b/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx new file mode 100644 index 0000000..eca6217 Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx differ diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx new file mode 100644 index 0000000..b9dfa5e Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx differ diff --git a/benchmark/models/simple_transformer.onnx b/benchmark/models/simple_transformer.onnx new file mode 100644 index 0000000..1925d9d Binary files /dev/null and b/benchmark/models/simple_transformer.onnx differ diff --git a/benchmark/models/simple_transformer.onnx.data b/benchmark/models/simple_transformer.onnx.data new file mode 100644 index 0000000..3f52857 Binary files /dev/null and b/benchmark/models/simple_transformer.onnx.data differ diff --git a/benchmark/models/simple_transformer_300.onnx b/benchmark/models/simple_transformer_300.onnx new file mode 100644 index 0000000..b32c59a Binary files /dev/null and b/benchmark/models/simple_transformer_300.onnx differ diff --git a/benchmark/models/simple_transformer_300.onnx.data b/benchmark/models/simple_transformer_300.onnx.data new file mode 100644 index 0000000..d1a5ee9 Binary files /dev/null and b/benchmark/models/simple_transformer_300.onnx.data differ diff --git a/benchmark/models/specialize_models.py b/benchmark/models/specialize_models.py new file mode 100644 index 0000000..fae90ad --- /dev/null +++ b/benchmark/models/specialize_models.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +""" +Specialize parametric ONNX models to static input shapes for SOFIE GPU benchmarking. 
+ +For each source model, produces N variants with concrete (static) input dimensions, +replacing all dim_param symbols with dim_value integers, then runs ONNX shape +inference to propagate concrete shapes through the entire graph. + +Usage: + python3 specialize_models.py +""" + +import copy +import os +import sys +import onnx +from onnx import shape_inference, TensorProto + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def set_static_inputs(model: onnx.ModelProto, + shape_map: dict[str, list[int]]) -> onnx.ModelProto: + """ + Return a deep copy of *model* with every dim_param in graph inputs replaced + by the concrete dim_value from *shape_map* {input_name: [d0, d1, ...]}. + Unmentioned inputs are left unchanged. + """ + m = copy.deepcopy(model) + for inp in m.graph.input: + if inp.name not in shape_map: + continue + t = inp.type.tensor_type + new_dims = shape_map[inp.name] + for i, dim in enumerate(t.shape.dim): + dim.ClearField("dim_param") + dim.dim_value = new_dims[i] + return m + + +def fix_output_shapes(model: onnx.ModelProto) -> onnx.ModelProto: + """ + Run ONNX shape inference so all intermediate and output value_info entries + carry concrete shapes (where inferrable). 
+ """ + return shape_inference.infer_shapes(model, check_type=True, + strict_mode=False, + data_prop=True) + + +def verify_no_dynamic_inputs(model: onnx.ModelProto, name: str) -> None: + """Warn if any graph input still has a dim_param after specialization.""" + for inp in model.graph.input: + t = inp.type.tensor_type + for dim in t.shape.dim: + if dim.dim_param: + print(f" [WARN] {name}: input '{inp.name}' still has " + f"dim_param='{dim.dim_param}'") + + +def patch_output_shapes(model: onnx.ModelProto, + output_shape_map: dict[str, list[int | str]]) -> onnx.ModelProto: + """ + Forcibly set output tensor shapes to concrete values where ONNX shape + inference could not propagate through ops like ScatterElements / Expand / Add. + *output_shape_map* maps output name → list of dim values (int = static, + str = leave dim_param unchanged). + """ + m = copy.deepcopy(model) + for out in m.graph.output: + if out.name not in output_shape_map: + continue + t = out.type.tensor_type + new_dims = output_shape_map[out.name] + for i, dim in enumerate(t.shape.dim): + v = new_dims[i] + if isinstance(v, int): + dim.ClearField("dim_param") + dim.dim_value = v + return m + + +def save(model: onnx.ModelProto, path: str) -> None: + onnx.save(model, path) + size_kb = os.path.getsize(path) / 1024 + print(f" → saved {os.path.basename(path)} ({size_kb:.0f} KB)") + + +# --------------------------------------------------------------------------- +# Model specifications +# --------------------------------------------------------------------------- + +BASE = os.path.dirname(os.path.abspath(__file__)) + +# ── GNN family ────────────────────────────────────────────────────────────── +# Inputs: node_features [n_nodes, 29] +# edge_features [n_edges, 5] +# edge_index [n_edges, 2] (int64) +# Output: edge_scores [n_edges, 1] +# +# Scale: 5 variants with n_edges ≈ 5 × n_nodes (realistic for tracking GNNs) +GNN_VARIANTS = [ + {"n_nodes": 100, "n_edges": 500}, + {"n_nodes": 300, "n_edges": 1500}, + 
{"n_nodes": 1000, "n_edges": 5000}, + {"n_nodes": 3000, "n_edges": 15000}, + {"n_nodes": 10000, "n_edges": 50000}, + # Large-input variants — cluster GPUs only (≥40 GB VRAM) + {"n_nodes": 30000, "n_edges": 150000}, + {"n_nodes": 100000, "n_edges": 500000}, +] + +GNN_MODELS = [ + "gnn_h32_k2.onnx", + "gnn_h64_k4.onnx", + "punet_h32_k2_heads4_layers2.onnx", + "punet_h64_k4_heads4_layers2.onnx", +] + + +def gnn_shape_map(n_nodes: int, n_edges: int) -> dict[str, list[int]]: + return { + "node_features": [n_nodes, 29], + "edge_features": [n_edges, 5], + "edge_index": [n_edges, 2], + } + + +def gnn_output_shape_map(n_nodes: int, n_edges: int) -> dict[str, list[int]]: + # edge_scores output: [n_edges, 1] — patch for models where shape inference + # cannot trace through ScatterElements/Expand back to the output. + return {"edge_scores": [n_edges, 1]} + + +def gnn_suffix(v: dict) -> str: + return f"n{v['n_nodes']}_e{v['n_edges']}" + + +# ── Transformer ────────────────────────────────────────────────────────────── +# Inputs: src [1, n_nodes, 3] float32 +# tgt [1, seq_length] int64 +# Output: logits [batch, seq_length, 132] +# +# Constraint: tgt_positional_encoding.pe is [1, 60, 32] +# → seq_length must be ≤ 60. +# Use the same value for n_nodes and seq_length (square attention pattern). +TRANSFORMER_VARIANTS = [ + {"n_nodes": 10, "seq_len": 10}, + {"n_nodes": 20, "seq_len": 20}, + {"n_nodes": 30, "seq_len": 30}, + {"n_nodes": 40, "seq_len": 40}, + {"n_nodes": 50, "seq_len": 50}, + # Maximum allowed by the positional-encoding table [1, 60, 32] + {"n_nodes": 60, "seq_len": 60}, +] + +TRANSFORMER_MODELS = ["transformer_d32_h2_L6_ff32.onnx"] + + +def transformer_shape_map(n_nodes: int, seq_len: int) -> dict[str, list[int]]: + return { + "src": [1, n_nodes, 3], + "tgt": [1, seq_len], + } + + +def transformer_output_shape_map(n_nodes: int, seq_len: int) -> dict[str, list[int]]: + # logits output: [batch=1, seq_length, 132]. 
The batch dim is produced by a + # bias Add op whose shape inference leaves it as 'Addlogits_dim_0'. + return {"logits": [1, seq_len, 132]} + + +def transformer_suffix(v: dict) -> str: + return f"n{v['n_nodes']}_s{v['seq_len']}" + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +def specialize_family(model_names: list[str], + variants: list[dict], + shape_map_fn, + suffix_fn, + output_shape_map_fn=None) -> None: + for mname in model_names: + src_path = os.path.join(BASE, mname) + if not os.path.exists(src_path): + print(f"[SKIP] {mname} — file not found") + continue + + stem = mname.removesuffix(".onnx") + # Load with external data so weights are embedded in the new file + base_model = onnx.load(src_path, load_external_data=True) + print(f"\nSpecializing {mname}:") + + for v in variants: + out_name = f"{stem}_{suffix_fn(v)}.onnx" + out_path = os.path.join(BASE, out_name) + + smap = shape_map_fn(**v) + m = set_static_inputs(base_model, smap) + try: + m = fix_output_shapes(m) + except Exception as e: + print(f" [WARN] shape inference failed for {out_name}: {e}") + + # Manually patch outputs that inference couldn't resolve + if output_shape_map_fn is not None: + omap = output_shape_map_fn(**v) + m = patch_output_shapes(m, omap) + + verify_no_dynamic_inputs(m, out_name) + save(m, out_path) + + +def main() -> None: + print("=" * 60) + print("SOFIE benchmark model specialization") + print("=" * 60) + + specialize_family(GNN_MODELS, GNN_VARIANTS, gnn_shape_map, gnn_suffix, + output_shape_map_fn=gnn_output_shape_map) + specialize_family(TRANSFORMER_MODELS, TRANSFORMER_VARIANTS, + transformer_shape_map, transformer_suffix, + output_shape_map_fn=transformer_output_shape_map) + + print("\nDone.") + + +if __name__ == "__main__": + main() diff --git a/benchmark/models/transformer_L1000.onnx.data b/benchmark/models/transformer_L1000.onnx.data new file mode 
100644 index 0000000..935826b Binary files /dev/null and b/benchmark/models/transformer_L1000.onnx.data differ diff --git a/benchmark/models/transformer_L1000_B1.onnx b/benchmark/models/transformer_L1000_B1.onnx new file mode 100644 index 0000000..c0728dd Binary files /dev/null and b/benchmark/models/transformer_L1000_B1.onnx differ diff --git a/benchmark/models/transformer_L1000_B100.onnx b/benchmark/models/transformer_L1000_B100.onnx new file mode 100644 index 0000000..669777d Binary files /dev/null and b/benchmark/models/transformer_L1000_B100.onnx differ diff --git a/benchmark/models/transformer_L1000_B100.onnx.data b/benchmark/models/transformer_L1000_B100.onnx.data new file mode 100644 index 0000000..cb59778 Binary files /dev/null and b/benchmark/models/transformer_L1000_B100.onnx.data differ diff --git a/benchmark/models/transformer_L100_B100.onnx b/benchmark/models/transformer_L100_B100.onnx new file mode 100644 index 0000000..1af481f Binary files /dev/null and b/benchmark/models/transformer_L100_B100.onnx differ diff --git a/benchmark/models/transformer_L100_B100.onnx.data b/benchmark/models/transformer_L100_B100.onnx.data new file mode 100644 index 0000000..08b324e Binary files /dev/null and b/benchmark/models/transformer_L100_B100.onnx.data differ diff --git a/benchmark/models/transformer_L8000_B1.onnx b/benchmark/models/transformer_L8000_B1.onnx new file mode 100644 index 0000000..fdd7d69 Binary files /dev/null and b/benchmark/models/transformer_L8000_B1.onnx differ diff --git a/benchmark/models/transformer_L8000_B1.onnx.data b/benchmark/models/transformer_L8000_B1.onnx.data new file mode 100644 index 0000000..91daf1d Binary files /dev/null and b/benchmark/models/transformer_L8000_B1.onnx.data differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32.onnx b/benchmark/models/transformer_d32_h2_L6_ff32.onnx new file mode 100644 index 0000000..be64f93 Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32.onnx differ diff --git 
a/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx new file mode 100644 index 0000000..2d5a39f Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx new file mode 100644 index 0000000..3cb3b3a Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx new file mode 100644 index 0000000..5e869ea Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx new file mode 100644 index 0000000..0dfe632 Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx new file mode 100644 index 0000000..05af836 Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx differ diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx new file mode 100644 index 0000000..3970cba Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx differ diff --git a/benchmark/src/BenchmarkBackend.hxx b/benchmark/src/BenchmarkBackend.hxx new file mode 100644 index 0000000..c12fde3 --- /dev/null +++ b/benchmark/src/BenchmarkBackend.hxx @@ -0,0 +1,49 @@ +#pragma once +// Backend type aliases and helpers — selected at compile time by CMake via +// -DSOFIE_BACKEND_CUDA / -DSOFIE_BACKEND_HIP. 
+// Every generated bench header and the runner use these so they stay free of +// hard-coded backend-specific APIs (cuda_runtime.h, hip_runtime.h, …). + +#include + +// ---- Per-backend runtime header and device-sync macro ---------------------- +#if defined(SOFIE_BACKEND_CUDA) +# include +# define SOFIE_BENCH_DEVICE_SYNC() cudaDeviceSynchronize() +#elif defined(SOFIE_BACKEND_HIP) +# include +# define SOFIE_BENCH_DEVICE_SYNC() hipDeviceSynchronize() +#else +# define SOFIE_BENCH_DEVICE_SYNC() do {} while (0) +#endif + +namespace sofie_bench { + +using Idx = std::size_t; +using Dim1 = alpaka::DimInt<1>; +using Ext1 = alpaka::Vec; + +#if defined(SOFIE_BACKEND_CUDA) + + using AccTag = alpaka::TagGpuCudaRt; + using Platform = alpaka::PlatformCudaRt; + using Device = alpaka::DevCudaRt; + using Queue = alpaka::Queue; + +#elif defined(SOFIE_BACKEND_HIP) + + using AccTag = alpaka::TagGpuHipRt; + using Platform = alpaka::PlatformHipRt; + using Device = alpaka::DevHipRt; + using Queue = alpaka::Queue; + +#else // CPU serial (default / fallback) + + using AccTag = alpaka::TagCpuSerial; + using Platform = alpaka::PlatformCpu; + using Device = alpaka::DevCpu; + using Queue = alpaka::Queue; + +#endif + +} // namespace sofie_bench diff --git a/benchmark/src/BenchmarkEmitter.cxx.in b/benchmark/src/BenchmarkEmitter.cxx.in new file mode 100644 index 0000000..f5d5435 --- /dev/null +++ b/benchmark/src/BenchmarkEmitter.cxx.in @@ -0,0 +1,309 @@ +// SOFIE Benchmark Emitter +// Auto-configured by CMake — do not edit directly. 
+// For each .onnx model in benchmark/models/ this binary generates: +// _GPU_ALPAKA.hxx — SOFIE inference code +// _GPU_ALPAKA.dat — serialized weights +// _bench.hxx — timing function, following the same +// pattern as the unit tests + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +#include +#include +#include +#include +#include +#include + +using namespace SOFIE; + +static size_t resolveDim(const Dim &d) { + return (d.dim > 0) ? static_cast(d.dim) : 1u; +} + +static int EmitBenchmarkModel(const std::string &onnxPath, + const std::string &modelName, + const std::string &outDir) +{ + std::cout << "[Benchmark] Processing: " << onnxPath << "\n"; + + RModelParser_ONNX parser; + RModel model = parser.Parse(onnxPath); + + const auto &inputNames = model.GetInputTensorNames(); + if (inputNames.empty()) { + std::cerr << "[WARN] " << modelName << " has no inputs – skipping.\n"; + return 1; + } + + // Map SOFIE tensor type to C++ type string + auto tensorTypeToCpp = [](ETensorType t) -> std::string { + switch (t) { + case ETensorType::FLOAT: return "float"; + case ETensorType::DOUBLE: return "double"; + case ETensorType::INT32: return "int32_t"; + case ETensorType::INT64: return "int64_t"; + case ETensorType::UINT8: return "uint8_t"; + case ETensorType::INT8: return "int8_t"; + case ETensorType::UINT16: return "uint16_t"; + case ETensorType::INT16: return "int16_t"; + case ETensorType::UINT32: return "uint32_t"; + case ETensorType::UINT64: return "uint64_t"; + case ETensorType::BOOL: return "uint8_t"; + default: return "float"; + } + }; + + // Collect input metadata before code generation + struct TensorMeta { + std::string cppType; + size_t numElements; + }; + std::vector inputs; + for (const auto &n : inputNames) { + TensorMeta m; + try { m.cppType = tensorTypeToCpp(model.GetTensorType(n)); } + catch (...) 
{ m.cppType = "float"; } + m.numElements = 1; + try { + for (const auto &d : model.GetDimTensorShape(n)) + m.numElements *= resolveDim(d); + } catch (...) {} + inputs.push_back(m); + } + + // Generate SOFIE GPU/Alpaka inference code first — this calls Initialize() + // which runs shape inference on all operators. Output tensor shapes are only + // available *after* Initialize(), so we must call GenerateGPU_ALPAKA() before + // querying output metadata. +#ifdef SOFIE_BENCHMARK_PROFILE + model.GenerateGPU_ALPAKA(Options::kProfile); +#else + model.GenerateGPU_ALPAKA(); +#endif + + // Collect output metadata AFTER code generation so shapes are fully propagated. + const auto &outputNames = model.GetOutputTensorNames(); + std::vector outputs; + for (const auto &n : outputNames) { + TensorMeta m; + try { m.cppType = tensorTypeToCpp(model.GetTensorType(n)); } + catch (...) { m.cppType = "float"; } + m.numElements = 1; + try { + for (const auto &d : model.GetDimTensorShape(n)) + m.numElements *= resolveDim(d); + } catch (...) 
{} + outputs.push_back(m); + } + + std::string hxxPath = outDir + "/" + modelName + "_GPU_ALPAKA.hxx"; + std::string benchPath = outDir + "/" + modelName + "_bench.hxx"; + + model.OutputGenerated(hxxPath); + + // Sanitize model name into a valid C++ identifier + std::string cppName = modelName; + for (char &c : cppName) + if (!std::isalnum(static_cast(c))) c = '_'; + + // Build "session.infer(input_d_0, input_d_1, ...)" + std::ostringstream inferCall; + inferCall << "session.infer("; + for (size_t i = 0; i < inputs.size(); ++i) { + if (i) inferCall << ", "; + inferCall << "input_d_" << i; + } + inferCall << ")"; + + std::ofstream bench(benchPath); + if (!bench.is_open()) { + std::cerr << "[ERROR] Cannot open " << benchPath << "\n"; + return 1; + } + + bench + << "// Auto-generated benchmark for model: " << modelName << "\n" + << "// DO NOT EDIT — regenerated by the SOFIE benchmark emitter.\n" + << "// Backend is selected at compile time via -DSOFIE_BACKEND_CUDA / -DSOFIE_BACKEND_HIP\n" + << "// (see BenchmarkBackend.hxx for the sofie_bench:: type aliases).\n" + << "#pragma once\n\n" + << "#include \"" << modelName << "_GPU_ALPAKA.hxx\"\n" + << "#include \"BenchmarkBackend.hxx\"\n" + << "#include \n" + << "#include \n" + << "#include \n" + << "#include \n\n" + // Not inline: each model is compiled in its own _bench.cu TU and + // called from the main TU via a forward declaration. Using inline + // here would require all 32 models to share one translation unit + // which OOM-kills the CUDA compiler. 
+ << "void Benchmark_" << cppName + << "(int warmup, int iterations, const std::string& weightsDir) {\n" + << " using namespace sofie_bench;\n\n" + << " // ---- Device/host setup (mirrors unit-test pattern) ----\n" + << " alpaka::PlatformCpu hostPlatform{};\n" + << " auto host = alpaka::getDevByIdx(hostPlatform, 0u);\n" + << " Platform platform{};\n" + << " auto device = alpaka::getDevByIdx(platform, 0u);\n" + << " Queue queue{device};\n\n" + << " std::mt19937 rng(42);\n" + << " std::uniform_real_distribution fdist(-1.0f, 1.0f);\n\n"; + + + for (size_t i = 0; i < inputs.size(); ++i) { + const std::string &T = inputs[i].cppType; + const size_t N = inputs[i].numElements; + bench + << " // Input " << i << ": " << T << "[" << N << "]\n" + << " auto input_h_" << i << " = alpaka::allocBuf<" << T + << ", Idx>(host, Ext1::all(Idx{" << N << "}));\n" + << " {\n" + << " auto *p = reinterpret_cast<" << T + << "*>(alpaka::getPtrNative(input_h_" << i << "));\n"; + if (T == "float" || T == "double") { + bench + << " for (size_t k = 0; k < " << N + << "; ++k) p[k] = static_cast<" << T << ">(fdist(rng));\n"; + } else { + bench + << " std::fill(p, p + " << N + << ", static_cast<" << T << ">(0));\n"; + } + bench + << " }\n" + << " auto input_d_" << i << " = alpaka::allocBuf<" << T + << ", Idx>(device, Ext1::all(Idx{" << N << "}));\n" + << " alpaka::memcpy(queue, input_d_" << i + << ", input_h_" << i << ");\n\n"; + } + bench << " alpaka::wait(queue);\n\n"; + + bench + << " // ---- Create session (loads weights) ----\n" + << " std::string weightFile = weightsDir + \"/" + << modelName << "_GPU_ALPAKA.dat\";\n" + << " SOFIE_" << cppName + << "::Session session(weightFile);\n\n"; + + + bench << " // ---- One priming inference to obtain output buffer handle(s) ----\n" + << " auto out_d = " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n\n"; + + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string &T = outputs[i].cppType; + const size_t N = 
outputs[i].numElements; + bench + << " // Output " << i << ": " << T << "[" << N << "]\n" + << " auto out_h_" << i << " = alpaka::allocBuf<" << T + << ", Idx>(host, Ext1::all(Idx{" << N << "}));\n" + } + bench << "\n"; + + auto outBufExpr = [&](size_t i) -> std::string { + if (outputs.size() == 1) return "out_d"; + return "out_d[" + std::to_string(i) + "]"; + }; + + bench << " // ---- Warmup (not timed) ----\n" + << " for (int w = 0; w < warmup; ++w) {\n"; + // H2D: inputs host → device + for (size_t i = 0; i < inputs.size(); ++i) + bench << " alpaka::memcpy(queue, input_d_" << i + << ", input_h_" << i << ");\n"; + bench << " alpaka::wait(queue);\n"; + // Inference + bench << " " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n"; + // D2H: outputs device → host + for (size_t i = 0; i < outputs.size(); ++i) + bench << " alpaka::memcpy(queue, out_h_" << i + << ", " << outBufExpr(i) << ");\n"; + bench << " alpaka::wait(queue);\n" + << " }\n" + << " SOFIE_BENCH_DEVICE_SYNC();\n\n"; + + // ---- Profiling path (mutually exclusive with benchmark path) ---- + bench << "#ifdef SOFIE_BENCHMARK_PROFILE\n" + << " // ---- Profiling: reset warmup data, run iterations, print results ----\n" + << " session.ResetProfilingResults();\n" + << " for (int _i = 0; _i < iterations; ++_i)\n" + << " " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n" + << " std::printf(\"%s\\n\", std::string(60, '-').c_str());\n" + << " std::printf(\"Model: " << modelName << "\\n\");\n" + << " session.PrintProfilingResults();\n" + << " session.PrintMemoryInfo();\n" + << "#else\n"; + + // ---- Benchmark path ---- + bench + << " // ---- Timed input transfer (H2D: host -> device) ----\n" + << " auto t0_in = std::chrono::high_resolution_clock::now();\n" + << " for (int _i = 0; _i < iterations; ++_i) {\n"; + for (size_t i = 0; i < inputs.size(); ++i) + bench << " alpaka::memcpy(queue, input_d_" << i + << ", input_h_" << i << ");\n"; + bench << " alpaka::wait(queue);\n" + << " 
}\n" + << " SOFIE_BENCH_DEVICE_SYNC();\n" + << " auto t1_in = std::chrono::high_resolution_clock::now();\n\n"; + + bench + << " // ---- Timed inference ----\n" + << " auto t0_infer = std::chrono::high_resolution_clock::now();\n" + << " for (int _i = 0; _i < iterations; ++_i)\n" + << " " << inferCall.str() << ";\n" + << " alpaka::wait(session.queue);\n" + << " SOFIE_BENCH_DEVICE_SYNC();\n" + << " auto t1_infer = std::chrono::high_resolution_clock::now();\n\n"; + + bench + << " // ---- Timed output transfer (D2H: device -> host) ----\n" + << " auto t0_out = std::chrono::high_resolution_clock::now();\n" + << " for (int _i = 0; _i < iterations; ++_i) {\n"; + for (size_t i = 0; i < outputs.size(); ++i) + bench << " alpaka::memcpy(queue, out_h_" << i + << ", " << outBufExpr(i) << ");\n"; + bench << " alpaka::wait(queue);\n" + << " }\n" + << " SOFIE_BENCH_DEVICE_SYNC();\n" + << " auto t1_out = std::chrono::high_resolution_clock::now();\n\n"; + + bench + << " double avg_infer_ms = std::chrono::duration" + "(t1_infer - t0_infer).count() / iterations;\n" + << " double avg_in_ms = std::chrono::duration" + "(t1_in - t0_in ).count() / iterations;\n" + << " double avg_out_ms = std::chrono::duration" + "(t1_out - t0_out ).count() / iterations;\n" + << " double throughput = (avg_infer_ms > 0.0) ? 1000.0 / avg_infer_ms : 0.0;\n\n" + << " std::printf(\"%-40s %12.4f %14.4f %15.4f %16.1f\\n\",\n" + << " \"" << modelName << "\",\n" + << " avg_infer_ms, avg_in_ms, avg_out_ms, throughput);\n" + << "#endif // SOFIE_BENCHMARK_PROFILE\n" + << "}\n"; + + bench.close(); + + std::cout << "[Benchmark] Wrote: " << hxxPath << "\n" + << " Wrote: " << benchPath << "\n"; + return 0; +} + +int main(int argc, char *argv[]) { + if (argc < 2) { + std::cerr << "Usage: sofie_benchmark_emitter \n"; + return 1; + } + std::string outDir = argv[1]; + int failures = 0; + +@BENCHMARK_EMIT_CAPTURES@ + + std::cout << "[Benchmark Emitter] Done — " << failures << " failure(s).\n"; + return failures == 0 ? 
0 : 1; +} diff --git a/benchmark/src/BenchmarkRunner.cxx.in b/benchmark/src/BenchmarkRunner.cxx.in new file mode 100644 index 0000000..1848680 --- /dev/null +++ b/benchmark/src/BenchmarkRunner.cxx.in @@ -0,0 +1,140 @@ +// SOFIE Alpaka Benchmark Runner — main TU +// Auto-configured by CMake — do not edit directly. +// +// Execution model: +// Normal mode — iterates over all models, spawning ONE subprocess per model +// so each model gets a fresh CUDA context and all GPU memory +// is freed when the subprocess exits. This avoids the +// cudaErrorMemoryAllocation that occurs when a large model's +// CUDA allocator cache is not returned to the OS between runs. +// Single-model mode (--single-model ) — called BY the parent; runs +// exactly one model then exits. + +#include +#include +#include +#include +#include +#include + +// Optional ONNX Runtime GPU comparison +#ifdef SOFIE_BENCHMARK_ORT +#include "ONNXRuntimeBenchmark.hxx" +#endif + +// Forward-declare one per-model benchmark function (defined in _bench.cu) +@BENCHMARK_FWD_DECLS@ + +// --------------------------------------------------------------------------- +// Dispatch table: called when --single-model is given. 
+// --------------------------------------------------------------------------- +static int run_single_model(const std::string& model, + int warmup, int iterations, + const std::string& weightsDir, + bool run_ort) +{ +@BENCHMARK_SINGLE_MODEL_CASES@ + std::fprintf(stderr, "Unknown model: %s\n", model.c_str()); + return 1; +} + +int main(int argc, char *argv[]) { + int warmup = 10; + int iterations = 100; + std::string weightsDir = "."; + bool run_ort = false; + std::string singleModel; + + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + if ((a == "--warmup" || a == "-w") && i + 1 < argc) warmup = std::stoi(argv[++i]); + else if ((a == "--iterations" || a == "-n") && i + 1 < argc) iterations = std::stoi(argv[++i]); + else if (a == "--weights-dir" && i + 1 < argc) weightsDir = argv[++i]; + else if (a == "--single-model" && i + 1 < argc) singleModel = argv[++i]; + else if (a == "--onnxruntime" || a == "--ort") run_ort = true; + else if (a == "--help" || a == "-h") { +#ifdef SOFIE_BENCHMARK_PROFILE + std::cout << + "Usage: sofie_benchmark [options] [PROFILING MODE]\n" + " Per-operator GPU timing (alpaka::wait per op) + memory breakdown.\n" + " Throughput benchmarking is disabled in this build.\n" + " Rebuild without -DSOFIE_BENCHMARK_PROFILE=ON for throughput numbers.\n\n" + " --warmup, -w Warmup iterations (default: 10)\n" + " --iterations, -n Profiling runs (default: 100)\n" + " --weights-dir SOFIE .dat files (default: .)\n" + " --single-model Run one model (internal)\n" + ; +#else + std::cout << + "Usage: sofie_benchmark [options]\n" + " --warmup, -w Warmup iterations (default: 10)\n" + " --iterations, -n Timed iterations (default: 100)\n" + " --weights-dir SOFIE .dat weight files (default: .)\n" +#ifdef SOFIE_BENCHMARK_ORT + " --onnxruntime, --ort Also run ONNX Runtime GPU comparison\n" +#else + " --onnxruntime, --ort (not available; rebuild with -DSOFIE_BENCHMARK_ORT=ON)\n" +#endif + " Rebuild with -DSOFIE_BENCHMARK_PROFILE=ON for per-operator 
profiling.\n" + " --single-model Run exactly one model by C++ name (internal)\n" + ; +#endif + return 0; + } + } + + // ----------------------------------------------------------------------- + // Single-model mode: called as a subprocess, run one model and exit. + // All GPU memory is freed when this process exits. + // ----------------------------------------------------------------------- + if (!singleModel.empty()) + return run_single_model(singleModel, warmup, iterations, weightsDir, run_ort); + + // ----------------------------------------------------------------------- + // Orchestrator mode: spawn one child process per model so every model + // starts with a fresh CUDA context (avoids cudaErrorMemoryAllocation + // caused by the CUDA allocator retaining freed pages between models). + // ----------------------------------------------------------------------- +#ifndef SOFIE_BENCHMARK_ORT + if (run_ort) { + std::fprintf(stderr, + "Warning: --onnxruntime requested but this binary was built without " + "ORT support. Rebuild with -DSOFIE_BENCHMARK_ORT=ON.\n"); + run_ort = false; + } +#endif + +#ifdef SOFIE_BENCHMARK_PROFILE + std::printf("=== SOFIE Alpaka Profiler ===\n"); + std::printf("Backend: @SOFIE_BENCHMARK_BACKEND@ | Warmup: %d | Iterations: %d\n\n", + warmup, iterations); +#else + std::printf("=== SOFIE Alpaka Benchmark ===\n"); + std::printf("Backend: @SOFIE_BENCHMARK_BACKEND@ | Warmup: %d | Iterations: %d", warmup, iterations); +#ifdef SOFIE_BENCHMARK_ORT + if (run_ort) std::printf(" | ORT-GPU: ON"); +#endif + std::printf("\n\n"); + + std::printf("%-40s %12s %14s %15s %16s\n", + "Model", "infer(ms)", "in_xfer(ms)", "out_xfer(ms)", "Throughput(inf/s)"); + std::printf("%s\n", std::string(103, '-').c_str()); +#endif // SOFIE_BENCHMARK_PROFILE + std::fflush(stdout); + + // Build the common argument suffix forwarded to every subprocess. 
+ std::ostringstream common; + common << " -w " << warmup + << " -n " << iterations + << " --weights-dir \"" << weightsDir << "\""; + if (run_ort) common << " --ort"; + std::string commonArgs = common.str(); + + int totalFailed = 0; +@BENCHMARK_SPAWN_CALLS@ + + if (totalFailed > 0) + std::fprintf(stderr, "\n%d model(s) failed.\n", totalFailed); + + return (totalFailed > 0) ? 1 : 0; +} diff --git a/benchmark/src/BenchmarkUtils.hxx b/benchmark/src/BenchmarkUtils.hxx new file mode 100644 index 0000000..9dda93a --- /dev/null +++ b/benchmark/src/BenchmarkUtils.hxx @@ -0,0 +1,167 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sofie_bench { + +struct BenchmarkConfig { + int warmupIter = 10; + int benchIter = 100; + int deviceId = 0; + float tolerance = 1e-3f; + bool validateOrt = false; + std::string weightsDir = "."; + bool csvOutput = false; + bool verbose = false; +}; + +struct BenchmarkResult { + std::string modelName; + size_t inputElements = 0; + size_t outputElements = 0; + float avgInferMs = 0.0f; // per-inference average (chrono) + float throughput = 0.0f; // inferences / second + float weightMemMB = 0.0f; // device memory for model weights + float runtimeMemMB = 0.0f; // device memory for intermediates + bool ortRan = false; + bool ortMatch = false; + float ortMaxDiff = -1.0f; + bool skipped = false; + std::string skipReason; +}; + +inline BenchmarkConfig ParseArgs(int argc, char *argv[]) { + BenchmarkConfig cfg; + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if ((arg == "--warmup" || arg == "-w") && i + 1 < argc) + cfg.warmupIter = std::stoi(argv[++i]); + else if ((arg == "--iterations" || arg == "-n") && i + 1 < argc) + cfg.benchIter = std::stoi(argv[++i]); + else if ((arg == "--device" || arg == "-d") && i + 1 < argc) + cfg.deviceId = std::stoi(argv[++i]); + else if ((arg == "--tolerance" || arg == "-t") && i + 1 < argc) + cfg.tolerance = std::stof(argv[++i]); + else 
if (arg == "--validate-ort") + cfg.validateOrt = true; + else if ((arg == "--weights-dir") && i + 1 < argc) + cfg.weightsDir = argv[++i]; + else if (arg == "--csv") + cfg.csvOutput = true; + else if (arg == "--verbose" || arg == "-v") + cfg.verbose = true; + else if (arg == "--help" || arg == "-h") { + std::cout << "SOFIE Alpaka Benchmark\n\n" + << "Options:\n" + << " --warmup, -w Warmup iterations (default: 10)\n" + << " --iterations, -n Benchmark iterations (default: 100)\n" + << " --device, -d Device index (default: 0)\n" + << " --tolerance, -t ONNXRuntime diff tolerance (default: 1e-3)\n" + << " --validate-ort Compare SOFIE outputs to ONNXRuntime\n" + << " --weights-dir Directory containing .dat weight files (default: .)\n" + << " --csv Print results in CSV format\n" + << " --verbose, -v Verbose output\n"; + std::exit(0); + } + } + return cfg; +} + +inline void PrintDeviceInfo(const std::string &deviceName) { + std::cout << "Device: " << deviceName << "\n"; +} + +inline void PrintHeader(const BenchmarkConfig &cfg, const std::string &deviceName = "") { + std::cout << "\n=== SOFIE Alpaka Benchmark ===\n"; + if (!deviceName.empty()) + PrintDeviceInfo(deviceName); + std::cout << "Warmup: " << cfg.warmupIter + << " | Iterations: " << cfg.benchIter; + if (cfg.validateOrt) + std::cout << " | ONNXRuntime validation ON (tol=" << cfg.tolerance << ")"; + std::cout << "\n\n"; + + if (cfg.csvOutput) { + std::cout << "Model,InputElems,OutputElems,AvgInferMs,Throughput(inf/s)," + "WeightMem(MB),RuntimeMem(MB),OrtMatch,OrtMaxDiff\n"; + } else { + std::cout << std::left + << std::setw(30) << "Model" + << std::setw(12) << "Input" + << std::setw(12) << "Output" + << std::setw(14) << "Avg(ms)" + << std::setw(16) << "Throughput(i/s)" + << std::setw(12) << "ORT Check" + << "\n"; + std::cout << std::string(96, '-') << "\n"; + } +} + +inline void PrintResult(const BenchmarkResult &r, const BenchmarkConfig &cfg) { + if (r.skipped) { + if (!cfg.csvOutput) + std::cout << std::left << 
std::setw(30) << r.modelName + << " [SKIPPED: " << r.skipReason << "]\n"; + return; + } + + if (cfg.csvOutput) { + std::cout << r.modelName << "," + << r.inputElements << "," + << r.outputElements << "," + << std::fixed << std::setprecision(4) << r.avgInferMs << "," + << std::fixed << std::setprecision(1) << r.throughput << "," + << std::fixed << std::setprecision(2) << r.weightMemMB << "," + << std::fixed << std::setprecision(2) << r.runtimeMemMB << ","; + if (r.ortRan) + std::cout << (r.ortMatch ? "PASS" : "FAIL") << "," << r.ortMaxDiff; + else + std::cout << "N/A,N/A"; + std::cout << "\n"; + } else { + std::string ortStr = "N/A"; + if (r.ortRan) { + std::ostringstream oss; + oss << (r.ortMatch ? "PASS" : "FAIL") + << "(d=" << std::scientific << std::setprecision(1) << r.ortMaxDiff << ")"; + ortStr = oss.str(); + } + std::cout << std::left + << std::setw(30) << r.modelName + << std::setw(12) << r.inputElements + << std::setw(12) << r.outputElements + << std::setw(14) << std::fixed << std::setprecision(4) << r.avgInferMs + << std::setw(16) << std::fixed << std::setprecision(1) << r.throughput + << std::setw(12) << ortStr + << "\n"; + } +} + +inline void PrintSummary(const std::vector &results, const BenchmarkConfig &cfg) { + if (cfg.csvOutput) return; + + std::cout << "\n" << std::string(96, '=') << "\n"; + int ran = 0, skipped = 0, ortFail = 0; + float totalMs = 0.0f; + for (const auto &r : results) { + if (r.skipped) { ++skipped; continue; } + ++ran; + totalMs += r.avgInferMs; + if (r.ortRan && !r.ortMatch) ++ortFail; + } + std::cout << "Summary: " << ran << " model(s) benchmarked"; + if (skipped) std::cout << ", " << skipped << " skipped"; + if (ran > 0) std::cout << ", avg inference " << std::fixed << std::setprecision(4) << (totalMs / ran) << " ms"; + if (ortFail) std::cout << ", " << ortFail << " ORT mismatch(es)"; + std::cout << "\n"; +} + +} // namespace sofie_bench diff --git a/benchmark/src/ModelBench.cu.in b/benchmark/src/ModelBench.cu.in new file mode 
100644 index 0000000..4717407 --- /dev/null +++ b/benchmark/src/ModelBench.cu.in @@ -0,0 +1,5 @@ +// Per-model compilation unit for: @MODEL_NAME@ +// Each model is compiled in isolation so the GPU template instantiation +// for one model's Session does not share a translation unit with +// every other model's (which would OOM the CUDA compiler on the all-in-one TU). +#include "@MODEL_NAME@_bench.hxx" diff --git a/benchmark/src/ONNXRuntimeBenchmark.hxx b/benchmark/src/ONNXRuntimeBenchmark.hxx new file mode 100644 index 0000000..9d3ddd3 --- /dev/null +++ b/benchmark/src/ONNXRuntimeBenchmark.hxx @@ -0,0 +1,210 @@ +// SOFIE Benchmark — ONNX Runtime GPU backend +// Generic benchmark: loads any ONNX model, introspects shapes, runs with the +// CUDA ExecutionProvider. Float inputs are filled with uniform random values; +// integer inputs are zeroed (safe for index tensors like edge_index). +// +// Data stays on the HOST side of the ORT API (ORT handles H↔D transfers +// internally) — this measures end-to-end latency from the application's +// perspective. Use the optional IOBinding path (--ort-device-io, WIP) to +// measure pure GPU compute time comparable to the SOFIE numbers. +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace sofie_ort_bench_detail { + +/// Total element count from a shape vector (-1 dynamic dims are treated as 1). +inline std::size_t shapeToSize(const std::vector& shape) { + std::size_t n = 1; + for (auto d : shape) n *= (d > 0 ? 
static_cast(d) : 1u); + return n; +} + +inline const char* ortTypeName(ONNXTensorElementDataType t) { + switch (t) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: return "float32"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: return "float64"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: return "int32"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: return "int64"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: return "uint8"; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: return "bool"; + default: return "other"; + } +} + +} // namespace sofie_ort_bench_detail + +/// Run @p model_path through ONNX Runtime's CUDAExecutionProvider. +/// Results are printed in the same table format as the SOFIE Alpaka benchmark. +/// +/// @param model_path Full path to the .onnx file. +/// @param model_name Display name shown in the table (typically the stem). +/// @param warmup Number of warm-up iterations (not timed). +/// @param iterations Number of timed iterations. +/// @param device_id CUDA device index (default 0). +/// @param verbose If true, print per-input shape/type information. 
+inline void BenchmarkORT_GPU(const std::string& model_path, + const std::string& model_name, + int warmup, + int iterations, + int device_id = 0, + bool verbose = false) +{ + using namespace sofie_ort_bench_detail; + + Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "sofie_ort_bench"); + + Ort::SessionOptions opts; + opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + opts.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + OrtCUDAProviderOptions cuda_opts{}; + cuda_opts.device_id = device_id; + cuda_opts.arena_extend_strategy = 0; // kNextPowerOfTwo + cuda_opts.gpu_mem_limit = SIZE_MAX; + cuda_opts.cudnn_conv_algo_search = OrtCudnnConvAlgoSearchExhaustive; + cuda_opts.do_copy_in_default_stream = 1; + opts.AppendExecutionProvider_CUDA(cuda_opts); + + Ort::Session session(env, model_path.c_str(), opts); + Ort::AllocatorWithDefaultOptions alloc; + Ort::MemoryInfo mem_cpu = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + const std::size_t num_inputs = session.GetInputCount(); + + std::vector input_names_str(num_inputs); + std::vector input_names_ptr(num_inputs); + std::vector> input_shapes(num_inputs); + std::vector input_types(num_inputs); + + std::vector> float_data(num_inputs); + std::vector> double_data(num_inputs); + std::vector> int64_data(num_inputs); + std::vector> int32_data(num_inputs); + std::vector> uint8_data(num_inputs); + + std::mt19937 rng(42); + std::uniform_real_distribution fdist(-1.f, 1.f); + + std::vector input_tensors; + input_tensors.reserve(num_inputs); + + for (std::size_t i = 0; i < num_inputs; ++i) { + auto name_ptr = session.GetInputNameAllocated(i, alloc); + input_names_str[i] = name_ptr.get(); + input_names_ptr[i] = input_names_str[i].c_str(); + + auto info = session.GetInputTypeInfo(i); + auto tinfo = info.GetTensorTypeAndShapeInfo(); + input_types[i] = tinfo.GetElementType(); + input_shapes[i] = tinfo.GetShape(); + + for (auto& d : input_shapes[i]) if (d < 0) d = 1; + + std::size_t n = 
shapeToSize(input_shapes[i]); + + if (verbose) { + std::printf(" Input %-2zu %-20s type=%-8s numel=%zu\n", + i, input_names_str[i].c_str(), + ortTypeName(input_types[i]), n); + } + + switch (input_types[i]) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { + float_data[i].resize(n); + for (auto& v : float_data[i]) v = fdist(rng); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, float_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: { + double_data[i].resize(n, 0.0); + for (auto& v : double_data[i]) + v = static_cast(fdist(rng)); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, double_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + int64_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, int64_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + int32_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, int32_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: { + uint8_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, uint8_data[i].data(), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: { + uint8_data[i].assign(n, 0); + input_tensors.push_back(Ort::Value::CreateTensor( + mem_cpu, + reinterpret_cast(uint8_data[i].data()), n, + input_shapes[i].data(), input_shapes[i].size())); + break; + } + default: + throw std::runtime_error( + std::string("BenchmarkORT_GPU: unsupported input type for ") + + input_names_str[i]); + } + } + + const std::size_t num_outputs = session.GetOutputCount(); + std::vector output_names_str(num_outputs); + std::vector 
output_names_ptr(num_outputs); + for (std::size_t i = 0; i < num_outputs; ++i) { + auto ptr = session.GetOutputNameAllocated(i, alloc); + output_names_str[i] = ptr.get(); + output_names_ptr[i] = output_names_str[i].c_str(); + } + + Ort::RunOptions run_opts; + + for (int w = 0; w < warmup; ++w) { + session.Run(run_opts, + input_names_ptr.data(), input_tensors.data(), num_inputs, + output_names_ptr.data(), num_outputs); + } + cudaDeviceSynchronize(); + + auto t0 = std::chrono::high_resolution_clock::now(); + for (int it = 0; it < iterations; ++it) { + session.Run(run_opts, + input_names_ptr.data(), input_tensors.data(), num_inputs, + output_names_ptr.data(), num_outputs); + } + cudaDeviceSynchronize(); + auto t1 = std::chrono::high_resolution_clock::now(); + + double avg_ms = std::chrono::duration(t1 - t0).count() + / iterations; + double throughput = (avg_ms > 0.0) ? 1000.0 / avg_ms : 0.0; + + std::string label = std::string(model_name) + " [ORT-GPU]"; + std::printf("%-30s avg %8.4f ms (%8.1f inf/s)\n", + label.c_str(), avg_ms, throughput); +} diff --git a/check_style.sh b/check_style.sh new file mode 100644 index 0000000..22a56e4 --- /dev/null +++ b/check_style.sh @@ -0,0 +1,37 @@ +#!/bin/bash +set -e + +# Directories +SRC_DIR="./include" +TEST_DIR="./tests" + +echo "📝 Discovering source/header files..." + +FILES=$(find "$SRC_DIR" "$TEST_DIR" \ + -path "$TEST_DIR/build" -prune -o \ + -type f \( \ + -name '*.cpp' -o -name '*.cc' -o -name '*.cxx' -o \ + -name '*.h' -o -name '*.hpp' -o -name '*.hxx' -o -name '*.hh' \ + \) -print) + +if [ -z "$FILES" ]; then + echo "⚠️ No files found to process." + exit 0 +fi + +echo "🎯 Files to check:" +echo "$FILES" + +echo "🎨 Running clang-format..." +for file in $FILES; do + echo "Formatting $file" + clang-format -i "$file" +done + +echo "🔍 Running clang-tidy..." +for file in $FILES; do + echo "Linting $file" + clang-tidy "$file" --extra-arg=-std=c++20 -- -I"$SRC_DIR" || true +done + +echo "✅ Formatting and linting complete." 
diff --git a/cmake/SOFIEConfig.cmake.in b/cmake/SOFIEConfig.cmake.in
new file mode 100644
index 0000000..94ebc4a
--- /dev/null
+++ b/cmake/SOFIEConfig.cmake.in
@@ -0,0 +1,13 @@
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+
+find_dependency(Protobuf)
+
+if(@SOFIE_WITH_ROOT@)
+  find_dependency(ROOT COMPONENTS Core TMVA Tree)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/SOFIETargets.cmake")
+
+check_required_components(SOFIE)
diff --git a/cmake/modules/SofieTestMacros.cmake b/cmake/modules/SofieTestMacros.cmake
new file mode 100644
index 0000000..1f4d235
--- /dev/null
+++ b/cmake/modules/SofieTestMacros.cmake
@@ -0,0 +1,99 @@
+# Fallback test macros used when ROOT is not available.
+# These provide the same interface as ROOTTEST_GENERATE_EXECUTABLE and
+# ROOTTEST_ADD_TEST from RoottestMacros.cmake but without requiring ROOT.
+
+# ROOTTEST_GENERATE_EXECUTABLE(<executable> <sources>...
+#     [LIBRARIES ...] [COMPILE_FLAGS ...] [DEPENDS ...] [RESOURCE_LOCK <name>]
+#     [FIXTURES_SETUP ...] [FIXTURES_CLEANUP ...] [FIXTURES_REQUIRED ...])
+# Declares <executable> (excluded from the default build) from the unparsed
+# arguments and registers a CTest test "<executable>-build" that builds it, so
+# other tests can depend on the build through fixtures.
+macro(ROOTTEST_GENERATE_EXECUTABLE executable)
+  cmake_parse_arguments(ARG "" "RESOURCE_LOCK"
+    "LIBRARIES;COMPILE_FLAGS;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED"
+    ${ARGN})
+
+  add_executable(${executable} EXCLUDE_FROM_ALL ${ARG_UNPARSED_ARGUMENTS})
+  set_target_properties(${executable} PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+  if(ARG_DEPENDS)
+    add_dependencies(${executable} ${ARG_DEPENDS})
+  endif()
+
+  if(ARG_LIBRARIES)
+    target_link_libraries(${executable} ${ARG_LIBRARIES})
+  endif()
+
+  if(ARG_COMPILE_FLAGS)
+    # COMPILE_FLAGS holds one space-separated string; join the parsed list so
+    # several flags do not get misread as extra property/value pairs.
+    string(REPLACE ";" " " _sofie_compile_flags "${ARG_COMPILE_FLAGS}")
+    set_target_properties(${executable} PROPERTIES COMPILE_FLAGS "${_sofie_compile_flags}")
+  endif()
+
+  set(_sofie_build_test ${executable}-build)
+  add_test(NAME ${_sofie_build_test}
+    COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target ${executable})
+
+  if(ARG_FIXTURES_SETUP)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP})
+  endif()
+  if(ARG_FIXTURES_CLEANUP)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP})
+  endif()
+  if(ARG_FIXTURES_REQUIRED)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED})
+  endif()
+  if(ARG_RESOURCE_LOCK)
+    set_property(TEST ${_sofie_build_test} PROPERTY RESOURCE_LOCK ${ARG_RESOURCE_LOCK})
+  endif()
+endmacro()
+
+# ROOTTEST_ADD_TEST(<testname> EXEC|COMMAND <cmd>... [WORKING_DIR <dir>]
+#     [TIMEOUT <sec>] [RESOURCE_LOCK <name>] [ENVIRONMENT ...] [FIXTURES_* ...])
+# Registers a CTest test. DEPENDS and PROPERTIES are accepted for interface
+# compatibility but are not applied by this fallback.
+function(ROOTTEST_ADD_TEST testname)
+  cmake_parse_arguments(ARG ""
+    "WORKING_DIR;TIMEOUT;RESOURCE_LOCK"
+    "EXEC;COMMAND;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED;ENVIRONMENT;PROPERTIES"
+    ${ARGN})
+
+  if(ARG_EXEC)
+    set(_cmd ${ARG_EXEC})
+  elseif(ARG_COMMAND)
+    set(_cmd ${ARG_COMMAND})
+  else()
+    message(FATAL_ERROR "ROOTTEST_ADD_TEST: must specify EXEC or COMMAND")
+  endif()
+
+  # Honour WORKING_DIR when given; default to the current binary directory.
+  if(ARG_WORKING_DIR)
+    set(_workdir "${ARG_WORKING_DIR}")
+  else()
+    set(_workdir "${CMAKE_CURRENT_BINARY_DIR}")
+  endif()
+
+  add_test(NAME ${testname} COMMAND ${_cmd}
+    WORKING_DIRECTORY "${_workdir}")
+
+  if(ARG_FIXTURES_SETUP)
+    set_property(TEST ${testname} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP})
+  endif()
+  if(ARG_FIXTURES_CLEANUP)
+    set_property(TEST ${testname} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP})
+  endif()
+  if(ARG_FIXTURES_REQUIRED)
+    set_property(TEST ${testname} PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED})
+  endif()
+  if(ARG_ENVIRONMENT)
+    set_property(TEST ${testname} PROPERTY ENVIRONMENT ${ARG_ENVIRONMENT})
+  endif()
+  if(ARG_TIMEOUT)
+    set_property(TEST ${testname} PROPERTY TIMEOUT ${ARG_TIMEOUT})
+  endif()
+  if(ARG_RESOURCE_LOCK)
+    set_property(TEST ${testname} PROPERTY RESOURCE_LOCK ${ARG_RESOURCE_LOCK})
+  endif()
+endfunction()
diff --git a/src/SOFIE_core/CMakeLists.txt b/core/CMakeLists.txt
similarity index 72%
rename from src/SOFIE_core/CMakeLists.txt
rename to core/CMakeLists.txt
index 7297957..a99f6d4 100644
--- a/src/SOFIE_core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -17,6 +17,8 @@ set(sources_headers
   SOFIE/OperatorList.hxx
   SOFIE/RModel_Base.hxx
   SOFIE/RModel.hxx
+  SOFIE/RModelProfiler.hxx
+  SOFIE/RModelProfilerGPU.hxx
   SOFIE/ROperator.hxx
   SOFIE/ROperator_BasicUnary.hxx
   SOFIE/ROperator_BasicBinary.hxx
@@ -76,6 +78,9 @@ list(TRANSFORM sources_headers PREPEND "inc/")
 set(sources_cxx
   src/RModel_Base.cxx
   src/RModel.cxx
+  src/RModelProfiler.cxx
+  src/RModelProfilerGPU.cxx
+  src/RModel_ALPAKA.cxx
   src/RModel_GNN.cxx
   src/RModel_GraphIndependent.cxx
   src/RFunction.cxx
@@ -86,24 +91,33 @@ set(sources_cxx
 )
 target_sources(SOFIE_core PRIVATE ${sources_headers}
${sources_cxx}) -target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) -target_link_libraries(SOFIE_core PUBLIC - Tree - Core - RIO +target_include_directories(SOFIE_core PUBLIC + $ + $ ) +target_link_libraries(SOFIE_core PUBLIC utils) -ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers} - LINKDEF inc/LinkDef.h - MODULE SOFIE_core - OPTIONS --deep -) +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + target_compile_definitions(SOFIE_core PUBLIC SOFIE_SUPPORT_ROOT_BINARY) + target_link_libraries(SOFIE_core PUBLIC Tree Core RIO) + + ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_core + OPTIONS --deep + ) + + # Install the dictionaries. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) +endif() install(TARGETS SOFIE_core - LIBRARY DESTINATION lib + EXPORT SOFIETargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include") -if(testing) - add_subdirectory(test) -endif() diff --git a/src/SOFIE_core/README.md b/core/README.md similarity index 96% rename from src/SOFIE_core/README.md rename to core/README.md index 033cad4..b0a50a1 100644 --- a/src/SOFIE_core/README.md +++ b/core/README.md @@ -12,10 +12,10 @@ This is a new development in TMVA and is currently in early experimental stage. ## Installation -Build ROOT with the cmake option tmva-sofie enabled. +Build ROOT with the cmake option sofie enabled. ```bash -cmake ../root -Dtmva-sofie=ON +cmake ../root -Dsofie=ON make -j8 ``` @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. 
With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/LinkDef.h b/core/inc/LinkDef.h similarity index 100% rename from src/SOFIE_core/inc/LinkDef.h rename to core/inc/LinkDef.h diff --git a/src/SOFIE_core/inc/SOFIE/FunctionList.hxx b/core/inc/SOFIE/FunctionList.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/FunctionList.hxx rename to core/inc/SOFIE/FunctionList.hxx diff --git a/src/SOFIE_core/inc/SOFIE/OperatorList.hxx b/core/inc/SOFIE/OperatorList.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/OperatorList.hxx rename to core/inc/SOFIE/OperatorList.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/core/inc/SOFIE/RFunction.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/RFunction.hxx rename to core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx b/core/inc/SOFIE/RFunction_MLP.hxx 
similarity index 90% rename from src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx rename to core/inc/SOFIE/RFunction_MLP.hxx index 8dfc0e1..d9f8626 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx +++ b/core/inc/SOFIE/RFunction_MLP.hxx @@ -15,7 +15,7 @@ enum class Activation { class RFunction_MLP: public RFunction_Update { private: - Int_t fNumLayers; // Number of Layers in MLP + int_t fNumLayers; // Number of Layers in MLP Activation fActivationFunction; bool fActivateFinal; // if True, fActivationFunction is applied as the activation for the last layer std::vector fKernelTensors; @@ -23,7 +23,7 @@ private: public: virtual ~RFunction_MLP() {} - RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN); + RFunction_MLP(FunctionTarget target, int_t numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN); void Initialize(); diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx b/core/inc/SOFIE/RFunction_Mean.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx rename to core/inc/SOFIE/RFunction_Mean.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx b/core/inc/SOFIE/RFunction_Sum.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx rename to core/inc/SOFIE/RFunction_Sum.hxx diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/core/inc/SOFIE/RModel.hxx similarity index 60% rename from src/SOFIE_core/inc/SOFIE/RModel.hxx rename to core/inc/SOFIE/RModel.hxx index 79541af..8153408 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/core/inc/SOFIE/RModel.hxx @@ -10,20 +10,33 @@ namespace SOFIE { class RModel final : public RModel_Base { + friend class RModelProfiler; + friend class RModelProfilerGPU; + private: bool fIsInitialized = false; bool fIsSubGraph = false; + bool fUseVDT = false; + bool fProfile = false; int fVerbose = 0; int 
fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape + std::unordered_map fAliasTensors; // alias tensors (name -> original tensor name) std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -38,6 +51,30 @@ private: MemoryPoolInfo fIntermediateMemoryInfo; /// fIntermediateTensorFrequencyLookup; /// opIndices; ///< consecutive op indices forming this group + std::string inputTensor; ///< input tensor name of the first op + std::string outputTensor; ///< output tensor name of the last op + size_t numElements = 0; + bool isFused() const { return opIndices.size() > 1; } + std::string suffix() const { + std::string s; + for (auto i : opIndices) s += "_" + std::to_string(i); + return s; + } + }; + std::vector fEltwiseFusionGroups; /// fOpToFusionGroupIdx; /// fusion group index + std::set fFusionIntermediateTensors; /// fSkipOperators; /// &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType 
&GetTensorType(std::string name) const; + std::vector GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + ETensorType GetTensorType(std::string name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape); @@ -81,6 +123,7 @@ public: size_t length = ConvertShapeToLength(shape); std::shared_ptr data_ptr(malloc(length * sizeof(T)), free); std::memcpy(data_ptr.get(), (void*) data, length * sizeof(T)); + std::cout<<"Length of constant tensor "<(T()), shape, data_ptr); } // for boolean can be more convenient passing an std::vector @@ -102,6 +145,12 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const std::string & name, const std::vector & shapeValues, bool scalar = false); + void AddAliasTensor(const std::string & name, const std::string & origin); + bool IsAliasTensor(const std::string & tensor_name) const; + + void AddExtraCodeForDimShapes(const std::string & code) { fExtraCodeForDimShapes += code; } + // add and initialize subgraph to the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +167,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void 
AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +183,9 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + void RemoveInitializedTensor(std::string tensor_name); + template + std::vector GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,40 +195,77 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. 
If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + // generate the _infer_impl signature using ViewPlainPtr types instead of Buf types + std::string GenerateImplSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); + bool IsInputTensorShapeParam(std::string const ¶mName) const; + std::vector CollectTensorMemberNames(const std::string &input); + 
void GenerateRequiredInputTensorInfo(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long WriteInitializedTensorsToFile(std::string filename = ""); - void PrintIntermediateTensors(); - void PrintOutputTensors(); + void PrintIntermediateTensors() const; + void PrintOutputTensors() const; + void PrintSummary() const; void OutputGenerated(std::string filename = "", bool append = false); std::vector GetOutputTensorNames() { return fOutputTensorNames; } void SetFilename(std::string filename) { fName = filename; } @@ -185,24 +276,46 @@ public: //a view only T obj; if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()){ - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("sofie: initialized tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor_ {GetTemplatedType(obj), new_tensor.GetShape() , static_cast(new_tensor.GetData())}; fInitializedTensors[tensor_name] = new_tensor_; } */ - void PrintRequiredInputTensors(); - void PrintInitializedTensors(); - void PrintDynamicTensors(); + void PrintRequiredInputTensors() const; + void PrintInitializedTensors() const; + void PrintDynamicTensors() const; void HeadInitializedTensors(std::string name, int n_print = 50); bool UseSession() const { return fUseSession; } - + void SetUseVDT(bool on) { + fUseVDT = on; + } + bool UseVDT() const { return fUseVDT;} + +#ifdef SOFIE_SUPPORT_ROOT_BINARY // Use the ClassDef macro to allow definition of custom streaming ClassDefNV(RModel, 3); +#endif + }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = 
static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/core/inc/SOFIE/RModelProfiler.hxx b/core/inc/SOFIE/RModelProfiler.hxx new file mode 100644 index 0000000..93e05f7 --- /dev/null +++ b/core/inc/SOFIE/RModelProfiler.hxx @@ -0,0 +1,34 @@ +#ifndef SOFIE_RMODELPROFILER +#define SOFIE_RMODELPROFILER + +#include "SOFIE/RModel.hxx" + +namespace SOFIE { + +/// \class RModelProfiler +/// \brief Generates profiled inference code for an RModel (CPU path). +/// +/// Instruments the generated C++ code to measure per-operator execution time +/// using std::chrono. Activated when RModel::Generate is called with Options::kProfile. +class RModelProfiler { + +public: + static void AddNeededStdLibs(RModel &model); + static std::string GenerateSessionMembers(); + static std::string GenerateUtilityFunctions(); + static std::string GenerateBeginInferCode(); + static std::string GenerateOperatorCode(ROperator &op, size_t op_idx); + static std::string GenerateEndInferCode(); + + RModelProfiler() = delete; + ~RModelProfiler() = default; + + RModelProfiler(const RModelProfiler &) = delete; + RModelProfiler(RModelProfiler &&) = delete; + RModelProfiler &operator=(const RModelProfiler &) = delete; + RModelProfiler &operator=(RModelProfiler &&) = delete; +}; + +} // namespace SOFIE + +#endif // SOFIE_RMODELPROFILER diff --git a/core/inc/SOFIE/RModelProfilerGPU.hxx b/core/inc/SOFIE/RModelProfilerGPU.hxx new file mode 100644 index 0000000..bc4aab2 --- /dev/null +++ b/core/inc/SOFIE/RModelProfilerGPU.hxx @@ -0,0 +1,52 @@ +#ifndef SOFIE_RMODELPROFILERGPU +#define SOFIE_RMODELPROFILERGPU + +#include +#include +#include "SOFIE/RModel.hxx" + +namespace SOFIE { + +/// 
\class RModelProfilerGPU +/// \brief Generates profiled inference code for the GPU/Alpaka path. +/// +/// Instruments the generated C++ code to measure per-operator GPU execution time +/// using std::chrono + alpaka::wait for synchronization, and reports CPU/GPU memory usage. +/// Activated when RModel::GenerateGPU_ALPAKA is called with Options::kProfile. +class RModelProfilerGPU { + +public: + static void AddNeededStdLibs(RModel &model); + static std::string GenerateSessionMembers(); + static std::string GenerateUtilityFunctions(); + + // Memory info: CPU and GPU tensor sizes computed at code-gen time. + struct MemoryInfo { + // CPU-side + size_t constantTensorBytes = 0; // tensors embedded as C++ arrays (IsConstantTensor) + size_t weightTensorBytes = 0; // tensors loaded from .dat into temporary CPU vectors + size_t intermediateCPUBytes = 0; // intermediate tensor pool (0 in GPU path) + // GPU-side + size_t weightDeviceBytes = 0; // ALL initialized tensor device buffers (const + weights) + size_t intermediateGPUBytes = 0; // intermediate device buffers (excl. 
fused intermediates) + }; + + static MemoryInfo ComputeMemoryInfo(const RModel &model); + static std::string GenerateMemoryReport(const MemoryInfo &info); + + static std::string GenerateBeginInferCode(); + static std::string GenerateOperatorCode(ROperator &op, size_t op_idx); + static std::string GenerateEndInferCode(); + + RModelProfilerGPU() = delete; + ~RModelProfilerGPU() = default; + + RModelProfilerGPU(const RModelProfilerGPU &) = delete; + RModelProfilerGPU(RModelProfilerGPU &&) = delete; + RModelProfilerGPU &operator=(const RModelProfilerGPU &) = delete; + RModelProfilerGPU &operator=(RModelProfilerGPU &&) = delete; +}; + +} // namespace SOFIE + +#endif // SOFIE_RMODELPROFILERGPU diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/core/inc/SOFIE/RModel_Base.hxx similarity index 54% rename from src/SOFIE_core/inc/SOFIE/RModel_Base.hxx rename to core/inc/SOFIE/RModel_Base.hxx index f8a9d34..b598652 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/core/inc/SOFIE/RModel_Base.hxx @@ -12,8 +12,10 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" + +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TBuffer.h" +#endif namespace SOFIE { @@ -25,12 +27,29 @@ enum class Options { kRootBinaryWeightFile = 0x4, kGNN = 0x8, kGNNComponent = 0x10, + kProfile = 0x20, +}; + +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, }; enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +72,45 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + std::cout<<"Constant tensor name: "< 100) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + std::cout<<"insider allocate on stack and length\n"; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. 
See @@ -73,15 +131,15 @@ public: } void AddNeededStdLib(std::string libname) { - if (fAllowedStdLib.find(libname) != fAllowedStdLib.end()) { - fNeededStdLib.insert(libname); - } + // if the library is already in the set, insert does nothing + fNeededStdLib.insert(libname); } void AddNeededCustomHeader(std::string filename) { fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx b/core/inc/SOFIE/RModel_GNN.hxx similarity index 94% rename from src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx rename to core/inc/SOFIE/RModel_GNN.hxx index 558f82c..93bb092 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx +++ b/core/inc/SOFIE/RModel_GNN.hxx @@ -66,7 +66,7 @@ struct GNN_Init { break; } default: { - throw std::runtime_error("TMVA SOFIE: Invalid Update function supplied for creating GNN function block."); + throw std::runtime_error("SOFIE: Invalid Update function supplied for creating GNN function block."); } } } @@ -88,7 +88,7 @@ struct GNN_Init { break; } default: { - throw std::runtime_error("TMVA SOFIE: Invalid Aggregate function supplied for creating GNN function block."); + throw std::runtime_error("SOFIE: Invalid Aggregate function supplied for creating GNN function block."); } } } diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx b/core/inc/SOFIE/RModel_GraphIndependent.hxx similarity index 96% rename from src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx rename to core/inc/SOFIE/RModel_GraphIndependent.hxx index 407c645..dfade7f 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx +++ b/core/inc/SOFIE/RModel_GraphIndependent.hxx @@ -49,7 +49,7 @@ struct GraphIndependent_Init { } default: { throw std::runtime_error( - "TMVA SOFIE: Invalid Update function supplied for creating GraphIndependent function block."); + 
"SOFIE: Invalid Update function supplied for creating GraphIndependent function block."); } } } diff --git a/core/inc/SOFIE/ROperator.hxx b/core/inc/SOFIE/ROperator.hxx new file mode 100644 index 0000000..c24fd70 --- /dev/null +++ b/core/inc/SOFIE/ROperator.hxx @@ -0,0 +1,152 @@ +#ifndef SOFIE_ROPERATOR +#define SOFIE_ROPERATOR + +#include +#include +#include + +#include "SOFIE/SOFIE_common.hxx" + + +namespace SOFIE{ + +class RModel; + +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7, + CAST=8, + COMPARISON=9, + EINSUM=10, + ELU=11, + SIGMOID=12, + TANH=13, + SOFTMAX=14, + LEAKYRELU=15, + UNARY_RECIPROCAL=16, + UNARY_SQRT=17, + UNARY_NEG=18, + UNARY_EXP=19, + UNARY_LOG=20, + UNARY_SIN=21, + UNARY_COS=22, + UNARY_ABS=23, + CLIP=24, + NOT=25 +}; + +// Returns the enumerator name for every OperatorKind; values outside the enum map to "UNKNOWN". +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; + case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + case OperatorKind::CAST: return "CAST"; + case OperatorKind::COMPARISON: return "COMPARISON"; + case OperatorKind::EINSUM: return "EINSUM"; + case OperatorKind::ELU: return "ELU"; + case OperatorKind::SIGMOID: return "SIGMOID"; + case OperatorKind::TANH: return "TANH"; + case OperatorKind::SOFTMAX: return "SOFTMAX"; + case OperatorKind::LEAKYRELU: return "LEAKYRELU"; + case OperatorKind::UNARY_RECIPROCAL: return "UNARY_RECIPROCAL"; + case OperatorKind::UNARY_SQRT: return "UNARY_SQRT"; + case OperatorKind::UNARY_NEG: return "UNARY_NEG"; + case OperatorKind::UNARY_EXP: return "UNARY_EXP"; + case OperatorKind::UNARY_LOG: return "UNARY_LOG"; + case OperatorKind::UNARY_SIN: return "UNARY_SIN"; + case OperatorKind::UNARY_COS: return "UNARY_COS"; + case OperatorKind::UNARY_ABS: return "UNARY_ABS"; + case OperatorKind::CLIP: return "CLIP"; + case OperatorKind::NOT: return "NOT"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + +class ROperator{ + + +public: + virtual std::vector GetBlasRoutines() { return {}; } + virtual std::vector GetStdLibs() { return {}; } + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; + virtual void Initialize(RModel&) = 0; + virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each 
operator within the same RModel + // generate initialization code for session constructor + virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; + // generate some specific declaration code for Session + virtual std::string GenerateDeclCode() { return "";} + // generate session data members specific to operator + virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; + + // Elementwise kernel fusion interface + virtual bool IsElementwise() const { return false; } + // Returns the C++ expression applying this op to inputVar (a local T variable) for fused kernel generation + virtual std::string GetElementwiseExpr(const std::string& /*inputVar*/) const { return ""; } + + //virtual void Forward_reference() = 0; + //virtual void Forward_blas() = 0; + virtual ~ROperator(){} + + std::string fName = "UnnamedOperator"; + const std::string &Name() const { return fName; } + +protected: + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; + const std::string SP = " "; ///< space used to correctly indent the generated C++ code + bool fUseSession = false; ///< flag to identify if using the session class + bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) + bool fIsOutputParamShape = false; ///< flag to identify if the output represents a parametric shape (can be known at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector 
fOutputTensorNames; + +public: + std::span GetOpInputTensors() const { + return fInputTensorNames; + } + + std::span GetOpOutputTensors() const { + return fOutputTensorNames; + } + + OperatorKind GetKind() const { return fKind; } + bool IsOutputConstant() const { return fIsOutputConstant; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + +}; + + + +}//SOFIE + +#endif //SOFIE_ROPERATOR diff --git a/core/inc/SOFIE/ROperator_BasicBinary.hxx b/core/inc/SOFIE/ROperator_BasicBinary.hxx new file mode 100644 index 0000000..9a1a963 --- /dev/null +++ b/core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -0,0 +1,589 @@ +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE { + +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow, + Mod, + FMod +}; + +template +struct BinaryOperatorTrait {}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Add"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Sub"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Mul"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } +}; + +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Div"; } + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } +}; + 
+template +struct BinaryOperatorTrait { + static const std::string Name() { return "Pow"; } + static std::string Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } +}; +template +struct BinaryOperatorTrait { + static const std::string Name() { return "Mod"; } + static std::string Op(const std::string & t1, const std::string t2) { return "(" + t1 + " % " + t2 + ")"; } + static T Func(T t1, T t2) { return t1 % t2; } +}; +template +struct BinaryOperatorTrait { + static const std::string Name() { return "FMod"; } + static std::string Op(const std::string & t1, const std::string t2) { return "std::fmod(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::fmod(t1, t2); } +}; + +template +class ROperator_BasicBinary final : public ROperator { +private: + int fBroadcastFlag = 0; + std::string fNA; + std::string fNB; + std::string fNBroadcastedA; + std::string fNBroadcastedB; + std::string fNY; + + std::vector fShapeA; + std::vector fShapeB; + std::vector fShapeY; + + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + +public: + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { return input; } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override + { + // assume now inputs have same shape (no broadcasting) + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel &model) override + { + // input must be a graph input, or already initialized intermediate tensor + if 
(!model.CheckIfTensorAlreadyExist(fNA)) { + throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNA + " is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNB)) { + throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNB + " is not found in model"); + } + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDynamicTensorShape(fNA); + dynamicInputs |= 1; + } else { + fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDynamicTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << std::endl; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << std::endl; + + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + auto lengthY = ConvertShapeToLength(fShapeY); + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr 
broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } + } else { + fShapeY = fShapeA; + } + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(lengthY); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the generated code or weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else if (((model.IsShapeTensor(fNA) && model.IsShapeTensor(fNB)) || + (model.IsShapeTensor(fNA) && model.IsInitializedTensor(fNB)) || + (model.IsShapeTensor(fNB) && model.IsInitializedTensor(fNA))) + && (fShapeA.size() <=1 && fShapeB.size() <=1 && model.GetTensorType(fNA) == ETensorType::INT64)) { + // case of shape tensors ( tensors are of rank 0 or 1 ) + std::vector dimValA; + std::vector dimValB; + if (model.IsShapeTensor(fNA)) + dimValA = model.GetShapeTensorValues(fNA); + if (model.IsShapeTensor(fNB)) + dimValB = model.GetShapeTensorValues(fNB); + // adjust for broadcasting - repeat values until they reach the shape of Y + if (!fShapeY.empty() && fShapeY[0] > 1) { + if (dimValA.size() == 1) dimValA = std::vector( fShapeY[0], dimValA[0]); + if (dimValB.size() == 1) dimValB = std::vector( fShapeY[0], dimValB[0]); + } + + auto convertDataToDim = [&](const std::string & name, const std::vector & shape, std::vector & dimValues) { + auto data = static_cast(model.GetInitializedTensorData(name).get()); + dimValues.resize(lengthY); + for (size_t i = 0; i < lengthY; i++) { + if (!shape.empty() && lengthY == shape[0]) + 
dimValues[i] = Dim{ static_cast(data[i])}; + else // case dataA is a scalar + dimValues[i] = Dim{ static_cast(data[0])}; + } + }; + if (model.IsInitializedTensor(fNA)) { + convertDataToDim(fNA,fShapeA,dimValA); + } else if (model.IsInitializedTensor(fNB)) { + convertDataToDim(fNB,fShapeB,dimValB); + } + + //perform binary operations on shape tensors + std::vector dimValY(lengthY); + for (size_t i = 0; i < lengthY; i++) { + if (!dimValA[i].isParam && !dimValB[i].isParam) { + size_t d = BinaryOperatorTrait::Func(dimValA[i].dim, dimValB[i].dim); + dimValY[i] = Dim{d}; + } else { + auto res = BinaryOperatorTrait::Op(dimValA[i].GetVal(), dimValB[i].GetVal()); + dimValY[i] = Dim{res, static_cast(-1)}; + } + } + model.AddShapeTensor(fNY,dimValY, fShapeY.empty()); // cannot be a scalar + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertDimShapeToString(dimValY) << " (shape)" << std::endl; + } + // no code needs to be generated (flag this as a constant output tensor) + fIsOutputConstant = true; + + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); + } + } else { + // case A or B have dynamic shapes. 
We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertDimShapeToString(fDimShapeA) << " , " + << ConvertDimShapeToString(fDimShapeB) << " --> " << ConvertDimShapeToString(fDimShapeY) << std::endl; + } + } + } + + std::string GenerateInitCode() override + { + std::stringstream out; + return out.str(); + } + + std::string Generate(std::string opName) override + { + + if (fIsOutputConstant) + return ""; + + opName = "op_" + opName; + + std::stringstream out; + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); + 
std::string typeName = TensorType::Name(); + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A 
= "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + 
compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+opName+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < " + std::to_string(ConvertShapeToLength(fShapeY)) + ") {\n"; + auto stridesA = UTILITY::ComputeStrideFromShape(fShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fShapeB); + + for(size_t id_s = 0; id_s < stridesA.size(); ++id_s){ + if(fShapeA[id_s] == 1) + stridesA[id_s] = 0; + } + + for(size_t id_s = 0; id_s < stridesB.size(); ++id_s){ + if(fShapeB[id_s] == 1) + stridesB[id_s] = 0; + } + + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + // --- Fast-path index simplifications --- + // Check whether A is broadcast (all strides zero → single element) + bool isAScalar = true; + for (const auto& s : stridesA) { if (s != 0) { isAScalar = false; break; } } + // Check whether B is broadcast (all strides zero → single element) + bool isBScalar = true; + for (const auto& s : stridesB) { if (s != 0) { isBScalar = false; break; } } + // Check whether A has the same contiguous layout as Y (no broadcasting) + bool isAContiguous = (fShapeA.size() == fShapeY.size()); + if (isAContiguous) { + for (size_t i = 0; i < fShapeA.size(); ++i) + if (fShapeA[i] != fShapeY[i]) { isAContiguous = false; break; } + } + // Check whether B has the same contiguous layout as Y (no broadcasting) + bool isBContiguous = (fShapeB.size() == fShapeY.size()); + if (isBContiguous) { + for (size_t i = 0; i < 
fShapeB.size(); ++i) + if (fShapeB[i] != fShapeY[i]) { isBContiguous = false; break; } + } + + std::string flattened_index_A = ""; + std::string flattened_index_B = ""; + + if (isAScalar) { + // A is a single broadcast value + flattened_index_A = "0"; + } else if (isAContiguous) { + // A and Y have identical shapes → direct index + flattened_index_A = "idx"; + } else { + // General broadcast case: decompose idx into per-dim coords + std::string temp = "idx"; + for (size_t id_s = 0; id_s < fShapeA.size(); ++id_s) { + auto strideY = stridesY[id_s]; + auto strideA = stridesA[id_s]; + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + flattened_index_A += coord + " * " + std::to_string(strideA) + " + "; + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + if (!flattened_index_A.empty()) + flattened_index_A.erase(flattened_index_A.size() - 3); + } + + if (isBScalar) { + // B is a single broadcast value + flattened_index_B = "0"; + } else if (isBContiguous) { + // B and Y have identical shapes → direct index + flattened_index_B = "idx"; + } else { + // General broadcast case + std::string temp = "idx"; + for (size_t id_s = 0; id_s < fShapeB.size(); ++id_s) { + auto strideY = stridesY[id_s]; + auto strideB = stridesB[id_s]; + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + flattened_index_B += coord + " * " + std::to_string(strideB) + " + "; + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + if (!flattened_index_B.empty()) + flattened_index_B.erase(flattened_index_B.size() - 3); + } + + op += "C[idx] = " + BinaryOperatorTrait::Op("A["+flattened_index_A+"]", "B["+flattened_index_B+"]") + ";\n"; + op += "}\n}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+OpName+BinaryOperatorTrait::Name()+"Kernel binary" + OpName + "Kernel;\n"; + } + + std::string 
Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if (fDimShapeY.empty()) { + throw std::runtime_error("SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY + << ", binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::vector GetStdLibs() override + { + if (Op == EBasicBinaryOperator::Pow) { + return {std::string("cmath")}; + } else { + return {}; + } + } + + +}; + +} // namespace SOFIE + +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx b/core/inc/SOFIE/ROperator_BasicNary.hxx similarity index 85% rename from src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx rename to core/inc/SOFIE/ROperator_BasicNary.hxx index cbe0497..928ab1c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx +++ b/core/inc/SOFIE/ROperator_BasicNary.hxx @@ -81,10 +81,10 @@ private: std::vector fNInputs; std::string fNY; - std::vector> fShapeInputs; + std::vector> fShapeInputs; std::vector fNBroadcastedInputs; - std::vector fShapeY; + std::vector fShapeY; bool fBroadcast = false; @@ -119,18 +119,24 @@ public: void Initialize(RModel& model) override { for (auto &it : fNInputs) { if (!model.CheckIfTensorAlreadyExist(it)) { - throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); + throw std::runtime_error("SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); } - fShapeInputs.push_back(model.GetTensorShape(it)); + 
fShapeInputs.push_back(model.GetDimTensorShape(it)); + } + // Find the common output shape by pairwise multidirectional broadcast + fShapeY = fShapeInputs[0]; + for (size_t i = 1; i < fShapeInputs.size(); i++) { + auto shapeA = fShapeY; + auto shapeB = fShapeInputs[i]; + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + fShapeY = ret.second; } - // Find the common shape of the input tensors - fShapeY = UTILITY::MultidirectionalBroadcastShape(fShapeInputs); model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); // Broadcasting size_t N = fNInputs.size(); fNBroadcastedInputs.reserve(N); for (size_t i = 0; i < N; i++) { - if (!UTILITY::AreSameShape(model.GetTensorShape(fNInputs[i]), fShapeY)) { + if (!UTILITY::AreSameShape(fShapeInputs[i], fShapeY)) { fBroadcast = true; std::string name = "Broadcasted" + fNInputs[i]; model.AddIntermediateTensor(name, model.GetTensorType(fNInputs[0]), fShapeY); @@ -145,18 +151,18 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first"); + throw std::runtime_error("SOFIE BasicNary called to Generate without being initialized first"); } std::stringstream out; - size_t length = ConvertShapeToLength(fShapeY); + std::string length = ConvertDimShapeToLength(fShapeY); out << SP << "\n//------ BasicNary operator\n"; if (fBroadcast) { for (size_t i = 0; i < fNInputs.size(); i++) { if (fNBroadcastedInputs[i] != fNInputs[i]) { - out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertShapeToString(fShapeY) << "\n"; + out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertDimShapeToString(fShapeY) << "\n"; out << SP << SP << "{\n"; - out << SP << SP << SP << fType << "* data = SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertShapeToString(fShapeInputs[i]); - out << ", " 
<< ConvertShapeToString(fShapeY) << ");\n"; + out << SP << SP << SP << fType << "* data = SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertDimShapeToString(fShapeInputs[i]); + out << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; out << SP << SP << SP << "std::copy(data, data + " << length << ", " << fNBroadcastedInputs[i] << ");\n"; out << SP << SP << SP << "delete[] data;\n"; out << SP << SP << "}\n"; diff --git a/core/inc/SOFIE/ROperator_BasicUnary.hxx b/core/inc/SOFIE/ROperator_BasicUnary.hxx new file mode 100644 index 0000000..dfe6714 --- /dev/null +++ b/core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -0,0 +1,207 @@ +#ifndef SOFIE_ROPERATOR_BASIC_UNARY +#define SOFIE_ROPERATOR_BASIC_UNARY + +#include +#include +#include + + +namespace SOFIE { + +enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs, kSoftplus, kAtan, kFloor }; + +template +struct UnaryOpTraits { +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Reciprocal"; } + static std::string Op(const std::string &X) { return "1/" + X; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Sqrt"; } + static std::string Op(const std::string &X) { return "std::sqrt(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Neg"; } + static std::string Op(const std::string &X) { return "-" + X; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Exp"; } + static std::string Op(const std::string &X) { return "std::exp(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Log"; } + static std::string Op(const std::string &X) { return "std::log(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Sin"; } + static std::string Op(const std::string &X) { return "std::sin(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static 
std::string Name() { return "Cos"; } + static std::string Op(const std::string &X) { return "std::cos(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Abs"; } + static std::string Op(const std::string &X) { return "std::abs(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Softplus"; } + static std::string Op(const std::string &X) { return "std::log(std::exp(" + X + ") + 1)"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Atan"; } + static std::string Op(const std::string &X) { return "std::atan(" + X + ")"; } +}; + +template +struct UnaryOpTraits { + static std::string Name() { return "Floor"; } + static std::string Op(const std::string &X) { return "std::floor(" + X + ")"; } +}; + +template +class ROperator_BasicUnary final : public ROperator { +private: + std::string fNX; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeY; + +public: + ROperator_BasicUnary() {} + + ROperator_BasicUnary(std::string nameX, std::string nameY) + : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + { + + switch(Op) { + case EBasicUnaryOperator::kReciprocal: + fKind = OperatorKind::UNARY_RECIPROCAL; + break; + case EBasicUnaryOperator::kSqrt: + fKind = OperatorKind::UNARY_SQRT; + break; + case EBasicUnaryOperator::kNeg: + fKind = OperatorKind::UNARY_NEG; + break; + case EBasicUnaryOperator::kExp: + fKind = OperatorKind::UNARY_EXP; + break; + case EBasicUnaryOperator::kLog: + fKind = OperatorKind::UNARY_LOG; + break; + case EBasicUnaryOperator::kSin: + fKind = OperatorKind::UNARY_SIN; + break; + case EBasicUnaryOperator::kCos: + fKind = OperatorKind::UNARY_COS; + break; + case EBasicUnaryOperator::kAbs: + fKind = OperatorKind::UNARY_ABS; + break; + } + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector> ShapeInference(std::vector> input) override { return input; } + + std::vector TypeInference(std::vector 
input) override { return input; } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + } + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + } + + std::string Generate(std::string OpName) override + { + OpName = "op_" + OpName; + std::stringstream out; + + out << SP << "\n//---- Operator" << UnaryOpTraits::Name() << " " << OpName << "\n"; + std::string length = ConvertDimShapeToLength(fShapeX); + out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] = " << UnaryOpTraits::Op("tensor_" + fNX + "[i]") << ";\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*OpName*/) override { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * data, T * output, std::size_t const length) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < length) {\n"; + op += SP + SP + SP + "output[idx] = " +UnaryOpTraits::Op("data[idx]") + ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*OpName*/) override { + return SP + "Unary" + UnaryOpTraits::Name() + "Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto 
const elementsPerGrid_"<(workDiv_" << fNY + << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), " << length << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::vector GetStdLibs() override { + if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { + return { std::string("cmath") }; + } else { + return {}; + } + } + + bool IsElementwise() const override { return !fIsOutputConstant; } + std::string GetElementwiseExpr(const std::string& v) const override { + return UnaryOpTraits::Op(v); + } +}; + +} // namespace SOFIE + +#endif diff --git a/core/inc/SOFIE/ROperator_Basic_Is.hxx b/core/inc/SOFIE/ROperator_Basic_Is.hxx new file mode 100644 index 0000000..fabe976 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Basic_Is.hxx @@ -0,0 +1,145 @@ +#ifndef SOFIE_ROPERATOR_BASIC_IS +#define SOFIE_ROPERATOR_BASIC_IS + +#include +#include +#include +#include + +namespace SOFIE { + +enum class EBasicIsOperator { kIsInf, kIsInfPos, kIsInfNeg, kIsNaN }; + +template +struct IsOpTraits { +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInf"; } + static std::string Op(const std::string &x) { return "std::isinf(" + x + ")"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInfPos"; } + static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && " + x + " > 0)"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsInfNeg"; } + static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && " + x + " < 0)"; } +}; + +template<> +struct IsOpTraits { + static std::string Name() { return "IsNaN"; } + static std::string Op(const std::string &x) { return "std::isnan(" + x + ")"; } +}; + + +template +class ROperator_Basic_Is final : public ROperator { +private: + std::string 
fNX; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeY; + +public: + ROperator_Basic_Is() {} + + ROperator_Basic_Is(std::string nameX, std::string nameY) + : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + } + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); + } + + std::string Generate(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + + out << SP << "\n//---- Operator " << IsOpTraits::Name() << " " << opName << "\n"; + auto length = ConvertDimShapeToLength(fShapeX); + out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] = " << IsOpTraits::Op("tensor_" + fNX + "[i]") << ";\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override + { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ " + IsOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Is" + IsOpTraits::Name() + "Kernel {\n"; + op += SP + SP + "template\n"; + // Output is uint8_t (bool storage), input is T (float/double). 
+ op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const & acc,\n"; + op += SP + SP + SP + "T const * data,\n"; + op += SP + SP + SP + "uint8_t * output,\n"; + op += SP + SP + SP + "std::size_t const length) const\n"; + op += SP + SP + "{\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < length) {\n"; + op += SP + SP + SP + SP + "output[idx] = static_cast(" + IsOpTraits::Op("data[idx]") + ");\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override + { + return SP + "Is" + IsOpTraits::Name() + "Kernel " + IsOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override + { + opName = "op_" + opName; + std::stringstream out; + auto length = ConvertDimShapeToLength(fShapeX); + + out << "\n//------ " << opName << "_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << opName + << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << IsOpTraits::Name() << "Kernel" + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", " << length << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return !fIsOutputConstant; } + std::string GetElementwiseExpr(const std::string& v) const override { + return IsOpTraits::Op(v); + } + + std::vector GetStdLibs() override { + return { std::string("cmath") }; + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_BASIC_IS diff --git 
a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/core/inc/SOFIE/ROperator_BatchNormalization.hxx similarity index 65% rename from src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx rename to core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..8bc3b3d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include @@ -59,7 +59,7 @@ public: } else{ throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a BatchNormalization operator"); + std::runtime_error("SOFIE Encountered unsupported type parsing a BatchNormalization operator"); } } @@ -72,12 +72,12 @@ public: std::vector> ShapeInference(std::vector> input) override { if (input.size() != 5 ) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference need 5 input tensors"); + std::runtime_error("SOFIE BatchNormalization Op Shape inference need 5 input tensors"); } for(size_t i = 0; i < input.size(); i++) { if (input[i].size() != 4) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions"); + std::runtime_error("SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions"); } } @@ -88,30 +88,30 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNScale)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor 
" + fNScale + " fns is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNScale + " fns is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNMean)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNVar)) { throw - std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model"); + std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 2 || fShapeX.size() > 4) { throw - std::runtime_error("TMVA SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX)); + std::runtime_error("SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX)); } fShapeScale = model.GetTensorShape(fNScale); @@ -185,7 +185,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeX.empty()){ - throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Batch Normalization called to Generate without being initialized first"); } std::stringstream out; @@ -227,6 +227,80 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("SOFIE BatchNormalization called to Generate 
without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + std::string kname = "BatchNormKernel_" + opName; + std::string op; + op = "\n//------ BATCHNORM_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ X,\n"; + op += SP + SP + SP + "T const* __restrict__ scale,\n"; + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + op += SP + SP + SP + "T const* __restrict__ mean,\n"; + op += SP + SP + SP + "T* __restrict__ Y,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t i = global_thread_idx; i < totalElements; i += grid_thread_extent) {\n"; + + op += SP + SP + SP + SP + "T val = (X[i] - mean[i]) * scale[i] + bias[i];\n"; + + if (fActivation == EActivationType::RELU) + op += SP + SP + SP + SP + "Y[i] = val > static_cast(0) ? 
val : static_cast(0);\n"; + else + op += SP + SP + SP + SP + "Y[i] = val;\n"; + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "BatchNormKernel_" + opName; + return SP + kname + " batchNormKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("SOFIE BatchNormalization called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "batchNormKernel_" + opName; + + std::stringstream out; + out << "\n//------ BATCHNORM_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + + out << SP << "auto task_" << fNY << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNScale << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNMean << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << fNY << ");\n"; + + return out.str(); + } + std::vector GetBlasRoutines() override { return { std::string("Copy"), std::string("Axpy") }; } }; diff --git a/core/inc/SOFIE/ROperator_Cast.hxx b/core/inc/SOFIE/ROperator_Cast.hxx new file mode 100644 index 0000000..3571e39 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Cast.hxx @@ -0,0 +1,175 @@ +#ifndef SOFIE_ROPERATOR_Cast +#define SOFIE_ROPERATOR_Cast + 
+#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +std::vector convertToInt64(const In* src, size_t n) { + std::vector dst(n); + std::transform(src, src + n, dst.begin(), + [](In v) { return static_cast(v); }); + return dst; +} + + +class ROperator_Cast final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + ETensorType fType; + +public: + ROperator_Cast(){} + ROperator_Cast(ETensorType type,std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), + fType(type) + { + fKind = OperatorKind::CAST; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + //input must be a graph input, or already initialized intermediate tensor + if (model.CheckIfTensorAlreadyExist(fNX) == false){ + throw std::runtime_error("SOFIE Cast Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + // should we add a check if the same type + auto inputType = model.GetTensorType(fNX); + if (model.IsInitializedTensor(fNX)) { + fIsOutputConstant = true; + auto inputData = model.GetInitializedTensorData(fNX); + if (fType == ETensorType::INT64) { + size_t length = ConvertShapeToLength(fShape); + std::vector convertedData; + if (inputType == ETensorType::FLOAT) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else if (inputType == ETensorType::DOUBLE) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else if (inputType == ETensorType::INT32) { + convertedData = convertToInt64(static_cast(inputData.get()), length); + } else { + // Already INT64 — safe direct copy + 
convertedData.assign(static_cast(inputData.get()), + static_cast(inputData.get()) + length); + } + model.AddConstantTensor(fNY, ConvertShapeToInt(fShape), convertedData.data()); + model.SetNotWritableInitializedTensor(fNX); + } + else + fIsOutputConstant = false; + } else if (model.IsShapeTensor(fNX) && fType == ETensorType::INT64) { + auto shapeData = model.GetShapeTensorValues(fNX); + model.AddShapeTensor(fNY, shapeData, fShape.size() == 0); + fIsOutputConstant = true; + } + if (!fIsOutputConstant) + model.AddIntermediateTensor(fNY, fType, fShape); + if (model.Verbose()) { + std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << ConvertTypeToString(fType); + if (fType == ETensorType::BOOL) std::cout << " (converted from BOOL) "; + std::cout << " for " << fNY << " shape " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " (constant) "; + std::cout << std::endl; + } + } + + + std::string Generate(std::string opName) override { + + // output shape can be empty if is a scalar + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + + out << "\n//------ CAST " << opName << " ---> " << fNY << " " << ConvertDimShapeToString(fShape) << "\n"; + // no generated code for constant outputs + if (fIsOutputConstant) return out.str(); + + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + + // need to handle bool case separatly since casting to uint8 will not give right result + if (fType == ETensorType::BOOL) + out << SP << SP << "tensor_" << fNY << "[id] = (tensor_" << fNX << "[id] != 0) ? 
1 : 0;\n"; + else + out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< ConvertTypeToString(fType) << ">(tensor_" << fNX << "[id]);\n"; + + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel"+opName+"{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + // Use a per-operator variable name so that multiple Cast operators with + // different source/destination types in the same model each get their own + // distinct member variable (the struct type is already per-op: CastKernelN). + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + return SP + "CastKernel" + opName + " castKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fIsOutputConstant) return ""; + // Save the raw operator index before building the "op_N" prefix so the + // variable name matches the one declared in Generate_GPU_Kernel_Definitions_ALPAKA. 
+ std::string varName = "castKernel_" + OpName; + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Cast called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY << ", " << varName << ", alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + // Cast changes the data type, so it cannot participate in the single-type-T + // FusedEltwiseKernel (which reads input and writes output as the same T). + // Returning false here routes Cast through its own Generate_GPU_ALPAKA path, + // which correctly uses separate SrcT and DstT device buffers. + bool IsElementwise() const override { return false; } + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Cast diff --git a/core/inc/SOFIE/ROperator_Clip.hxx b/core/inc/SOFIE/ROperator_Clip.hxx new file mode 100644 index 0000000..4a92afb --- /dev/null +++ b/core/inc/SOFIE/ROperator_Clip.hxx @@ -0,0 +1,376 @@ +#ifndef SOFIE_ROPERATOR_CLIP +#define SOFIE_ROPERATOR_CLIP + +#include "SOFIE_common.hxx" +#include "ROperator.hxx" +#include "RModel.hxx" + +#include +#include +#include +#include +#include + +namespace SOFIE { + +// --------------------------------------------------------------------------- +// ROperator_Clip +// +// ONNX spec: Y = max(min_val, min(max_val, X)) element-wise +// +// The min and max bounds are optional in the ONNX spec: +// - if fNMin is empty → no lower clipping (effectively -inf) +// - if fNMax is empty → no upper clipping (effectively +inf) +// +// Bounds can be provided either as: +// (a) initializer / constant tensors (scalar, shape []), +// (b) runtime input tensors 
(resolved at Generate time), +// (c) compile-time float literals (via the fMin / fMax attributes). +// +// The implementation follows the Selu operator style exactly: +// - static shape stored in fShape +// - dynamic shape stored in fDimShape +// - a flat loop over all elements in Generate() +// --------------------------------------------------------------------------- + +template +class ROperator_Clip final : public ROperator { +private: + + // Tensor names + std::string fNX; // input data + std::string fNY; // output + std::string fNMin; // optional: tensor name for min bound + std::string fNMax; // optional: tensor name for max bound + + + // Static shape (non-dynamic path, mirrors Selu) + std::vector fShape; + + // Dynamic shape (Dim-aware, for dynamic input tensors) + std::vector fDimShape; + bool fIsDynamic = false; + + // Compile-time bound values — used when bounds are constant tensors + // Initialised to the ONNX defaults (no clipping) + T fMin = std::numeric_limits::lowest(); // -inf equivalent + T fMax = std::numeric_limits::max(); // +inf equivalent + + // Flags indicating whether each bound is: + // - absent (no input provided) + // - a constant resolved at Initialize time + // - a runtime tensor that must be read in the generated code + bool fHasMin = false; + bool fHasMax = false; + bool fMinIsConstant = false; + bool fMaxIsConstant = false; + +public: + + ROperator_Clip() {} + + // Constructor for the common case where bounds are tensor inputs + // (follows ONNX node input order: X, min, max) + ROperator_Clip(std::string nameX, + std::string nameY, + std::string nameMin = "", + std::string nameMax = "") + : fNX (UTILITY::Clean_name(nameX)), + fNY (UTILITY::Clean_name(nameY)), + fNMin(nameMin.empty() ? "" : UTILITY::Clean_name(nameMin)), + fNMax(nameMax.empty() ? 
"" : UTILITY::Clean_name(nameMax)) + { + fKind = OperatorKind::CLIP; + fInputTensorNames = { fNX }; + if (!fNMin.empty()) fInputTensorNames.push_back(fNMin); + if (!fNMax.empty()) fInputTensorNames.push_back(fNMax); + fOutputTensorNames = { fNY }; + } + + // Convenience constructor when bounds are known scalars at model-build time + ROperator_Clip(std::string nameX, + std::string nameY, + T minVal, + T maxVal) + : fNX (UTILITY::Clean_name(nameX)), + fNY (UTILITY::Clean_name(nameY)), + fMin(minVal), fMax(maxVal), + fHasMin(true), fHasMax(true), + fMinIsConstant(true), fMaxIsConstant(true) + { + fKind = OperatorKind::CLIP; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + + // ----------------------------------------------------------------------- + void Initialize(RModel& model) override + { + // ---- validate main input ------------------------------------------ + if (!model.CheckIfTensorAlreadyExist(fNX)) + throw std::runtime_error( + "SOFIE Clip Op Input Tensor " + fNX + " is not found in model"); + + // ---- collect shape (static or dynamic, mirrors BasicBinary) ------- + if (model.IsDynamicTensor(fNX)) { + fIsDynamic = true; + fDimShape = model.GetDynamicTensorShape(fNX); + } else { + fShape = model.GetTensorShape(fNX); + fDimShape = ConvertShapeToDim(fShape); + } + + // ---- resolve min bound -------------------------------------------- + if (!fNMin.empty() && model.CheckIfTensorAlreadyExist(fNMin)) { + fHasMin = true; + if (model.IsInitializedTensor(fNMin)) { + // constant scalar tensor — read value now + auto data = static_cast(model.GetInitializedTensorData(fNMin).get()); + fMin = data[0]; + fMinIsConstant = true; + model.SetNotWritableInitializedTensor(fNMin); + } + // else: runtime input — will be dereferenced in generated code + } + + // ---- resolve max bound -------------------------------------------- + if (!fNMax.empty() && model.CheckIfTensorAlreadyExist(fNMax)) { + fHasMax = true; + if (model.IsInitializedTensor(fNMax)) { + 
auto data = static_cast(model.GetInitializedTensorData(fNMax).get()); + fMax = data[0]; + fMaxIsConstant = true; + model.SetNotWritableInitializedTensor(fNMax); + } + } + + // ---- register output tensor --------------------------------------- + if (fIsDynamic) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fDimShape); + else + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + + if (model.Verbose()) { + std::cout << "Clip : " << fNX << " " + << ConvertShapeToString(fShape); + if (fHasMin) + std::cout << " min=" << (fMinIsConstant + ? std::to_string(fMin) : fNMin + "(runtime)"); + if (fHasMax) + std::cout << " max=" << (fMaxIsConstant + ? std::to_string(fMax) : fNMax + "(runtime)"); + std::cout << " --> " << fNY << "\n"; + } + + // only needs and — no cmath + model.AddNeededStdLib("algorithm"); + model.AddNeededStdLib("limits"); + } + + + // ----------------------------------------------------------------------- + // GPU ALPAKA + // ----------------------------------------------------------------------- + + // Each Clip instance carries its own min/max values (passed as kernel + // arguments) and may have different element types. Use per-operator names + // for the kernel struct and member variable so that multiple Clip operators + // in the same model do not produce duplicate definitions. 
+ std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override + { + std::string kname = "ClipKernel_op_" + opName; + std::string op; + op = "\n//------ CLIP_KERNEL_ALPAKA op_" + opName + "\n"; + op += "struct " + kname + " {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + "TAcc const & acc,\n"; + op += SP + SP + "T const * __restrict__ data,\n"; + op += SP + SP + "T * __restrict__ out,\n"; + op += SP + SP + "std::size_t numElements,\n"; + op += SP + SP + "T minVal,\n"; + op += SP + SP + "T maxVal) const\n"; + op += SP + "{\n"; + op += SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "T val = data[idx];\n"; + op += SP + SP + SP + "val = val < minVal ? minVal : val;\n"; + op += SP + SP + SP + "out[idx] = val > maxVal ? maxVal : val;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override + { + std::string kname = "ClipKernel_op_" + opName; + std::string vname = "clipKernel_op_" + opName; + return kname + " " + vname + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override + { + // Save the raw operator index before building the "op_N" prefix so that + // the variable name matches the one declared in Generate_GPU_Kernel_Definitions_ALPAKA. 
+ std::string varName = "clipKernel_op_" + OpName; + OpName = "op_" + OpName; + + if (fShape.empty() && fDimShape.empty()) + throw std::runtime_error( + "SOFIE Operator Clip called to Generate_GPU_ALPAKA without being initialized first"); + + std::stringstream out; + out << "\n//------ CLIP_GPU_ALPAKA " << OpName << "\n"; + + std::string length = ConvertDimShapeToLength(fDimShape); + + std::string minExpr, maxExpr; + if (fMinIsConstant) { + minExpr = ToStringHighPrec(fMin); + } else if (fHasMin) { + throw std::runtime_error( + "SOFIE Clip GPU ALPAKA: runtime (non-constant) min bound is not supported in GPU path"); + } else { + minExpr = "std::numeric_limits<" + TensorType::Name() + ">::lowest()"; + } + + if (fMaxIsConstant) { + maxExpr = ToStringHighPrec(fMax); + } else if (fHasMax) { + throw std::runtime_error( + "SOFIE Clip GPU ALPAKA: runtime (non-constant) max bound is not supported in GPU path"); + } else { + maxExpr = "std::numeric_limits<" + TensorType::Name() + ">::max()"; + } + + std::string castMin = "static_cast<" + TensorType::Name() + ">(" + minExpr + ")"; + std::string castMax = "static_cast<" + TensorType::Name() + ">(" + maxExpr + ")"; + + out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n"; + out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << OpName + << " = alpaka::createTaskKernel(workDiv_" << fNY << ", " << varName + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << length << ")" + << ", " << castMin << ", " << castMax << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + bool IsElementwise() const override { return true; } + + std::string GetElementwiseExpr(const std::string& v) const override + { + 
std::string minExpr, maxExpr; + if (fMinIsConstant) minExpr = ToStringHighPrec(fMin); + else if (fHasMin) minExpr = "tensor_" + fNMin + "[0]"; + else minExpr = "std::numeric_limits<" + TensorType::Name() + ">::lowest()"; + + if (fMaxIsConstant) maxExpr = ToStringHighPrec(fMax); + else if (fHasMax) maxExpr = "tensor_" + fNMax + "[0]"; + else maxExpr = "std::numeric_limits<" + TensorType::Name() + ">::max()"; + + std::string expr = fHasMax || fMaxIsConstant ? "std::min(" + maxExpr + ", " + v + ")" : v; + if (fHasMin || fMinIsConstant) + expr = "std::max(" + minExpr + ", " + expr + ")"; + return expr; + } + + std::string GetFusableOutputTensorName() override { return fNY; } + + void UpdateFusableTensorName(std::string fusable_tensor_name, + const std::function& removal_func) override + { + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + + // ----------------------------------------------------------------------- + // Generate + // ----------------------------------------------------------------------- + std::string Generate(std::string OpName) override + { + OpName = "op_" + OpName; + + if (fShape.empty() && fDimShape.empty()) + throw std::runtime_error( + "SOFIE Operator Clip called to Generate without being initialized first"); + + std::stringstream out; + out << SP << "\n//------ CLIP " << OpName << "\n"; + + // ---- build the length expression (static or dynamic) ------------- + std::string length = ConvertDimShapeToLength(fDimShape); + + // ---- build min/max expressions for the generated code ------------ + // + // Priority: + // 1. compile-time constant value → emit literal + // 2. runtime input tensor → emit tensor_[0] (scalar) + // 3. 
not provided → emit numeric_limits extreme + // + std::string minExpr, maxExpr; + + if (fMinIsConstant) { + minExpr = ToStringHighPrec(fMin); + } else if (fHasMin) { + minExpr = "tensor_" + fNMin + "[0]"; // scalar input tensor + } else { + // No lower bound — use lowest representable value + minExpr = "std::numeric_limits<" + TensorType::Name() + + ">::lowest()"; + } + + if (fMaxIsConstant) { + maxExpr = ToStringHighPrec(fMax); + } else if (fHasMax) { + maxExpr = "tensor_" + fNMax + "[0]"; + } else { + // No upper bound — use max representable value + maxExpr = "std::numeric_limits<" + TensorType::Name() + + ">::max()"; + } + + auto tensorValue = [](const std::string & name, const std::string & index) { + std::stringstream s; + s << "tensor_" << name << "[" << index << "]"; + return s.str(); + }; + + // ---- flat element loop (identical structure to Selu) ------------- + out << SP << "for (int id = 0; id < " << length << " ; id++) {\n"; + std::string firstExpr = fHasMax ? "std::min(" + maxExpr + ", " + tensorValue(fNX, "id") + ")" : tensorValue(fNX, "id"); + std::string secondExpr = fHasMin ? 
"std::max(" + minExpr + ", " + firstExpr + ")" : firstExpr; + out << SP << SP << tensorValue(fNY, "id") << " = " << secondExpr << ";\n"; + out << SP << "}\n"; + + return out.str(); + } + + +private: + + // Helper: convert a T value to string with enough precision + std::string ToStringHighPrec(T val) const { + std::ostringstream ss; + ss << std::setprecision(std::numeric_limits::max_digits10) << val; + // add dot if missing + if (ss.str().find(".") == std::string::npos) ss << "."; + // append 'f' suffix for float literals so generated code compiles + // cleanly without implicit double→float conversion warnings + if (std::is_same::value) ss << "f"; + return ss.str(); + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_CLIP \ No newline at end of file diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx b/core/inc/SOFIE/ROperator_Comparision.hxx similarity index 57% rename from src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx rename to core/inc/SOFIE/ROperator_Comparision.hxx index 7648a9a..1e02d53 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx +++ b/core/inc/SOFIE/ROperator_Comparision.hxx @@ -1,4 +1,3 @@ - #ifndef SOFIE_ROperator_Comparision #define SOFIE_ROperator_Comparision @@ -73,30 +72,26 @@ public: ROperator_Comparision(){} ROperator_Comparision(const std::string & nameX1, const std::string & nameX2, const std::string & nameY): fNX1(UTILITY::Clean_name(nameX1)), fNX2(UTILITY::Clean_name(nameX2)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::COMPARISON; fInputTensorNames = { fNX1, fNX2 }; - - // output will be a boolean vector so should not be considered for memory optimized pool fOutputTensorNames = { fNY }; } - // type of output given input std::vector TypeInference(std::vector input) override { return input; } - // shape of output tensors given input tensors std::vector> ShapeInference(std::vector> input) override { - auto ret = input; // return vector size 1 with first input + auto ret = input; return ret; 
} void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNX1)){ - throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNX2)) { - throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model"); } fShapeX1 = model.GetTensorShape(fNX1); fShapeX2 = model.GetTensorShape(fNX2); @@ -104,38 +99,34 @@ public: fTensorType2 = model.GetTensorType(fNX2); bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2); if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); + // ONNX comparison ops support multidirectional broadcasting (numpy semantics): + // both inputs can be broadcast to the common output shape. 
+ auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeX1, fShapeX2); + fShapeY = ret.second; bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); - // Broadcast A to Y if (broadcastX1) { if (model.IsInitializedTensor(fNX1)) { auto data = model.GetInitializedTensorData(fNX1); std::shared_ptr broadcastedData( UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX1, fShapeY), std::default_delete()); - // Update the data and the shape of A model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData); fShapeX1 = fShapeY; } else { - // Add an intermediate tensor for broadcasting A fNBroadcastedX1 = "Broadcasted" + fNX1; model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY); } } - // Broadcast B to Y if (broadcastX2) { if (model.IsInitializedTensor(fNX2)) { auto data = model.GetInitializedTensorData(fNX2); std::shared_ptr broadcastedData( UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX2, fShapeY), std::default_delete()); - // Update the data and the shape of B model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData); fShapeX2 = fShapeY; } else { - // Add an intermediate tensor for broadcasting B fNBroadcastedX2 = "Broadcasted" + fNX2; model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY); } @@ -143,8 +134,7 @@ public: } else { fShapeY = fShapeX1; } - // case of constant tensors - if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2) ) { + if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2)) { fIsOutputConstant = true; auto data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); auto data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); @@ -158,9 +148,8 @@ public: << ConvertValuesToString(length,outData) << std::endl; delete [] outData; } else { - model.AddIntermediateTensor(fNY, ETensorType::BOOL , 
fShapeY); + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); } - // check if this is not output operators to add a specific line for definining the tensor_xxx variable const auto & outputTensorNames = model.GetOutputTensorNames(); fIsModelOutput = false; if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end()) @@ -170,14 +159,12 @@ public: std::string Generate(std::string OpName) override { if (fIsOutputConstant) return ""; OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate without being initialized first"); + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); } std::stringstream out; out << SP << "\n//------ " << ComparisionTrait::Name() << "\n"; size_t length = ConvertShapeToLength(fShapeY); - // Broadcast A if it's uninitialized if (!fNBroadcastedX1.empty()) { std::string type1 = ConvertTypeToString(fTensorType1); out << SP << "// Broadcasting uninitialized tensor " << fNX1 << "\n"; @@ -187,7 +174,6 @@ public: out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } - // Broadcast B if it's uninitialized if (!fNBroadcastedX2.empty()) { std::string type2 = ConvertTypeToString(fTensorType2); out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n"; @@ -199,14 +185,126 @@ public: } const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1; const std::string& nameX2 = fNBroadcastedX2.empty()? 
fNX2 : fNBroadcastedX2; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") << " ;\n"; out << SP << "}\n"; - // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector if (!fIsModelOutput) out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); + + const std::size_t D = fShapeY.size(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::vector shapeX1_padded(D, 1); + std::vector shapeX2_padded(D, 1); + { + size_t off1 = D - fShapeX1.size(); + for (size_t i = 0; i < fShapeX1.size(); ++i) + shapeX1_padded[off1 + i] = fShapeX1[i]; + size_t off2 = D - fShapeX2.size(); + for (size_t i = 0; i < fShapeX2.size(); ++i) + shapeX2_padded[off2 + i] = fShapeX2[i]; + } + + auto stridesX1 = UTILITY::ComputeStrideFromShape(shapeX1_padded); + auto stridesX2 = UTILITY::ComputeStrideFromShape(shapeX2_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + std::string type1 = ConvertTypeToString(fTensorType1); + std::string type2 = ConvertTypeToString(fTensorType2); + std::string kname = "ComparisonKernel_" + opName; + std::string opname = ComparisionTrait::Name(); + + std::string op; + op = "\n//------ " + opname + "_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + type1 + " const* __restrict__ x1,\n"; + op += SP + SP + SP + type2 + " const* __restrict__ x2,\n"; + 
op += SP + SP + SP + "uint8_t* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const x1_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX1_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX1[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const x2_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX2_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX2[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = "+ ComparisionTrait::Op("x1[x1_idx]" , "x2[x2_idx]") + " ;\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "ComparisonKernel_" + opName; + return SP + kname + " comparisonKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "comparisonKernel_" + opName; + + std::stringstream out; + out << "\n//------ " << ComparisionTrait::Name() << "_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX1 << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNX2 << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); } diff --git a/core/inc/SOFIE/ROperator_Concat.hxx b/core/inc/SOFIE/ROperator_Concat.hxx new file mode 100644 index 0000000..36ede27 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Concat.hxx @@ -0,0 +1,505 @@ +#ifndef SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat + + + #include "SOFIE/SOFIE_common.hxx" + 
#include "SOFIE/ROperator.hxx" + #include "SOFIE/RModel.hxx" + + #include + #include + #include + #include + #include + + namespace SOFIE{ + + class ROperator_Concat final : public ROperator + { + private: + int fAxis=0; + int fnewAxis=0; + std::vector fInputs; + std::string fOutput; + std::vectorfOutputShape; + std::vector fOutputShapeData; // in case output is a shape tensor we store here the output shape value data (can be parametric) + std::vector> fInputShapes; + ETensorType fInputType = ETensorType::UNDEFINED; + + public: + + ROperator_Concat(){} + ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): + fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { + fInputs.reserve(inputs.size()); + for (auto & name : inputs) + fInputs.push_back(UTILITY::Clean_name(name)); + + fInputTensorNames.resize(fInputs.size()); + std::transform(fInputs.begin(), fInputs.end(), fInputTensorNames.begin(), + [](const std::string& s) -> std::string_view { return s; }); + fOutputTensorNames = { fOutput }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + // get shape of output given inputs. 
It is going to be called after initialized + std::vector> ShapeInference(std::vector> inputs) override { + std::vector> ret(1); + // treat negative axis case + if (fAxis<0) { + fAxis = inputs[0].size()+fAxis; + } + if (fAxis < 0 || fAxis >= (int) inputs[0].size()) + throw std::runtime_error("SOFIE Concat Op - invalid axis value "); + + int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence + if(fnewAxis == 0){ + for (size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i - 1].size()) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + + ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int)iaxis == fAxis) + concat_dim += inputs[i][iaxis]; + else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis]) + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertShapeToString(inputs[i]) + " and " + + ConvertShapeToString(inputs[i - 1])); + } + } + + // output shape + ret[0] = inputs[0]; + ret[0][fAxis] = concat_dim; + } + std::vector stack; + // case ConCatFromSequence + if(fnewAxis == 1){ + for(size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i-1].size() ) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + + ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int) iaxis == fAxis) + stack.push_back(inputs[i][iaxis]); + else + if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis]) + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1])); + } + + } + for(auto it:stack) + ret[0].push_back(it); + } + + return ret; + } + + // get shape of output given 
inputs. It is going to be called after initialized + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); + // treat negative axis case + if (fAxis<0) { + fAxis = inputs[0].size()+fAxis; + } + if (fAxis < 0 || fAxis >= (int) inputs[0].size()) + throw std::runtime_error("SOFIE Concat Op - invalid axis value "); + + Dim concat_dim; + if(fnewAxis == 0){ + for (size_t i = 0; i < inputs.size(); i++) { + if (i > 0 && inputs[i].size() != inputs[i - 1].size()) + throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + + ConvertDimShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDimShapeToString(inputs[i - 1])); + for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { + if ((int)iaxis == fAxis) { + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; + } + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { + throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " + + ConvertDimShapeToString(inputs[i]) + " and " + + ConvertDimShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = 
std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; + } + + // output shape for concatenated axis + ret[fAxis] = concat_dim; + + } + // case of stacking (not supported yet) + // here we need to check that input shapes are the same + // for example for fAxis == 0 + // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....] + if(fnewAxis == 1){ + throw std::runtime_error("SOFIE Concat Op - stacking (i.e. COncatFromSequence with new_axis=1) is not supported "); + } + return ret; + } + + void Initialize(RModel& model) override { + std::vector> inputIntShapes; + for (auto &it : fInputs) { + if (model.CheckIfTensorAlreadyExist(it) == false) { + throw std::runtime_error("SOFIE Concat Op Input Tensor " + it + " is not found in model"); + } + fInputShapes.push_back(model.GetDimTensorShape(it)); + if (!model.IsDynamicTensor(it)) { + inputIntShapes.push_back(ConvertShapeToInt(fInputShapes.back())); + } + } + if (inputIntShapes.size() == fInputs.size()) { + // if all input shapes are static we can compute output shape at initialization time + auto outputIntShape = ShapeInference(inputIntShapes)[0]; + fOutputShape = ConvertShapeToDim(outputIntShape); + if (model.Verbose()) + std::cout << "Initialize Concat operator with defined inputs shapes, " + << "output has shape " << ConvertShapeToString(outputIntShape) << std::endl; + + } else { + // if at least one input shape is dynamic we need to compute output shape using the symbolic expression for the dimensions + fOutputShape = ShapeInference(fInputShapes, model); + if (model.Verbose()) + std::cout << "Initialize Concat operator with dynamic inputs shapes, " + << "output has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; + } + + // check if 
concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; + + // if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { + fIsOutputConstant = true; + isOutputShape = true; + + for (auto &input : fInputs) { + if (model.IsDynamicTensor(input)) { + fIsOutputConstant = false; + isOutputShape = false; + break; + } + if (!model.IsInitializedTensor(input)) { + if (model.IsShapeTensor(input)) { + // if it is a shape tensor we can have constant output if the shapes are defined) + auto shapeData = model.GetShapeTensorValues(input); + bool isShapeFullyDefined = ConvertShapeToInt(shapeData).size() == shapeData.size(); + if (!isShapeFullyDefined) { + fIsOutputConstant = false; + } else { + // if shape is fully defined we can consider output as constant and we can compute the output + // shape at initialization time + fIsOutputConstant = fIsOutputConstant && true; + } + // inputs are then shape tensors and output is a shape tensor + isOutputShape = true; + } else { + // case of standard intermediate tensor + fIsOutputConstant = false; + isOutputShape = false; + break; + } + } else { + fIsOutputConstant = fIsOutputConstant && true; + } + } + //} + + if (fIsOutputConstant) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for (auto &input : fInputs) { + auto inputData = static_cast(model.GetInitializedTensorData(input).get()); + auto inputShape = model.GetTensorShape(input); // shape is not dynamic if it is constant + size_t inputLength = ConvertShapeToLength(inputShape); + std::copy(inputData, inputData + inputLength, outputData.begin() + offset); + offset += inputLength; + // the data of the input tensor don't need to be written in the generated code and data file + model.SetNotWritableInitializedTensor(input); + } + model.AddConstantTensor(fOutput, outputShape, outputData.data()); + if 
(model.Verbose()) { + std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertValuesToString(outputData) << " (constant)" << std::endl; + } + } else if (isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + if (outputShape.size() != 1) + throw std::runtime_error("SOFIE Concat Op - output shape for shape tensor must have rank 1"); + // output shape is a rank 1 tensor with size equal to the output rank + std::vector outputData(outputShape[0]); + size_t offset = 0; + for (auto &input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar + if (model.IsShapeTensor(input)) { + inputData = model.GetShapeTensorValues(input); + } else if (model.IsInitializedTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{static_cast(intData[i])}; + } else { + // this should not happen + throw std::runtime_error("SOFIE Concat Operator- invalid tensor input " + input + + " for shape output type"); + } + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput, outputData, false); // cannot be a scalar + fOutputShapeData = outputData; + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertDimShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputParamShape = true; + } + if (!fIsOutputConstant && !fIsOutputParamShape) { + fInputType = model.GetTensorType(fInputs[0]); + model.AddIntermediateTensor(fOutput, fInputType, fOutputShape); + if (model.Verbose()) { + std::cout << "Concat ---> " << fOutput << " " << 
ConvertDimShapeToString(fOutputShape) << std::endl; + } + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << "\n"; + + if (fIsOutputConstant) return out.str(); + + if (fIsOutputParamShape) { + // output is a shape tensor defined by the concatenation of the input shapes + out << "// output is a shape tensor defined by the concatenation of the input shapes\n"; + for (int i = 0; i < static_cast(fOutputShape + [0].dim); i++) { + out << SP << "tensor_" << fOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n"; + } + return out.str(); + } + // special case when memory is contiguous + bool hasShapeOnes = true; + for(int i = 0; i 0) + out << offset; + offset += " + " + length; + out << ", " << "tensor_" << fInputs[i] << ", " + length << ");\n"; + } + } + else { + + std::vector outStride = UTILITY::ComputeStrideFromShape(fOutputShape); + std::vector> inStrides(fInputs.size()); + int idx = 0; + for ( auto &s : inStrides) { + s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]); + idx++; + } + for (int i = 0; i < fAxis; ++i) { + // loop on dimensions + out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n"; + } + + out << SP << SP << SP << "int idxOut = "; + for (int k = 0; k < fAxis; k++) { + if (k > 0) out << " + "; + out << outStride[k].GetVal() << "*i" << k; + } + out << ";\n"; + + for (size_t j = 0; j < fInputs.size(); j++) { + if (j>0) + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; + out << SP << SP << SP << "int idxIn" << j <<" = "; + for (int k = 0; k < fAxis; k++) { + if (k > 0) out << " + "; + out << inStrides[j][k].GetVal() << "*i" << k; + } + out << ";\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP 
<< SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; + out << SP << SP << SP << "}\n"; + // concatenate the axis values + } + for (int i = 0; i < fAxis; ++i) { + out << SP << "}\n"; + } + } + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant || fIsOutputParamShape) return ""; + opName = "op_" + opName; + if (fOutputShape.empty()) + throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first"); + + const std::size_t D = fOutputShape.size(); + const std::size_t Nin = fInputs.size(); + + auto outStrides = UTILITY::ComputeStrideFromShape(fOutputShape); + + std::vector prefix(Nin); + prefix[0] = 0; + for (std::size_t k = 1; k < Nin; ++k) + prefix[k] = prefix[k - 1] + std::stoul(fInputShapes[k - 1][fAxis].GetVal()); + + std::vector> inStrides(Nin); + for (std::size_t k = 0; k < Nin; ++k) + inStrides[k] = UTILITY::ComputeStrideFromShape(fInputShapes[k]); + + std::string op; + op = "\n//------ CONCAT_KERNEL_ALPAKA\n"; + op += SP + "struct ConcatKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "std::array inputs,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "std::size_t remaining;\n"; + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string stride_val = 
outStrides[d].GetVal(); + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = remaining / " + stride_val + "u;\n"; + op += SP + SP + SP + SP + "remaining -= out_" + std::to_string(d) + + " * " + stride_val + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t chosen = 0;\n"; + for (std::size_t k = 0; k < Nin; ++k) { + std::size_t end_k = prefix[k] + std::stoul(fInputShapes[k][fAxis].GetVal()); + op += SP + SP + SP + SP + "chosen += static_cast(" + + std::to_string(end_k) + "u <= out_" + std::to_string(fAxis) + ");\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const output_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + "out_" + std::to_string(d) + + " * " + outStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t k = 0; k < Nin; ++k) { + op += SP + SP + SP + SP + SP + "(chosen == " + std::to_string(k) + "u) * (\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " - " + std::to_string(prefix[k]) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + SP + coord + + " * " + inStrides[k][d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : "\n"; + } + op += SP + SP + SP + SP + SP + ")"; + op += (k + 1 < Nin) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[output_idx] = inputs[chosen][input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant || fIsOutputParamShape) return ""; + opName = "op_" + opName; + return SP + "ConcatKernel_" + opName + " concatKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fIsOutputConstant || fIsOutputParamShape) return ""; + OpName = "op_" + OpName; + if (fOutputShape.empty()) { + throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fOutputShape); + out << "\n//------ CONCAT_GPU_ALPAKA\n"; + switch (fInputType){ + case ETensorType::FLOAT: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + case ETensorType::INT64: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + default: + throw std::runtime_error("Data type for Concat operator is not yet supported."); + } + for(size_t i=0; i0) out << ", "; + out << "alpaka::getPtrNative(deviceBuf_" << fInputs[i] << ")"; + } + out << "};\n"; + + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << OpName + << ", concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + }; + }//SOFIE + + + #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/core/inc/SOFIE/ROperator_Constant.hxx similarity index 64% rename from src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx rename to core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..4fea387 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/core/inc/SOFIE/ROperator_Constant.hxx @@ -18,6 +18,7 @@ private: std::string fNX; std::string fNY; std::vector fShape; + std::vector fDimShape; // used for dynamic ConstantOfShape std::vector fValues; std::string fAttrType; bool fIsConstantOfShape = false; @@ -52,15 +53,35 @@ public: // case of ConstantOfShape (since no inputs in case of Constant operator) fIsConstantOfShape = true; if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor is not found in model"); + } + if (model.IsShapeTensor(fNX)) { + // Input is a shape tensor (symbolic dimensions) — output will be a dynamic tensor + // whose shape is determined at runtime from the symbolic values. + const auto & dimVals = model.GetShapeTensorValues(fNX); + std::vector outShape; + for (const auto & d : dimVals) + outShape.push_back(d); + if (fValues.size() != 1) + throw std::runtime_error("SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size())); + // Register as a dynamic intermediate tensor — values will be filled at runtime + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), outShape); + // Store shape for code generation (use fShape for rank, values = 0 for symbolic dims) + fShape.resize(outShape.size()); + for (size_t i = 0; i < outShape.size(); i++) + fShape[i] = outShape[i].isParam ? 
0 : outShape[i].dim; + // Store symbolic lengths/shape for Generate() + fDimShape = outShape; + fIsOutputConstant = false; // cannot be constant since shape is dynamic + return; } // get output shape from input values: - // can work only if input is a constant or initialized tensor (or dynamic one) + // can work only if input is a constant or initialized tensor auto dptr = model.GetInitializedTensorData(fNX); auto input_tensor = static_cast(dptr.get()); auto input_shape = model.GetTensorShape(fNX); if (input_shape.size() > 1 ) - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor has invalid shape"); + throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor has invalid shape"); if (input_tensor != nullptr && !input_shape.empty()) { fShape = std::vector (input_shape[0]); for (size_t i = 0; i < fShape.size(); i++) @@ -70,7 +91,7 @@ public: length = ConvertShapeToLength(fShape); if (fValues.size() != 1) - throw std::runtime_error("TMVA SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size())); + throw std::runtime_error("SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size())); T value = fValues[0]; fValues = std::vector(length, value); @@ -80,7 +101,7 @@ public: // in case of standard constant the shape is provided as input length = ConvertShapeToLength(fShape); if (length != fValues.size()) - throw std::runtime_error("TMVA SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) + + throw std::runtime_error("SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) + " with " + std::to_string(fValues.size()) + " values"); } @@ -101,6 +122,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. 
Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx new file mode 100644 index 0000000..835a0ff --- /dev/null +++ b/core/inc/SOFIE/ROperator_Conv.hxx @@ -0,0 +1,999 @@ +#ifndef SOFIE_SOFIE_ROPERATOR_CONV +#define SOFIE_SOFIE_ROPERATOR_CONV + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + + +namespace SOFIE { + +template +class ROperator_Conv final : public ROperator +{ +private: + bool fBroadcastBias = false; + + std::string fAttrAutopad; + std::vector fAttrDilations; + size_t fAttrGroup; + std::vector fAttrKernelShape; + std::vector fAttrPads; + std::vector fAttrStrides; + + std::string fNX; + std::string fNW; + std::string fNB; + std::string fNY; + + std::string convK; + std::string imcol; + + std::vector fShapeX; + std::vector fShapeW; + std::vector fShapeB; + std::vector fShapeY; + + std::string fType; + + size_t fDim; // dimension of the convolution + + +public: + + ROperator_Conv() {} + + ROperator_Conv(std::string autopad, std::vector dilations, + size_t group, std::vector kernelShape, std::vector pads, + std::vector strides, std::string nameX, std::string nameW, + std::string nameB, std::string nameY): + fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), + fAttrPads(pads), fAttrStrides(strides), + fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), + fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + if(std::is_same::value) { + fType = "float"; + } else { + throw + std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + } + fInputTensorNames = { fNX, fNB }; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::CONV; + } + + ROperator_Conv(std::string autopad, std::vector dilations, + 
size_t group, std::vector kernelShape, std::vector pads, + std::vector strides, std::string nameX, std::string nameW, + std::string nameY): + fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), + fAttrPads(pads), fAttrStrides(strides), + fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY)) + { + if(std::is_same::value) { + fType = "float"; + } else { + throw + std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + } + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + fKind= OperatorKind::CONV; + } + + std::vector TypeInference(std::vector input) override { + ETensorType out = input[0]; + return {out}; + } + + // function returning output shape given input + std::vector DoShapeInference(const std::vector & input, const std::vector & weight) { + // shape of convolution input has to be (according to ONNX): N x C x H x W + // Where N : batch size, C : input channels, H : input height, W : input width + + if (input.size() -2 != fDim) { + throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid input "); + } + if (weight.size() -2 != fDim) { + throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid weights "); + } + if (fAttrGroup == 0 && input[1].isParam) + throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without group attr"); + if (fAttrKernelShape.empty()) { + if (input[2].isParam || (fDim > 1 && input[3].isParam) || (fDim > 2 && input[4].isParam)) + throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without kernel attr"); + } + + if (fAttrGroup == 0) { + fAttrGroup = input[1].dim / weight[1]; + } + + // kernel shape + size_t k1 = ((fAttrKernelShape.empty())? weight[2] : fAttrKernelShape[0]); + size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? weight[3] : fAttrKernelShape[1]) : 1; + size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? 
weight[4] : fAttrKernelShape[2]) : 1; + + + size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1; + size_t i2 = (fDim > 2) ? 4 : 3; + size_t i3 = 5; + + if (fAttrDilations.empty()) { + fAttrDilations = {1, 1, 1}; + } + fAttrDilations.resize(3); + if (fDim < 3) { + fAttrDilations.resize(3, 1); + } + // Shape of the kernel + fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1), + k2 + (fAttrDilations[1] - 1) * (k2 - 1), + k3 + (fAttrDilations[2] - 1) * (k3 - 1)}; + + if (fAttrStrides.empty()) { + fAttrStrides = {1, 1, 1}; + } + if (fDim < 3) + fAttrStrides.resize(3, 1); + + if (fAttrAutopad == "NOTSET") { + if (fAttrPads.empty()) { + fAttrPads = {1, 1, 1, 1, 1, 1}; + } + } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") { + for (size_t d = 0; d < fDim; ++d) { + if (input[d + 2].isParam) + throw std::runtime_error( + "TMVA SOFIE Conv Op: SAME padding with parametric input shape is not supported"); + } + // ONNX SAME padding: total_pad = max(0, (ceil(in/stride)-1)*stride + kernel - in) + // SAME_UPPER places extra padding at end, SAME_LOWER at beginning + fAttrPads.assign(6, 0); + for (size_t d = 0; d < fDim; ++d) { + size_t inSize = input[d + 2].dim; + size_t stride_d = fAttrStrides[d]; + size_t outSize = (inSize + stride_d - 1) / stride_d; + int totalPad = std::max(0, (int)((outSize - 1) * stride_d + fAttrKernelShape[d]) - (int)inSize); + if (fAttrAutopad == "SAME_UPPER") { + fAttrPads[d] = (size_t)(totalPad / 2); + fAttrPads[d + fDim] = (size_t)(totalPad - totalPad / 2); + } else { + fAttrPads[d] = (size_t)(totalPad - totalPad / 2); + fAttrPads[d + fDim] = (size_t)(totalPad / 2); + } + } + } else if (fAttrAutopad != "VALID") { + throw + std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad"); + } + // to be sure pad is vector of size 6 + if (fDim < 3) fAttrPads.resize(6, 0); + + Dim input1 = input[2]; + Dim input2 = (fDim > 1) ? input[3] : Dim{1}; + Dim input3 = (fDim > 2) ? 
input[4] : Dim{1}; + + size_t pad1 = fAttrPads[0] + fAttrPads[i1]; + + // function to get output dimension of convolution given input + + auto computeOutput = [&](Dim inputDim, size_t kernel, size_t pad, size_t stride) { + if (!inputDim.isParam) { + size_t outSize = (inputDim.dim + pad - kernel) / stride + 1; + return Dim{outSize}; + } else { + if (stride == 1){ + if ((pad - kernel + 1) == 0 ) + // output is same as input + return inputDim; + else { + int64_t v = pad - kernel + 1; + std::string outStr = "(" + inputDim.param + "+" + std::to_string(v) + ")"; + return Dim{ outStr, static_cast(-1)}; + } + } else { // general case (stride not 1) + int64_t v = pad - kernel; + std::string outStr = "((" + inputDim.param + "+" + std::to_string(v) + ")/" + + std::to_string(stride) + "1)"; + return Dim{ outStr, static_cast(-1)}; + } + } + throw std::runtime_error("TMVA SOFIE Conv Op - invalid values"); + return Dim{}; + }; + + Dim output1 = computeOutput(input1, fAttrKernelShape[0], pad1, fAttrStrides[0]); + + Dim batch_size = input[0]; // first element in input tensor + Dim output_channels = Dim{weight[0]}; // first element in weight tensor + + std::vector ret({ batch_size, output_channels, output1 }); + + if (fDim == 1) + return ret; + + size_t pad2 = fAttrPads[1] + fAttrPads[i2]; + Dim output2 = computeOutput(input2, fAttrKernelShape[1], pad2, fAttrStrides[1]); + + // output is N x M x OH x OW + ret.push_back(output2); + if (fDim == 2) + return ret; + + size_t pad3 = fAttrPads[2] + fAttrPads[i3]; + Dim output3 = computeOutput(input3, fAttrKernelShape[2], pad3, fAttrStrides[2]); + + // output is N x M x OH x OW x OD + ret.push_back(output3); + return ret; + } + + void Initialize(RModel& model) override { + fUseSession = model.UseSession(); + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (fShapeX.size() < 3 || fShapeX.size() > 
5) { + std::cout << fNX << " : " << ConvertDimShapeToString(fShapeX) << std::endl; + throw + std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); + } + fDim = fShapeX.size() - 2; + if (!model.CheckIfTensorAlreadyExist(fNW)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); + } + fShapeW = model.GetTensorShape(fNW); + if (fShapeW.size() < 3 || fShapeW.size() > 5) { + std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; + throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); + } + fShapeY = DoShapeInference(fShapeX, fShapeW); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + if (fNB != "") { + if (!model.CheckIfTensorAlreadyExist(fNB)) { + throw + std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); + } + fShapeB = model.GetTensorShape(fNB); + if (fShapeB.size() != 1) + throw + std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)"); + std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); + auto shapeDimB = model.GetDimTensorShape(fNB); + bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape); + if (broadcast_needed) { + auto original_data = model.GetInitializedTensorData(fNB); + // make bias shape equal to Y shape by adding 1 + if (fShapeB.size() < 1) + throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape"); + // we assume bias tensor dimension is equal to number of filters that is the second dimension in + // the output tensor + if (!(shapeDimB[0] == fShapeY[1])) + throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " + + ConvertShapeToString(fShapeB)); + if (fType != "float") + throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); + // here is the actual broadcasting + fBroadcastBias 
= true; + if (!fUseSession) { + // do here broadcasting + std::vector shape(fDim + 1, 1); + shape[0] = fShapeB[0]; + auto intTargetShape = ConvertShapeToInt(targetShape); + std::shared_ptr new_data_ptr( + UTILITY::UnidirectionalBroadcast(static_cast(original_data.get()), shape, intTargetShape), + std::default_delete()); + model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr); + fShapeB = model.GetTensorShape(fNB); + } + } + } + // output channel size can be parametric and is an expression + std::vector outputDims = std::vector(fShapeY.begin()+2, fShapeY.end()); + //check if shape is not parametric + std::vector outputInts = ConvertShapeToInt(outputDims); + Dim channelDim; + if (outputInts.empty()) { + auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + channelDim = Dim{ outputChannelSize, static_cast(-1)}; + } else { + size_t outputChannelSize = ConvertShapeToLength(outputInts); + channelDim = Dim{ outputChannelSize }; + } + size_t kernelSize = fAttrKernelShape[0]; + for (size_t i = 1; i < fDim; i++) { + kernelSize *= fAttrKernelShape[i]; + } + + std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; + std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; + model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); + model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); + convK = fNX +"_f"; + imcol = fNX +"_xcol"; + fOutputTensorNames.emplace_back(convK); + fOutputTensorNames.emplace_back(imcol); + fInputTensorNames.emplace_back(convK); + fInputTensorNames.emplace_back(imcol); + + if (model.Verbose()) { + std::cout << "Conv - " << fDim << " " << fNX << " : " << ConvertDimShapeToString(fShapeX) + << " --> " << fNY << " : " << ConvertDimShapeToString(fShapeY) << std::endl; + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + // Generate initialization code for broadcasting of bias tensor + if 
(fBroadcastBias) { + // include a separate scope to avoid defining unique operator temp variables + std::vector shape(fDim + 1, 1); + // bias (is a 1D tensor) + shape[0] = fShapeB[0]; + std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); + out << "//--- broadcast bias tensor " << fNB << "for Conv op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(targetShape).empty(); + auto length = ConvertDimShapeToLength(targetShape); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n"; + else + out << SP << "{\n"; + out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertDimShapeToString(fShapeY) << ");\n"; + out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n"; + out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; + out << SP << SP << "delete[] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + + if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { + throw + std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + } + + std::stringstream out; + auto bsize = fShapeX[0]; + size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; // kernel depth + size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; // kernel height + size_t kWidth = fShapeW[fDim+1]; // kernel width + auto iDepth = (fDim > 2) ? fShapeX[2] : Dim{1}; // input depth + auto iHeight = (fDim > 1) ? fShapeX[fDim] : Dim{1}; // input height + auto iWidth = fShapeX[fDim+1]; // input width + auto oDepth = (fDim > 2) ? fShapeY[2] : Dim{1}; // output depth + auto oHeight = (fDim > 1) ? 
fShapeY[fDim] : Dim{1}; // ouput height + auto oWidth = fShapeY[fDim+1]; // output width + // total output size for a channel + auto outputChannelStride = ConvertDimShapeToLength(std::vector{oDepth, oHeight, oWidth}); // size of channel = D * H * W + auto outputBatchStride = ConvertDimShapeToLength(std::vector{fShapeY[1] , oDepth, oHeight, oWidth}); // size of C * D * H * W + // input size + auto inputChannelStride = ConvertDimShapeToLength(std::vector{iDepth, iHeight, iWidth}); + auto inputBatchStride = ConvertDimShapeToLength(std::vector{fShapeX[1] , iDepth, iHeight, iWidth}); // size of C * D * H * W + + out << "\n//---- operator Conv " << OpName << "\n"; + + // vectorize the (dilated)convolution kernels into a matrix + // no need to transpose the matrix + // to fix for 1d and 3d + + size_t id = (fDim > 2) ? fDim-3 : 2; + size_t ih = (fDim > 1) ? fDim-2 : 1; + size_t iw = fDim-1; + + size_t wstrideDil = fAttrDilations[iw]; + size_t hstride = kWidth; + size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; // stride dilated in the height + size_t dstride = kHeight * kWidth; + size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t icstride = kHeight * kWidth * kDepth; + size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t ocstride = fShapeW[1] * icstride; + size_t ocstrideDil = fShapeW[1] * icstrideDil; + + out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n"; + out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n"; + if (fDim > 2) + out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n"; + if (fDim > 1) + out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n"; + out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n"; + + out << SP << SP << SP << SP << SP << "tensor_" < 2) out << " + kd * " << dstrideDil; + if (fDim > 1) out 
<< " + kh * " << hstrideDil; + out << " + kw * " << wstrideDil << " ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride; + if (fDim > 2) out << " + kd * " << dstride; + if (fDim > 1) out << " + kh * " << hstride; + out << " + kw ];\n"; + + out << SP << SP << SP << SP << "}\n"; + if (fDim > 1) out << SP << SP << SP << "}\n"; + if (fDim > 2) out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + + //out << SP << "char " << OpName << "_transA = 'T';\n"; + out << SP << "char " << OpName << "_transA = 'N';\n"; + out << SP << "char " << OpName << "_transB = 'N';\n"; + out << SP << "int " << OpName << "_m = " << outputChannelStride << ";\n"; // output h*w + assert(fShapeY[1] == fShapeW[0]); + //assert(fShapeW[1] == fShapeX[1] / fAttrGroup); + out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels + out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n"; + out << SP << "float " << OpName << "_alpha = 1.0;\n"; + if (fNB != "") + out << SP << "float " << OpName << "_beta = 1.0;\n"; + else // when bias is not present beta needs to be equal to zero to avoid re-using previous results in output tensor + out << SP << "float " << OpName << "_beta = 0.0;\n"; + + + // Loop on batch size + out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n"; + + // IM2COL: Unroll the input tensor + // order input data as (e.g. kernel 2x2) and (xa,ya) is channel 1 and (xb,yb) is channel 2 + // (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk) + // (xa2,...xak+1,ya1,...yak)(......) + // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows. 
+ // By doing this one has consecutive memory reads and writes + // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w) + if (fDim ==1) { + if (fAttrPads[0] != fAttrPads[1] ) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " + << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; + } + fAttrPads[1] = 0; + fAttrStrides[1] = 1; + } + if (fDim == 2) { + if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; + fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; + } + } + if (fDim == 3) { + if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { + std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; + fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; + fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; + fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2; + } + } + out << SP << SP << "size_t out_offset = n * " << outputBatchStride << ";\n"; + + if (fAttrGroup == 1) { + out << SP << SP << "size_t x_offset = n * " << inputBatchStride << ";\n"; + // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y, output_h * + // output_w) + if (fDim < 3) { + out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX + << " + x_offset," + // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + // dilation_w, + // + << fShapeW[1] << "," << iHeight << "," << iWidth << ","; + if (fDim == 1) + out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," + << fAttrDilations[0]; + else // dim ==2 + out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << 
fAttrPads[1] + << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," + << fAttrDilations[1]; + out << "," << "tensor_" <(tensor_" << fNX + << " + x_offset," + // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, + // dilation_d, dilation_h, dilation_w, + // + << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," + << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," + << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << "," + << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << "," + << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," + << "tensor_" << fNX << "_xcol);\n\n "; + } + // BLAS + out << SP << "SOFIE::Gemm_Call(" + << "tensor_" << fNY << " + out_offset, false, false, " + << OpName << "_m, " << OpName << "_n, " << OpName << "_k, " + << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f, " + << OpName << "_beta, "; + if (fNB != "") + out << "tensor_" << fNB; + else + out << "nullptr"; + out << ");\n"; + + + // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" + // << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName + // << "_m,\n"; // use m if op_xcol is not transpose , otherwise k + // out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY + // << " + out_offset, &" << OpName << "_m);\n"; + } else { + // case of group convolution + // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each + // group) + // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; + out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n"; + out << SP << SP << "size_t x_offset = n * " << 
inputBatchStride << " + g * " + << fShapeW[1] << " * " << inputChannelStride << ";\n "; + out << SP << SP << "size_t g_offset = g * " << fShapeW[0] << " * (" << outputChannelStride << ") / " << fAttrGroup << ";\n "; + out << SP << SP << "size_t out_offset = n * " << outputBatchStride << " + g_offset;\n"; + + if (fDim < 3) { + out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX + << " + x_offset," + // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + // dilation_w, + // + << fShapeW[1] << "," << iHeight << "," << iWidth << ","; + if (fDim == 1) + out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," + << fAttrDilations[0]; + else // dim ==2 + out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] + << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," + << fAttrDilations[1]; + out << ", tensor_" << fNX << "_xcol);\n\n "; + } else { + // 3d im2col + out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX + << " + x_offset," + // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, + // dilation_d, dilation_h, dilation_w, + // + << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << "," + << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] + << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] + << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX + << "_xcol);\n\n "; + } + + // BLAS + // n must be divided by the number of groups + out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n"; + // offset g must be g * k * n + out << SP << SP << SP << "size_t offset_f = g * " + << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * 
fAttrKernelShape[2] / fAttrGroup + << ";\n"; + + out << SP << "SOFIE::Gemm_Call(" + << "tensor_" << fNY << " + out_offset, false, false, " + << OpName << "_m, " << OpName << "_n, " << OpName << "_k, " + << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f + offset_f, " + << OpName << "_beta, "; + if (fNB != "") + out << "tensor_" << fNB << " + g_offset"; + else + out << "nullptr"; + out << ");\n"; + out << SP << SP << "}\n"; // end of group loop + } + out << SP << "}\n"; // end of batch size loop + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + + size_t oDepth = (fDim > 2) ? fShapeY[2].dim : 1; + size_t oHeight = (fDim > 1) ? fShapeY[fDim].dim : 1; + size_t oWidth = fShapeY[fDim + 1].dim; + size_t iDepth = (fDim > 2) ? fShapeX[2].dim : 1; + size_t iHeight = (fDim > 1) ? fShapeX[fDim].dim : 1; + size_t iWidth = fShapeX[fDim + 1].dim; + size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; + size_t kWidth = fShapeW[fDim + 1]; + size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; + + size_t kernelSize = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + size_t colRows = fShapeW[1] * kernelSize; + size_t colCols = oDepth * oHeight * oWidth; + size_t colElements = colRows * colCols; + size_t outChannels = fShapeW[0]; + size_t spatialSize = oDepth * oHeight * oWidth; + + // Strides for weight vectorisation + size_t id = (fDim > 2) ? fDim - 3 : 2; + size_t ih = (fDim > 1) ? 
fDim - 2 : 1; + size_t iw = fDim - 1; + size_t wstrideDil = fAttrDilations[iw]; + size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; + size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; + size_t ocstrideDil = fShapeW[1] * icstrideDil; + size_t hstride = kWidth; + size_t dstride = kHeight * kWidth; + size_t icstride = kHeight * kWidth * kDepth; + size_t ocstride = fShapeW[1] * icstride; + size_t wTotalElements = ConvertShapeToLength(fShapeW); + + std::string op; + + // Kernel 1: Weight vectorisation — reorder W into _f with dilation layout + // Each thread handles one output element of _f + std::string wKname = "WeightVecKernel_" + opName; + op = "\n//------ WEIGHT_VEC_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + wKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ W,\n"; + op += SP + SP + SP + "T* __restrict__ f,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose elem_idx into (oc, ic, kd, kh, kw) using compile-time strides + op += SP + SP + SP + SP + "std::size_t const oc = elem_idx / " + std::to_string(ocstride) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const oc_rem = elem_idx % " + std::to_string(ocstride) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ic = oc_rem / " + std::to_string(icstride) + "u;\n"; + op += SP + SP + SP + 
SP + "std::size_t const ic_rem = oc_rem % " + std::to_string(icstride) + "u;\n"; + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const kd = ic_rem / " + std::to_string(kHeight * kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = (ic_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = ic_rem / " + std::to_string(kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = ic_rem;\n\n"; + } + + // Compute destination index in _f (dilated layout) + op += SP + SP + SP + SP + "std::size_t const f_idx =\n"; + op += SP + SP + SP + SP + SP + "oc * " + std::to_string(ocstrideDil) + "u +\n"; + op += SP + SP + SP + SP + SP + "ic * " + std::to_string(icstrideDil) + "u"; + if (fDim > 2) op += " +\n" + SP + SP + SP + SP + SP + "kd * " + std::to_string(dstrideDil) + "u"; + if (fDim > 1) op += " +\n" + SP + SP + SP + SP + SP + "kh * " + std::to_string(hstrideDil) + "u"; + op += " +\n" + SP + SP + SP + SP + SP + "kw * " + std::to_string(wstrideDil) + "u;\n\n"; + + op += SP + SP + SP + SP + "f[f_idx] = W[elem_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + + // Kernel 2: Im2Col + std::string im2colKname = "Im2ColKernel_" + opName; + op += SP + "//------ IM2COL_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + im2colKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* 
__restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ col,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t const col_row = elem_idx / " + std::to_string(colCols) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const col_col = elem_idx % " + std::to_string(colCols) + "u;\n\n"; + + op += SP + SP + SP + SP + "std::size_t const ic = col_row / " + std::to_string(kernelSize) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const k_rem = col_row % " + std::to_string(kernelSize) + "u;\n"; + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const kd = k_rem / " + std::to_string(kHeight * kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = (k_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = k_rem / " + std::to_string(kWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const kw = k_rem;\n\n"; + } + + if (fDim > 2) { + op += SP + SP + SP + SP + "std::size_t const od = col_col / " + std::to_string(oHeight * oWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = (col_col / " + 
std::to_string(oWidth) + "u) % " + std::to_string(oHeight) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n"; + } else if (fDim > 1) { + op += SP + SP + SP + SP + "std::size_t const od = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = col_col / " + std::to_string(oWidth) + "u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const od = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const oh = 0u;\n"; + op += SP + SP + SP + SP + "std::size_t const ow = col_col;\n\n"; + } + + // Depth: trivially 0 for fDim < 3 (od=kd=0 always); pads[0] is height-begin for 2D, so + // applying it here would make id_in negative and zero the whole output. + if (fDim >= 3) { + op += SP + SP + SP + SP + "int64_t const id_in = static_cast(od * " + std::to_string(fAttrStrides[0]) + + "u + kd * " + std::to_string(fAttrDilations[0]) + "u) - " + std::to_string(fAttrPads[0]) + ";\n"; + } else { + op += SP + SP + SP + SP + "int64_t const id_in = 0;\n"; + } + // Height: for fDim==3 the height dim is at strides/pads index 1; for fDim==2 it is at index 0. + // For fDim==1 oh=kh=0 so ih_in=0. + { + size_t const hIdx = (fDim > 2) ? 1 : 0; + if (fDim >= 2) { + op += SP + SP + SP + SP + "int64_t const ih_in = static_cast(oh * " + std::to_string(fAttrStrides[hIdx]) + + "u + kh * " + std::to_string(fAttrDilations[hIdx]) + "u) - " + std::to_string(fAttrPads[hIdx]) + ";\n"; + } else { + op += SP + SP + SP + SP + "int64_t const ih_in = 0;\n"; + } + } + // Width: fAttrStrides/Dilations/Pads are ordered [d,h,w] so width is at index fDim-1. 
+ { + size_t const wIdx = fDim - 1; + op += SP + SP + SP + SP + "int64_t const iw_in = static_cast(ow * " + std::to_string(fAttrStrides[wIdx]) + + "u + kw * " + std::to_string(fAttrDilations[wIdx]) + "u) - " + std::to_string(fAttrPads[wIdx]) + ";\n\n"; + } + + op += SP + SP + SP + SP + "bool const in_bounds =\n"; + op += SP + SP + SP + SP + SP + "id_in >= 0 && id_in < " + std::to_string(iDepth) + " &&\n"; + op += SP + SP + SP + SP + SP + "ih_in >= 0 && ih_in < " + std::to_string(iHeight) + " &&\n"; + op += SP + SP + SP + SP + SP + "iw_in >= 0 && iw_in < " + std::to_string(iWidth) + ";\n\n"; + + op += SP + SP + SP + SP + "if (in_bounds) {\n"; + op += SP + SP + SP + SP + SP + "std::size_t const in_idx =\n"; + op += SP + SP + SP + SP + SP + SP + "ic * " + std::to_string(iDepth * iHeight * iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(id_in) * " + std::to_string(iHeight * iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(ih_in) * " + std::to_string(iWidth) + "u +\n"; + op += SP + SP + SP + SP + SP + SP + "static_cast(iw_in);\n"; + op += SP + SP + SP + SP + SP + "col[elem_idx] = input[in_idx];\n"; + op += SP + SP + SP + SP + "} else {\n"; + op += SP + SP + SP + SP + SP + "col[elem_idx] = static_cast(0);\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + + // Kernel 3: Bias broadcast (only if bias present) + if (!fNB.empty()) { + std::string biasKname = "BiasBroadcastKernel_" + opName; + op += SP + "//------ BIAS_BROADCAST_KERNEL_ALPAKA (Conv " + opName + ")\n"; + op += SP + "struct " + biasKname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const 
global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n"; + op += SP + SP + SP + SP + "std::size_t const channel = elem_idx / " + std::to_string(spatialSize) + "u;\n"; + op += SP + SP + SP + SP + "output[elem_idx] = bias[channel];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string op; + op = SP + "WeightVecKernel_" + opName + " weightVecKernel_" + opName + ";\n"; + op += SP + "Im2ColKernel_" + opName + " im2colKernel_" + opName + ";\n"; + if (!fNB.empty()) + op += SP + "BiasBroadcastKernel_" + opName + " biasBroadcastKernel_" + opName + ";\n"; + return op; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first"); + + size_t bsize = fShapeX[0].dim; + size_t oDepth = (fDim > 2) ? fShapeY[2].dim : 1; + size_t oHeight = (fDim > 1) ? fShapeY[fDim].dim : 1; + size_t oWidth = fShapeY[fDim + 1].dim; + size_t iDepth = (fDim > 2) ? fShapeX[2].dim : 1; + size_t iHeight = (fDim > 1) ? 
fShapeX[fDim].dim : 1; + size_t iWidth = fShapeX[fDim + 1].dim; + size_t outChannels = fShapeW[0]; + size_t kernelSize = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + // gemm dimensions computed from shape members + size_t gemm_n = outChannels; // output channels + size_t gemm_k = fShapeW[1] * kernelSize; // input channels/group * kernel volume + size_t gemm_m = oDepth * oHeight * oWidth; // output spatial size per channel + size_t colElements = gemm_k * gemm_m; // colRows * colCols + size_t wTotal = ConvertShapeToLength(fShapeW); + + // For group conv: per-group output channels and _f offset + // gemm_n stays as total output channels — we divide per group at launch + size_t groupFOffset = gemm_n * gemm_k; // elements of _f per group + + std::stringstream out; + out << "\n//------ CONV_GPU_ALPAKA\n"; + + // ----------------------------------------------------------------------- + // Step 1: Weight vectorisation kernel — runs once, fully on GPU + // ----------------------------------------------------------------------- + out << SP << "// Step 1: vectorise W -> _f on GPU (once per infer call)\n"; + out << SP << "{\n"; + out << SP << SP << "auto const elementsPerThread_wv = Vec::all(static_cast(1));\n"; + out << SP << SP << "auto const elementsPerGrid_wv = Vec::all(Idx{" << wTotal << "});\n"; + out << SP << SP << "auto const workDiv_wv = sofie_workdiv(elementsPerGrid_wv);\n"; + out << SP << SP << "alpaka::exec(queue, workDiv_wv, weightVecKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNW << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", static_cast(" << wTotal << "));\n"; + out << SP << SP << "alpaka::wait(queue);\n"; + out << SP << "}\n\n"; + + // ----------------------------------------------------------------------- + // Step 2: Batch loop + // ----------------------------------------------------------------------- + out << SP << "for (std::size_t n = 0; n < " << bsize << "; n++) {\n\n"; + out << SP << SP 
<< "std::size_t const x_offset = n * " + << fShapeX[1].dim * iDepth * iHeight * iWidth << "u;\n"; + out << SP << SP << "std::size_t const out_offset = n * " + << fShapeY[1].dim * gemm_m << "u;\n\n"; + + // ----------------------------------------------------------------------- + // Step 3 + 4: Im2Col then GEMM — structure differs for grouped vs non-grouped + // ----------------------------------------------------------------------- + if (fAttrGroup == 1) { + // Non-grouped: single im2col per batch, then GEMM + out << SP << SP << "// Step 3: im2col\n"; + out << SP << SP << "{\n"; + out << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << "auto const elementsPerGrid_im2col = Vec::all(Idx{" << colElements << "});\n"; + out << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n"; + out << SP << SP << SP << "alpaka::exec(queue, workDiv_im2col, im2colKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + x_offset" + << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", static_cast(" << colElements << "));\n"; + out << SP << SP << SP << "alpaka::wait(queue);\n"; + out << SP << SP << "}\n\n"; + + if (!fNB.empty()) { + size_t biasElements = gemm_n * gemm_m; + out << SP << SP << "// Step 4a: broadcast bias into output slice\n"; + out << SP << SP << "{\n"; + out << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << "auto const elementsPerGrid_bias = Vec::all(Idx{" << biasElements << "});\n"; + out << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n"; + out << SP << SP << SP << "alpaka::exec(queue, workDiv_bias, biasBroadcastKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset" + << ", static_cast(" << biasElements << "));\n"; + out << SP << SP << SP << 
"alpaka::wait(queue);\n"; + out << SP << SP << "}\n\n"; + out << SP << SP << "// Step 4b: GEMM beta=1 accumulates onto bias-initialised output\n"; + out << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; + } else { + out << SP << SP << "// Step 4: GEMM beta=0 (no bias)\n"; + out << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ")" + << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n"; + } + // Wait for GEMM to finish before next batch overwrites the shared _xcol buffer. + out << SP << SP << "alpaka::wait(queue);\n\n"; + + } else { + // Grouped convolution: im2col and GEMM per group with group-adjusted input pointer. + // Each group processes fShapeW[1] input channels starting at g * fShapeW[1]. 
+ out << SP << SP << "for (std::size_t g = 0; g < " << fAttrGroup << "; g++) {\n\n"; + out << SP << SP << SP << "std::size_t const g_in_offset = x_offset + g * " + << fShapeW[1] * iDepth * iHeight * iWidth << "u;\n"; + out << SP << SP << SP << "std::size_t const g_out_offset = out_offset + g * " + << gemm_n * gemm_m << "u;\n"; + out << SP << SP << SP << "std::size_t const f_offset = g * " << groupFOffset << "u;\n\n"; + + out << SP << SP << SP << "// im2col for group g (reads only this group's input channels)\n"; + out << SP << SP << SP << "{\n"; + out << SP << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << SP << "auto const elementsPerGrid_im2col = Vec::all(Idx{" << colElements << "});\n"; + out << SP << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n"; + out << SP << SP << SP << SP << "alpaka::exec(queue, workDiv_im2col, im2colKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + g_in_offset" + << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", static_cast(" << colElements << "));\n"; + out << SP << SP << SP << SP << "alpaka::wait(queue);\n"; + out << SP << SP << SP << "}\n\n"; + + if (!fNB.empty()) { + size_t groupBiasElements = gemm_n * gemm_m; + out << SP << SP << SP << "// Broadcast group bias\n"; + out << SP << SP << SP << "{\n"; + out << SP << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast(1));\n"; + out << SP << SP << SP << SP << "auto const elementsPerGrid_bias = Vec::all(Idx{" << groupBiasElements << "});\n"; + out << SP << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n"; + out << SP << SP << SP << SP << "alpaka::exec(queue, workDiv_bias, biasBroadcastKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNB << ") + g * " << gemm_n + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset" + << ", static_cast(" << groupBiasElements 
<< "));\n"; + out << SP << SP << SP << SP << "alpaka::wait(queue);\n"; + out << SP << SP << SP << "}\n\n"; + out << SP << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset" + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n"; + } else { + out << SP << SP << SP << "blas.matmul('n', 'n', " + << gemm_m << ", " << gemm_n << ", " << gemm_k + << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")" + << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset" + << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n"; + } + // Wait for GEMM to finish before next group's im2col overwrites the shared _xcol buffer. + out << SP << SP << SP << "alpaka::wait(queue);\n\n"; + out << SP << SP << "}\n"; // end group loop + } + + out << SP << "}\n"; // end batch loop + return out.str(); + } + + /*! \brief Returns the blas routines needed to compile the generated code + */ + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } + + + std::string GetBlasConfig(){ + size_t oDepth_ = (fDim > 2) ? fShapeY[2].dim : 1; + size_t oHeight_ = (fDim > 1) ? 
fShapeY[fDim].dim : 1; + size_t oWidth_ = fShapeY[fDim + 1].dim; + size_t kSize_ = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2]; + size_t gemm_n_ = fShapeW[0]; + size_t gemm_k_ = fShapeW[1] * kSize_; + size_t gemm_m_ = oDepth_ * oHeight_ * oWidth_; + auto lda = std::to_string(gemm_m_); // ld for xcol^T (gemm_m×gemm_k col-major) + auto ldb = std::to_string(gemm_k_); // ld for xf^T (gemm_k×gemm_n col-major) + auto ldc = std::to_string(gemm_m_); // ld for y^T (gemm_m×gemm_n col-major) + return std::to_string(gemm_m_) + ", " + std::to_string(gemm_n_) + ", " + std::to_string(gemm_k_) + ", " + lda + ", " + ldb + ", " + ldc + ", 'n', 'n'"; + } + +}; + +} // namespace SOFIE + +#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/core/inc/SOFIE/ROperator_ConvTranspose.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx rename to core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..5a4acf3 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include @@ -88,7 +88,7 @@ public: if (std::is_same::value) { fType = "float"; } else { - throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); + throw std::runtime_error("SOFIE Encountered unsupported type parsing a Conv operator"); } } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc b/core/inc/SOFIE/ROperator_ConvTranspose.icc similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc rename to core/inc/SOFIE/ROperator_ConvTranspose.icc index 3a52796..52b6b3e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc +++ 
b/core/inc/SOFIE/ROperator_ConvTranspose.icc @@ -105,22 +105,22 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ fUseSession = model.UseSession(); if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model"); + throw std::runtime_error("SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 3 || fShapeX.size() > 5) { std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Transpose Op input data tensor" + fNX + + throw std::runtime_error("SOFIE Conv Transpose Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); } fDim = fShapeX.size() - 2; if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); + throw std::runtime_error("SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() < 3 || fShapeW.size() > 5) { std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Transpose Op input weight tensor" + fNW + + throw std::runtime_error("SOFIE Conv Transpose Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); } fShapeY = ShapeInference({fShapeX, fShapeW})[0]; @@ -128,11 +128,11 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); if (fNB != "") { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model"); + throw std::runtime_error("SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model"); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() < 1) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor 
has empty shape"); + throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has empty shape"); size_t bsize = ConvertShapeToLength(fShapeB); size_t ysize = ConvertShapeToLength(fShapeY); @@ -143,13 +143,13 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ // we assume bias tensor size is equal to number of filters that is the second dimension in // the output tensor if (bsize != fShapeY[1] ) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has wrong shape: " + + throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has wrong shape: " + ConvertShapeToString(fShapeB)); auto original_data = model.GetInitializedTensorData(fNB); if (fType != "float") - throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported"); + throw std::runtime_error("SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported"); // here the acual broadcasting if (!fUseSession) { // Broadcast B from M to N x M x Od x Oh x Ow @@ -170,7 +170,7 @@ void ROperator_ConvTranspose::Initialize(RModel& model){ else { // bias tensor is already correct shape, no need to broadcast if (fShapeY != fShapeB) - throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" + + throw std::runtime_error("SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" + ConvertShapeToString(fShapeB)); fNBroadcastedB = fNB; } @@ -218,7 +218,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) OpName = "op_" + OpName; if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first"); } std::stringstream out; @@ -331,7 +331,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) // Resulting matrix op_xcol is (output 
channels * filter_h * filter_w , output_h * output_w) if (fDim == 1) { if (fAttrPads[0] != fAttrPads[1]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; } @@ -339,7 +339,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) } if (fDim == 2) { if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { - std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; @@ -347,7 +347,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) } if (fDim == 3) { if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { - std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assume an average padding " + std::cout << "SOFIE Operator ConvTranspose: asymmetric padding not supported. 
Assume an average padding " << std::endl; fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; @@ -385,7 +385,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) out << ", tensor_" << fNY << " + out_offset);\n\n "; } else { // 3d : needs a col2im for 3d - throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported"); + throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported"); out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX << " + x_offset," // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, @@ -436,7 +436,7 @@ std::string ROperator_ConvTranspose::Generate(std::string OpName) out << ", tensor_" << fNY << " + out_offset);\n\n "; } else { // 3d im2col - throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported"); + throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported"); out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX << " + x_offset," diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx b/core/inc/SOFIE/ROperator_Custom.hxx similarity index 92% rename from src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx rename to core/inc/SOFIE/ROperator_Custom.hxx index c24d329..fb618d4 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx +++ b/core/inc/SOFIE/ROperator_Custom.hxx @@ -48,13 +48,13 @@ public: for(auto& it:fInputNames){ if (model.CheckIfTensorAlreadyExist(it) == false){ - throw std::runtime_error("TMVA SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model"); + throw std::runtime_error("SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model"); } fInputSizes.push_back(ConvertShapeToLength(model.GetTensorShape(it))); } if(fOutputNames.size() != fOutputShapes.size()){ - throw std::runtime_error("TMVA SOFIE Custom "+ fOpName + " Op was not intialized with the names/shapes of all the output tensors"); + throw std::runtime_error("SOFIE 
Custom "+ fOpName + " Op was not initialized with the names/shapes of all the output tensors"); } for(long unsigned int i=0; i & namesX, const std::string & nameY): fNInputs(namesX.size()), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::EINSUM; for (size_t i = 0; i < namesX.size(); i++) fNInputs[i] = UTILITY::Clean_name(namesX[i]); // parse teh equations to find labels if (!ParseEquation(equation)) - throw std::runtime_error("TMVA SOFIE Einsum Op: Error parsing the equation " + equation); + throw std::runtime_error("SOFIE Einsum Op: Error parsing the equation " + equation); fInputTensorNames.resize(fNInputs.size()); std::transform(fNInputs.begin(), fNInputs.end(), fInputTensorNames.begin(), @@ -128,7 +129,7 @@ public: std::map labelsMap; for ( auto & name : fNInputs) { if (!model.CheckIfTensorAlreadyExist(name)) - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor ") + name + "is not found in model"); + throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor ") + name + " is not found in model"); // if (model.IsDynamicTensor(name) || model.IsDimInputTensor(name) ) { // // not yet supported @@ -140,7 +141,7 @@ public: std::string labels = fInputLabels[i]; for (size_t j = 0; j < shape.size(); j++) { if (j >= labels.length()) { - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape)); + throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape)); } labelsMap[labels[j]] = shape[j]; } @@ -149,7 +150,7 @@ public: // get output shape from label maps for (char l : fOutputLabels) { if (labelsMap.count(l) == 0) - throw std::runtime_error(std::string("TMVA SOFIE Einsum Op : output label ") + std::string(&l) + " is not present in inputs"); + throw std::runtime_error(std::string("SOFIE Einsum Op : output label ") + std::string(1, l) + " is not present in inputs"); 
fShapeY.push_back(labelsMap[l]); } // we need to get the labels we are going to sum @@ -209,7 +210,7 @@ public: opName = "op_" + opName; if (fShapeY.size() != fOutputLabels.length()) { - throw std::runtime_error("TMVA SOFIE Einsum Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Einsum Op called to Generate without being initialized first"); } // function to write compute expression index from strides diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx b/core/inc/SOFIE/ROperator_Elu.hxx similarity index 81% rename from src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx rename to core/inc/SOFIE/ROperator_Elu.hxx index 34e18a6..6588b61 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx +++ b/core/inc/SOFIE/ROperator_Elu.hxx @@ -19,7 +19,7 @@ private: float falpha= 1.0; //default value std::string fNX; std::string fNY; - std::vector fShape; + std::vector fShape; std::string fType; public: @@ -27,6 +27,7 @@ public: ROperator_Elu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::ELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; @@ -34,7 +35,7 @@ public: fType = "float"; } else{ - throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Elu operator"); + throw std::runtime_error("SOFIE Encountered unsupported type parsing a Elu operator"); } } @@ -49,9 +50,9 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Elu Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Elu Op Input Tensor is not found in model"); } - fShape = model.GetTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); } @@ -59,10 +60,10 @@ public: std::string 
Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Elu called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Elu called to Generate without being initialized first"); } std::stringstream out; - size_t length = ConvertShapeToLength(fShape); + std::string length = ConvertDimShapeToLength(fShape); out << SP << "float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx b/core/inc/SOFIE/ROperator_Erf.hxx similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx rename to core/inc/SOFIE/ROperator_Erf.hxx index 72f8cc5..6a51864 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx +++ b/core/inc/SOFIE/ROperator_Erf.hxx @@ -17,7 +17,7 @@ private: std::string fNX; std::string fNY; - std::vector fShape; + std::vector fShape; public: ROperator_Erf(){} @@ -41,7 +41,7 @@ public: if (model.CheckIfTensorAlreadyExist(fNX) == false){ throw std::runtime_error("SOFIE SOFIE Erf Op Input Tensor is not found in model"); } - fShape = model.GetTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); } @@ -52,7 +52,7 @@ public: throw std::runtime_error("SOFIE SOFIE Erf operator called to Generate without being initialized first"); } std::stringstream out; - size_t length = ConvertShapeToLength(fShape); + std::string length = ConvertDimShapeToLength(fShape); out << "\n//------ ERF\n"; out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; out << SP << SP << "tensor_" << fNY << "[id] = std::erf(tensor_" << fNX << "[id]);\n"; diff --git a/core/inc/SOFIE/ROperator_Expand.hxx b/core/inc/SOFIE/ROperator_Expand.hxx new file mode 100644 index 0000000..95955ed --- /dev/null +++ b/core/inc/SOFIE/ROperator_Expand.hxx @@ -0,0 +1,347 @@ +#ifndef SOFIE_ROperator_Expand 
+#define SOFIE_ROperator_Expand + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Expand final : public ROperator{ +private: + + std::vector fShapeX; + std::vector fShape; + std::vector fShapeY; + std::vector fShapeDim; + + std::string fNX; + std::string fNShape; + std::string fNY; + std::string fType; + + bool fInitialized = false; + bool fInitializedShape = false; + bool fInitBroadcast = false; + +public: + ROperator_Expand(){} + ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY)){ + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (model.IsInitializedTensor(fNShape)) { + fInitializedShape = true; + int64_t *shapeData = + static_cast(model.GetInitializedTensorData(fNShape).get()); + fShape = model.GetTensorShape(fNShape); + if (fShape.size() != 1) { + throw std::runtime_error("SOFIE - Expand operator shape must be a 1d tensor."); + } + size_t N = fShape[0]; + // what do we do if shapeData contains negative values? 
+ for (size_t i = 0; i < N; i++) { + if ( shapeData[i] < 0) + throw std::runtime_error("SOFIE - Expand: invalid shape value " + std::to_string(shapeData[i])); + } + std::vector shape(shapeData, shapeData + N); + fShapeDim = ConvertShapeToDim(shape); + } else if (model.IsShapeTensor(fNShape)) { + // case input shape is a shape tensor + fShapeDim = model.GetShapeTensorValues(fNShape); + fInitializedShape = true; + } else { + // assume shape of input shape is known (size is 1) + auto shapeOfInputShape = model.GetTensorShape(fNShape); + fShapeDim.resize(shapeOfInputShape[0]); + for (size_t i = 0; i < fShapeDim.size(); i++) { + fShapeDim[i] = Dim{std::string("v_") + fNShape + "_" + std::to_string(i)}; + model.AddShapeParam(fShapeDim[i].param); + } + } + // Y is the common shape of fShapeX and shape + auto ret = SOFIE::UTILITY::MultidirectionalBroadcastShape(fShapeX, fShapeDim); + fShapeY = ret.second; + fInitialized = model.IsInitializedTensor(fNX) && fInitializedShape; + std::vector shapeX; + std::vector shapeY; + // case shape tensor and input shape are known + if (!model.IsDynamicTensor(fNX) && !model.IsDimInputTensor(fNX) && fInitializedShape) { + shapeX = ConvertShapeToInt(fShapeX); + shapeY = ConvertShapeToInt(fShapeY); + if (!UTILITY::AreSameShape(shapeX, shapeY)) + fInitBroadcast = true; + } + if (fInitialized) { + // cannot have Dim initialized tensors + assert(!shapeX.empty() && !shapeY.empty()); + // Broadcast X to the common shape shapeY + // If X is an initialized tensor (constant) + auto data = model.GetInitializedTensorData(fNX); + if (fInitBroadcast) { + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), shapeX, shapeY), + std::default_delete()); + // Update the data and the shape of X + model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData); + fShapeX = fShapeY; + // need to set as a not writable tensor + model.SetNotWritableInitializedTensor(fNX); + data = broadcastedData; + 
} + if (fInitBroadcast || model.IsConstantTensor(fNX)) { + fIsOutputConstant = true; // constant output in this case + model.AddConstantTensor(fNY, model.GetTensorType(fNX), shapeY, data); + fOutputTensorNames.pop_back(); + } else { + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), shapeY); + } + } else { + // // case input is not initialized + // if (shapeX.empty() && shapeDim.empty()) { + + // } + // if (fInitializedShape) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + } + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) { + std::cout << "Expand - input " << fNX << " shape " << ConvertDimShapeToString(fShapeX) << " --> " << fNY << " shape " + << ConvertDimShapeToString(fShapeY) << (fIsOutputConstant ? ConvertValuesToString(model.GetTensorData(fNY)) + " (constant)" : "") << std::endl; + } + + if (fInitializedShape && model.IsInitializedTensor(fNShape)) { + // Shape values are fully consumed into fShapeY/fShapeDim at generation time — + // no device buffer needed for fNShape for Heterogeneous inference + model.SetNotWritableInitializedTensor(fNShape); + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + if (!fIsOutputConstant && fInitialized && !fInitBroadcast) { + // shapeX and shapeY are the same in this case + auto length = ConvertDimShapeToLength(fShapeY); + out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n"; + out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; + } + return out.str(); + } + + std::string Generate(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE Expand Op called to Generate without being initialized first"); + } + std::stringstream out; + out << SP << "\n//------ Expand " << opName << " --> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to declare shape 
parameters for non initialized shapes + if (!fInitializedShape) { + for (size_t i = 0; i < fShapeDim.size(); i++) { + out << SP << "size_t " << fShapeDim[i] << " = " << "tensor_" << fNShape << "[" << i << "];\n"; + } + } + // No need to broadcast A if it's an initialized tensor or shapes are the same + if (!fInitialized && fShapeX != fShapeY) { + out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; + out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNX << ", " << ConvertDimShapeToString(fShapeX) << ", " << ConvertDimShapeToString(fShapeY) + << ", tensor_"<& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + needsBroadcast = std::any_of(fShapeX.begin(), fShapeX.end(), + [&](const Dim& d) { + size_t i = &d - fShapeX.data(); + return fShapeX[i].dim != fShapeY[i].dim; + }); + } + if (!needsBroadcast) return ""; // same static shape — just a memcpy + + const std::size_t D = fShapeY.size(); + + // Left-pad fShapeX with dim=1 entries to match rank of fShapeY + std::vector shapeX_padded(D, 1); + size_t offset = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) + shapeX_padded[offset + i] = fShapeX[i].dim; + + std::vector shapeY_int(D); + for (size_t i = 0; i < D; ++i) + shapeY_int[i] = fShapeY[i].dim; + + auto stridesX = UTILITY::ComputeStrideFromShape(shapeX_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(shapeY_int); + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + + std::string kname = "ExpandKernel_" + opName; + + std::string op; + op = "\n//------ EXPAND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP 
+ SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index using compile-time output strides + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(shapeY_int[d]) + "u;\n"; + } + op += "\n"; + + // Input index: broadcast dims (shapeX_padded[d]==1) contribute 0 — + // compiler eliminates zero terms entirely, no runtime branch + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX_padded[d] == 1) { + op += SP + SP + SP + SP + SP + "0u"; + } else { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX[d]) + "u"; + } + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; // end grid-stride loop + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + if (fInitialized) return ""; + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + if (!needsBroadcast) return ""; + + opName = "op_" + opName; + std::string kname = "ExpandKernel_" + opName; + return SP + kname + " expandKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Expand called to Generate without being initialized first"); + + std::stringstream out; + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + + if (fInitialized && !fInitBroadcast) { + // GenerateInitCode already handled the copy — nothing to do at inference time + return ""; + } + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + bool staticShapes = isStatic(fShapeX) && isStatic(fShapeY); + + // Check if broadcast is actually needed for static shapes + bool needsBroadcast = !staticShapes; // dynamic always needs runtime broadcast + if (staticShapes) { + needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + 
if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + } + + if (!needsBroadcast) { + // Same static shape — device-to-device copy + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY + << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + return out.str(); + } + + if (!staticShapes) { + // Dynamic shapes — not yet supported on GPU, throw a clear error + throw std::runtime_error( + "SOFIE Expand GPU: dynamic shapes are not yet supported for GPU inference. " + "Tensor " + fNX + " has a dynamic shape."); + } + + // Static broadcast — launch the expand kernel + std::vector shapeY_int(fShapeY.size()); + for (size_t i = 0; i < fShapeY.size(); ++i) + shapeY_int[i] = fShapeY[i].dim; + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + std::string kname = "expandKernel_" + opName; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n"; + + return out.str(); +} +}; +}//SOFIE + +#endif //SOFIE_ROperator_Expand diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx b/core/inc/SOFIE/ROperator_EyeLike.hxx similarity index 89% rename from src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx rename to core/inc/SOFIE/ROperator_EyeLike.hxx index 8e94e1c..91103ef 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx +++ b/core/inc/SOFIE/ROperator_EyeLike.hxx @@ -40,11 +40,11 @@ public: void Initialize(RModel& model) override { if 
(model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); if (fShape.size() != 2) - throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not of rank 2"); + throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not of rank 2"); if(fdtype){ ETensorType extractedType = static_cast(fdtype); @@ -59,7 +59,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator EyeLike called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator EyeLike called to Generate without being initialized first"); } auto length = ConvertShapeToLength(fShape); auto stride = SOFIE::UTILITY::ComputeStrideFromShape(fShape); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx b/core/inc/SOFIE/ROperator_GRU.hxx similarity index 92% rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx rename to core/inc/SOFIE/ROperator_GRU.hxx index bb1a74e..037e016 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx +++ b/core/inc/SOFIE/ROperator_GRU.hxx @@ -11,7 +11,6 @@ #include #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -115,7 +114,7 @@ template class ROperator_GRU final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a GRU operator"); + "SOFIE Encountered unsupported type parsing a GRU operator"); } } @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/core/inc/SOFIE/ROperator_GRU.icc similarity index 93% rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc rename to core/inc/SOFIE/ROperator_GRU.icc index f3813c2..f24460c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/core/inc/SOFIE/ROperator_GRU.icc @@ -38,33 +38,33 @@ void ROperator_GRU::Initialize(RModel& model){ fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + " is not found in model."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + " is not found in model."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + " is not found in model."); + throw std::runtime_error("SOFIE GRU 
Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not found in model."); + throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 4) { - throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions."); + throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions."); } if (fShapeB.size() == 2) { // Broadcasting the bias @@ -99,25 +99,25 @@ void ROperator_GRU::Initialize(RModel& model){ } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + + throw std::runtime_error("SOFIE GRU Op input tensor " + fNInitial_h + " is not of 3 
dimensions."); } } @@ -141,7 +141,7 @@ void ROperator_GRU::Initialize(RModel& model){ activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } @@ -150,22 +150,22 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrDirection != "reverse" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid GRU direction fAttrDirection = " + + "SOFIE - Invalid GRU direction fAttrDirection = " + fAttrDirection); } if (3 * fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1] / 3)); } if (fAttrLayout > 1) { - throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = " + + throw std::runtime_error("SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } if (fAttrLinearBeforeReset > 1) { throw std::runtime_error( - "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrLinearBeforeReset) + "SOFIE - fAttrLinearBeforeReset = " + std::to_string(fAttrLinearBeforeReset) + " must be 0 or 1."); } if (fAttrActivations.empty()) { @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RModel, but this information is not available in the current scope. 
+ std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << 
fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << 
getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " 
<< OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/core/inc/SOFIE/ROperator_Gather.hxx b/core/inc/SOFIE/ROperator_Gather.hxx new file mode 100644 index 0000000..3c16f18 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Gather.hxx @@ -0,0 +1,400 @@ +#ifndef SOFIE_ROPERATOR_GATHER +#define SOFIE_ROPERATOR_GATHER + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include + + +namespace SOFIE{ + +class ROperator_Gather final : public ROperator +{ +private: + + int64_t fAttrAxis = 0; + + std::string fNX; + std::string fNIndices; + std::string fNY; + + std::vector fShapeX; + std::vector fShapeIndices; + std::vector fShapeY; + + std::vector fIndices; // indices vector in case they are known at initialization + + std::string fType; + +public: + ROperator_Gather(){} + ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY): + fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), 
fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNX, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; + return ret; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw std::runtime_error("SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetDimTensorShape(fNX); + if (model.Verbose()) + std::cout << "Gather - initial shape " << ConvertDimShapeToString(fShapeX) << " shape of indices " + << ConvertDimShapeToString(model.GetDimTensorShape(fNIndices)) << std::endl; + // fShapeIndices can be dynamic + fShapeIndices = model.GetDimTensorShape(fNIndices); + size_t q = fShapeIndices.size(); + // Axis in range [0, r) where r=rank(X) + size_t r = fShapeX.size(); + // Set the axis + if (fAttrAxis < 0) { + fAttrAxis = fAttrAxis + int64_t(r); + } + + + // case indices tensor is initialized + if (model.IsInitializedTensor(fNIndices)) { + // empty shape Indices is a scalar value for the indices + size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); + int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); + // update indices data in case of negative dim values + for (size_t i = 0; i < indicesLength; i++) { + // move this at generation time? 
+ if (!fShapeX[fAttrAxis].isParam) { + if (indicesData[i] < 0) { + indicesData[i] += fShapeX[fAttrAxis].dim; + } + } + } + // Save in a vector gather Indices of size q + fIndices = std::vector(indicesData, indicesData + indicesLength); + } + // Output shape + if (model.Verbose()) + std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertDimShapeToString(fShapeIndices) << std::endl; + + if (fShapeY.empty()) { + fShapeY.resize(q + r - 1); + if (fAttrAxis > 0) { + // Copy shape of X[0, ..., axis-1) to Shape of Y[0, ..., axis-1) + std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); + } + // Set shape of Y[axis, ..., axis + q) + for (size_t i = 0; i < q; i++) { + fShapeY[fAttrAxis + i] = Dim{ fShapeIndices[i]}; + } + // Copy shape of X[axis + 1, ..., r) to shape of Y[axis + q, ... q + r - 1) + std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); + } + // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) + if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { + auto shapeX = ConvertShapeToInt(fShapeX); // we assume model is not dynamic + auto shapeY = ConvertShapeToInt(fShapeY); + if (model.GetTensorType(fNX) == ETensorType::INT64) { + auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); + // if q <=1 and r = 1 output length = 1 (it is a scalar) + std::vector outputData(1); //ConvertShapeToLength(shapeY)); + outputData[0] = inputData[fIndices[0]]; + model.AddConstantTensor(fNY, shapeY, outputData.data()); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertShapeToString(shapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(shapeY) + << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; + fIsOutputConstant = true; + } + } + // case input is a shape tensor (r is == 1 by definition) and indices are known + else if 
(model.IsShapeTensor(fNX) && q <=1 && fIndices.size() > 0) { + auto inputData = model.GetShapeTensorValues(fNX); + // if r == 1 and q<=1 then output length is 1 (is a scalar or tensor of size1) + std::vector outputData(1); + outputData[0] = inputData[fIndices[0]]; + if (outputData[0].isParam) { + fIsOutputConstant = true; + // shapeY can be scalar or vector of size1 + model.AddShapeTensor(fNY, outputData, fShapeY.size() == 0); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values " << ConvertDimShapeToString(outputData) << " (shape) " << std::endl; + } else { + int64_t value = static_cast(outputData[0].dim); + auto shapeY = ConvertShapeToInt(fShapeY); + model.AddConstantTensor(fNY, shapeY, &value); + fIsOutputConstant = true; + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values {" << value << "} (constant) " << std::endl; + } + } + if (!fIsOutputConstant) { + // Add output tensor + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) + std::cout << "Gather: input " << fNX << " " << ConvertDimShapeToString(fShapeX) << " indices " << fNIndices << ConvertDimShapeToString(fShapeIndices) + << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << std::endl; + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertDimShapeToString(fShapeY) << "\n"; + if (fIsOutputConstant) { + // no code to generate here for constant output. 
Tensor output is defined in Session constructor + out << "//--------------------(constant)----------\n"; + return out.str(); + } + // The shape of the output is q + r - 1 + size_t r = fShapeX.size(); + // Indices of shape q + size_t q = fShapeIndices.size(); + // Strides + auto stridesX = UTILITY::ComputeStrideFromShape(fShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + + // case fIndices is not known we need to correct for negative axis indices at run-time + if (fIndices.empty()) { + auto indicesLength = ConvertDimShapeToLength(fShapeIndices); + out << SP << "// correct in case of negative gather indices\n"; + out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; + out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; + out << SP << SP << SP << "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n"; + out << SP << "}\n"; + } + + // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] + // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) + // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) + // for j_0, j_1, ..., j_{axis-1} + + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + std::string index = "j_" + std::to_string(j); + for (size_t k = 0; k <= j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; + } + // for i_0, i_1, ..., i_{q - 1} + for (size_t i = 0; i < q; i++) { + std::string index = "i_" + std::to_string(i); + for (size_t k = 0; k <= i + fAttrAxis; k++) out << SP; + out << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; + } + // for j_axis, j_{axis + 1}, ..., j_{r - 1} + for (size_t j = fAttrAxis; j + 1 < r; j++) { + std::string index = "j_" + std::to_string(q+j); // annotate index using output axis + for (size_t k = 0; k <= q + j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; + } + + // add a scope for local variables in case above loop are not done + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << SP << "{ // scalar case \n"; + + // output index + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t y_index = "; + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + if (j > 0) out << " + "; + out << "j_" << j; + if (stridesY[j].dim != 1) out << " * " << stridesY[j]; + } + for (size_t i = 0; i < q; i++) { + if (fAttrAxis + i > 0) out << " + "; + out << "i_" << i; + if (stridesY[fAttrAxis + i].dim != 1) out << " * " << stridesY[fAttrAxis + i]; + } + for (size_t j = fAttrAxis; j + 1 < r; j++) { + if (j + q > 0) out << " + "; + out << "j_" << q+j; + if (stridesY[q+j].dim != 1) out << " * " << stridesY[q+j]; + } + // empty case + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << "0"; + out << ";\n"; + + // input Indices + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t i_index = "; + for (size_t i = 0; i < q; i++) { + if (i > 0) out << " + "; + out << "i_" << i; + if (stridesIndices[i].dim != 
1) out << " * " << stridesIndices[i]; + } + // empty case + if (q == 0) + out << "0"; + out << ";\n"; + + // K + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; + // Input + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t x_index = k"; + if (stridesX[fAttrAxis].dim != 1) out << " * " << stridesX[fAttrAxis]; + for (size_t j = 0; j < size_t(fAttrAxis); j++) { + out << " + "; + out << " j_" << j; + if (stridesX[j].dim != 1) out << " * " << stridesX[j]; + } + // for input corresponding stride is axis+1,.... r + // loop is on j from fAttrAxis, so consider stridesX[j+1] + for (size_t j = fAttrAxis; j+1 < r; j++) { + out << " + "; + out << " j_" << q+j; + if (stridesX[j+1].dim != 1) out << " * " << stridesX[j+1]; + } + out << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; + + // end loops j_k, j_{k + 1}, ..., j_{r - 2} + for (size_t j = q+r-1; j > 0; j--) { + for (size_t k = 0; k \n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + stridesY[d].GetVal() + "u) % " + + 
fShapeY[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Output dims [axis ... axis+q) correspond to the indices tensor dims [0 ... q) + // so i_index = sum over i in [0,q): out_{axis+i} * stridesIndices[i] + if (q == 0) { + op += SP + SP + SP + SP + "std::size_t const i_index = 0u;\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const i_index =\n"; + for (std::size_t i = 0; i < q; ++i) { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(fAttrAxis + i) + + " * " + stridesIndices[i].GetVal() + "u"; + op += (i + 1 < q) ? " +\n" : ";\n"; + } + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t k = indices[i_index];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += " + fShapeX[fAttrAxis].GetVal() + ";\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= static_cast(" + fShapeX[fAttrAxis].GetVal() + ")) " + + "k = static_cast(" + fShapeX[fAttrAxis].GetVal() + ") - 1;\n\n"; + + // x_index = k * stridesX[axis] + // + sum over j in [0, axis): out_j * stridesX[j] + // + sum over j in [axis+1, r): out_{j-1+q} * stridesX[j] + // (the dims after axis in Y are shifted by q-1 relative to X) + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + op += SP + SP + SP + SP + SP + "static_cast(k) * " + stridesX[fAttrAxis].GetVal() + "u"; + for (std::size_t j = 0; j < static_cast(fAttrAxis); ++j) { + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(j) + " * " + stridesX[j].GetVal() + "u"; + } + for (std::size_t j = fAttrAxis + 1; j < r; ++j) { + // in Y, the coord for X's dim j lives at output dim q + j - 1 + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(q + j - 1) + " * " + stridesX[j].GetVal() + "u"; + } + op += ";\n\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if 
(fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "GatherKernel_" + opName; + return SP + kname + " gatherKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE Gather Op called to Generate without being initialized first"); + + auto totalElements = ConvertDimShapeToLength(fShapeY); + std::string kname = "gatherKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHER_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); +} + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_RELU diff --git a/core/inc/SOFIE/ROperator_GatherND.hxx b/core/inc/SOFIE/ROperator_GatherND.hxx new file mode 100644 index 0000000..ffcdab8 --- /dev/null +++ b/core/inc/SOFIE/ROperator_GatherND.hxx @@ -0,0 +1,297 @@ +#ifndef SOFIE_ROPERATOR_GATHERND +#define SOFIE_ROPERATOR_GATHERND + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + +namespace SOFIE { + +class ROperator_GatherND final : public ROperator +{ +private: + + int64_t fBatchDims = 0; + + std::string fNData; + std::string fNIndices; + std::string fNY; + + std::vector 
fShapeData; + std::vector fShapeIndices; + std::vector fShapeY; + + std::string fType; + +public: + ROperator_GatherND() {} + ROperator_GatherND(int64_t batchDims, + std::string nameData, + std::string nameIndices, + std::string nameY) + : fBatchDims(batchDims), + fNData(UTILITY::Clean_name(nameData)), + fNIndices(UTILITY::Clean_name(nameIndices)), + fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNData, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return { input[0] }; + } + + std::vector> ShapeInference(std::vector> input) override { + return { input[0] }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNData)) + throw std::runtime_error("SOFIE GatherND: data tensor " + fNData + " not found in model"); + if (!model.CheckIfTensorAlreadyExist(fNIndices)) + throw std::runtime_error("SOFIE GatherND: indices tensor " + fNIndices + " not found in model"); + + fShapeData = model.GetTensorShape(fNData); + fShapeIndices = model.GetTensorShape(fNIndices); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + if (r < 1) + throw std::runtime_error("SOFIE GatherND: data rank must be >= 1"); + if (q < 1) + throw std::runtime_error("SOFIE GatherND: indices rank must be >= 1"); + if (b >= std::min(q, r)) + throw std::runtime_error("SOFIE GatherND: batch_dims must be < min(q, r)"); + if (last_idx_dim > r - b) + throw std::runtime_error("SOFIE GatherND: indices_shape[-1] must be <= r - batch_dims"); + + for (size_t i = 0; i < b; ++i) { + if (fShapeData[i] != fShapeIndices[i]) + throw std::runtime_error("SOFIE GatherND: first batch_dims dimensions of data and indices must match"); + } + + // Output shape: batch_dims + indices[0..q-2] + data[b + last_idx_dim .. 
r-1] + // rank = b + (q - b - 1) + (r - b - last_idx_dim) + // = q + r - last_idx_dim - 1 - b + fShapeY.clear(); + for (size_t i = 0; i < b; ++i) + fShapeY.push_back(fShapeData[i]); + for (size_t i = b; i + 1 < q; ++i) + fShapeY.push_back(fShapeIndices[i]); + for (size_t i = b + last_idx_dim; i < r; ++i) + fShapeY.push_back(fShapeData[i]); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNData), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNData)); + + if (model.Verbose()) + std::cout << "GatherND: data " << ConvertShapeToString(fShapeData) + << " indices " << ConvertShapeToString(fShapeIndices) + << " batch_dims=" << fBatchDims + << " -> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::stringstream out; + out << SP << "//--------- GatherND operator " << opName << "\n"; + + out << SP << "for (size_t out_idx = 0; out_idx < " << totalOutput << "; out_idx++) {\n"; + + out << SP << SP << "size_t rem = out_idx;\n"; + size_t Dy = fShapeY.size(); + for (size_t d = 0; d < Dy; ++d) { + out << SP << SP << "size_t oy_" << d << " = rem / " << stridesY[d] << ";\n"; + out << SP << SP << "rem %= " << stridesY[d] << ";\n"; + } + + out << SP << SP << "size_t idx_base = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + for (size_t i = b; i + 1 < q; ++i) + 
out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + + out << SP << SP << "size_t data_idx = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "data_idx += oy_" << i << " * " << stridesData[i] << ";\n"; + + out << SP << SP << "for (size_t k = 0; k < " << last_idx_dim << "; k++) {\n"; + out << SP << SP << SP << "int64_t idx_val = tensor_" << fNIndices + << "[idx_base + k * " << stridesIndices[q - 1] << "];\n"; + out << SP << SP << SP << "if (idx_val < 0) idx_val += " << "static_cast(tensor_" + << fNData << "_shape[" << b << " + k]);\n"; + out << SP << SP << SP << "data_idx += static_cast(idx_val) * " << "data_stride_b_plus_k_" << opName << "[k];\n"; + out << SP << SP << "}\n"; + + // Accumulate trailing data dims from output coords + // Y dims [b + (q-b-1) .. ] correspond to data dims [b + last_idx_dim .. r-1] + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + out << SP << SP << "data_idx += oy_" << oy_dim << " * " << stridesData[i] << ";\n"; + } + + out << SP << SP << "tensor_" << fNY << "[out_idx] = tensor_" << fNData << "[data_idx];\n"; + out << SP << "}\n"; + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t Dy = fShapeY.size(); + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::string kname = "GatherNDKernel_" + opName; + + std::string op; + op = 
"\n//------ GATHERND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ data,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (size_t d = 0; d < Dy; ++d) { + op += SP + SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const idx_base =\n"; + // batch dims: oy_0..oy_{b-1} * stridesIndices[0..b-1] + // outer idx dims: oy_b..oy_{b+(q-b-2)} * stridesIndices[b..q-2] + bool first = true; + for (size_t i = 0; i < q - 1; ++i) { + op += SP + SP + SP + SP + SP + + (first ? "" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesIndices[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; // q==1: scalar index tuple + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t data_idx =\n"; + first = true; + for (size_t i = 0; i < b; ++i) { + op += SP + SP + SP + SP + SP + + (first ? 
"" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesData[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "// Read " + std::to_string(last_idx_dim) + "-element index tuple\n"; + for (size_t k = 0; k < last_idx_dim; ++k) { + size_t idx_offset = k; + size_t data_axis = b + k; + op += SP + SP + SP + SP + "{\n"; + op += SP + SP + SP + SP + SP + + "int64_t idx_val = indices[idx_base + " + + std::to_string(idx_offset) + "u];\n"; + op += SP + SP + SP + SP + SP + + "if (idx_val < 0) idx_val += " + + std::to_string(fShapeData[data_axis]) + ";\n"; + op += SP + SP + SP + SP + SP + + "data_idx += static_cast(idx_val) * " + + std::to_string(stridesData[data_axis]) + "u;\n"; + op += SP + SP + SP + SP + "}\n"; + } + op += "\n"; + + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + op += SP + SP + SP + SP + + "data_idx += oy_" + std::to_string(oy_dim) + + " * " + std::to_string(stridesData[i]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = data[data_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "GatherNDKernel_" + opName; + return SP + kname + " gatherNDKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "gatherNDKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHERND_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << 
opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } +}; + +} // SOFIE + +#endif // SOFIE_ROPERATOR_GATHERND diff --git a/core/inc/SOFIE/ROperator_Gemm.hxx b/core/inc/SOFIE/ROperator_Gemm.hxx new file mode 100644 index 0000000..eecb33b --- /dev/null +++ b/core/inc/SOFIE/ROperator_Gemm.hxx @@ -0,0 +1,860 @@ +#ifndef SOFIE_ROPERATOR_GEMM +#define SOFIE_ROPERATOR_GEMM + + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + + +namespace SOFIE{ + + + template + class ROperator_Gemm final : public ROperator + { + + private: + bool fIsDynamic = false; + bool fBroadcastBias = false; + bool fCheckBiasShapeAtRuntime = false; // flag to identify the need to do a run time check of bias shape compatibility in case of dynamic shapes and uni-directional broadcasting + + float fAttrAlpha = 1.0; + float fAttrBeta = 1.0; + int_t fAttrTransA = 0; + int_t fAttrTransB = 0; + + std::string fNA; + std::string fNB; + std::string fNC = ""; + std::string fNY; + std::string fType; + EActivationType fActivation; + float fLeakyReluAlpha = 0.01f; // used when fActivation == LEAKYRELU + std::vector fShapeA; + std::vector fShapeB; + std::vector fShapeC; + std::vector fDimShapeC; + std::vector fShapeY; + RModel * fModel = nullptr; + + public: + + ROperator_Gemm(){} + ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, 
std::string nameA, std::string nameB, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): + fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), + fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fActivation = activation; + fType = "float"; + static_assert(std::is_same_v, + "TMVA::SOFIE - Unsupported type parsing a Gemm operator"); + fInputTensorNames = { fNA, fNB }; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::GEMM; + } + + ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): + fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), + fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) + { + fActivation = activation; + fType = "float"; + + fInputTensorNames = {fNA, fNB, fNC}; + fOutputTensorNames = { fNY }; + fKind = OperatorKind::GEMM; + } + + std::vector TypeInference(std::vector input) override { + ETensorType out = input[0]; + return {out}; + } + + template + std::vector DoShapeInference(const std::vector> & input){ + if (input.size() > 3) throw std::runtime_error("SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); + // accept tensor with input dimensions > 2 + // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) + for (auto& i: input){ + if (i.size() < 2){ + throw std::runtime_error("SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions"); + } + } + + // when there are 3 inputs shape of Y is the one of C + if (input.size() == 3){ + //shape of C is shape of Y + return input[2]; + } + // ioffset cannot be less than 2 + int ioffset = input[0].size()-2; // in case of tensors with dim > 2 + + std::vector s_a(input[0].begin() 
+ ioffset, input[0].begin() + ioffset + 2); + std::vector s_b(input[1].begin() + ioffset, input[1].begin() + ioffset + 2); + // reverse in case of transpose + if (fAttrTransA){ + std::reverse(s_a.begin(), s_a.end()); + } + if (fAttrTransB){ + std::reverse(s_b.begin(), s_b.end()); + } + std::vector s_y; + s_y.reserve(input[0].size()); + if (input[0].size() > 2 && input[1].size() == input[0].size()) { + // in case of dim > 2 first dimensions are equal to the input ones not + // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape + for (size_t i = 0; i < input[0].size()-2; i++) { + Dim valueA = input[0][i]; + Dim valueB = input[1][i]; + if (valueA.GetVal() != valueB.GetVal()) { + if (valueB.GetVal() == "1") + s_y.push_back(input[0][i]); + else if (valueA.GetVal() == "1") + s_y.push_back(input[1][i]); + else if (!valueA.isParam && !valueB.isParam) + throw std::runtime_error("SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + else if (valueA.isParam && valueB.isParam){ + // check which parameter is first in RModel list + auto & dimNames = fModel->GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), valueA.param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), valueB.param); + if (p1 < p2) s_y.push_back(input[0][i]); + else s_y.push_back(input[1][i]); + } + else if (!valueA.isParam) + s_y.push_back(input[0][i]); + else if (!valueB.isParam) + s_y.push_back(input[1][i]); + else + throw std::runtime_error("SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + } + else + s_y.push_back(input[0][i]); + } + } + + s_y.push_back(s_a[0]); + s_y.push_back(s_b[1]); + return s_y; + } + + std::vector> ShapeInference(std::vector> input) override { + std::vector> ret; + ret.push_back(DoShapeInference(input)); + return ret; + } + std::vector DynamicShapeInference(const std::vector> & input){ 
+ return DoShapeInference(input); + } + + + + void Initialize(RModel& model) override { + //TODO: propagate A or B as specified by ONNX standard + fModel = &model; + + if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); + } + if (fNC != ""){ + if (model.CheckIfTensorAlreadyExist(fNC) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNC + " is not found in model"); + } + } + if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { + fShapeA = model.GetDynamicTensorShape(fNA); + fIsDynamic = true; + } else { + auto shapeA_int = model.GetTensorShape(fNA); + fShapeA = ConvertShapeToDim(shapeA_int); + } + // case A is of dim1 we prepend a 1 but we need to remove later + bool prependOne = false; + if (fShapeA.size() == 1) { + fShapeA.insert(fShapeA.begin(), Dim(1)); + prependOne = true; + } + + if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { + fShapeB = model.GetDynamicTensorShape(fNB); + fIsDynamic = true; + } + else { + auto shapeB_int = model.GetTensorShape(fNB); + fShapeB = ConvertShapeToDim(shapeB_int); + } + // case B is dim1 we append a 1 but we need to remove later + bool appendOne = false; + if (fShapeB.size() == 1) { + fShapeB.insert(fShapeB.end(), Dim(1)); + appendOne = true; + } + // assume if not shape is 2 that extra values are 1. 
+ // implement also MatMul case where we stack matrices (see numpy.matmul) + if (fShapeA.size() != fShapeB.size()) { + // if different dimensions we prepend 1 values + if (fShapeA.size() < fShapeB.size()) { + fShapeA.insert(fShapeA.begin(), fShapeB.size()-fShapeA.size(), Dim(1)); + } else if (fShapeB.size() < fShapeA.size()) { + fShapeB.insert(fShapeB.begin(), fShapeA.size()-fShapeB.size(), Dim(1)); + } + } + + fShapeY = DynamicShapeInference({fShapeA, fShapeB}); + std::vector shapeY = ConvertShapeToInt(fShapeY); + + // bias is normally not dynamic (not support it for time being) + if (fNC != ""){ + if (model.IsDynamicTensor(fNC)) + fDimShapeC = model.GetDynamicTensorShape(fNC); + else { + fShapeC = model.GetTensorShape(fNC); + fDimShapeC = ConvertShapeToDim(fShapeC); + } + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + // consider broadcasting also if they have different length + broadcast_needed = (fShapeC != shapeY); + + + if (broadcast_needed) { + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, fDimShapeC); + // return flag must not have bit equal to 2 since this is a unidirectional broadcast of C->Y + // + if ((r.first & 2) == 2) { + throw std::runtime_error("SOFIE Gemm Op - bias tensor of shape " + ConvertDimShapeToString(fDimShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } else if (r.first == 4) { + // we need to do a run time check of bias shape if it is compatible + fCheckBiasShapeAtRuntime = true; + } + fShapeC = ConvertShapeToInt(fDimShapeC); + } + } + + // remove appended or prepended value of 1 in Y + if (prependOne) { + if (fIsDynamic) + fShapeY.erase(fShapeY.begin()); + else + shapeY.erase(shapeY.begin()); + } + if (appendOne) { + if (fIsDynamic) + fShapeY.erase(fShapeY.end()-1); + else 
+ shapeY.erase(shapeY.end()-1); + } + + if (!fIsDynamic) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); + else + model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); + + if (model.Verbose()){ + std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; + if (fIsDynamic) + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; + else + std::cout << ConvertShapeToString(shapeY) << std::endl; + } + + model.AddNeededStdLib("algorithm"); + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + + // if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + // throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first"); + // } + std::stringstream out; + out << "\n//--------- Gemm " << opName << " " << ConvertDimShapeToString(fShapeA) << " * " << ConvertDimShapeToString(fShapeB) + << " -> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + int64_t dimC = fDimShapeC.size(); + if (dimA != dimB || dimA != dimY || (fBroadcastBias && dimC != dimY)) { + std::cout << " shape A " << ConvertDimShapeToString(fShapeA) + << " shape B " << ConvertDimShapeToString(fShapeB) + << " shape C " << ConvertDimShapeToString(fDimShapeC) + << " shape Y " << ConvertDimShapeToString(fShapeY) << std::endl; + throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (transposeA) is m*k else k*m + // size of B n*k + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sExtraY; + for (int64_t i = 0; i < dimY-2; i++) { + sExtraY.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) + std::string lengthExtra_C; + std::vector sExtraC; + std::vector sC; + bool haveExtraC = false; + if (dimC > 2) { + sC = {fDimShapeC[dimC-2], fDimShapeC[dimC-1]}; + for (int64_t i = 0; i < dimC-2; i++) { + sExtraC.push_back(fDimShapeC[i]); + } + lengthExtra_C = ConvertDimShapeToLength(sExtraC); + if (lengthExtra_C != "1") haveExtraC = true; + } else if (dimC > 0) { + for (int64_t i = 0; i < dimC; i++) { + sC.push_back(fDimShapeC[i]); + } + } + + // case bias is present + if (!fNC.empty()){ + // when the 2 last dims of bias and Y are not compatible we need to perform a run time broadcast + if (sC != sY) fBroadcastBias = true; + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if ((std::stoi(lengthGemm) != std::stoi(ConvertDimShapeToLength(sC))) || + ( haveExtraC && std::stoi(lengthExtra_Y) != std::stoi(lengthExtra_C))) + throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor " + fNC + " has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " == " << ConvertDimShapeToLength(sC) << ");\n"; + if (haveExtraC) out << SP << "assert(" << lengthExtra_Y << " == " << lengthExtra_C << ");\n"; + } + } + } else { + fBroadcastBias = false; + //in this case fAttrBeta needs to be equal to zero 
otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + // some model don't have bias but Beta is not zero - force it to zero + fAttrBeta = 0; + std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n"; + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], fShapeA[dimA-1]}; + std::vector mB = {fShapeB[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // if A ( b, m, k) and B (b, k, n) these are the strides of A and B ( m*k for A and n*k for B ) + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); + bool extraC = (doStackMul && haveExtraC && !fBroadcastBias); + // run time check for bias broadcasting + std::string biasShapeType = opName + "_biasShapeType"; + if (fBroadcastBias && fCheckBiasShapeAtRuntime) { + // create a flag according to bias shape: + // = 1 for (1,Y2) + // = 2 for (Y1,1) + // = 3 for a scalar + out << SP << "int " << biasShapeType << " = 0;\n"; + // case vector of columns + if (sC[0].GetVal() != "1" && sC[1].GetVal() != sY[1].GetVal()) + out << SP << "if (" << sC[0] << " == 1 && " << sC[1] << " == " << sY[1] << ")\n"; + else if (sC[0].GetVal() == "1") + out << SP << "if (" << sC[1] << " == " << sY[1] << ")\n"; + else if 
(sC[1].GetVal() == sY[1].GetVal()) + out << SP << "if (" << sC[0] << " == 1)\n"; + + out << SP << SP << biasShapeType << " = 1;\n"; + + // case vector of rows + if (sC[1].GetVal() != "1" && sC[0].GetVal() != sY[0].GetVal()) + out << SP << "else if (" << sC[1] << " == 1 && " << sC[0] << " == " << sY[0] << ")\n"; + else if (sC[1].GetVal() == "1") + out << SP << "else if (" << sC[0] << " == " << sY[0] << ")\n"; + else if (sC[0].GetVal() == sY[0].GetVal()) + out << SP << "else if (" << sC[1] << " == 1)\n"; + + out << SP << SP << biasShapeType << " = 2;\n"; + + // case scalar + if (sC[0].GetVal() != "1" && sC[1].GetVal() != "1") + out << SP << "else if (" << sC[0] << " == 1 && " << sC[1] << " == 1 )\n"; + else if (sC[0].GetVal() == "1") + out << SP << "else if (" << sC[1] << " == 1)\n"; + else if (sC[1].GetVal() == "1") + out << SP << "else if (" << sC[0] << " == 1)\n"; + out << SP << SP << biasShapeType << " = 3;\n"; + out << SP << "else\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE Gemm Op - bias tensor " + << ConvertDimShapeToString(fDimShapeC) << " cannot be broadcasted to " + << ConvertDimShapeToString(fShapeY) << "\");\n"; + } + auto SP2 = SP; + if (doStackMul) { + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + if (extraC) + out << SP << "size_t " << opName << "_C_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; + SP2 += SP; + } + // do the bias broadcasting at run time by + // initializing output Y vector with bias values + if (fBroadcastBias) { + + fAttrBeta = 1.; + + // loop on first output dimension + out << SP2 << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP2 << SP << "size_t y_index = "; + if (doStackMul) // add offset in case of stack multiplications (not sure if bias is present in these 
cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + std::string prefix = SP2 + SP + "SOFIE::"; + std::string target = "tensor_" + fNY; + if (sC.size() != 2) { + throw std::runtime_error("SOFIE Gemm Op - invalid rank for bias tensor " + ConvertDimShapeToString(fDimShapeC) + ConvertDimShapeToString(sC)); + } if (sC[0].GetVal() == "1" && sC[1].GetVal() == sY[1].GetVal()) { + out << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n"; + } else if (sC[1].GetVal() == "1" && sC[0].GetVal() == sY[0].GetVal()) { + out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n"; + } else if (sC[0].GetVal() == "1" && sC[1].GetVal() == "1") { + // scalar case + out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n"; + } else if (fCheckBiasShapeAtRuntime) { + // in the generic dynamic case we check at run time that bias is compatible + // we check that bias[0] = 1 or equal to SY[0] and that bias[1] = 1 or equal to SY[1] + // tbd: this run-time check coul;d be moved outside the loop for better run time efficiency + out << SP2 << SP << "if (" << biasShapeType << " == 1)\n"; // case vector of columns + out << SP << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n"; + out << SP2 << SP << "else if (" << biasShapeType << " == 2)\n"; // case vector of rows + out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n"; + out << SP2 << SP << "else \n"; // scalar case + out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n"; + } else { + throw std::runtime_error("SOFIE Gemm Op - invalid shape for bias tensor " + ConvertDimShapeToString(fDimShapeC)); + } + + out << SP2 << "}\n"; + } + + if (fType == "float"){ + + out << SP2 << "SOFIE::Gemm_Call(" << "tensor_" << fNY; + if 
(doStackMul) out << " + " << opName << "_y_offset"; + out << ", " + << (fAttrTransB ? "true, " : "false, ") + << (fAttrTransA ? "true, " : "false, ") + << n << ", " << m << ", " << k << ", "; + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed - I need to add bias as an extra tensor in Gemm call + if (!fNC.empty() && !fBroadcastBias) { + out << "tensor_" << fNC; + if (extraC) { + out << " + " << opName << "_C_offset"; + } + } else { + out << "nullptr"; + } + out << ");\n"; + + } + + if (doStackMul) { + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + if (extraC) + // increment_C is lengthGEmm + out << SP << SP << opName << "_C_offset += " << lengthGemm << ";\n"; + out << SP << "}\n"; // end of loop on the stacked multiplication + } + + // fuse activation with GEMM output (in-place on fNY) + if (fActivation == EActivationType::RELU) { + out << SP << "//--- applying RELU to output\n"; + std::string tnsr = "tensor_" + fNY; + std::string reluSize = ConvertDimShapeToLength(fShapeY); + out << SP << "SOFIE::Relu(" << tnsr << ", " << tnsr << ", " << reluSize << ");\n"; + } else if (fActivation == EActivationType::LEAKYRELU) { + out << SP << "//--- applying LEAKYRELU to output (in-place)\n"; + std::string tnsr = "tensor_" + fNY; + std::string reluSize = ConvertDimShapeToLength(fShapeY); + out << SP << "{\n"; + out << SP << SP << "constexpr float lrelu_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fLeakyReluAlpha << 
"f;\n"; + out << SP << SP << "for (size_t _i = 0; _i < " << reluSize << "; ++_i)\n"; + out << SP << SP << SP << tnsr << "[_i] = " << tnsr << "[_i] >= 0.f ? " << tnsr << "[_i] : lrelu_alpha * " << tnsr << "[_i];\n"; + out << SP << "}\n"; + } + + return out.str(); + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fDimShapeC.empty())) { + throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first"); + } + std::stringstream out; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; + // Note: alpaka::wait(queue) intentionally removed here. + // Operations are enqueued asynchronously on the Alpaka queue's CUDA + // stream. Synchronisation only happens once per inference at the + // alpaka::wait(queue) call in _infer_impl's tail and at the + // cudaDeviceSynchronize in the benchmark harness. Adding a wait + // before every GEMM stalls the CPU<->GPU pipeline and is the primary + // cause of SOFIE being slower than ONNXRuntime. + out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; + out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sA; + for (int64_t i = 0; i < dimY-2; i++) { + sA.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + + out << SP << "int " << opName << "_m = " << m << ";\n"; + out << SP << "int " << opName << "_n = " << n << ";\n"; + out << SP << "int " << opName << "_k = " << k << ";\n"; + out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; + + // case bias is present + if (!fNC.empty()){ + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have same size as Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + ConvertDimShapeToString(fDimShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should equal output size) + out << SP << "assert(" << lengthGemm << " == " << ConvertDimShapeToLength(fDimShapeC) << ");\n"; + } + } + } else { + fBroadcastBias = false; + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + // some model don't have bias but Beta is not zero - force it to zero + fAttrBeta = 0; + std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n"; + } + } + + // include MatMul case where we stack the Gemm operations + // 
exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + + // Compute per-iteration strides for each buffer when stacking. + // m/n/k are std::string from Dim::GetVal(); stoi() is safe for static shapes. + size_t strideA = 0, strideB = 0, strideY = 0, strideC = 0; + // GPU optimisation flags (static shapes only): + // batchCollapseB — strideB==0: B is the shared weight, so replace the N-iteration + // loop with a single cuBLASLt GEMM whose batch dimension is + // folded into the "n_sofie" parameter (n_sofie = m_onnx * N). + // This turns 30 per-token GEMM launches into one kernel call. + // useSBatched — both strides non-zero AND no bias: use cublasSgemmStridedBatched + // so the GPU driver schedules all N GEMMs in one call. + // (Bias epilogue is not available on the strided-batched path, so + // this only applies to pure MatMul ops such as softmax(QK^T)·V.) + bool batchCollapseB = false; + bool useSBatched = false; + if (doStackMul && !fIsDynamic) { + strideA = static_cast(std::stoi(m)) * static_cast(std::stoi(k)); + // B is a shared weight (broadcast over the stacked/batch dimension) when all its + // leading dims (beyond the 2 matrix dims) are 1. In that case strideB must be 0 + // so every iteration reads from the same B slice — not i * n*k (which goes OOB). + bool bLeadingDimsAllOne = true; + for (int64_t i = 0; i < dimB - 2; i++) { + if (fShapeB[i].dim != 1) { bLeadingDimsAllOne = false; break; } + } + strideB = bLeadingDimsAllOne ? 0 + : static_cast(std::stoi(n)) * static_cast(std::stoi(k)); + strideY = static_cast(std::stoi(m)) * static_cast(std::stoi(n)); + strideC = !fNC.empty() ? static_cast(std::stoi(lengthGemm)) : 0; + + batchCollapseB = (strideB == 0); + useSBatched = !batchCollapseB && fNC.empty(); + } + + // Emit the loop only for the serial fallback path (dynamic shapes, or static + // shapes where both A and B vary per iteration AND a bias epilogue is needed). 
+ bool useSerialLoop = doStackMul && !batchCollapseB && !useSBatched; + if (useSerialLoop || (doStackMul && fIsDynamic)) { + out << SP << "size_t " << opName << "_yoffset = 0;\n"; + out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + } + + // Use getPtrNative() for all args so the raw-pointer overload is selected + // regardless of whether each buffer is a BufXxx or ViewPlainPtr. + // For the loop path, add per-iteration offsets; for the collapsed/batched + // paths, use base pointers (the whole contiguous tensor is processed at once). + std::string pA = "alpaka::getPtrNative(deviceBuf_" + fNA + ")"; + std::string pB = "alpaka::getPtrNative(deviceBuf_" + fNB + ")"; + std::string pY = "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (useSerialLoop && !fIsDynamic) { + pA += " + i * " + std::to_string(strideA); + if (strideB > 0) pB += " + i * " + std::to_string(strideB); + // strideB == 0: B is a shared weight, pointer stays at base + pY += " + i * " + std::to_string(strideY); + } + + if (useSBatched) { + // ---------------------------------------------------------------- + // gemmStridedBatched: both A and B vary per batch (e.g. per attention + // head), and there is no bias. Uses cublasSgemmStridedBatched via + // the legacy cuBLAS handle so all N GEMMs are issued in one driver call. 
+ // + // sofieBLAS convention (column-major transpose trick): + // transa_sofie = transB_onnx, transb_sofie = transA_onnx + // m_sofie = n_onnx, n_sofie = m_onnx + // A_sofie = fNB, B_sofie = fNA + // lda = m_sofie (leading dim of A when transA_sofie='n') + // ldb = k (leading dim of B when transB_sofie='n') + // ldc = m_sofie (leading dim of C) + // ---------------------------------------------------------------- + size_t m_sofie = static_cast(std::stoi(n)); // ONNX n + size_t n_sofie = static_cast(std::stoi(m)); // ONNX m + size_t k_val = static_cast(std::stoi(k)); + size_t lda = m_sofie; // transA_sofie='n' + size_t ldb = k_val; // transB_sofie='n' + size_t ldc = m_sofie; + size_t sA = m_sofie * k_val; // stride per batch for fNB + size_t sB = k_val * n_sofie; // stride per batch for fNA (= strideA_onnx) + size_t sC = m_sofie * n_sofie; // stride per batch for fNY (= strideY) + size_t batchCount = static_cast(std::stoi(lengthExtra)); + out << SP << "blas.gemmStridedBatched(" + << opName << "_transB, " << opName << "_transA, " + << m_sofie << ", " << n_sofie << ", " << k_val << ", " + << opName << "_alpha, " + << "alpaka::getPtrNative(deviceBuf_" << fNB << "), " + << lda << ", " << sA << ", " + << "alpaka::getPtrNative(deviceBuf_" << fNA << "), " + << ldb << ", " << sB << ", " + << opName << "_beta, " + << "alpaka::getPtrNative(deviceBuf_" << fNY << "), " + << ldc << ", " << sC << ", " + << batchCount << ");\n"; + } else if (!fNC.empty()) { + // ---------------------------------------------------------------- + // GEMM with bias: Y = alpha * op(A) * op(B) + bias + // cuBLAS is column-major so we swap A↔B and transA↔transB + // (row-major C=A*B ↔ col-major C^T = B^T * A^T). + // The epilogue fuses the bias-add (and optional ReLU/GELU) in the + // same kernel, avoiding a separate element-wise pass. + // + // For batch-collapse (batchCollapseB), use m*batchCount so that all + // tokens are processed in a single cuBLASLt kernel launch instead of N. 
+ // The bias vector is broadcast across all columns by the epilogue. + // ---------------------------------------------------------------- + std::string call_m = batchCollapseB + ? std::to_string(static_cast(std::stoi(m)) * static_cast(std::stoi(lengthExtra))) + : (opName + "_m"); + + std::string pC = "alpaka::getPtrNative(deviceBuf_" + fNC + ")"; + if (useSerialLoop && !fIsDynamic) { + if (!fBroadcastBias && strideC > 0) + pC += " + i * " + std::to_string(strideC); + } + if (fActivation == EActivationType::RELU) { + out << SP << "blas.gemmrelu(" + << opName << "_transB, " << opName << "_transA, " + << opName << "_n, " << call_m << ", " + << opName << "_k, " << opName << "_alpha, " + << pB << ", " << pA << ", " + << opName << "_beta, " << pC << ", " << pY << ");\n"; + } else { + out << SP << "blas.gemm(" + << opName << "_transB, " << opName << "_transA, " + << opName << "_n, " << call_m << ", " + << opName << "_k, " << opName << "_alpha, " + << pB << ", " << pA << ", " + << opName << "_beta, " << pC << ", " << pY << ");\n"; + } + } else { + // ---------------------------------------------------------------- + // Pure MatMul (no bias): Y = alpha * op(A) * op(B) + // This covers: + // • Scaled Dot-Product Attention: softmax(QK^T/√d) @ V + // • Any other no-bias matrix multiplication + // Previously this branch emitted nothing (empty loop body), which + // caused the attention output to be silently uninitialized. + // For batch-collapse, use m*batchCount for the same reason as above. + // ---------------------------------------------------------------- + std::string call_m = batchCollapseB + ? 
std::to_string(static_cast(std::stoi(m)) * static_cast(std::stoi(lengthExtra))) + : (opName + "_m"); + + out << SP << "blas.matmul(" + << opName << "_transB, " << opName << "_transA, " + << opName << "_n, " << call_m << ", " + << opName << "_k, " << opName << "_alpha, " + << pB << ", " << pA << ", " + << opName << "_beta, " << pY << ");\n"; + } + + if (useSerialLoop || (doStackMul && fIsDynamic)) { + out << SP << "}\n"; // end of loop on the stacked multiplication + } + + // GEMM+LeakyReLU fusion (GPU): cuBLASLt has no native LeakyReLU epilogue, + // so we emit a cheap in-place ALPAKA kernel immediately after the GEMM. + // This avoids allocating a separate intermediate buffer and saves one + // GPU kernel launch compared to a standalone LeakyReLU operator. + if (fActivation == EActivationType::LEAKYRELU) { + std::string numElem = ConvertDimShapeToLength(fShapeY); + out << SP << "//--- GEMM+LeakyReLU in-place fusion\n"; + out << SP << "{\n"; + out << SP << SP << "constexpr float " << opName << "_lrelu_alpha = " + << std::setprecision(std::numeric_limits::max_digits10) + << fLeakyReluAlpha << "f;\n"; + out << SP << SP << "auto const elementsPerThread_lrelu_" << opName + << " = Vec::all(static_cast(1));\n"; + out << SP << SP << "auto const elementsPerGrid_lrelu_" << opName + << " = Vec::all(Idx{" << numElem << "});\n"; + out << SP << SP << "auto const workDiv_lrelu_" << opName + << " = sofie_workdiv(elementsPerGrid_lrelu_" << opName << ");\n"; + // In-place: input and output pointer are the same device buffer. 
+ out << SP << SP << "auto task_lrelu_" << opName + << " = alpaka::createTaskKernel(workDiv_lrelu_" << opName + << ", leakyReluKernel" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << numElem << ")" + << ", static_cast(" << opName << "_lrelu_alpha));\n"; + out << SP << SP << "alpaka::enqueue(queue, task_lrelu_" << opName << ");\n"; + out << SP << "}\n"; + } + + return out.str(); + } + + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + + // --- Activation fusion accessors (used by FuseGemmActivations_GPU) --- + EActivationType GetActivationType() const { return fActivation; } + /// Set fused activation. alpha is only meaningful for LEAKYRELU. + void SetActivation(EActivationType act, float alpha = 0.f) { + fActivation = act; + fLeakyReluAlpha = alpha; + } + + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + auto lda = (fAttrTransA ? m : k); + auto ldb = (fAttrTransB ? k : n); + auto ldc = n; + std::string transFlags = std::string(fAttrTransB ? "'t'" : "'n'") + ", " + (fAttrTransA ? 
"'t'" : "'n'"); + + // For stacked (batched) GEMMs on static shapes, return the layout that + // matches the actual call emitted by Generate_GPU_ALPAKA: + // - batch-collapse (strideB==0): single GEMM with n_sofie = m_onnx * batchCount + // → register the batched layout + // - gemmStridedBatched (both strides non-zero, no bias): uses legacy cuBLAS, + // no cuBLASLt layout needed → return "" + if (dimY > 2 && !fIsDynamic) { + std::vector sExtra; + for (int64_t i = 0; i < dimY - 2; i++) sExtra.push_back(fShapeY[i]); + auto lengthExtra = ConvertDimShapeToLength(sExtra); + if (std::stoi(lengthExtra) > 1) { + bool bLeadingDimsAllOne = true; + for (int64_t i = 0; i < dimB - 2; i++) { + if (fShapeB[i].dim != 1) { bLeadingDimsAllOne = false; break; } + } + if (bLeadingDimsAllOne) { + // batch-collapse: register layout for the full-batch GEMM + auto m_batched = std::to_string(std::stoi(m) * std::stoi(lengthExtra)); + return n+", "+m_batched+", "+k+", "+ldb+", "+lda+", "+ldc+", "+transFlags; + } else if (fNC.empty()) { + // gemmStridedBatched: legacy cuBLAS, no cuBLASLt layout needed + return ""; + } + // else: serial loop with bias — fall through to per-iteration layout + } + } + + return n+", "+m+", "+k+", "+ldb+", "+lda+", "+ldc+", "+transFlags; + } + }; + + +}//SOFIE + +#endif //SOFIE_ROPERATOR_GEMM diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx b/core/inc/SOFIE/ROperator_Identity.hxx similarity index 66% rename from src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx rename to core/inc/SOFIE/ROperator_Identity.hxx index efb6b14..43688cf 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx +++ b/core/inc/SOFIE/ROperator_Identity.hxx @@ -41,7 +41,7 @@ public: void Initialize(RModel& model) override { //input must be a graph input, or already initialized intermediate tensor if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Identity Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Identity 
Op Input Tensor is not found in model"); } fShape = model.GetTensorShape(fNX); if (model.IsInitializedTensor(fNX)) { @@ -77,7 +77,7 @@ public: if (fIsOutputConstant || fIsInputInitialized) return ""; OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Identity called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Identity called to Generate without being initialized first"); } std::stringstream out; out << "\n//------ IDENTITY\n"; @@ -86,6 +86,31 @@ public: return out.str(); } + std::string GenerateInitCode_GPU_ALPAKA() override { + // For initialized (weight) tensors: the device buffer for X is already populated by + // MoveInitializedTensorsToBuffers_ALPAKA(); copy it into the Y device buffer. + if (!fIsInputInitialized) return ""; + std::stringstream out; + out << "\n//------ IDENTITY (init)\n"; + out << SP << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + return out.str(); + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // Constant outputs and already-initialised tensors need no runtime work. + if (fIsOutputConstant || fIsInputInitialized) return ""; + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Identity called to Generate_GPU_ALPAKA without being initialized first"); + } + std::stringstream out; + out << "\n//------ IDENTITY\n"; + // Device buffers cannot simply be aliased; perform an explicit device-to-device copy. 
+ out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx b/core/inc/SOFIE/ROperator_LSTM.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx rename to core/inc/SOFIE/ROperator_LSTM.hxx index 5bfd4e3..69fb7a2 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx +++ b/core/inc/SOFIE/ROperator_LSTM.hxx @@ -106,7 +106,7 @@ template class ROperator_LSTM final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a LSTM operator"); + "SOFIE Encountered unsupported type parsing a LSTM operator"); } fInputTensorNames = { fNX, fNW, fNR }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/core/inc/SOFIE/ROperator_LSTM.icc similarity index 97% rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc rename to core/inc/SOFIE/ROperator_LSTM.icc index bec7760..2fb390d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -41,33 +40,33 @@ auto ROperator_LSTM::Initialize(RModel& model) fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + 
fNW + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + " is not found in model."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + " is not of 3 dimensions."); + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + " is not of 3 dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not found in model."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 5) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions."); } if (fShapeB.size() == 2) { // Broadcasting the bias @@ -104,46 +103,46 @@ auto ROperator_LSTM::Initialize(RModel& model) } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw 
std::runtime_error("SOFIE LSTM Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_h + " is not of 3 dimensions."); } } if (!fNInitial_c.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_c)) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_c + " is not found in model."); } fShapeInitial_c = model.GetTensorShape(fNInitial_c); if (fShapeInitial_c.size() != 3) { - throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + + throw std::runtime_error("SOFIE LSTM Op input tensor " + fNInitial_c + " is not of 3 dimensions."); } } if (!fNP.empty()) { if (!model.CheckIfTensorAlreadyExist(fNP)) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not found in model."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not found in model."); } fShapeP = model.GetTensorShape(fNP); if (fShapeP.size() != 2 && fShapeP.size() != 4) { - throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions."); + throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions."); } if (fShapeP.size() == 2) { // Broadcasting the weight for peepholes @@ -197,28 +196,28 @@ auto ROperator_LSTM::Initialize(RModel& model) activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation 
function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } if (fAttrDirection != "forward" && fAttrDirection != "backward" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid LSTM direction fAttrDirection = " + + "SOFIE - Invalid LSTM direction fAttrDirection = " + fAttrDirection); } if (4 * fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1] / 4)); } if (fAttrInputForget > 1) { throw std::runtime_error( - "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget) + "SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget) + " must be 0 or 1."); } if (fAttrLayout > 1) { - throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = " + + throw std::runtime_error("SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/core/inc/SOFIE/ROperator_LayerNormalization.hxx b/core/inc/SOFIE/ROperator_LayerNormalization.hxx new file mode 100644 index 0000000..dbf113a --- /dev/null +++ b/core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -0,0 +1,732 @@ +#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION +#define SOFIE_ROPERATOR_LAYERNORMALIZATION + +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include +#include + +namespace SOFIE { + +template +class ROperator_LayerNormalization : public ROperator { +private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in 
floats (to be impl) + int fAttrAxis; + float fAttrEpsilon; + size_t fAttrStashType; + + std::string fNX; + std::string fNScale; + std::string fNB; + std::string fNY; + std::string fNMean; + std::string fNInvStdDev; + + std::string fNCastedX; + std::string fNNormalizedX; + std::string fNBroadcastedB; + + std::vector fShapeX; + std::vector fShapeScale; + std::vector fShapeB; + std::vector fShapeY; + std::vector fShapeMean; + std::vector fShapeInvStdDev; + + size_t fAxis; // axis in [0, size) + size_t fSize; // Size of the input + // size_t fAxisDim; + + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] + // lengths in string format + std::string fLength; // Length of the input + std::string fNormalizedLength; + std::string fAxesLength; + + std::string fType; + +public: + ROperator_LayerNormalization() {} + + ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX, + const std::string &nameScale, const std::string &nameB, const std::string &nameY, + const std::string &nameMean, const std::string &nameInvStdDev) + : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)), + fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)), + fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev)) + { + fInputTensorNames = { fNX, fNScale }; + if (!fNB.empty()){ + fInputTensorNames.emplace_back(fNB); + } + + fOutputTensorNames = { fNY }; + if (!fNMean.empty()){ + fOutputTensorNames.emplace_back(fNMean); + } + if (!fNInvStdDev.empty()){ + fOutputTensorNames.emplace_back(fNInvStdDev); + } + } + + std::vector> ShapeInference(std::vector> input) override { return input; } + + std::vector TypeInference(std::vector input) override { return input; } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) { + throw 
std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); + } + bool isDynamic = model.IsDynamicTensor(fNX); + fShapeX = model.GetDimTensorShape(fNX); + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + // Type of the output + fType = ConvertTypeToString(model.GetTensorType(fNX)); + // Size of the input + fSize = fShapeX.size(); + // Axis in [0, size) + fAxis = (fAttrAxis < 0) ? fSize + fAttrAxis : fAttrAxis; + // Shape of fShapeX[0, ..., fAxis) + fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); + // Length of the axes + fAxesLength = ConvertDimShapeToLength(fAxesShape); + // Shape of fShapeX[fAxis, ..., fSize) + fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); + // Length of the normalized axis + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); + // length of the input + fLength = ConvertDimShapeToLength(fShapeX); + // Type of mean and std + ETensorType type = (fAttrStashType == 1) ? 
ETensorType::FLOAT : model.GetTensorType(fNX); + // Mean + if (!fNMean.empty()) { + // cannot use initializer list with one element since it is ambiguous + if (isDynamic) + // add size_t(-1) to indicate that shape is an expression + model.AddIntermediateTensor(fNMean, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); + else + model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); + } + // Inverse Standard Deviation + if (!fNInvStdDev.empty()) { + if (isDynamic) + model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); + else + model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); + } + // if mean and stdev are not empty they are not defined in the output list + // Cast X to float + if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // appends 1 to scale shapes if missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also shape if consistent now + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); + } + if (!fNB.empty()) { + fShapeB = model.GetDimTensorShape(fNB); + // appends 1 to bias shapes if missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) 
{ + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); + } + } + + std::cout << "bias + scale " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } + model.AddNeededStdLib("cmath"); + } + + std::string GenerateInitCode() override + { + std::stringstream out; + if (!fNBroadcastedB.empty()) { + out << SP << "// Broadcasting the bias of LayerNormalization op\n"; + out << SP << "{\n"; + out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; + out << fNB << ", " << ConvertDimShapeToString(fShapeB) << ", " << ConvertDimShapeToString(fShapeX) << ");\n"; + out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; + out << SP << "delete[] data;\n"; + out << SP << "}\n"; + } + return out.str(); + } + + std::string Generate(std::string opName) override + { + opName = "op_" + opName; + if (fShapeX.empty()) { + throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + + " called to generate without being initialized first."); + } + + std::stringstream out; + + out << "//---- Layer Normalization operator " << opName << "\n"; + + // Loop over all the normalized axes i.e. 
[axis, ..., size) + std::vector inputShape(fSize); + + for (size_t i = 0; i < fSize; i++) { + inputShape[i] = fShapeX[i].GetVal(); + } + + auto strides = UTILITY::ComputeStrideFromShape(fShapeX); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); + for (size_t i = 1; i < fSize; i++) { + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); + } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } + } + if (biasIndex.empty()) biasIndex = "0"; + + auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); + std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); + for (size_t i = 1; i < fAxis; i++) { + axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); + } + + + // compute mean and std-dev. 
Save in tensors if requested + + out << SP << "// Compute the mean\n"; + + // Loop over all the outer dims in [0, fAxis) + for (size_t i = 0; i < fAxis; i++) { + std::string iIdx = "axis_" + std::to_string(i); + out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + << "; " << iIdx << "++) {\n"; + } + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] + << "; " << jIdx << "++) {\n"; + } + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + + + out << SP << "// Compute the inverse Standard Deviation\n"; + + // Set sum = 0 + out << SP << SP << fType << " sum = 0.;\n"; + // loop over all the dims in [0, fAxis) + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] + << "; " << jIdx << "++){\n"; + } + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; + out << SP << SP << SP << "sum += tmp*tmp;\n"; + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; + out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; + + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - 
Mean)\n"; + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; + } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; + + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + // close loops on normalizing dim [..,fAxis,...fSize-1] + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + // close loops on the other dimensions [0,...,fAxis] + for (size_t i = 0; i < fAxis; i++) { + out << SP << "}\n"; + } + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty()) + throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first"); + + // ----------------------------------------------------------------------- + // Parallel block-per-row strategy (for static normalizedLength ≤ 1024): + // • One block per row (axes element). + // • blockSize = next power-of-2 ≥ normalizedLength, capped at 1024. + // • Each thread loads one element, two shared-memory tree reductions + // compute mean then variance; final pass normalises in parallel. + // This replaces the previous single-thread-per-row serial scan. + // For dynamic shapes or normalizedLength > 1024, fall back to the original + // serial kernel (one thread per row, explicit loops). 
+ // ----------------------------------------------------------------------- + + // Determine whether we can use the parallel path + size_t normLenVal = 0; + bool canParallel = false; + try { + normLenVal = std::stoul(fNormalizedLength); + canParallel = (normLenVal > 0 && normLenVal <= 1024); + } catch (...) {} + + // Compute blockSize = next power-of-2 >= normLenVal + size_t blockSize = 1; + if (canParallel) { + while (blockSize < normLenVal) blockSize <<= 1; + } + + // Each thread handles one "row" — one element of the axes dims [0..axis) + // and iterates over all normalized dims [axis..size) + // axesLength = product of fShapeX[0..axis) + // normalizedLength = product of fShapeX[axis..size) + // totalElements = axesLength (one thread per row) + + std::vector inputShape(fSize); + for (size_t i = 0; i < fSize; i++) + inputShape[i] = fShapeX[i].GetVal(); + + auto strides = UTILITY::ComputeStrideFromShape(fShapeX); + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + auto biasStrides = (!fNB.empty()) ? UTILITY::ComputeStrideFromShape(fShapeB) + : std::vector{}; + auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); + + // Build index expressions reusing the same logic as Generate() + // input index: axis_0*stride0 + axis_1*stride1 + ... + norm_0*stride_axis + ... + // For the kernel we decompose the linear thread index into axis coords, + // then loop over normalized dims inside the kernel. 
+ + std::string kname = "LayerNormKernel_" + opName; + std::string op; + op = "\n//------ LAYERNORM_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ X,\n"; + op += SP + SP + SP + "T const* __restrict__ scale,\n"; + if (!fNB.empty()) + op += SP + SP + SP + "T const* __restrict__ bias,\n"; + if (!fNMean.empty()) + op += SP + SP + SP + "T* __restrict__ out_mean,\n"; + if (!fNInvStdDev.empty()) + op += SP + SP + SP + "T* __restrict__ out_invstd,\n"; + op += SP + SP + SP + "T* __restrict__ Y,\n"; + op += SP + SP + SP + "std::size_t const axesLength) const {\n\n"; + + if (canParallel) { + // --------------------------------------------------------------- + // PARALLEL PATH: one block per row, blockSize threads per block. + // Each thread handles one element in the normalised dimension. + // Two shared-memory tree reductions compute mean then variance. + // --------------------------------------------------------------- + std::string bs = std::to_string(blockSize); + std::string nl = fNormalizedLength; // e.g. 
"64" + std::string eps = std::to_string(fAttrEpsilon); + + op += SP + SP + SP + "// Block-parallel LayerNorm: one block per row, " + + bs + " threads per block, " + nl + " active.\n"; + op += SP + SP + SP + "auto& shmem = alpaka::declareSharedVar(acc);\n"; + op += SP + SP + SP + "auto const row = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "auto const tid = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (row >= axesLength) return;\n\n"; + + // --- Decompose row into axis-dim coordinates (same logic as serial path) --- + if (fAxis > 0) { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + "std::size_t const axis_" + std::to_string(i) + + " = (row / " + axesStrides[i].GetVal() + "u) % " + + inputShape[i] + "u;\n"; + } + op += "\n"; + } + + // Base input offset for this row + op += SP + SP + SP + "std::size_t const row_base =\n"; + if (fAxis == 0) { + op += SP + SP + SP + SP + "0u;\n\n"; + } else { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + strides[i].GetVal() + "u"; + op += (i + 1 < fAxis) ? " +\n" : ";\n\n"; + } + } + + // Map thread id → index within normalised dims. + // For each normalised dim j, the "within-norm" stride is the product of + // dimensions after it: normInnerStrides[j-fAxis] computed at code-gen time. + // Then: norm_offset = sum_j( (tid / normInnerStride[j]) % dim[j] * stride[j] ) + // For the common 1D normalised case this simplifies to: norm_offset = tid * stride[fAxis] + + // Build the norm-dim strides (strides within the flattened normalised space) + auto normShape = fNormalizedShape; // dims [fAxis .. 
fSize-1] + auto normInner = UTILITY::ComputeStrideFromShape(normShape); + + op += SP + SP + SP + "bool const in_range = (tid < " + nl + "u);\n"; + op += SP + SP + SP + "std::size_t norm_offset = 0u;\n"; + op += SP + SP + SP + "std::size_t s_norm_offset = 0u;\n"; + if (!fNB.empty()) + op += SP + SP + SP + "std::size_t b_norm_offset = 0u;\n"; + op += SP + SP + SP + "if (in_range) {\n"; + + if (fSize - fAxis == 1) { + // Single normalised dim — simplest case + op += SP + SP + SP + SP + "norm_offset = tid * " + strides[fAxis].GetVal() + "u;\n"; + if (fShapeScale[fAxis].dim != 1) + op += SP + SP + SP + SP + "s_norm_offset = tid * " + scaleStrides[fAxis].GetVal() + "u;\n"; + if (!fNB.empty() && fShapeB[fAxis].dim != 1) + op += SP + SP + SP + SP + "b_norm_offset = tid * " + biasStrides[fAxis].GetVal() + "u;\n"; + } else { + // Multi-dim normalised space + op += SP + SP + SP + SP + "std::size_t norm_rem = tid;\n"; + for (size_t j = fAxis; j < fSize; ++j) { + size_t ji = j - fAxis; + op += SP + SP + SP + SP + "{ std::size_t nj = norm_rem / " + normInner[ji].GetVal() + "u;" + + " norm_rem %= " + normInner[ji].GetVal() + "u;" + + " norm_offset += nj * " + strides[j].GetVal() + "u;"; + if (fShapeScale[j].dim != 1) + op += " s_norm_offset += nj * " + scaleStrides[j].GetVal() + "u;"; + if (!fNB.empty() && fShapeB[j].dim != 1) + op += " b_norm_offset += nj * " + biasStrides[j].GetVal() + "u;"; + op += " }\n"; + } + } + op += SP + SP + SP + "}\n\n"; + + op += SP + SP + SP + "std::size_t const norm_idx = row_base + norm_offset;\n"; + op += SP + SP + SP + "T const val = in_range ? 
X[norm_idx] : static_cast(0);\n\n"; + + // --- Pass 1: parallel mean --- + op += SP + SP + SP + "// Pass 1: compute mean via shared-memory tree reduction\n"; + op += SP + SP + SP + "shmem[tid] = val;\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n"; + size_t half = blockSize / 2; + while (half > 0) { + op += SP + SP + SP + "if (tid < " + std::to_string(half) + "u) shmem[tid] += shmem[tid + " + std::to_string(half) + "u];\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n"; + half >>= 1; + } + op += SP + SP + SP + "T const mean = shmem[0] / static_cast(" + nl + ");\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n\n"; + + // --- Pass 2: parallel variance --- + op += SP + SP + SP + "// Pass 2: compute variance\n"; + op += SP + SP + SP + "T const diff = val - mean;\n"; + op += SP + SP + SP + "shmem[tid] = in_range ? diff * diff : static_cast(0);\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n"; + half = blockSize / 2; + while (half > 0) { + op += SP + SP + SP + "if (tid < " + std::to_string(half) + "u) shmem[tid] += shmem[tid + " + std::to_string(half) + "u];\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n"; + half >>= 1; + } + op += SP + SP + SP + "T const invStdDev = static_cast(1) / alpaka::math::sqrt(acc," + " shmem[0] / static_cast(" + nl + ") + static_cast(" + eps + "));\n\n"; + + // Save mean/invstd if requested + if (!fNMean.empty()) + op += SP + SP + SP + "if (tid == 0u) out_mean[row] = mean;\n"; + if (!fNInvStdDev.empty()) + op += SP + SP + SP + "if (tid == 0u) out_invstd[row] = invStdDev;\n"; + op += "\n"; + + // --- Pass 3: normalise, scale, bias --- + op += SP + SP + SP + "// Pass 3: normalize + scale + bias\n"; + op += SP + SP + SP + "if (in_range) {\n"; + + // scale base (axis contribution) + op += SP + SP + SP + SP + "std::size_t const scale_base =\n"; + { + bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeScale[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + 
std::to_string(i) + + " * " + scaleStrides[i].GetVal() + "u"; + if (any) op += " +\n"; + any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n"; + } + op += SP + SP + SP + SP + "T out_val = scale[scale_base + s_norm_offset] * invStdDev * (val - mean);\n"; + + if (!fNB.empty()) { + op += SP + SP + SP + SP + "std::size_t const bias_base =\n"; + bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeB[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + biasStrides[i].GetVal() + "u"; + if (any) op += " +\n"; + any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n"; + op += SP + SP + SP + SP + "out_val += bias[bias_base + b_norm_offset];\n"; + } + + op += SP + SP + SP + SP + "Y[norm_idx] = out_val;\n"; + op += SP + SP + SP + "}\n"; // end in_range + + } else { + // --------------------------------------------------------------- + // SERIAL PATH (dynamic shapes or normalizedLength > 1024): + // one thread per row, explicit loops over normalized dims. 
+ // --------------------------------------------------------------- + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= axesLength) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + op += SP + SP + SP + "for (std::size_t row = global_thread_idx; row < axesLength; row += grid_thread_extent) {\n\n"; + + if (fAxis > 0) { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + SP + "std::size_t const axis_" + std::to_string(i) + + " = (row / " + axesStrides[i].GetVal() + "u) % " + + inputShape[i] + "u;\n"; + } + op += "\n"; + } + + op += SP + SP + SP + SP + "std::size_t const row_base =\n"; + if (fAxis == 0) { + op += SP + SP + SP + SP + SP + "0u;\n\n"; + } else { + for (size_t i = 0; i < fAxis; ++i) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + strides[i].GetVal() + "u"; + op += (i + 1 < fAxis) ? " +\n" : ";\n\n"; + } + } + + op += SP + SP + SP + SP + "std::size_t const scale_base =\n"; + { bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeScale[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + scaleStrides[i].GetVal() + "u"; + if (any) op = " +\n" + op; any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n\n"; + } + + if (!fNB.empty()) { + op += SP + SP + SP + SP + "std::size_t const bias_base =\n"; + bool any = false; + for (size_t i = 0; i < fAxis; ++i) { + if (fShapeB[i].dim != 1) { + op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i) + + " * " + biasStrides[i].GetVal() + "u"; + if (any) op = " +\n" + op; any = true; + } + } + if (!any) op += SP + SP + SP + SP + SP + "0u"; + op += ";\n\n"; + } + + op += SP + SP + SP + SP + "T mean = static_cast(0);\n"; + for (size_t j = fAxis; j < fSize; ++j) + op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) + + " = 0; n_" + std::to_string(j) + " < " + 
inputShape[j] +
+               "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "mean += X[norm_idx];\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n"; // one closing brace per normalized axis
+         op += SP + SP + SP + SP + "mean /= static_cast(" + fNormalizedLength + ");\n\n";
+         // Emitted pass 2: accumulate the squared deviations from the mean over the normalized axes.
+         op += SP + SP + SP + SP + "T sum = static_cast(0);\n";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) +
+                  " = 0; n_" + std::to_string(j) + " < " + inputShape[j] +
+                  "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "T tmp = X[norm_idx] - mean;\n";
+         op += SP + SP + SP + SP + SP + "sum += tmp * tmp;\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n";
+         op += SP + SP + SP + SP + "T const invStdDev = static_cast(1) / "
+               "alpaka::math::sqrt(acc, sum / static_cast(" + fNormalizedLength +
+               ") + static_cast(" + std::to_string(fAttrEpsilon) + "));\n\n"; // NOTE(review): if fAttrEpsilon is floating point, std::to_string prints 6 fixed decimals, so e.g. 1e-12 is emitted as 0.000000 — confirm
+         // Optional per-row outputs, emitted only when the corresponding tensor name is set.
+         if (!fNMean.empty())
+            op += SP + SP + SP + SP + "out_mean[row] = mean;\n";
+         if (!fNInvStdDev.empty())
+            op += SP + SP + SP + SP + "out_invstd[row] = invStdDev;\n";
+         op += "\n";
+         // Emitted pass 3: Y = scale * invStdDev * (X - mean), plus bias when fNB is set.
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j) +
+                  " = 0; n_" + std::to_string(j) + " < " + inputShape[j] +
+                  "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const s_idx = scale_base";
+         for (size_t j = fAxis; j < fSize; ++j) {
+            if (fShapeScale[j].dim != 1) // axes of extent 1 in the scale shape add no index term
+               op += " + n_" + std::to_string(j) + " * " + scaleStrides[j].GetVal() + "u";
+         }
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "T val = scale[s_idx] * invStdDev * (X[norm_idx] - mean);\n";
+         if (!fNB.empty()) {
+            op += SP + SP + SP + SP + SP + "std::size_t const b_idx = bias_base";
+            for (size_t j = fAxis; j < fSize; ++j) {
+               if (fShapeB[j].dim != 1) // same rule as for scale: extent-1 axes add no index term
+                  op += " + n_" + std::to_string(j) + " * " + biasStrides[j].GetVal() + "u";
+            }
+            op += ";\n";
+            op += SP + SP + SP + SP + SP + "val += bias[b_idx];\n";
+         }
+         op += SP + SP + SP + SP + SP + "Y[norm_idx] = val;\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n";
+
+         op += SP + SP + SP + "}\n"; // end row loop
+      }
+
+      op += SP + SP + "}\n"; // end operator()
+      op += SP + "};\n"; // end struct
+      return op;
+   }
+   // Emits the declaration of this operator's kernel object: "LayerNormKernel_op_<name> layerNormKernel_op_<name>;".
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      std::string kname = "LayerNormKernel_" + opName;
+      return SP + kname + " layerNormKernel_" + opName + ";\n";
+   }
+   // Emits the launch code for the kernel object declared above; throws std::runtime_error when fShapeX is empty.
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty())
+         throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first");
+
+      std::string axesLengthStr = fAxesLength;
+      std::string kname = "layerNormKernel_" + opName;
+
+      // Determine parallel vs serial (same logic as kernel generation)
+      size_t normLenVal2 = 0;
+      bool canParallel2 = false;
+      try { normLenVal2 = std::stoul(fNormalizedLength); canParallel2 = (normLenVal2 > 0 && normLenVal2 <= 1024); } // NOTE(review): std::stoul accepts a leading numeric prefix (e.g. "64*N" parses as 64) — confirm fNormalizedLength is purely numeric whenever it starts with a digit
+      catch (...) 
{}
+      size_t blockSize2 = 1;
+      if (canParallel2) { while (blockSize2 < normLenVal2) blockSize2 <<= 1; } // smallest power of two >= normLenVal2
+      // Kernel arguments in order: X, scale, then the optional bias / mean / inv-std-dev buffers, then Y and the axes length.
+      std::string args =
+         "alpaka::getPtrNative(deviceBuf_" + fNX + "), " +
+         "alpaka::getPtrNative(deviceBuf_" + fNScale + ")";
+      if (!fNB.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNB + ")";
+      if (!fNMean.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNMean + ")";
+      if (!fNInvStdDev.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNInvStdDev + ")";
+      args += ", alpaka::getPtrNative(deviceBuf_" + fNY + ")";
+      args += ", static_cast(" + axesLengthStr + ")";
+
+      std::stringstream out;
+      out << "\n//------ LAYERNORM_GPU_ALPAKA\n";
+      if (canParallel2) {
+         // Parallel: one block per row, blockSize2 threads per block
+         out << SP << "alpaka::WorkDivMembers workDiv_" << opName << "(\n";
+         out << SP << SP << "Vec::all(Idx{" << axesLengthStr << "}),\n"; // numBlocks = rows
+         out << SP << SP << "Vec::all(Idx{" << blockSize2 << "u}),\n"; // threads/block
+         out << SP << SP << "Vec::all(Idx{1u}));\n";
+      } else {
+         // Serial fallback: normal sofie_workdiv
+         out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << axesLengthStr << "});\n";
+         out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      }
+      out << SP << "alpaka::exec(queue, workDiv_" << opName
+          << ", " << kname << ", " << args << ");\n";
+
+      return out.str();
+   }
+
+   std::vector GetBlasRoutines() override { return { std::string("Axpy") }; }
+
+   std::vector GetStdLibs() override { return { std::string("cmath") }; }
+};
+
+} // namespace SOFIE
+
+#endif
diff --git a/core/inc/SOFIE/ROperator_LeakyRelu.hxx b/core/inc/SOFIE/ROperator_LeakyRelu.hxx
new file mode 100644
index 0000000..9eb15c1
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_LeakyRelu.hxx
@@ -0,0 +1,144 @@
+#ifndef SOFIE_ROPERATOR_LeakyRelu
+#define SOFIE_ROPERATOR_LeakyRelu
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include
+
+
+namespace SOFIE{
+// LeakyRelu operator: the generated code computes y = x for x >= 0 and alpha * x otherwise, element by element.
+template // NOTE(review): the template parameter list and other '<...>' spans are missing from this copy of the patch — restore them from the real header
+class ROperator_LeakyRelu final : public ROperator
+{
+
+private:
+
+   /* Attributes*/
+   float falpha=0.01; //default value
+   std::string fNX;
+   std::string fNY;
+   std::vector fShape;
+   std::string fType;
+
+public:
+   ROperator_LeakyRelu(){}
+   ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY):
+   falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      fKind = OperatorKind::LEAKYRELU;
+      if(std::is_same::value){ // presumably std::is_same<T, float> — arguments lost in this copy, confirm
+         fType = "float";
+      }
+      else{
+         throw
+            std::runtime_error("SOFIE Encountered unsupported type parsing a Leaky Relu operator");
+      }
+
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+   // Type inference: returns the input types unchanged.
+   std::vector TypeInference(std::vector input) override {
+      return input;
+   }
+   // Shape inference: returns a copy of the input shapes.
+   std::vector> ShapeInference(std::vector> input) override {
+      auto ret = input; //suggest copy to compiler
+      return ret;
+   }
+   // Requires fNX to exist in the model (throws std::runtime_error otherwise); registers fNY with fNX's type and shape.
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("SOFIE Leaky Relu Op Input Tensor is not found in model");
+      }
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+   }
+
+   // CPU code generation: emits the alpha constant and a scalar loop over all elements; throws when fShape is empty.
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Leaky Relu called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+
+      out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n";
+
+      out << "\n//------ LEAKY RELU\n";
+      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+   // Emits the LeakyReluKernel functor. NOTE(review): the struct name ignores opName — confirm the caller emits it only once per generated model.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ LEAKY_RELU_KERNEL_ALPAKA\n";
+      op += "struct LeakyReluKernel {\n";
+      op += SP + "template\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n";
+      op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n";
+      op += SP + SP + SP + "if (idx < numElements) {\n"; // threads whose index is past the end do nothing
+      op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : alpha * data[idx];\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+   // Emits the declaration of the leakyReluKernel object referenced by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return "LeakyReluKernel leakyReluKernel;\n";
+   }
+   // GPU launch code: emits the alpha constant, creates a kernel task and enqueues it on 'queue'; throws when fShape is empty.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator LeakyRelu called to Generate without being initialized first");
+      }
+      // NOTE(review): the elementsPerThread_/elementsPerGrid_ statements below are truncated in this copy (text between '<' and '>' was lost) — recover them from the real header.
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n";
+      out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n";
+      out << SP << "auto const elementsPerThread_"<(1));\n";
+      out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX
+          << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX
+          << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n";
+      out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+   /// Alpha accessor — used by the GEMM+LeakyReLU fusion pass.
+   float GetAlpha() const { return falpha; }
+
+   bool IsElementwise() const override { return true; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "((" + v + " >= 0) ? " + v + " : " + std::to_string(falpha) + " * " + v + ")"; // NOTE(review): std::to_string(float) prints 6 fixed decimals (an alpha of 1e-7f becomes "0.000000"), unlike the max_digits10 formatting in Generate()
+   }
+
+   // Returns the output tensor name (fNY).
+   std::string GetFusableOutputTensorName() override {
+      return fNY;
+   }
+   // Passes the current input and output names to removal_func, then makes the operator read and write fusable_tensor_name.
+   void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){
+      removal_func(fNX);
+      removal_func(fNY);
+      fNX = fusable_tensor_name;
+      fNY = fusable_tensor_name;
+      fInputTensorNames[0] = fNX;
+      fOutputTensorNames[0] = fNY;
+   }
+
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_LeakyRelu
diff --git a/core/inc/SOFIE/ROperator_Logic.hxx b/core/inc/SOFIE/ROperator_Logic.hxx
new file mode 100644
index 0000000..3f98e94
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Logic.hxx
@@ -0,0 +1,336 @@
+#ifndef SOFIE_ROPERATOR_LOGIC
+#define SOFIE_ROPERATOR_LOGIC
+
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include
+#include
+#include
+#include
+
+namespace SOFIE {
+
+enum class ELogicBinaryOp {
+   // logical (bool / uint8)
+   And,
+   Or,
+   Xor,
+   // bitwise (integer types)
+   BitwiseAnd,
+   BitwiseOr,
+   BitwiseXor,
+};
+
+// LogicBinaryTrait: per-operation display name, kernel struct name, generated C++ expression, host-side Eval and output tensor type.
+template
+struct LogicBinaryTrait {};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "And"; }
+   static std::string KernelName() { return "AndKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " && " + b + ")";
+   }
+   static T Eval(T a, T b) { return static_cast(a && b); }
+   static ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "Or"; }
+   static std::string KernelName() { return "OrKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " || " + b + ")";
+   }
+   static T Eval(T a, T b) { return static_cast(a || b); }
+   static 
ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "Xor"; }
+   static std::string KernelName() { return "XorKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " != " + b + ")"; // NOTE(review): '!=' equals logical XOR only when both operands are 0/1 — confirm inputs are always bool-valued
+   }
+   static T Eval(T a, T b) { return static_cast(a != b); }
+   static ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "BitwiseAnd"; }
+   static std::string KernelName() { return "BitwiseAndKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " & " + b + ")";
+   }
+   static T Eval(T a, T b) { return static_cast(a & b); }
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "BitwiseOr"; }
+   static std::string KernelName() { return "BitwiseOrKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " | " + b + ")";
+   }
+   static T Eval(T a, T b) { return static_cast(a | b); }
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+
+template
+struct LogicBinaryTrait {
+   static std::string Name() { return "BitwiseXor"; }
+   static std::string KernelName() { return "BitwiseXorKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) {
+      return "(" + a + " ^ " + b + ")";
+   }
+   static T Eval(T a, T b) { return static_cast(a ^ b); }
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+// Element-wise binary logic / bitwise operator; the concrete operation is supplied by the Trait alias below.
+template
+class ROperator_LogicBinary final : public ROperator {
+private:
+   std::string fNA;
+   std::string fNB;
+   std::string fNY;
+   std::vector fShape;
+
+   using Trait = LogicBinaryTrait;
+
+public:
+   ROperator_LogicBinary() {}
+
+   ROperator_LogicBinary(std::string nameA, std::string nameB, std::string nameY)
+      : fNA(UTILITY::Clean_name(nameA)),
+        fNB(UTILITY::Clean_name(nameB)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames = { fNA, fNB };
+      fOutputTensorNames = { fNY };
+   }
+
+   // ── Type / shape inference ────────────────────────────────────────────────
+   std::vector TypeInference(std::vector input) override {
+      return { Trait::OutputType() };
+   }
+   // Returns the first input's shape; throws std::runtime_error when fewer than two inputs are given.
+   std::vector> ShapeInference(std::vector> input) override {
+      if (input.size() < 2)
+         throw std::runtime_error("SOFIE " + Trait::Name() +
+                                  " ShapeInference requires 2 inputs");
+      return { input[0] };
+   }
+   // Validates both inputs, then either constant-folds the result or registers fNY as an intermediate tensor.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNA))
+         throw std::runtime_error("SOFIE " + Trait::Name() + ": input A '" +
+                                  fNA + "' not found in model");
+      if (!model.CheckIfTensorAlreadyExist(fNB))
+         throw std::runtime_error("SOFIE " + Trait::Name() + ": input B '" +
+                                  fNB + "' not found in model");
+      // NOTE(review): only A's shape is read; B's shape is never compared with it, so broadcasting is not handled here — confirm both inputs always match.
+      fShape = model.GetDimTensorShape(fNA);
+      auto length = ConvertShapeToLength(fShape);
+      // Constant-fold: if both inputs are constant, compute output at init time.
+      if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) {
+         auto dataA = static_cast(model.GetInitializedTensorData(fNA).get());
+         auto dataB = static_cast(model.GetInitializedTensorData(fNB).get());
+         std::vector dataY(length);
+         for (size_t i = 0; i < length; ++i)
+            dataY[i] = Trait::Eval(dataA[i], dataB[i]);
+         std::vector outShape = (length == 1) ?
+            std::vector{} : std::vector{ length }; // NOTE(review): the folded constant is registered as a scalar or a flat {length} shape, not with A's original shape — confirm intended
+         model.AddConstantTensor(fNY, outShape, dataY.data());
+         fIsOutputConstant = true;
+      } else {
+         model.AddIntermediateTensor(fNY, Trait::OutputType(), fShape);
+      }
+
+      if (model.Verbose()) {
+         std::cout << Trait::Name() << " : " << fNA << " , " << fNB
+                   << " -> " << fNY << " " << ConvertDimShapeToString(fShape)
+                   << (fIsOutputConstant ? " [constant-folded]" : "") << std::endl;
+      }
+   }
+   // CPU code generation: one loop applying Trait::Expr element by element; emits nothing when the output was constant-folded.
+   std::string Generate(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      OpName = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ " << Trait::Name() << "\n";
+      out << SP << "for (std::size_t id = 0; id < " << length << "u; ++id) {\n"; // NOTE(review): 'u' is appended to the length text — only valid if that text ends in an integer literal; confirm for symbolic shapes
+      out << SP << SP << "tensor_" << fNY << "[id] = "
+          << Trait::Expr("tensor_" + fNA + "[id]", "tensor_" + fNB + "[id]")
+          << ";\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+   // Emits a kernel struct named <KernelName>_op_<OpName>; in it, each thread with idx < N writes C[idx].
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      OpName = "op_" + OpName;
+      std::stringstream op;
+      op << "\n//------ " << Trait::Name() << "_KERNEL_ALPAKA\n";
+      op << "struct " << Trait::KernelName() << "_" << OpName << " {\n";
+      op << SP << "template\n";
+      op << SP << "ALPAKA_FN_ACC void operator()("
+         << "TAcc const& acc, "
+         << "T const* __restrict__ A, "
+         << "T const* __restrict__ B, "
+         << "T* __restrict__ C, "
+         << "std::size_t const N) const {\n";
+      op << SP << SP << "auto const idx = "
+         << "alpaka::getIdx(acc)[0];\n";
+      op << SP << SP << "if (idx >= N) return;\n";
+      op << SP << SP << "C[idx] = " << Trait::Expr("A[idx]", "B[idx]") << ";\n";
+      op << SP << "}\n";
+      op << "};\n";
+      return op.str();
+   }
+   // Emits the declaration of the kernel object logic_op_<OpName>Kernel used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      std::string clean = "op_" + OpName;
+      return SP + Trait::KernelName() + "_" + clean + " logic_" + clean + "Kernel;\n";
+   }
+   // GPU launch code: derives the work division from the element count, creates the kernel task and enqueues it on 'queue'.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      std::string cleanOp = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ " << Trait::Name() << "_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY
+          << " = Vec::all(static_cast(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNY
+          << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY
+          << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << cleanOp
+          << " = alpaka::createTaskKernel(workDiv_" << fNY
+          << ", logic_" << cleanOp << "Kernel"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNA << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << cleanOp << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() const override { return !fIsOutputConstant; } // NOTE(review): reports a two-input operator as elementwise while GetElementwiseExpr only receives one operand — confirm no fusion pass relies on it
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return Trait::Expr(v, v); // not really meaningful for binary, but satisfy interface
+   }
+};
+// Element-wise bitwise complement: the generated code writes ~x; the output has the input's type and shape.
+template
+class ROperator_BitwiseNot final : public ROperator {
+private:
+   std::string fNX;
+   std::string fNY;
+   std::vector fShape;
+
+public:
+   ROperator_BitwiseNot() {}
+
+   ROperator_BitwiseNot(std::string nameX, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector TypeInference(std::vector input) override {
+      return input;
+   }
+
+   std::vector> ShapeInference(std::vector> input) override {
+      return input;
+   }
+   // Requires fNX to exist in the model (throws std::runtime_error otherwise); registers fNY with fNX's type and shape.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("SOFIE BitwiseNot: input tensor '" + fNX +
+                                  "' not found in model");
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+      if (model.Verbose())
+         std::cout << "BitwiseNot: " << fNX << " -> " << fNY
+                   << " " << ConvertDimShapeToString(fShape) << std::endl;
+   }
+   // CPU code generation: one loop writing ~tensor_<fNX>[id] into tensor_<fNY>[id].
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ BITWISE_NOT\n";
+      out << 
SP << "for (std::size_t id = 0; id < " << length << "u; ++id) {\n"; // NOTE(review): 'u' is appended to the length text — confirm this stays valid for symbolic shapes
+      out << SP << SP << "tensor_" << fNY << "[id] = ~tensor_" << fNX << "[id];\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+   // Emits a kernel struct named BitwiseNotKernel_op_<OpName>; in it, each thread with idx < N writes ~input[idx].
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      std::stringstream op;
+      op << "\n//------ BITWISE_NOT_KERNEL_ALPAKA\n";
+      op << "struct BitwiseNotKernel_" << OpName << " {\n";
+      op << SP << "template\n";
+      op << SP << "ALPAKA_FN_ACC void operator()("
+         << "TAcc const& acc, "
+         << "T const* __restrict__ input, "
+         << "T* __restrict__ output, "
+         << "std::size_t const N) const {\n";
+      op << SP << SP << "auto const idx = "
+         << "alpaka::getIdx(acc)[0];\n";
+      op << SP << SP << "if (idx >= N) return;\n";
+      op << SP << SP << "output[idx] = ~input[idx];\n";
+      op << SP << "}\n";
+      op << "};\n";
+      return op.str();
+   }
+   // Emits the declaration of the kernel object bitwiseNotKernel_op_<OpName> used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      std::string clean = "op_" + OpName;
+      return SP + "BitwiseNotKernel_" + clean + " bitwiseNotKernel_" + clean + ";\n";
+   }
+   // GPU launch code: derives the work division from the element count, creates the kernel task and enqueues it on 'queue'.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      std::string cleanOp = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ BITWISE_NOT_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY
+          << " = Vec::all(static_cast(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNY
+          << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY
+          << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << cleanOp
+          << " = alpaka::createTaskKernel(workDiv_" << fNY
+          << ", bitwiseNotKernel_" << cleanOp
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << cleanOp << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() 
const override { return true; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "~" + v;
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_LOGIC
diff --git a/core/inc/SOFIE/ROperator_Not.hxx b/core/inc/SOFIE/ROperator_Not.hxx
new file mode 100644
index 0000000..4e42eca
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Not.hxx
@@ -0,0 +1,112 @@
+#ifndef TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT
+#define TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT
+
+#include
+#include
+#include
+// NOTE(review): the header names after #include are missing from this copy; no quoted SOFIE header is visible either — confirm what this file includes.
+
+namespace SOFIE {
+
+// Logical NOT operator: the generated code writes !x for every element of the input tensor.
+class ROperator_Not final : public ROperator {
+private:
+   std::string fNX;
+   std::string fNY;
+
+   std::vector fShapeX;
+   std::vector fShapeY;
+
+public:
+   ROperator_Not() {}
+
+   ROperator_Not(std::string nameX, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      fKind = OperatorKind::NOT;
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   // Requires fNX to exist in the model (throws std::runtime_error otherwise); registers fNY with fNX's type and shape.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found.");
+      }
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeY = fShapeX;
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+   }
+   // CPU code generation: one loop writing !tensor_<fNX>[i] into tensor_<fNY>[i].
+   std::string Generate(std::string opName) override
+   {
+      opName = "op_" + opName;
+      std::stringstream out;
+
+      out << SP << "\n//---- Operator Not " << opName << "\n";
+      auto length = ConvertDimShapeToLength(fShapeX);
+      out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n";
+      out << SP << SP << "tensor_" << fNY << "[i] = !tensor_" + fNX + "[i];\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+   // Emits the NotKernel functor; the struct name does not depend on opName.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override
+   {
+      if (fIsOutputConstant) // NOTE(review): Generate(), the kernel definition and the launch code below do not apply this check, and Initialize() never sets the flag — confirm whether it is needed
+         return "";
+
+      std::string op;
+      op = "\n//------ NOT_KERNEL_ALPAKA\n";
+      op += SP + "struct NotKernel {\n";
+      op += SP + SP + "template\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const & acc,\n";
+      op += SP + SP + SP + "T const * data,\n";
+      op += SP + SP + SP + "T * output,\n";
+      op += SP + SP + SP + "std::size_t const length) const\n";
+      op += SP + SP + "{\n";
+      op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n";
+      op += SP + SP + SP + "if (idx < length) {\n"; // threads whose index is past the end do nothing
+      op += SP + SP + SP + SP + "output[idx] = !data[idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+      return op;
+   }
+   // Emits the declaration of the notKernel object referenced by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override
+   {
+      return SP + "NotKernel notKernel;\n";
+   }
+   // GPU launch code: derives the work division from the element count, creates the kernel task and enqueues it on 'queue'.
+   std::string Generate_GPU_ALPAKA(std::string opName) override
+   {
+      opName = "op_" + opName;
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShapeX);
+
+      out << "\n//------ " << opName << "_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << opName
+          << " = alpaka::createTaskKernel(workDiv_" << fNY
+          << ", " << "notKernel"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", " << length << ");\n"; // NOTE(review): length is emitted without the static_cast wrapper that the Logic operators in this patch put around it — confirm
+      out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() const override { return !fIsOutputConstant; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "!" 
+ v; + } + +}; + +} // namespace SOFIE + +#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx b/core/inc/SOFIE/ROperator_Pad.hxx similarity index 89% rename from src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx rename to core/inc/SOFIE/ROperator_Pad.hxx index dae3a5b..04365d8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx +++ b/core/inc/SOFIE/ROperator_Pad.hxx @@ -61,13 +61,13 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Pad Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Pad Op Input Tensor is not found in model"); } fInputShape = model.GetTensorShape(fNX); if (fMode != EMode::kConstant) { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only Constant mode"); + throw std::runtime_error("SOFIE Pad Op supports now only Constant mode"); } // get pads data @@ -75,7 +75,7 @@ public: if (model.IsInitializedTensor(fNP)) { padsData = static_cast(model.GetInitializedTensorData(fNP).get()); } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Pads data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Pads data"); } // get constant value fConstantValue = 0; @@ -84,7 +84,7 @@ public: T * cData = static_cast(model.GetInitializedTensorData(fNCV).get()); fConstantValue = cData[0]; } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Constant Value data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Constant Value data"); } } std::vector axes; @@ -103,10 +103,10 @@ public: for (size_t i = 0; i < nax; i++) axes[i] = data[i]; } else { - throw std::runtime_error("TMVA SOFIE Pad Op invalid input Axes type"); + throw std::runtime_error("SOFIE Pad Op invalid input Axes type"); } } else { - throw std::runtime_error("TMVA SOFIE Pad Op supports now only 
initialized Axes data"); + throw std::runtime_error("SOFIE Pad Op supports now only initialized Axes data"); } } @@ -127,7 +127,7 @@ public: fPads[i].second = padsData[axesSize + i]; int64_t outDim = static_cast(fOutputShape[i]) + fPads[i].first + fPads[i].second; if (outDim < 0) - throw std::runtime_error("TMVA SOFIE Pad Op : invalid Pads values"); + throw std::runtime_error("SOFIE Pad Op : invalid Pads values"); fOutputShape[i] = outDim; } } @@ -149,7 +149,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fOutputShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Pad called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Pad called to Generate without being initialized first"); } std::stringstream out; auto inputStride = UTILITY::ComputeStrideFromShape(fInputShape); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx rename to core/inc/SOFIE/ROperator_Pool.hxx index e6fbc25..8e11271 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx +++ b/core/inc/SOFIE/ROperator_Pool.hxx @@ -76,7 +76,7 @@ public: fType = "float"; } else { throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Pool operator"); + std::runtime_error("SOFIE Encountered unsupported type parsing a Pool operator"); } fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; @@ -94,19 +94,19 @@ public: // Where N is batch size, C : input channels, H : input height, W = input width // or it can be [N, C, F1,F2,....FN] . 
Minimum dimension is 3 if (input.size() != 1 ) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference need 1 input tensor"); + throw std::runtime_error("SOFIE" + Name() + "Op Shape inference need 1 input tensor"); } if (input[0].size() < 3) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions"); + throw std::runtime_error("SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions"); } // support only input tensors with dim = 3,4,5 if (input[0].size() < 3 || input[0].size() > 5) { - throw std::runtime_error("TMVA SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported"); + throw std::runtime_error("SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported"); } if (input[0].size() -2 != fDim) { throw - std::runtime_error("TMVA SOFIE Pool Op Shape inference - invalid inputs "); + std::runtime_error("SOFIE Pool Op Shape inference - invalid inputs "); } // kernel shape size_t k1 = ((fAttrKernelShape.empty())? 
input[0][2] : fAttrKernelShape[0]); @@ -156,7 +156,7 @@ public: } } else if (fAttrAutopad != "VALID") { throw - std::runtime_error("TMVA SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad); + std::runtime_error("SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad); } // to be sure pad is vector of size 6 if (fDim < 3) fAttrPads.resize(6, 0); @@ -204,13 +204,13 @@ public: if (!model.CheckIfTensorAlreadyExist(fNX)) { throw - std::runtime_error("TMVA SOFIE Pool op Input Tensor " + fNX + " is not found in model"); + std::runtime_error("SOFIE Pool op Input Tensor " + fNX + " is not found in model"); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() < 3 || fShapeX.size() > 5) { std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; throw - std::runtime_error("TMVA SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); + std::runtime_error("SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); } fDim = fShapeX.size() - 2; // case of GlobalAveragePool. 
It is a pool case with kernel shape == image shape @@ -267,7 +267,7 @@ public: OpName = "op_" + OpName; if (fShapeX.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Pool Op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Pool Op called to Generate without being initialized first"); } std::stringstream out; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx b/core/inc/SOFIE/ROperator_RNN.hxx similarity index 98% rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx rename to core/inc/SOFIE/ROperator_RNN.hxx index aed7bc1..3a0f58f 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx +++ b/core/inc/SOFIE/ROperator_RNN.hxx @@ -91,7 +91,7 @@ template class ROperator_RNN final : public ROperator { fType = "float"; } else { throw std::runtime_error( - "TMVA SOFIE Encountered unsupported type parsing a RNN operator"); + "SOFIE Encountered unsupported type parsing a RNN operator"); } fInputTensorNames = { fNX, fNW, fNR }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/core/inc/SOFIE/ROperator_RNN.icc similarity index 96% rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc rename to core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..467fda8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -39,40 +38,40 @@ auto ROperator_RNN::Initialize(RModel& model) fUseSession = model.UseSession(); // Check the input and output tensors if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNX + " is not found in model."); } fShapeX = model.GetTensorShape(fNX); if (fShapeX.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNX + " is not of 3 
dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNW + " is not found in model."); } fShapeW = model.GetTensorShape(fNW); if (fShapeW.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNW + " is not of 3 dimensions."); } if (!model.CheckIfTensorAlreadyExist(fNR)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNR + " is not found in model."); } fShapeR = model.GetTensorShape(fNR); if (fShapeR.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNR + " is not of 3 dimensions."); } if (!fNB.empty()) { if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB + + throw std::runtime_error("SOFIE RNN op input tensor " + fNB + " is not found in model."); } fShapeB = model.GetTensorShape(fNB); if (fShapeB.size() != 2 && fShapeB.size() != 4) { - throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB + + throw std::runtime_error("SOFIE RNN op input tensor " + fNB + " is not of 2 or 4 dimensions."); } if (fShapeB.size() == 2) { @@ -112,23 +111,23 @@ auto ROperator_RNN::Initialize(RModel& model) } if (!fNSequence_lens.empty()) { if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNSequence_lens + "is not found in model."); } fShapeSequence_lens = model.GetTensorShape(fNSequence_lens); if (fShapeSequence_lens.size() != 1) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNSequence_lens + " is not of 1 dimension."); } } 
if (!fNInitial_h.empty()) { if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNInitial_h + " is not found in model."); } fShapeInitial_h = model.GetTensorShape(fNInitial_h); if (fShapeInitial_h.size() != 3) { - throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + + throw std::runtime_error("SOFIE RNN Op input tensor " + fNInitial_h + " is not of 3 dimensions."); } } @@ -153,24 +152,24 @@ auto ROperator_RNN::Initialize(RModel& model) activation != "ScaledTanh" && activation != "HardSigmoid" && activation != "Elu" && activation != "Softsign" && activation != "Softplus") { - throw std::runtime_error("TMVA SOFIE - Activation function " + + throw std::runtime_error("SOFIE - Activation function " + activation + " not implemented"); } } if (fAttrDirection != "forward" && fAttrDirection != "backward" && fAttrDirection != "bidirectional") { throw std::runtime_error( - "TMVA SOFIE - Invalid RNN direction fAttrDirection = " + + "SOFIE - Invalid RNN direction fAttrDirection = " + fAttrDirection); } if (fAttrHiddenSize != fShapeW[1]) { throw std::runtime_error( - "TMVA SOFIE - fAttrHiddenSize must be equal to " + + "SOFIE - fAttrHiddenSize must be equal to " + std::to_string(fShapeW[1])); } if (fAttrLayout > 1) { throw std::runtime_error( - "TMVA SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + + "SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) + " must be 0 (timewise) or 1 (batchwise)"); } if (fAttrActivations.empty()) { @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx 
b/core/inc/SOFIE/ROperator_Random.hxx similarity index 95% rename from src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx rename to core/inc/SOFIE/ROperator_Random.hxx index cde08b5..0de1cd9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx +++ b/core/inc/SOFIE/ROperator_Random.hxx @@ -121,13 +121,13 @@ public: if (fUseROOT) { if (fMode == kNormal) { if (fParams.count("mean") == 0 || fParams.count("scale") == 0) - throw std::runtime_error("TMVA SOFIE RandomNormal op : no mean or scale are defined"); + throw std::runtime_error("SOFIE RandomNormal op : no mean or scale are defined"); float mean = fParams["mean"]; float scale = fParams["scale"]; out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Gaus(" << mean << "," << scale << ");\n"; } else if (fMode == kUniform) { if (fParams.count("high") == 0 || fParams.count("low") == 0) - throw std::runtime_error("TMVA SOFIE RandomUniform op : no low or high are defined"); + throw std::runtime_error("SOFIE RandomUniform op : no low or high are defined"); float high = fParams["high"]; float low = fParams["low"]; out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Uniform(" << low << "," << high << ");\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/core/inc/SOFIE/ROperator_Range.hxx similarity index 84% rename from src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx rename to core/inc/SOFIE/ROperator_Range.hxx index 8af272d..8ea17d9 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -51,15 +50,15 @@ public: //input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNStart)) { throw - std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNStart + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNStart + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNLimit)) { throw - 
std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNLimit + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNLimit + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNDelta)) { throw - std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); + std::runtime_error("SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); } ETensorType type = ConvertStringToType(fType); if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) { @@ -67,7 +66,7 @@ public: T * limit = static_cast(model.GetInitializedTensorData(fNLimit).get()); T * delta = static_cast(model.GetInitializedTensorData(fNDelta).get()); if (!start || !delta || !limit) - std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); + std::runtime_error("SOFIE Range Op Input Tensor has invalid input data"); T a = *start; T b = *limit; T d = *delta; @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -103,7 +102,7 @@ public: OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Range operator called to Generate without being initialized first"); } std::string sizeName = fShape[0].param; @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git 
a/core/inc/SOFIE/ROperator_Reduce.hxx b/core/inc/SOFIE/ROperator_Reduce.hxx new file mode 100644 index 0000000..f3e7170 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Reduce.hxx @@ -0,0 +1,475 @@ +#ifndef SOFIE_ROPERATOR_Reduce +#define SOFIE_ROPERATOR_Reduce + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + + +namespace SOFIE{ + +enum EReduceOpMode { ReduceMean, ReduceSum, ReduceSumSquare, ReduceProd, ReduceL2, ReduceMax, InvalidReduceOp }; + +template +class ROperator_Reduce final : public ROperator +{ +private: + /* Attributes*/ + int fkeepdims = 1; //default value + std::vector fAttrAxes; + EReduceOpMode fReduceOpMode; + std::string fNX; + std::string fNAxes; + std::string fNY; + std::vector fShapeX; + std::vector fShapeY; + std::vector fShapeYNotPruned; // needed for fKeepdims=0 + + +public: + + std::string Name() { + if (fReduceOpMode == ReduceMean) return "ReduceMean"; + else if (fReduceOpMode == ReduceSumSquare) return "ReduceSumSquare"; + else if (fReduceOpMode == ReduceProd) return "ReduceProd"; + else if (fReduceOpMode == ReduceSum) return "ReduceSum"; + else if (fReduceOpMode == ReduceL2) return "ReduceL2"; + else if (fReduceOpMode == ReduceMax) return "ReduceMax"; + return "Invalid"; + } + + std::vector GetStdLibs() override { + if (fReduceOpMode == ReduceL2) + return { std::string("cmath") }; + if (fReduceOpMode == ReduceMax) + return { std::string("limits") }; + return {}; + } + + ROperator_Reduce(){} + ROperator_Reduce(int keepdims, std::vector attrAxes, std::string nameX, std::string nameAxes, std::string nameY): + fkeepdims(keepdims), fAttrAxes(attrAxes), fNX(UTILITY::Clean_name(nameX)), fNAxes(UTILITY::Clean_name(nameAxes)), fNY(UTILITY::Clean_name(nameY)) { + fReduceOpMode = Op; + + fInputTensorNames = { fNX }; + if(!fNAxes.empty()){ + fInputTensorNames.emplace_back(fNAxes); + } + + fOutputTensorNames = { fNY }; + } + + // type of output 
given input + std::vector TypeInference(std::vector input) override { + return input; + } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + auto & outputShape = ret[0]; + for (size_t j = 0; j < fAttrAxes.size(); j++) { + if (fAttrAxes[j] < 0) fAttrAxes[j] += outputShape.size(); + if (fAttrAxes[j] < 0 || (size_t) fAttrAxes[j] >= outputShape.size() ) + throw std::runtime_error("SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j])); + // set to 1 the reduced dims + outputShape[fAttrAxes[j]] = 1; + } + fShapeYNotPruned = outputShape; + // in case of pruning dimension we need to sort axes attributes + if (fkeepdims == 0) { + auto ax = fAttrAxes; + std::sort(ax.begin(), ax.end()); + for (size_t j = 0; j < ax.size(); j++) { + // erase reduced dimensions, but keep last one + if (outputShape.size() > 1) { + outputShape.erase(outputShape.begin() + ax[j]); + for (size_t k = j+1; k < ax.size(); k++) + ax[k] -= 1; // decrease by one since we have removed a value + } + } + } + return ret; + } + void Initialize(RModel& model) override { + + fUseSession = model.UseSession(); + + if (!model.CheckIfTensorAlreadyExist(fNX)) { + // input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Reduce Op Input Tensor " + fNX + " is not found in model"); + } + fShapeX = model.GetTensorShape(fNX); + // check if tensor with axes is provided + if (!fNAxes.empty()) { + auto ax_shptr = model.GetInitializedTensorData(fNAxes); + auto ax_ptr = static_cast(ax_shptr.get()); + auto ax_shape = model.GetTensorShape(fNAxes); + size_t ax_length = ConvertShapeToLength(ax_shape); + fAttrAxes = std::vector(ax_ptr, ax_ptr+ax_length); + } else if (fAttrAxes.empty()) { + // in case no axes is passed assume full reduction + fAttrAxes.resize(fShapeX.size()); + for (size_t i = 0; i < fAttrAxes.size(); i++) + fAttrAxes[i] = i; + } + // find 
shape of Y and add it in the list of intermediate tensors + fShapeY = ShapeInference({fShapeX})[0]; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + if (model.Verbose()){ + std::cout << Name() << " : " << fNX << " -> " << fNY << " shape " << ConvertShapeToString(fShapeY) << std::endl; + } + model.AddNeededStdLib("algorithm"); + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShapeX.empty() || fShapeY.empty()) { + throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); + } + + size_t inputLength = SOFIE::ConvertShapeToLength(fShapeX); + size_t outputLength = SOFIE::ConvertShapeToLength(fShapeY); + + auto inputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeX); + // output stride (or not pruned vector) + auto outputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeYNotPruned); + + // write here according to size of shape + // in generation code can be done automatically + // i0 = i / stride0 % shape0; i1 = i / stride1 % shape1 and so on + // and we have for the inverse + // i = i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3 .... + + // don't need to divide by last stride s[n-1] since it is 1 by definition + + std::stringstream out; + out << "\n//---- operator " << Name() << " " << opName << "\n"; + // check where is reduced axes are first or last one. 
In these case we can do a faster implementation + enum EReduceDim {kFirst, kLast, kMiddle}; + EReduceDim reduceDims = kLast; + int kmin = fShapeX.size()-fAttrAxes.size(); + for (int k = fShapeX.size()-1; k >= kmin; k--) { + // if k is not a reduced axis is not last ones + if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { + reduceDims = kMiddle; + break; + } + } + if (reduceDims == kMiddle) { + reduceDims = kFirst; + // check if at the beginning + for (size_t k = 0; k < fAttrAxes.size(); k++) { + // if k is not a reduced axis is not first ones + if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { + reduceDims = kMiddle; + break; + } + } + } + size_t reducedLength = inputLength / outputLength; + if (reduceDims == kLast) { + //std::cout << "reduction for operator " << opName << " is last" << std::endl; + // new faster implementation using a single loop + // faster to loop first on reduced dimension and then output + // reset output tensors + + // loop on output dimensions + out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n"; + // loop on reduce dimensions + if (fReduceOpMode == ReduceProd) + out << SP << SP << "tensor_" << fNY << "[i] = 1;\n"; + else if (fReduceOpMode == ReduceMax) + out << SP << SP << "tensor_" << fNY << "[i] = std::numeric_limits::lowest();\n"; + else + out << SP << SP << "tensor_" << fNY << "[i] = 0;\n"; + out << SP << SP << "for (size_t j = 0; j < " << reducedLength << "; j++) {\n"; + + if (fReduceOpMode == ReduceProd) + out << SP << SP << SP << "tensor_" << fNY << "[i] *= tensor_" << fNX << "[i * " << reducedLength << " + j];\n"; + else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) + out << SP << SP << SP << "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j];\n"; + else if(fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2) + out << SP << SP << SP << "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength 
<< " + j] * tensor_" + << fNX << "[i * " << reducedLength << " + j];\n"; + else if (fReduceOpMode == ReduceMax) + out << SP << SP << SP << "if (tensor_" << fNX << "[i * " << reducedLength << " + j] > tensor_" << fNY << "[i])\n" + << SP << SP << SP << SP << "tensor_" << fNY << "[i] = tensor_" << fNX << "[i * " << reducedLength << " + j];\n"; + out << SP << SP << "}\n"; // end j loop + if(fReduceOpMode == ReduceMean) + out << SP << SP << "tensor_" << fNY << "[i] /= static_cast(" << reducedLength << ");\n"; + else if (fReduceOpMode == ReduceL2) + out << SP << SP << "tensor_" << fNY << "[i] = std::sqrt(tensor_" << fNY << "[i]);\n"; + + out << SP << "}\n"; // end i loop + } else if (reduceDims == kFirst) { + //std::cout << "reduction for operator " << opName << " is first" << std::endl; + // case reduction is at beginning + // reset output tensors + if (fReduceOpMode == ReduceProd) + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n"; + else if (fReduceOpMode == ReduceMax) + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength + << ", std::numeric_limits::lowest());\n"; + else + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 0);\n"; + + out << SP << "for (size_t i = 0; i < " << reducedLength << "; i++) {\n"; + out << SP << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n"; + + if (fReduceOpMode == ReduceProd) + out << SP << SP << SP << "tensor_" << fNY << "[j] *= tensor_" << fNX << "[i * " << outputLength << " + j];\n"; + else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) + out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j];\n"; + else if(fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2) + out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j] * tensor_" + << fNX << "[i * " << outputLength << " + j];\n"; + 
else if (fReduceOpMode == ReduceMax) + out << SP << SP << SP << "if (tensor_" << fNX << "[i * " << outputLength << " + j] > tensor_" << fNY << "[j])\n" + << SP << SP << SP << SP << "tensor_" << fNY << "[j] = tensor_" << fNX << "[i * " << outputLength << " + j];\n"; + out << SP << SP << "}\n"; // end j loop + out << SP << "}\n"; // end i loop + if(fReduceOpMode == ReduceMean) { + out << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n"; + out << SP << SP << "tensor_" << fNY << "[j] /= static_cast(" << reducedLength << ");\n"; + out << SP << "}\n"; // end j loop + } else if (fReduceOpMode == ReduceL2) { + out << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n"; + out << SP << SP << "tensor_" << fNY << "[j] = std::sqrt(tensor_" << fNY << "[j]);\n"; + out << SP << "}\n"; // end j loop + } + } + else + { // standard case + //std::cout << "reduction for operator " << opName << " is middle" << std::endl; + // reset output tensors + if (fReduceOpMode == ReduceProd) + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n"; + else if (fReduceOpMode == ReduceMax) + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength + << ", std::numeric_limits::lowest());\n"; + else + out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ",0);\n"; + + out << SP << "for (size_t i = 0; i < " << inputLength << "; i++) {\n"; + + size_t dim = fShapeX.size(); // this is the input dimension (e.g. 
2, 3 or 4 or more) + + // here we find output index + out << SP << SP << "size_t outputIndex = 0;\n"; + for (size_t k = 0; k < dim; k++) { + if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { + // do for not reducing axes + out << SP << SP << "size_t i_" << k << " = i / " << inputStrides[k] << " % " << fShapeX[k] << ";\n"; + out << SP << SP << "outputIndex += i_" << k << " * " << outputStrides[k] << ";\n"; + } + } + // now compute reduction + out << SP << SP << "// compute reduction....\n"; + if (fReduceOpMode == ReduceProd) + out << SP << SP << "tensor_" << fNY << "[outputIndex] *= tensor_" << fNX << "[i];\n"; + else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) + out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i];\n"; + else if (fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2) { + out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i] * tensor_" << fNX + << "[i];\n"; + } else if (fReduceOpMode == ReduceMax) { + out << SP << SP << "if (tensor_" << fNX << "[i] > tensor_" << fNY << "[outputIndex])\n"; + out << SP << SP << SP << "tensor_" << fNY << "[outputIndex] = tensor_" << fNX << "[i];\n"; + } + out << SP << "}\n"; // end loop on input elements + // post-processing passes + if (fReduceOpMode == ReduceMean) { + out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] /= static_cast(" << reducedLength << ");\n"; + out << SP << "}\n"; + } else if (fReduceOpMode == ReduceL2) { + out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n"; + out << SP << SP << "tensor_" << fNY << "[i] = std::sqrt(tensor_" << fNY << "[i]);\n"; + out << SP << "}\n"; + } + } + + return out.str(); + } + + // --------------------------------------------------------------------------- + // GPU kernel: one block per output element, 256 threads cooperatively reduce + // the slice via shared-memory tree reduction. 
+ // This replaces the previous naive "one thread per output element" approach + // which serialised the entire reduction loop inside a single thread. + // --------------------------------------------------------------------------- + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + if (fShapeX.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); + + const std::size_t Dx = fShapeX.size(); + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeX); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeYNotPruned); + std::size_t inputLength = ConvertShapeToLength(fShapeX); + std::size_t outputLength = ConvertShapeToLength(fShapeY); + std::size_t reducedLength = inputLength / outputLength; + + // Partition axes into keep (non-reduced) and reduce sets. + std::vector redAxes, keepAxes; + for (std::size_t d = 0; d < Dx; ++d) { + if (std::find(fAttrAxes.begin(), fAttrAxes.end(), (int64_t)d) != fAttrAxes.end()) + redAxes.push_back(d); + else + keepAxes.push_back(d); + } + + // Row-major strides for decomposing the flat reduction index r into + // per-axis coordinates. 
+ // redStrides[i] = product of fShapeX[redAxes[j]] for j > i + std::vector redStrides(redAxes.size(), 1); + for (int ri = (int)redAxes.size() - 2; ri >= 0; --ri) + redStrides[ri] = redStrides[ri + 1] * fShapeX[redAxes[ri + 1]]; + + std::string kname = "ReduceKernel_" + Name() + "_" + fNY; + + std::string op; + op = "\n//------ " + Name() + "_KERNEL_ALPAKA (block parallel reduction)\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const reducedLength,\n"; + op += SP + SP + SP + "std::size_t const outputLength) const {\n\n"; + + // ---- shared memory (fixed 256 slots, matches block size) ---- + op += SP + SP + SP + "auto& shmem = alpaka::declareSharedVar(acc);\n\n"; + + // ---- block/thread addressing ---- + // One block per output element; threads cooperate within the block. 
+ op += SP + SP + SP + "auto const out_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "auto const thread_id = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (out_idx >= outputLength) return;\n\n"; + + // ---- decode output (keep-axis) coordinates from out_idx ---- + for (std::size_t d = 0; d < Dx; ++d) { + if (std::find(redAxes.begin(), redAxes.end(), d) == redAxes.end()) { + op += SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (out_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeYNotPruned[d]) + "u;\n"; + } + } + op += "\n"; + + // ---- thread-stride partial accumulation over reduction axis ---- + std::string startVal; + if (Op == ReduceProd) startVal = "static_cast(1)"; + else if (Op == ReduceMax) startVal = "std::numeric_limits::lowest()"; + else startVal = "static_cast(0)"; + op += SP + SP + SP + "T partial = " + startVal + ";\n"; + op += SP + SP + SP + "for (std::size_t r = thread_id; r < reducedLength; r += 256u) {\n"; + + // Decode flat reduction index r into per-axis coordinates. + for (std::size_t ri = 0; ri < redAxes.size(); ++ri) { + std::size_t rd = redAxes[ri]; + op += SP + SP + SP + SP + "std::size_t const r_" + std::to_string(rd) + + " = (r / " + std::to_string(redStrides[ri]) + "u) % " + + std::to_string(fShapeX[rd]) + "u;\n"; + } + + // Compute flat input index. + op += SP + SP + SP + SP + "std::size_t const in_idx =\n"; + for (std::size_t d = 0; d < Dx; ++d) { + bool isReduced = std::find(redAxes.begin(), redAxes.end(), d) != redAxes.end(); + std::string coord = isReduced ? "r_" + std::to_string(d) : "oy_" + std::to_string(d); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < Dx) ? " +\n" : ";\n"; + } + + // Partial accumulation step. 
+ if (Op == ReduceProd) + op += SP + SP + SP + SP + "partial *= input[in_idx];\n"; + else if (Op == ReduceSum || Op == ReduceMean) + op += SP + SP + SP + SP + "partial += input[in_idx];\n"; + else if (Op == ReduceSumSquare || Op == ReduceL2) + op += SP + SP + SP + SP + "partial += input[in_idx] * input[in_idx];\n"; + else if (Op == ReduceMax) + op += SP + SP + SP + SP + "if (input[in_idx] > partial) partial = input[in_idx];\n"; + + op += SP + SP + SP + "}\n\n"; // end thread-stride loop + + // ---- store in shared memory and synchronise ---- + op += SP + SP + SP + "shmem[thread_id] = partial;\n"; + op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n\n"; + + // ---- binary tree reduction within the block ---- + op += SP + SP + SP + "for (std::size_t s = 128u; s > 0u; s >>= 1u) {\n"; + op += SP + SP + SP + SP + "if (thread_id < s) {\n"; + if (Op == ReduceProd) + op += SP + SP + SP + SP + SP + "shmem[thread_id] *= shmem[thread_id + s];\n"; + else if (Op == ReduceMax) + op += SP + SP + SP + SP + SP + "if (shmem[thread_id + s] > shmem[thread_id]) shmem[thread_id] = shmem[thread_id + s];\n"; + else + op += SP + SP + SP + SP + SP + "shmem[thread_id] += shmem[thread_id + s];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + SP + "alpaka::syncBlockThreads(acc);\n"; + op += SP + SP + SP + "}\n\n"; + + // ---- thread 0 writes the final result ---- + op += SP + SP + SP + "if (thread_id == 0u) {\n"; + op += SP + SP + SP + SP + "T result = shmem[0];\n"; + if (Op == ReduceMean) + op += SP + SP + SP + SP + "result /= static_cast(" + std::to_string(reducedLength) + "u);\n"; + else if (Op == ReduceL2) + op += SP + SP + SP + SP + "result = std::sqrt(result);\n"; + op += SP + SP + SP + SP + "output[out_idx] = result;\n"; + op += SP + SP + SP + "}\n"; + + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + std::string kname = 
"ReduceKernel_" + Name() + "_" + fNY; + return SP + kname + " reduceKernel_" + Name() + "_" + fNY + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string /*opName*/) override { + if (fShapeX.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first"); + + std::size_t inputLength = ConvertShapeToLength(fShapeX); + std::size_t outputLength = ConvertShapeToLength(fShapeY); + std::size_t reducedLength = inputLength / outputLength; + std::string kname = "reduceKernel_" + Name() + "_" + fNY; + + std::stringstream out; + out << "\n//------ " << Name() << "_GPU_ALPAKA\n"; + // Grid: one block per output element; Block: 256 threads cooperate to + // reduce the corresponding slice. + out << SP << "alpaka::WorkDivMembers workDiv_" << fNY << "(\n"; + out << SP << SP << "Vec::all(Idx{" << outputLength << "u}),\n"; + out << SP << SP << "Vec::all(Idx{256u}),\n"; + out << SP << SP << "Vec::all(Idx{1u}));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << reducedLength << "u)" + << ", static_cast(" << outputLength << "u));\n"; + + return out.str(); + } + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_Reduce + diff --git a/core/inc/SOFIE/ROperator_Relu.hxx b/core/inc/SOFIE/ROperator_Relu.hxx new file mode 100644 index 0000000..96d5931 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Relu.hxx @@ -0,0 +1,130 @@ +#ifndef SOFIE_ROPERATOR_RELU +#define SOFIE_ROPERATOR_RELU + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Relu final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Relu(){} + ROperator_Relu(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), 
fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); + } + + fShape = model.GetDimTensorShape(fNX); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + if (model.Verbose()) { + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; + } + } + + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += "struct ReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? 
data[idx] : 0;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNY + << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "(" + v + ") >= T(0) ? 
(" + v + ") : T(0)"; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_RELU diff --git a/core/inc/SOFIE/ROperator_Reshape.hxx b/core/inc/SOFIE/ROperator_Reshape.hxx new file mode 100644 index 0000000..4393b32 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Reshape.hxx @@ -0,0 +1,449 @@ +#ifndef SOFIE_ROPERATOR_RESHAPE +#define SOFIE_ROPERATOR_RESHAPE + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + + +namespace SOFIE{ + +enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze }; + + +class ROperator_Reshape final : public ROperator +{ + +private: + + bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; + ReshapeOpMode fOpMode = Reshape; // type of Reshape operator + + int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape + int fAxis = 1; // (for Flatten) + + std::string fNData; // input data tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator + std::string fNOutput; // output tensor name + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data + std::vector fOutputShapeData; // in case output is a shape tensor we store here the shape value data (can be parametric) + std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape for int shapes4 + +public: + + std::string Name() const { + if (fOpMode == Reshape) return "Reshape"; + if (fOpMode == Flatten) return "Flatten"; + if (fOpMode == Squeeze) return "Squeeze"; + if (fOpMode == Unsqueeze) return "Unsqueeze"; + return ""; 
+ } + + ROperator_Reshape(){} + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + if (opMode == Reshape) fAllowZero = attr_value; + if (opMode == Flatten) fAxis = attr_value; + + fInputTensorNames = { fNData }; + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); + } + fOutputTensorNames = { fNOutput }; + } + + // for squeeze/unsqueezed operators following old ONNX version (< 10) + // In this cases axes are passed as attribute values + ROperator_Reshape(ReshapeOpMode opMode, std::vector attrAxes, std::string nameData, std::string nameOutput) + : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)), + fAttrAxes(attrAxes) + { + assert(fOpMode == Squeeze || fOpMode == Unsqueeze); + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + + + // output shape + std::vector DoShapeInference(const std::vector & input_shape, const std::vector & target_shape) { + if (fOpMode == Reshape) { + // correct the provided shape (here we have the value) for 0 or -1 + // the target_shape can be a scalar in case of not present shape input tensor + std::vector output_shape = target_shape; + bool hasMinusOne = false; + bool hasZero = false; + for (size_t i = 0; i < output_shape.size(); i++) { + // case for zero values in given shape: in this case we take the corresponding value from input shape + if (!output_shape[i].isParam) { + if (output_shape[i].dim == 0) { + hasZero = true; + if (fAllowZero) + output_shape[i] = Dim{0}; + else { + if (i > 0 && output_shape.size() != input_shape.size()) + std::cout << "WARNING: TMVA Reshape Op : output shape has zero value at index " << i << + " but input shape has a different rank than output shape" << std::endl; + if (i >= input_shape.size()) + 
throw std::runtime_error("TMVA Reshape Op : output shape has zero value at index " + std::to_string(i) + + " but input shape does not have corresponding index"); + } + output_shape[i] = input_shape[i]; + } else if (output_shape[i].dim == static_cast(-1)) { + hasMinusOne = true; + } + } + } + if (hasZero && hasMinusOne) { + throw std::runtime_error("TMVA Reshape Op : zero value in shape is not allowed when there is also a -1 in shape"); + } + // now case of -1 in shape - we can infer the value of -1 from all other values + for (size_t i = 0; i < output_shape.size(); i++) { + if (output_shape[i] == static_cast(-1) && !output_shape[i].isParam) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); // erase -1 value to compute the length of the other dimensions + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. 
space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + // Divide this integer factor by tmp_length. The whole input_length is known to be + // non-integer in this branch, so it must not be passed to std::stoi. + size_t il = static_cast<size_t>(std::stoi(token)); + size_t tl = static_cast(std::stoi(tmp_length)); + if (tl != 0 && (il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } else { + // not divisible: keep the factor unchanged so it is not dropped from the product + reduced_input.push_back(Dim{il}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } + } + } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; + } + + break; // cannot have more than -1 + } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertDimShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertDimShapeToString(input_shape) + + ConvertDimShapeToString(output_shape)); + } + return output_shape; + + } else if (fOpMode == Flatten) { + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; + 
return newShape; + } else if (fOpMode == Squeeze) { + // squeeze + // assume no axis is provided - remove all axes with value equal to 1 + auto output_shape = input_shape; + if (fAttrAxes.empty()) { + size_t i = 0; + while (i < output_shape.size()) { + if (output_shape[i] == Dim{1}) { + output_shape.erase(output_shape.begin() + i); + } else { + i++; + } + } + } else { + auto axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertDimShapeToString(output_shape)); + } + // for calling vector::erase we must sort axes in decreasing order to avoid + std::sort(axes.begin(), axes.end(), std::greater()); + for (auto & axis : axes) { + output_shape.erase(output_shape.begin() + axis); + } + } + return output_shape; + } + else if (fOpMode == Unsqueeze) { + // unsqueeze + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; + // output rank + int64_t r = input_shape.size() + axes.size(); + for (auto &a : axes) { + int64_t i = static_cast(a); + if (i < -r || i > r - 1) + throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); + if (i >= 0) + output_shape.insert(output_shape.begin() + i, Dim{1}); + else + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); + } + return output_shape; + } + throw std::runtime_error("TMVA Reshape Op : Invalid ReshapeOpMode"); + return {Dim{}}; + } + + void Initialize(RModel& model) override { + + fVerbose = model.Verbose(); + if (fVerbose) + std::cout << "initialize reshape op type " << fOpMode << " - for input " << fNData + << " to shape given by " << fNInput2 << std::endl; + + if (model.CheckIfTensorAlreadyExist(fNData) == false) { + // input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("TMVA Reshape 
Op Input Tensor " + fNData + " is not found in model"); + } + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + std::vector targetShape(fShape.begin(),fShape.end()); + fShapeOutput = DoShapeInference(fShapeInput, targetShape); + // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = DoShapeInference(fShapeInput, shapeData); + if (model.Verbose()) + std::cout << "Reshape op - get output shape from shape tensor " << fNInput2 << " with value " << ConvertDimShapeToString(shapeData) << std::endl; + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + if (model.IsDynamicTensor(fNInput2)) { + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " cannot have dynamic shape"); + } + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } + } else { + throw std::runtime_error("TMVA Reshape Op 2nd 
input Tensor " + fNInput2 + " is not found in model"); + } + } else if (!fAttrAxes.empty()) { + // case fNShape is empty and axes are provided as attributes (e.g. for Unsqueeze) + fShapeOutput = DoShapeInference(fShapeInput, std::vector{}); + } else if (fOpMode == Flatten || fOpMode == Squeeze) { + fShapeOutput = DoShapeInference(fShapeInput, std::vector{}); + } else { + throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data"); + } + // check if output is constant or not + if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { + fIsOutputConstant = true; + auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) + throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); + model.AddConstantTensor(fNOutput, o_shape, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; + } + } + // for input shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + // not sure if we ever end-up here - maybe reshaping from scalar to vector or viceversa + fIsOutputParamShape = true; + fOutputShapeData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, fOutputShapeData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertDimShapeToString(fOutputShapeData) << std::endl; + } + } + else { + // non-constant case + model.AddIntermediateTensor(fNOutput, 
model.GetTensorType(fNData), fShapeOutput); + if (model.Verbose()) + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << std::endl; + } + } + + std::string Generate(std::string opName) override { + + + std::stringstream out; + std::string opType = "Reshape"; + if (fOpMode == Flatten) + opType = "Flatten"; + else if (fOpMode == Squeeze) + opType = "Squeeze"; + else if (fOpMode == Unsqueeze) + opType = "Unsqueeze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertDimShapeToString(fShapeOutput) << "\n"; + + if (fIsOutputConstant) return out.str(); //no op for constant tensors + + if (fIsOutputParamShape) { + // no code to generate here for param shape output. Tensor output is defined in Session constructor + out << "//----------------output is a shape tensor----------\n"; + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << " ] = " << fOutputShapeData[i].GetVal() << ";\n"; + } + return out.str(); + } + + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? 
+ out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + // A zero entry copies the matching input dimension, which only exists while i is below the + // input rank; indexing fShapeInput beyond that would read past the end of the vector. + if (!fAllowZero && i < fShapeInput.size()) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } + + // output of reshape is same as input + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE Reshape " << opName << " output length " + << lengthOut << " is different than input one " << lengthIn << "\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput + << ");\n"; + return out.str(); + } + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + + opName = "op_" + opName; + + if (fIsOutputParamShape) { + // shape tensor output: fill host-side tensor values, no device copy needed + std::stringstream out; + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i].GetVal() << ";\n"; + } + return out.str(); + } + + std::string opType = "Reshape"; + if (fOpMode == Flatten) opType = "Flatten"; + else if (fOpMode == Squeeze) opType = "Squeeze"; + else if (fOpMode == Unsqueeze) opType = "Unsqueeze"; + + std::stringstream out; + out << SP << "///------- " << opType << " operator " << opName << "\n"; + + if (fDynamicShape) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + out << SP << "if (" << lengthOut << " != " << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"SOFIE " << opType + << " Op : output length is different from input 
length\");\n"; + } + } + + // Reshape / View / Squeeze / Unsqueeze are zero-copy reinterpretations of memory. + // Instead of a GPU memcpy + CPU synchronisation barrier, create a local non-owning + // view that aliases the source buffer. All downstream getPtrNative() calls on the + // local view return the same device pointer as the source — no data movement at all. + auto outputLength = ConvertDimShapeToLength(fShapeOutput); + out << SP << "auto deviceBuf_" << fNOutput + << " = alpaka::createView(devAcc, alpaka::getPtrNative(deviceBuf_" << fNData + << "), static_cast(" << outputLength << "));\n"; + + return out.str(); +} + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_RESHAPE diff --git a/core/inc/SOFIE/ROperator_ScatterElements.hxx b/core/inc/SOFIE/ROperator_ScatterElements.hxx new file mode 100644 index 0000000..3cedaa7 --- /dev/null +++ b/core/inc/SOFIE/ROperator_ScatterElements.hxx @@ -0,0 +1,469 @@ +#ifndef SOFIE_ROperator_ScatterElements +#define SOFIE_ROperator_ScatterElements + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + + +class ROperator_ScatterElements final : public ROperator{ +private: + + int64_t fAxis; + + std::string fNX; + std::string fNI; + std::string fNU; + std::string fNY; + std::string fReduction; + + // True only when fNI is a constant/initialized tensor: pre-sort at model + // load time is legal and the atomic-free segmented-add path is used. + // For dynamic index tensors (computed at inference time) we fall back to + // the original atomicAdd kernel — still faster than before because the + // stray alpaka::wait() before the scatter is removed. + bool fUseSegmentedReduction = false; + + std::vector fShapeX; + std::vector fShapeI; + std::vector fShapeY; + + // define reduction function. 
Possibilities are: + // none (default), add, mul, max, min + std::string ReductionFunction(const std::string & t1, const std::string & t2 ) { + std::string name = fReduction; + if (name.empty() || name == "none") + return t2; + else if (name == "add") + return t1 + " + " + t2; + else if (name == "mul") + return t1 + " * " + t2; + else if (name == "max") + return "std::max(" + t1 + "," + t2 + ")"; + else if (name == "min") + return "std::min(" + t1 + "," + t2 + ")"; + else + throw std::runtime_error("SOFIE ScatterElements : invalid reduction attribute"); + + return std::string(); + } + +public: + ROperator_ScatterElements(){} + ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY, + int axis, std::string reduction): + fAxis(axis), + fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)), + fNY(UTILITY::Clean_name(nameY)), + fReduction(reduction) + { + fInputTensorNames = { fNX, fNI, fNU }; + fOutputTensorNames = { fNY }; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { + return input; + } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override { + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized intermediate tensor + // (each message keeps a space after the tensor name so the name stays readable) + if (!model.CheckIfTensorAlreadyExist(fNX)){ + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNX + " is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNI)) { + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNI + " is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNU)) { + throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + fNU + " is not found in model"); 
+ } + //tbd check for constant tensors + + fShapeX = model.GetDimTensorShape(fNX); + fShapeI = model.GetDimTensorShape(fNI); + auto fShapeU = model.GetDimTensorShape(fNU); + if (fShapeU.size() != fShapeI.size()) + throw std::runtime_error(std::string("SOFIE ScatterElements - update tensor has invalid rank")) ; + if (fShapeX.size() == 0) + throw std::runtime_error(std::string("SOFIE ScatterElements - input tensor has zero rank ")) ; + if (fShapeX.size() != fShapeI.size()) + throw std::runtime_error(std::string("SOFIE ScatterElements - index tensor has invalid rank ")) ; + + if (fAxis < 0) fAxis += (int64_t)fShapeX.size(); + // fAxis is later used to index fShapeY and the stride vectors, so reject values that are + // still outside [0, rank) after normalising a negative axis. + if (fAxis < 0 || fAxis >= (int64_t)fShapeX.size()) + throw std::runtime_error(std::string("SOFIE ScatterElements - axis attribute is out of range")) ; + + // assume output shape is identical to input shape + fShapeY = fShapeX; + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + + // For "add" reduction, only use the atomic-free segmented path when the + // index tensor is a static (constant/initialized) tensor — i.e. when the + // graph topology is fixed across inference calls. For dynamic index + // tensors the original atomicAdd kernel is used, but the stray + // alpaka::wait() before it has already been removed (significant win). + if (fReduction == "add" && model.IsInitializedTensor(fNI)) { + fUseSegmentedReduction = true; + // Convert Dim-based shape to size_t shape for registration. + std::vector shapeI_static; + for (const auto& d : fShapeI) + shapeI_static.push_back(d.dim ? d.dim : 1); + // NOTE(review): the segmented kernel declares I_sorted as int64_t const* while this buffer is + // registered as INT32 - confirm which element type is intended and align the two. + model.AddIntermediateTensor(fNI + "_sortedI", ETensorType::INT32, shapeI_static); + model.AddIntermediateTensor(fNI + "_sortPerm", ETensorType::INT32, shapeI_static); + // std::iota and std::stable_sort used in the generated init code. + model.AddNeededStdLib("numeric"); + model.AddNeededStdLib("algorithm"); + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + return out.str(); + } + + // ----------------------------------------------------------------------- + // GenerateInitCode_GPU_ALPAKA — emitted once inside the Session constructor. 
+ // + // For "add" scatter, we build two static (model-lifetime) device buffers: + // + // deviceBuf__sortedI : int64_t[|I|] — the index values, sorted along + // the scatter axis, then feature within each row. + // deviceBuf__sortPerm : int32_t[|I|] — argsort of I (maps sorted + // position back to the original update position). + // + // Both are computed on the host at load time and uploaded once. During + // inference the segmented-add kernel reads from these buffers, which are + // read-only and never modified. + // ----------------------------------------------------------------------- + std::string GenerateInitCode_GPU_ALPAKA() override { + if (!fUseSegmentedReduction) return ""; // only static-index models use segmented path + + std::string totalElements = ConvertDimShapeToLength(fShapeI); + // Feature dimension = last dim of I (the non-axis stride). + std::string numFeatures = fShapeI.back().GetVal(); + + std::stringstream out; + out << "\n// --- ScatterElements sorted-index init for segmented-add ---\n"; + out << "{\n"; + out << SP << "// Build host-side argsort of the index tensor " << fNI << "\n"; + out << SP << "// along scatter axis " << fAxis << " so inference can use\n"; + out << SP << "// the atomic-free segmented-add kernel.\n"; + out << SP << "const std::size_t _nElem_" << fNI << " = " << totalElements << ";\n"; + out << SP << "const std::size_t _nFeat_" << fNI << " = " << numFeatures << ";\n"; + out << SP << "const std::size_t _nRows_" << fNI << " = _nElem_" << fNI << " / _nFeat_" << fNI << ";\n"; + + // Retrieve the host pointer for the index tensor. + out << SP << "auto* _hI_" << fNI << " = tensor_" << fNI << ";\n"; + + // Build a sorted permutation (argsort of row-axis indices). 
+ out << SP << "std::vector _hostSortedI_" << fNI << "(_nElem_" << fNI << ");\n"; + out << SP << "std::vector _hostSortPerm_" << fNI << "(_nElem_" << fNI << ");\n"; + out << SP << "// argsort rows by axis index value\n"; + out << SP << "std::vector _rowOrder_" << fNI << "(_nRows_" << fNI << ");\n"; + out << SP << "std::iota(_rowOrder_" << fNI << ".begin(), _rowOrder_" << fNI << ".end(), 0);\n"; + out << SP << "std::stable_sort(_rowOrder_" << fNI << ".begin(), _rowOrder_" << fNI << ".end(),\n"; + out << SP << SP << "[&](std::size_t a, std::size_t b){\n"; + out << SP << SP << SP << "return _hI_" << fNI << "[a * _nFeat_" << fNI << "] < _hI_" << fNI << "[b * _nFeat_" << fNI << "];\n"; + out << SP << SP << "});\n"; + out << SP << "for (std::size_t _r = 0; _r < _nRows_" << fNI << "; ++_r) {\n"; + out << SP << SP << "std::size_t _src = _rowOrder_" << fNI << "[_r];\n"; + out << SP << SP << "for (std::size_t _f = 0; _f < _nFeat_" << fNI << "; ++_f) {\n"; + out << SP << SP << SP << "_hostSortedI_" << fNI << "[_r * _nFeat_" << fNI << " + _f] = " + << "static_cast(_hI_" << fNI << "[_src * _nFeat_" << fNI << " + _f]);\n"; + out << SP << SP << SP << "_hostSortPerm_" << fNI << "[_r * _nFeat_" << fNI << " + _f] = " + << "static_cast(_src * _nFeat_" << fNI << " + _f);\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + + // Allocate device buffers and upload. 
+ out << SP << "auto _hBufSortedI_" << fNI + << " = alpaka::allocBuf(host, Ext1D::all(Idx{_nElem_" << fNI << "}));\n"; + out << SP << "auto _hBufSortPerm_" << fNI + << " = alpaka::allocBuf(host, Ext1D::all(Idx{_nElem_" << fNI << "}));\n"; + out << SP << "std::copy(_hostSortedI_" << fNI << ".begin(), _hostSortedI_" << fNI << ".end(), " + << "alpaka::getPtrNative(_hBufSortedI_" << fNI << "));\n"; + out << SP << "std::copy(_hostSortPerm_" << fNI << ".begin(), _hostSortPerm_" << fNI << ".end(), " + << "alpaka::getPtrNative(_hBufSortPerm_" << fNI << "));\n"; + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNI << "_sortedI, _hBufSortedI_" << fNI << ");\n"; + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNI << "_sortPerm, _hBufSortPerm_" << fNI << ");\n"; + out << "}\n"; + return out.str(); + } + + std::string Generate(std::string opName) override { + + if (fIsOutputConstant) return ""; + + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first"); + } + std::stringstream out; + out << SP << "\n//-------- ScatterElements --- " << opName << "\n"; + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + std::string length = ConvertDimShapeToLength(fShapeY); + + // function to write compute expression for global index from Dim-based strides + auto tensorIndex = [](const std::vector & stride, const std::vector & idx) { + std::stringstream strst; + int dims = idx.size(); + assert (dims == (int) stride.size()); + for (int i = 0; i < dims; i++) { + std::string sv = stride[i].GetVal(); + if (sv != "1") + strst << sv << "*" << idx[i]; + else + strst << idx[i]; + if (i < dims-1) + strst << " + "; + } + return strst.str(); + }; + + + // copy first input in output (maybe can be avoided??) 
+ out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; + + // loop on tensor rank + int dims = fShapeY.size(); + std::vector idx(dims); + for (int i = 0; i < dims; i++) { + idx[i] = std::string("i") + std::to_string(i); + for (int j = 0; j <= i; j++) out << SP; + out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i].GetVal() << "; " << idx[i] << "++) {\n"; + } + // correct index for specific axis + for (int j = 0; j <= dims; j++) out << SP; + out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis].GetVal() << ";\n"; + idx[fAxis] = "iAxis"; + for (int j = 0; j <= dims; j++) out << SP; + out << "int outIndex = " << tensorIndex(strideY, idx) << ";\n"; + for (int j = 0; j <= dims; j++) out << SP; + out << "tensor_" << fNY << "[outIndex] = " + << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n"; + + for (int i = dims; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); + } + + // ----------------------------------------------------------------------- + // Generate_GPU_Kernel_ALPAKA + // + // For the "add" reduction (the GNN scatter-add case) we emit a + // *segmented* kernel instead of the naive atomicAdd kernel. + // + // Motivation: the index tensor I (edge_index) is STATIC — it never + // changes between inference calls. We pre-sort it once at model init + // by the scatter axis value so that all updates targeting the same + // output row are contiguous. Each GPU thread then owns one contiguous + // segment of updates and accumulates them with a simple serial loop, + // writing the result with a single non-atomic store. 
This eliminates + // all atomic serialisation and improves cache locality on the update + // tensor U. + // + // The sorted permutation is stored in the device buffer + // deviceBuf__sortPerm (int32, length = |I|) + // and is built by GenerateInitCode_GPU_ALPAKA below. + // + // Non-"add" reductions retain the original atomicXxx kernel. + // ----------------------------------------------------------------------- + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + const std::size_t D = fShapeI.size(); + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + std::string totalElementsStr = ConvertDimShapeToLength(fShapeI); + + // ---- segmented-add path (only when index tensor is static/constant) ---- + if (fUseSegmentedReduction) { + // Number of output rows along the scatter axis. + std::string numOutputRows = fShapeY[fAxis].GetVal(); + // Feature stride along the non-axis dimension (for 2-D tensors this + // is just strideI[1], i.e. the number of features per row). 
+ std::string featStride = strideI[D - 1].GetVal(); // stride of last dim + + std::string op; + op = "\n//------ SCATTERELEMENTS_SEGMENTED_ADD_KERNEL_ALPAKA\n"; + op += "// One thread per output-row × feature column.\n"; + op += "// Reads updates in sorted order — no atomics needed.\n"; + op += SP + "struct ScatterElementsKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T* Y,\n"; + op += SP + SP + SP + "int64_t const* I_sorted,\n"; // axis index, sorted + op += SP + SP + SP + "T const* U,\n"; + op += SP + SP + SP + "int32_t const* sortPerm,\n"; // argsort of I + op += SP + SP + SP + "std::size_t const totalUpdates,\n"; + op += SP + SP + SP + "std::size_t const numFeatures) const {\n\n"; + + op += SP + SP + SP + "// Each thread processes one (output_row, feature) pair.\n"; + op += SP + SP + SP + "auto const tid = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "auto const stride = alpaka::getWorkDiv(acc)[0];\n"; + op += SP + SP + SP + "// Total work = numOutputRows * numFeatures (= size of Y)\n"; + op += SP + SP + SP + "std::size_t const totalWork = " + numOutputRows + " * numFeatures;\n"; + op += SP + SP + SP + "for (std::size_t t = tid; t < totalWork; t += stride) {\n"; + op += SP + SP + SP + SP + "std::size_t const out_row = t / numFeatures;\n"; + op += SP + SP + SP + SP + "std::size_t const feat = t % numFeatures;\n"; + op += SP + SP + SP + SP + "// Binary-search for the first sorted index == out_row.\n"; + op += SP + SP + SP + SP + "std::size_t lo = 0, hi = totalUpdates;\n"; + op += SP + SP + SP + SP + "while (lo < hi) {\n"; + op += SP + SP + SP + SP + SP + "std::size_t mid = (lo + hi) / 2;\n"; + op += SP + SP + SP + SP + SP + "if (static_cast(I_sorted[mid * numFeatures]) < out_row) lo = mid + 1;\n"; + op += SP + SP + SP + SP + SP + "else hi = mid;\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + SP + "T acc_val = 
Y[out_row * numFeatures + feat];\n"; + op += SP + SP + SP + SP + "for (std::size_t k = lo; k < totalUpdates; ++k) {\n"; + op += SP + SP + SP + SP + SP + "if (static_cast(I_sorted[k * numFeatures]) != out_row) break;\n"; + op += SP + SP + SP + SP + SP + "std::size_t const perm_k = static_cast(sortPerm[k * numFeatures + feat]);\n"; + op += SP + SP + SP + SP + SP + "acc_val += U[perm_k];\n"; + op += SP + SP + SP + SP + "}\n"; + op += SP + SP + SP + SP + "Y[out_row * numFeatures + feat] = acc_val;\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + // ---- original atomic kernel (non-add reductions) ---- + std::string op; + op = "\n//------ SCATTERELEMENTS_KERNEL_ALPAKA\n"; + op += SP + "struct ScatterElementsKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T* Y,\n"; + op += SP + SP + SP + "int64_t const* I,\n"; + op += SP + SP + SP + "T const* U,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const idx_" + std::to_string(d) + + " = remaining / " + strideI[d].GetVal() + ";\n"; + op += SP + SP + SP + SP + "remaining -= idx_" + std::to_string(d) + + " * " + strideI[d].GetVal() + ";\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t iAxis = I[elem_idx];\n"; + op += SP + SP + SP + SP + "if (iAxis < 0) iAxis += " + 
fShapeY[fAxis].GetVal() + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t const out_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == (std::size_t)fAxis) + ? "static_cast(iAxis)" + : "idx_" + std::to_string(d); + op += SP + SP + SP + SP + SP + coord + " * " + strideY[d].GetVal(); + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + if (fReduction.empty() || fReduction == "none") { + op += SP + SP + SP + SP + "Y[out_idx] = U[elem_idx];\n"; + } else if (fReduction == "mul") { + op += SP + SP + SP + SP + "alpaka::atomicMul(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "max") { + op += SP + SP + SP + SP + "alpaka::atomicMax(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "min") { + op += SP + SP + SP + SP + "alpaka::atomicMin(acc, &Y[out_idx], U[elem_idx]);\n"; + } + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ScatterElementsKernel_" + opName + " scatterElementsKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + std::string totalElements = ConvertDimShapeToLength(fShapeI); + + std::stringstream out; + out << "\n//------ SCATTERELEMENTS_GPU_ALPAKA\n"; + + // Copy input → output (seeds the accumulation buffer, then scatter adds to it). + // No wait needed here — ALPAKA's in-order queue ensures ordering. 
+ out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + + if (fUseSegmentedReduction) { + // ---- segmented-add path: atomic-free, uses pre-sorted index buffers ---- + // Work is one thread per (output_row × feature); the kernel does a + // serial loop over the sorted segment and accumulates without atomics. + std::string numOutputRows = fShapeY[fAxis].GetVal(); + std::string numFeatures = fShapeI.back().GetVal(); + std::string numRows = std::string("(") + totalElements + " / " + numFeatures + ")"; + std::string totalWork = numOutputRows + " * " + numFeatures; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(static_cast(" << totalWork << "));\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << "_sortedI)" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << "_sortPerm)" + << ", static_cast(" << numRows << ")" + << ", static_cast(" << numFeatures << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + } else { + // ---- original atomic kernel (non-add reductions) ---- + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(static_cast(" << totalElements << "));\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", scatterElementsKernel_" << opName + << ", 
alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n"; + } + return out.str(); +} +}; + +}//SOFIE + + +#endif //SOFIE_ROperator_ScatterElements diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx b/core/inc/SOFIE/ROperator_Selu.hxx similarity index 83% rename from src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx rename to core/inc/SOFIE/ROperator_Selu.hxx index 96f4445..5bec42c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx +++ b/core/inc/SOFIE/ROperator_Selu.hxx @@ -17,7 +17,7 @@ private: std::string fNX; std::string fNY; - std::vector fShape; + std::vector fShape; public: ROperator_Selu(){} @@ -38,9 +38,9 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Selu Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Selu Op Input Tensor is not found in model"); } - fShape = model.GetTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); } @@ -48,13 +48,10 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Selu called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Selu called to Generate without being initialized first"); } std::stringstream out; - int length = 1; - for(auto& i: fShape){ - length *= i; - } + std::string length = ConvertDimShapeToLength(fShape); out << "\t" << "for (int id = 0; id < " << length << " ; id++){\n"; out << "\t\t" << "tensor_" << fNY << "[id] = 1.0507009873554804934193349852946 * (std::max(float(0.0), tensor_" << 
fNX << "[id]) + std::min(0.0, 1.6732632423543772848170429916717 * (std::exp(" << "tensor_" << fNX << "[id]" <<")-1)));\n"; out << "\t}\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/core/inc/SOFIE/ROperator_Shape.hxx similarity index 64% rename from src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx rename to core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..c466271 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/core/inc/SOFIE/ROperator_Shape.hxx @@ -47,10 +47,16 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Shape Op Input Tensor " + fNX + " is not found in model"); + throw std::runtime_error("SOFIE Shape Op Input Tensor " + fNX + " is not found in model"); } - fShape = model.GetTensorShape(fNX); - size_t length = fShape.size(); // this the size of shape not length of tensor + // Use Dim-aware shape query to handle dynamic (symbolic) tensors + auto dimShape = model.GetDimTensorShape(fNX); + size_t length = dimShape.size(); // rank of the input tensor + // Build fShape from dimShape (0 for symbolic/dynamic dims, concrete value otherwise) + fShape.resize(length); + for (size_t i = 0; i < length; i++) + fShape[i] = dimShape[i].isParam ? 
0 : dimShape[i].dim; + fStart = std::max(fStart,(int) -length); fStart = std::min(fStart,(int) length); if (fStart < 0) fStart += length; @@ -74,6 +80,14 @@ public: std::cout << std::endl; } fIsOutputConstant = true; + } else if (model.IsDynamicTensor(fNX) && !fOutput_shape.empty()) { + // For dynamic tensors, register the output as a shape tensor with symbolic dimension values + std::vector dimVals(dimShape.begin() + fStart, dimShape.begin() + fEnd); + model.AddShapeTensor(fNY, dimVals, false); + fIsOutputConstant = true; // no runtime code needed + if (model.Verbose()) { + std::cout << "Output of Shape (dynamic input) is shape tensor: " << ConvertDimShapeToString(dimVals) << std::endl; + } } else model.AddIntermediateTensor(fNY, ETensorType::INT64, fOutput_shape); @@ -87,7 +101,7 @@ public: OpName = "op_" + OpName; if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first"); } std::stringstream out; @@ -101,6 +115,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } + return out.str(); + } + }; }//SOFIE diff --git a/core/inc/SOFIE/ROperator_Sigmoid.hxx b/core/inc/SOFIE/ROperator_Sigmoid.hxx new file mode 100644 index 0000000..6540b8c --- /dev/null +++ 
b/core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -0,0 +1,124 @@ +#ifndef SOFIE_ROPERATOR_Sigmoid +#define SOFIE_ROPERATOR_Sigmoid + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE{ + +template +class ROperator_Sigmoid final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Sigmoid(){} + ROperator_Sigmoid(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::SIGMOID; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Sigmoid Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + } + + + std::string Generate(std::string opName) override { + if (fShape.empty()){ + throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShape); + out << "\n//------ Sigmoid -- " << opName << "\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_" << fNX << "[id]));\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + 
"ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + bool IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "static_cast(1) / (static_cast(1) + exp(-(" + v + ")))"; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + + std::vector GetStdLibs() override { return { std::string("cmath") };} +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Sigmoid diff --git 
a/core/inc/SOFIE/ROperator_Slice.hxx b/core/inc/SOFIE/ROperator_Slice.hxx new file mode 100644 index 0000000..fb738cf --- /dev/null +++ b/core/inc/SOFIE/ROperator_Slice.hxx @@ -0,0 +1,592 @@ +#ifndef SOFIE_ROPERATOR_SLICE +#define SOFIE_ROPERATOR_SLICE + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include + + +namespace SOFIE{ + +// slice operator + +template +class ROperator_Slice final : public ROperator +{ + +private: + + // flags to indicate if start/end and steps are not defined at compiled time + bool fIsStartUndef = false; + bool fIsEndUndef = false; + bool fIsStepUndef = false; + bool fIdentitySlice = false; + std::string fNData; // input data tensor name + std::string fNOutput; // output data name + std::vector fNames; // tensor names for meta(axis) information + std::vector fShapeInput; // input shape + std::vector fShapeOutput; // output shape + std::vector fOutputShapeData; // output shape data in case output is a shape param tensor + + // saved Start/End.Steps are corrected from initial ONNX for negative/default values + // and are available for each axis + std::vector fStart; // starting values of slices for all axes + std::vector fEnd; // End values of slices for all axes + std::vector fSteps; // step values of slices for all axes + std::vector fStartDims; // input starting values of slices + std::vector fEndDims; // input End values of slices + std::vector fStepDims; // input step values of slices + std::vector fAxes; // axes for input start/emd/step values + + std::vector> fAttributes; // attributes for the version <=10 case + + +public: + + ROperator_Slice(){} + + // ctor for versions >= 10 + ROperator_Slice(std::string nameData, std::vector names, std::string nameOutput) + : fNData(UTILITY::Clean_name(nameData)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + fNames.resize(4); + // axes and steps can be optional + for (size_t i = 0; i < names.size(); ++i) { + fNames[i] 
= UTILITY::Clean_name(names[i]); + } + + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + // ctor for versions < 10 + ROperator_Slice(std::string nameData, std::vector starts, std::vector ends, std::vector axes, std::string nameOutput) + : fNData(UTILITY::Clean_name(nameData)), + fNOutput(UTILITY::Clean_name(nameOutput)) + { + fAttributes.push_back(starts); + fAttributes.push_back(ends); + fAttributes.push_back(axes); + } + + + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model"); + } + + std::vector> shapes; + fShapeInput = model.GetDimTensorShape(fNData); + shapes.push_back(fShapeInput); + + std::vector> itensors(4); + + if (fNames.size() > 0) { // size has to be equal to 4 + // loop on the extra 2 or 3 or 4 inputs + for (size_t i = 0; i < 4; ++i) { + if (!fNames[i].empty()) { + if (model.IsInitializedTensor(fNames[i])) { + auto dptr = model.GetInitializedTensorData(fNames[i]); + auto tensor = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNames[i]); + assert(vec.size() == 1); + itensors[i] = std::vector(tensor, tensor + vec[0]); + + } else if (model.IsShapeTensor(fNames[i])) { + // case is a shape tensor + if (i == 0) { + fStartDims = model.GetShapeTensorValues(fNames[i]); + } else if (i == 1) { + fEndDims = model.GetShapeTensorValues(fNames[i]); + } else if (i == 3) { + fStepDims = model.GetShapeTensorValues(fNames[i]); + } + } else { + // case is an intermediate tensor + auto shape = model.GetTensorShape(fNames[i]); + size_t s = shape[0]; + for (size_t k = 0; k < s; k++) { + if (i == 0) { + fStartDims.push_back( Dim{std::string("start_") + fNOutput + "_" + std::to_string(k)}); + fIsStartUndef = true; + } else if (i == 1) { + fEndDims.push_back(Dim{std::string("end_") + fNOutput + "_" + std::to_string(k)}); + fIsEndUndef = 
true; + } else if (i == 3) { + fStepDims.push_back(Dim{std::string("step_") + fNOutput + "_" + std::to_string(k)}); + fIsStepUndef = true; + } + } + } + } + } + } else { + // old slice versions + assert(fAttributes.size() > 1); + for (size_t i = 0; i < fAttributes.size(); i++) { + itensors[i] = fAttributes[i]; + } + } + size_t dim = fShapeInput.size(); + + // default values + fSteps = std::vector(dim, Dim{1}); + fStart = std::vector(dim, Dim{0}); + fEnd = fShapeInput; + + // default axes + if (itensors[2].empty()) { + fAxes.resize(dim); + std::iota(fAxes.begin(), fAxes.end(), 0); + } else { + fAxes = itensors[2]; + for (size_t i = 0; i < fAxes.size(); i++) { + // negative axes - they count from the back + if (fAxes[i] < 0) fAxes[i] = dim + fAxes[i]; + if (fAxes[i] < 0 || fAxes[i] >= static_cast(dim)) + throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(fAxes[i]) + + " for " + std::to_string(i)); + } + } + // Loop on axis to get start/end/step values + for (size_t i = 0; i < fAxes.size(); i++) { + if (!itensors[0].empty() ) + fStartDims.push_back(Dim{ static_cast(itensors[0][i])}); + if (fStartDims.empty()) + throw std::runtime_error("TMVA Slice Op : Missing start input tensor"); + + if (!itensors[1].empty()) + fEndDims.push_back(Dim{ static_cast(itensors[1][i])}); + else if (fEndDims.empty()) + throw std::runtime_error("TMVA Slice Op : Missing end input tensor"); + + if (!itensors[3].empty()) { + fStepDims.push_back(Dim{ static_cast(itensors[3][i])}); + } + else if (fStepDims.size() < fAxes.size()) // this can happen since it is optional + fStepDims.push_back(Dim{size_t(1)}); + + if (!fShapeInput[fAxes[i]].isParam) { + size_t iAxisDim = fShapeInput[fAxes[i]].dim; + //correct values if too large or too small + IType istart = 0; + if (!fStartDims[i].isParam) { + istart = static_cast(fStartDims[i].dim); + if (istart < 0) istart = iAxisDim + istart; + } + IType iend = static_cast(iAxisDim); + if (!fEndDims[i].isParam) { + iend = 
static_cast(fEndDims[i].dim); + if (iend < 0) iend = iAxisDim + iend; + } + //steps + IType istep = 1; + if (!fStepDims[i].isParam) { + istep = static_cast(fStepDims[i].dim); + } else { + throw std::runtime_error("TMVA Slice Op : parametric step inputs are not supported"); + } + // clamp start end values depending on steps + // start must be [0,N] for positive steps or [0,N-1] for negative + // end must be [0,N] for positive steps or [-1, N-1] for negative + if (istart < 0) istart = 0; + if (istep > 0) { + if (istart > static_cast(iAxisDim)) istart = static_cast(iAxisDim); + if (iend < 0) iend = 0; + if (iend > static_cast(iAxisDim)) iend = static_cast(iAxisDim); + } else if (istep < 0) { + if (istart > static_cast(iAxisDim)-1) istart = static_cast(iAxisDim) -1; + if (iend < -1) iend = -1; + if (iend > static_cast(iAxisDim)-1) iend = static_cast(iAxisDim) -1; + } else { + throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(istep) + + " for " + std::to_string(i)); + } + // for parametric values clamping we will done at run time + if (fStartDims[i].isParam) + fStart[fAxes[i]] = fStartDims[i]; + else + fStart[fAxes[i]] = Dim{size_t(istart)}; + if (fStartDims[i].isParam) + fEnd[fAxes[i]] = fEndDims[i]; + else + fEnd[fAxes[i]] = Dim{size_t(iend)}; + + fSteps[fAxes[i]] = Dim{size_t(istep)}; + } else { + //std::cout << i << " Param dim for " << fAxes[i] << " " << fShapeInput[fAxes[i]] << std::endl; + // correct only negative values + if (!fStartDims[i].isParam) { + IType istart = static_cast(fStartDims[i].dim); + if (istart < 0) { + std::string sstart = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-istart) +")"; + fStart[fAxes[i]] = Dim{sstart,size_t(-1)}; + } else { + fStart[fAxes[i]] = Dim{size_t(istart)}; + } + } else { + fStart[fAxes[i]] = fStartDims[i]; + } + if (!fEndDims[i].isParam) { + IType iend = static_cast(fEndDims[i].dim); + if (iend < 0) { + std::string send = std::string("(") + fShapeInput[fAxes[i]].param 
+ "-" + std::to_string(-iend) +")"; + fEnd[fAxes[i]] = Dim{send,size_t(-1)}; + } else if (iend == std::numeric_limits::max()){ + fEnd[fAxes[i]] = fShapeInput[fAxes[i]]; + } else { + fEnd[fAxes[i]] = Dim{size_t(iend)}; + } + } else { + fEnd[fAxes[i]] = fEndDims[i]; + } + + fSteps[fAxes[i]] = fStepDims[i]; + } + + } + // find output shape + fShapeOutput.resize(dim); + for (size_t i = 0; i < dim; i++) { + if (!fEnd[i].isParam && !fStart[i].isParam && !fSteps[i].isParam) { + int64_t istart = static_cast(fStart[i].dim); + int64_t iend = static_cast(fEnd[i].dim); + int64_t istep= static_cast(fSteps[i].dim); + int64_t s = (iend-istart)/istep; + fShapeOutput[i] = Dim{static_cast(s)}; + } else { + std::string s; + if (fStart[i].GetVal() != "0") + s = "(" + fEnd[i].GetVal() + "-" + fStart[i].GetVal() + ")"; + else + s = fEnd[i].GetVal(); + if (fSteps[i].GetVal() != "1") { + s.insert(0,"("); + s += ")/" + fSteps[i].GetVal() + ")"; + } + fShapeOutput[i] = Dim{s,size_t(-1)}; + // add also the shape parameters to RModel to declare them when + // allocating output tensor + if (fEnd[i].isParam && fEnd[i].dim != size_t(-1)) + model.AddShapeParam(fEnd[i].param,fEnd[i].dim ); + if (fStart[i].isParam && fStart[i].dim != size_t(-1)) + model.AddShapeParam(fStart[i].param,fStart[i].dim ); + if (fSteps[i].isParam && fSteps[i].dim != size_t(-1)) + model.AddShapeParam(fSteps[i].param,fSteps[i].dim ); + + } + } + // case input is a constant tensor and of int64 type + if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { + fIsOutputConstant = true; + auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); + size_t outputSize = ConvertShapeToLength(ConvertShapeToInt(fShapeOutput)); + std::vector outputData(outputSize); + std::vector inputStride = UTILITY::ComputeStrideFromShape(ConvertShapeToInt(fShapeInput)); + if (model.Verbose()) { + std::cout << "Do slice for initialized input ..(start, end, step)\n"; + for (size_t ii = 0; ii< 
fStart.size(); ii++) + std::cout << fStart [ii] << " " << fEnd[ii] << " " << fSteps[ii] << std::endl; + } + // perform slice using a recursive function- need to use two lambda functions for this + auto sliceRecursive = [&](size_t iaxis, size_t & outIdx, size_t & inOffset) { + auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) { + if (fStart[iax].isParam || fEnd[iax].isParam || fSteps[iax].isParam) + throw std::runtime_error("TMVA Slice Op : cannot have parametric values when input is constant"); + // compute indices + std::vector indices; + for (IType i = (IType) fStart[iax].dim; (IType(fSteps[iax].dim) > 0) ? i < IType(fEnd[iax].dim) : i > IType(fEnd[iax].dim); i += IType(fSteps[iax].dim) ) + indices.push_back(i); + if (iax == dim-1) { // last axis + for (size_t i = 0; i < indices.size(); i++) { + outputData[outputIdx] = inputData[inputOffset + indices[i]]; + outputIdx++; + } + return; + } else { + for (size_t i = 0; i < indices.size(); i++) { + size_t offset = inputOffset + inputStride[iax]*indices[i]; + sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl); + } + } + }; + slice_impl(iaxis, outIdx, inOffset,slice_impl); + }; + size_t idx = 0; + size_t offset = 0; + sliceRecursive(0, idx, offset); + + model.AddConstantTensor(fNOutput, ConvertShapeToInt(fShapeOutput), outputData.data()); + if (model.Verbose()) { + std::cout << "Slice: output is a constant tensor " << ConvertDimShapeToString(fShapeOutput) << " : " + << ConvertValuesToString(outputData) << std::endl; + } + } + else if (model.IsShapeTensor(fNData) && !fStart[0].isParam && !fEnd[0].isParam) { + // case of input is a shape tensor. 
In this case rank=1 always, axis =0 and Slice is trivial + auto inputData = model.GetShapeTensorValues(fNData); + fOutputShapeData = std::vector(inputData.begin() + fStart[0].dim, inputData.begin() + fEnd[0].dim); + // try to convert to integer values if possible + auto outputData = ConvertShapeToInt(fOutputShapeData); + fShapeOutput = { Dim{fOutputShapeData.size()}}; + if (outputData.empty()) { + // is a param shape tensor + model.AddShapeTensor(fNOutput, fOutputShapeData); + fIsOutputParamShape = true; + if (model.Verbose()) { + std::cout << "Slice: output is a shape tensor -> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << " with values " + << ConvertDimShapeToString(fOutputShapeData) << " (shape)" << std::endl; + } + } else { + fIsOutputConstant = true; + std::vector data(outputData.size()); + std::copy(outputData.begin(), outputData.end(), data.begin()); + model.AddConstantTensor(fNOutput, {data.size()}, data.data()); + if (model.Verbose()) { + std::cout << "Slice: output is a constant tensor -> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << " with values " + << ConvertDimShapeToString(fOutputShapeData) << " constant " << std::endl; + } + } + } + else { + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + size_t ndim = fShapeInput.size(); + fIdentitySlice = fShapeOutput.size() == ndim; + // check also if input data is not input to the model. 
In that case we copy the data since we cannot just copy from the input pointer + fIdentitySlice &= (!model.IsReadyInputTensor(fNData) && !model.IsDimInputTensor(fNData)); + for (size_t idim = 0; idim < ndim; idim++) { + if (!fIdentitySlice) break; + fIdentitySlice &= (fStart[idim].GetVal() == "0"); + fIdentitySlice &= (fSteps[idim].GetVal() == "1"); + fIdentitySlice &= (fEnd[idim].GetVal() == fShapeInput[idim].GetVal()); + } + + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); + //if (fIdentitySlice) model.AddAliasTensor(fNOutput, fNData); + + if (model.Verbose()) { + std::cout << "Slice " << fNData << " " << ConvertDimShapeToString(fShapeInput) + << "---> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput); + if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) "; + std::cout << std::endl; + + } + } + } + + std::string Generate(std::string opName) override { + + if (fShapeInput.empty() || fShapeOutput.empty()){ + throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first"); + } + + std::stringstream out; + + out << "///------- Slice operator " << opName << "---> " << fNOutput << " " + << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; + if (fIsOutputConstant) return out.str(); //no op for constant tensors + if (fIsOutputParamShape) { + out << "/// Slice output is a shape tensor with values : " << ConvertDimShapeToString(fShapeOutput) << "\n"; + // need to generate code assigning values to shape tensors + for (int i = 0; i < static_cast(fShapeOutput[0].dim); i++) { + out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n"; + } + return out.str(); + } + + size_t ndim = fShapeInput.size(); + + if (fIdentitySlice) { + out << "/// Slice is just an identity (copy) \n"; + //out << SP << "tensor_" << fNOutput << " = const_cast<" << ConvertTypeToString(fOutputType) << " *>(tensor_" << fNData << ");\n"; + out << SP << 
"std::copy(tensor_" << fNData << ", tensor_" << fNData << " + " << ConvertDimShapeToLength(fShapeInput) << ", tensor_" << fNOutput << ");\n"; + return out.str(); + } + + // loop on the dimensions depending no the orders + auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); + + + out << SP << "{\n"; // define operator scope + for (size_t i = 0; i < fStepDims.size(); i++) { + if (fStepDims[i].isParam) { + if (fIsStepUndef) + out << SP << "size_t " << fStepDims[i] << " = tensor_" << fNames[3] << "[" << i << "];\n"; + } + } + // special case for parametric values for start/end. Need to do clipping + for (size_t i = 0; i < fStartDims.size(); i++) { + if (fStartDims[i].isParam && fStartDims[i].param != fShapeInput[fAxes[i]].param) { + std::string s_start = "start_" + std::to_string(i); + if (fIsStartUndef) { + s_start = fStartDims[i].param; + out << SP << "size_t " << s_start << " = tensor_" << fNames[0] << "[" << i << "];\n"; + } else { + out << SP << "size_t " << s_start << " = " << fStartDims[i] << ";\n"; + fStart[fAxes[i]] = s_start; // need to use this value later when slicing + } + out << SP << "if (" << s_start << " < 0) " << s_start << " += " << fShapeInput[fAxes[i]] <<";\n"; + out << SP << "if (" << s_start << " < 0) " << s_start << " = 0;\n"; + if (!fStepDims[i].isParam) { + if (static_cast(fStepDims[i].dim) > 0 ) + out << SP << "if (" << s_start << " > " << fShapeInput[fAxes[i]] << " ) " << s_start << " = " << fShapeInput[fAxes[i]] <<";\n"; + else + out << SP << "if (" << s_start << " > " << fShapeInput[fAxes[i]] << " - 1" << " ) " << s_start << " = " << fShapeInput[fAxes[i]] << " - 1;\n"; + } + } + // special case if step is negative and shape are equal and step is negative + else if (fStartDims[i].isParam && fStartDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast(fStepDims[i].dim) < 0 ) { + fStart[fAxes[i]] = Dim{ fStartDims[i].param + "-1" }; + } + } + // now to for end + for (size_t i = 0; i < fEndDims.size(); 
i++) { + if (fEndDims[i].isParam && fEndDims[i].param != fShapeInput[fAxes[i]].param) { + std::string s_end = "end_" + std::to_string(i); + if (fIsEndUndef) { + s_end = fEndDims[i].param; + out << SP << "size_t " << s_end << " = tensor_" << fNames[1] << "[" << i << "];\n"; + } else { + out << SP << "size_t " << s_end << " = " << fEndDims[i] << ";\n"; + fEnd[fAxes[i]] = s_end; // need to use this value later when slicing + } + out << SP << "if (" << s_end << " < 0) " << s_end << " += " << fShapeInput[fAxes[i]] <<";\n"; + if (!fStepDims[i].isParam) { + if (static_cast(fStepDims[i].dim) > 0 ) { + out << SP << "if (" << s_end << " < 0) " << s_end << " = 0;\n"; + out << SP << "if (" << s_end << " > " << fShapeInput[fAxes[i]] << " ) " << s_end << " = " << fShapeInput[fAxes[i]] <<";\n"; + } else { + out << SP << "if (" << s_end << " < -1) " << s_end << " = -1;\n"; + out << SP << "if (" << s_end << " > " << fShapeInput[fAxes[i]] << " - 1" << " ) " << s_end << " = " << fShapeInput[fAxes[i]] << " - 1;\n"; + } + } + } + // special case if step is negative and shape are equal and step is negative + else if (fEndDims[i].isParam && fEndDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast(fStepDims[i].dim) < 0 ) { + fEnd[fAxes[i]] = Dim{ fEndDims[i].param + "-1" }; + } + } + + out << SP << "size_t iOut = 0;\n"; + std::string MSP = SP; + for (size_t idim = 0; idim < ndim; idim++) { + out << MSP << "for (size_t i" << idim << " = " << fStart[idim] << "; i" << idim << " < " << fEnd[idim] + << "; i" << idim << "+= " << fSteps[idim] << ") {\n"; + MSP += SP; + if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n"; + } + out << MSP << "size_t iInput = "; + for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + "; + // here should be step size ? 
+ out << "i" << ndim-1 << ";\n"; + out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" <\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + outputStrides[d].GetVal() + "u) % " + + fShapeOutput[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Map each output coord back to input coord: + // input_coord[d] = fStart[d] + out_d * fSteps[d] + // Negative steps are supported naturally since fStart/fEnd/fSteps are + // already corrected for negative/default values during Initialize(). + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + // input coordinate for this dim: start + out_d * step + std::string input_coord = "(" + fStart[d].GetVal() + + " + out_" + std::to_string(d) + + " * " + fSteps[d].GetVal() + ")"; + op += SP + SP + SP + SP + SP + + "static_cast(" + input_coord + ")" + + " * " + inputStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "SliceKernel_" + opName; + return SP + kname + " sliceKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeOutput.empty()) + throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeOutput); + std::string kname = "sliceKernel_" + opName; + + std::stringstream out; + out << "\n//------ SLICE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNOutput << ")" + << ", static_cast(" << totalElements << "));\n"; + + return out.str(); + } + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_SLICE diff --git a/core/inc/SOFIE/ROperator_Softmax.hxx b/core/inc/SOFIE/ROperator_Softmax.hxx new file mode 100644 index 0000000..5626c0f --- /dev/null +++ b/core/inc/SOFIE/ROperator_Softmax.hxx @@ -0,0 +1,192 @@ +#ifndef SOFIE_ROPERATOR_Softmax +#define SOFIE_ROPERATOR_Softmax + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE { + +class ROperator_Softmax final : public ROperator { + 
+private: + bool fLogSoftmax; // for the logsoftmax case + bool fUseVDT = false; + int64_t fAttrAxis; + + std::string fNX; + std::string fNY; + std::vector fShape; + + std::string fType; + +public: + ROperator_Softmax() {} + ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY, bool logSoftmax = false) + : fLogSoftmax(logSoftmax), + fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) + + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { return input; } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; // suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNX) == + false) { // input must be a graph input, or already initialized intermediate tensor + throw std::runtime_error("SOFIE Softmax Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + fType = ConvertTypeToString(model.GetTensorType(fNX)); + if (model.Verbose()) { + std::cout << "Softmax -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; + } + fUseVDT = model.UseVDT(); + if (fUseVDT) { + model.AddNeededCustomHeader("vdt/exp.h"); + if (fLogSoftmax) + model.AddNeededCustomHeader("vdt/log.h"); + } + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Operator Softmax called to Generate without being initialized first"); + } + std::stringstream out; + out << "///------- Softmax " << opName << " ---> " // << fNY << " " + << ConvertDimShapeToString(fShape) << "\n" << std::endl; + size_t size = fShape.size(); + auto length_str = ConvertDimShapeToLength(fShape); + size_t axis = fAttrAxis < 0 ? size + fAttrAxis : fAttrAxis; + + std::string expFunction = (fUseVDT) ? 
"vdt::fast_expf" : "std::exp"; + std::string logFunction = (fUseVDT) ? "vdt::fast_logf" : "std::log"; + + // Check if this is the special case where memory is contiguous. + if (axis == size - 1) { + std::string axis_size = fShape[axis].GetVal(); + std::string num_rows; + if (IsInteger(length_str) && IsInteger(axis_size)) { + num_rows = std::to_string(std::stoul(length_str) / std::stoul(axis_size)); + } else { + num_rows = "(" + length_str + ") / (" + axis_size + ")"; + } + + out << SP << "//----- softmax axis is last one - " << axis << "\n"; + out << SP << "for (int i = 0; i < " << num_rows << "; ++i) {\n"; + out << SP << SP << "size_t offset = i * " << axis_size << ";\n"; + out << SP << SP << fType << " const * x_ptr = &tensor_" << fNX << "[offset];\n"; + out << SP << SP << fType << " * y_ptr = &tensor_" << fNY << "[offset];\n"; + + out << SP << SP << fType << " vmax = x_ptr[0];\n"; + out << SP << SP << "for (int j = 1; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "if (x_ptr[j] > vmax) vmax = x_ptr[j];\n"; + out << SP << SP << "}\n"; + + out << SP << SP << fType << " sum = 0.0;\n"; + out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "y_ptr[j] = " << expFunction << "(x_ptr[j] - vmax);\n"; + out << SP << SP << SP << "sum += y_ptr[j];\n"; + out << SP << SP << "}\n"; + + out << SP << SP << fType << " inv_sum = 1.0f / sum;\n"; + out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n"; + out << SP << SP << SP << "y_ptr[j] *= inv_sum;\n"; + if (fLogSoftmax) + out << SP << SP << SP << "y_ptr[j] = " << logFunction << "(y_ptr[j]);\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + + } else { + // generic case for any axis + auto stride = UTILITY::ComputeStrideFromShape(fShape); + size_t k = 0; + std::vector l(size); + for (size_t i = 0; i < size; i++) { + if (i != axis) { + for (size_t j = 0; j < k; j++) out << SP; + l[i] = std::string("i") + std::to_string(i); + out << SP << "for (int " 
<< l[i] << " = 0; " << l[i] << " < " << fShape[i] << "; " << l[i] << "++) {\n"; + k++; + } + } + for (size_t j = 0; j < size-1; j++) out << SP; + out << fType << " sum = 0.;\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "size_t index = "; + bool first = true; + for (size_t i = 0; i < size; i++) { + if (i == axis) continue; + if (!first) out << " + "; + if (stride[i].GetVal() != "1") + out << stride[i] << "*"; + out << l[i]; + first = false; + } + out << ";\n"; + // find maximum looping along reduced axis + for (size_t j = 0; j < size-1; j++) out << SP; + out << fType << " vmax = tensor_" << fNX << "[index];\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 1; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << fType << " x = tensor_" << fNX << "[index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << "];\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "if (x > vmax) vmax = x;\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + // compute softmax + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "size_t id = index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << ";\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] = " << expFunction << "(tensor_" << fNX << "[id] - vmax);\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "sum += tensor_" << fNY << "[id];\n"; + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + // normalize + for (size_t j = 0; j < size-1; j++) out << SP; + out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n"; + for (size_t j = 0; j < size; j++) out << SP; + out << "size_t id = index + i"; + if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")"; + out << ";\n"; + 
for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] /= sum;\n"; + if (fLogSoftmax) { + for (size_t j = 0; j < size; j++) out << SP; + out << "tensor_" << fNY << "[id] = " << logFunction << "(tensor_" << fNY << "[id]);\n"; + } + for (size_t j = 0; j < size-1; j++) out << SP; + out << "}\n"; + //end loops + for (int i = static_cast(k) - 1; i >= 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + } + return out.str(); + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_Softmax diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/core/inc/SOFIE/ROperator_Split.hxx similarity index 51% rename from src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx rename to core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..9604ca8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/core/inc/SOFIE/ROperator_Split.hxx @@ -51,14 +51,14 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Split Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Split Op Input Tensor is not found in model"); } fInputShape = model.GetTensorShape(fNX); // correct for negative axis if (fAxis < 0) fAxis += fInputShape.size(); if (fAxis < 0 || fAxis >= static_cast(fInputShape.size()) ) - throw std::runtime_error("TMVA SOFIE Split - invalid axis " + std::to_string(fAxis)); + throw std::runtime_error("SOFIE Split - invalid axis " + std::to_string(fAxis)); // compute output shapes size_t nsplit = fNYs.size(); @@ -77,10 +77,10 @@ public: } else { // get split tensor values if (!model.IsInitializedTensor(fNSplit)) - throw std::runtime_error("TMVA SOFIE Split - non-initialized split tensors are not supported"); + throw std::runtime_error("SOFIE Split - non-initialized split tensors are not supported"); auto splitShape = model.GetTensorShape(fNSplit); if 
(splitShape.size() != 1 || splitShape[0] != nsplit) - throw std::runtime_error("TMVA SOFIE Split - split input tensor has invalid shape"); + throw std::runtime_error("SOFIE Split - split input tensor has invalid shape"); auto split_data = static_cast(model.GetInitializedTensorData(fNSplit).get()); fSplit = std::vector(split_data, split_data + nsplit); } @@ -94,7 +94,7 @@ public: fOutputShapes.push_back(outputShape); } if (tot_split != fInputShape[fAxis]) - throw std::runtime_error("TMVA SOFIE Split - Sum of split sizes must match the input dimension along the axis"); + throw std::runtime_error("SOFIE Split - Sum of split sizes must match the input dimension along the axis"); if (model.Verbose()) { @@ -109,7 +109,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fOutputShapes.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first"); } auto input_strides = UTILITY::ComputeStrideFromShape(fInputShape); @@ -153,6 +153,105 @@ public: return out.str(); } +std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fOutputShapes.empty()) + throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first"); + + const std::size_t D = fInputShape.size(); + const std::size_t Nin = fNYs.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape); + + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + std::cout<<"Generating GPU kernel for Split operator with input shape "<< ConvertShapeToString(fInputShape) << " and output shapes : "; + for (std::size_t i = 0; i < Nin; ++i) { + std::cout<<"Loop running for output "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* input,\n"; + op += SP + 
SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fOutputShapes[i][d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " + " + std::to_string(axis_offset) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + std::cout<<"Finished generating GPU kernel for Split operator "<(1));\n"; + out << SP << SP << "auto const elementsPerGrid_" << i << " = Vec::all(Idx{" << length << "});\n"; + out << SP << SP << "auto const workDiv_" << i << " = sofie_workdiv(elementsPerGrid_" << i << ");\n"; + out << SP << SP << "auto task_" << opName << "_" << i << " = alpaka::createTaskKernel(workDiv_" << i + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << opName << "_" << i << ");\n"; + out << SP << "}\n"; + } + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx b/core/inc/SOFIE/ROperator_SubGraph.hxx similarity index 92% rename from src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx rename to core/inc/SOFIE/ROperator_SubGraph.hxx index cb17671..e273bde 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx +++ b/core/inc/SOFIE/ROperator_SubGraph.hxx @@ -34,8 +34,7 @@ public: n = UTILITY::Clean_name(n); fInputTensorNames = { fNX }; - std::transform(fNYs.begin(), fNYs.end(), fOutputTensorNames.begin(), - [](const std::string& s) -> std::string_view { return s; }); + fOutputTensorNames.assign(fNYs.begin(), fNYs.end()); } std::vector TypeInference(std::vector input) override { @@ -50,7 +49,7 @@ public: void Initialize(RModel& model) override { //input must be a graph input, or already initialized intermediate tensor if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE If Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE If Op Input Tensor is not found in model"); } //add the subgraph model to parent RModel and initialize them 
model.InitializeSubGraph(fModel_then); @@ -71,7 +70,7 @@ public: fType = type; else { if (type != fType) - throw std::runtime_error("TMVA SOFIE If Op supports only all outputs of the same type"); + throw std::runtime_error("SOFIE If Op supports only all outputs of the same type"); } model.AddIntermediateTensor(fNYs[i], fType, shape ); } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx b/core/inc/SOFIE/ROperator_Swish.hxx similarity index 82% rename from src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx rename to core/inc/SOFIE/ROperator_Swish.hxx index a2552f1..cecdd3c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx +++ b/core/inc/SOFIE/ROperator_Swish.hxx @@ -17,7 +17,7 @@ private: std::string fNX; std::string fNY; - std::vector fShape; + std::vector fShape; public: ROperator_Swish(){} @@ -38,9 +38,9 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Swish Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE Swish Op Input Tensor is not found in model"); } - fShape = model.GetTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); } @@ -48,13 +48,10 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Swish called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator Swish called to Generate without being initialized first"); } std::stringstream out; - int length = 1; - for(auto& i: fShape){ - length *= i; - } + std::string length = ConvertDimShapeToLength(fShape); out << "\t" << "for (int id = 0; id < " << length << " ; id++){\n"; out << "\t\t" << "tensor_" << fNY << "[id] = tensor_" << fNX <<"[id] / (1 + std::exp( - tensor_" << fNX << "[id]));\n"; out << 
"\t}\n"; diff --git a/core/inc/SOFIE/ROperator_Tanh.hxx b/core/inc/SOFIE/ROperator_Tanh.hxx new file mode 100644 index 0000000..f71b89f --- /dev/null +++ b/core/inc/SOFIE/ROperator_Tanh.hxx @@ -0,0 +1,112 @@ +#ifndef SOFIE_ROPERATOR_Tanh +#define SOFIE_ROPERATOR_Tanh + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class ROperator_Tanh final : public ROperator +{ + +private: + + std::string fNX; + std::string fNY; + std::vector fShape; + +public: + ROperator_Tanh(){} + ROperator_Tanh(std::string nameX, std::string nameY): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::TANH; + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + auto ret = input; //suggest copy to compiler + return ret; + } + + void Initialize(RModel& model) override { + //input must be a graph input, or already initialized intermediate tensor + if (model.CheckIfTensorAlreadyExist(fNX) == false){ + throw std::runtime_error("SOFIE Tanh Op Input Tensor is not found in model"); + } + fShape = model.GetDimTensorShape(fNX); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + + } + + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Tanh operator called to Generate without being initialized first"); + } + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShape); + out << "\n//------ TANH\n"; + out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n"; + out << SP << "}\n"; + return out.str(); + } + + std::vector GetStdLibs() override { return { std::string("cmath") };} + + bool 
IsElementwise() const override { return true; } + std::string GetElementwiseExpr(const std::string& v) const override { + return "tanh(" + v + ")"; + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ TANH_KERNEL_ALPAKA\n"; + op += "struct TanhKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if (idx < numElements) { out[idx] = tanh(data[idx]); }\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "TanhKernel tanhKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("SOFIE Tanh called to Generate_GPU_ALPAKA without being initialized"); + } + std::stringstream out; + std::string length = ConvertDimShapeToLength(fShape); + out << "\n//------ TANH_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNX + << ", tanhKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_Tanh diff --git a/core/inc/SOFIE/ROperator_Tile.hxx b/core/inc/SOFIE/ROperator_Tile.hxx new file mode 100644 index 0000000..5a3921e --- /dev/null +++ b/core/inc/SOFIE/ROperator_Tile.hxx @@ -0,0 +1,249 @@ +#ifndef SOFIE_ROPERATOR_Tile +#define SOFIE_ROPERATOR_Tile + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + + +namespace SOFIE{ + +template +class 
ROperator_Tile final : public ROperator +{ + +private: + + std::string fNRepeats; + std::string fNInput; + std::string fNY; + std::vector fShapeInput; + std::vector fShapeY; + std::vector fRepeats; + +public: + ROperator_Tile(){} + ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): + fNRepeats(UTILITY::Clean_name(nameRepeat)), + fNInput(UTILITY::Clean_name(nameInput)), + fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNRepeats, fNInput }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + std::vector ret = input[0]; + for (size_t i = 0; i < input[1].size(); i++) + ret[i] = ret[i] * input[1][i]; + return {ret}; + } + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNInput) == false) + throw std::runtime_error("SOFIE Tile Op Input Tensor is not found in model"); + if (model.CheckIfTensorAlreadyExist(fNRepeats) == false) + throw std::runtime_error("SOFIE Tile Op Repeats Tensor is not found in model"); + + fShapeInput = model.GetTensorShape(fNInput); + + if (!model.IsInitializedTensor(fNRepeats)) + throw std::runtime_error("SOFIE Tile Op: non-initialized repeats input is not supported"); + + auto repptr = model.GetInitializedTensorData(fNRepeats); + auto repeats_data = static_cast(repptr.get()); + if (repeats_data == nullptr) + throw std::runtime_error("SOFIE Tile Op: failed to retrieve repeats tensor data"); + + auto repeats_shape = model.GetTensorShape(fNRepeats); + if (repeats_shape.size() != 1) + throw std::runtime_error("SOFIE Tile Op: repeats tensor must be 1D"); + + size_t num_elements = repeats_shape[0]; + + // Save repeats if known at generation time so the GPU kernel can bake + // fShapeInput[d] directly without needing a runtime repeats pointer. 
+ // fRepeats is left empty if repeats are not initialized (future case), + // which will cause the kernel to use the runtime repeats pointer path. + fRepeats.resize(num_elements); + std::copy(repeats_data, repeats_data + num_elements, fRepeats.begin()); + if (fRepeats.size()){ + model.RemoveInitializedTensor(fNRepeats); + } + fShapeY = ShapeInference({fShapeInput, fRepeats})[0]; + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); + + if (model.Verbose()) + std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) + << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + << " given repeats " << ConvertShapeToString(fRepeats) << std::endl; + } + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Tile Op called to Generate without being initialized first"); + + std::stringstream out; + std::string input = "tensor_" + fNInput; + std::string output = "tensor_" + fNY; + std::string repeats = "tensor_" + fNRepeats; + + out << "///-------- Tile operator\n"; + out << "{\n"; + + out << SP << "const int input_shape[" << fShapeInput.size() << "] = {"; + for (size_t i = 0; i < fShapeInput.size(); ++i) { + if (i > 0) out << ", "; + out << fShapeInput[i]; + } + out << "};\n"; + + out << SP << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << SP << "int s = 1;\n"; + + // Read repeats from the tensor at runtime so the generated code remains + // correct even if repeats become a runtime input/intermediate in the future + out << SP << "for (int i = " << fShapeInput.size() - 1 << "; i >= 0; i--) {\n"; + out << SP << SP << "int r = " << repeats << "[i];\n"; + out << SP << SP << "int i_offset = 0, o_offset = 0;\n"; + out << SP << SP << "s = s * input_shape[i];\n"; + out << SP << SP << "if (i == " << fShapeInput.size() - 1 << ") {\n"; + out << SP << SP << SP << "for (int j = 0; j < inputLength / 
s; j++) {\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << input << " + i_offset, " + << input << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << SP << "i_offset += s;\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "} else {\n"; + out << SP << SP << SP << "for (int j = inputLength / s - 1; j >= 0; j--) {\n"; + out << SP << SP << SP << SP << "o_offset = j * s * r;\n"; + out << SP << SP << SP << SP << "i_offset = j * s;\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << output << " + i_offset, " + << output << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << SP << "s *= r;\n"; + out << SP << SP << "inputLength *= r;\n"; + out << SP << "}\n"; + out << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first"); + + const std::size_t D = fShapeInput.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeInput); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeY); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + // If fRepeats is populated, repeats were known at generation time and + // we can bake fShapeInput[d] as literals — no runtime repeats pointer needed. + // If fRepeats is empty (future: runtime repeats), pass repeats as a kernel arg. 
+ bool repeatsKnown = !fRepeats.empty(); + + std::string kname = "TileKernel_" + opName; + + std::string op; + op = "\n//------ TILE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + if (!repeatsKnown) + op += SP + SP + SP + "int64_t const* __restrict__ repeats,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index — output strides always compile-time + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + // Input index: fShapeInput[d] is always a compile-time constant since + // it is the input tensor shape, never runtime-variable. + // When repeatsKnown, we bake it directly as a literal. + // When not repeatsKnown (future), we still use fShapeInput[d] as a + // literal for the % — repeats pointer is only needed if fShapeY is dynamic. + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + + "(out_" + std::to_string(d) + " % " + std::to_string(fShapeInput[d]) + "u)" + + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "TileKernel_" + opName; + return SP + kname + " tileKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first"); + + bool repeatsKnown = !fRepeats.empty(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "tileKernel_" + opName; + + // Build argument list once, reused for both getValidWorkDiv and exec + std::string args = + "alpaka::getPtrNative(deviceBuf_" + fNInput + "), " + + "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (!repeatsKnown) + args += ", alpaka::getPtrNative(deviceBuf_" + fNRepeats + ")"; + args += ", static_cast(" + std::to_string(totalElements) + ")"; + + std::stringstream out; + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "auto task_" << opName << " = alpaka::createTaskKernel(workDiv_" << opName + << ", " << kname << ", " << args << ");\n"; + out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n"; + return out.str(); + } + +}; + +}//SOFIE + +#endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx b/core/inc/SOFIE/ROperator_TopK.hxx similarity index 94% rename from src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx rename to 
core/inc/SOFIE/ROperator_TopK.hxx index 06d8179..7db1768 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx +++ b/core/inc/SOFIE/ROperator_TopK.hxx @@ -48,7 +48,7 @@ public: std::vector> ShapeInference(std::vector> input) override { if (input.size() != 2) { - throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); + throw std::runtime_error("SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); } auto shape = input[0]; // Shape format: [ m x n x o x p ... ] @@ -62,11 +62,11 @@ public: void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor is not found in model"); + throw std::runtime_error("SOFIE TopK Op Input Tensor is not found in model"); } if (model.CheckIfTensorAlreadyExist(fNK) == false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model"); + throw std::runtime_error("SOFIE TopK Op Input Tensor i.e. K is not found in model"); } fShapeX = model.GetTensorShape(fNX); @@ -77,7 +77,7 @@ public: fAttrAxis = fAttrAxis < 0 ? 
fShapeX.size() + fAttrAxis : fAttrAxis; if(static_cast(fAttrAxis) >= fShapeX.size()){ throw - std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" ."); + std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+std::to_string(fShapeX.size())+" ."); } // fK cannot be larger that axis dimension fK = std::min(fK, fShapeX[fAttrAxis]); @@ -111,7 +111,7 @@ public: std::string Generate(std::string OpName) override { OpName = "op_" + OpName; if (fShapeX.empty()) { - throw std::runtime_error("TMVA SOFIE Operator TopK called to Generate without being initialized first"); + throw std::runtime_error("SOFIE Operator TopK called to Generate without being initialized first"); } std::stringstream out; size_t size = fShapeX.size(); diff --git a/core/inc/SOFIE/ROperator_Transpose.hxx b/core/inc/SOFIE/ROperator_Transpose.hxx new file mode 100644 index 0000000..03dad41 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Transpose.hxx @@ -0,0 +1,239 @@ +#ifndef SOFIE_ROPERATOR_TRANSPOSE +#define SOFIE_ROPERATOR_TRANSPOSE + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include + + +namespace SOFIE{ + + + + +template +class ROperator_Transpose final : public ROperator +{ + +private: + std::vector fAttrPerm; + + std::string fNData; + std::string fNOutput; + std::vector fShapeData; // used for initialized (constant) tensor case + std::vector fShapeOutput; // used for initialized (constant) tensor case + std::vector fDimShapeData; // used for dynamic/runtime tensor case + std::vector fDimShapeOutput; // used for dynamic/runtime tensor case + +public: + + ROperator_Transpose(){} + ROperator_Transpose(std::vector attr_perm, std::string nameData, std::string nameOutput): + fAttrPerm(attr_perm), fNData(UTILITY::Clean_name(nameData)), 
fNOutput(UTILITY::Clean_name(nameOutput)) { + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + + ROperator_Transpose(std::string nameData, std::string nameOutput): + fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) { + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; + } + + std::vector TypeInference(std::vector input) override { + return input; + } + + std::vector> ShapeInference(std::vector> input) override { + if (input.size() > 1) throw std::runtime_error("SOFIE Tranpose Op Shape Inference only need 1 input tensor"); + auto& data = input[0]; + if (fAttrPerm.size() != data.size() ) + throw std::runtime_error("SOFIE Tranpose Op - Invalid axes attributes"); + + std::vector output_shape(fAttrPerm.size()); + for (size_t i = 0; i < fAttrPerm.size(); i++){ + output_shape[i] = data[fAttrPerm[i]]; + } + std::vector> ret; + ret.push_back(output_shape); + return ret; + } + + + void Initialize(RModel& model) override { + if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor + std::cout<<"Input tensor for transpose: "<= 0; i--){ + fAttrPerm.push_back(i); + } + } + std::vector> inputs = { fShapeData }; + fShapeOutput = ShapeInference(inputs).front(); + fIsOutputConstant = true; + auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData); + auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); + size_t length = ConvertShapeToLength(fShapeOutput); + auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); + size_t dim = fShapeData.size(); + std::vector outputIdx(dim); + std::vector outputData(length); + for (size_t i = 0; i < length; i++) { + outputIdx[0] = i / outStrides[0]; + for (size_t j = 1; j < dim; j++) { + outputIdx[j] = (i % outStrides[j-1]) / outStrides[j]; + } + // compute input index + size_t inputIndex = 0; + for (size_t j = 0; j < dim; j++) { + // find value in fAtrrPerm 
corresponding to j + int k = std::find(fAttrPerm.begin(), fAttrPerm.end(), j) - fAttrPerm.begin(); + inputIndex += outputIdx[k] * inStrides[j]; + } + outputData[i] = inputData[inputIndex]; + } + model.AddConstantTensor(fNOutput, fShapeOutput, outputData.data()); + if (model.Verbose()) { + std::cout << "Transpose: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : " + << ConvertValuesToString(outputData) << std::endl; + } + } else { + // Non-initialized (runtime/dynamic) tensor: use Dim-aware shapes + fDimShapeData = model.GetDimTensorShape(fNData); + size_t rank = fDimShapeData.size(); + if (fAttrPerm.empty()){ + fAttrPerm.reserve(rank); + for (int i = rank - 1; i >= 0; i--){ + fAttrPerm.push_back(i); + } + } + fDimShapeOutput.resize(fAttrPerm.size()); + for (size_t i = 0; i < fAttrPerm.size(); i++){ + fDimShapeOutput[i] = fDimShapeData[fAttrPerm[i]]; + } + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fDimShapeOutput); + if (model.Verbose()) { + std::cout << "Transpose ---> " << fNOutput << " " << ConvertDimShapeToString(fDimShapeOutput) << std::endl; + } + } + } + + std::string Generate(std::string OpName) override { + if (fIsOutputConstant) return ""; //no op for constant tensors + OpName = "op_" + OpName; + // Use Dim shapes when available (dynamic case), else convert from concrete shapes + auto dimShapeData = fDimShapeData.empty() ? ConvertShapeToDim(fShapeData) : fDimShapeData; + auto dimShapeOutput = fDimShapeOutput.empty() ? 
ConvertShapeToDim(fShapeOutput) : fDimShapeOutput; + if (dimShapeData.empty() || dimShapeOutput.empty()){ + throw std::runtime_error("SOFIE Transpose Op called to Generate without being initialized first"); + } + int dim = dimShapeData.size(); + auto inStrides = UTILITY::ComputeStrideFromShape(dimShapeData); + auto outStrides = UTILITY::ComputeStrideFromShape(dimShapeOutput); + std::string length = ConvertDimShapeToLength(dimShapeOutput); + + std::stringstream out; + // Implement transpose operator using consecutive write outputs. + // tensorOut[id] = tensorInput[ inStrides[0]*i0 + inStrides[1]*i1 + ...] + // where j_k = i_fAttrPerm[k] and (j0,j1,...) are the output indices for id + out << SP << "///------- Transpose operator\n" << std::endl; + out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; + out << SP << SP << "tensor_" << fNOutput << "[id] = tensor_" << fNData << "[ "; + // compute output j indices from id + std::vector i_out(dim); + for (int k = 0; k < dim; k++){ + if (k == 0) + i_out[k] = "id"; + else + i_out[k] = "(id % " + outStrides[k-1].GetVal() + ")"; + if (k < dim-1) + i_out[k] += " / " + outStrides[k].GetVal(); + } + // use output indices to compute input index, inverting the permutation + for (int k = 0; k < dim; k++){ + int l = std::find(fAttrPerm.begin(), fAttrPerm.end(), k) - fAttrPerm.begin(); + assert(l >= 0 && l < dim); + out << "( " << i_out[l] << " )"; + if (k < dim-1) { + out << " * " << inStrides[k].GetVal(); + out << " + "; + } + } + out << "];\n"; + out << SP << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) { + std::string op; + OpName = "op_" + OpName; + op = "\n//------ TRANSPOSE_KERNEL_ALPAKA\n"; + op += SP + "struct TransposeKernel_" + OpName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,"; + op += "const std::size_t totalElements) const {\n"; + op += SP + SP + SP + SP + "auto const 
idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + SP + "if(idx >= totalElements) return;\n"; + op += SP + SP + SP + SP + "std::size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "std::size_t remaining = idx;\n"; + op += SP + SP + SP + SP + "std::size_t coord;\n"; + + auto dimShapeData = fDimShapeData.empty() ? ConvertShapeToDim(fShapeData) : fDimShapeData; + auto dimShapeOutput = fDimShapeOutput.empty() ? ConvertShapeToDim(fShapeOutput) : fDimShapeOutput; + auto inputStrides = UTILITY::ComputeStrideFromShape(dimShapeData); + auto outputStrides = UTILITY::ComputeStrideFromShape(dimShapeOutput); + + for (size_t k = 0; k < dimShapeData.size(); k++) { + op += SP + SP + SP + SP + "coord = remaining / " + + outputStrides[k].GetVal() + "u;\n"; + op += SP + SP + SP + SP + "remaining = remaining - coord * " + + outputStrides[k].GetVal() + "u;\n"; + op += SP + SP + SP + SP + "input_idx += coord * " + + inputStrides[fAttrPerm[k]].GetVal() + "u;\n"; + } + + op += SP + SP + SP + SP + "output[idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override { + return SP + "TransposeKernel_op_" + OpName + " transposeKernel_" + OpName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + auto dimShapeOutput = fDimShapeOutput.empty() ? 
ConvertShapeToDim(fShapeOutput) : fDimShapeOutput; + if (dimShapeOutput.empty()) { + throw std::runtime_error("SOFIE Operator Transpose called to Generate without being initialized first"); + } + std::stringstream out; + std::string length = ConvertDimShapeToLength(dimShapeOutput); + + out << "\n//------ TRANSPOSE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"<(workDiv_" << fNOutput + << ", transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n"; + return out.str(); + } + +}; + +}//SOFIE + + +#endif //SOFIE_ROPERATOR_TRANSPOSE diff --git a/core/inc/SOFIE/ROperator_Trilu.hxx b/core/inc/SOFIE/ROperator_Trilu.hxx new file mode 100644 index 0000000..04e18d5 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Trilu.hxx @@ -0,0 +1,232 @@ +#ifndef SOFIE_ROPERATOR_TRILU +#define SOFIE_ROPERATOR_TRILU + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include +#include +#include + +namespace SOFIE { + +template +class ROperator_Trilu final : public ROperator { +private: + int fUpper = 1; + + int64_t fK = 0; + bool fKIsStatic= true; + + std::string fNX; + std::string fNK; + std::string fNY; + + std::vector fShape; + size_t fM = 0; + size_t fN = 0; + size_t fBatch = 1; + size_t fTotal = 0; + +public: + ROperator_Trilu() {} + + ROperator_Trilu(int upper, std::string nameX, std::string nameY) + : fUpper(upper), + fNX(UTILITY::Clean_name(nameX)), + fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNX }; + fOutputTensorNames = { fNY }; + } + + ROperator_Trilu(int upper, std::string nameX, std::string nameK, std::string nameY) + : fUpper(upper), + fNX(UTILITY::Clean_name(nameX)), + fNK(UTILITY::Clean_name(nameK)), + fNY(UTILITY::Clean_name(nameY)) + 
{ + fInputTensorNames = { fNX, fNK }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return { input[0] }; + } + + std::vector> ShapeInference(std::vector> input) override { + if (input.empty()) + throw std::runtime_error("SOFIE Trilu ShapeInference: no input shapes"); + return { input[0] }; // output has the same shape as input + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNX)) + throw std::runtime_error("SOFIE Trilu: input tensor '" + fNX + + "' not found in model"); + + fShape = model.GetTensorShape(fNX); + if (fShape.size() < 2) + throw std::runtime_error("SOFIE Trilu: input tensor '" + fNX + + "' must have at least 2 dimensions, got " + + std::to_string(fShape.size())); + + fN = fShape.back(); + fM = fShape[fShape.size() - 2]; + fBatch = 1; + for (size_t d = 0; d + 2 < fShape.size(); ++d) + fBatch *= fShape[d]; + fTotal = fBatch * fM * fN; + + if (!fNK.empty()) { + if (model.IsInitializedTensor(fNK) || model.IsConstantTensor(fNK)) { + // Bake the constant value into generated code. 
+ auto data_ptr = static_cast( + model.GetInitializedTensorData(fNK).get()); + fK = data_ptr[0]; + fKIsStatic = true; + } else { + fKIsStatic = false; + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); + + if (model.Verbose()) { + std::cout << "Trilu: " << fNX + << " upper=" << fUpper << " k="; + if (fKIsStatic) std::cout << fK; + else std::cout << "dyn(" << fNK << ")"; + std::cout << " -> " << fNY + << " " << ConvertShapeToString(fShape) << std::endl; + } + } + + std::string Generate(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) + throw std::runtime_error( + "SOFIE Trilu: Generate called before Initialize"); + + std::stringstream out; + out << "\n//------ TRILU\n"; + + if (fKIsStatic) { + out << SP << "const int64_t k_" << OpName << " = " << fK << "LL;\n"; + } else { + out << SP << "const int64_t k_" << OpName + << " = static_cast(tensor_" << fNK << "[0]);\n"; + } + + out << SP << "for (std::size_t id = 0; id < " << fTotal << "u; ++id) {\n"; + out << SP << SP << "const std::size_t mat_id = id % " + << (fM * fN) << "u;\n"; + out << SP << SP << "const std::ptrdiff_t row = " + << "static_cast(mat_id / " << fN << "u);\n"; + out << SP << SP << "const std::ptrdiff_t col = " + << "static_cast(mat_id % " << fN << "u);\n"; + if (fUpper) { + out << SP << SP << "const bool keep = (col >= row + k_" << OpName << ");\n"; + } else { + out << SP << SP << "const bool keep = (col <= row + k_" << OpName << ");\n"; + } + out << SP << SP << "tensor_" << fNY << "[id] = keep ? 
tensor_" << fNX + << "[id] : static_cast(0);\n"; + out << SP << "}\n"; + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) + throw std::runtime_error( + "SOFIE Trilu: Generate_GPU_Kernel_ALPAKA called before Initialize"); + + std::stringstream op; + op << "\n//------ TRILU_KERNEL_ALPAKA\n"; + op << "struct TriluKernel_" << OpName << " {\n"; + op << SP << "template\n"; + op << SP << "ALPAKA_FN_ACC void operator()(" + << "TAcc const& acc, " + << "T const* __restrict__ input, " + << "T* __restrict__ output, " + << "const std::size_t total, " + << "const std::ptrdiff_t k) const {\n"; + op << SP << SP << "auto const idx = " + << "alpaka::getIdx(acc)[0];\n"; + op << SP << SP << "if (idx >= total) return;\n"; + op << SP << SP << "constexpr std::size_t N = " << fN << "u;\n"; + op << SP << SP << "constexpr std::size_t MN = " << (fM * fN) << "u;\n"; + op << SP << SP << "const std::size_t mat_id = idx % MN;\n"; + op << SP << SP << "const std::ptrdiff_t row = " + << "static_cast(mat_id / N);\n"; + op << SP << SP << "const std::ptrdiff_t col = " + << "static_cast(mat_id % N);\n"; + if (fUpper) { + op << SP << SP << "const bool keep = (col >= row + k);\n"; + } else { + op << SP << SP << "const bool keep = (col <= row + k);\n"; + } + op << SP << SP << "output[idx] = keep ? 
input[idx] : T(0);\n"; + op << SP << "}\n"; + op << "};\n"; + return op.str(); + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override { + std::string cleaned = "op_" + OpName; + return SP + "TriluKernel_" + cleaned + " triluKernel_" + cleaned + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fShape.empty()) + throw std::runtime_error( + "SOFIE Trilu: Generate_GPU_ALPAKA called before Initialize"); + + std::string cleanOp = "op_" + OpName; + std::stringstream out; + out << "\n//------ TRILU_GPU_ALPAKA\n"; + + if (fKIsStatic) { + out << SP << "const std::ptrdiff_t k_" << cleanOp + << " = static_cast(" << fK << "LL);\n"; + } else { + out << SP << "std::ptrdiff_t k_" << cleanOp << ";\n"; + out << SP << "{\n"; + out << SP << SP + << "auto hostK = alpaka::allocBuf(host, Ext1D::all(Idx{1}));\n"; + out << SP << SP + << "alpaka::memcpy(queue, hostK, deviceBuf_" << fNK << ");\n"; + out << SP << SP << "alpaka::wait(queue);\n"; + out << SP << SP + << "k_" << cleanOp << " = static_cast(" + << "*reinterpret_cast(alpaka::getPtrNative(hostK)));\n"; + out << SP << "}\n"; + } + + out << SP << "auto const elementsPerThread_" << fNY + << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << fNY + << " = Vec::all(Idx{" << fTotal << "});\n"; + out << SP << "auto const workDiv_" << fNY + << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; + out << SP << "auto task_" << cleanOp + << " = alpaka::createTaskKernel(workDiv_" << fNY + << ", triluKernel_" << cleanOp + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << fTotal << ")" + << ", k_" << cleanOp << ");\n"; + out << SP << "alpaka::enqueue(queue, task_" << cleanOp << ");\n"; + return out.str(); + } +}; + +} // namespace SOFIE + +#endif // SOFIE_ROPERATOR_TRILU diff --git a/core/inc/SOFIE/ROperator_Where.hxx b/core/inc/SOFIE/ROperator_Where.hxx new file mode 
100644 index 0000000..b9956e9 --- /dev/null +++ b/core/inc/SOFIE/ROperator_Where.hxx @@ -0,0 +1,613 @@ +#ifndef SOFIE_ROperator_Where +#define SOFIE_ROperator_Where + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include + +namespace SOFIE{ + +template +class ROperator_Where final : public ROperator{ +private: + + bool fIsInputBoolTensor = false; + + + std::string fNX; + std::string fNY; + std::string fNC; + std::string fNBroadcastedX; + std::string fNBroadcastedY; + std::string fNBroadcastedC; + std::string fNZ; + + + + // static shapes (used when tensors are not dynamic) ) + std::vector fShapeX; + std::vector fShapeY; + std::vector fShapeC; + std::vector fShapeZ; + + // Dynamic generic shapes + std::vector fDimShapeC; + std::vector fDimShapeX; + std::vector fDimShapeY; + std::vector fDimShapeZ; + + // Broadcast flag: mirrors convention of BasicBinary + // bit 0: broadcast Y->X (Y needs expanding) + // bit 1: broadcast X->Y (X needs expanding) + // bit 2: broadcast C->Z (C needs expanding) + // bit 4: shapes may differ at runtime (dynamic) + int fBroadcastFlag = 0; + +public: + ROperator_Where(){} + ROperator_Where(const std::string & nameC, const std::string & nameX, const std::string & nameY, const std::string & nameZ): + fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), fNC(UTILITY::Clean_name(nameC)), fNZ(UTILITY::Clean_name(nameZ)){ + fInputTensorNames = { fNX, fNY, fNC }; + fOutputTensorNames = { fNZ }; + } + + // type of output given input + std::vector TypeInference(std::vector input) override { + return input; + } + + // shape of output tensors given input tensors + std::vector> ShapeInference(std::vector> input) override { + // assume now inputs have same shape (no broadcasting) + auto ret = std::vector>(1, input[0]); // return vector size 1 with first input + return ret; + } + + void Initialize(RModel& model) override { + // input must be a graph input, or already initialized 
intermediate tensor + if (!model.CheckIfTensorAlreadyExist(fNX)){ + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNX + " is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNY)) { + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNY + " is not found in model"); + } + if (!model.CheckIfTensorAlreadyExist(fNC)) { + throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNC + " is not found in model"); + } + // check if fNC input tensor is boolean + if (model.IsReadyInputTensor(fNC)) + fIsInputBoolTensor = true; + + // ---------------------------------------------------------------- // + // Collect shapes – dynamic or static + // ---------------------------------------------------------------- // + int dynamicInputs = 0; // bitmask: bit0=C, bit1=X, bit2=Y + + if (model.IsDynamicTensor(fNC)) { + fDimShapeC = model.GetDynamicTensorShape(fNC); + dynamicInputs |= 1; + } else { + fShapeC = model.GetTensorShape(fNC); + fDimShapeC = ConvertShapeToDim(fShapeC); + } + if (model.IsDynamicTensor(fNX)) { + fDimShapeX = model.GetDynamicTensorShape(fNX); + dynamicInputs |= 2; + } else { + fShapeX = model.GetTensorShape(fNX); + fDimShapeX = ConvertShapeToDim(fShapeX); + } + if (model.IsDynamicTensor(fNY)) { + fDimShapeY = model.GetDynamicTensorShape(fNY); + dynamicInputs |= 4; + } else { + fShapeY = model.GetTensorShape(fNY); + fDimShapeY = ConvertShapeToDim(fShapeY); + } + + + if (model.Verbose()) { + if (dynamicInputs & 1) + std::cout << "Where : condition " << fNC << " is dynamic " << ConvertDimShapeToString(fDimShapeC) << "\n"; + if (dynamicInputs & 2) + std::cout << "Where : " << fNX << " is dynamic " << ConvertDimShapeToString(fDimShapeX) << "\n"; + if (dynamicInputs & 4) /* report the Y input itself: the output shape fDimShapeZ is not computed yet at this point */ + std::cout << "Where : Y " << fNY << " is dynamic " << ConvertDimShapeToString(fDimShapeY) << "\n"; + } + + // ---------------------------------------------------------------- // + // Static path: all shapes known at code-gen
time + // ---------------------------------------------------------------- // + if (dynamicInputs == 0) { + + bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY) || !UTILITY::AreSameShape(fShapeX, fShapeC); + if (broadcast) { + // find shape to broadcast between X,Y,C looking for max length + size_t lengthX = ConvertShapeToLength(fShapeX); + size_t lengthY = ConvertShapeToLength(fShapeY); + size_t lengthC = ConvertShapeToLength(fShapeC); + bool broadcastX = false, broadcastY = false, broadcastC = false; + if (lengthX >= lengthY && lengthX >= lengthC) { + fShapeZ = fShapeX; + // broadcast Y and C if different than X + broadcastY = (lengthY != lengthX); + broadcastC = (lengthC != lengthX); + } else if (lengthY >= lengthX && lengthY >= lengthC) { + fShapeZ = fShapeY; + // broadcast X and C if different than Y + broadcastX = (lengthX != lengthY); + broadcastC = (lengthC != lengthY); + } else if (lengthC >= lengthX && lengthC >= lengthY) { + fShapeZ = fShapeC; + // broadcast X and Y if different than C + broadcastX = (lengthX != lengthC); + broadcastY = (lengthY != lengthC); + } + + // Broadcast X to Z + if (broadcastX) { + fNBroadcastedX = "BC_" + fNX + "_to_" + fNZ; + if (model.IsInitializedTensor(fNX)) { + auto data = model.GetInitializedTensorData(fNX); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeZ), + std::default_delete()); + // Update the data and the shape of X + model.AddConstantTensor(fNBroadcastedX, model.GetTensorType(fNX), fShapeZ, broadcastedData); + fShapeX = fShapeZ; + } else { + // I need to prepend to shape of X the extra dimensions added for broadcasting to Z + if (fShapeX.size() < fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeX.size(); + fShapeX.insert(fShapeX.begin(), nPrepend, 1); + } + } + } + // Broadcast Y to Z + if (broadcastY) { + fNBroadcastedY = "BC_" + fNY + "_to_" + fNZ; + if (model.IsInitializedTensor(fNY)) { + auto data = 
model.GetInitializedTensorData(fNY); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeY, fShapeZ), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + model.AddConstantTensor(fNBroadcastedY, model.GetTensorType(fNY), fShapeZ, broadcastedData); + fShapeY = fShapeZ; + } else { + // I need to prepend to shape of Y the extra dimensions added for broadcasting to Z + if (fShapeY.size() < fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeY.size(); + fShapeY.insert(fShapeY.begin(), nPrepend, 1); + } + + } + } + // Broadcast C to Z + if (broadcastC) { + fNBroadcastedC = "BC_" + fNC + "_to_" + fNZ; + if (model.IsInitializedTensor(fNC)) { + auto data = model.GetInitializedTensorData(fNC); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeZ), + std::default_delete()); + // do not update tensor C but add broadcasted one (since it can be input to some other operators) + model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeZ, broadcastedData); + fShapeC = fShapeZ; + } else { + // I need to prepend to shape of C the extra dimensions added for broadcasting to Z + if (fShapeC.size() < fShapeZ.size()) { + size_t nPrepend = fShapeZ.size() - fShapeC.size(); + fShapeC.insert(fShapeC.begin(), nPrepend, 1); + } + } + } + } else { + fShapeZ = fShapeX; + } + // check case of constant output (if all inputs are defined) + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty() ? fNC : fNBroadcastedC; + auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); + model.SetNotWritableInitializedTensor(nameC); + T *dataX = nullptr; + T *dataY = nullptr; + std::vector shapeDataX; + std::vector shapeDataY; + if (model.IsInitializedTensor(fNX)) { + std::string nameX = fNBroadcastedX.empty() ? 
fNX : fNBroadcastedX; + dataX = static_cast(model.GetInitializedTensorData(nameX).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameX); + } else if (model.IsShapeTensor(fNX)) { + shapeDataX = model.GetShapeTensorValues(fNX); + } + if (model.IsInitializedTensor(fNY)) { + std::string nameY = fNBroadcastedY.empty() ? fNY : fNBroadcastedY; + dataY = static_cast(model.GetInitializedTensorData(nameY).get()); + model.SetNotWritableInitializedTensor(nameY); + } else if (model.IsShapeTensor(fNY)) { + shapeDataY = model.GetShapeTensorValues(fNY); + } + std::vector dataZ; // used in case output is constant tensor + std::vector shapeDataZ; // used in case output is a shape tensor (can be also constant if all + // dimensions are not parametric) + // if fNC (condition) is initialized we know the output is a shape or a constant tensor, + // so we can compute it at initialization and add it as a constant tensor to the model + // (and not add the operator output as intermediate tensor to the model) + bool isOutputConstantTensor = true; + if (dataX && dataY) { + dataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < dataZ.size(); i++) + dataZ[i] = (dataC[i]) ? dataX[i] : dataY[i]; + if (model.Verbose()) + std::cout << "data A and B : dataZ constant: " << ConvertValuesToString(dataZ) << std::endl; + } else if (dataX && shapeDataY.size() > 0) { + shapeDataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < shapeDataZ.size(); i++) { + shapeDataZ[i] = (dataC[i]) ? Dim{size_t(dataX[i])} : shapeDataY[i]; + isOutputConstantTensor &= !shapeDataZ[i].isParam; + } + if (model.Verbose()) + std::cout << "data A but shapeB " << ConvertDimShapeToString(shapeDataY) << " " + << isOutputConstantTensor << std::endl; + } else if (dataY && shapeDataX.size() > 0) { + shapeDataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < shapeDataZ.size(); i++) { + shapeDataZ[i] = (dataC[i]) ? 
shapeDataY[i] : Dim{size_t(dataY[i])}; + isOutputConstantTensor &= !shapeDataZ[i].isParam; + } + if (model.Verbose()) + std::cout << "data B but shapeA " << ConvertDimShapeToString(shapeDataX) << " " + << isOutputConstantTensor << std::endl; + } else if (shapeDataY.size() > 0 && shapeDataX.size() > 0) { + shapeDataZ.resize(ConvertShapeToLength(fShapeZ)); + for (size_t i = 0; i < shapeDataZ.size(); i++) { + shapeDataZ[i] = (dataC[i]) ? shapeDataX[i] : shapeDataY[i]; + isOutputConstantTensor &= !shapeDataZ[i].isParam; + } + if (model.Verbose()) + std::cout << " shapeA and B " << ConvertDimShapeToString(shapeDataX) << " shapeB " + << ConvertDimShapeToString(shapeDataY) << " " << isOutputConstantTensor << std::endl; + } + fIsOutputConstant = true; + // add as constant or shape tensor depending on the case + if (dataZ.size() > 0) + model.AddConstantTensor(fNZ, fShapeZ, dataZ.data()); + else if (shapeDataZ.size() > 0) + model.AddShapeTensor(fNZ, shapeDataZ, fShapeZ.size() == 0); + else { + fIsOutputConstant = false; + } + if (fIsOutputConstant && model.Verbose()) + std::cout << "Where op ---> " << fNZ << " " << ConvertShapeToString(fShapeZ) << " : " + << ((dataZ.size() > 0) ? ConvertValuesToString(dataZ) : ConvertDimShapeToString(shapeDataZ)) + << ((dataZ.size() > 0) ? 
" (constant)" : " (shape)") << std::endl; + + // output is a constant tensor + if (fIsOutputConstant) + fOutputTensorNames.pop_back(); + } + if (!fIsOutputConstant) { + + fDimShapeZ = ConvertShapeToDim(fShapeZ); + model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fShapeZ); + if (model.Verbose()) + std::cout << "Where : condition : " << fNC << " " << ConvertShapeToString(fShapeC) << " X " + << fNX << " " << ConvertShapeToString(fShapeX) << " Y " << fNY << " " + << ConvertShapeToString(fShapeY) << " ---> " << fNZ << " " << ConvertShapeToString(fShapeZ) + << std::endl; + } + } else { + // ---------------------------------------------------------------- // + // Dynamic path: at least one input has a parametric shape + // Need to use BroadcastShape to find output shape + // ---------------------------------------------------------------- // + auto retXY = UTILITY::MultidirectionalBroadcastShape(fDimShapeX, fDimShapeY); + fBroadcastFlag = retXY.first; + fDimShapeZ = retXY.second; + auto retCZ = UTILITY::MultidirectionalBroadcastShape(fDimShapeC, fDimShapeZ); + fBroadcastFlag |= retCZ.first; + fDimShapeZ = retCZ.second; + + // Resolve std::max params to actual input dim params (same logic as BasicBinary) + if (fBroadcastFlag & 4) { + auto IsInputDimParam = [&](const std::string &p) { + for (auto &input : model.GetInputTensorNames()) + for (auto &s : model.GetDimTensorShape(input)) + if (s.isParam && s.param == p) return true; + return false; + }; + for (size_t i = 0; i < fDimShapeZ.size(); i++) { + auto &s = fDimShapeZ[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + // prefer A dim over B dim + if (i < fDimShapeX.size() && IsInputDimParam(fDimShapeX[i].param)) { + s = (fDimShapeX[i].dim != 1) ? fDimShapeX[i] : fDimShapeY[i]; + } else if (i < fDimShapeY.size() && IsInputDimParam(fDimShapeY[i].param)) { + s = (fDimShapeY[i].dim != 1) ? 
fDimShapeY[i] : fDimShapeX[i]; + } + } + } + } + // I need to prepend to shape of X,Y,C the extra dimensions added for broadcasting to Z + if (fDimShapeX.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeX.size(); + fDimShapeX.insert(fDimShapeX.begin(), nPrepend, Dim{1}); + } + if (fDimShapeY.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeY.size(); + fDimShapeY.insert(fDimShapeY.begin(), nPrepend, Dim{1}); + } + if (fDimShapeC.size() < fDimShapeZ.size()) { + size_t nPrepend = fDimShapeZ.size() - fDimShapeC.size(); + fDimShapeC.insert(fDimShapeC.begin(), nPrepend, Dim{1}); + } + + model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fDimShapeZ); + + if (model.Verbose()) + std::cout << "Where (dynamic) : C=" << ConvertDimShapeToString(fDimShapeC) + << " A=" << ConvertDimShapeToString(fDimShapeX) + << " B=" << ConvertDimShapeToString(fDimShapeY) + << " --> Y=" << ConvertDimShapeToString(fDimShapeZ) << "\n"; + } + } + + std::string GenerateInitCode() override { + std::stringstream out; + return out.str(); + } + + std::string Generate(std::string opName) override { + + opName = "op_" + opName; + std::stringstream out; + out << SP << "\n//------ WHERE " << opName << " --> " << ConvertDimShapeToString(fDimShapeZ) << "\n"; + if (fIsOutputConstant) return out.str(); + + + // ---------------------------------------------------------------- // + // Runtime broadcast validation (dynamic shapes, flag bit 4) + // ---------------------------------------------------------------- // + if (fBroadcastFlag & 4) { + auto lengthX = ConvertDimShapeToLength(fDimShapeX); + auto lengthY = ConvertDimShapeToLength(fDimShapeY); + auto lengthC = ConvertDimShapeToLength(fDimShapeC); + out << SP << "if (" << lengthX << " != " << lengthY << " || " + << lengthX << " != " << lengthC << ") {\n"; + for (size_t i = 0; i < fDimShapeZ.size(); i++) { + // validate X vs Z + if (i < fDimShapeX.size() && fDimShapeX[i].isParam) { + out << SP 
<< SP << "if (" << fDimShapeX[i] << " != 1 && " + << fDimShapeX[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast A dim " << i << " in " << opName << "\");\n"; + } + // validate Y vs Z + if (i < fDimShapeY.size() && fDimShapeY[i].isParam) { + out << SP << SP << "if (" << fDimShapeY[i] << " != 1 && " + << fDimShapeY[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast B dim " << i << " in " << opName << "\");\n"; + } + // validate C vs Z + if (i < fDimShapeC.size() && fDimShapeC[i].isParam) { + out << SP << SP << "if (" << fDimShapeC[i] << " != 1 && " + << fDimShapeC[i] << " != " << fDimShapeZ[i] << ")\n"; + out << SP << SP << SP + << "throw std::runtime_error(\"SOFIE Where: cannot broadcast C dim " << i << " in " << opName << "\");\n"; + } + } + out << SP << "}\n"; + } + // implement now where using teh strides and looping on the different dimensions + // ---------------------------------------------------------------- // + // Generate loop(s) with per-dimension stride-based index arithmetic + // ---------------------------------------------------------------- // + auto stridesX = UTILITY::ComputeStrideFromShape(fDimShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + auto stridesC = UTILITY::ComputeStrideFromShape(fDimShapeC); + auto stridesZ = UTILITY::ComputeStrideFromShape(fDimShapeZ); + + auto buildIdxExpr = [&](const std::vector &dimShape, + const std::vector &strides, + size_t rankZ) -> std::string { + if (dimShape.empty() || + std::all_of(dimShape.begin(), dimShape.end(), + [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) + return "0"; + std::string expr; + size_t offset = rankZ - dimShape.size(); + for (size_t i = 0; i < dimShape.size(); ++i) { + if (dimShape[i].dim == 1 || dimShape[i].GetVal() == "1") continue; + expr += "idx_" + std::to_string(i + offset); + if 
(strides[i].GetVal() != "1") + expr += " * " + strides[i].GetVal(); + expr += " + "; + } + if (expr.size() >= 3) + for (int j = 0; j < 3; j++) expr.pop_back(); // remove trailing " + " + return expr.empty() ? "0" : expr; + }; + + std::string idxX = buildIdxExpr(fDimShapeX, stridesX, fDimShapeZ.size()); + std::string idxY = buildIdxExpr(fDimShapeY, stridesY, fDimShapeZ.size()); + std::string idxC = buildIdxExpr(fDimShapeC, stridesC, fDimShapeZ.size()); + + // Emit nested loops over output shape + int nloop = 0; + std::string idxZ; + // case Z is a scalar (all dimensions are 1) or Z has no dimension + if (fDimShapeZ.empty() || + std::all_of(fDimShapeZ.begin(), fDimShapeZ.end(), + [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + idxZ = "0"; + } else { + for (size_t i = 0; i < fDimShapeZ.size(); ++i) { + if (fDimShapeZ[i].dim != 1 && fDimShapeZ[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i + << " < " << fDimShapeZ[i] << "; ++idx_" << i << ") {\n"; + idxZ += "idx_" + std::to_string(i); + if (stridesZ[i].GetVal() != "1") + idxZ += " * " + stridesZ[i].GetVal(); + idxZ += " + "; + } + } + if (idxZ.size() >= 3) + for (int j = 0; j < 3; j++) idxZ.pop_back(); + } + + // Inner assignment + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNZ << "[" << idxZ << "] = " + << "tensor_" << fNC << "[" << idxC << "] ? 
" + << "tensor_" << fNX << "[" << idxX << "] : " + << "tensor_" << fNY << "[" << idxY << "];\n"; + + // Close loops + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + + return out.str(); + } + + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeZ.empty()) + throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first"); + + const std::size_t D = fShapeZ.size(); + std::size_t totalElements = ConvertShapeToLength(fShapeZ); + + std::vector shapeA_padded(D, 1); + std::vector shapeB_padded(D, 1); + std::vector shapeC_padded(D, 1); + { + size_t offA = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) shapeA_padded[offA + i] = fShapeX[i]; + size_t offB = D - fShapeY.size(); + for (size_t i = 0; i < fShapeY.size(); ++i) shapeB_padded[offB + i] = fShapeY[i]; + size_t offC = D - fShapeC.size(); + for (size_t i = 0; i < fShapeC.size(); ++i) shapeC_padded[offC + i] = fShapeC[i]; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(shapeA_padded); + auto stridesB = UTILITY::ComputeStrideFromShape(shapeB_padded); + auto stridesC = UTILITY::ComputeStrideFromShape(shapeC_padded); + auto stridesZ = UTILITY::ComputeStrideFromShape(fShapeZ); + + std::string typeName = TensorType::Name(); + std::string kname = "WhereKernel_" + opName; + + std::string op; + op = "\n//------ WHERE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ x,\n"; + op += SP + SP + SP + "T const* __restrict__ y,\n"; + op += SP + SP + SP + "uint8_t const* __restrict__ cond,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const 
global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesZ[d]) + "u) % " + + std::to_string(fShapeZ[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const c_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeC_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesC[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const x_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeA_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesA[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const y_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeB_padded[d] == 1) + op += SP + SP + SP + SP + SP + "0u"; + else + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesB[d]) + "u"; + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = cond[c_idx] ? 
x[x_idx] : y[y_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "WhereKernel_" + opName; + return SP + kname + " whereKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeZ.empty()) + throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeZ); + std::string kname = "whereKernel_" + opName; + + std::stringstream out; + out << "\n//------ WHERE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNC << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNZ << ")" + << ", static_cast(" << totalElements << "));\n"; + + return out.str(); + } + +}; + +}//SOFIE + +#endif //TMVA_SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx b/core/inc/SOFIE/SOFIEHelpers.hxx similarity index 100% rename from src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx rename to core/inc/SOFIE/SOFIEHelpers.hxx diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/core/inc/SOFIE/SOFIE_common.hxx similarity index 68% rename from src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx rename to core/inc/SOFIE/SOFIE_common.hxx index d183052..e36df0a 100644 --- 
a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/core/inc/SOFIE/SOFIE_common.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" -#include "ROOT/RSpan.hxx" +#include #include #include @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for dim to indicate that the param dimension is an expression (e.g. 
"d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,18 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "int8_t"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "uint8_t"; } +}; struct TensorMemoryInfo { std::string_view tensor_name; @@ -148,47 +198,85 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); -std::vector 
ConvertShapeToInt(std::vector shape); +std::vector ConvertShapeToInt(const std::vector & shape); -std::size_t ConvertShapeToLength(std::vector shape); +std::size_t ConvertShapeToLength(const std::vector & shape); +std::size_t ConvertShapeToLength(const std::vector & shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); + +std::string ConvertDimShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { std::stringstream ret; - if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << value; + ret << std::to_string(value); + return ret.str(); +} +// float specialization +template<> +inline std::string ConvertValToString(float value) { + std::stringstream ret; + // special case for infinity and Nan + if (std::isinf(value)) + ret << (value > 0 ? "std::numeric_limits::infinity()" : + "-std::numeric_limits::infinity()"); + else if (std::isnan(value)) + ret << "std::numeric_limits::quiet_NaN()"; + else { + ret << std::setprecision(std::numeric_limits::max_digits10); + ret << value; + } + return ret.str(); +} +// double specialization +template<> +inline std::string ConvertValToString(double value) { + std::stringstream ret; + // special case for infinity and Nan + if (std::isinf(value)) + ret << (value > 0 ? 
"std::numeric_limits::infinity()" : + "-std::numeric_limits::infinity()"); + else if (std::isnan(value)) + ret << "std::numeric_limits::quiet_NaN()"; + else { + ret << std::setprecision(std::numeric_limits::max_digits10); + ret << value; + } + return ret.str(); +} +// int64_t specialization for INT64_MIN +template<> +inline std::string ConvertValToString(int64_t value) { + std::stringstream ret; + if (value == INT64_MIN) + ret << "INT64_MIN"; + else + ret << std::to_string(value); return ret.str(); } // convert list of values in a string taking into account the precision template -std::string ConvertValuesToString(size_t n, const T * data) { +std::string ConvertValuesToString(size_t n, const T * data, size_t maxprint = -1) { std::stringstream ret; ret << "{ "; - for (size_t i = 0; i < n; i++) { - if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << data[i]; + for (size_t i = 0; i < std::min(n,maxprint); i++) { + ret << ConvertValToString(data[i]); if (i < n-1) ret << ", "; + if (i < n-1 && i == maxprint-1) ret << "..... "; } ret << "}"; return ret.str(); } template -std::string ConvertValuesToString(const std::vector & data) { - return ConvertValuesToString(data.size(), data.data()); +std::string ConvertValuesToString(const std::vector & data, size_t maxprint = 5) { + return ConvertValuesToString(data.size(), data.data(), maxprint); } class InitializedTensor { @@ -204,10 +292,18 @@ public: std::shared_ptr const &sharedptr() const { return fData; } // query if tensor comes from a Constant operator bool IsConstantTensor() const { return fConstant;} - // query if tensor needs to be written in a weight file. Constant tensors are not written in a file + // query if tensor needs to be written in a weight file. 
Constant tensors are not written in a separate file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set writable initialized tensors - i.e. tensor that must be written in a file + void SetWritable() { fIsNotWritable = false;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -223,16 +319,8 @@ public: for (std::size_t item : fShape) { fSize *= static_cast(item); } - switch (fType) { - case ETensorType::FLOAT: fSize *= sizeof(float); break; - case ETensorType::DOUBLE: fSize *= sizeof(double); break; - case ETensorType::INT32: fSize *= sizeof(int32_t); break; - case ETensorType::INT64: fSize *= sizeof(int64_t); break; - case ETensorType::BOOL: fSize *= sizeof(bool); break; - default: - throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " + - ConvertTypeToString(fType)); - } + // get size in bytes + fSize *= GetTypeSize(fType); fPersistentData = static_cast(fData.get()); } void CastPersistentToShared() @@ -271,7 +359,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +375,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace 
UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,10 +390,14 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional broadcast two shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { @@ -343,16 +441,14 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector, class ContT = std::span > -void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, ContT broadcastedData) { +template> +void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones ) size_t size = shape.size(); // Current length of the broadcasted tensor size_t curLength = data.size(); - size_t targetLength = broadcastedData.size(); - assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = 
targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -360,16 +456,16 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st bsize *= targetShape[k]; } for (size_t i = 0; i < curLength; i++) { - std::fill(broadcastedData.begin() + i*bsize, broadcastedData.begin() + (i+1)*bsize , data[i]); + std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]); } return; } - std::copy(data.begin(), data.end(), broadcastedData.begin()); + std::copy(data.begin(), data.end(), broadcastedData); // Product of the previous dimensions of targetShape size_t arrayNum = 1; // New broadcasted data: is this needed? - std::vector newData(targetLength); + std::vector newData(ConvertShapeToLength(targetShape)); for (size_t idx = 0; idx < size; idx++) { size_t dim = shape[idx]; @@ -385,8 +481,8 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) { for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) { size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength; - std::copy(broadcastedData.begin() + arrayIdx * arrayLength, - broadcastedData.begin() + (arrayIdx + 1) * arrayLength, + std::copy(broadcastedData + arrayIdx * arrayLength, + broadcastedData + (arrayIdx + 1) * arrayLength, newData.begin() + offset); } } @@ -400,12 +496,11 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st // Update current length curLength = newLength; // Update broadcasted data - std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin()); + std::copy(newData.begin(), newData.begin() + newLength, broadcastedData); } // Update the number of arrays arrayNum *= targetDim; } - //return broadcastedData; } // interface where we allocate a new array for broadcasted data @@ -413,10 +508,8 @@ template T* CreateBroadcastTensor(const T* data, const std::vector& shape, const 
std::vector& targetShape, size_t targetLength) { // newShape is an array of size equal to dimension along which we are broadcasting the tensor T* broadcastedData = new T[targetLength]; - std::span bData(broadcastedData, broadcastedData+targetLength); size_t curLength = ConvertShapeToLength(shape); - std::span inData(data, curLength); - BroadcastTensor, std::span>(inData, shape, targetShape, bData); + BroadcastTensor({data, curLength}, shape, targetShape, broadcastedData); return broadcastedData; } // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not @@ -429,14 +522,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector& shape, cons std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); } - return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); } // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations template -void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, std::span broadcastedData) { +void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { size_t curLength = ConvertShapeToLength(shape); std::span inData(const_cast(data), curLength); // Prepend shape with ones @@ -445,12 +538,10 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - 
BroadcastTensor(inData, newShape, targetShape, broadcastedData); + BroadcastTensor(inData, newShape, targetShape, broadcastedData); } - BroadcastTensor>(inData, shape, targetShape, broadcastedData); + BroadcastTensor(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,8 +710,6 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - - } // end namespace UTILITY namespace BLAS{ @@ -631,37 +720,37 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + RTensor global_data; // the global features, tensor with shape (1, num_global_features) + RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), 
global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(RTensor({})), edge_data(RTensor({})), global_data(RTensor({})), edge_index(RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +RTensor Concatenate( RTensor & t1, RTensor & t2, int axis = 0) { // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) - throw std::runtime_error("TMVA RTensor Concatenate - tensors have different memory layout"); + throw std::runtime_error("RTensor Concatenate - tensors have different memory layout"); auto & shape1 = t1.GetShape(); auto & shape2 = t2.GetShape(); if (t1.GetSize()/shape1[axis] != t2.GetSize()/shape2[axis]) { std::cout << "axis " << axis << " sizes " << t1.GetSize() << " " << t2.GetSize() << " "; std::cout << "shape 1 : " << ConvertShapeToString(t1.GetShape()); std::cout << " shape 2 : " << ConvertShapeToString(t2.GetShape()) << std::endl; - throw std::runtime_error("TMVA RTensor Concatenate - tensors have incompatible shapes"); + throw std::runtime_error("RTensor Concatenate - tensors have incompatible shapes"); } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { - throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); + RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == MemoryLayout::ColumnMajor) { + throw std::runtime_error("RTensor Concatenate is not yet supported for column major tensors"); } auto & stride1 = t1.GetStrides(); @@ -693,10 +782,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & 
data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = RTensor(data.node_data.GetShape()); + out.edge_data = RTensor(data.edge_data.GetShape()); + out.global_data = RTensor(data.global_data.GetShape()); + out.edge_index = RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData()); @@ -704,6 +793,136 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +inline void Fill(float *output, float value, int size) +{ + std::fill(output, output + size, value); +} + +template +inline void Copy(T *output, T const *input, int size) +{ + std::copy(input, input + size, output); +} + +inline void Relu(float *output, float const *input, int size) +{ + for (int i = 0; i < size; i++) { + output[i] = (input[i] > 0.0f) ? 
input[i] : 0.0f; + } +} +// function to read float from the file dealing with inf and nan values +inline float ParseFloatToken (const std::string & s) { + if (s == "inf") return std::numeric_limits::infinity(); + if (s == "-inf") return -std::numeric_limits::infinity(); + if (s == "nan") return std::numeric_limits::quiet_NaN(); + return std::stof(s); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "sofie failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "sofie failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + std::string token; + for (size_t i = 0; i < length; ++i) { + is >> token; + target[i] = ParseFloatToken(token); + } + if (is.fail()) { + throw std::runtime_error("sofie failed to read the values for tensor " + expectedName); + } +} + +//Utility functions to generate code +void EmitNestedLoops(std::stringstream &out, size_t loopRank, const std::vector shape); +void CloseNestedLoops(std::stringstream &out, size_t loopRank); + + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + +// Simple Dimension classes ans helpers to add constexpr meta info on input +// tensors to the emitted code. 
+// One dimension of a tensor shape, usable in constant expressions.
+// It is either a fixed extent (Kind::Static, value stored in `dim`) or a named
+// symbolic one (Kind::Symbolic, label stored in `name`, `dim` set to 0).
+// Both constructors are implicit, so an integer or a string literal converts
+// directly to a SingleDim. `name` is a std::string_view: it does not own the
+// characters, so the string passed in has to outlive the object.
+struct SingleDim {
+   enum class Kind {
+      Static,
+      Symbolic
+   };
+
+   Kind kind;
+   std::size_t dim;       // extent for Static dimensions, 0 for Symbolic ones
+   std::string_view name; // label for Symbolic dimensions, empty for Static ones
+
+   constexpr SingleDim(std::size_t v) : kind(Kind::Static), dim(v), name() {}
+   constexpr SingleDim(const char *v) : kind(Kind::Symbolic), dim(0), name(v) {}
+};
+
+// Non-owning view (pointer + count) over an array of SingleDim.
+struct TensorDims {
+   const SingleDim *data;
+   std::size_t size;
+
+   // Product of the `dim` fields of all entries; 1 when `size` is 0.
+   // A Symbolic entry carries dim == 0, so any symbolic dimension makes the result 0.
+   constexpr std::size_t total_size() const
+   {
+      std::size_t result = 1;
+      for (std::size_t i = 0; i < size; ++i) {
+         result *= data[i].dim;
+      }
+      return result;
+   }
+};
+
+// Build a TensorDims view from any object exposing data() and size()
+// (presumably a std::array of SingleDim - confirm at the call sites).
+// The view points into `arr`, which therefore has to outlive it.
+template
+constexpr TensorDims makeDims(Arr const &arr)
+{
+   return TensorDims{arr.data(), arr.size()};
+}
+
+// Type name to use for an output tensor: same as ConvertTypeToString, except
+// that BOOL is reported as UINT8.
+inline std::string ConvertOutputTypeToString(ETensorType t) {
+   // The std::vector<bool> is a special type that is not wrapping continuous memory.
+   // We don't want to use it as a return type.
+   if (t == ETensorType::BOOL) t = ETensorType::UINT8;
+   return ConvertTypeToString(t);
+}
+
+
+} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //TMVA_SOFIE_COMMON diff --git a/src/SOFIE_core/src/Prototype.cxx b/core/src/Prototype.cxx similarity index 100% rename from src/SOFIE_core/src/Prototype.cxx rename to core/src/Prototype.cxx diff --git a/src/SOFIE_core/src/RFunction.cxx b/core/src/RFunction.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction.cxx rename to core/src/RFunction.cxx diff --git a/src/SOFIE_core/src/RFunction_MLP.cxx b/core/src/RFunction_MLP.cxx similarity index 91% rename from src/SOFIE_core/src/RFunction_MLP.cxx rename to core/src/RFunction_MLP.cxx index eff76f6..5666f3e 100644 --- a/src/SOFIE_core/src/RFunction_MLP.cxx +++ b/core/src/RFunction_MLP.cxx @@ -10,13 +10,13 @@ namespace SOFIE { -RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function, bool activate_final, GraphType gType): +RFunction_MLP::RFunction_MLP(FunctionTarget target, int_t numLayers, Activation activation_function, bool activate_final, GraphType gType): RFunction_Update(target, gType), 
fNumLayers(numLayers), fActivationFunction(activation_function), fActivateFinal(activate_final) { // assuming all the linear layers has a kernel and a bias initialized tensors if (fActivateFinal) { if (fActivationFunction == Activation::Invalid) { - throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " + + throw std::runtime_error("SOFIE GNN doesn't currently supports the provided activation function for " + fFuncName + " update."); } function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); @@ -43,12 +43,12 @@ void RFunction_MLP::Initialize() { double beta = (fBiasTensors[i].empty()) ? 0. : 1.; op_gemm.reset(new ROperator_Gemm(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i))); function_block->AddOperator(std::move(op_gemm)); - fGemmInput = fFuncName+"Gemm"+i; + fGemmInput = fFuncName+"Gemm"+std::to_string(i); if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i))); function_block->AddOperator(std::move(op_relu)); - fGemmInput = fFuncName+"Relu"+i; + fGemmInput = fFuncName+"Relu"+std::to_string(i); } } diff --git a/src/SOFIE_core/src/RFunction_Mean.cxx b/core/src/RFunction_Mean.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction_Mean.cxx rename to core/src/RFunction_Mean.cxx diff --git a/src/SOFIE_core/src/RFunction_Sum.cxx b/core/src/RFunction_Sum.cxx similarity index 100% rename from src/SOFIE_core/src/RFunction_Sum.cxx rename to core/src/RFunction_Sum.cxx diff --git a/core/src/RModel.cxx b/core/src/RModel.cxx new file mode 100644 index 0000000..377171c --- /dev/null +++ b/core/src/RModel.cxx @@ -0,0 +1,2012 @@ +#include +#include +#include +#include +#include + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +#include "TFile.h" +#endif + +#include "SOFIE/RModel.hxx" 
+#include "SOFIE/RModelProfiler.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+
+namespace SOFIE {
+
+namespace {
+const std::string SP = " ";
+
+// Replace, in place, every occurrence of `from` inside `str` with `to`.
+// The search runs left to right and resumes after each inserted `to`, so text
+// introduced by a replacement is never scanned again.
+// An empty `from` leaves `str` unchanged.
+void ReplaceAll(std::string &str, const std::string &from, const std::string &to)
+{
+   // An empty pattern is found at every position (std::string::find returns `pos`
+   // itself), so the loop below would never terminate; there is nothing to replace.
+   if (from.empty())
+      return;
+   size_t pos = 0;
+   while ((pos = str.find(from, pos)) != std::string::npos) {
+      str.replace(pos, from.length(), to);
+      // skip the inserted text so that a `to` containing `from` is not replaced again
+      pos += to.length();
+   }
+}
+
+// True for the characters accepted inside an identifier: whatever std::isalnum
+// accepts, plus the underscore.
+bool IsIdentifierChar(char c)
+{
+   return std::isalnum(static_cast(c)) || c == '_';
+}
+
+// Returns true if s is a valid C++ identifier (can be used as a variable name).
+// Dim::param can be either a plain name (e.g. "W") or a computed expression
+// (e.g. "((W+-3)/2+1)"); only the former can be used as a C++ variable name.
+bool IsIdentifier(const std::string &s)
+{
+   if (s.empty() || std::isdigit(static_cast(s[0])))
+      return false;
+   for (char c : s)
+      if (!IsIdentifierChar(c))
+         return false;
+   return true;
+}
+
+// Get the data member name corresponding to a tensor with a given name.
+std::string TensorMember(std::string const &name)
+{
+   return "tensor_" + name;
+}
+
+} // namespace
+
+std::vector RModel::GetTensorShape(const std::string & name) const {
+   auto f = fReadyInputTensorInfos.find(name);
+   if (f != fReadyInputTensorInfos.end()) {
+      return f->second.shape;
+   }
+   auto f2 = fInitializedTensors.find(name);
+   if (f2 != fInitializedTensors.end()) {
+      return f2->second.shape();
+   }
+   auto f3 = fInputTensorInfos.find(name);
+   if (f3 != fInputTensorInfos.end()) {
+      throw std::runtime_error("SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter");
+   }
+   auto f4 = fIntermediateTensorInfos.find(name);
+   if (f4 != fIntermediateTensorInfos.end()) {
+      return f4->second.shape;
+   }
+   // case of shape tensors
+   auto f5 = fShapeTensors.find(name);
+   if (f5 != fShapeTensors.end()) {
+      // shape is vector of size 1 with size of shape values or just a scalar
+      if (f5->second.second) // check scalar flag
+         return std::vector{};
+      else
+         return
std::vector{f5->second.first.size()}; + } + + if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + throw std::runtime_error("SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); + + if (fIsSubGraph && fParentGraph) + return fParentGraph->GetTensorShape(name); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} + +std::vector RModel::GetDimTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // in case is not a dynamic tensor convert normal shape to Dim one + // for this we need to return the vector by value + return ConvertShapeToDim(GetTensorShape(name)); +} +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} + +ETensorType RModel::GetTensorType(std::string name) const { + auto f = fReadyInputTensorInfos.find(name); + if (f != fReadyInputTensorInfos.end()) { + return f->second.type; + } + auto f2 = fInitializedTensors.find(name); + if (f2 != fInitializedTensors.end()) { + return f2->second.type(); + } + auto f3 = fInputTensorInfos.find(name); + if (f3 != fInputTensorInfos.end()) { + return f3->second.type; + } + auto f4 = fIntermediateTensorInfos.find(name); + if (f4 != fIntermediateTensorInfos.end()) { + return 
f4->second.type; + } + auto f5 = fDynamicTensorInfos.find(name); + if (f5 != fDynamicTensorInfos.end()){ + return f5->second.type; + } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } + + if (fIsSubGraph && fParentGraph) + return fParentGraph->GetTensorType(name); + + throw std::runtime_error("SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName); +} + +bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { + if (fReadyInputTensorInfos.find(tensor_name) != fReadyInputTensorInfos.end()) return true; + if (fInputTensorInfos.find(tensor_name) != fInputTensorInfos.end()) return true; + if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; + if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; + if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; + if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); + return false; +} + +void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { + input_name = UTILITY::Clean_name(input_name); + if (CheckIfTensorAlreadyExist(input_name)) { + throw std::runtime_error("sofie: input tensor with name " + input_name + " already exists \n"); + } + + InputTensorInfo inputInfo { type, shape }; + fInputTensorInfos[input_name] = inputInfo; +} + +void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { + input_name = UTILITY::Clean_name(input_name); + if (CheckIfTensorAlreadyExist(input_name)) { + throw std::runtime_error("sofie: input tensor with name " + input_name + " already exists \n"); + } + TensorInfo inputInfo { type, shape }; + fReadyInputTensorInfos[input_name] = inputInfo; +} + +void 
RModel::AddInputTensorName(std::string input_name) { + fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name)); +} + +void RModel::AddOperator(std::unique_ptr op, int order_execution) +{ + AddBlasRoutines(op->GetBlasRoutines()); + auto libs = op->GetStdLibs(); + auto op_input_tensors = op->GetOpInputTensors(); + for (auto &stdlib : libs) { + AddNeededStdLib(stdlib); + } + if (order_execution >= 0) { + fOperators.insert(fOperators.begin() + order_execution, std::move(op)); + } else { + fOperators.push_back(std::move(op)); + order_execution = fOperators.size() - 1; + } + + // storing the last usage of tensors which are input to the operator + // (excluding tensors which are inputs to the model or the initialized (weights) tensors) + // We call this function during parsing so we don't have yet initialized the operators + for (size_t index = 0; index < op_input_tensors.size(); index++) { + if (!IsInitializedTensor(UTILITY::Clean_name(std::string(op_input_tensors[index]))) && + std::find(fInputTensorNames.begin(), fInputTensorNames.end(), + UTILITY::Clean_name(std::string(op_input_tensors[index]))) == fInputTensorNames.end()) { + + fIntermediateTensorFrequencyLookup[op_input_tensors[index]] = order_execution; + if (Verbose()) + std::cout << "adding order execution for " << op_input_tensors[index] << " order " << order_execution + << std::endl; + } + } +} + +void RModel::AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = UTILITY::Clean_name(tensor_name); + //NB: own data + if (CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: initialized tensor with name " + tensor_name + " already exists \n"); + } + InitializedTensor new_tensor {type, shape, data}; + fInitializedTensors[tensor_name] = new_tensor; +} + +void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = 
UTILITY::Clean_name(tensor_name); + //NB: own data + if (CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: constant tensor with name " + tensor_name + " already exists \n"); + } + InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor + fInitializedTensors[tensor_name] = new_tensor; +} + +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("sofie: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +void RModel::AddAliasTensor(const std::string & name, const std::string & origin){ + // add an alias tensor to origin + auto tensor_name = UTILITY::Clean_name(name); + auto origin_name = UTILITY::Clean_name(origin); + if (fAliasTensors.count(tensor_name) != 0) { + throw std::runtime_error("sofie: alias tensor with name " + tensor_name + " already exists \n"); + } + fAliasTensors[tensor_name] = origin_name; +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +bool RModel::IsAliasTensor(const std::string & tensor_name) const { + return fAliasTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + +bool RModel::IsInitializedTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fInitializedTensors.find(name) != fInitializedTensors.end(); +} +bool RModel::IsConstantTensor(const std::string& tensorName) const { + // a constant tensor is an initialized tensor but has the constant flag set + std::string name = 
UTILITY::Clean_name(tensorName); + auto itr = fInitializedTensors.find(name); + if (itr == fInitializedTensors.end()) return false; + return itr->second.IsConstantTensor(); +} + +// dynamic tensors include also Dim input tensors +bool RModel::IsDynamicTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); +} +bool RModel::IsDimInputTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fInputTensorInfos.find(name) != fInputTensorInfos.end(); +} +bool RModel::IsReadyInputTensor(const std::string& tensorName) const { + std::string name = UTILITY::Clean_name(tensorName); + return fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end(); +} + +// generic addition of a tensor +void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape) { + auto int_shape = ConvertShapeToInt(dim_shape); + if (!int_shape.empty()) + AddIntermediateTensor(tensor_name, type, int_shape); + else + AddDynamicTensor(tensor_name, type, dim_shape); +} + +void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape) { + tensor_name = UTILITY::Clean_name(tensor_name); + if (CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: intermediate tensor with name " + tensor_name + " already exists \n"); + } + TensorInfo new_tensor {type, shape}; + fIntermediateTensorInfos[tensor_name] = new_tensor; +} + +void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape){ + tensor_name = UTILITY::Clean_name(tensor_name); + if (CheckIfTensorAlreadyExist(tensor_name)){ + throw std::runtime_error("sofie: intermediate tensor with name " + tensor_name + " already exists \n"); + } + DynamicTensorInfo new_tensor {type, shape}; + fDynamicTensorInfos[tensor_name] 
= new_tensor; + // store shape parameter if not existing + for (auto &d : shape) { + if (d.isParam) { + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); + } + } + } +} + +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + +void RModel::AddOutputTensorNameList(std::vector outputtensornames) { + fOutputTensorNames.clear(); + for(auto& it : outputtensornames) { + fOutputTensorNames.emplace_back(UTILITY::Clean_name(it)); + } +} + +void RModel::UpdateOutputTensorList(std::vector curr_output_tensors, std::vector new_output_tensors) { + for(auto& it:curr_output_tensors) { + fOutputTensorNames.erase(std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), it), fOutputTensorNames.end()); + } + fOutputTensorNames.insert(fOutputTensorNames.end(), new_output_tensors.begin(), new_output_tensors.end()); +} + +void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { + tensor_name = UTILITY::Clean_name(tensor_name); + if (!CheckIfTensorAlreadyExist(tensor_name)) { + throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to update it"); + } + InitializedTensor new_tensor {type, shape, data}; + fInitializedTensors[tensor_name] = new_tensor; +} + +std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to get its data"); + } else { + return f->second.sharedptr(); + } +} + +void RModel::RemoveInitializedTensor(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("sofie: 
tensor " + tensor_name + " not found when trying to remove it"); + } else { + fInitializedTensors.erase(f); + } +} + +void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { + auto t = fInitializedTensors.find(tensor_name); + if (t == fInitializedTensors.end()) { + throw std::runtime_error("sofie: initialized tensor " + tensor_name + " not found when trying to get its info"); + } + t->second.SetNotWritable(); + } + +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; + + if (fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } + + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* " << TensorMember(name) << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + // case of alias tensor + if (IsAliasTensor(name)) { + continue; + } + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors 
with the string view and not the string + TensorMemoryInfo tmi = {it, tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); + + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; + + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { + + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. 
The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; + + allocated = true; + + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; + + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); + } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } + + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 
0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; + + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } + } + return code.str(); +} + +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &iv : op_input_tensors) { + // last occurrence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << iv; + + // for alias tensors replace name with its alias + std::string it{iv}; // convert view to string + if (IsAliasTensor(it)) + it = fAliasTensors[it]; + if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? 
fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of first_greter. 
Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; + } + chunk->second.tensor_name = "free"; + break; + } + } + } else { + if (fVerbose) std::cout << std::endl; + } + } +} + +void RModel::Initialize(int batchSize, bool verbose) { + std::map inputParams; + if (batchSize > 0) { + inputParams["input_size"] = batchSize; + inputParams["batch_size"] = batchSize; + inputParams["bs"] = batchSize; + } + Initialize(inputParams, verbose); + fIntermediateMemoryInfo = MemoryPoolInfo(); +} +void RModel::Initialize(const std::map & inputParams, bool verbose) { + + fVerbose = int(verbose); + + if (fIsInitialized) { + if (verbose) + std::cout << "Model is already initialized - skip initialization " << std::endl; + return; + } + fIntermediateTensorInfos.clear(); + fDynamicTensorInfos.clear(); + + + // loop on inputs and see if shape can be full specified + // if the batch size is provided it can be used to specify the full shape + // Add the full specified tensors in fReadyInputTensors collection + auto originalInputTensorInfos = fInputTensorInfos; // need to copy because we may delete elements + for (auto &input : originalInputTensorInfos) { + if (verbose) std::cout << "looking at the tensor " << input.first << std::endl; + // if a parameter (e.g. 
batch_size) is specified use for converting parametric shape in defined one + if (!inputParams.empty()) { + for (auto &d : input.second.shape) { + if (d.isParam) { + std::string pname = d.param; + if (pname == input.first + "_size") pname = "input_size"; + auto itr = inputParams.find(pname); + if (itr != inputParams.end() ) { + d = Dim{ itr->second }; + if (verbose) + std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl; + } + } + } + } + // see if shape now is fully defined + auto shape = ConvertShapeToInt(input.second.shape); + if (verbose) + std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " + << ConvertDimShapeToString(input.second.shape) << std::endl; + if (!shape.empty()) { + // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and + // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored + fInputTensorInfos.erase(input.first); + // add to the ready input tensor information the new fixed shape + AddInputTensorInfo(input.first, input.second.type, shape); + // check consistency + assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size()); + } + // store the parameters of the input tensors + else { + // store the found parametric shape parameters + for (auto &d : input.second.shape) { + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } + } + } + } + + if (verbose) { + PrintRequiredInputTensors(); + PrintDynamicTensors(); + } + + // Go through model and initialize each operator + int i = 0; + + std::vector temp_available_stack; // vector stores individual chunks of available memory that maybe reused + + // Build set of initialized tensors consumed by at least one runtime operator (need for later) + std::unordered_set 
runtimeInitializedInputs; + for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ + if (verbose) { + auto& r = *fOperators[op_idx].get(); + std::cout << "Initializing operator " << i << " " << typeid(r).name() << std::endl; + } + fOperators[op_idx]->Initialize(*this); + for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; + // check if tensor is not an initialized or output tensor and it is not already in the list + if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end()) + { + fIntermediateTensorFrequencyLookup[it] = op_idx; + } + } + // loop for non-constant operators and flag the inputs which are initialized tensors to make sure they are writable + if (!fOperators[op_idx]->IsOutputConstant()) { + for (auto &it : fOperators[op_idx]->GetOpInputTensors()) { + std::string name = std::string{it}; + if (fInitializedTensors.find(name) != fInitializedTensors.end()) { + runtimeInitializedInputs.insert(name); + } + } + } + + i++; + } + + // loop on initialized tensors and make the integers as constant to be + // not written in a weight file and check if the tensors flagged as not writable are really not writable, + // i.e. are not used by non constant operators + for (auto &it : fInitializedTensors) { + // check if not-writable tensors are really not writable, i.e. 
are not used by non constant operators + if (it.second.IsNotWritable() && runtimeInitializedInputs.find(it.first) != runtimeInitializedInputs.end()) { + it.second.SetWritable(); + if (verbose) { + std::cout << "Initialized tensor " << it.first << " is flagged as not writable but is used by non constant operators, set it as writable \n"; + } + } + // if the tensor is an integer we can flag it as constant since it will not be written in a weight file and it is considered equivalent as being created from a Constant operator + // only FLOAT tensors are written in a weight file + if (it.second.type() != ETensorType::FLOAT) { + it.second.SetConstant(); + } + } + + // check if there are initialized tensors to write in a weight file + if (fUseWeightFile) { + bool modelHasWeights = false; + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor()) { + modelHasWeights = true; + break; + } + } + if (!modelHasWeights) + fUseWeightFile = false; + } + + // update fIntermediateTensorFrequencyLookup for alias tensors + for (auto & it : fAliasTensors) { + if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue; + if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() ) + fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first]; + else { + // take the largest one + fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] ); + } + } + + fIsInitialized = true; +} + +void RModel::InitializeSubGraph(std::shared_ptr graph) { + // add the subgraph to the list + fSubGraphs.push_back(graph); + //this needs to be done before initializing + graph->fParentGraph = this; + graph->fIsSubGraph = true; + + graph->Initialize(fBatchSize, fVerbose); + // set the same options as parent model + graph->fWeightFile = fWeightFile; + graph->fUseWeightFile = fUseWeightFile; 
+ graph->fUseSession = fUseSession; + // add needed blas routines and libs + std::vector blasRoutines; + for (auto & e : graph->fNeededBlasRoutines) + blasRoutines.push_back(e); + AddBlasRoutines(blasRoutines); + for (auto e : graph->fNeededStdLib) + AddNeededStdLib(e); + + // add parent input tensors to current graph + for (auto & name : fInputTensorNames) + graph->fInputTensorNames.emplace_back(name); + + // clean graph name + graph->fName = UTILITY::Clean_name(graph->fName); + +} + +// Function to generate the code for declaring and initializing constant tensors +// This is for tensors which are not part of weight files and can be created from the Constant operator +template +std::string GenerateConstantTensorCode(const std::pair &t) +{ + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + // avoid using stack sizes for constant tensors to reduce compilation time + // also for weights which can be broadcasted do not use stack but allocate as a std::vector + bool allocateOnStack = (length > 100 || t.second.IsWeightTensor()) ? 
false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + do { + sameData = (data[idx] == data[idx - 1]); + idx++; + } while (sameData && idx < length); + } + if (allocateOnStack) { + strs << type << " fTensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; + strs << type << " * " << TensorMember(t.first) << " = fTensor_" + t.first + ";\n"; + } else { + strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << type << " * " << TensorMember(t.first) << " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); +} + +void RModel::GenerateInitializedTensorInfo() +{ + if (!fInitializedTensors.empty()) + fGC += "// initialized (weights and constant) tensors\n"; + + // here are constant tensor or initialized ones which are not weights (e.g. 
int64_t tensors ) + for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; + size_t length = ConvertShapeToLength(i.second.shape()); + if (!fUseWeightFile || i.second.IsConstantTensor() || !i.second.IsWeightTensor() || i.second.type() != ETensorType::FLOAT ) { + if (i.second.type() == ETensorType::FLOAT) { + // check if NaN of Inf are inside tensor data + bool hasInfOrNaN = false; + const float *data = i.second.data(); + for (size_t idx = 0; idx < length; idx++) { + if (std::is_floating_point::value) { + if (std::isinf(data[idx]) || std::isnan(data[idx])) { + hasInfOrNaN = true; + break; + } + } + } + if (hasInfOrNaN) + AddNeededStdLib("limits"); + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(float); + } else if (i.second.type() == ETensorType::INT64) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(int64_t); + } else if (i.second.type() == ETensorType::INT32) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(int32_t); + } else if (i.second.type() == ETensorType::BOOL || i.second.type() == ETensorType::UINT8 ) { + fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += length * sizeof(uint8_t); + } + + + } else { + // case of tensors which are read from a file + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + fGC += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += length * sizeof(float); + } + } + } +} + +void RModel::GenerateIntermediateMemoryPool() { + if (fIntermediateMemoryInfo.total_stack.empty()) return; + fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; + + // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors + // of other data types + auto const &totalStack = fIntermediateMemoryInfo.total_stack; 
+ const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; +} + +void RModel::GenerateIntermediateTensorInfo() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + for (auto &i : fIntermediateTensorInfos) { + bool is_alias = (IsAliasTensor(i.first)); + if (i.second.type == ETensorType::BOOL && !is_alias) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + continue; + } + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()); + + if (((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names) ) && !is_alias) { + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "float * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; + } + else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "double * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; + } + else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += 
"std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; + tensor_declaration_block += "int64_t * " + TensorMember(i.first) + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; + } + } + if (is_alias) { + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; + } + + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + // add also the dynamic tensors (only declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + for (auto &i : fDynamicTensorInfos) { + fGC += ConvertTypeToString(i.second.type) + " * " + TensorMember(i.first) + " = nullptr;\n"; + } + fGC += "//--- dynamic tensors pool\n"; + fGC += "std::vector fDynamicMemoryPool;\n"; + } +} + +// generate code for specific operator declarations to be defined in the Session class +void RModel::GenerateOperatorDeclarations() { + std::string strcode; + for (auto & op : fOperators) { + strcode += op->GenerateDeclCode(); + } + if (strcode.empty()) return; + fGC += "\n//---- operator declarations \n"; + fGC += strcode; + fGC += "\n"; +} + +void RModel::GenerateDynamicTensorInfo() +{ + // generate code for allocating dynamic tensors using the greedy memory allocations + if (fDynamicTensorInfos.empty()) + return; + + if (fVerbose) { + std::cout << "generating code for dynamic tensor management" << std::endl; + PrintDynamicTensors(); + } + + std::stringstream out; + out << "// dynamic tensor memory management\n"; + out << SP << "std::vector dynamicTensorInfos;\n"; + out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n"; + + // loop on all the operators to find begin/end life of the tensors + int op_index = 0; + std::vector> tensors; + tensors.reserve(fDynamicTensorInfos.size()); + for (auto & op : fOperators) { + 
// loop on output tensors - + for (auto &it : op->GetOpOutputTensors()) { + if (fVerbose) { + auto op_ptr = op.get(); + std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; + } + // check if is a dynamic tensor and not an alias tensor or output tensor + std::string name = std::string(it); + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name) + && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end()) { + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + auto type = GetTensorType(name); + size_t type_size = GetTypeSize(type); + int begin = op_index; + int end = fOperators.size(); + // look for end + auto it_lookup = fIntermediateTensorFrequencyLookup.find(name); + if (it_lookup != fIntermediateTensorFrequencyLookup.end()) + end = it_lookup->second + 1; // end is last time used + 1 + // // some tensors (like xcol in convolutions) are just used within the operators + // if (end == 0 && begin > 0) end = begin+1; + + if (begin> end) { + std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << " " << " end " << end << std::endl; + throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin"); + } + + // write in code + out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });" + << " // tensor_" << name << std::endl; + tensors.push_back({name,type}); + } + } + op_index++; // increment operator index + } + out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n"; + out << "// allocating now the memory\n"; + out << SP << "fDynamicMemoryPool = std::vector(memory_result.total_bytes);\n"; + out << SP << "int idx = 0;\n"; + for (auto & it : tensors) { + out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " 
*>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n"; + } + // check that all dynamic tensors are covered + bool missingTensor = false; + for (auto &i : fDynamicTensorInfos) { + if (IsAliasTensor(i.first)) continue; + if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) != fOutputTensorNames.end()) continue; + if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { + std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; + missingTensor = true; + } + } + if (missingTensor) + throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list"); + + fGC += out.str(); +} + +/// Check if a given parameter is used for the shape of an input tensor. +bool RModel::IsInputTensorShapeParam(std::string const ¶mName) const +{ + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.param == paramName) + return true; + } + } + } + return false; +} + +/// Collects all identifiers starting with "tensor_" in the input code, +/// provided that the occurrence is not immediately preceded by a +/// character that is valid in a C++ identifier. Excludes input and output tensor names. +/// Returns a deduplicated std::vector. 
+std::vector RModel::CollectTensorMemberNames(const std::string &input) +{ + const std::string target = "tensor_"; + + std::vector result; + + for (size_t i = 0; i < input.size();) { + + bool doCollect = false; + + if (i + target.size() <= input.size() && input.compare(i, target.size(), target) == 0 && + (i == 0 || !IsIdentifierChar(input[i - 1]))) { + + doCollect = true; + + std::size_t j = i + target.size(); + + // Extend to full identifier + while (j < input.size() && IsIdentifierChar(input[j])) + ++j; + + std::string fullName = input.substr(i, j - i); + + // Exclude input tensor names + for (std::string const &name : fInputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + + // Exclude output tensor names + if (doCollect) { + for (std::string const &name : fOutputTensorNames) { + if (fullName == target + name) { + doCollect = false; + break; + } + } + } + + if (doCollect) { + result.push_back(fullName); + } + + i = j; // advance past the identifier + } else { + ++i; + } + } + + // Deduplicate (order not preserved) + std::sort(result.begin(), result.end()); + result.erase(std::unique(result.begin(), result.end()), result.end()); + + return result; +} + +std::string RModel::GenerateInferSignature(bool isdecl) { + // generate the infer signature given the inputs: eg. "float * tensor1, float * tensor2" + // if (decl = false) generate only calling signature (tensor1,tensor2,....) 
+ std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + // need to check if the input parameters is already existing in another input tensor + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + std::string type = ConvertTypeToString(GetTensorType(name)); + if (type == "other") + throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + rGC += type + " const* "; + } + rGC += "tensor_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," + return rGC; +} + +namespace { + +std::string typeForOutput(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. 
+ if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); +} + +std::string memberNameForDimShape(std::string name) +{ + if (!name.empty()) { + name[0] = std::toupper(static_cast(name[0])); + } + name = "f" + name; + return name; +} + +} + +void RModel::GenerateOutput() +{ + size_t outputSize = fOutputTensorNames.size(); + // assume output types are all the same + + bool sameOutputTypes = true; + std::string inferReturnType; // type return by infer function + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "std::vector<" + typeForOutput(eFirstOutputType) + ">"; + } else { + // if all output types are the same we return an std::vector - otherwise a tuple + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + if (sameOutputTypes) + fGC += "std::vector>"; + else { + inferReturnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + inferReturnType += "std::vector<" + typeForOutput(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; + } + inferReturnType += ">"; + fGC += inferReturnType; + } + } + + fGC += " infer(" + GenerateInferSignature() + "){\n"; + + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + std::string n; + if(!isDynamic) { + n = std::to_string(ConvertShapeToLength(GetTensorShape(name))); + } else { + std::string dimLen = ConvertDimShapeToLength(GetDynamicTensorShape(name)); + // Use the session member (fXxx) when any dim is a runtime-computed identifier + // (e.g. NonZero count). For expression-type dims derived from input shapes + // (e.g. "((W+-3)/2+1)"), use the expression directly. 
+ // for input shape parameters we don't need to use the session member since it is passed as argument to the infer function and it is not a runtime computed value + bool hasRuntimeParam = false; + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && IsIdentifier(dim.param) && !IsInputTensorShapeParam(dim.param)) + hasRuntimeParam = true; + } + n = hasRuntimeParam ? memberNameForDimShape(dimLen) : dimLen; + } + std::string outputName = "output_tensor_" + name; + fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > " + outputName + "(" + n + ");\n"; + doInferArgs += " " + outputName + ".data(),"; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) { + fGC += SP + "size_t " + dim.param + " = 0;\n"; + doInferArgs += " " + dim.param + ","; + } + } + } + } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + // verifying if the dynamic parameters are within allowed range + std::unordered_set input_params_checked; + std::string dynamic_parameters_check = ""; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && input_params_checked.count(pName) == 0) { + std::string memberName = memberNameForDimShape(d.param); + dynamic_parameters_check += d.param + " > " + memberName + " || "; + input_params_checked.insert(pName); + fGC += SP + "if (" + d.param + " > " + memberName + ") {\n"; + fGC += SP + SP + "throw std::runtime_error(\"sofie: dynamic input tensor shape parameter " + + d.param + " exceeds the initialized maximum allowed shape.\");\n"; + fGC += SP + "}\n"; + } + } + } + } + + if (fUseSession) { + fGC += SP + "doInfer(*this, " + doInferArgs + ");\n"; + } else { + fGC += SP + "doInfer(" + doInferArgs + ");\n"; + } + + // If the output tensors have dynamic sizes, now is the time to set 
them + for (std::string const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if (isDynamic) { + std::string outputName = "output_tensor_" + name; + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + outputName + ".resize(" + tensor_size + ");\n"; + } + } + + fGC += SP + "return {"; + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) + fGC += ","; + } + fGC += "};\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode() +{ + std::string sessionName = !fIsSubGraph ? "Session" : "Session_" + fName; + + if (fUseSession && !fIsGNNComponent) { + // forward declare session struct + fGC += "struct " + sessionName + ";\n"; + } + + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + doInferSignature += typeForOutput(GetTensorType(name)) + " *tensor_" + name + ","; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) + doInferSignature += " size_t &" + dim.param + "_output,"; + } + } + } + doInferSignature.back() = ' '; + + if (fUseSession) { + doInferSignature = sessionName + " const &session, " + doInferSignature; + } + + doInferSignature = "inline void doInfer(" + doInferSignature + ")"; + + if (!fIsGNNComponent) { + // forward declare inference implementation + fGC += doInferSignature + ";\n"; + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + if (fUseSession && !fIsGNNComponent) { + fGC += "struct " + sessionName + " {\n"; + } + + // generate code for declaring the initialized tensors + 
GenerateInitializedTensorInfo(); + + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } + + // to check remaining unused fragments after memory allocation (lesser the better) + // for (const auto &it: fIntermediateMemoryInfo.available_stack){ + // std::cout<<"chunk_idx: "<fName + " fSession_" + graph->fName + ";\n"; + } + + // Generate code for Session constructor + if (fUseSession) { + // add here specific operator code that needs to define session data members + fGC += "\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + std::string opName = std::to_string(id); + fGC += fOperators[id]->GenerateSessionMembersCode(opName); + } + fGC += "\n"; + // here add initialization and reading of weight tensors + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) { + fileName += ".dat"; + } + if (fWeightFile == WeightFileType::RootBinary) { + fileName += ".root"; + } + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + // no need to pass weight file since it is not used + // keep passing a string for compatibility + fGC += sessionName + "(std::string = \"\""; + } + // add initialization of shape parameters + // assume all parameters are of type size_t + if (!fDimShapeNames.empty()) { + // need to use same order as in 
infer function not alphabetical one + for (auto &p : fDimShapeNames) { + fGC += ",\n"; + fGC += " size_t " + p + " = " + fShapeParams[p]; + } + } + fGC += ") {\n"; + + // initializing dynamic parameters + if (!fDimShapeNames.empty()) { + fGC += "\n\n"; + std::sort(fDimShapeNames.begin(), fDimShapeNames.end()); + for (const auto &p : fDimShapeNames) { + fGC += " " + memberNameForDimShape(p) + " = " + p + ";\n"; + } + } + // add some extra code needed for initialization of dynamic parameters + fGC += fExtraCodeForDimShapes; + + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(fReadPos); + fGC += "\n"; + // fUseWeightFile = fUseWeightFile; + } + + // now we have passed the parameters we can allocate the dynamic tensors + GenerateDynamicTensorInfo(); + + // add here initialization code for operator + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode(); + } + + fGC += "}\n\n"; + } + + // generate the inference overload that returns an output struct + GenerateOutput(); + + // generate profiling utility functions inside the Session struct + if (fProfile) { + fGC += RModelProfiler::GenerateUtilityFunctions(); + } + + // end of session + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n\n"; + + GenerateRequiredInputTensorInfo(); + } + + fGC += doInferSignature + " {\n"; + fGC += "\n"; + + // generate the inference code + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + if (fOutputTensorNames.size() == 0) + throw std::runtime_error("sofie: output size=0 are not supported"); + + if (fProfile) { + fGC += RModelProfiler::GenerateBeginInferCode(); + } + + std::string allOperatorCode; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... 
" << op_idx << std::endl; + if (fProfile) { + allOperatorCode += RModelProfiler::GenerateOperatorCode(*fOperators[op_idx], op_idx); + } else { + allOperatorCode += fOperators[op_idx]->Generate(std::to_string(op_idx)); + } + } + + // If the generated code users members of the session struct, use the + // local variable name that we're using for the session: + ReplaceAll(allOperatorCode, "this->", "session."); + + if (fUseSession && !fIsGNNComponent) { + // Collect all "tensor_*" data members that are not input or output tensors + std::vector tensorMemberNames = CollectTensorMemberNames(allOperatorCode); + for (auto const& name: tensorMemberNames) { + fGC += " auto &" + name + " = session." + name + ";\n"; + } + fGC += "\n"; + } + + fGC += allOperatorCode; + + if (fProfile) { + fGC += RModelProfiler::GenerateEndInferCode(); + } + + for (auto const& name: fOutputTensorNames) { + bool isDynamic = fDynamicTensorInfos.count(name) > 0; + if(isDynamic) { + for (auto const &dim : GetDynamicTensorShape(name)) { + if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) + fGC += " " + dim.param + "_output = " + dim.param + ";\n"; + } + } + if(IsConstantTensor(name)) { + std::string t = "session.tensor_" + name; + size_t length = ConvertShapeToLength(fInitializedTensors[name].shape()); + fGC += " std::copy(" + t + ", " + t + " + " + std::to_string(length) + ", tensor_" + name + ");\n"; + } + } + fGC += "\n"; + + fGC += "}\n"; +} + +void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) +{ + fProfile = static_cast(options & static_cast>(Options::kProfile)); + fVerbose = verbose; + fBatchSize = batchSize; + fReadPos = pos; + + // session flag is used in operator initialize + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = false; + fWeightFile = WeightFileType::None; + } + if 
(static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "sofie: RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if (static_cast>(Options::kGNN) & options) + fIsGNN = true; + if (static_cast>(Options::kGNNComponent) & options) + fIsGNNComponent = true; + + if (fProfile) + RModelProfiler::AddNeededStdLibs(*this); + + // initialize the model including all operators and sub-graphs + Initialize(batchSize, verbose); + + // if having dynamic tensor we need to have a Session + if (!fDynamicTensorInfos.empty()) { + fUseSession = true; + if (verbose) + std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl; + } + + std::string hgname; + if (!fIsGNNComponent && !fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo(hgname); + } + + // generate first code for the subgraphs + for (auto &graph : fSubGraphs) { + if (fVerbose) + std::cout << "generate session code for subgraph " << graph->fName << std::endl; + graph->GenerateSessionCode(); + fGC += graph->fGC; + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + // generate main session code + GenerateSessionCode(); + + if (!fIsGNNComponent && !fIsSubGraph) { + fGC += ("} //TMVA_SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::ReadInitializedTensorsFromFile(long pos) { + // generate the code to read initialized tensors from a text data file + if (fWeightFile == WeightFileType::Text) { + // check if there are tensors to write + + if (!fUseWeightFile) return; + + fGC += " std::ifstream f;\n"; + fGC += " f.open(filename);\n"; + fGC += " if (!f.is_open()) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open file \" + filename + \" for input weights\");\n"; + fGC += " }\n"; + + if(fIsGNNComponent) { + fGC += " 
f.seekg(" + std::to_string(pos) + ");\n"; + } + + fGC += " using SOFIE::ReadTensorFromStream;\n"; + + // loop on tensors and parse the file + for (auto& i: fInitializedTensors) { + // skip Constant and shape tensors (not written in a file) + if (!i.second.IsWeightTensor()) continue; + std::string tensor_name = "tensor_" + i.first; + if (i.second.type() == ETensorType::FLOAT) { + std::string length = std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; + } else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); + } + } + fGC += " f.close();\n"; + } + + // generate the code to read initialized tensors from a ROOT data file + if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY + fGC += " {\n"; + fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; + fGC += " if (!rootFile->IsOpen()) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open ROOT file for input weights\");\n"; + fGC += " }\n"; + + std::string dirName = fName + "_weights"; + fGC += " if (!rootFile->GetKey(\"" + dirName + "\")) {\n"; + fGC += " throw std::runtime_error(\"sofie failed to open ROOT directory for input weights\");\n"; + fGC += " }\n"; + + for (auto &i : fInitializedTensors) { + // skip Constant and shape tensors + if (!i.second.IsWeightTensor()) continue; + fGC += " {\n"; + std::string tensor_name = "tensor_" + i.first; + if (i.second.type() == ETensorType::FLOAT) { + fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + "/" + tensor_name + "\"));\n"; + } else if (i.second.type() == ETensorType::DOUBLE) { + fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + + "/" + tensor_name + "\"));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += 
" fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; + fGC += dirName + "/" + tensor_name + "\"));\n"; + } else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file"); + } + fGC += " }\n"; + } + fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY + } +} + +long RModel::WriteInitializedTensorsToFile(std::string filename) { + // Determine the file extension based on the weight file type + std::string fileExtension; + switch (fWeightFile) { + case WeightFileType::None: + fileExtension = ".dat"; + break; + case WeightFileType::RootBinary: + fileExtension = ".root"; + break; + case WeightFileType::Text: + fileExtension = ".dat"; + break; + } + + // If filename is empty, use the model name as the base filename + if (filename.empty()) { + filename = fFileName + fileExtension; + } + + // Write the initialized tensors to the file + if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY + if(fIsGNNComponent || fIsGNN) { + throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); + } + std::unique_ptr outputFile(TFile::Open(filename.c_str(), "UPDATE")); + + std::string dirName = fName + "_weights"; + // check if directory exists, in case delete to replace with new one + if (outputFile->GetKey(dirName.c_str())) + outputFile->rmdir(dirName.c_str()); + + auto outputDir = outputFile->mkdir(dirName.c_str()); + + for (const auto& item : fInitializedTensors) { + // skip Constant tensors and tensors which are not writable (e.g. 
shape tensors) + if (!item.second.IsWeightTensor()) continue; + std::string tensorName = "tensor_" + item.first; + size_t length = 1; + length = ConvertShapeToLength(item.second.shape()); + if(item.second.type() == ETensorType::FLOAT) { + const float* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else if(item.second.type() == ETensorType::DOUBLE) { + const double* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else if(item.second.type() == ETensorType::INT64) { + const int64_t* data = item.second.data(); + std::vector tensorDataVector(data, data + length); + outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); + } + else { + throw std::runtime_error("sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) + + " cannot be written to a ROOT file"); + } + } + outputFile->Write(filename.c_str()); + + // this needs to be changed, similar to the text file + return -1; + +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY + } else if (fWeightFile == WeightFileType::Text) { + std::ofstream f; + if(fIsGNNComponent) { + // appending all GNN components into the same file + f.open(filename, std::ios::app); + } else { + f.open(filename); + } + if (!f.is_open()) + throw + std::runtime_error("sofie failed to open file " + filename + " for tensor weight data"); + for (auto& i: fInitializedTensors) { + // skip Constant tensors and not writable tensors (e.g. 
shape tensors) + if (!i.second.IsWeightTensor()) { + continue; + } + size_t length = ConvertShapeToLength(i.second.shape()); + std::string tensor_name = "tensor_" + i.first; + f << tensor_name << " " << length << "\n"; + if (i.second.type() == ETensorType::FLOAT) { + const float * data = i.second.data(); + for (size_t idx = 0; idx < length; idx++) { + // round to zero sub-normal values + float value = data[idx]; + if (value != 0. && std::abs(value) < std::numeric_limits::min() ) value = 0; + // handle non-finite values explicitly + if (std::isinf(value)) + f << (value > 0 ? "inf" : "-inf"); + else if (std::isnan(value)) + f << "nan"; + else + f << std::setprecision(std::numeric_limits::max_digits10) << value; + f << ( (idx < length-1) ? " " : "\n" ); + } + } + else { + throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); + } + if (f.fail()) + throw std::runtime_error("sofie failed to write tensor data to file for " + tensor_name); + } + long curr_pos = f.tellp(); + f.close(); + return curr_pos; + } else { + return -1; + } +} + +void RModel::PrintSummary() const { + std::cout << "Summary of model " << GetName() << std::endl; + for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ + auto& r = *fOperators[op_idx].get(); + std::string raw_name = typeid(r).name(); + // look for ROperator_NAME + std::string name = raw_name.substr(raw_name.find("ROperator_")+10, raw_name.size()); + std::cout << op_idx << " " << name << " : "; + for (auto & t_in : r.GetOpInputTensors()) std::cout << t_in << " "; + std::cout << " ----> "; + for (auto & t_out : r.GetOpOutputTensors()) std::cout << t_out << " "; + std::cout << std::endl; + } +} + +/// To emit the dimensions of the input tensors as a data member of a session, +/// which is helpful when validating the inference inputs. 
+void RModel::GenerateRequiredInputTensorInfo() +{ + fGC += "\n// Input tensor dimensions\n"; + fGC += "using SOFIE::SingleDim;\n"; + fGC += "using SOFIE::TensorDims;\n"; + fGC += "using SOFIE::makeDims;\n\n"; + bool hasDynamicInputTensors = false; + + for (std::size_t iInput = 0; iInput < fInputTensorNames.size(); ++iInput) { + auto const &name = fInputTensorNames[iInput]; + if (IsDimInputTensor(name)) { + hasDynamicInputTensors = true; + } + std::vector shape = GetDimTensorShape(name); + fGC += "constexpr std::array dim_" + name + "{"; + for (std::size_t iDim = 0; iDim < shape.size(); ++iDim) { + auto const &dim = shape[iDim]; + if (dim.isParam) { + fGC += "SingleDim{\"" + dim.GetVal() + "\"}"; + } else { + fGC += "SingleDim{" + dim.GetVal() + "}"; + } + if (iDim != shape.size() - 1) { + fGC += ", "; + } + } + fGC += "};\n"; + } + fGC += "\nconstexpr std::array inputTensorDims{\n"; + for (std::size_t iInput = 0; iInput < fInputTensorNames.size(); ++iInput) { + auto const &name = fInputTensorNames[iInput]; + fGC += SP + "makeDims(dim_" + name + ")"; + if (iInput == fInputTensorNames.size() - 1) { + fGC += "\n"; + } else { + fGC += ",\n"; + } + } + fGC += "};\n"; + + fGC += + "\nconstexpr bool hasDynamicInputTensors{" + std::string{hasDynamicInputTensors ? 
"true" : "false"} + "};\n\n"; + + fGC += "\n// Output tensor dimensions\n"; + bool hasDynamicOutputTensors = false; + for (std::size_t iOutput = 0; iOutput < fOutputTensorNames.size(); ++iOutput) { + auto const &name = fOutputTensorNames[iOutput]; + if (IsDynamicTensor(name)) { + hasDynamicOutputTensors = true; + } + std::vector shape = GetDimTensorShape(name); + fGC += "constexpr std::array dim_" + name + "{"; + for (std::size_t iDim = 0; iDim < shape.size(); ++iDim) { + auto const &dim = shape[iDim]; + if (dim.isParam) { + fGC += "SingleDim{\"" + dim.GetVal() + "\"}"; + } else { + fGC += "SingleDim{" + dim.GetVal() + "}"; + } + if (iDim != shape.size() - 1) { + fGC += ", "; + } + } + fGC += "};\n"; + } + fGC += "\nconstexpr std::array outputTensorDims{\n"; + for (std::size_t iOutput = 0; iOutput < fOutputTensorNames.size(); ++iOutput) { + auto const &name = fOutputTensorNames[iOutput]; + fGC += SP + "makeDims(dim_" + name + ")"; + if (iOutput == fOutputTensorNames.size() - 1) { + fGC += "\n"; + } else { + fGC += ",\n"; + } + } + fGC += "};\n"; + fGC += + "\nconstexpr bool hasDynamicOutputTensors{" + std::string{hasDynamicOutputTensors ? 
"true" : "false"} + "};\n\n"; +} + +void RModel::PrintRequiredInputTensors() const { + std::cout << "Model requires following inputs:\n"; + for (auto& inputInfo: fInputTensorInfos) { + std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; + std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { + if (inputInfo.second.shape[i].isParam) { + std::cout << inputInfo.second.shape[i].param; + } else { + std::cout << inputInfo.second.shape[i].dim ; + } + if (i < inputInfo.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + + for (auto& inputInfo: fReadyInputTensorInfos) { + std::cout << "Fully Specified Tensor name: " << inputInfo.first << "\t"; + std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { + std::cout << inputInfo.second.shape[i]; + if (i < inputInfo.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintInitializedTensors() const { + std::cout << "Model initialized the following tensors:\n"; + for (auto& it: fInitializedTensors) { + std::cout << "Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type()) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape().size(); i++) { + std::cout << it.second.shape()[i]; + if (i < it.second.shape().size() - 1) std::cout << ","; + } + std::cout << "]"; + if (it.second.IsConstantTensor()) std::cout << " (Constant)"; + if (it.second.IsNotWritable()) std::cout << " (Not Writable)"; + std::cout << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintIntermediateTensors() const { + std::cout << "Model specify the following intermediate tensors:\n"; + for (auto& it: fIntermediateTensorInfos) { + std::cout << 
"Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape.size(); i++) { + std::cout << it.second.shape[i]; + if (i < it.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintDynamicTensors() const { + std::cout << "Model specify the following dynamic tensors:\n"; + for (auto& it: fDynamicTensorInfos) { + std::cout << "Tensor name: \"" << it.first << "\"\t"; + std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; + std::cout << "shape: ["; + for (size_t i = 0; i < it.second.shape.size(); i++) { + std::cout << it.second.shape[i].GetVal(); + if (i < it.second.shape.size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + } + std::cout << "\n"; +} + +void RModel::PrintOutputTensors() const { + std::cout << "Model specify the following output tensors:\n"; + for (auto& it: fOutputTensorNames) { + std::cout << "Tensor name: \"" << it << "\"\t"; + try { + auto shape = GetDimTensorShape(it); + std::cout << "with shape: " << ConvertDimShapeToString(shape) << std::endl; + } catch (...) 
{ + std::cout << "with shape not yet defined" << std::endl; + } + } + std::cout << "\n"; +} + +void RModel::HeadInitializedTensors(std::string name, int n_print) { + auto it = fInitializedTensors.find(name); + if (it == fInitializedTensors.end()) { + std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl; + return; + } + + std::cout << "Tensor name: " << it->first << "\t"; + std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t"; + int length =1; + std::cout << "shape: ["; + for (size_t i = 0; i < it->second.shape().size(); i++) { + std::cout << it->second.shape()[i]; + length *= it->second.shape()[i]; + if (i < it->second.shape().size() - 1) std::cout << ","; + } + std::cout << "]" << std::endl; + bool ellipsis = true; + if (n_print > length) { + n_print = length; + ellipsis = false; + } + + std::cout << "data: [" << std::endl; + if (it->second.type() == ETensorType::FLOAT) { + auto converted_data = it->second.data(); + for (int i =0; i < n_print; i++) { + std::cout << converted_data[i]; + if (i < n_print - 1) std::cout << " ,"; + } + } + if (ellipsis) std::cout << ", ..."; + std::cout << "]" << std::endl; + +} + +void RModel::OutputGenerated(std::string filename, bool append) { + + RModel_Base::OutputGenerated(filename, append); + + // write weights in a text file + if (fUseWeightFile) { + if (!filename.empty()) { + size_t pos = filename.find(".hxx"); + if (fWeightFile == WeightFileType::Text) + filename.replace(pos, 4, ".dat"); + if (fWeightFile == WeightFileType::RootBinary) { + filename = filename.erase(pos, 4); + filename += ".root"; + } + } else { + filename = fName; + filename += fWeightFile == WeightFileType::Text ? 
".dat" : ".root"; + } + WriteInitializedTensorsToFile(filename); + } +} + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +void RModel::Streamer(TBuffer &R__b) { + if (R__b.IsReading()) { + RModel::Class()->ReadBuffer(R__b, this); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); + } + } + else { + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); + } + RModel::Class()->WriteBuffer(R__b, this); + } +} +#endif + +}//SOFIE diff --git a/core/src/RModelProfiler.cxx b/core/src/RModelProfiler.cxx new file mode 100644 index 0000000..25efbd2 --- /dev/null +++ b/core/src/RModelProfiler.cxx @@ -0,0 +1,121 @@ +#include "SOFIE/RModelProfiler.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModelProfiler::AddNeededStdLibs(RModel &model) +{ + model.AddNeededStdLib("chrono"); + model.AddNeededStdLib("vector"); + model.AddNeededStdLib("string"); + model.AddNeededStdLib("map"); + model.AddNeededStdLib("iostream"); + model.AddNeededStdLib("iomanip"); + model.AddNeededStdLib("algorithm"); + model.AddNeededStdLib("cmath"); + model.AddNeededStdLib("tuple"); +} + +std::string RModelProfiler::GenerateSessionMembers() +{ + std::string gc; + gc += "// Maps an operator name to a vector of its execution times (in microseconds).\n"; + gc += "mutable std::map> fProfilingResults;\n\n"; + return gc; +} + +std::string RModelProfiler::GenerateUtilityFunctions() +{ + std::string gc; + + gc += " // Print profiling results sorted by average time (highest first).\n"; + gc += " void PrintProfilingResults(bool order = true) const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " std::cout << \"No profiling results to display.\" << std::endl;\n"; + gc += " return;\n"; + gc += " }\n"; + gc += " std::vector> averageResults;\n"; + gc += " std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n"; + gc += " std::cout << \" CPU PROFILING RESULTS\" << std::endl;\n"; + gc += " std::cout << std::string(60, '=') << std::endl;\n"; + gc 
+= " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0, sum2 = 0.0;\n"; + gc += " for (double time : op.second) { sum += time; sum2 += time*time; }\n"; + gc += " double average = sum / op.second.size();\n"; + gc += " double stddev = (op.second.size() > 1) ? std::sqrt((sum2 - sum*average) / (op.second.size()-1)) : 0.0;\n"; + gc += " averageResults.push_back({op.first, average, stddev, (int)op.second.size()});\n"; + gc += " }\n"; + gc += " if (order) {\n"; + gc += " std::sort(averageResults.begin(), averageResults.end(),\n"; + gc += " [](const auto& a, const auto& b){ return std::get<1>(a) > std::get<1>(b); });\n"; + gc += " }\n"; + gc += " for (const auto& r : averageResults) {\n"; + gc += " std::cout << \" \" << std::left << std::setw(30) << std::get<0>(r)\n"; + gc += " << \": \" << std::fixed << std::setprecision(3) << std::get<1>(r)\n"; + gc += " << \" +/- \" << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n"; + gc += " << \" (\" << std::get<3>(r) << \" runs)\" << std::endl;\n"; + gc += " }\n"; + gc += " std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n\n"; + + gc += " void ResetProfilingResults() {\n"; + gc += " fProfilingResults.clear();\n"; + gc += " }\n\n"; + + gc += " std::map GetOpAvgTime() const {\n"; + gc += " std::map avg;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0;\n"; + gc += " for (double t : op.second) sum += t;\n"; + gc += " avg[op.first] = sum / op.second.size();\n"; + gc += " }\n"; + gc += " return avg;\n"; + gc += " }\n\n"; + + gc += " std::map GetOpVariance() const {\n"; + gc += " std::map variance;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double mean = 0.0, mean2 = 0.0;\n"; + gc += " for (double t : op.second) { mean += t; mean2 += t*t; }\n"; + gc += " mean /= op.second.size(); mean2 /= op.second.size();\n"; + gc += " variance[op.first] = mean2 - mean*mean;\n"; + gc += " }\n"; + gc += " return variance;\n"; + gc 
+= " }\n\n"; + + return gc; +} + +std::string RModelProfiler::GenerateBeginInferCode() +{ + std::string gc; + gc += " // Profiling timers\n"; + gc += " std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n"; + gc += " tp_overall_start = std::chrono::steady_clock::now();\n"; + gc += " auto & fProfilingResults = session.fProfilingResults;\n\n"; + return gc; +} + +std::string RModelProfiler::GenerateOperatorCode(ROperator &op, size_t op_idx) +{ + std::string gc; + gc += " // -- Profiling operator: " + op.Name() + " --\n"; + gc += " tp_start = std::chrono::steady_clock::now();\n"; + gc += op.Generate(std::to_string(op_idx)); + gc += "\n fProfilingResults[\"" + op.Name() + "\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + return gc; +} + +std::string RModelProfiler::GenerateEndInferCode() +{ + std::string gc; + gc += " // -- Record overall inference time --\n"; + gc += " fProfilingResults[\"Overall_Time\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_overall_start).count());\n"; + return gc; +} + +} // namespace SOFIE diff --git a/core/src/RModelProfilerGPU.cxx b/core/src/RModelProfilerGPU.cxx new file mode 100644 index 0000000..bf946b5 --- /dev/null +++ b/core/src/RModelProfilerGPU.cxx @@ -0,0 +1,184 @@ +#include "SOFIE/RModelProfilerGPU.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModelProfilerGPU::AddNeededStdLibs(RModel &model) +{ + model.AddNeededStdLib("chrono"); + model.AddNeededStdLib("vector"); + model.AddNeededStdLib("string"); + model.AddNeededStdLib("map"); + model.AddNeededStdLib("iostream"); + model.AddNeededStdLib("iomanip"); + model.AddNeededStdLib("algorithm"); + model.AddNeededStdLib("cmath"); + model.AddNeededStdLib("tuple"); +} + +std::string RModelProfilerGPU::GenerateSessionMembers() +{ + std::string gc; + gc += "// Maps operator name to GPU execution times 
(microseconds, wall-clock with sync).\n"; + gc += "mutable std::map> fProfilingResults;\n\n"; + return gc; +} + +std::string RModelProfilerGPU::GenerateUtilityFunctions() +{ + std::string gc; + + gc += " // Print GPU profiling results sorted by average time (highest first).\n"; + gc += " void PrintProfilingResults(bool order = true) const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " std::cout << \"No GPU profiling results to display.\" << std::endl;\n"; + gc += " return;\n"; + gc += " }\n"; + gc += " std::vector> averageResults;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0, sum2 = 0.0;\n"; + gc += " for (double t : op.second) { sum += t; sum2 += t*t; }\n"; + gc += " double average = sum / op.second.size();\n"; + gc += " double stddev = (op.second.size() > 1) ? std::sqrt((sum2 - sum*average) / (op.second.size()-1)) : 0.0;\n"; + gc += " averageResults.push_back({op.first, average, stddev, (int)op.second.size()});\n"; + gc += " }\n"; + gc += " if (order) {\n"; + gc += " std::sort(averageResults.begin(), averageResults.end(),\n"; + gc += " [](const auto& a, const auto& b){ return std::get<1>(a) > std::get<1>(b); });\n"; + gc += " }\n"; + gc += " std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n"; + gc += " std::cout << \" GPU PROFILING RESULTS\" << std::endl;\n"; + gc += " std::cout << \" (wall-clock with alpaka::wait synchronization)\" << std::endl;\n"; + gc += " std::cout << std::string(60, '=') << std::endl;\n"; + gc += " for (const auto& r : averageResults) {\n"; + gc += " std::cout << \" \" << std::left << std::setw(30) << std::get<0>(r)\n"; + gc += " << \": \" << std::fixed << std::setprecision(3) << std::get<1>(r)\n"; + gc += " << \" +/- \" << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n"; + gc += " << \" (\" << std::get<3>(r) << \" runs)\" << std::endl;\n"; + gc += " }\n"; + gc += " std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n\n"; + + gc += " void 
ResetProfilingResults() {\n"; + gc += " fProfilingResults.clear();\n"; + gc += " }\n\n"; + + gc += " std::map GetOpAvgTime() const {\n"; + gc += " std::map avg;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0;\n"; + gc += " for (double t : op.second) sum += t;\n"; + gc += " avg[op.first] = sum / op.second.size();\n"; + gc += " }\n"; + gc += " return avg;\n"; + gc += " }\n\n"; + + return gc; +} + +RModelProfilerGPU::MemoryInfo RModelProfilerGPU::ComputeMemoryInfo(const RModel &model) +{ + MemoryInfo info; + + for (const auto &it : model.fInitializedTensors) { + if (it.second.IsNotWritable()) continue; + size_t bytes = ConvertShapeToLength(it.second.shape()) * GetTypeSize(it.second.type()); + if (!model.fUseWeightFile || it.second.IsConstantTensor()) { + info.constantTensorBytes += bytes; // embedded as C++ array in generated code + } else { + info.weightTensorBytes += bytes; // loaded from .dat into temp CPU vector then H2D + } + // Every initialized tensor (constant or weight file) gets its own GPU device buffer. + info.weightDeviceBytes += bytes; + } + + // CPU intermediate memory pool (0 in the GPU path — intermediates live on device) + info.intermediateCPUBytes = model.fOtherTensorSize; + + // GPU intermediate device buffers. + // Skip fused-kernel intermediates: those tensors share the fused kernel's + // input/output buffers and are never separately allocated on the device. 
+ for (const auto &it : model.fIntermediateTensorInfos) { + if (model.fFusionIntermediateTensors.count(it.first)) continue; + size_t len = ConvertShapeToLength(it.second.shape); + info.intermediateGPUBytes += len * GetTypeSize(it.second.type); + } + + return info; +} + +std::string RModelProfilerGPU::GenerateMemoryReport(const MemoryInfo &info) +{ + auto toMB = [](size_t bytes) -> double { return bytes / (1024.0 * 1024.0); }; + + size_t totalCPU = info.constantTensorBytes + info.weightTensorBytes + info.intermediateCPUBytes; + size_t totalGPU = info.weightDeviceBytes + info.intermediateGPUBytes; + + std::string gc; + gc += " // Print memory usage breakdown computed at code-generation time.\n"; + gc += " void PrintMemoryInfo() const {\n"; + gc += " std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n"; + gc += " std::cout << \" MEMORY USAGE BREAKDOWN\" << std::endl;\n"; + gc += " std::cout << std::string(60, '=') << std::endl;\n"; + gc += " std::cout << \" CPU Memory (during session init):\" << std::endl;\n"; + gc += " std::cout << \" Constant/embedded tensors : " + + std::to_string(info.constantTensorBytes) + " bytes (" + + std::to_string(toMB(info.constantTensorBytes)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" Weight tensors (.dat file): " + + std::to_string(info.weightTensorBytes) + " bytes (" + + std::to_string(toMB(info.weightTensorBytes)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" Intermediate memory pool : " + + std::to_string(info.intermediateCPUBytes) + " bytes (" + + std::to_string(toMB(info.intermediateCPUBytes)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" Total CPU : " + + std::to_string(totalCPU) + " bytes (" + + std::to_string(toMB(totalCPU)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" GPU Memory (device buffers):\" << std::endl;\n"; + gc += " std::cout << \" Initialized bufs (const+weights): " + + std::to_string(info.weightDeviceBytes) + " bytes (" + 
+ std::to_string(toMB(info.weightDeviceBytes)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" Intermediate device bufs : " + + std::to_string(info.intermediateGPUBytes) + " bytes (" + + std::to_string(toMB(info.intermediateGPUBytes)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << \" Total GPU : " + + std::to_string(totalGPU) + " bytes (" + + std::to_string(toMB(totalGPU)).substr(0, 6) + " MB)\" << std::endl;\n"; + gc += " std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n\n"; + return gc; +} + +std::string RModelProfilerGPU::GenerateBeginInferCode() +{ + std::string gc; + gc += " // GPU profiling timers\n"; + gc += " std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n"; + gc += " tp_overall_start = std::chrono::steady_clock::now();\n\n"; + return gc; +} + +std::string RModelProfilerGPU::GenerateOperatorCode(ROperator &op, size_t op_idx) +{ + std::string gc; + gc += " // -- GPU Profiling operator: " + op.Name() + " --\n"; + gc += " tp_start = std::chrono::steady_clock::now();\n"; + gc += op.Generate_GPU_ALPAKA(std::to_string(op_idx)); + // Force synchronisation so chrono measures actual GPU execution time + gc += " alpaka::wait(queue);\n"; + gc += " fProfilingResults[\"" + op.Name() + "\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + return gc; +} + +std::string RModelProfilerGPU::GenerateEndInferCode() +{ + std::string gc; + gc += " // -- Record overall GPU inference time --\n"; + gc += " fProfilingResults[\"Overall_Time\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_overall_start).count());\n"; + return gc; +} + +} // namespace SOFIE diff --git a/core/src/RModel_ALPAKA.cxx b/core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..9e0e84c --- /dev/null +++ b/core/src/RModel_ALPAKA.cxx @@ -0,0 +1,861 @@ +#include +#include +#include 
+#include +#include +#include +#include + +#ifdef SOFIE_SUPPORT_ROOT_BINARY +#include "TFile.h" +#endif + +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelProfilerGPU.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator_Gemm.hxx" +#include "SOFIE/ROperator_LeakyRelu.hxx" +#include "SOFIE/ROperator_Relu.hxx" + +namespace SOFIE { + +void RModel::ComputeEltwiseFusionGroups() { + fEltwiseFusionGroups.clear(); + fOpToFusionGroupIdx.clear(); + fFusionIntermediateTensors.clear(); + + // Build tensor -> consumer op indices map + std::unordered_map> tensorConsumers; + for (size_t i = 0; i < fOperators.size(); i++) { + for (const auto& name : fOperators[i]->GetOpInputTensors()) + tensorConsumers[std::string(name)].push_back(i); + } + + // Returns true if tensorName is safe to treat as a fusion intermediate: + // consumed by exactly one op AND not a model output. + auto isFuseSafe = [&](const std::string& tensorName) -> bool { + for (const auto& outName : fOutputTensorNames) + if (outName == tensorName) return false; + auto it = tensorConsumers.find(tensorName); + return it != tensorConsumers.end() && it->second.size() == 1; + }; + + std::vector opAssigned(fOperators.size(), false); + + for (size_t i = 0; i < fOperators.size(); i++) { + if (opAssigned[i]) continue; + opAssigned[i] = true; + + EltwiseFusionGroup group; + group.opIndices.push_back(i); + + auto firstInputs = fOperators[i]->GetOpInputTensors(); + group.inputTensor = firstInputs.empty() ? 
"" : std::string(firstInputs[0]); + + // Extend chain: only if CURRENT op is elementwise and its single output can be fused + size_t current = i; + while (fOperators[current]->IsElementwise()) { + auto curOutputs = fOperators[current]->GetOpOutputTensors(); + if (curOutputs.size() != 1) break; + std::string curOut = std::string(curOutputs[0]); + if (!isFuseSafe(curOut)) break; + + size_t nextIdx = tensorConsumers.find(curOut)->second[0]; + // Must be strictly the next op in sequence and itself elementwise with single input + if (nextIdx != current + 1) break; + if (opAssigned[nextIdx]) break; + if (!fOperators[nextIdx]->IsElementwise()) break; + auto nextInputs = fOperators[nextIdx]->GetOpInputTensors(); + if (nextInputs.size() != 1) break; + + opAssigned[nextIdx] = true; + group.opIndices.push_back(nextIdx); + current = nextIdx; + } + + // Output tensor is the last op's output + auto lastOutputs = fOperators[current]->GetOpOutputTensors(); + group.outputTensor = lastOutputs.empty() ? "" : std::string(lastOutputs[0]); + + // Element count from intermediate tensor info (all op outputs are intermediates) + if (!group.outputTensor.empty()) { + auto it = fIntermediateTensorInfos.find(group.outputTensor); + if (it != fIntermediateTensorInfos.end()) + group.numElements = ConvertShapeToLength(it->second.shape); + } + + size_t gIdx = fEltwiseFusionGroups.size(); + for (auto opIdx : group.opIndices) + fOpToFusionGroupIdx[opIdx] = gIdx; + + // Mark all-but-last outputs as fusion intermediates (skip allocation) + if (group.isFused()) { + for (size_t k = 0; k + 1 < group.opIndices.size(); k++) { + auto midOuts = fOperators[group.opIndices[k]]->GetOpOutputTensors(); + if (!midOuts.empty()) + fFusionIntermediateTensors.insert(std::string(midOuts[0])); + } + } + + fEltwiseFusionGroups.push_back(std::move(group)); + } +} + + +void RModel::FuseGemmActivations_GPU() { + std::unordered_map consumerCount; + for (const auto& op : fOperators) + for (const auto& inp : 
op->GetOpInputTensors()) + ++consumerCount[std::string(inp)]; + + const size_t N = fOperators.size(); + for (size_t i = 0; i + 1 < N; ++i) { + if (fSkipOperators.count(i)) continue; + + auto* gemm = dynamic_cast*>(fOperators[i].get()); + if (!gemm) continue; + if (gemm->GetActivationType() != EActivationType::UNDEFINED) continue; + + auto* lrelu = dynamic_cast*>(fOperators[i + 1].get()); + auto* relu = dynamic_cast*>(fOperators[i + 1].get()); + if (!lrelu && !relu) continue; + + std::string gemmOut = std::string(fOperators[i]->GetOpOutputTensors()[0]); + std::string actIn = std::string(fOperators[i + 1]->GetOpInputTensors()[0]); + if (gemmOut != actIn) continue; + + if (consumerCount[gemmOut] != 1) continue; + + std::string actOut = std::string(fOperators[i + 1]->GetOpOutputTensors()[0]); + + if (lrelu) { + gemm->SetActivation(EActivationType::LEAKYRELU, lrelu->GetAlpha()); + } else { + gemm->SetActivation(EActivationType::RELU, 0.f); + } + + gemm->UpdateFusableTensorName(actOut, [&](const std::string& old) { + fFusionIntermediateTensors.insert(old); + }); + + fSkipOperators.insert(i + 1); + } +} + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT32) + fGC += GenerateConstantTensorCode(i); + + else if (i.second.type() == ETensorType::BOOL || + i.second.type() == ETensorType::UINT8) + fGC += GenerateConstantTensorCode(i); + } + + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + 
"}));\n"; + } else if (i.second.type() == ETensorType::INT32) { + fGC += "BufI321D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type() == ETensorType::BOOL || + i.second.type() == ETensorType::UINT8) { + fGC += "BufUI81D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + + } +} + +void RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (fUseWeightFile && !i.second.IsConstantTensor()) { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } else if (i.second.type() == ETensorType::INT32) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } else if (i.second.type() == ETensorType::BOOL || + i.second.type() == ETensorType::UINT8) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + // Skip tensors that are purely intermediate within a fused kernel chain + if (fFusionIntermediateTensors.count(i.first)) continue; + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == 
ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT32) { + tensor_declaration_block += "BufI321D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "BufUI81D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only declarations, allocation will be done later) + if (!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for 
(auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + + auto GetBufType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "BufF1D"; + if (type == ETensorType::DOUBLE) return "BufD1D"; + if (type == ETensorType::INT32) return "BufI321D"; + if (type == ETensorType::INT64) return "BufI641D"; + if (type == ETensorType::BOOL) return "BufUI81D"; + throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetBufType(name) + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); // remove last "," + return rGC; +} + +std::string RModel::GenerateImplSignature_GPU_ALPAKA(bool isdecl) { + + auto GetViewConstType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "ViewConstF1D"; + if (type == ETensorType::DOUBLE) return "ViewConstD1D"; + if (type == ETensorType::INT32) return "ViewConstI321D"; + if (type == ETensorType::INT64) return "ViewConstI641D"; + if (type == ETensorType::BOOL) return "ViewConstUI81D"; + 
throw std::runtime_error("sofie: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetViewConstType(name) + " const& "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("sofie: output size=0 are not supported"); + + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + bool sameOutputTypes = true; + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + + auto GetViewConstType = [this](const std::string &name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "ViewConstF1D"; + if (type == ETensorType::DOUBLE) return "ViewConstD1D"; + if (type == ETensorType::INT32) return "ViewConstI321D"; + if (type == ETensorType::INT64) return "ViewConstI641D"; + if (type == ETensorType::BOOL) return "ViewConstUI81D"; + throw std::runtime_error("sofie: input tensor " + name + " is of an unsupported data type."); + }; + + // Collect deduplicated dynamic dimension parameter names in declaration order + std::vector dynParamNames; + { + std::unordered_map seen; + for (auto &name : fInputTensorNames) { + if (IsDimInputTensor(name)) { + auto shape = 
GetDynamicTensorShape(name); + for (auto &d : shape) { + if (d.isParam && seen.count(d.param) == 0) { + dynParamNames.push_back(d.param); + seen[d.param] = 1; + } + } + } + } + } + + fGC += "\n\n"; + + fGC += "void _infer_impl("; + fGC += GenerateImplSignature_GPU_ALPAKA(); + fGC += "){\n"; + + // GPU profiling: _infer_impl is a member of Session, so fProfilingResults + // is directly accessible without any alias. + if (fProfile) { + fGC += RModelProfilerGPU::GenerateBeginInferCode(); + } + + std::set fusedGroupsLaunched; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... " << op_idx << std::endl; + + if (fSkipOperators.count(op_idx)) continue; + + auto gIt = fOpToFusionGroupIdx.find(op_idx); + size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + // Only emit the fused kernel launch once, at the chain leader + if (fEltwiseFusionGroups[gIdx].opIndices[0] == op_idx && !fusedGroupsLaunched.count(gIdx)) { + const auto& grp = fEltwiseFusionGroups[gIdx]; + std::string sfx = grp.suffix(); + std::string kname = "fusedEltwiseKernel" + sfx; + std::string fusedCode; + fusedCode += "\n//------ FUSED_ELTWISE_GPU_ALPAKA" + sfx + "\n"; + fusedCode += SP + "{\n"; + fusedCode += SP + SP + "auto const elementsPerThread_fused" + sfx + " = Vec::all(static_cast(1));\n"; + fusedCode += SP + SP + "auto const elementsPerGrid_fused" + sfx + " = Vec::all(Idx{" + std::to_string(grp.numElements) + "});\n"; + fusedCode += SP + SP + "auto const workDiv_fused" + sfx + " = sofie_workdiv(elementsPerGrid_fused" + sfx + ");\n"; + fusedCode += SP + SP + "auto task_fused" + sfx + " = alpaka::createTaskKernel(workDiv_fused" + sfx + ", " + kname + + ", alpaka::getPtrNative(deviceBuf_" + grp.inputTensor + "), alpaka::getPtrNative(deviceBuf_" + grp.outputTensor + + "), static_cast(" + 
std::to_string(grp.numElements) + "));\n"; + fusedCode += SP + SP + "alpaka::enqueue(queue, task_fused" + sfx + ");\n"; + fusedCode += SP + "}\n"; + if (fProfile) { + // wrap fused group with profiling + std::string fusedName = "FusedKernel" + sfx; + fGC += " // -- GPU Profiling fused group: " + fusedName + " --\n"; + fGC += " tp_start = std::chrono::steady_clock::now();\n"; + fGC += fusedCode; + fGC += " alpaka::wait(queue);\n"; + fGC += " fProfilingResults[\"" + fusedName + "\"].push_back(\n"; + fGC += " std::chrono::duration_cast>(\n"; + fGC += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + } else { + fGC += fusedCode; + } + fusedGroupsLaunched.insert(gIdx); + } + // Chain followers: skip — their logic is inside the fused kernel + } else { + if (fProfile) { + fGC += RModelProfilerGPU::GenerateOperatorCode(*fOperators[op_idx], op_idx); + } else { + fGC += fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx)); + } + } + } + // Final wait (no-op when profiling since each op already syncs) + fGC += "\n\n alpaka::wait(queue);\n"; + + if (fProfile) { + fGC += RModelProfilerGPU::GenerateEndInferCode(); + } + + fGC += "}\n\n"; + + + std::string spanDynDecl; + for (auto &p : dynParamNames) + spanDynDecl += ", size_t " + p; + + fGC += "void infer(std::span inputs, std::span outputs" + spanDynDecl + "){\n"; + + { + fGC += SP + "_infer_impl("; + bool first = true; + for (auto &p : dynParamNames) { + if (!first) fGC += ", "; + fGC += p; + first = false; + } + for (size_t i = 0; i < fInputTensorNames.size(); i++) { + if (!first) fGC += ", "; + fGC += "inputs[" + std::to_string(i) + "]"; + first = false; + } + fGC += ");\n"; + } + + // Copy member output buffers into caller-provided output views + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + fGC += SP + "alpaka::memcpy(queue, outputs[" + std::to_string(i) + "], deviceBuf_" + tensorName + ");\n"; + } + fGC += SP + "alpaka::wait(queue);\n"; 
+ fGC += "}\n\n"; + + + std::string returnType; + if (outputSize == 1) { + returnType = "alpaka::Buf"; + } else if (sameOutputTypes) { + returnType = "std::array, " + std::to_string(outputSize) + ">"; + } else { + returnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + std::string tname = *(fOutputTensorNames.begin() + i); + returnType += "alpaka::Buf"; + if (i < outputSize - 1) returnType += ","; + } + returnType += ">"; + } + + fGC += returnType + " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + // Wrap each typed input buffer in a ViewConstXX, then call _infer_impl + std::vector typedImplArgs; + for (auto &p : dynParamNames) + typedImplArgs.push_back(p); + for (auto &name : fInputTensorNames) { + std::string viewType = GetViewConstType(name); + fGC += SP + viewType + " const view_" + name + + "{alpaka::getPtrNative(deviceBuf_" + name + "), devAcc, alpaka::getExtents(deviceBuf_" + name + ")};\n"; + typedImplArgs.push_back("view_" + name); + } + + fGC += SP + "_infer_impl("; + for (size_t i = 0; i < typedImplArgs.size(); i++) { + if (i > 0) fGC += ", "; + fGC += typedImplArgs[i]; + } + fGC += ");\n"; + + // Return the member output buffer(s) + fGC += SP + "return "; + if (outputSize > 1) fGC += "{"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + fGC += "deviceBuf_" + tensorName; + if (i < outputSize - 1) fGC += ","; + } + if (outputSize > 1) fGC += "}"; + fGC += ";\n"; + fGC += "}\n"; +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + std::set fusedGroupsEmitted; // tracks which fusion groups have had their struct/decl emitted + + std::set single_initialized_operators = { + SOFIE::OperatorKind::RELU, + SOFIE::OperatorKind::SIGMOID, + SOFIE::OperatorKind::TANH, + SOFIE::OperatorKind::SOFTMAX, + SOFIE::OperatorKind::LEAKYRELU, + SOFIE::OperatorKind::EINSUM, + SOFIE::OperatorKind::ELU, + 
SOFIE::OperatorKind::UNARY_RECIPROCAL, + SOFIE::OperatorKind::UNARY_SQRT, + SOFIE::OperatorKind::UNARY_NEG, + SOFIE::OperatorKind::UNARY_EXP, + SOFIE::OperatorKind::UNARY_LOG, + SOFIE::OperatorKind::UNARY_SIN, + SOFIE::OperatorKind::UNARY_COS, + SOFIE::OperatorKind::UNARY_ABS, + SOFIE::OperatorKind::NOT + }; + + bool OpNeedsBlas = false; + + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) { + OpNeedsBlas = true; + } + + auto gIt = fOpToFusionGroupIdx.find(id); + size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + // Only emit the fused kernel struct once, at the chain leader + if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) { + const auto& grp = fEltwiseFusionGroups[gIdx]; + std::string sfx = grp.suffix(); + fGC += "\n//------ FUSED_ELTWISE_KERNEL" + sfx + "\n"; + fGC += "struct FusedEltwiseKernel" + sfx + " {\n"; + fGC += SP + "template\n"; + fGC += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t n) const {\n"; + fGC += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + fGC += SP + SP + "if (idx < n) {\n"; + fGC += SP + SP + SP + "T v = data[idx];\n"; + for (size_t opIdx : grp.opIndices) + fGC += SP + SP + SP + "v = " + fOperators[opIdx]->GetElementwiseExpr("v") + ";\n"; + fGC += SP + SP + SP + "out[idx] = v;\n"; + fGC += SP + SP + "}\n"; + fGC += SP + "}\n"; + fGC += "};\n"; + fusedGroupsEmitted.insert(gIdx); + } + // Chain followers: skip (their logic is inside the fused kernel) + } else { + // Unfused op: generate individual kernel struct (with dedup for single_initialized_operators) + if (single_initialized_operators.find(fOperators[id]->GetKind()) != 
single_initialized_operators.end()) { + if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + if (fVerbose) + std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + } + } + } + + + fGC += "\ntemplate\n"; + fGC += "inline alpaka::WorkDivMembers sofie_workdiv(\n"; + fGC += " alpaka::Vec const& numElems, TIdx blockSz = TIdx{256})\n{\n"; + fGC += " auto const numBlocks = alpaka::Vec::all(\n"; + fGC += " (numElems[0] + blockSz - TIdx{1}) / blockSz);\n"; + fGC += " return alpaka::WorkDivMembers(\n"; + fGC += " numBlocks,\n"; + fGC += " alpaka::Vec::all(blockSz),\n"; + fGC += " alpaka::Vec::all(TIdx{1}));\n"; + fGC += "}\n\n"; + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI321D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n"; + fGC += "using BufUI81D = alpaka::Buf;\n\n"; + fGC += "// Non-owning device view types (ViewPlainPtr) for the span-based infer interface\n"; + fGC += "using ViewF1D = alpaka::ViewPlainPtr;\n"; + fGC += 
"using ViewConstF1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewD1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstD1D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewI321D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstI321D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewI641D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstI641D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewUI81D = alpaka::ViewPlainPtr;\n"; + fGC += "using ViewConstUI81D = alpaka::ViewPlainPtr;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (OpNeedsBlas) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + // inject profiling session data member + if (fProfile) { + fGC += RModelProfilerGPU::GenerateSessionMembers(); + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if (fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + 
ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + if (fSkipOperators.count(id)) continue; + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) { + // GetBlasConfig() returns "" for ops that use gemmStridedBatched + // (legacy cuBLAS path, no cuBLASLt layout registration needed). + auto blasCfg = fOperators[id]->GetBlasConfig(); + if (!blasCfg.empty()) + fGC += "\nblas.addLayoutConfig("+blasCfg+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + fusedGroupsEmitted.clear(); + + for (size_t id = 0; id < fOperators.size(); id++) { + // Same as the kernel-struct loop above: fused activation ops must still + // declare their member variable (e.g. `leakyReluKernel`) even though + // their Generate_GPU_ALPAKA call is skipped in the infer-body loop. + + auto gIt = fOpToFusionGroupIdx.find(id); + size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? 
gIt->second : SIZE_MAX; + bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused(); + + if (inFusedGroup) { + if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) { + std::string sfx = fEltwiseFusionGroups[gIdx].suffix(); + fGC += SP + "FusedEltwiseKernel" + sfx + " fusedEltwiseKernel" + sfx + ";\n"; + fusedGroupsEmitted.insert(gIdx); + } + } else { + if (single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + if (fVerbose) + std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + } + } + } + + GenerateOutput_GPU_ALPAKA(); + + // inject GPU profiling utility functions and memory report inside Session struct + if (fProfile && fUseSession) { + fGC += RModelProfilerGPU::GenerateUtilityFunctions(); + auto memInfo = RModelProfilerGPU::ComputeMemoryInfo(*this); + fGC += RModelProfilerGPU::GenerateMemoryReport(memInfo); + } + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void RModel::GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize, bool verbose) { + fProfile = static_cast(options & static_cast>(Options::kProfile)); + fVerbose = true; + fBatchSize = batchSize; + + if (fProfile) + RModelProfilerGPU::AddNeededStdLibs(*this); + + if (static_cast>(Options::kNoSession) & options) { + fUseSession = false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kNoWeightFile) & options) { + fUseWeightFile = 
false; + fWeightFile = WeightFileType::None; + } + if (static_cast>(Options::kRootBinaryWeightFile) & options) { + fUseWeightFile = true; + fWeightFile = WeightFileType::RootBinary; + } + if (fUseWeightFile && !fUseSession) { + throw std::runtime_error( + "sofie: RModel::Generate: cannot use a separate weight file without generating a Session class"); + } + + if (static_cast>(Options::kGNN) & options || + static_cast>(Options::kGNNComponent) & options) + throw std::runtime_error("SOFIE GPU does not yet supports GNN Inference."); + + Initialize(batchSize, verbose); + FuseGemmActivations_GPU(); // must run before elementwise fusion (redirects tensors) + ComputeEltwiseFusionGroups(); + + std::string hgname; + if (!fIsSubGraph) { + fGC.clear(); + GenerateHeaderInfo_GPU_ALPAKA(hgname); + } + + if (fVerbose) + std::cout << "generate Main session code - model " << fName << std::endl; + + GenerateSessionCode_GPU_ALPAKA(); + + if (!fIsSubGraph) { + fGC += ("} //SOFIE_" + fName + "\n"); + fGC += "\n#endif // " + hgname + "\n"; + } +} + +void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){ + for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; + std::string tensor_name = "tensor_" + i.first; + auto length = ConvertShapeToLength(i.second.shape()); + std::string slength = std::to_string(length); + // Use the 3-argument createView(dev, container, extent) which calls std::data() + // internally — works for both std::vector and raw C arrays. 
+ fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+", " + slength + ");\n"; + fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n"; + } + } + +} // namespace SOFIE diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/core/src/RModel_Base.cxx similarity index 60% rename from src/SOFIE_core/src/RModel_Base.cxx rename to core/src/RModel_Base.cxx index d4d1f1c..9c49e37 100644 --- a/src/SOFIE_core/src/RModel_Base.cxx +++ b/core/src/RModel_Base.cxx @@ -32,9 +32,16 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; if (fUseWeightFile) fGC += "#include \n"; - // Include TFile when saving the weights in a binary ROOT file - if (fWeightFile == WeightFileType::RootBinary) - fGC += "#include \"TFile.h\"\n"; + + if (fWeightFile == WeightFileType::RootBinary){ + #ifdef SOFIE_SUPPORT_ROOT_BINARY + // Include TFile when saving the weights in a binary ROOT file + fGC += "#include \"TFile.h\"\n"; + #else + throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support"); + #endif + + } fGC += "\nnamespace SOFIE_" + fName + "{\n"; if (!fNeededBlasRoutines.empty()) { @@ -58,6 +65,45 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) { } } +void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) { + fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n"); + // add header guards + hgname = fName; + std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) { + return std::toupper(c); + } ); + hgname = "SOFIE_" + hgname; + fGC += "\n#ifndef " + hgname + "\n"; + fGC += "#define " + hgname + "\n\n"; + for (auto& i: fNeededStdLib) { + fGC += "#include <" + i + ">\n"; + } + for (auto& i: fCustomOpHeaders) { + fGC += "#include \"" + i + "\"\n"; + } + fGC += 
"#include \n"; + fGC += "#include \n"; + fGC += "#include \n"; + + // for the session we need to include SOFIE_Common functions + //needed for convolution operator (need to add a flag) + fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n"; + if (fUseWeightFile) + fGC += "#include \n"; + + if (fWeightFile == WeightFileType::RootBinary){ + #ifdef SOFIE_SUPPORT_ROOT_BINARY + // Include TFile when saving the weights in a binary ROOT file + fGC += "#include \"TFile.h\"\n"; + #else + throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support"); + #endif + } + + fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n"; + fGC += "\nnamespace SOFIE_" + fName + "{\n"; +} + void RModel_Base::OutputGenerated(std::string filename, bool append) { // the model can be appended only if a file name is provided if (filename.empty()) { @@ -71,7 +117,7 @@ void RModel_Base::OutputGenerated(std::string filename, bool append) { else f.open(filename); if (!f.is_open()) { - throw std::runtime_error("tmva-sofie failed to open file for output generated inference code"); + throw std::runtime_error("sofie failed to open file for output generated inference code"); } f << fGC; f.close(); diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/core/src/RModel_GNN.cxx similarity index 98% rename from src/SOFIE_core/src/RModel_GNN.cxx rename to core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = 
edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/core/src/RModel_GraphIndependent.cxx similarity index 97% rename from src/SOFIE_core/src/RModel_GraphIndependent.cxx rename to core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void 
RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/core/src/SOFIE_common.cxx similarity index 50% rename from src/SOFIE_core/src/SOFIE_common.cxx rename to core/src/SOFIE_common.cxx index ad74313..a2bafde 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ b/core/src/SOFIE_common.cxx @@ -1,15 +1,18 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include +#include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for 
(size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +23,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -46,18 +49,35 @@ std::vector ConvertShapeToInt(std::vector shape){ } -std::size_t ConvertShapeToLength(std::vector shape){ +std::size_t ConvertShapeToLength(const std::vector & shape){ // Empty shape represent scalar values, so we return a length=1 std::size_t fLength = 1; for (auto& dim: shape) fLength *= dim; return fLength; } +std::size_t ConvertShapeToLength(const std::vector & shape){ + // convert generic shape to a string + // multiply all the integer specified dimensions of the shape + std::size_t length = 1; + for (size_t i = 0; i < shape.size(); i++) { + if (!shape[i].isParam) { + length *= shape[i].dim; + } else { + return static_cast(-1); // return -1 in case of parametric shapes + } + } + return length; +} + std::string ConvertTypeToString(ETensorType type){ switch(type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +87,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -80,7 +103,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -106,7 +129,7 @@ ETensorType ConvertStringToType(std::string type){ } } -std::string 
ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,41 +140,49 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; - size_t int_length = 0; + // case of empty vectors return 1 + if (shape.empty()) return "1"; + int64_t int_length = -1; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { if (!length.empty()) length += " * "; length += shape[i].param; } else { - if (int_length == 0) + if (int_length == -1) int_length = shape[i].dim; else int_length *= shape[i].dim; } } // multiply the integer components to the parametric one - if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + // if larger than 1 - otherwise returns -1 + if (int_length >= 0) { + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } + namespace{ template static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* input, T* target){ //only visible within this translation unit @@ -169,6 +200,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* inp } } +bool IsInteger(const std::string & s) { + int 
value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +367,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +403,117 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - std::runtime_error("TMVA::SOFIE - Error unidirectional 
broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } } +// unidirectional broadcast- of shape A to target B +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); + if (ret.first > 1) { + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); + } + return ret.second; +} + +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } + + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } 
else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } -// UNidirectional boradcast specializaiton for vector - -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - 
if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ -413,15 +538,146 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. 
operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. + std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. 
+ std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. + for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} -}//SOFIE +} // namespace SOFIE diff --git a/src/SOFIE_parsers/CMakeLists.txt b/parsers/CMakeLists.txt similarity index 78% rename from src/SOFIE_parsers/CMakeLists.txt rename to parsers/CMakeLists.txt index 379b7d7..7174e90 100644 --- a/src/SOFIE_parsers/CMakeLists.txt +++ b/parsers/CMakeLists.txt @@ -5,7 +5,7 @@ # For the list of contributors see $ROOTSYS/README/CREDITS. 
############################################################################ -# CMakeLists.txt file for building TMVA SOFIE package +# CMakeLists.txt file for building SOFIE package ############################################################################ #Author: Sitong An, Lorenzo Moneta 10/03/2021 @@ -26,13 +26,15 @@ set(source_headers ) list(TRANSFORM source_headers PREPEND "inc/") target_include_directories(SOFIE_parsers - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/inc + PUBLIC + $ + $ ) set(sources_cxx src/RModelParser_ONNX.cxx src/ParseBasicUnary.cxx src/ParseBasicBinary.cxx + src/ParseBasicIs.cxx src/ParseBatchNormalization.cxx src/ParseCast.cxx src/ParseConcat.cxx @@ -61,6 +63,7 @@ set(sources_cxx src/ParseLayerNormalization.cxx src/ParseExpand.cxx src/ParseGather.cxx + src/ParseGatherND.cxx src/ParseElu.cxx src/ParseFuseConvAdd.cxx src/ParseFuseConvTransposeAdd.cxx @@ -79,7 +82,11 @@ set(sources_cxx src/ParseWhere.cxx src/ParseEinsum.cxx src/ParseRandom.cxx + src/ParseNot.cxx + src/ParseClip.cxx src/ParseScatterElements.cxx + src/ParseTrilu.cxx + src/ParseLogic.cxx ${PROTO_SRCS} ${DEPENDENCIES} ${SOFIE_core} @@ -102,7 +109,21 @@ target_include_directories(SOFIE_parsers PUBLIC set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE) +if(SOFIE_WITH_ROOT AND ROOT_FOUND) + ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_parsers + OPTIONS --deep + ) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap + DESTINATION lib) +endif() + install(TARGETS SOFIE_parsers - LIBRARY DESTINATION lib + EXPORT SOFIETargets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} +) +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include") diff --git a/src/SOFIE_parsers/inc/LinkDef.h b/parsers/inc/LinkDef.h similarity index 100% 
rename from src/SOFIE_parsers/inc/LinkDef.h rename to parsers/inc/LinkDef.h diff --git a/src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx b/parsers/inc/SOFIE/RModelParser_ONNX.hxx similarity index 100% rename from src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx rename to parsers/inc/SOFIE/RModelParser_ONNX.hxx diff --git a/src/SOFIE_parsers/onnx_proto3 b/parsers/onnx_proto3 similarity index 100% rename from src/SOFIE_parsers/onnx_proto3 rename to parsers/onnx_proto3 diff --git a/src/SOFIE_parsers/src/ParseBasicBinary.cxx b/parsers/src/ParseBasicBinary.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBasicBinary.cxx rename to parsers/src/ParseBasicBinary.cxx diff --git a/parsers/src/ParseBasicIs.cxx b/parsers/src/ParseBasicIs.cxx new file mode 100644 index 0000000..a1abad4 --- /dev/null +++ b/parsers/src/ParseBasicIs.cxx @@ -0,0 +1,66 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Basic_Is.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +template +std::unique_ptr ParseBasicIs(RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) +{ + + std::string input_name = nodeproto.input(0); + if (!parser.IsRegisteredTensorType(input_name)) { + throw + std::runtime_error("SOFIE ONNX Parser " + IsOpTraits::Name() + " op has input tensor " + input_name + + " but its type is not yet registered"); + } + + // get attributes for the IsInf operator + int detect_negative = 1; + int detect_positive = 1; + for (int_t i = 0; i < nodeproto.attribute_size(); i++) { + std::string attribute_name = nodeproto.attribute(i).name(); + if (attribute_name == "detect_negative") + detect_negative = nodeproto.attribute(i).i(); + if (attribute_name == "detect_positive") + detect_positive = nodeproto.attribute(i).i(); + } + + if (detect_positive == 0 && detect_negative == 0) + throw std::runtime_error("SOFIE ONNX Parser IsInf op has invalide attributes"); + + + std::unique_ptr op; + std::string output_name = nodeproto.output(0); + + if 
(nodeproto.attribute_size() == 0 || (detect_negative == 1 && detect_positive == 1)) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + else if (nodeproto.attribute_size() > 0) { + // case detect_negative or detective_positive are set + if (detect_negative == 0) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + else if (detect_positive == 0) + op.reset(new ROperator_Basic_Is(input_name, output_name)); + } else + throw std::runtime_error("SOFIE ONNX Parser " + IsOpTraits::Name() + " operator - invalid attributes"); + + // Register the output type (is always BOOL) + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, ETensorType::BOOL); + } + + return op; +}; + +// Parse IsNaN +ParserFuncSignature ParseIsNaN = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicIs(parser, nodeproto); +}; + +// Parse IsInf +ParserFuncSignature ParseIsInf = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicIs(parser, nodeproto); +}; + + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseBasicNary.cxx b/parsers/src/ParseBasicNary.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBasicNary.cxx rename to parsers/src/ParseBasicNary.cxx diff --git a/src/SOFIE_parsers/src/ParseBasicUnary.cxx b/parsers/src/ParseBasicUnary.cxx similarity index 82% rename from src/SOFIE_parsers/src/ParseBasicUnary.cxx rename to parsers/src/ParseBasicUnary.cxx index 1470f26..40d0225 100644 --- a/src/SOFIE_parsers/src/ParseBasicUnary.cxx +++ b/parsers/src/ParseBasicUnary.cxx @@ -79,5 +79,20 @@ ParserFuncSignature ParseAbs = [](RModelParser_ONNX &parser, const onnx::NodePro return ParseBasicUnary(parser, nodeproto); }; +//Parse Softplus +ParserFuncSignature ParseSoftplus = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + +//Parse Atan +ParserFuncSignature ParseAtan = [](RModelParser_ONNX 
&parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + +//Parse Floor +ParserFuncSignature ParseFloor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBasicUnary(parser, nodeproto); +}; + } // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseBatchNormalization.cxx b/parsers/src/ParseBatchNormalization.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseBatchNormalization.cxx rename to parsers/src/ParseBatchNormalization.cxx diff --git a/src/SOFIE_parsers/src/ParseCast.cxx b/parsers/src/ParseCast.cxx similarity index 78% rename from src/SOFIE_parsers/src/ParseCast.cxx rename to parsers/src/ParseCast.cxx index 7685421..a0993d4 100644 --- a/src/SOFIE_parsers/src/ParseCast.cxx +++ b/parsers/src/ParseCast.cxx @@ -13,20 +13,19 @@ ParserFuncSignature ParseCast = [](RModelParser_ONNX &parser, const onnx::NodePr } std::unique_ptr op; - std::string attr_type; + ETensorType attr_type; for (int_t i = 0; i < nodeproto.attribute_size(); i++) { std::string attribute_name = nodeproto.attribute(i).name(); if (attribute_name == "to") - attr_type = ConvertTypeToString(static_cast(nodeproto.attribute(i).i())); + attr_type = static_cast(nodeproto.attribute(i).i()); } std::string output_name = nodeproto.output(0); op.reset(new ROperator_Cast(attr_type, nodeproto.input(0), output_name)); if (!parser.IsRegisteredTensorType(output_name)) { - ETensorType output_type = ConvertStringToType(attr_type); - parser.RegisterTensorType(output_name, output_type); + parser.RegisterTensorType(output_name, attr_type); } return op; diff --git a/parsers/src/ParseClip.cxx b/parsers/src/ParseClip.cxx new file mode 100644 index 0000000..4424c76 --- /dev/null +++ b/parsers/src/ParseClip.cxx @@ -0,0 +1,46 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Clip.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +ParserFuncSignature ParseClip = [](RModelParser_ONNX &parser, const onnx::NodeProto 
&nodeproto) +{ + ETensorType input_type = ETensorType::UNDEFINED; + + std::string input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("SOFIE ONNX Parser Clip op has input tensor " + input_name + + " but its type is not yet registered"); + } + + std::string output_name = nodeproto.output(0); + + // ONNX opset 11+: min and max are optional tensor inputs (empty string when absent) + std::string min_name = (nodeproto.input_size() > 1 && !nodeproto.input(1).empty()) + ? nodeproto.input(1) : ""; + std::string max_name = (nodeproto.input_size() > 2 && !nodeproto.input(2).empty()) + ? nodeproto.input(2) : ""; + + std::unique_ptr op; + switch (input_type) { + case ETensorType::FLOAT: + op.reset(new ROperator_Clip(input_name, output_name, min_name, max_name)); + break; + case ETensorType::DOUBLE: + op.reset(new ROperator_Clip(input_name, output_name, min_name, max_name)); + break; + default: + throw std::runtime_error("SOFIE ONNX Parser Clip op does not yet support input type " + + std::to_string(static_cast(input_type))); + } + + if (!parser.IsRegisteredTensorType(output_name)) + parser.RegisterTensorType(output_name, input_type); + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseComparision.cxx b/parsers/src/ParseComparision.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseComparision.cxx rename to parsers/src/ParseComparision.cxx diff --git a/src/SOFIE_parsers/src/ParseConcat.cxx b/parsers/src/ParseConcat.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConcat.cxx rename to parsers/src/ParseConcat.cxx diff --git a/src/SOFIE_parsers/src/ParseConstant.cxx b/parsers/src/ParseConstant.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConstant.cxx rename to parsers/src/ParseConstant.cxx diff --git a/src/SOFIE_parsers/src/ParseConv.cxx b/parsers/src/ParseConv.cxx similarity index 
100% rename from src/SOFIE_parsers/src/ParseConv.cxx rename to parsers/src/ParseConv.cxx diff --git a/src/SOFIE_parsers/src/ParseConvTranspose.cxx b/parsers/src/ParseConvTranspose.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseConvTranspose.cxx rename to parsers/src/ParseConvTranspose.cxx diff --git a/src/SOFIE_parsers/src/ParseEinsum.cxx b/parsers/src/ParseEinsum.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseEinsum.cxx rename to parsers/src/ParseEinsum.cxx diff --git a/src/SOFIE_parsers/src/ParseElu.cxx b/parsers/src/ParseElu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseElu.cxx rename to parsers/src/ParseElu.cxx diff --git a/src/SOFIE_parsers/src/ParseErf.cxx b/parsers/src/ParseErf.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseErf.cxx rename to parsers/src/ParseErf.cxx diff --git a/src/SOFIE_parsers/src/ParseExpand.cxx b/parsers/src/ParseExpand.cxx similarity index 81% rename from src/SOFIE_parsers/src/ParseExpand.cxx rename to parsers/src/ParseExpand.cxx index c4ed54f..0793880 100644 --- a/src/SOFIE_parsers/src/ParseExpand.cxx +++ b/parsers/src/ParseExpand.cxx @@ -35,9 +35,19 @@ ParserFuncSignature ParseExpand = [](RModelParser_ONNX &parser, const onnx::Node case ETensorType::FLOAT: op.reset(new ROperator_Expand(input_name, shape_name, output_name)); break; + case ETensorType::DOUBLE: + op.reset(new ROperator_Expand(input_name, shape_name, output_name)); + break; + case ETensorType::INT32: + op.reset(new ROperator_Expand(input_name, shape_name, output_name)); + break; case ETensorType::INT64: op.reset(new ROperator_Expand(input_name, shape_name, output_name)); break; + case ETensorType::BOOL: + case ETensorType::UINT8: + op.reset(new ROperator_Expand(input_name, shape_name, output_name)); + break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Expand Operator does " "not support input type " + diff --git a/src/SOFIE_parsers/src/ParseEyeLike.cxx 
b/parsers/src/ParseEyeLike.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseEyeLike.cxx rename to parsers/src/ParseEyeLike.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx b/parsers/src/ParseFuseBatchnormRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx rename to parsers/src/ParseFuseBatchnormRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseConvAdd.cxx b/parsers/src/ParseFuseConvAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseConvAdd.cxx rename to parsers/src/ParseFuseConvAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx b/parsers/src/ParseFuseConvTransposeAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx rename to parsers/src/ParseFuseConvTransposeAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx b/parsers/src/ParseFuseGemmRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx rename to parsers/src/ParseFuseGemmRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx b/parsers/src/ParseFuseMatMulAdd.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx rename to parsers/src/ParseFuseMatMulAdd.cxx diff --git a/src/SOFIE_parsers/src/ParseGRU.cxx b/parsers/src/ParseGRU.cxx similarity index 97% rename from src/SOFIE_parsers/src/ParseGRU.cxx rename to parsers/src/ParseGRU.cxx index ec2cddf..58ce983 100644 --- a/src/SOFIE_parsers/src/ParseGRU.cxx +++ b/parsers/src/ParseGRU.cxx @@ -46,7 +46,7 @@ ParserFuncSignature ParseGRU = [](RModelParser_ONNX &parser, const onnx::NodePro } else if (attribute_name == "linear_before_reset") { attr_linear_before_reset = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " 
is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseGather.cxx b/parsers/src/ParseGather.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseGather.cxx rename to parsers/src/ParseGather.cxx diff --git a/parsers/src/ParseGatherND.cxx b/parsers/src/ParseGatherND.cxx new file mode 100644 index 0000000..57beb01 --- /dev/null +++ b/parsers/src/ParseGatherND.cxx @@ -0,0 +1,49 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_GatherND.hxx" +#include "onnx_proto3.pb.h" +#include + + +namespace SOFIE { + +ParserFuncSignature ParseGatherND = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + ETensorType input_type = ETensorType::UNDEFINED; + auto input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op has input tensor " + input_name + + " but its type is not yet registered"); + } + + auto indices_name = nodeproto.input(1); + if (parser.IsRegisteredTensorType(indices_name)) { + ETensorType indices_type = parser.GetTensorType(indices_name); + if (indices_type != ETensorType::INT64) { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op indices tensor must be INT64, got " + + indices_name); + } + } + + int64_t batch_dims = 0; + for (int i = 0; i < nodeproto.attribute_size(); ++i) { + const auto& attr = nodeproto.attribute(i); + if (attr.name() == "batch_dims") { + batch_dims = attr.i(); + break; + } + } + + std::string output_name = nodeproto.output(0); + + std::unique_ptr op( + new ROperator_GatherND(batch_dims, input_name, indices_name, output_name)); + + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseGemm.cxx b/parsers/src/ParseGemm.cxx similarity index 100% rename from 
src/SOFIE_parsers/src/ParseGemm.cxx rename to parsers/src/ParseGemm.cxx diff --git a/src/SOFIE_parsers/src/ParseIdentity.cxx b/parsers/src/ParseIdentity.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseIdentity.cxx rename to parsers/src/ParseIdentity.cxx diff --git a/src/SOFIE_parsers/src/ParseIf.cxx b/parsers/src/ParseIf.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseIf.cxx rename to parsers/src/ParseIf.cxx diff --git a/src/SOFIE_parsers/src/ParseLSTM.cxx b/parsers/src/ParseLSTM.cxx similarity index 97% rename from src/SOFIE_parsers/src/ParseLSTM.cxx rename to parsers/src/ParseLSTM.cxx index b9dc165..a95ee01 100644 --- a/src/SOFIE_parsers/src/ParseLSTM.cxx +++ b/parsers/src/ParseLSTM.cxx @@ -46,7 +46,7 @@ ParserFuncSignature ParseLSTM = [](RModelParser_ONNX &parser, const onnx::NodePr } else if (attribute_name == "layout") { attr_layout = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseLayerNormalization.cxx b/parsers/src/ParseLayerNormalization.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseLayerNormalization.cxx rename to parsers/src/ParseLayerNormalization.cxx diff --git a/src/SOFIE_parsers/src/ParseLeakyRelu.cxx b/parsers/src/ParseLeakyRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseLeakyRelu.cxx rename to parsers/src/ParseLeakyRelu.cxx diff --git a/parsers/src/ParseLogic.cxx b/parsers/src/ParseLogic.cxx new file mode 100644 index 0000000..1609678 --- /dev/null +++ b/parsers/src/ParseLogic.cxx @@ -0,0 +1,181 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Logic.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +// 
───────────────────────────────────────────────────────────────────────────── +// Helper: parse a binary logical op (And / Or / Xor) +// +// ONNX spec: both inputs are bool; output is bool. +// In SOFIE, BOOL tensors are stored as uint8_t. +// ───────────────────────────────────────────────────────────────────────────── + +template +static std::unique_ptr ParseLogicalBinary(RModelParser_ONNX &parser, + const onnx::NodeProto &nodeproto) +{ + const std::string input_a = nodeproto.input(0); + const std::string input_b = nodeproto.input(1); + const std::string output = nodeproto.output(0); + + for (const auto &name : { input_a, input_b }) { + if (!parser.IsRegisteredTensorType(name)) + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser " + + LogicBinaryTrait::Name() + + ": input tensor '" + name + "' type not yet registered"); + ETensorType t = parser.GetTensorType(name); + if (t != ETensorType::BOOL && t != ETensorType::UINT8) + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser " + + LogicBinaryTrait::Name() + + ": input '" + name + "' must be bool, got " + + ConvertTypeToString(t)); + } + + std::unique_ptr op( + new ROperator_LogicBinary(input_a, input_b, output)); + + if (!parser.IsRegisteredTensorType(output)) + parser.RegisterTensorType(output, ETensorType::BOOL); + + return op; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helper: parse a binary bitwise op (BitwiseAnd / BitwiseOr / BitwiseXor) +// +// ONNX spec: inputs can be any integer type; output has same type. 
+// ───────────────────────────────────────────────────────────────────────────── + +template +static std::unique_ptr ParseBitwiseBinary(RModelParser_ONNX &parser, + const onnx::NodeProto &nodeproto) +{ + const std::string input_a = nodeproto.input(0); + const std::string input_b = nodeproto.input(1); + const std::string output = nodeproto.output(0); + + if (!parser.IsRegisteredTensorType(input_a)) + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser " + + LogicBinaryTrait::Name() + + ": input tensor '" + input_a + "' type not yet registered"); + + const ETensorType input_type = parser.GetTensorType(input_a); + + std::unique_ptr op; + switch (input_type) { + case ETensorType::INT8: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::UINT8: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::INT16: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::UINT16: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::INT32: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::UINT32: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::INT64: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + case ETensorType::UINT64: + op.reset(new ROperator_LogicBinary(input_a, input_b, output)); break; + default: + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser " + + LogicBinaryTrait::Name() + + ": unsupported input type " + ConvertTypeToString(input_type)); + } + + if (!parser.IsRegisteredTensorType(output)) + parser.RegisterTensorType(output, input_type); + + return op; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Logical binary parsers +// ───────────────────────────────────────────────────────────────────────────── + +ParserFuncSignature ParseAnd = [](RModelParser_ONNX 
&parser, const onnx::NodeProto &nodeproto) { + return ParseLogicalBinary(parser, nodeproto); +}; + +ParserFuncSignature ParseOr = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseLogicalBinary(parser, nodeproto); +}; + +ParserFuncSignature ParseXor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseLogicalBinary(parser, nodeproto); +}; + +// ───────────────────────────────────────────────────────────────────────────── +// Bitwise binary parsers +// ───────────────────────────────────────────────────────────────────────────── + +ParserFuncSignature ParseBitwiseAnd = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBitwiseBinary(parser, nodeproto); +}; + +ParserFuncSignature ParseBitwiseOr = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBitwiseBinary(parser, nodeproto); +}; + +ParserFuncSignature ParseBitwiseXor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseBitwiseBinary(parser, nodeproto); +}; + +// ───────────────────────────────────────────────────────────────────────────── +// BitwiseNot parser +// +// ONNX spec: any integer type; output same type as input. 
+// ───────────────────────────────────────────────────────────────────────────── + +ParserFuncSignature ParseBitwiseNot = [](RModelParser_ONNX &parser, + const onnx::NodeProto &nodeproto) + -> std::unique_ptr +{ + const std::string input_name = nodeproto.input(0); + const std::string output_name = nodeproto.output(0); + + if (!parser.IsRegisteredTensorType(input_name)) + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser BitwiseNot: input tensor '" + + input_name + "' type not yet registered"); + + const ETensorType input_type = parser.GetTensorType(input_name); + + std::unique_ptr op; + switch (input_type) { + case ETensorType::INT8: + op.reset(new ROperator_BitwiseNot (input_name, output_name)); break; + case ETensorType::UINT8: + op.reset(new ROperator_BitwiseNot (input_name, output_name)); break; + case ETensorType::INT16: + op.reset(new ROperator_BitwiseNot (input_name, output_name)); break; + case ETensorType::UINT16: + op.reset(new ROperator_BitwiseNot(input_name, output_name)); break; + case ETensorType::INT32: + op.reset(new ROperator_BitwiseNot (input_name, output_name)); break; + case ETensorType::UINT32: + op.reset(new ROperator_BitwiseNot(input_name, output_name)); break; + case ETensorType::INT64: + op.reset(new ROperator_BitwiseNot (input_name, output_name)); break; + case ETensorType::UINT64: + op.reset(new ROperator_BitwiseNot(input_name, output_name)); break; + default: + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser BitwiseNot: unsupported input type " + + ConvertTypeToString(input_type)); + } + + if (!parser.IsRegisteredTensorType(output_name)) + parser.RegisterTensorType(output_name, input_type); + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseMatMul.cxx b/parsers/src/ParseMatMul.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseMatMul.cxx rename to parsers/src/ParseMatMul.cxx diff --git a/parsers/src/ParseNot.cxx b/parsers/src/ParseNot.cxx new file mode 100644 index 0000000..ca315eb 
--- /dev/null +++ b/parsers/src/ParseNot.cxx @@ -0,0 +1,38 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Not.hxx" +#include "onnx_proto3.pb.h" + +namespace SOFIE { + +ParserFuncSignature ParseNot = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) +{ + ETensorType input_type = ETensorType::UNDEFINED; + + if (nodeproto.input_size() != 1 || nodeproto.output_size() != 1) + throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input or output size "); // reject wrong arity before input(0)/output(0) are read + + std::string input_name = nodeproto.input(0); + + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + if (input_type !=ETensorType::BOOL && input_type !=ETensorType::UINT8 ) + throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input type " + ConvertTypeToString(input_type)); + } else { + throw + std::runtime_error("TMVA::SOFIE ONNX Parser Not op has input tensor " + input_name + + " but its type is not yet registered"); + } + + std::string output_name = nodeproto.output(0); + std::unique_ptr op(new ROperator_Not(input_name, output_name)); + + // Infer the output type + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParsePad.cxx b/parsers/src/ParsePad.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParsePad.cxx rename to parsers/src/ParsePad.cxx diff --git a/src/SOFIE_parsers/src/ParsePool.cxx b/parsers/src/ParsePool.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParsePool.cxx rename to parsers/src/ParsePool.cxx diff --git a/src/SOFIE_parsers/src/ParseRNN.cxx b/parsers/src/ParseRNN.cxx similarity index 96% rename from src/SOFIE_parsers/src/ParseRNN.cxx rename to parsers/src/ParseRNN.cxx index d75b577..2d20e15 100644 --- a/src/SOFIE_parsers/src/ParseRNN.cxx +++ b/parsers/src/ParseRNN.cxx @@ -43,7 +43,7 @@ ParserFuncSignature ParseRNN =
[](RModelParser_ONNX &parser, const onnx::NodePro } else if (attribute_name == "layout") { attr_layout = nodeproto.attribute(i).i(); } else { - std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " + std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode " << nodeproto.name() << " is not defined in ONNX IR and not applied!\n"; } } diff --git a/src/SOFIE_parsers/src/ParseRandom.cxx b/parsers/src/ParseRandom.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRandom.cxx rename to parsers/src/ParseRandom.cxx diff --git a/src/SOFIE_parsers/src/ParseRange.cxx b/parsers/src/ParseRange.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRange.cxx rename to parsers/src/ParseRange.cxx diff --git a/src/SOFIE_parsers/src/ParseReduce.cxx b/parsers/src/ParseReduce.cxx similarity index 86% rename from src/SOFIE_parsers/src/ParseReduce.cxx rename to parsers/src/ParseReduce.cxx index 45696a5..63ff834 100644 --- a/src/SOFIE_parsers/src/ParseReduce.cxx +++ b/parsers/src/ParseReduce.cxx @@ -21,6 +21,10 @@ std::unique_ptr ParseReduce(RModelParser_ONNX &parser, const onnx::No op_mode = ReduceProd; else if (nodeproto.op_type() == "ReduceSum") op_mode = ReduceSum; + else if (nodeproto.op_type() == "ReduceL2") + op_mode = ReduceL2; + else if (nodeproto.op_type() == "ReduceMax") + op_mode = ReduceMax; if (op_mode == InvalidReduceOp) { throw std::runtime_error("TMVA::SOFIE - Reduce op mode not supported."); @@ -91,5 +95,15 @@ ParserFuncSignature ParseReduceSum = [](RModelParser_ONNX &parser, const onnx::N return ParseReduce(parser, nodeproto); }; +// Parse ReduceL2 +ParserFuncSignature ParseReduceL2 = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseReduce(parser, nodeproto); +}; + +// Parse ReduceMax +ParserFuncSignature ParseReduceMax = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + return ParseReduce(parser, nodeproto); 
+}; + } // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseRelu.cxx b/parsers/src/ParseRelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseRelu.cxx rename to parsers/src/ParseRelu.cxx diff --git a/src/SOFIE_parsers/src/ParseReshape.cxx b/parsers/src/ParseReshape.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseReshape.cxx rename to parsers/src/ParseReshape.cxx diff --git a/src/SOFIE_parsers/src/ParseScatterElements.cxx b/parsers/src/ParseScatterElements.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseScatterElements.cxx rename to parsers/src/ParseScatterElements.cxx diff --git a/src/SOFIE_parsers/src/ParseSelu.cxx b/parsers/src/ParseSelu.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSelu.cxx rename to parsers/src/ParseSelu.cxx diff --git a/src/SOFIE_parsers/src/ParseShape.cxx b/parsers/src/ParseShape.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseShape.cxx rename to parsers/src/ParseShape.cxx diff --git a/src/SOFIE_parsers/src/ParseSigmoid.cxx b/parsers/src/ParseSigmoid.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSigmoid.cxx rename to parsers/src/ParseSigmoid.cxx diff --git a/src/SOFIE_parsers/src/ParseSlice.cxx b/parsers/src/ParseSlice.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSlice.cxx rename to parsers/src/ParseSlice.cxx diff --git a/src/SOFIE_parsers/src/ParseSoftmax.cxx b/parsers/src/ParseSoftmax.cxx similarity index 91% rename from src/SOFIE_parsers/src/ParseSoftmax.cxx rename to parsers/src/ParseSoftmax.cxx index aea042e..19bd57a 100644 --- a/src/SOFIE_parsers/src/ParseSoftmax.cxx +++ b/parsers/src/ParseSoftmax.cxx @@ -24,7 +24,7 @@ ParserFuncSignature ParseSoftmax = [](RModelParser_ONNX &parser, const onnx::Nod attr_axis = nodeproto.attribute(0).i(); switch (input_type) { - case ETensorType::FLOAT: op.reset(new ROperator_Softmax(attr_axis, input_name, output_name)); break; + case ETensorType::FLOAT: op.reset(new 
ROperator_Softmax(attr_axis, input_name, output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Softmax does not yet support input type " + std::to_string(static_cast(input_type))); diff --git a/src/SOFIE_parsers/src/ParseSplit.cxx b/parsers/src/ParseSplit.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseSplit.cxx rename to parsers/src/ParseSplit.cxx diff --git a/src/SOFIE_parsers/src/ParseTanh.cxx b/parsers/src/ParseTanh.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTanh.cxx rename to parsers/src/ParseTanh.cxx diff --git a/src/SOFIE_parsers/src/ParseTile.cxx b/parsers/src/ParseTile.cxx similarity index 93% rename from src/SOFIE_parsers/src/ParseTile.cxx rename to parsers/src/ParseTile.cxx index 20dbfb6..8b8c47f 100644 --- a/src/SOFIE_parsers/src/ParseTile.cxx +++ b/parsers/src/ParseTile.cxx @@ -29,6 +29,7 @@ ParserFuncSignature ParseTile = [](RModelParser_ONNX &parser, const onnx::NodePr switch (input_type) { case ETensorType::FLOAT: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + case ETensorType::INT64: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Tile does not yet support input type " + std::to_string(static_cast(input_type))); diff --git a/src/SOFIE_parsers/src/ParseTopK.cxx b/parsers/src/ParseTopK.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTopK.cxx rename to parsers/src/ParseTopK.cxx diff --git a/src/SOFIE_parsers/src/ParseTranspose.cxx b/parsers/src/ParseTranspose.cxx similarity index 100% rename from src/SOFIE_parsers/src/ParseTranspose.cxx rename to parsers/src/ParseTranspose.cxx diff --git a/parsers/src/ParseTrilu.cxx b/parsers/src/ParseTrilu.cxx new file mode 100644 index 0000000..c196caf --- /dev/null +++ b/parsers/src/ParseTrilu.cxx @@ -0,0 +1,67 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_Trilu.hxx" 
+#include "onnx_proto3.pb.h" + +namespace SOFIE { + +ParserFuncSignature ParseTrilu = [](RModelParser_ONNX &parser, + const onnx::NodeProto &nodeproto) + -> std::unique_ptr +{ + // ── Validate primary input ───────────────────────────────────────────── + const std::string input_name = nodeproto.input(0); + if (!parser.IsRegisteredTensorType(input_name)) + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser Trilu: input tensor '" + input_name + + "' type not yet registered"); + + const ETensorType input_type = parser.GetTensorType(input_name); + const std::string output_name = nodeproto.output(0); + + // ── Parse 'upper' attribute (default 1) ─────────────────────────────── + int attr_upper = 1; + for (int i = 0; i < nodeproto.attribute_size(); ++i) { + if (nodeproto.attribute(i).name() == "upper") + attr_upper = static_cast(nodeproto.attribute(i).i()); + } + + // ── Optional k input (second input, scalar int64) ───────────────────── + std::string k_name; + if (nodeproto.input_size() > 1 && !nodeproto.input(1).empty()) { + k_name = nodeproto.input(1); + // Register k tensor type if not yet seen (it is always int64). 
+ if (!parser.IsRegisteredTensorType(k_name)) + parser.RegisterTensorType(k_name, ETensorType::INT64); + } + + // ── Create operator (templated on the primary input type) ────────────── + std::unique_ptr op; + + auto make_op = [&]() { + if (k_name.empty()) + op.reset(new ROperator_Trilu(attr_upper, input_name, output_name)); + else + op.reset(new ROperator_Trilu(attr_upper, input_name, k_name, output_name)); + }; + + switch (input_type) { + case ETensorType::FLOAT: make_op.template operator()(); break; + case ETensorType::DOUBLE: make_op.template operator()(); break; + case ETensorType::INT32: make_op.template operator()(); break; + case ETensorType::INT64: make_op.template operator()(); break; + case ETensorType::UINT8: make_op.template operator()(); break; + case ETensorType::BOOL: make_op.template operator()(); break; + default: + throw std::runtime_error( + "TMVA::SOFIE ONNX Parser Trilu: unsupported input type " + + std::to_string(static_cast(input_type))); + } + + // ── Register output type ─────────────────────────────────────────────── + if (!parser.IsRegisteredTensorType(output_name)) + parser.RegisterTensorType(output_name, input_type); + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseWhere.cxx b/parsers/src/ParseWhere.cxx similarity index 80% rename from src/SOFIE_parsers/src/ParseWhere.cxx rename to parsers/src/ParseWhere.cxx index ea73cff..636c7e2 100644 --- a/src/SOFIE_parsers/src/ParseWhere.cxx +++ b/parsers/src/ParseWhere.cxx @@ -11,6 +11,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has invalid input size"); } // condition boolean vector is input 0 + if (!parser.IsRegisteredTensorType(nodeproto.input(0))){ + throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " + nodeproto.input(0) + + " but its type is not yet registered"); + } if (!parser.IsRegisteredTensorType(nodeproto.input(1))){ throw 
std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " + nodeproto.input(1) + " but its type is not yet registered"); @@ -31,10 +35,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP switch (input_type) { case ETensorType::FLOAT: - op.reset(new ROperator_Where(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name)); + op.reset(new ROperator_Where(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name)); break; case ETensorType::INT64: - op.reset(new ROperator_Where(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name)); + op.reset(new ROperator_Where(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name)); break; default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Where Operator does not yet support input type " + diff --git a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx b/parsers/src/RModelParser_ONNX.cxx similarity index 84% rename from src/SOFIE_parsers/src/RModelParser_ONNX.cxx rename to parsers/src/RModelParser_ONNX.cxx index 68662ae..afb8b93 100644 --- a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx +++ b/parsers/src/RModelParser_ONNX.cxx @@ -1,4 +1,3 @@ -#include "Byteswap.h" #include "SOFIE/RModelParser_ONNX.hxx" #include "onnx_proto3.pb.h" @@ -8,7 +7,12 @@ #include #include #include +#include #include +#include +#include +#include +#include #include "SOFIE/SOFIE_common.hxx" @@ -24,6 +28,10 @@ extern ParserFuncSignature ParseLog; extern ParserFuncSignature ParseSin; extern ParserFuncSignature ParseCos; extern ParserFuncSignature ParseAbs; +extern ParserFuncSignature ParseSoftplus; +extern ParserFuncSignature ParseAtan; +extern ParserFuncSignature ParseFloor; + // Binary operators extern ParserFuncSignature ParseAdd; extern ParserFuncSignature ParseSub; @@ -41,11 +49,18 @@ extern ParserFuncSignature ParseLess; extern ParserFuncSignature ParseLessEq; extern ParserFuncSignature ParseGreater; extern ParserFuncSignature 
ParseGreaterEq; +//Is Operators +extern ParserFuncSignature ParseIsInf; +extern ParserFuncSignature ParseIsNaN; +extern ParserFuncSignature ParseNot; +extern ParserFuncSignature ParseClip; // Reduce operators extern ParserFuncSignature ParseReduceMean; extern ParserFuncSignature ParseReduceSum; extern ParserFuncSignature ParseReduceSumSquare; extern ParserFuncSignature ParseReduceProd; +extern ParserFuncSignature ParseReduceL2; +extern ParserFuncSignature ParseReduceMax; // Others extern ParserFuncSignature ParseBatchNormalization; extern ParserFuncSignature ParseConstant; @@ -73,6 +88,7 @@ extern ParserFuncSignature ParseShape; extern ParserFuncSignature ParseMatMul; extern ParserFuncSignature ParseLayerNormalization; extern ParserFuncSignature ParseGather; +extern ParserFuncSignature ParseGatherND; extern ParserFuncSignature ParseErf; extern ParserFuncSignature ParseElu; extern ParserFuncSignature ParseEyeLike; @@ -86,6 +102,14 @@ extern ParserFuncSignature ParseWhere; extern ParserFuncSignature ParseEinsum; extern ParserFuncSignature ParseRandom; extern ParserFuncSignature ParseScatterElements; +extern ParserFuncSignature ParseTrilu; +extern ParserFuncSignature ParseAnd; +extern ParserFuncSignature ParseOr; +extern ParserFuncSignature ParseXor; +extern ParserFuncSignature ParseBitwiseAnd; +extern ParserFuncSignature ParseBitwiseOr; +extern ParserFuncSignature ParseBitwiseXor; +extern ParserFuncSignature ParseBitwiseNot; // Declaration of fused operators extern ParserFuseFuncSignature ParseFuseConvAdd; extern ParserFuseFuncSignature ParseFuseGemmRelu; @@ -132,18 +156,31 @@ struct ExtractDataFromTP { static_cast(data)); } }; +// Reverse the bytes of a trivially-copyable value (used on big-endian hosts). +// ONNX raw_data is always stored in little-endian order. 
+template +static T bswap_value(T value) noexcept { + static_assert(std::is_trivially_copyable_v); + std::array bytes; + std::memcpy(bytes.data(), &value, sizeof(T)); + std::reverse(bytes.begin(), bytes.end()); + T result; + std::memcpy(&result, bytes.data(), sizeof(T)); + return result; +} + template std::shared_ptr GetInitializedTensorData(onnx::TensorProto * tensorproto, size_t length) { + std::cout<<"Getting Initialized Tensor data for tensor " << tensorproto->name() << " of type " << tensorproto->data_type() << " and length " << length << std::endl; std::shared_ptr data(malloc(length * sizeof(T)), free); if (!tensorproto->raw_data().empty()) { -#ifdef R__BYTESWAP std::memcpy(data.get(), tensorproto->raw_data().c_str(), length * sizeof(T)); -#else - for (std::size_t k = 0; k < length; ++k) - (reinterpret_cast::value_type *>(data.get()))[k] = - RByteSwap::bswap((reinterpret_cast::value_type *>(tensorproto->raw_data().c_str()))[k]); -#endif + if constexpr (std::endian::native != std::endian::little) { + T *ptr = static_cast(data.get()); + for (std::size_t k = 0; k < length; ++k) + ptr[k] = bswap_value(ptr[k]); + } } else { ExtractDataFromTP::Copy(tensorproto, data.get()); } @@ -162,6 +199,10 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("Sin", ParseSin); RegisterOperator("Cos", ParseCos); RegisterOperator("Abs", ParseAbs); + RegisterOperator("Softplus", ParseSoftplus); + RegisterOperator("Atan", ParseAtan); + RegisterOperator("Floor", ParseFloor); + // Binary operators RegisterOperator("Add", ParseAdd); RegisterOperator("Sub", ParseSub); @@ -179,11 +220,18 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LessOrEqual", ParseLessEq); RegisterOperator("Greater", ParseGreater); RegisterOperator("GreaterOrEqual", ParseGreaterEq); + // Is / Not operators + RegisterOperator("IsInf", ParseIsInf); + RegisterOperator("IsNaN", ParseIsNaN); + RegisterOperator("Not", 
ParseNot); + RegisterOperator("Clip", ParseClip); // Reduce operators RegisterOperator("ReduceMean", ParseReduceMean); RegisterOperator("ReduceSum", ParseReduceSum); RegisterOperator("ReduceSumSquare", ParseReduceSumSquare); RegisterOperator("ReduceProd", ParseReduceProd); + RegisterOperator("ReduceL2", ParseReduceL2); + RegisterOperator("ReduceMax", ParseReduceMax); // Others RegisterOperator("BatchNormalization", ParseBatchNormalization); RegisterOperator("Constant", ParseConstant); @@ -217,6 +265,7 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LayerNormalization", ParseLayerNormalization); RegisterOperator("Expand", ParseExpand); RegisterOperator("Gather", ParseGather); + RegisterOperator("GatherND", ParseGatherND); RegisterOperator("Erf", ParseErf); RegisterOperator("Elu", ParseElu); RegisterOperator("EyeLike", ParseEyeLike); @@ -233,6 +282,16 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("RandomUniform", ParseRandom); RegisterOperator("RandomUniformLike", ParseRandom); RegisterOperator("ScatterElements", ParseScatterElements); + RegisterOperator("Trilu", ParseTrilu); + // Logical operators + RegisterOperator("And", ParseAnd); + RegisterOperator("Or", ParseOr); + RegisterOperator("Xor", ParseXor); + // Bitwise operators + RegisterOperator("BitwiseAnd", ParseBitwiseAnd); + RegisterOperator("BitwiseOr", ParseBitwiseOr); + RegisterOperator("BitwiseXor", ParseBitwiseXor); + RegisterOperator("BitwiseNot", ParseBitwiseNot); } // Destructor of the parser @@ -584,6 +643,13 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & if (verbose) std::cout << "add INT64 initialized tensor " << input_name << " shape " << ConvertShapeToString(shape) << std::endl; rmodel.AddInitializedTensor(input_name, ETensorType::INT64, shape, data); allInitializedTensors[input_name] = i; + std::cout<<"Printing initialized values for tensor: 
"<(data.get()); + + for (size_t i = 0; i < fLength; ++i) { + std::cout << rawData[i] << " "; + } + std::cout << std::endl; break; } default: @@ -613,6 +679,18 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & nodesOrder.reserve(graph.node_size()); std::vector foundNodes(graph.node_size()); + // Pre-compute the set of all tensor names that belong to THIS graph: + // graph inputs, initializers, and node outputs. A tensor is an "outer-scope + // reference" (from an enclosing graph) only if it is NOT in this set. + std::unordered_set graphLocalTensors; + for (int i = 0; i < graph.input_size(); i++) + graphLocalTensors.insert(graph.input(i).name()); + for (int i = 0; i < graph.initializer_size(); i++) + graphLocalTensors.insert(graph.initializer(i).name()); + for (int i = 0; i < graph.node_size(); i++) + for (int j = 0; j < graph.node(i).output_size(); j++) + graphLocalTensors.insert(graph.node(i).output(j)); + // loop at graph inputs std::map allInputs; for (int i = 0; i < graph.input_size(); i++) { @@ -633,13 +711,22 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::string name = graph.node(i).input(j); // skip empty names if (!name.empty()) { - existInputs &= (allInputs.find(name) != allInputs.end() || - allInitializedTensors.find(name) != allInitializedTensors.end()); + // A tensor is available if it is: a graph input/previously computed node output + // (allInputs), an initializer (allInitializedTensors), or an outer-scope tensor + // referenced from a subgraph. Outer-scope means: registered in the parser's type + // map AND not produced by any node/input/initializer of the current graph. The + // second condition prevents cross-model contamination from prior parsing passes. 
+ bool isOuterScope = !graphLocalTensors.count(name) && IsRegisteredTensorType(name); + bool available = (allInputs.find(name) != allInputs.end() || + allInitializedTensors.find(name) != allInitializedTensors.end() || + isOuterScope); + existInputs &= available; if (fVerbose) { std::cout << "\t\t input " << name << " " << bool(allInputs.find(name) != allInputs.end()) << " " << bool(allInitializedTensors.find(name) != allInitializedTensors.end()) << " " << - existInputs << std::endl; + bool(isOuterScope) << " " + << existInputs << std::endl; } } } @@ -720,7 +807,11 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & } // we have to record order of node execution separately to - // account for fused operators + // account for fused operators. + // Save and restore fFusedOperators around the parsing loop so that + // recursive ParseONNXGraph calls (for If/Loop subgraphs) do not + // corrupt the parent graph's fused-operator bookkeeping. + auto savedFusedOperators = std::move(fFusedOperators); size_t node_order_exec = 0; fFusedOperators = std::vector(graph.node_size(), false); for (int i = 0; i < graph.node_size(); i++) { @@ -730,7 +821,7 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::cout << "\t" << i << " " << nodesOrder[i] << " parsing operator " << op_type << std::endl; } - std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]); + std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]); if (!op) { if (verbose) { std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl; @@ -738,9 +829,19 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & // for skipping the fused nodes like Add after MatMul continue; } + // assign operator name for profiling + const auto &nodeproto = graph.node(nodesOrder[i]); + op->fName = nodeproto.name(); + if (op->fName.empty()) { + op->fName = 
nodeproto.op_type() + "_" + std::to_string(i); + } rmodel.AddOperator(std::move(op), node_order_exec++); } + // Restore the parent graph's fFusedOperators (may have been saved as empty + // for the top-level call, which is fine — we're done with the loop). + fFusedOperators = std::move(savedFusedOperators); + std::vector outputnames; if (verbose) std::cout << "\nParsing Graph output list\n"; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt deleted file mode 100644 index c48e8d1..0000000 --- a/src/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 1995-2019, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. - -set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) - -add_subdirectory(SOFIE_core) -add_subdirectory(SOFIE_parsers) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx deleted file mode 100644 index edbec58..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef SOFIE_ROPERATOR -#define SOFIE_ROPERATOR - -#include -#include - -#include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - - - -namespace SOFIE{ - -class RModel; - -class ROperator{ - - -public: - virtual std::vector GetBlasRoutines() { return {}; } - virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; - virtual void Initialize(RModel&) = 0; - virtual std::string Generate(std::string OpName) = 0; //expect unique opName for each operator within the same RModel - // generate initialization code for session constructor - virtual std::string GenerateInitCode() { return "";} - // generate some specific declaration code for Session - virtual std::string GenerateDeclCode() { return "";} - // generate session data members specific to operator - virtual std::string 
GenerateSessionMembersCode(std::string /*opName*/) { return ""; } - virtual std::string Header() { return "";} - - //virtual void Forward_reference() = 0; - //virtual void Forward_blas() = 0; - virtual ~ROperator(){} - -protected: - - const std::string SP = " "; ///< space used to correctly indent the generated C++ code - bool fUseSession = false; ///< flag to identify if using the session class - bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; - -public: - std::span GetOpInputTensors() const { - return fInputTensorNames; - } - - std::span GetOpOutputTensors() const { - return fOutputTensorNames; - } - -}; - - - -}//SOFIE - - -#endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx deleted file mode 100644 index 127eaff..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ /dev/null @@ -1,216 +0,0 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - -namespace SOFIE{ - -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; - -template -struct BinaryOperatorTrait {}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Add"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, 
const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T t2) { return t1 * t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} -}; - -template -struct BinaryOperatorTrait { - static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} -}; - -template -class ROperator_BasicBinary final : public ROperator{ -private: - - std::string fNA; - std::string fNB; - std::string fNBroadcastedA; - std::string fNBroadcastedB; - std::string fNY; - - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeY; - -public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - // assume now inputs have same shape (no broadcasting) - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ - throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); - 
} - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); - } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - 
model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); - } - } - } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); - } - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); - } - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string OpName) override { - - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); - std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << 
"SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { - if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; - } else { - return {}; - } - } -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx deleted file mode 100644 index c18c17e..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ /dev/null @@ -1,121 +0,0 @@ -#ifndef SOFIE_ROPERATOR_BASIC_UNARY -#define SOFIE_ROPERATOR_BASIC_UNARY - -#include -#include -#include - - -namespace SOFIE { - -enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs }; - -template -struct UnaryOpTraits { -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Reciprocal"; } - static std::string Op(const std::string &X) { return "1/" + X; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Sqrt"; } - static std::string Op(const std::string &X) { 
return "std::sqrt(" + X + ")"; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Neg"; } - static std::string Op(const std::string &X) { return "-" + X; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Exp"; } - static std::string Op(const std::string &X) { return "std::exp(" + X + ")"; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Log"; } - static std::string Op(const std::string &X) { return "std::log(" + X + ")"; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Sin"; } - static std::string Op(const std::string &X) { return "std::sin(" + X + ")"; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Cos"; } - static std::string Op(const std::string &X) { return "std::cos(" + X + ")"; } -}; - -template -struct UnaryOpTraits { - static std::string Name() { return "Abs"; } - static std::string Op(const std::string &X) { return "std::abs(" + X + ")"; } -}; - -template -class ROperator_BasicUnary final : public ROperator { -private: - std::string fNX; - std::string fNY; - - std::vector fShapeX; - std::vector fShapeY; - -public: - ROperator_BasicUnary() {} - - ROperator_BasicUnary(std::string nameX, std::string nameY) - : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector> ShapeInference(std::vector> input) override { return input; } - - std::vector TypeInference(std::vector input) override { return input; } - - void Initialize(RModel& model) override { - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); - } - fShapeX = model.GetTensorShape(fNX); - fShapeY = ShapeInference({fShapeX})[0]; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - - std::string Generate(std::string OpName) override - { - OpName = "op_" + OpName; - 
std::stringstream out; - - out << SP << "\n//---- Operator" << UnaryOpTraits::Name() << " " << OpName << "\n"; - size_t length = ConvertShapeToLength(fShapeX); - out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n"; - out << SP << SP << "tensor_" << fNY << "[i] = " << UnaryOpTraits::Op("tensor_" + fNX + "[i]") << ";\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { - if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { - return { std::string("cmath") }; - } else { - return {}; - } - } -}; - -} // namespace SOFIE - -#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx deleted file mode 100644 index 47c3d66..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Cast -#define SOFIE_ROPERATOR_Cast - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - -class ROperator_Cast final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - std::string fAttrType = "float"; - -public: - ROperator_Cast(){} - ROperator_Cast(std::string attr_type,std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), - fAttrType(attr_type) { - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model"); - } - fShape = 
model.GetTensorShape(fNX); - // shoud we add a check if the same type - auto inputType = model.GetTensorType(fNX); - if (model.IsInitializedTensor(fNX)) { - fIsOutputConstant = true; - auto inputData = model.GetInitializedTensorData(fNX); - if (ConvertStringToType(fAttrType) == ETensorType::INT64) { - model.AddConstantTensor(fNY, fShape, static_cast(inputData.get())); - model.SetNotWritableInitializedTensor(fNX); - } - else - fIsOutputConstant = false; - } - if (!fIsOutputConstant) - model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape); - if (model.Verbose()) { - std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY; - if (fIsOutputConstant) std::cout << " (constant) "; - std::cout << std::endl; - } - } - - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - - // out << SP << ETensorType << " " << OpName << "_attr = " << fattr << ";\n"; - out << "\n//------ CAST\n"; - // no generated code for constant outputs - if (fIsOutputConstant) return out.str(); - - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - - out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< fAttrType << ">(tensor_" << fNX << "[id]);\n"; - - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_Cast diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx deleted file mode 100644 index 0d5e574..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat - - - #include "SOFIE/SOFIE_common.hxx" - #include "SOFIE/ROperator.hxx" - #include 
"SOFIE/RModel.hxx" - - #include - #include - #include - #include - #include - - namespace SOFIE{ - - class ROperator_Concat final : public ROperator - { - private: - int fAxis=0; - int fnewAxis=0; - std::vector fInputs; - std::string fOutput; - std::vectorfOutputShape; - std::vector> fInputShapes; - - public: - ROperator_Concat(){} - ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): - fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { - fInputs.reserve(inputs.size()); - for (auto & name : inputs) - fInputs.push_back(UTILITY::Clean_name(name)); - - fInputTensorNames.resize(fInputs.size()); - std::transform(fInputs.begin(), fInputs.end(), fInputTensorNames.begin(), - [](const std::string& s) -> std::string_view { return s; }); - fOutputTensorNames = { fOutput }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - // get shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(std::vector> inputs) override { - std::vector> ret(1); - // treat negative axis case - if (fAxis<0) { - fAxis = inputs[0].size()+fAxis; - } - if (fAxis < 0 || fAxis >= (int) inputs[0].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - - int concat_dim=0; - if(fnewAxis == 0){ - for (size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i - 1].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + - ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int)iaxis == fAxis) - concat_dim += inputs[i][iaxis]; - else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis]) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertShapeToString(inputs[i]) + " and " + - ConvertShapeToString(inputs[i - 1])); - } - } - - // output shape - 
ret[0] = inputs[0]; - ret[0][fAxis] = concat_dim; - } - std::vector stack; - if(fnewAxis == 1){ - for(size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i-1].size() ) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int) iaxis == fAxis) - stack.push_back(inputs[i][iaxis]); - else - if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis]) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1])); - } - - } - for(auto it:stack) - ret[0].push_back(it); - } - - return ret; - } - - // get shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); - // treat negative axis case - if (fAxis<0) { - fAxis = inputs[0].size()+fAxis; - } - if (fAxis < 0 || fAxis >= (int) inputs[0].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - - int concat_dim=0; - if(fnewAxis == 0){ - for (size_t i = 0; i < inputs.size(); i++) { - if (i > 0 && inputs[i].size() != inputs[i - 1].size()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); - for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { - if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. 
Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; - } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) - throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); - } - } - - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; - } - // case of stacking (not supported yet) - // here we need to check that input shapes are the same - // for example for fAxis == 0 - // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....] - if(fnewAxis == 1){ - throw std::runtime_error("TMVA SOFIE Concat Op - stacking (i.e. COncatFromSequence with new_axis=1) is not supported "); - } - return ret; - } - - void Initialize(RModel& model) override { - for (auto &it : fInputs) { - if (model.CheckIfTensorAlreadyExist(it) == false) { - throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); - } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); - } - fOutputShape = ShapeInference(fInputShapes)[0]; - if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; - - // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) - if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { - fIsOutputConstant = true; - for ( auto & input : fInputs) { - if (!model.IsInitializedTensor(input)) { - fIsOutputConstant = false; - break; - } - } - if (fIsOutputConstant) { - auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible - std::vector outputData(ConvertShapeToLength(outputShape)); - size_t offset = 0; - for ( auto & input : fInputs) { - auto inputData = static_cast(model.GetInitializedTensorData(input).get()); - auto inputShape = 
model.GetTensorShape(input); // shape is not dynamic if it is constant - size_t inputLength = ConvertShapeToLength(inputShape); - std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); - offset += inputLength; - // data do not need to be written as a weight - model.SetNotWritableInitializedTensor(input); - } - model.AddConstantTensor(fOutput, outputShape, outputData.data()); - if (model.Verbose()) { - std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; - } - } - } - if (!fIsOutputConstant) { - model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); - if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << ConvertDynamicShapeToString(fOutputShape) << std::endl; - } - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; - if(fOutputShape.empty()){ - throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); - } - std::stringstream out; - out<<"\n//--------- Concat\n"; - // special case when memory is contiguous - bool hasShapeOnes = true; - for(int i = 0; i 0) out << offset; - offset += " + " + length; - out << ");\n"; - } - } - else { - - std::vector outStride = UTILITY::ComputeStrideFromShape(fOutputShape); - std::vector> inStrides(fInputs.size()); - int idx = 0; - for ( auto &s : inStrides) { - s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]); - idx++; - } - for (int i = 0; i < fAxis; ++i) { - // loop on dimensions - out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n"; - } - - out << SP << SP << SP << "int idxOut = "; - for (int k = 0; k < fAxis; k++) { - if (k > 0) out << " + "; - out << outStride[k].GetVal() << "*i" << k; - } - out << ";\n"; - - for (size_t j = 0; j < fInputs.size(); j++) { - if (j>0) - out << SP << SP 
<< SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; - out << SP << SP << SP << "int idxIn" << j <<" = "; - for (int k = 0; k < fAxis; k++) { - if (k > 0) out << " + "; - out << inStrides[j][k].GetVal() << "*i" << k; - } - out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; - out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; - out << SP << SP << SP << "}\n"; - // concatenate the axis values - } - for (int i = 0; i < fAxis; ++i) { - out << SP << "}\n"; - } - } - - return out.str(); - } - }; - }//SOFIE - - #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx deleted file mode 100644 index 15ca91e..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx +++ /dev/null @@ -1,531 +0,0 @@ -#ifndef SOFIE_SOFIE_ROPERATOR_CONV -#define SOFIE_SOFIE_ROPERATOR_CONV - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include -#include -#include -#include - - -namespace SOFIE { - -template -class ROperator_Conv final : public ROperator -{ -private: - std::string fAttrAutopad; - std::vector fAttrDilations; - size_t fAttrGroup; - std::vector fAttrKernelShape; - std::vector fAttrPads; - std::vector fAttrStrides; - - std::string fNX; - std::string fNW; - std::string fNB; - std::string fNB2; // bias tensor name after broadcasting - std::string fNY; - - std::string convK; - std::string imcol; - - std::vector fShapeX; - std::vector fShapeW; - std::vector fShapeB; - std::vector fShapeY; - - std::string fType; - - size_t fDim; // dimension of the convolution - - -public: - - ROperator_Conv() {} - - ROperator_Conv(std::string autopad, std::vector dilations, - size_t group, std::vector kernelShape, std::vector pads, - std::vector strides, std::string nameX, std::string nameW, - 
std::string nameB, std::string nameY): - fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), - fAttrPads(pads), fAttrStrides(strides), - fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), - fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value) { - fType = "float"; - } else { - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); - } - fInputTensorNames = { fNX, fNB }; - fOutputTensorNames = { fNY }; - } - - ROperator_Conv(std::string autopad, std::vector dilations, - size_t group, std::vector kernelShape, std::vector pads, - std::vector strides, std::string nameX, std::string nameW, - std::string nameY): - fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape), - fAttrPads(pads), fAttrStrides(strides), - fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value) { - fType = "float"; - } else { - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator"); - } - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - ETensorType out = input[0]; - return {out}; - } - - // function returning output shape given input - std::vector> ShapeInference(std::vector> input) override { - // shape of convolution input has to be (according to ONNX): N x C x H x W - // Where N : batch size, C : input channels, H : input height, W : input width - - if (input.size() > 3 ) { - throw - std::runtime_error("TMVA SOFIE Conv Op Shape inference need 2 or 3 input tensors"); - } - for(size_t i = 0; i < input.size(); i++) { - if (input[i].size() -2 != fDim) { - throw - std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid inputs "); - } - } - - if (fAttrGroup == 0) { - fAttrGroup = input[0][1] / input[1][1]; - } - - // 
kernel shape - size_t k1 = ((fAttrKernelShape.empty())? input[1][2] : fAttrKernelShape[0]); - size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? input[1][3] : fAttrKernelShape[1]) : 1; - size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? input[1][4] : fAttrKernelShape[2]) : 1; - - - size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1; - size_t i2 = (fDim > 2) ? 4 : 3; - size_t i3 = 5; - - if (fAttrDilations.empty()) { - fAttrDilations = {1, 1, 1}; - } - fAttrDilations.resize(3); - if (fDim < 3) { - fAttrDilations.resize(3, 1); - } - // Shape of the kernel - fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1), - k2 + (fAttrDilations[1] - 1) * (k2 - 1), - k3 + (fAttrDilations[2] - 1) * (k3 - 1)}; - - if (fAttrAutopad == "NOTSET") { - if (fAttrPads.empty()) { - fAttrPads = {1, 1, 1, 1, 1, 1}; - } - } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") { - if (fDim == 1) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[0] / 2}; - else if (fDim == 2) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2}; - else if (fDim == 3) - fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2, - fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2}; - // add extra padding at beginning or end (depending if SAME_UPPER or SAME_LOWER) - // need to check this! - if (fAttrKernelShape[0] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? fAttrPads[0]++ : fAttrPads[i1]++; - } - if (fDim > 1 && fAttrKernelShape[1] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? fAttrPads[1]++ : fAttrPads[i2]++; - } - if (fDim > 2 && fAttrKernelShape[2] % 2 == 1) { - (fAttrAutopad == "SAME_UPPER") ? 
fAttrPads[2]++ : fAttrPads[i3]++; - } - } else if (fAttrAutopad != "VALID") { - throw - std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad"); - } - // to be sure pad is vector of size 6 - if (fDim < 3) fAttrPads.resize(6, 0); - - if (fAttrStrides.empty()) { - fAttrStrides = {1, 1, 1}; - } - if (fDim < 3) - fAttrStrides.resize(3, 1); - - - size_t input1 = input[0][2]; - size_t input2 = (fDim > 1) ? input[0][3] : 1; - size_t input3 = (fDim > 2) ? input[0][4] : 1; - - size_t pad1 = fAttrPads[0] + fAttrPads[i1]; - size_t output1 = (input1 + pad1 - fAttrKernelShape[0]) / fAttrStrides[0] + 1; - - size_t batch_size = input[0][0]; // first element in input tensor - size_t output_channels = input[1][0]; // first element in weight tensor - - std::vector> ret({{ batch_size, output_channels, output1 }}); - - if (fDim == 1) - return ret; - - size_t pad2 = fAttrPads[1] + fAttrPads[i2]; - size_t output2 = (input2 + pad2 - fAttrKernelShape[1]) / fAttrStrides[1] + 1; - // output is N x M x OH x OW - ret[0].push_back(output2); - if (fDim == 2) - return ret; - - size_t pad3 = fAttrPads[2] + fAttrPads[i3]; - size_t output3 = (input3 + pad3 - fAttrKernelShape[2] ) / fAttrStrides[2] + 1; - - // output is N x M x OH x OW x OD - ret[0].push_back(output3); - return ret; - } - - void Initialize(RModel& model) override { - fUseSession = model.UseSession(); - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw - std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - if (fShapeX.size() < 3 || fShapeX.size() > 5) { - std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl; - throw - std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions"); - } - fDim = fShapeX.size() - 2; - if (!model.CheckIfTensorAlreadyExist(fNW)) { - throw - std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model"); - } - fShapeW = 
model.GetTensorShape(fNW); - if (fShapeW.size() < 3 || fShapeW.size() > 5) { - std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl; - throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions"); - } - fShapeY = ShapeInference({fShapeX, fShapeW})[0]; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - if (fNB != "") { - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw - std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); - } - fShapeB = model.GetTensorShape(fNB); - std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - bool broadcast_needed = !UTILITY::AreSameShape(fShapeB, targetShape); - if (broadcast_needed) { - auto original_data = model.GetInitializedTensorData(fNB); - // make bias shape equal to Y shape by adding 1 - if (fShapeB.size() < 1) - throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape"); - // we assume bias tensor dimension is equal to number of filters that is the second dimension in - // the output tensor - if (fShapeB[0] != fShapeY[1]) - throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " + - ConvertShapeToString(fShapeB)); - if (fType != "float") - throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); - // here is the actual broadcasting - if (!fUseSession) { - std::vector shape(fDim + 1, 1); - shape[0] = fShapeB[0]; - std::shared_ptr new_data_ptr( - UTILITY::UnidirectionalBroadcast(static_cast(original_data.get()), shape, targetShape), - std::default_delete()); - model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), targetShape, new_data_ptr); - fShapeB = model.GetTensorShape(fNB); - fNB2 = fNB; // use same name - } - else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNB2 = fNB + 
"bcast"; - model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape); - } - } - } - - size_t outputChannelSize = fShapeY[2]; // size/channel = D * H * W - size_t kernelSize = fAttrKernelShape[0]; - for (size_t i = 1; i < fDim; i++) { - outputChannelSize *= fShapeY[2 + i]; - kernelSize *= fAttrKernelShape[i]; - } - - std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {fShapeW[1], kernelSize, outputChannelSize}; - model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); - model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); - convK = fNX +"_f"; - imcol = fNX +"_xcol"; - fOutputTensorNames.emplace_back(convK); - fOutputTensorNames.emplace_back(imcol); - } - - std::string GenerateInitCode() override { - std::stringstream out; - // Generate initialization code for broadcasting of bias tensor - if (!fNB2.empty()) { - // include a separate scope to avoid defining unique operator temp variables - std::vector shape(fDim + 1, 1); - shape[0] = fShapeB[0]; - std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - out << SP << "{\n"; - out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - - if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) { - throw - std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first"); - } - - std::stringstream out; - size_t bsize = fShapeX[0]; - size_t kDepth = (fDim > 2) ? fShapeW[2] : 1; // kernel depth - size_t kHeight = (fDim > 1) ? 
fShapeW[fDim] : 1; // kernel height - size_t kWidth = fShapeW[fDim+1]; // kernel width - size_t iDepth = (fDim > 2) ? fShapeX[2] : 1; // input depth - size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height - size_t iWidth = fShapeX[fDim+1]; // input width - size_t oDepth = (fDim > 2) ? fShapeY[2] : 1; // output depth - size_t oHeight = (fDim > 1) ? fShapeY[fDim] : 1; // ouput height - size_t oWidth = fShapeY[fDim+1]; // output width - - out << "\n//---- operator Conv " << OpName << "\n"; - - // vectorize the (dilated)convolution kernels into a matrix - // no need to transpose the matrix - // to fix for 1d and 3d - - size_t id = (fDim > 2) ? fDim-3 : 2; - size_t ih = (fDim > 1) ? fDim-2 : 1; - size_t iw = fDim-1; - - size_t wstrideDil = fAttrDilations[iw]; - size_t hstride = kWidth; - size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw]; // stride dilated in the height - size_t dstride = kHeight * kWidth; - size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; - size_t icstride = kHeight * kWidth * kDepth; - size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw]; - size_t ocstride = fShapeW[1] * icstride; - size_t ocstrideDil = fShapeW[1] * icstrideDil; - - out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n"; - out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n"; - if (fDim > 2) - out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n"; - if (fDim > 1) - out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n"; - out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n"; - - out << SP << SP << SP << SP << SP << "tensor_" < 2) out << " + kd * " << dstrideDil; - if (fDim > 1) out << " + kh * " << hstrideDil; - out << " + kw * " << wstrideDil << " ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride; - if (fDim > 2) out << " 
+ kd * " << dstride; - if (fDim > 1) out << " + kh * " << hstride; - out << " + kw ];\n"; - - out << SP << SP << SP << SP << "}\n"; - if (fDim > 1) out << SP << SP << SP << "}\n"; - if (fDim > 2) out << SP << SP << SP << "}\n"; - out << SP << SP << "}\n"; - out << SP << "}\n"; - - //out << SP << "char " << OpName << "_transA = 'T';\n"; - out << SP << "char " << OpName << "_transA = 'N';\n"; - out << SP << "char " << OpName << "_transB = 'N';\n"; - out << SP << "int " << OpName << "_m = " << oHeight * oWidth * oDepth << ";\n"; // output h*w - assert(fShapeY[1] == fShapeW[0]); - assert(fShapeW[1] == fShapeX[1] / fAttrGroup); - out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels - out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n"; - out << SP << "float " << OpName << "_alpha = 1.0;\n"; - out << SP << "float " << OpName << "_beta = 0.0;\n"; - - - // Loop on batch size - out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n"; - - // IM2COL: Unroll the input tensor - // order input data as (e.g. kernel 2x2) and (xa,ya) is channel 1 and (xb,yb) is channel 2 - // (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk) - // (xa2,...xak+1,ya1,...yak)(......) - // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows. - // By doing this one has consecutive memory reads and writes - // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w) - if (fDim ==1) { - if (fAttrPads[0] != fAttrPads[1] ) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " - << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2; - } - fAttrPads[1] = 0; - fAttrStrides[1] = 1; - } - if (fDim == 2) { - if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. 
Assume an average padding " << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2; - fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2; - } - } - if (fDim == 3) { - if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) { - std::cout << "TMVA SOFIE Operator Conv: asymmetric padding not supported. Assume an average padding " << std::endl; - fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2; - fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2; - fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2; - } - } - out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - - if (fAttrGroup == 1) { - out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << ";\n"; - // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y, output_h * - // output_w) - if (fDim < 3) { - out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX - << " + x_offset," - // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, - // dilation_w, - // - << fShapeW[1] << "," << iHeight << "," << iWidth << ","; - if (fDim == 1) - out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," - << fAttrDilations[0]; - else // dim ==2 - out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," - << fAttrDilations[1]; - out << "," << "tensor_" <(tensor_" << fNX - << " + x_offset," - // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, - // dilation_d, dilation_h, dilation_w, - // - << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," - << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," - << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << "," - 
<< fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << "," - << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," - << "tensor_" << fNX << "_xcol);\n\n "; - } - // BLAS - out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" - << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName - << "_m,\n"; // use m if op_xcol is not transpose , otherwise k - out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY - << " + out_offset, &" << OpName << "_m);\n"; - } else { - // case of group convolution - // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each - // group) - // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n"; - out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << " + g * " - << fShapeW[1] * iDepth * iHeight * iWidth << ";\n "; - out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << " + g * " - << fShapeW[0] * oDepth * oHeight * oWidth / fAttrGroup << ";\n "; - - if (fDim < 3) { - out << SP << SP << "SOFIE::UTILITY::Im2col(tensor_" << fNX - << " + x_offset," - // channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, - // dilation_w, - // - << fShapeW[1] << "," << iHeight << "," << iWidth << ","; - if (fDim == 1) - out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1," - << fAttrDilations[0]; - else // dim ==2 - out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << "," - << fAttrDilations[1]; - out << 
", tensor_" << fNX << "_xcol);\n\n "; - } else { - // 3d im2col - out << SP << SP << "SOFIE::UTILITY::Im2col_3d(tensor_" << fNX - << " + x_offset," - // channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, - // dilation_d, dilation_h, dilation_w, - // - << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << "," - << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] - << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] - << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX - << "_xcol);\n\n "; - } - - // BLAS - // n must be divided by the number of groups - out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n"; - // offset g must be g * k * n - out << SP << SP << SP << "size_t offset_f = g * " - << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup - << ";\n"; - out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &" - << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName - << "_m,\n"; // use m if op_xcol is not transpose , otherwise k - out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY - << " + out_offset" - << ", &" << OpName << "_m);\n"; - - out << SP << SP << "}\n"; // end of group loop - } - - if (fNB2 != "") { - out << SP << "int " << OpName << "_size = " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n"; - out << SP << "float " << OpName << "_gamma = 1.0;\n"; - out << SP << "int " << OpName << "_incx = 1;\n"; - out << SP << "int " << OpName << "_incy = 1;\n"; - - out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &" - << OpName << "_incx, 
tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n"; - - } - out << SP << "}\n"; // end of batch size loop - - return out.str(); - } - - /*! \brief Returns the blas routines needed to compile the generated code - */ - std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } -}; - -} // namespace SOFIE - -#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx deleted file mode 100644 index c834a06..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef SOFIE_ROperator_Expand -#define SOFIE_ROperator_Expand - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Expand final : public ROperator{ -private: - - std::vector fShapeX; - std::vector fShape; - std::vector fShapeY; - - std::string fNX; - std::string fNShape; - std::string fNY; - std::string fType; - - bool fInitialized = false; - -public: - ROperator_Expand(){} - ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - return input; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - if (!model.IsInitializedTensor(fNShape)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNShape + " is not initialized."); - } - int64_t 
*shapeData = - static_cast(model.GetInitializedTensorData(fNShape).get()); - fShape = model.GetTensorShape(fNShape); - if (fShape.size() != 1) { - throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); - } - size_t N = fShape[0]; - std::vector shape(shapeData, shapeData + N); - // Y is the common shape of fShapeX and shape - fShapeY = SOFIE::UTILITY::UnidirectionalBroadcastShape( - fShapeX, shape); - fInitialized = model.IsInitializedTensor(fNX); - // Broadcast X to the common shape fShapeY - bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY); - if (model.IsInitializedTensor(fNX)) { - // If X is an initialized tensor (constant) - auto data = model.GetInitializedTensorData(fNX); - if (broadcast) { - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeY), - std::default_delete()); - // Update the data and the shape of X - model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), fShapeY, broadcastedData); - fShapeX = fShapeY; - // need to set as a not writable tensor - model.SetNotWritableInitializedTensor(fNX); - data = broadcastedData; - } - if (broadcast || model.IsConstantTensor(fNX)) { - fIsOutputConstant = true; // constant output in this case - model.AddConstantTensor(fNY, model.GetTensorType(fNX), fShapeY, data); - fOutputTensorNames.pop_back(); - } else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - } else { - // case input is not initialized - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Expand - output is with shape " << ConvertShapeToString(fShapeY) << std::endl; - } - - std::string GenerateInitCode() override { - std::stringstream out; - if (!fIsOutputConstant && (fInitialized || fShapeX == fShapeY ) ) { - size_t length = ConvertShapeToLength(fShapeY); - out << "// Copying initialized tensor " << fNX << " 
to " << fNY << "\n"; - out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; - } - return out.str(); - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Expand Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//------ Expand Op" << "\n"; - // No need to broadcast A if it's an initialized tensor or shapes are the same - if (!fInitialized && fShapeX != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << ConvertShapeToString(fShapeY) - << ", std::span<"<(tensor_"< -#include -#include - - -namespace SOFIE{ - -class ROperator_Gather final : public ROperator -{ -private: - - int64_t fAttrAxis = 0; - - std::string fNX; - std::string fNIndices; - std::string fNY; - - std::vector fShapeX; - std::vector fShapeIndices; - std::vector fShapeY; - - std::vector fIndices; // indices vector in case they are known at initialization - - std::string fType; - -public: - ROperator_Gather(){} - ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY): - fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), fNY(UTILITY::Clean_name(nameY)) { - fInputTensorNames = { fNX, fNIndices }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; - return ret; - } - - void Initialize(RModel& model) override { - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); - 
} - fShapeX = model.GetTensorShape(fNX); - fShapeIndices = model.GetTensorShape(fNIndices); - size_t q = fShapeIndices.size(); - // Axis in range [0, r) where r=rank(X) - size_t r = fShapeX.size(); - // Set the axis - if (fAttrAxis < 0) { - fAttrAxis = fAttrAxis + int64_t(r); - } - // empty fShapeIndices is a scalar value for the indices - size_t indicesLength = ConvertShapeToLength(fShapeIndices); - - // case indices tensor is initialized - if (model.IsInitializedTensor(fNIndices)) { - int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); - // update indices data in case of negative dim values - for (size_t i = 0; i < indicesLength; i++) { - if (indicesData[i] < 0) { - indicesData[i] += fShapeX[fAttrAxis]; - } - } - // Save in a vector gather Indices of size q - fIndices = std::vector(indicesData, indicesData + indicesLength); - } - // Output shape - if (model.Verbose()) - std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertShapeToString(fShapeIndices) << std::endl; - - if (fShapeY.empty()) { - fShapeY.resize(q + r - 1); - if (fAttrAxis > 0) { - // Copy shape of X[0, ..., axis) to Shape of Y[0, ..., axis) - std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); - } - // Set shape of Y[axis, ..., axis + q) - for (size_t i = 0; i < q; i++) { - fShapeY[fAttrAxis + i] = fShapeIndices[i]; - } - // Copy shape of X[axis + 1, ..., axis + r) to shape of Y[axis + q, ... 
q + r - 1) - std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); - } - // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) - if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { - if (model.GetTensorType(fNX) == ETensorType::INT64) { - auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); - // if q <=1 and r = 1 output length = 1 (it is a scalar) - std::vector outputData(ConvertShapeToLength(fShapeY)); - outputData[0] = inputData[fIndices[0]]; - model.AddConstantTensor(fNY, fShapeY, outputData.data()); - if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; - fIsOutputConstant = true; - } - } - if (!fIsOutputConstant) { - // Add output tensor - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << std::endl; - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) { - // no code to generate here for constant output. 
Tensor output is defined in Session constructor - return "//---------------------------------------\n"; - } - OpName = "op_" + OpName; - std::stringstream out; - out << "//--------- Gather operator \n"; - // The shape of the output is q + r - 1 - size_t r = fShapeX.size(); - // Indices of shape q - size_t q = fShapeIndices.size(); - // Strides - std::vector stridesX = UTILITY::ComputeStrideFromShape(fShapeX); - std::vector stridesY = UTILITY::ComputeStrideFromShape(fShapeY); - std::vector stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); - - // case fIndices is not known we need to correct for negative axis indices at run-time - if (fIndices.empty()) { - size_t indicesLength = ConvertShapeToLength(fShapeIndices); - out << SP << "// correct in case of negative gather indices\n"; - out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; - out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; - out << SP << SP << SP << "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n"; - out << SP << "}\n"; - } - - - // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] - // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) - // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) - // for j_0, j_1, ..., j_{axis-1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - std::string index = "j_" + std::to_string(j); - out << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; - } - // for i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "{\n"; // add a scope for local variables - for (size_t i = 0; i < q; i++) { - std::string index = "i_" + std::to_string(i); - out << SP << SP << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; - } - // for j_axis, j_{axis + 1}, ..., j_{r - 1} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - std::string index = "j_" + std::to_string(j); - out << SP << SP << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; - } - - out << SP << SP << SP << "size_t y_index = 0;\n"; - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[j] << ";\n"; - } - for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "y_index += i_" + std::to_string(i) + " * " << stridesY[fAttrAxis + i] << ";\n"; - } - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[q + j] << ";\n"; - } - // Indices - out << SP << SP << SP << "size_t i_index = 0;\n"; - for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "i_index += i_" + std::to_string(i) + " * " << stridesIndices[i] << ";\n"; - } - // K - out << SP << SP << SP << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; - // Input - out << SP << SP << SP << "size_t x_index = k * " << stridesX[fAttrAxis] << ";\n"; - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j) + " * " << stridesX[j] << ";\n"; - } - for (size_t j = fAttrAxis + 1; j < r; j++) { - out << SP << SP 
<< SP << "x_index += j_" + std::to_string(j - 1) + " * " << stridesX[j] << ";\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; - - // end loops j_k, j_{k + 1}, ..., j_{r - 2} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "}\n"; - } - // end loops i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "}\n"; // end of scope for q = 0 - for (size_t i = 0; i < q; i++) { - out << SP << SP << "}\n"; - } - // end loops j_0, j_1, ..., j_{axis - 1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << "}\n"; - } - - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_RELU diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx deleted file mode 100644 index 046bf56..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ /dev/null @@ -1,399 +0,0 @@ -#ifndef SOFIE_ROPERATOR_GEMM -#define SOFIE_ROPERATOR_GEMM - - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include -#include -#include -#include - - -namespace SOFIE{ - - - template - class ROperator_Gemm final : public ROperator - { - - private: - bool fIsDynamic = false; - - float fAttrAlpha = 1.0; - float fAttrBeta = 1.0; - int_t fAttrTransA = 0; - int_t fAttrTransB = 0; - - std::string fNA; - std::string fNB; - std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting - std::string fNY; - std::string fType; - EActivationType fActivation; - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeC; - std::vector fShapeY; - - public: - - ROperator_Gemm(){} - ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): - fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), - 
fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) - { - fActivation = activation; - fType = "float"; - static_assert(std::is_same_v, - "TMVA::SOFIE - Unsupported type parsing a Gemm operator"); - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } - - ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED): - fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), - fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) - { - fActivation = activation; - fType = "float"; - - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - ETensorType out = input[0]; - return {out}; - } - - template - std::vector> DoShapeInference(const std::vector> & input){ - if (input.size() > 3) throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); - // accept tensor with input dimensions > 2 - // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) - for (auto& i: input){ - if (i.size() < 2){ - throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions"); - } - } - - std::vector> ret; - // when there are 3 inputs shape of Y is the one of C - if (input.size() == 3){ - ret.push_back(input[2]); //shape of C is shape of Y - return ret; - } - // ioffset cannot be less than 2 - int ioffset = input[0].size()-2; // in case of tensors with dim > 2 - - std::vector s_a(input[0].begin() + ioffset, input[0].begin() + ioffset + 2); - std::vector s_b(input[1].begin() + ioffset, input[1].begin() + ioffset + 2); - // reverse in case of transpose - if (fAttrTransA){ - std::reverse(s_a.begin(), s_a.end()); - } - if (fAttrTransB){ - std::reverse(s_b.begin(), 
s_b.end()); - } - std::vector s_y; - s_y.reserve(input[0].size()); - if (input[0].size() > 2 && input[1].size() == input[0].size()) { - // in case of dim > 2 first dimensions are equal to the input ones not - // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) - for (size_t i = 0; i < input[0].size()-2; i++) { - Dim valueA = input[0][i]; - Dim valueB = input[1][i]; - if (valueA.GetVal() != valueB.GetVal()) { - if (valueB.GetVal() == "1") - s_y.push_back(input[0][i]); - else if (valueA.GetVal() == "1") - s_y.push_back(input[1][i]); - else - throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " - + valueB.GetVal()); - } - s_y.push_back(input[0][i]); - } - } - - s_y.push_back(s_a[0]); - s_y.push_back(s_b[1]); - ret.push_back(s_y); - return ret; - } - - std::vector> ShapeInference(std::vector> input) override { - return DoShapeInference(input); - } - std::vector> DynamicShapeInference(const std::vector> & input){ - return DoShapeInference(input); - } - - - - void Initialize(RModel& model) override { - //TODO: propagate A or B as specified by ONNX standard - - if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); - } - if (fNC != ""){ - if (model.CheckIfTensorAlreadyExist(fNC) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is not found in model"); - } - } - if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) { - fShapeA = model.GetDynamicTensorShape(fNA); - fIsDynamic = true; - } else { - auto shapeA_int = model.GetTensorShape(fNA); - fShapeA = ConvertShapeToDim(shapeA_int); - } - // case A is of dim1 we prepend a 1 but we need to remove later - bool prependOne = false; - if 
(fShapeA.size() == 1) { - fShapeA.insert(fShapeA.begin(), Dim(1)); - prependOne = true; - } - - if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) { - fShapeB = model.GetDynamicTensorShape(fNB); - fIsDynamic = true; - } - else { - auto shapeB_int = model.GetTensorShape(fNB); - fShapeB = ConvertShapeToDim(shapeB_int); - } - // case B is dim1 we append a 1 but we need to remove later - bool appendOne = false; - if (fShapeB.size() == 1) { - fShapeB.insert(fShapeB.end(), Dim(1)); - appendOne = true; - } - // assume if not shape is 2 that extra values are 1. - // implement also MatMul case where we stack matrices (see numpy.matmul) - if (fShapeA.size() != fShapeB.size()) { - // if different dimensions we prepend 1 values - if (fShapeA.size() < fShapeB.size()) { - fShapeA.insert(fShapeA.begin(), fShapeB.size()-fShapeA.size(), Dim(1)); - } else if (fShapeB.size() < fShapeA.size()) { - fShapeB.insert(fShapeB.begin(), fShapeA.size()-fShapeB.size(), Dim(1)); - } - } - - fShapeY = DynamicShapeInference({fShapeA, fShapeB})[0]; - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDynamicShapeToString(fShapeY)); - } - } - - // bias is normally not dynamic (not support it for time being) - if (fNC != ""){ - // normally bias is fixed and not dynamic - if (model.IsDynamicTensor(fNC)) { - throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); - } - fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; - size_t lengthC = ConvertShapeToLength(fShapeC); - size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; - - - if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic 
tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); - } - } - } - - // remove appended or prepended value of 1 - if (prependOne) { - if (fIsDynamic) - fShapeY.erase(fShapeY.begin()); - else - shapeY.erase(shapeY.begin()); - } - if (appendOne) { - if (fIsDynamic) - fShapeY.erase(fShapeY.end()-1); - else - shapeY.erase(shapeY.end()-1); - } - - if (!fIsDynamic) - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY); - else - model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY); - - if (model.Verbose()){ - std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; - if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; - else - std::cout << ConvertShapeToString(shapeY) << std::endl; - } - - model.AddNeededStdLib("algorithm"); - } - - std::string GenerateInitCode() override { - std::stringstream out; - // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { - // we broadcast here always C in Y output, so target shape is the one of Y - // no need to call UTILITY::UnidirectionalBroadcastShape. 
- // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. - auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; - out << SP << SP << "delete [] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string opName) override { - opName = "op_" + opName; - - if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { - throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); - } - std::stringstream out; - out << "\n//--------- Gemm\n"; - out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; - out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n"; - // need to consider case A and B have dim > 2 (for MatMul) - int64_t dimA = fShapeA.size(); - int64_t dimB = fShapeB.size(); - int64_t dimY = fShapeY.size(); - if (dimA != dimB || dimA != dimY) { - throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); - } - auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); - auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); - auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); - std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; - // extra dimensions in case of stacked MatMul - std::vector sA; - for (int64_t i = 0; i < dimY-2; i++) { - sA.push_back(fShapeY[i]); - } - auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) - - out << SP << "int " << opName << "_m = " << m << ";\n"; - out << SP << "int " << opName << "_n = " << n << ";\n"; - out << SP << "int " << opName << "_k = " << k << ";\n"; - out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; - out << SP << "float " << opName << "_beta = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ";\n"; - out << SP << "int " << opName << "_lda = " << (fAttrTransA ? m : k) << ";\n"; - out << SP << "int " << opName << "_ldb = " << (fAttrTransB ? 
k : n) << ";\n"; - - // case bias is present - if (!fNC.empty()){ - if (fNC2 == fNC) { - // add a check in case broadcasting was not needed or done outside of session - // C should have smaller dimension of Y - if (!fIsDynamic) { - if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) - throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " - + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); - } else { - // add a dynamic check (C should not be a dynamic tensor) - out << SP << "assert(" << lengthGemm << " != " << ConvertShapeToLength(fShapeC) << ");\n"; - } - } - } else { - //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use - // the previous result - if (fAttrBeta != 0) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); - } - } - - // include MatMul case where we stack the Gemm operations - // exclude case where we have only 1's in the additional dims - bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); - if (doStackMul) { - out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations - out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; - out << SP; - } - // in the case of bias - if (!fNC.empty()){ - out << SP << "std::copy(" << "tensor_" << fNC2 << ", " << "tensor_" << fNC2 << " + " << lengthGemm << ", " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ");\n"; - } - - - if (fType == "float"){ - - out << SP << "BLAS::sgemm_(&" << opName << "_transB, &" << opName << "_transA, &" << opName - << "_n, &" << opName << "_m, &" << opName << "_k, &" << opName << "_alpha, " << "tensor_" << fNB - << ", &" << opName << "_ldb, " << "tensor_" << fNA << ", &" << opName << "_lda, &" << opName << "_beta, " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << 
"_yoffset"; - out << ", &" << opName << "_n);\n"; - - if(fActivation == EActivationType::RELU){ - out << SP << "for (int id = 0; id < " << SOFIE::ConvertDynamicShapeToLength(fShapeY) << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; - out << SP << "}\n"; - } - } - - if (doStackMul) { - out << SP << SP << opName << "_yoffset += " << lengthGemm << ";\n"; - out << "}\n"; // end of loop on the stacked multiplications - } - - return out.str(); - } - - std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } - - }; - - -}//SOFIE - -#endif //SOFIE_ROPERATOR_GEMM diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx deleted file mode 100644 index 17b77b3..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ /dev/null @@ -1,343 +0,0 @@ -#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION -#define SOFIE_ROPERATOR_LAYERNORMALIZATION - -#include "SOFIE/RModel.hxx" -#include "SOFIE/SOFIE_common.hxx" - -#include -#include - - -namespace SOFIE { - -template -class ROperator_LayerNormalization : public ROperator { -private: - int fAttrAxis; - float fAttrEpsilon; - size_t fAttrStashType; - - std::string fNX; - std::string fNScale; - std::string fNB; - std::string fNY; - std::string fNMean; - std::string fNInvStdDev; - - std::string fNCastedX; - std::string fNNormalizedX; - std::string fNBroadcastedB; - - std::vector fShapeX; - std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined - std::vector fShapeY; - std::vector fShapeMean; - std::vector fShapeInvStdDev; - - size_t fAxis; // axis in [0, size) - size_t fSize; // Size of the input - // size_t fAxisDim; - - std::vector fNormalizedShape; - std::vector fAxesShape; - // lengths in string format - std::string fLength; // Length of the input - std::string 
fNormalizedLength; - std::string fAxesLength; - - std::string fType; - -public: - ROperator_LayerNormalization() {} - - ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX, - const std::string &nameScale, const std::string &nameB, const std::string &nameY, - const std::string &nameMean, const std::string &nameInvStdDev) - : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)), - fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)), - fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev)) - { - fInputTensorNames = { fNX, fNScale }; - if (!fNB.empty()){ - fInputTensorNames.emplace_back(fNB); - } - - fOutputTensorNames = { fNY }; - if (!fNMean.empty()){ - fOutputTensorNames.emplace_back(fNMean); - } - if (!fNInvStdDev.empty()){ - fOutputTensorNames.emplace_back(fNInvStdDev); - } - } - - std::vector> ShapeInference(std::vector> input) override { return input; } - - std::vector TypeInference(std::vector input) override { return input; } - - void Initialize(RModel& model) override { - if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); - } - bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); - fShapeY = fShapeX; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - // Type of the output - fType = ConvertTypeToString(model.GetTensorType(fNX)); - // Size of the input - fSize = fShapeX.size(); - // Axis in [0, size) - fAxis = (fAttrAxis < 0) ? 
fSize + fAttrAxis : fAttrAxis; - // Shape of fShapeX[0, ..., fAxis) - fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); - // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); - // Shape of fShapeX[fAxis, ..., fSize) - fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); - // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); - // length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); - // Type of mean and std - ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); - // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; - // cannot use initializer list with one element since it is ambiguous - if (isDynamic) - // add size_t(-1) to indicate that shape is an expression - model.AddIntermediateTensor(fNMean, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); - else - model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); - } - // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; - if (isDynamic) - model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); - else - model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); - } - // Cast X to float - if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); - } - // Broadcast the bias - if (!fNB.empty()) { - fShapeB = model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); - } - } - 
model.AddNeededStdLib("cmath"); - } - - std::string GenerateInitCode() override - { - std::stringstream out; - if (!fNBroadcastedB.empty()) { - out << SP << "// Broadcasting the bias of LayerNormalization op\n"; - out << SP << "{\n"; - out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; - out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; - out << SP << "delete[] data;\n"; - out << SP << "}\n"; - } - return out.str(); - } - - std::string Generate(std::string opName) override - { - opName = "op_" + opName; - if (fShapeX.empty()) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + - " called to generate without being initialized first."); - } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } - - std::stringstream out; - - out << "//---- Layer Normalization operator " << opName << "\n"; - - // Loop over all the normalized axes i.e. 
[axis, ..., size) - std::vector inputShape(fSize); - - for (size_t i = 0; i < fSize; i++) { - inputShape[i] = fShapeX[i].GetVal(); - } - - auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); - for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); - } - - auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); - std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); - for (size_t i = 1; i < fAxis; i++) { - axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); - } - - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i < fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } - - out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++) {\n"; - } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++) {\n"; - } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << 
"}\n"; - } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - - out << SP << "// Compute the inverse Standard Deviation\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - // Set sum = 0 - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; - out << SP << SP << SP << "sum += tmp*tmp;\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; - out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << 
axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; 
i++) { - out << SP << "}\n"; - } - } - - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; - } - - return out.str(); - } - - std::vector GetBlasRoutines() override { return { std::string("Axpy") }; } - - std::vector GetStdLibs() override { return { std::string("cmath") }; } -}; - -} // namespace SOFIE - - -#endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx deleted file mode 100644 index 8fefa6d..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef SOFIE_ROPERATOR_LeakyRelu -#define SOFIE_ROPERATOR_LeakyRelu - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_LeakyRelu final : public ROperator -{ - -private: - - /* Attributes*/ - float falpha=0.01; //default value - std::string fNX; - std::string fNY; - std::vector fShape; - std::string fType; - -public: - ROperator_LeakyRelu(){} - ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY): - falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { - if(std::is_same::value){ - fType = "float"; - } - else{ - throw - std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Leaky Relu operator"); - } - - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto 
ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Leaky Relu Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Leaky Relu called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - - out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; - - out << "\n//------ LEAKY RELU\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? 
tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_LeakyRelu diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx deleted file mode 100644 index 886aef1..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx +++ /dev/null @@ -1,270 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Reduce -#define SOFIE_ROPERATOR_Reduce - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include -#include -#include -#include - - -namespace SOFIE{ - -enum EReduceOpMode { ReduceMean, ReduceSum, ReduceSumSquare, ReduceProd, InvalidReduceOp }; - -template -class ROperator_Reduce final : public ROperator -{ -private: - /* Attributes*/ - int fkeepdims = 1; //default value - std::vector fAttrAxes; - EReduceOpMode fReduceOpMode; - std::string fNX; - std::string fNAxes; - std::string fNY; - std::vector fShapeX; - std::vector fShapeY; - std::vector fShapeYNotPruned; // needed for fKeepdims=0 - - -public: - - std::string Name() { - if (fReduceOpMode == ReduceMean) return "ReduceMean"; - else if (fReduceOpMode == ReduceSumSquare ) return "ReduceSumSquare"; - else if (fReduceOpMode == ReduceProd ) return "ReduceProd"; - else if (fReduceOpMode == ReduceSum) return "ReduceSum"; - return "Invalid"; - } - - ROperator_Reduce(){} - ROperator_Reduce(int keepdims, std::vector attrAxes, std::string nameX, std::string nameAxes, std::string nameY): - fkeepdims(keepdims), fAttrAxes(attrAxes), fNX(UTILITY::Clean_name(nameX)), fNAxes(UTILITY::Clean_name(nameAxes)), fNY(UTILITY::Clean_name(nameY)) { - fReduceOpMode = Op; - - fInputTensorNames = { fNX }; - if(!fNAxes.empty()){ - fInputTensorNames.emplace_back(fNAxes); - } - - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape 
of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - auto & outputShape = ret[0]; - for (size_t j = 0; j < fAttrAxes.size(); j++) { - if (fAttrAxes[j] < 0) fAttrAxes[j] += outputShape.size(); - if (fAttrAxes[j] < 0 || (size_t) fAttrAxes[j] >= outputShape.size() ) - throw std::runtime_error("TMVA SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j])); - // set to 1 the reduced dims - outputShape[fAttrAxes[j]] = 1; - } - fShapeYNotPruned = outputShape; - // in case of pruning dimension we need to sort axes attributes - if (fkeepdims == 0) { - auto ax = fAttrAxes; - std::sort(ax.begin(), ax.end()); - for (size_t j = 0; j < ax.size(); j++) { - // erase reduced dimensions, but keep last one - if (outputShape.size() > 1) { - outputShape.erase(outputShape.begin() + ax[j]); - for (size_t k = j+1; k < ax.size(); k++) - ax[k] -= 1; // decrease by one since we have removed a value - } - } - } - return ret; - } - void Initialize(RModel& model) override { - - fUseSession = model.UseSession(); - - if (!model.CheckIfTensorAlreadyExist(fNX)) { - // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Reduce Op Input Tensor " + fNX + " is not found in model"); - } - fShapeX = model.GetTensorShape(fNX); - // check if tensor with axes is provided - if (!fNAxes.empty()) { - auto ax_shptr = model.GetInitializedTensorData(fNAxes); - auto ax_ptr = static_cast(ax_shptr.get()); - auto ax_shape = model.GetTensorShape(fNAxes); - size_t ax_length = ConvertShapeToLength(ax_shape); - fAttrAxes = std::vector(ax_ptr, ax_ptr+ax_length); - } else if (fAttrAxes.empty()) { - // in case no axes is passed assume full reduction - fAttrAxes.resize(fShapeX.size()); - for (size_t i = 0; i < fAttrAxes.size(); i++) - fAttrAxes[i] = i; - } - // find shape of Y and add it in the list of intermediate tensors - fShapeY = 
ShapeInference({fShapeX})[0]; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - if (model.Verbose()){ - std::cout << Name() << " : " << fNX << " -> " << fNY << " shape " << ConvertShapeToString(fShapeY) << std::endl; - } - model.AddNeededStdLib("algorithm"); - } - - std::string Generate(std::string opName) override { - opName = "op_" + opName; - if (fShapeX.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Reduce Op called to Generate without being initialized first"); - } - - size_t inputLength = SOFIE::ConvertShapeToLength(fShapeX); - size_t outputLength = SOFIE::ConvertShapeToLength(fShapeY); - - auto inputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeX); - // output stride (or not pruned vector) - auto outputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeYNotPruned); - - // write here according to size of shape - // in generation code can be done automatically - // i0 = i / stride0 % shape0; i1 = i / stride1 % shape1 and so on - // and we have for the inverse - // i = i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3 .... - - // don't need to divide by last stride s[n-1] since it is 1 by definition - - std::stringstream out; - out << "\n//---- operator " << Name() << " " << opName << "\n"; - // check where is reduced axes are first or last one. 
In these case we can do a faster implementation - enum EReduceDim {kFirst, kLast, kMiddle}; - EReduceDim reduceDims = kLast; - int kmin = fShapeX.size()-fAttrAxes.size(); - for (int k = fShapeX.size()-1; k >= kmin; k--) { - // if k is not a reduced axis is not last ones - if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { - reduceDims = kMiddle; - break; - } - } - if (reduceDims == kMiddle) { - reduceDims = kFirst; - // check if at the beginning - for (size_t k = 0; k < fAttrAxes.size(); k++) { - // if k is not a reduced axis is not first ones - if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { - reduceDims = kMiddle; - break; - } - } - } - size_t reducedLength = inputLength / outputLength; - if (reduceDims == kLast) { - //std::cout << "reduction for operator " << opName << " is last" << std::endl; - // new faster implementation using a single loop - // faster to loop first on reduced dimension and then output - // reset output tensors - - // loop on output dimensions - out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n"; - // loop on reduce dimensions - std::string startingValue = (fReduceOpMode == ReduceProd) ? 
"1" : "0"; - out << SP << SP << "tensor_" << fNY << "[i] = " << startingValue << ";\n"; - out << SP << SP << "for (size_t j = 0; j < " << reducedLength << "; j++) {\n"; - - if (fReduceOpMode == ReduceProd) - out << SP << SP << SP << "tensor_" << fNY << "[i] *= tensor_" << fNX << "[i * " << reducedLength << " + j];\n"; - else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) - out << SP << SP << SP << "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j];\n"; - else if(fReduceOpMode == ReduceSumSquare) - out << SP << SP << SP << "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j] * tensor_" - << fNX << "[i * " << reducedLength << " + j];\n"; - out << SP << SP << "}\n"; // end j loop - if(fReduceOpMode == ReduceMean) - out << SP << SP << "tensor_" << fNY << "[i] /= static_cast(" << reducedLength << ");\n"; - - out << SP << "}\n"; // end i loop - } else if (reduceDims == kFirst) { - //std::cout << "reduction for operator " << opName << " is first" << std::endl; - // case reduction is at beginning - // reset output tensors - if (fReduceOpMode == ReduceProd) - out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n"; - else - out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 0);\n"; - - out << SP << "for (size_t i = 0; i < " << reducedLength << "; i++) {\n"; - out << SP << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n"; - - if (fReduceOpMode == ReduceProd) - out << SP << SP << SP << "tensor_" << fNY << "[j] *= tensor_" << fNX << "[i * " << outputLength << " + j];\n"; - else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) - out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j];\n"; - else if(fReduceOpMode == ReduceSumSquare) - out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j] * tensor_" 
- << fNX << "[i * " << outputLength << " + j];\n"; - out << SP << SP << "}\n"; // end j loop - out << SP << "}\n"; // end i loop - if(fReduceOpMode == ReduceMean) { - out << SP << "for (size_t j = 0; i < " << outputLength << "; j++) {\n"; - out << SP << SP << "tensor_" << fNY << "[j] /= static_cast(" << reducedLength << ");\n"; - out << SP << "}\n"; // end j loop - } - } - else - { // standard case - //std::cout << "reduction for operator " << opName << " is middle" << std::endl; - // reset output tensors - if (fReduceOpMode == ReduceProd) - out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n"; - else - out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ",0);\n"; - - out << SP << "for (size_t i = 0; i < " << inputLength << "; i++) {\n"; - - size_t dim = fShapeX.size(); // this is the input dimension (e.g. 2, 3 or 4 or more) - - // here we find output index - out << SP << SP << "size_t outputIndex = 0;\n"; - for (size_t k = 0; k < dim; k++) { - if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) { - // do for not reducing axes - out << SP << SP << "size_t i_" << k << " = i / " << inputStrides[k] << " % " << fShapeX[k] << ";\n"; - out << SP << SP << "outputIndex += i_" << k << " * " << outputStrides[k] << ";\n"; - } - } - // now compute reduction - out << SP << SP << "// compute reduction....\n"; - if (fReduceOpMode == ReduceProd) - out << SP << SP << "tensor_" << fNY << "[outputIndex] *= tensor_" << fNX << "[i];\n"; - else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean) - out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i];\n"; - else if (fReduceOpMode == ReduceSumSquare) { - out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i] * tensor_" << fNX - << "[i];\n"; - } - out << SP << "}\n"; // end loop on input elements - // normalize for reduced mean - if (fReduceOpMode == ReduceMean) { - out << SP << 
"for (size_t i = 0; i < " << outputLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNY << "[i] /= static_cast(" << reducedLength << ");\n"; - out << SP << "}\n"; - } - } - - return out.str(); - } - -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_Reduce - diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx deleted file mode 100644 index 8062dca..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef SOFIE_ROPERATOR_RELU -#define SOFIE_ROPERATOR_RELU - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Relu final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - -public: - ROperator_Relu(){} - ROperator_Relu(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); - } - - fShape = model.GetDynamicTensorShape(fNX); - - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; - } - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized 
first"); - } - std::stringstream out; - auto length = ConvertDynamicShapeToLength(fShape); - out << "\n//------ RELU\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_RELU diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx deleted file mode 100644 index 66a7e09..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef SOFIE_ROPERATOR_RESHAPE -#define SOFIE_ROPERATOR_RESHAPE - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include - -namespace SOFIE{ - -enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze }; - - -class ROperator_Reshape final : public ROperator -{ - -private: - - bool fVerbose = false; - ReshapeOpMode fOpMode = Reshape; // type of Reshape operator - - int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape - int fAxis = 1; // (for Flatten) - - std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name - std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data - std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) - -public: - - std::string Name() const { - if (fOpMode == Reshape) return "Reshape"; - if (fOpMode == Flatten) return "Flatten"; - if (fOpMode == Squeeze) return "Squeeze"; - if (fOpMode == Unsqueeze) return "Unsqueeze"; - return ""; - } - - ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), 
fNShape(UTILITY::Clean_name(nameShape)), - fNOutput(UTILITY::Clean_name(nameOutput)) - { - if (opMode == Reshape) fAllowZero = attr_value; - if (opMode == Flatten) fAxis = attr_value; - - fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); - } - fOutputTensorNames = { fNOutput }; - } - - // for squeeze/unsqueezed operators following old ONNX version (< 10) - // In this cases axes are passed as attribute values - ROperator_Reshape(ReshapeOpMode opMode, std::vector attrAxes, std::string nameData, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)), - fAttrAxes(attrAxes) - { - assert(fOpMode == Squeeze || fOpMode == Unsqueeze); - } - - // output type is same as input - std::vector TypeInference(std::vector input) override { - auto ret = std::vector(1, input[0]); - return ret; - } - - // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; - auto & input_shape = input[0]; - - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); - } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - 
output_shape[i] = input_length / tmp_length; - replacementDone = true; - } - } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); - } - } - ret.push_back(output_shape); - - } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; - ret.push_back(newShape); - - } else if (fOpMode == Squeeze) { - // squeeze - // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { - size_t i = 0; - while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { - output_shape.erase(output_shape.begin() + i); - } - else { - i++; - } - } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); - output_shape.erase(output_shape.begin() + axes[i]); - } - } - ret.push_back(output_shape); - } - - else if (fOpMode == Unsqueeze) { - // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; - // output rank - int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { - int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) - throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); - if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); - else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); - } - ret.push_back(output_shape); - } - return ret; - } - - void 
Initialize(RModel& model) override { - - fVerbose = model.Verbose(); - if (model.CheckIfTensorAlreadyExist(fNData) == false) { - // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + " is not found in model"); - } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); - } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); - } - } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - } else if (fOpMode == Flatten || fOpMode == Squeeze) { - fShapeOutput = ShapeInference({fShapeInput})[0]; - } else { - throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data"); - } - // check if output is constant or not - if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { - fIsOutputConstant = true; - auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) - throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - 
model.AddConstantTensor(fNOutput, fShapeOutput, inputData); - if (model.Verbose()) { - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; - } - } else { - // non-constant case - model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); - if (model.Verbose()) - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors - - OpName = "op_" + OpName; - - // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { - throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); - } - std::stringstream out; - std::string opName = "Reshape"; - if (fOpMode == Flatten) - opName = "Flatten"; - else if (fOpMode == Squeeze) - opName = "Squeeze"; - else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; - - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput - << ");\n"; - return out.str(); - } -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_RESHAPE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx deleted file mode 100644 index 6951017..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx +++ /dev/null @@ -1,176 +0,0 @@ -#ifndef SOFIE_ROperator_ScatterElements -#define SOFIE_ROperator_ScatterElements - 
-#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - -class ROperator_ScatterElements final : public ROperator{ -private: - - int64_t fAxis; - - std::string fNX; - std::string fNI; - std::string fNU; - std::string fNY; - std::string fReduction; - - std::vector fShapeX; - std::vector fShapeI; - std::vector fShapeY; - - // define reduction function. Possibilities are: - // none (default), add, mul, max, min - std::string ReductionFunction(const std::string & t1, const std::string & t2 ) { - std::string name = fReduction; - if (name.empty() || name == "none") - return t2; - else if (name == "add") - return t1 + " + " + t2; - else if (name == "mul") - return t1 + " * " + t2; - else if (name == "max") - return "std::max(" + t1 + "," + t2 + ")"; - else if (name == "min") - return "std::min(" + t1 + "," + t2 + ")"; - else - throw std::runtime_error("TMVA SOFIE ScatterElements : invalid reduction attribute"); - - return std::string(); - } - -public: - ROperator_ScatterElements(){} - ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY, - int axis, std::string reduction): - fAxis(axis), - fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)), - fNY(UTILITY::Clean_name(nameY)), - fReduction(reduction) - { - fInputTensorNames = { fNX, fNI, fNU }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNX)){ - throw 
std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNX + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNI)) { - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNI + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNU)) { - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNU + "is not found in model"); - } - //tbd check for constant tensors - - fShapeX = model.GetTensorShape(fNX); - fShapeI = model.GetTensorShape(fNI); - if (model.GetTensorShape(fNU) != fShapeI) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - update tensor has invalid shape ")) ; - if (fShapeX.size() == 0) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - input tensor has zero rank ")) ; - if (fShapeX.size() != fShapeI.size()) - throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - index tensor has invalid rank ")) ; - - if (fAxis < 0) fAxis += fShapeX.size(); - - // assume output shape is identical to input shape - fShapeY = fShapeX; - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string opName) override { - - if (fIsOutputConstant) return ""; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//-------- ScatterElements --- " << opName << "\n"; - - auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); - auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); - - size_t length = ConvertShapeToLength(fShapeY); - - // function to write compute expression for global index from axes indices - auto tensorIndex = [](const std::vector & stride, const std::vector & idx) { - std::stringstream strst; - int dims = 
idx.size(); - assert (dims == (int) stride.size()); - for (int i = 0; i < dims; i++) { - if (stride[i] != 1) - strst << stride[i] << "*" << idx[i]; - else - strst << idx[i]; - if (i < dims-1) - strst << " + "; - } - return strst.str(); - }; - - - // copy first input in output (maybe can be avoided??) - out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; - - // loop on tensor rank - int dims = fShapeY.size(); - std::vector idx(dims); - for (int i = 0; i < dims; i++) { - idx[i] = std::string("i") + std::to_string(i); - for (int j = 0; j <= i; j++) out << SP; - out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; - } - // correct index for specific axis - for (int j = 0; j <= dims; j++) out << SP; - out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis] << ";\n"; - idx[fAxis] = "iAxis"; - for (int j = 0; j <= dims; j++) out << SP; - out << "int outIndex = " << tensorIndex(strideY, idx) << ";\n"; - for (int j = 0; j <= dims; j++) out << SP; - out << "tensor_" << fNY << "[outIndex] = " - << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n"; - - for (int i = dims; i > 0; i--) { - for (int j = 0; j < i; j++) out << SP; - out << "}\n"; - } - return out.str(); - } - -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_ScatterElements diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx deleted file mode 100644 index 68edd01..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Sigmoid -#define SOFIE_ROPERATOR_Sigmoid - -#include "SOFIE/SOFIE_common.hxx" -#include 
"SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - -namespace SOFIE{ - -template -class ROperator_Sigmoid final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - -public: - ROperator_Sigmoid(){} - ROperator_Sigmoid(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Sigmoid Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - } - - - std::string Generate(std::string opName) override { - if (fShape.empty()){ - throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); - } - std::stringstream out; - int length = 1; - for(auto& i: fShape){ - length *= i; - } - out << "\n//------ Sigmoid -- " << opName << "\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_" << fNX << "[id]));\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { return { std::string("cmath") };} -}; - -}//SOFIE - -#endif //SOFIE_ROPERATOR_Sigmoid diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx deleted file mode 100644 index 6d40003..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef SOFIE_ROPERATOR_SLICE -#define 
SOFIE_ROPERATOR_SLICE - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include -#include - - -namespace SOFIE{ - -// slice operator - -template -class ROperator_Slice final : public ROperator -{ - -private: - - std::string fNData; // input data tensor name - std::string fNOutput; // output data name - std::vector fNames; // tensor names for meta(axis) information - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data - // saved Start/End.Steps are corrected from initial ONNX for negative/default values - // and are available for each axis - std::vector fStart; // starting values of slices - std::vector fEnd; // End values of slices - std::vector fSteps; // step values of slices - - std::vector> fAttributes; // attributes for the version <=10 case - - -public: - - ROperator_Slice(){} - - // ctor for versions >= 10 - ROperator_Slice(std::string nameData, std::vector names, std::string nameOutput) - : fNData(UTILITY::Clean_name(nameData)), - fNOutput(UTILITY::Clean_name(nameOutput)) - { - fNames.resize(4); - // axes and steps can be optional - for (size_t i = 0; i < names.size(); ++i) { - fNames[i] = UTILITY::Clean_name(names[i]); - } - - fInputTensorNames = { fNData }; - fOutputTensorNames = { fNOutput }; - } - // ctor for versions < 10 - ROperator_Slice(std::string nameData, std::vector starts, std::vector ends, std::vector axes, std::string nameOutput) - : fNData(UTILITY::Clean_name(nameData)), - fNOutput(UTILITY::Clean_name(nameOutput)) - { - fAttributes.push_back(starts); - fAttributes.push_back(ends); - fAttributes.push_back(axes); - } - - // output type is same as input - std::vector TypeInference(std::vector input) override { - auto ret = std::vector(1, input[0]); - return ret; - } - - // output shape - std::vector> ShapeInference(std::vector> input) override { - auto & input_shape = input[0]; - // assume dimension of output shape is SAME AS INPUT ! 
- std::vector> ret(1, input_shape); - auto & output_shape = ret[0]; - for (size_t i = 0; i < input_shape.size(); i++) { - output_shape[i] = (fEnd[i]-fStart[i])/ fSteps[i]; - } - return ret; - } - - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model"); - } - - std::vector> shapes; - fShapeInput = model.GetTensorShape(fNData); - shapes.push_back(fShapeInput); - - std::vector> itensors(4); - if (fNames.size() > 0) { - // loop on the extra 2 or 3 or 4 inputs - for (size_t i = 0; i < fNames.size(); ++i) { - if (!fNames[i].empty()) { - // std::cout << " i " << i << " getting data for tensor " << fNames[i] << std::endl; - auto dptr = model.GetInitializedTensorData(fNames[i]); - auto tensor = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNames[i]); - assert(vec.size() == 1); - itensors[i] = std::vector(tensor, tensor + vec[0]); - } else { - switch (i) { - case 2: // missing axes - itensors[2] = std::vector(fShapeInput.size()); - std::iota(itensors[2].begin(), itensors[2].end(), 0); - break; - case 3: // missing steps - itensors[3] = std::vector(itensors[0].size(), 1); - default: break; - } - } - } - } else { - assert(fAttributes.size() > 1); - for (size_t i = 0; i < fAttributes.size(); i++) { - itensors[i] = fAttributes[i]; - } - } - size_t dim = fShapeInput.size(); - - fSteps = std::vector(dim, 1); - fStart = std::vector(dim, 0); - fEnd = std::vector(dim, 0); - std::copy(fShapeInput.begin(), fShapeInput.end(), fEnd.begin()); - - auto istart = itensors[0]; - auto iend = itensors[1]; - auto iaxes = itensors[2]; - auto isteps = itensors[3]; - - // make tensor axis - // if iaxes.size is =0 tensor axis is missing and use defaults - if (iaxes.size() > 0) { - for (size_t i = 0; i < iaxes.size(); i++) { - // negative axes - they count from the back - if (iaxes[i] < 0) 
iaxes[i] = dim + iaxes[i]; - if (iaxes[i] < 0 || iaxes[i] >= static_cast(dim)) - throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(iaxes[i]) + - " for " + std::to_string(i)); - - size_t iAxisDim = fShapeInput[iaxes[i]]; - // find start/end/step for given axis - // check step size for clamping starting/end value - if (istart[i] < 0) istart[i] = iAxisDim + istart[i]; - if (iend[i] < 0) iend[i] = iAxisDim + iend[i]; - if (istart[i] < 0) istart[i] = 0; - if (isteps[i] > 0) { - if (istart[i] > static_cast(iAxisDim)) istart[i] = static_cast(iAxisDim); - if (iend[i] < 0) iend[i] = 0; - if (iend[i] > static_cast(iAxisDim)) iend[i] = static_cast(iAxisDim); - } else if (isteps[i] < 0) { - if (istart[i] > static_cast(iAxisDim)-1) istart[i] = static_cast(iAxisDim) -1; - if (iend[i] < -1) iend[i] = -1; - if (iend[i] > static_cast(iAxisDim)-1) iend[i] = static_cast(iAxisDim) -1; - } else { - throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(isteps[i]) + - " for " + std::to_string(i)); - } - fStart[iaxes[i]] = istart[i]; - fEnd[iaxes[i]] = iend[i]; - fSteps[iaxes[i]] = isteps[i]; - } - } - - fShapeOutput = ShapeInference({fShapeInput})[0]; - // case input is a constant tensor and of int64 type - if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { - fIsOutputConstant = true; - auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - size_t outputSize = ConvertShapeToLength(fShapeOutput); - std::vector outputData(outputSize); - std::vector inputStride = UTILITY::ComputeStrideFromShape(fShapeInput); - // perform slice using a recursive function- need to use two lambda functions for this - auto sliceRecursive = [&](size_t iaxis, size_t & outIdx, size_t & inOffset) { - auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) { - // compute indices - std::vector indices; - for (IType i = fStart[iax]; (fSteps[iax] > 0) ? 
i < fEnd[iax] : i > fEnd[iax]; i += fSteps[iax] ) - indices.push_back(i); - if (iax == dim-1) { // last axis - for (size_t i = 0; i < indices.size(); i++) { - outputData[outputIdx] = inputData[inputOffset + indices[i]]; - outputIdx++; - } - return; - } else { - for (size_t i = 0; i < indices.size(); i++) { - size_t offset = inputOffset + inputStride[iax]*indices[i]; - sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl); - } - } - }; - slice_impl(iaxis, outIdx, inOffset,slice_impl); - }; - size_t idx = 0; - size_t offset = 0; - sliceRecursive(0, idx, offset); - - model.AddConstantTensor(fNOutput, fShapeOutput, outputData.data()); - if (model.Verbose()) { - std::cout << "Slice: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : " - << ConvertValuesToString(outputData) << std::endl; - } - } - else { - model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); - if (model.Verbose()) { - std::cout << "Slice ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; - } - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors - - OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeOutput.empty()){ - throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first"); - } - - std::stringstream out; - //std::string opName = "Slice"; - - out << SP << "///------- Slice operator\n" << std::endl; - // loop on the dimensions depending no the orders - size_t ndim = fShapeInput.size(); - std::vector strides(ndim,1); - for (int i = int(ndim-2); i >=0 ; i--) { - strides[i] = strides[i+1]*fShapeInput[i+1]; - } - - out << SP << "{\n"; // define operator scope - out << SP << "size_t iOut = 0;\n"; - std::string MSP = SP; - for (size_t idim = 0; idim < ndim; idim++) { - out << MSP << "for (size_t i" << idim << " = " << fStart[idim] << "; i" << idim << " < " << fEnd[idim] - << "; i" << idim << "+= " << 
fSteps[idim] << ") {\n"; - MSP += SP; - if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n"; - } - out << MSP << "size_t iInput = "; - for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + "; - // here should be step size ? - out << "i" << ndim-1 << ";\n"; - out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" < - -namespace SOFIE { - -template -class ROperator_Softmax final : public ROperator { - -private: - int64_t fAttrAxis; - - std::string fNX; - std::string fNY; - std::vector fShape; - - std::string fType; - -public: - ROperator_Softmax() {} - ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY) - : fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { return input; } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; // suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNX) == - false) { // input must be a graph input, or already initialized intermediate tensor - throw std::runtime_error("TMVA SOFIE Softmax Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) { - std::cout << "Softmax -> " << fNY << " " << ConvertShapeToString(fShape) << std::endl; - } - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Operator Softmax called to Generate without being initialized first"); - } - std::stringstream out; - size_t size = fShape.size(); - size_t length = ConvertShapeToLength(fShape); - size_t axis = fAttrAxis < 0 ? 
size + fAttrAxis : fAttrAxis; - out << "\n" << SP << "//------ SOFTMAX - " << size << " " << length << " " << axis << "\n"; - // use safe numerically implementation by subtracting max of tensor - if (size == 1) { - out << SP << fType << " vmax = tensor_" << fNX << "[0];\n"; - out << SP << "for (size_t i = 1; i < " << length << " ; i++){\n"; - out << SP << SP << "if (tensor_" << fNX << "[i] > vmax) vmax = tensor_" << fNX << "[i];\n"; - out << SP << "}\n"; - out << SP << fType << " sum = 0.0;\n"; - out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n"; - out << SP << SP << "tensor_" << fNY << "[i] = std::exp(tensor_" << fNX << "[i] - vmax);\n"; - out << SP << SP << "sum += tensor_" << fNY << "[i];\n"; - out << SP << "}\n"; - out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n"; - out << SP << SP << "tensor_" << fNY << "[i] /= sum;\n"; - out << SP << "}\n"; - } else { - size_t batch = fShape[0]; - size_t channel = fShape[1]; - size_t width = (size > 2) ? fShape[size - 1] : 1; - size_t height = (size > 3) ? fShape[size - 2] : 1; - size_t depth = (size > 4) ? 
fShape[size - 3] : 1; - size_t hStride = width; - size_t dStride = height * width; - size_t cStride = depth * dStride; - size_t bStride = channel * cStride; - - size_t N = 0; // Size of the axis - size_t iStride = 0; - if (axis == 0) { - N = batch; - iStride = bStride; - } else if (axis == 1) { - N = channel; - iStride = cStride; - } else if (axis == size - 1) { - N = width; - iStride = 1; - } else if (size > 3 && axis == size - 2) { - N = height; - iStride = hStride; - } else if (size == 5 && axis == size - 3) { - N = depth; - iStride = dStride; - } else { - throw - std::runtime_error("TMVA::SOFIE - Softmax operator along the axis " - + std::to_string(fAttrAxis) + " with " + std::to_string(size) - + "d input tensor not supported."); - } - - bool notBatch = axis != 0; - bool notChannel = axis != 1; - bool notDepth = (size == 5 && axis != 2); - bool notHeight = (size == 5 && axis != 3) || (size == 4 && axis != 2); - bool notWidth = (size == 5 && axis != 4) || (size == 4 && axis != 3) || (size == 3 && axis != 2); - - if (notBatch) { - out << SP << "for (size_t n = 0; n < " << batch << " ; n++){\n"; - } - if (notChannel) { - out << SP << SP << "for (size_t c = 0; c < " << channel << " ; c++){\n"; - } - if (notDepth) { - out << SP << SP << "for (size_t d = 0; d < " << depth << " ; d++){\n"; - } - if (notHeight) { - out << SP << SP << "for (size_t h = 0; h < " << height << " ; h++){\n"; - } - if (notWidth) { - out << SP << SP << "for (size_t w = 0; w < " << width << " ; w++){\n"; - } - out << SP << SP << SP << fType << " sum = 0.;\n"; - out << SP << SP << SP << "size_t index = 0"; - if (notBatch) { - out << " + n * " << bStride; - } - if (notChannel) { - out << "+ c * " << cStride; - } - if (notDepth) { - out << " + d * " << dStride; - } - if (notHeight) { - out << " + h * " << hStride; - } - if (notWidth) { - out << " + w"; - } - out << ";\n"; - // apply softmax along the axis - find first maximum value for numerical stability - if (N == 0) - throw 
std::runtime_error("TMVA::SOFIE - Softmax operator is along axis with zero elements"); - out << SP << SP << SP << fType << " vmax = tensor_" << fNX << "[index];\n"; - out << SP << SP << SP << "for (size_t i = 1; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "if (tensor_" << fNX << "[index + i*" << iStride << "] > vmax)\n"; - out << SP << SP << SP << SP << SP << "vmax = tensor_" << fNX << "[index + i*" << iStride << "];\n"; - out << SP << SP << SP << "}\n"; - out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] = std::exp(tensor_" << fNX - << "[index + i*" << iStride << "] - vmax);\n"; - out << SP << SP << SP << SP << "sum += tensor_" << fNY << "[index + i*" << iStride << "];\n"; - out << SP << SP << SP << "}\n"; - out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n"; - out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] /= sum;\n"; - out << SP << SP << SP << "}\n"; - if (notWidth) { - out << SP << SP << "}\n"; // end w - } - if (notHeight) { - out << SP << SP << "}\n"; // end h - } - if (notDepth) { - out << SP << SP << "}\n"; // end d - } - if (notChannel) { - out << SP << SP << "}\n"; // end c - } - if (notBatch) { - out << SP << "}\n"; // end n - } - } - return out.str(); - } -}; - -} // namespace SOFIE - -#endif // SOFIE_ROPERATOR_Softmax diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx deleted file mode 100644 index 37c92ee..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Tanh -#define SOFIE_ROPERATOR_Tanh - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Tanh final : public ROperator -{ - -private: - - std::string fNX; - std::string fNY; - std::vector fShape; - 
-public: - ROperator_Tanh(){} - ROperator_Tanh(std::string nameX, std::string nameY): - fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNX }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNX) == false){ - throw std::runtime_error("TMVA SOFIE Tanh Op Input Tensor is not found in model"); - } - fShape = model.GetTensorShape(fNX); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); - - } - - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Tanh operator called to Generate without being initialized first"); - } - std::stringstream out; - size_t length = ConvertShapeToLength(fShape); - out << "\n//------ TANH\n"; - out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n"; - out << SP << "}\n"; - return out.str(); - } - - std::vector GetStdLibs() override { return { std::string("cmath") };} -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_Tanh diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx deleted file mode 100644 index 354fbe3..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef SOFIE_ROPERATOR_Tile -#define SOFIE_ROPERATOR_Tile - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - -template -class ROperator_Tile final : public ROperator -{ - -private: - - std::string fNRepeats; - std::string 
fNInput; - std::string fNY; - std::vectorfShapeInput; - std::vector fShapeY; - -public: - ROperator_Tile(){} - ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): - fNRepeats(UTILITY::Clean_name(nameRepeat)),fNInput(UTILITY::Clean_name(nameInput)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNRepeats, fNInput }; - fOutputTensorNames = { fNY }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; - } - return {ret}; - } - - void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNInput) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - fShapeInput=model.GetTensorShape(fNInput); - - // if repeats vector is not initialized we cannot deduce shape of output - // not support for time being this case - if (!model.IsInitializedTensor(fNRepeats)) { - throw std::runtime_error("TMVA SOFIE Tile Op: non-initialized repeats input is not supported"); - } - - // Retrieve the data pointer for the repeats tensor - auto repptr = model.GetInitializedTensorData(fNRepeats); - // Cast the raw pointer to the appropriate type (size_t*) - auto repeats_data = static_cast(repptr.get()); - if (repeats_data == nullptr) { - throw std::runtime_error("Failed to retrieve the data for the repeats tensor."); - } - // Get the shape of the repeats tensor to determine the number of elements - auto repeats_shape = model.GetTensorShape(fNRepeats); - // Ensure the repeats tensor is 1D and get the number of elements - if (repeats_shape.size() != 1) { - throw 
std::runtime_error("Repeats tensor is not 1D."); - } - size_t num_elements = repeats_shape[0]; - // Convert the data to a vector of size_t - std::vector repeats_vector(num_elements); - std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); - - - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; - - model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); - - if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; - } - - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); - } - - //size_t input_length = ConvertShapeToLength(fShapeInput); - //size_t output_length = ConvertShapeToLength(fShapeY); - - - std::stringstream out; - std::string input = "tensor_" + fNInput; - std::string output = "tensor_" + fNY; - out << "///-------- Tile operator\n"; - out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; - - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; - out << "int s = 1;\n"; - // loop from inverse dim order - out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; - out << SP << "int r = tensor_" << fNRepeats << "[i];\n"; - // we cannot exclude case where repeats=1 since we need offset - //out << SP << "if (r == 1 && i < " << fShapeInput.size()-1 << ") continue;\n"; - out << SP << "int i_offset = 0, o_offset = 0;\n"; - out << SP << "s = s * input_shape[i];\n"; - // case we have first copy - out << SP << "if (i == " << fShapeInput.size()-1 << ") {\n"; - out << SP << SP << "for (int j = 0; j < inputLength/s ; j++) 
{\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << input << "+ i_offset, " - << input << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << SP << "i_offset += s;\n"; - out << SP << SP << "}\n"; // end j loop - out << SP << "} else {\n"; // second copy we do from output to output - // and we need to loop on j from reverse order to avoir re-writing in output tensor - out << SP << SP << "for (int j = inputLength/s - 1 ; j>=0; j--) {\n"; - out << SP << SP << SP << "o_offset = j*s*r;\n"; - out << SP << SP << SP << "i_offset = j*s;\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << output << "+ i_offset, " - << output << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << "}\n"; // end j loop - out << SP << "}\n"; // end if - out << SP << "s *= r;\n"; - out << SP << "inputLength *= r;\n"; - out << "}\n"; // end i loop - out << "}\n"; // end of scope - return out.str(); - } -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx deleted file mode 100644 index 11c40bb..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef SOFIE_ROPERATOR_TRANSPOSE -#define SOFIE_ROPERATOR_TRANSPOSE - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include -#include - - -namespace SOFIE{ - - - - -template -class ROperator_Transpose final : public ROperator -{ - -private: - std::vector fAttrPerm; - - std::string fNData; - std::string fNOutput; - std::vector fShapeData; - std::vector fShapeOutput; - -public: - - 
ROperator_Transpose(){} - ROperator_Transpose(std::vector attr_perm, std::string nameData, std::string nameOutput): - fAttrPerm(attr_perm), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) { - fInputTensorNames = { fNData }; - fOutputTensorNames = { fNOutput }; - } - - ROperator_Transpose(std::string nameData, std::string nameOutput): - fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) { - fInputTensorNames = { fNData }; - fOutputTensorNames = { fNOutput }; - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - if (input.size() > 1) throw std::runtime_error("TMVA SOFIE Tranpose Op Shape Inference only need 1 input tensor"); - auto& data = input[0]; - if (fAttrPerm.size() != data.size() ) - throw std::runtime_error("TMVA SOFIE Tranpose Op - Invalid axes attributes"); - - std::vector output_shape(fAttrPerm.size()); - for (size_t i = 0; i < fAttrPerm.size(); i++){ - output_shape[i] = data[fAttrPerm[i]]; - } - std::vector> ret; - ret.push_back(output_shape); - return ret; - } - - - void Initialize(RModel& model) override { - if (model.CheckIfTensorAlreadyExist(fNData) == false){ //input must be a graph input, or already initialized intermediate tensor - std::cout<<"Input tensor for transpose: "<= 0; i--){ - fAttrPerm.push_back(i); - } - } - std::vector> inputs = { fShapeData }; - fShapeOutput = ShapeInference(inputs).front(); - if (model.IsInitializedTensor(fNData)) { - fIsOutputConstant = true; - // case input is a constant or initialized tensor we perform here the transpose - auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData); - auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); - size_t length = ConvertShapeToLength(fShapeOutput); - auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - size_t dim = fShapeData.size(); - std::vector outputIdx(dim); - std::vector 
outputData(length); - for (size_t i = 0; i < length; i++) { - outputIdx[0] = i / outStrides[0]; - for (size_t j = 1; j < dim; j++) { - outputIdx[j] = (i % outStrides[j-1]) / outStrides[j]; - } - // compute input index - size_t inputIndex = 0; - for (size_t j = 0; j < dim; j++) { - // find value in fAtrrPerm corresponding to j - int k = std::find(fAttrPerm.begin(), fAttrPerm.end(), j) - fAttrPerm.begin(); - inputIndex += outputIdx[k] * inStrides[j]; - } - outputData[i] = inputData[inputIndex]; - } - model.AddConstantTensor(fNOutput, fShapeOutput, outputData.data()); - if (model.Verbose()) { - std::cout << "Transpose: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : " - << ConvertValuesToString(outputData) << std::endl; - } - } else { - model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); - if (model.Verbose()) { - std::cout << "Transpose ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; - } - } - } - - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors - OpName = "op_" + OpName; - if (fShapeData.empty() || fShapeOutput.empty()){ - throw std::runtime_error("TMVA SOFIE Transpose Op called to Generate without being initialized first"); - } - int dim = fShapeData.size(); - auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData); - auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); - size_t length = inStrides[0]*fShapeData[0]; // total tensor size - assert (length == outStrides[0]*fShapeOutput[0]); - - std::stringstream out; - // Implement transpose operator using consecutive read inputs. - // But - // tensorOut[id] = tensorInput[ inStrides[0]*i0 + inStrides[1]*i1 + inStrides[2]*i2 + ...] - // now if (j0,j1,j2) are the output indices - // j0 = id / outStrides[0] - // j1 = (id % outStrides[0])/outStrides[1] - // j2 = (id % outStrides[1])/outStrides[2] - //...... 
- // and we have j_k = i_fAttrPerm[k] - // since we are using consecutive writes we should find the inverse of fAttrPerm - out << SP << "///------- Transpose operator\n" << std::endl; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNOutput << "[id] = tensor_" << fNData << "[ "; - // compute output j indices - std::vector i_out(dim); - for (int k =0; k < dim; k++){ - if (k == 0) - i_out[k] = "id"; - else - i_out[k] = "(id % " + std::to_string(outStrides[k-1]) + ")"; - if (k < dim-1) - i_out[k] += " / " + std::to_string(outStrides[k]); - } - // use now them for input tensors - // need to invert the fAttrPerm[k] - for (int k =0; k < dim; k++){ - // find value in fAtrrPerm corresponding to k - int l = std::find(fAttrPerm.begin(), fAttrPerm.end(), k) - fAttrPerm.begin(); - assert(l >= 0 && l < dim); - out << "( " << i_out[l] << " )"; - if (k < dim-1) { - out << " * " << inStrides[k]; - out << " + "; - } - } - out << "];\n"; - out << SP << "}\n"; - return out.str(); - } - - -}; - -}//SOFIE - - -#endif //SOFIE_ROPERATOR_TRANSPOSE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx deleted file mode 100644 index 28ac093..0000000 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ /dev/null @@ -1,243 +0,0 @@ -#ifndef SOFIE_ROperator_Where -#define SOFIE_ROperator_Where - -#include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" -#include "SOFIE/RModel.hxx" - -#include - - -namespace SOFIE{ - - - -template -class ROperator_Where final : public ROperator{ -private: - - bool fIsInputBoolTensor = false; - - - std::string fNA; - std::string fNB; - std::string fNC; - std::string fNBroadcastedA; - std::string fNBroadcastedB; - std::string fNBroadcastedC; - std::string fNY; - - - std::vector fShapeA; - std::vector fShapeB; - std::vector fShapeC; - std::vector fShapeY; - - -public: - ROperator_Where(){} - ROperator_Where(const std::string & nameA, const 
std::string & nameB, const std::string & nameC, const std::string & nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB, fNC }; - fOutputTensorNames = { fNY }; - } - - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { - // assume now inputs have same shape (no broadcasting) - auto ret = std::vector>(1, input[0]); // return vector size 1 with first input - return ret; - } - - void Initialize(RModel& model) override { - // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNA + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNB)) { - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNB + "is not found in model"); - } - if (!model.CheckIfTensorAlreadyExist(fNC)) { - throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNC + "is not found in model"); - } - // check if fNC input tensor is boolean - if (model.IsReadyInputTensor(fNC)) - fIsInputBoolTensor = true; - // check broadcast for A, B and C - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - fShapeC = model.GetTensorShape(fNC); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB) || !UTILITY::AreSameShape(fShapeA, fShapeC); - if (broadcast) { - // find shape to broadcast between A,B,C looking for max length - size_t lengthA = ConvertShapeToLength(fShapeA); - size_t lengthB = ConvertShapeToLength(fShapeB); - size_t lengthC = ConvertShapeToLength(fShapeC); - bool broadcastA = false, broadcastB = false, broadcastC = false; - if (lengthA >= lengthB && lengthA >= lengthC) { - fShapeY 
= fShapeA; - //broadcast B and C if different than A - broadcastB = (lengthB != lengthA); - broadcastC = (lengthC != lengthA); - } - else if (lengthB >= lengthA && lengthB >= lengthC) { - fShapeY = fShapeB; - //broadcast A and C if different than B - broadcastA = (lengthA != lengthB); - broadcastC = (lengthC != lengthB); - } - else if (lengthC >= lengthA && lengthC >= lengthB) { - fShapeY = fShapeC; - //broadcast A and B if different than C - broadcastA = (lengthA != lengthC); - broadcastB = (lengthB != lengthC); - } - - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "BC_" + fNA + "_to_" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); - } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "BC_" + fNB + "_to_" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); - } - } - // Broadcast C to Y - if (broadcastC) { - fNBroadcastedC = "BC_" + fNC + "_to_" + fNY; - if (model.IsInitializedTensor(fNC)) { - auto data = 
model.GetInitializedTensorData(fNC); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), - std::default_delete()); - // do not update tensor C but add broadcasted one (since it can be input to some other operators) - model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData); - fShapeC = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY); - } - } - } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? 
dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - model.SetNotWritableInitializedTensor(nameC); - - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - - // output is a constant tensor - fOutputTensorNames.pop_back(); - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); - } - } - - std::string GenerateInitCode() override { - std::stringstream out; - return out.str(); - } - - std::string Generate(std::string OpName) override { - - if (fIsOutputConstant) return ""; - - OpName = "op_" + OpName; - - if (fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); - } - std::stringstream out; - out << SP << "\n//-------- Where \n"; - size_t length = ConvertShapeToLength(fShapeY); - std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - // Broadcast C if it's uninitialized - if (fShapeC != fShapeY) { - // special case if C is an 
input tensor - if (fIsInputBoolTensor) { - size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; - } - out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; - //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedC << ");\n"; - } - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) - out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" - << nameA << "[id] : tensor_" + nameB + "[id];\n"; - out << SP << "}\n"; - return out.str(); - } - -}; - -}//SOFIE - - -#endif //SOFIE_ROperator_Where diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx deleted file mode 100644 index e5495ed..0000000 --- a/src/SOFIE_core/src/RModel.cxx +++ /dev/null @@ -1,1327 +0,0 @@ -#include -#include -#include -#include -#include - -#include "TFile.h" - -#include "SOFIE/RModel.hxx" -#include "SOFIE/SOFIE_common.hxx" - - -namespace SOFIE { - -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; -} - -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = 
other.fNeededStdLib; - return *this; -} - -const std::vector& RModel::GetTensorShape(std::string name) const { - auto f = fReadyInputTensorInfos.find(name); - if (f != fReadyInputTensorInfos.end()) { - return f->second.shape; - } - auto f2 = fInitializedTensors.find(name); - if (f2 != fInitializedTensors.end()) { - return f2->second.shape(); - } - auto f3 = fInputTensorInfos.find(name); - if (f3 != fInputTensorInfos.end()) { - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter"); - } - auto f4 = fIntermediateTensorInfos.find(name); - if (f4 != fIntermediateTensorInfos.end()) { - return f4->second.shape; - } - if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape"); - - if (fIsSubGraph && fParentGraph) - return fParentGraph->GetTensorShape(name); - - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); -} - -std::vector RModel::GetDynamicTensorShape(std::string name) const { - if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { - return f->second.shape; - } - if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { - return f->second.shape; - } - // in case is not a dynamic tensor convert normal shape to Dim one - // for this we need to return the vector by value - return ConvertShapeToDim(GetTensorShape(name)); -} - -const ETensorType& RModel::GetTensorType(std::string name) const { - auto f = fReadyInputTensorInfos.find(name); - if (f != fReadyInputTensorInfos.end()) { - return f->second.type; - } - auto f2 = fInitializedTensors.find(name); - if (f2 != fInitializedTensors.end()) { - return f2->second.type(); - } - auto f3 = fInputTensorInfos.find(name); - if (f3 != fInputTensorInfos.end()) { - return f3->second.type; - } - auto f4 = 
fIntermediateTensorInfos.find(name); - if (f4 != fIntermediateTensorInfos.end()) { - return f4->second.type; - } - auto f5 = fDynamicTensorInfos.find(name); - if (f5 != fDynamicTensorInfos.end()){ - return f5->second.type; - } - - if (fIsSubGraph && fParentGraph) - return fParentGraph->GetTensorType(name); - - throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName); -} - -bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { - if (fReadyInputTensorInfos.find(tensor_name) != fReadyInputTensorInfos.end()) return true; - if (fInputTensorInfos.find(tensor_name) != fInputTensorInfos.end()) return true; - if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; - if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; - if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; - if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); - return false; -} - -void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { - input_name = UTILITY::Clean_name(input_name); - if (CheckIfTensorAlreadyExist(input_name)) { - throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n"); - } - - InputTensorInfo inputInfo { type, shape }; - fInputTensorInfos[input_name] = inputInfo; -} - -void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape) { - input_name = UTILITY::Clean_name(input_name); - if (CheckIfTensorAlreadyExist(input_name)) { - throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n"); - } - TensorInfo inputInfo { type, shape }; - fReadyInputTensorInfos[input_name] = inputInfo; -} - -void RModel::AddInputTensorName(std::string input_name) { - fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name)); -} - 
-void RModel::AddOperator(std::unique_ptr op, int order_execution) { - AddBlasRoutines(op->GetBlasRoutines()); - auto libs = op->GetStdLibs(); - auto op_input_tensors = op->GetOpInputTensors(); - for (auto& stdlib : libs) { - AddNeededStdLib(stdlib); - } - if (order_execution >= 0) { - fOperators.insert(fOperators.begin() + order_execution, std::move(op)); - } else { - fOperators.push_back(std::move(op)); - } - - // storing the last usage of tensors which are input to - // operators (but are not inputs to the model, i.e. they are intermediate - // tensors). This information is needed to keep a check on when a - // particular intermediate tensor can be flushed to free up memory for reuse. - for(size_t index = 0; index shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - //NB: own data - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); - } - InitializedTensor new_tensor {type, shape, data}; - fInitializedTensors[tensor_name] = new_tensor; -} - -void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - //NB: own data - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); - } - InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor - fInitializedTensors[tensor_name] = new_tensor; -} - -bool RModel::IsInitializedTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fInitializedTensors.find(name) != fInitializedTensors.end(); -} -bool RModel::IsConstantTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - auto itr = fInitializedTensors.find(name); - if (itr == 
fInitializedTensors.end()) return false; - return itr->second.IsConstantTensor(); -} - -bool RModel::IsDynamicTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); -} -bool RModel::IsDimInputTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fInputTensorInfos.find(name) != fInputTensorInfos.end(); -} -bool RModel::IsReadyInputTensor(const std::string& tensorName) const { - std::string name = UTILITY::Clean_name(tensorName); - return fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end(); -} - -// generic addition of a tensor -void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape) { - auto int_shape = ConvertShapeToInt(dim_shape); - if (!int_shape.empty()) - AddIntermediateTensor(tensor_name, type, int_shape); - else - AddDynamicTensor(tensor_name, type, dim_shape); -} - -void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape) { - tensor_name = UTILITY::Clean_name(tensor_name); - if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n"); - } - TensorInfo new_tensor {type, shape}; - fIntermediateTensorInfos[tensor_name] = new_tensor; -} - -void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape){ - tensor_name = UTILITY::Clean_name(tensor_name); - if (CheckIfTensorAlreadyExist(tensor_name)){ - throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n"); - } - DynamicTensorInfo new_tensor {type, shape}; - fDynamicTensorInfos[tensor_name] = new_tensor; - // store shape parameter if not existing - for (auto &d : shape) { - if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some 
other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } - } - } - } -} - -void RModel::AddOutputTensorNameList(std::vector outputtensornames) { - fOutputTensorNames.clear(); - for(auto& it : outputtensornames) { - fOutputTensorNames.emplace_back(UTILITY::Clean_name(it)); - } -} - -void RModel::UpdateOutputTensorList(std::vector curr_output_tensors, std::vector new_output_tensors) { - for(auto& it:curr_output_tensors) { - fOutputTensorNames.erase(std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), it), fOutputTensorNames.end()); - } - fOutputTensorNames.insert(fOutputTensorNames.end(), new_output_tensors.begin(), new_output_tensors.end()); -} - -void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data) { - tensor_name = UTILITY::Clean_name(tensor_name); - if (!CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to update it"); - } - InitializedTensor new_tensor {type, shape, data}; - fInitializedTensors[tensor_name] = new_tensor; -} - -std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) { - auto f = fInitializedTensors.find(tensor_name); - if (f == fInitializedTensors.end()) { - throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to get its data"); - } else { - return f->second.sharedptr(); - } -} - -void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { - auto t = fInitializedTensors.find(tensor_name); - if (t == fInitializedTensors.end()) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor " + tensor_name + " not found when trying to get its info"); - } - t->second.SetNotWritable(); - } - -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { - - std::string memory_allocation_string = ""; - bool allocated; - - for 
(auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; - - auto tensor_size = GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; - - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { - - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; - - allocated = true; - - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } - - break; - } - ++chunk; - } - - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; - - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } - } - return memory_allocation_string; -} - -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ - // last occurence of the tensor is reached => flush it from memory - if (fIntermediateTensorFrequencyLookup[it] == op_idx) { - for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? 
fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } - } - } - } - } -} - - - -void RModel::Initialize(int batchSize, bool verbose) { - std::map inputParams; - if (batchSize > 0) { - inputParams["input_size"] = batchSize; - inputParams["batch_size"] = batchSize; - inputParams["bs"] = batchSize; - } - Initialize(inputParams, verbose); - fIntermediateMemoryInfo = MemoryPoolInfo(); -} -void RModel::Initialize(const std::map & inputParams, bool verbose) { - - fVerbose = int(verbose); - - if (fIsInitialized) { - if (verbose) - std::cout << "Model is already initialized - skip initialization " << std::endl; - return; - } - fIntermediateTensorInfos.clear(); - fDynamicTensorInfos.clear(); - - // loop on inputs and see if shape can be full specified - // if the batch size is provided it can be used to specify the full shape - // Add the full specified tensors in fReadyInputTensors collection - auto originalInputTensorInfos = fInputTensorInfos; // need to copy 
because we may delete elements - for (auto &input : originalInputTensorInfos) { - if (verbose) std::cout << "looking at the tensor " << input.first << std::endl; - // if a parameter (e.g. batch_size) is specified use for converting parametric shape in defined one - if (!inputParams.empty()) { - for (auto &d : input.second.shape) { - if (d.isParam) { - std::string pname = d.param; - if (pname == input.first + "_size") pname = "input_size"; - auto itr = inputParams.find(pname); - if (itr != inputParams.end() ) { - d = Dim{ itr->second }; - if (verbose) - std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl; - } - } - } - } - // see if shape now is fully defined - auto shape = ConvertShapeToInt(input.second.shape); - if (verbose) - std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; - if (!shape.empty()) { - // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and - // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored - fInputTensorInfos.erase(input.first); - // add to the ready input tensor information the new fixed shape - AddInputTensorInfo(input.first, input.second.type, shape); - // check consistency - assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size()); - } - // store the parameters of the input tensors - else { - // store the found parametric shape parameters - for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); - } - } - } - - if (verbose) { - PrintRequiredInputTensors(); - PrintDynamicTensors(); - } - - // check if there are initialized tensors to write in a weight file - // support for the time being only weight of FLOAT type - if (fUseWeightFile) { - bool modelHasWeights = false; - for (auto &i : 
fInitializedTensors) { - if (i.second.type() == ETensorType::FLOAT) { - modelHasWeights = true; - break; - } - } - if (!modelHasWeights) - fUseWeightFile = false; - } - // Go through model and initialize each operator - int i = 0; - - std::vector temp_available_stack; // vector stores individual chunks of available memory that maybe reused - - for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){ - if (verbose) { - auto& r = *fOperators[op_idx].get(); - std::cout << "Initializing operator " << i << " " << typeid(r).name() << std::endl; - } - fOperators[op_idx]->Initialize(*this); - for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ - if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ - fIntermediateTensorFrequencyLookup[it] = op_idx; - } - } - i++; - } - - fIsInitialized = true; -} - -void RModel::InitializeSubGraph(std::shared_ptr graph) { - // add the subgraph to the list - fSubGraphs.push_back(graph); - //this needs to be done before initializing - graph->fParentGraph = this; - graph->fIsSubGraph = true; - - graph->Initialize(fBatchSize, fVerbose); - // set the same options as parent model - graph->fWeightFile = fWeightFile; - graph->fUseWeightFile = fUseWeightFile; - graph->fUseSession = fUseSession; - // add needed blas routines and libs - std::vector blasRoutines; - for (auto & e : graph->fNeededBlasRoutines) - blasRoutines.push_back(e); - AddBlasRoutines(blasRoutines); - for (auto e : graph->fNeededStdLib) - AddNeededStdLib(e); - - // add parent input tensors to current graph - for (auto & name : fInputTensorNames) - graph->fInputTensorNames.emplace_back(name); - - // clean graph name - graph->fName = UTILITY::Clean_name(graph->fName); - -} - 
-// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - -void RModel::GenerateInitializedTensorInfo() -{ - if (!fInitializedTensors.empty()) - fGC += "// initialized tensors\n"; - - for (auto &i : fInitializedTensors) { - if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) - fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) - fGC += GenerateConstantTensorCode(i); - - } else { - // case of tensors which are read from a file - size_t length = ConvertShapeToLength(i.second.shape()); - if (i.second.type() == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + " = 
std::vector(" + std::to_string(length) + ");\n"; - fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - } - } -} - -void RModel::GenerateIntermediateMemoryPool() { - if (fIntermediateMemoryInfo.total_stack.size() == 0) return; - fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; - - // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors - // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; -} - -void RModel::GenerateIntermediateTensorInfo() { - if (!fIntermediateTensorInfos.empty()) { - std::string tensor_declaration_block = ""; - - for (auto &i : fIntermediateTensorInfos) { - if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL - } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { - size_t length = ConvertShapeToLength(i.second.shape); - - if (i.second.type == ETensorType::FLOAT) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - else if (i.second.type == ETensorType::DOUBLE) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - else if (i.second.type == ETensorType::INT64) { - tensor_declaration_block += 
"std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; - tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; - } - } - } - - if (tensor_declaration_block.length()) { - fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; - } - } - // add also the dynamic tensors (only declarations, allocation will be done later) - if (!fDynamicTensorInfos.empty()) { - fGC += "//--- declare the dynamic tensors\n"; - for (auto &i : fDynamicTensorInfos) { - if (i.second.type == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "float * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::DOUBLE) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "double * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::INT64) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; - } - } - } -} - -// generate code for specific operator declarations to be defined in the Session class -void RModel::GenerateOperatorDeclarations() { - std::string strcode; - for (auto & op : fOperators) { - strcode += op->GenerateDeclCode(); - } - if (strcode.empty()) return; - fGC += "\n//---- operator declarations \n"; - fGC += strcode; - fGC += "\n"; -} - -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); -} - -std::string RModel::GenerateInferSignature(bool isdecl) { - // generate the infer signature given the 
inputs: eg. "float * tensor1, float * tensor2" - // if (decl = false) generate only calling signature (tensor1,tensor2,....) - std::string rGC; - std::unordered_map inputParams; - int i_input = 0; - for (auto &name : fInputTensorNames) { - // if is a dynamic tensor pass initial parameters - if (IsDimInputTensor(name)) { - auto shape = GetDynamicTensorShape(name); - for (auto &d : shape) { - std::string pName = d.param; - // need to check if the input parameters is already existing in another input tensor - if (d.isParam && inputParams.count(pName) == 0) { - if (isdecl) rGC += "size_t "; - rGC += d.param + ","; - inputParams[pName] = i_input; - } - } - } - if (isdecl) { - std::string type = ConvertTypeToString(GetTensorType(name)); - if (type == "other") - throw std::runtime_error("TMVA-SOFIE: input tensor " + name + - " is of a data type which is not yet supported."); - rGC += type + "* "; - } - rGC += "tensor_" + name + ","; - i_input++; - } - - if (fInputTensorNames.size() > 0) rGC.pop_back();// remove last "," - return rGC; -} - -namespace { - -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) -{ - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = 
ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; -} - -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; - - size_t outputSize = fOutputTensorNames.size(); - // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); - - bool sameOutputTypes = true; - std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); - fGC += "\n\n"; - if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; - } else { - // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) - sameOutputTypes = false; - } - if (sameOutputTypes) - fGC += "std::vector>"; - else { - inferReturnType = "std::tuple<"; - for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; - } - inferReturnType += ">"; - fGC += inferReturnType; - } - } - - fGC += " infer("; - - fGC += GenerateInferSignature(); - - fGC += "){\n"; - - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); - } - - fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) - fGC += ","; - } - fGC += "};\n"; - fGC += "}\n"; // end of infer function scope -} - -void RModel::GenerateSessionCode() -{ - - // define the Session struct (for GNN this is generated in RModel_GNN) - if (fUseSession && !fIsGNNComponent) { - if (!fIsSubGraph) - fGC += "struct Session {\n"; - else - fGC += "struct Session_" + fName + " {\n"; - } - - // generate code for declaring the initialized tensors - GenerateInitializedTensorInfo(); - - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } - - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<fName + " fSession_" + graph->fName + ";\n"; - } - - // Generate code for Session constructor - if (fUseSession) { - std::string sessionName = "Session"; - if (fIsSubGraph) - sessionName += "_" + fName; - // add here specific operator code that needs to define session data members - fGC += "\n"; - for (size_t id = 0; id < fOperators.size(); id++) { - std::string opName = std::to_string(id); - fGC += fOperators[id]->GenerateSessionMembersCode(opName); - } - fGC += "\n"; - // here add 
initialization and reading of weight tensors - if (fUseWeightFile) { - std::string fileName = fName; - if (fWeightFile == WeightFileType::Text) { - fileName += ".dat"; - } - if (fWeightFile == WeightFileType::RootBinary) { - fileName += ".root"; - } - fGC += sessionName + "(std::string filename =\"" + fileName + "\""; - } else { - // no need to pass weight file since it is not used - // keep passing a string for compatibility - fGC += sessionName + "(std::string = \"\""; - } - // add initialization of shape parameters - // assume all parameters are of type size_t - if (!fShapeParams.empty()) { - for (auto &p : fShapeParams) { - fGC += ",\n"; - fGC += " size_t " + p.first + " = " + p.second; - } - } - fGC += ") {\n"; - - if (fUseWeightFile) { - fGC += "\n//--- reading weights from file\n"; - ReadInitializedTensorsFromFile(fReadPos); - fGC += "\n"; - // fUseWeightFile = fUseWeightFile; - } - - // now we have passed the parameters we can allocate the dynamic tensors - GenerateDynamicTensorInfo(); - - // add here initialization code for operator - for (size_t id = 0; id < fOperators.size(); id++) { - fGC += fOperators[id]->GenerateInitCode(); - } - - fGC += "}\n\n"; - } - // generate the inference code - GenerateOutput(); - - // end of session - if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; - } -} - -void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) -{ - fVerbose = verbose; - fBatchSize = batchSize; - fReadPos = pos; - - // session flag is used in operator initialize - if (static_cast>(Options::kNoSession) & options) { - fUseSession = false; - fWeightFile = WeightFileType::None; - } - if (static_cast>(Options::kNoWeightFile) & options) { - fUseWeightFile = false; - fWeightFile = WeightFileType::None; - } - if (static_cast>(Options::kRootBinaryWeightFile) & options) { - fUseWeightFile = true; - fWeightFile = WeightFileType::RootBinary; - } - if (fUseWeightFile && !fUseSession) { - throw 
std::runtime_error( - "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); - } - - if (static_cast>(Options::kGNN) & options) - fIsGNN = true; - if (static_cast>(Options::kGNNComponent) & options) - fIsGNNComponent = true; - - // initialize the model including all operators and sub-graphs - Initialize(batchSize, verbose); - - std::string hgname; - if (!fIsGNNComponent && !fIsSubGraph) { - fGC.clear(); - GenerateHeaderInfo(hgname); - } - - // generate first code for the subgraphs - for (auto &graph : fSubGraphs) { - if (fVerbose) - std::cout << "generate session code for subgraph " << graph->fName << std::endl; - graph->GenerateSessionCode(); - fGC += graph->fGC; - } - - if (fVerbose) - std::cout << "generate Main session code - model " << fName << std::endl; - - // generate main session code - GenerateSessionCode(); - - if (!fIsGNNComponent && !fIsSubGraph) { - fGC += ("} //SOFIE_" + fName + "\n"); - fGC += "\n#endif // " + hgname + "\n"; - } -} - -void RModel::ReadInitializedTensorsFromFile(long pos) { - // generate the code to read initialized tensors from a text data file - if (fWeightFile == WeightFileType::Text) { - if (fInitializedTensors.empty()) return; - - fGC += " std::ifstream f;\n"; - fGC += " f.open(filename);\n"; - fGC += " if (!f.is_open()) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open file \" + filename + \" for input weights\");\n"; - fGC += " }\n"; - - if(fIsGNNComponent) { - fGC += " f.seekg(" + std::to_string(pos) + ");\n"; - } - - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; - - // loop on tensors and parse the file - for (auto& i: fInitializedTensors) { - // skip Constant and shape tensors (not written in a file) - if (!i.second.IsWeightTensor()) continue; - std::string tensor_name = "tensor_" + i.first; - if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = 
std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; - } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); - } - } - fGC += " f.close();\n"; - } - - // generate the code to read initialized tensors from a ROOT data file - if(fWeightFile == WeightFileType::RootBinary) { - fGC += " {\n"; - fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; - fGC += " if (!rootFile->IsOpen()) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open ROOT file for input weights\");\n"; - fGC += " }\n"; - - std::string dirName = fName + "_weights"; - fGC += " if (!rootFile->GetKey(\"" + dirName + "\")) {\n"; - fGC += " throw std::runtime_error(\"tmva-sofie failed to open ROOT directory for input weights\");\n"; - fGC += " }\n"; - - for (auto &i : fInitializedTensors) { - // skip Constant and shape tensors - if (!i.second.IsWeightTensor()) continue; - fGC += " {\n"; - std::string tensor_name = "tensor_" + i.first; - if (i.second.type() == ETensorType::FLOAT) { - fGC += " fTensor_" + i.first + " = 
*reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + "/" + tensor_name + "\"));\n"; - } else if (i.second.type() == ETensorType::DOUBLE) { - fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + + "/" + tensor_name + "\"));\n"; - } else if (i.second.type() == ETensorType::INT64) { - fGC += " fTensor_" + i.first + " = *reinterpret_cast*>(rootFile->Get(\""; - fGC += dirName + "/" + tensor_name + "\"));\n"; - } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file"); - } - fGC += " }\n"; - } - fGC += " }\n"; - } -} - -long RModel::WriteInitializedTensorsToFile(std::string filename) { - // Determine the file extension based on the weight file type - std::string fileExtension; - switch (fWeightFile) { - case WeightFileType::None: - fileExtension = ".dat"; - break; - case WeightFileType::RootBinary: - fileExtension = ".root"; - break; - case WeightFileType::Text: - fileExtension = ".dat"; - break; - } - - // If filename is empty, use the model name as the base filename - if (filename.empty()) { - filename = fFileName + fileExtension; - } - - // Write the initialized tensors to the file - if (fWeightFile == WeightFileType::RootBinary) { - if(fIsGNNComponent || fIsGNN) { - throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); - } - std::unique_ptr outputFile(TFile::Open(filename.c_str(), "UPDATE")); - - std::string dirName = fName + "_weights"; - // check if directory exists, in case delete to replace with new one - if (outputFile->GetKey(dirName.c_str())) - outputFile->rmdir(dirName.c_str()); - - auto outputDir = outputFile->mkdir(dirName.c_str()); - - for (const auto& item : fInitializedTensors) { - // skip Constant tensors and tensors which are not writable (e.g. 
shape tensors) - if (!item.second.IsWeightTensor()) continue; - std::string tensorName = "tensor_" + item.first; - size_t length = 1; - length = ConvertShapeToLength(item.second.shape()); - if(item.second.type() == ETensorType::FLOAT) { - const float* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else if(item.second.type() == ETensorType::DOUBLE) { - const double* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else if(item.second.type() == ETensorType::INT64) { - const int64_t* data = item.second.data(); - std::vector tensorDataVector(data, data + length); - outputDir->WriteObjectAny(&tensorDataVector, "std::vector", tensorName.c_str()); - } - else { - std::runtime_error("tmva-sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) + - " cannot be written to a ROOT file"); - } - } - outputFile->Write(filename.c_str()); - - // this needs to be changed, similar to the text file - return -1; - - } else if (fWeightFile == WeightFileType::Text) { - std::ofstream f; - if(fIsGNNComponent) { - // appending all GNN components into the same file - f.open(filename, std::ios::app); - } else { - f.open(filename); - } - if (!f.is_open()) - throw - std::runtime_error("tmva-sofie failed to open file " + filename + " for tensor weight data"); - for (auto& i: fInitializedTensors) { - // skip Constant tensors and not writable tensors (e.g. 
shape tensors) - if (!i.second.IsWeightTensor()) { - continue; - } - size_t length = ConvertShapeToLength(i.second.shape()); - std::string tensor_name = "tensor_" + i.first; - f << tensor_name << " " << length << "\n"; - if (i.second.type() == ETensorType::FLOAT) { - const float * data = i.second.data(); - for (size_t idx = 0; idx < length; idx++) { - // round to zero sub-normal values - float value = data[idx]; - if (value != 0. && std::abs(value) < std::numeric_limits::min() ) value = 0; - f << std::setprecision(std::numeric_limits::max_digits10) << value; - f << ( (idx < length-1) ? " " : "\n" ); - } - } - else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); - } - if (f.fail()) - std::runtime_error("tmva-sofie failed to write tensor data to file for " + tensor_name); - } - long curr_pos = f.tellp(); - f.close(); - return curr_pos; - } else { - return -1; - } -} - -void RModel::PrintRequiredInputTensors() { - std::cout << "Model requires following inputs:\n"; - for (auto& inputInfo: fInputTensorInfos) { - std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; - std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { - if (inputInfo.second.shape[i].isParam) { - std::cout << inputInfo.second.shape[i].param; - } else { - std::cout << inputInfo.second.shape[i].dim ; - } - if (i < inputInfo.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - - for (auto& inputInfo: fReadyInputTensorInfos) { - std::cout << "Fully Specified Tensor name: " << inputInfo.first << "\t"; - std::cout << "type: " << ConvertTypeToString(inputInfo.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < inputInfo.second.shape.size(); i++) { - std::cout << inputInfo.second.shape[i]; - if (i < inputInfo.second.shape.size() - 
1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintInitializedTensors() { - std::cout << "Model initialized the following tensors:\n"; - for (auto& it: fInitializedTensors) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type()) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape().size(); i++) { - std::cout << it.second.shape()[i]; - if (i < it.second.shape().size() - 1) std::cout << ","; - } - std::cout << "]"; - if (it.second.IsConstantTensor()) std::cout << " (Constant)"; - else if (!it.second.IsWeightTensor()) std::cout << " (Not Writable)"; - std::cout << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintIntermediateTensors() { - std::cout << "Model specify the following intermediate tensors:\n"; - for (auto& it: fIntermediateTensorInfos) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape.size(); i++) { - std::cout << it.second.shape[i]; - if (i < it.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintDynamicTensors() { - std::cout << "Model specify the following dynamic tensors:\n"; - for (auto& it: fDynamicTensorInfos) { - std::cout << "Tensor name: \"" << it.first << "\"\t"; - std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t"; - std::cout << "shape: ["; - for (size_t i = 0; i < it.second.shape.size(); i++) { - std::cout << it.second.shape[i].GetVal(); - if (i < it.second.shape.size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - } - std::cout << "\n"; -} - -void RModel::PrintOutputTensors() { - std::cout << "Model specify the following output tensors:\n"; - for (auto& it: fOutputTensorNames) { - std::cout << "Tensor name: \"" << it << "\"\t"; - if 
(!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; - } - std::cout << "\n"; -} - -void RModel::HeadInitializedTensors(std::string name, int n_print) { - auto it = fInitializedTensors.find(name); - if (it == fInitializedTensors.end()) { - std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl; - return; - } - - std::cout << "Tensor name: " << it->first << "\t"; - std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t"; - int length =1; - std::cout << "shape: ["; - for (size_t i = 0; i < it->second.shape().size(); i++) { - std::cout << it->second.shape()[i]; - length *= it->second.shape()[i]; - if (i < it->second.shape().size() - 1) std::cout << ","; - } - std::cout << "]" << std::endl; - bool ellipsis = true; - if (n_print > length) { - n_print = length; - ellipsis = false; - } - - std::cout << "data: [" << std::endl; - if (it->second.type() == ETensorType::FLOAT) { - auto converted_data = it->second.data(); - for (int i =0; i < n_print; i++) { - std::cout << converted_data[i]; - if (i < n_print - 1) std::cout << " ,"; - } - } - if (ellipsis) std::cout << ", ..."; - std::cout << "]" << std::endl; - -} - -void RModel::OutputGenerated(std::string filename, bool append) { - - RModel_Base::OutputGenerated(filename, append); - - // write weights in a text file - if (fUseWeightFile) { - if (!filename.empty()) { - size_t pos = filename.find(".hxx"); - if (fWeightFile == WeightFileType::Text) - filename.replace(pos, 4, ".dat"); - if (fWeightFile == WeightFileType::RootBinary) { - filename = filename.erase(pos, 4); - filename += ".root"; - } - } else { - filename = fName; - filename += fWeightFile == WeightFileType::Text ? 
".dat" : ".root"; - } - WriteInitializedTensorsToFile(filename); - } -} - -void RModel::Streamer(TBuffer &R__b) { - if (R__b.IsReading()) { - RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); - } - } - else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); - } - RModel::Class()->WriteBuffer(R__b, this); - } -} - -}//SOFIE diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt deleted file mode 100644 index 34bb49f..0000000 --- a/src/SOFIE_core/test/CMakeLists.txt +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. - -############################################################################ -# CMakeLists.txt file for building TMVA SOFIE tests. -# @author Federico Sossai, Sanjiban Sengupta -############################################################################ - -include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc) -include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc) - -if (NOT ONNX_MODELS_DIR) - set(ONNX_MODELS_DIR input_models) -endif() - -# Finding .onnx files to be parsed and creating the appropriate code to -# parse all file. 
It is much faster to combine all parsing in a single executable -# which will avoid initialization time (especially when using ROOT) -set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");") -set(ALL_CAPTURES "") -# Finding .onnx files to be parsed and creating the appropriate command -file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") -foreach(onnx_file ${ONNX_FILES}) - get_filename_component(fname ${onnx_file} NAME_WE) - get_filename_component(fdir ${onnx_file} DIRECTORY) - string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR}) - string(REPLACE "@2" ${fname} cap ${cap}) - list(APPEND ALL_CAPTURES ${cap}) -endforeach() -string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}") -configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) -configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY) - -ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx - LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) - -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) - -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - sofie-test-models-onnx-build - ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() - -# For testing serialisation of RModel object - -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root -) - -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build - ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() - -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} - ) - endif() -endif() - -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) - -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) diff --git 
a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 0000000..12f19b1 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,191 @@ +cmake_minimum_required(VERSION 3.14) +include(FetchContent) + +############################################################################ +# Basic setup +############################################################################ +include_directories(${CMAKE_SOURCE_DIR}/core/inc) +include_directories(${CMAKE_SOURCE_DIR}/parsers/inc) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if (NOT ONNX_MODELS_DIR) + set(ONNX_MODELS_DIR input_models) +endif() + +option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF) + +set(ALPAKA_BACKEND "cuda" + CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)") +set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl) + +############################################################################ +# Generate emitter sources +############################################################################ +set(CAPTURE_STR +"try {\n\ + EmitModel(\"@1\", \"@2\");\n\ +} catch (const std::exception& e) {\n\ + std::string msg = e.what();\n\ + if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\ + } else if (msg.find(\"is of a data type which is not yet supported\") != std::string::npos) {\n\ + std::cerr << \"[SKIP] Operator with unsupported data type in @1: \" << msg << std::endl;\n\ + } else {\n\ + std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\ + failures++;\n\ + }\n\ +} catch (...) 
{\n\ + std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\ + failures++;\n\ +}\n\ +") + +file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx") + +set(ALL_CAPTURES "") +foreach(onnx_file ${ONNX_FILES}) + get_filename_component(fname ${onnx_file} NAME_WE) + string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}") + string(REPLACE "@2" "${fname}" cap "${cap}") + string(APPEND ALL_CAPTURES "${cap}") +endforeach() + +set(EMIT_CAPTURES "${ALL_CAPTURES}") + +configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY) +configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY) + +############################################################################ +# Alpaka tests +############################################################################ +if (ENABLE_ALPAKA_TESTS) + + string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend) + if (NOT _alpaka_backend IN_LIST ALPAKA_BACKEND) + message(FATAL_ERROR "Unsupported ALPAKA_BACKEND=${ALPAKA_BACKEND}") + endif() + + FetchContent_Declare( + sofieBLAS + GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS + GIT_TAG dev + ) + FetchContent_MakeAvailable(sofieBLAS) + + FetchContent_Declare( + alpaka + GIT_REPOSITORY https://github.com/alpaka-group/alpaka + GIT_TAG 2fa91a34ed11b2076e474c5507d920e85cf9b79d + ) + FetchContent_MakeAvailable(alpaka) + + ########################################################################## + # Alpaka emitter + ########################################################################## + ROOTTEST_GENERATE_EXECUTABLE( + emitFromONNXAlpaka + EmitFromONNX_GPU_ALPAKA_all.cxx + LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build + ) + + target_compile_options(emitFromONNXAlpaka PRIVATE + -Wno-unused-parameter + -Wno-array-bounds + ) + + ROOTTEST_ADD_TEST( + SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED 
sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka + ) + + ########################################################################## + # CUDA backend + ########################################################################## + if (_alpaka_backend STREQUAL "cuda") + + message(STATUS "Enabling Alpaka CUDA tests") + + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + + ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build + ) + + target_include_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${sofieblas_SOURCE_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES 70 80 86 + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_HAS_STD_ATOMIC_REF + ) + + target_compile_options( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + $<$: + --extended-lambda + --expt-relaxed-constexpr + --generate-line-info + --use_fast_math + -g + -G + # -fsanitize=address + -O1 + -Wno-deprecated-gpu-targets + > + $<$: + -O2 + -g + -G + -fPIC + -pthread + > + ) + # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address") + + # ROOT-compatible: plain signature only + target_link_libraries( + TestCustomModelsFromONNXForAlpakaCuda + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + ) + + ROOTTEST_ADD_TEST( + TestCustomModelsFromONNXForAlpakaCuda + EXEC 
./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + ) + + endif() # cuda backend +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/Conv1dModelGenerator.py b/test/Conv1dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv1dModelGenerator.py rename to test/Conv1dModelGenerator.py diff --git a/src/SOFIE_core/test/Conv2dModelGenerator.py b/test/Conv2dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv2dModelGenerator.py rename to test/Conv2dModelGenerator.py diff --git a/src/SOFIE_core/test/Conv3dModelGenerator.py b/test/Conv3dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/Conv3dModelGenerator.py rename to test/Conv3dModelGenerator.py diff --git a/src/SOFIE_core/test/ConvTrans2dModelGenerator.py b/test/ConvTrans2dModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/ConvTrans2dModelGenerator.py rename to test/ConvTrans2dModelGenerator.py diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/test/EmitFromONNX.cxx.in similarity index 77% rename from src/SOFIE_core/test/EmitFromONNX.cxx.in rename to test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/test/EmitFromRoot.cxx.in similarity index 83% rename from src/SOFIE_core/test/EmitFromRoot.cxx.in rename to test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/test/EmitFromRoot.cxx.in @@ -43,6 +43,15 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; + + @EMIT_CAPTURES@; + + std::cout << "[SUMMARY] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; } diff --git a/src/SOFIE_core/test/GNN/EmitGNN.cxx b/test/GNN/EmitGNN.cxx similarity index 100% rename from src/SOFIE_core/test/GNN/EmitGNN.cxx rename to test/GNN/EmitGNN.cxx diff --git a/src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx b/test/GNN/EmitGraphIndependent.cxx similarity index 100% rename from src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx rename to test/GNN/EmitGraphIndependent.cxx diff --git a/src/SOFIE_core/test/LinearModelGenerator.py b/test/LinearModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/LinearModelGenerator.py rename to test/LinearModelGenerator.py diff --git a/src/SOFIE_core/test/RecurrentModelGenerator.py b/test/RecurrentModelGenerator.py similarity index 100% rename from src/SOFIE_core/test/RecurrentModelGenerator.py rename to test/RecurrentModelGenerator.py diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/test/TestCustomModelsFromONNX.cxx similarity index 99% rename from src/SOFIE_core/test/TestCustomModelsFromONNX.cxx rename to test/TestCustomModelsFromONNX.cxx index d02dc5e..902cbcc 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = 
s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); @@ -2849,6 +2849,7 @@ TEST(ONNX, Slice_Neg) { } } + TEST(ONNX, RangeFloat) { constexpr float TOLERANCE = DEFAULT_TOLERANCE; diff --git a/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx new file mode 100644 index 0000000..fccacbe --- /dev/null +++ b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -0,0 +1,3163 @@ +#include +#include + +// ── Trilu ────────────────────────────────────────────────────────────────── +#include "Trilu_upper_FromONNX_GPU_ALPAKA.hxx" +#include "Trilu_lower_FromONNX_GPU_ALPAKA.hxx" +#include "Trilu_k2_FromONNX_GPU_ALPAKA.hxx" +#include "Trilu_kn1_FromONNX_GPU_ALPAKA.hxx" +#include "Trilu_3D_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Trilu_upper.ref.hxx" +#include "input_models/references/Trilu_upper_input.ref.hxx" +#include 
"input_models/references/Trilu_lower.ref.hxx" +#include "input_models/references/Trilu_lower_input.ref.hxx" +#include "input_models/references/Trilu_k2.ref.hxx" +#include "input_models/references/Trilu_k2_input.ref.hxx" +#include "input_models/references/Trilu_kn1.ref.hxx" +#include "input_models/references/Trilu_kn1_input.ref.hxx" +#include "input_models/references/Trilu_3D.ref.hxx" +#include "input_models/references/Trilu_3D_input.ref.hxx" +// ── Logic ─────────────────────────────────────────────────────────────────── +#include "Logic_And_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_Or_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_Xor_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_BitwiseAnd_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_BitwiseOr_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_BitwiseXor_FromONNX_GPU_ALPAKA.hxx" +#include "Logic_BitwiseNot_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Logic_And.ref.hxx" +#include "input_models/references/Logic_And_input.ref.hxx" +#include "input_models/references/Logic_Or.ref.hxx" +#include "input_models/references/Logic_Or_input.ref.hxx" +#include "input_models/references/Logic_Xor.ref.hxx" +#include "input_models/references/Logic_Xor_input.ref.hxx" +#include "input_models/references/Logic_BitwiseAnd.ref.hxx" +#include "input_models/references/Logic_BitwiseAnd_input.ref.hxx" +#include "input_models/references/Logic_BitwiseOr.ref.hxx" +#include "input_models/references/Logic_BitwiseOr_input.ref.hxx" +#include "input_models/references/Logic_BitwiseXor.ref.hxx" +#include "input_models/references/Logic_BitwiseXor_input.ref.hxx" +#include "input_models/references/Logic_BitwiseNot.ref.hxx" +#include "input_models/references/Logic_BitwiseNot_input.ref.hxx" +// ───────────────────────────────────────────────────────────────────────── + +#include "Linear_64_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Linear_64.ref.hxx" + +#include "AddBroadcast1_FromONNX_GPU_ALPAKA.hxx" +#include 
"input_models/references/AddBroadcast1.ref.hxx" + +#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithLeakyRelu.ref.hxx" + +#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithSigmoid.ref.hxx" + +#include "Transpose_FromONNX_GPU_ALPAKA.hxx" + +#include "Concat_0D_FromONNX_GPU_ALPAKA.hxx" +#include "ScatterElements_FromONNX_GPU_ALPAKA.hxx" + +#include "Split_0_FromONNX_GPU_ALPAKA.hxx" +#include "Split_1_FromONNX_GPU_ALPAKA.hxx" +#include "Split_2_FromONNX_GPU_ALPAKA.hxx" + +#include "Tile5D_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Tile5D.ref.hxx" + +#include "GatherAxis0_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis3_FromONNX_GPU_ALPAKA.hxx" +#include "Gather2d_FromONNX_GPU_ALPAKA.hxx" +#include "GatherNegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/GatherAxis0.ref.hxx" +#include "input_models/references/GatherAxis1.ref.hxx" +#include "input_models/references/GatherAxis2.ref.hxx" +#include "input_models/references/GatherAxis3.ref.hxx" +#include "input_models/references/Gather2d.ref.hxx" +#include "input_models/references/GatherNegativeIndices.ref.hxx" + +#include "ExpandSameSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandSameSize.ref.hxx" + +#include "ExpandDiffSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandDiffSize.ref.hxx" + +#include "GatherND_Ex1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex3_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex4_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex5_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx" + +#include "Equal_FromONNX_GPU_ALPAKA.hxx" +#include "LessOrEqual_FromONNX_GPU_ALPAKA.hxx" +#include 
"GreaterOrEqual_FromONNX_GPU_ALPAKA.hxx" +#include "Greater_FromONNX_GPU_ALPAKA.hxx" +#include "Less_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Equal.ref.hxx" +#include "input_models/references/LessOrEqual.ref.hxx" +#include "input_models/references/GreaterOrEqual.ref.hxx" +#include "input_models/references/Greater.ref.hxx" +#include "input_models/references/Less.ref.hxx" + +#include "Slice_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Default_Axis_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Default_Steps_FromONNX_GPU_ALPAKA.hxx" +#include "Slice_Neg_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Slice.ref.hxx" +#include "input_models/references/Slice_Default_Axis.ref.hxx" +#include "input_models/references/Slice_Default_Steps.ref.hxx" +#include "input_models/references/Slice_Neg.ref.hxx" + +#include "Sin_FromONNX_GPU_ALPAKA.hxx" +#include "Cos_FromONNX_GPU_ALPAKA.hxx" +#include "Abs_FromONNX_GPU_ALPAKA.hxx" +#include "Sqrt_FromONNX_GPU_ALPAKA.hxx" +#include "Reciprocal_FromONNX_GPU_ALPAKA.hxx" +#include "Exp_FromONNX_GPU_ALPAKA.hxx" +#include "Log_FromONNX_GPU_ALPAKA.hxx" +#include "Neg_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Sqrt.ref.hxx" +#include "input_models/references/Reciprocal.ref.hxx" +#include "input_models/references/Exp.ref.hxx" +#include "input_models/references/Log.ref.hxx" +#include "input_models/references/Neg.ref.hxx" + +#include "Where_FromONNX_GPU_ALPAKA.hxx" + +#include "Softplus_FromONNX_GPU_ALPAKA.hxx" + +#include "ReduceMean_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceProd_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceSum_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceSumSquare_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceL2_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceL2Large_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceMax_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceMax_axis0_FromONNX_GPU_ALPAKA.hxx" +#include "ReduceMax_mid_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ReduceMean.ref.hxx" +#include 
"input_models/references/ReduceProd.ref.hxx" +#include "input_models/references/ReduceL2.ref.hxx" +#include "input_models/references/ReduceMax.ref.hxx" +#include "input_models/references/ReduceMax_axis0.ref.hxx" +#include "input_models/references/ReduceMax_mid.ref.hxx" + +#include "ConvWithPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithPadding.ref.hxx" + +#include "ConvWithoutPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithoutPadding.ref.hxx" + +#include "ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithAutopadSameLower.ref.hxx" + +#include "ConvWithStridesPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithStridesPadding.ref.hxx" + +#include "ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithStridesNoPadding.ref.hxx" + +#include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ConvWithAsymmetricPadding.ref.hxx" + +#include "BatchNorm_FromONNX_GPU_ALPAKA.hxx" +#include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx" + +#include "LayerNorm_FromONNX_GPU_ALPAKA.hxx" +#include "LayerNormScaleBias_FromONNX_GPU_ALPAKA.hxx" +#include "LayerNorm3D_FromONNX_GPU_ALPAKA.hxx" + +#include "IsInf_FromONNX_GPU_ALPAKA.hxx" +#include "IsNaN_FromONNX_GPU_ALPAKA.hxx" +#include "Clip_FromONNX_GPU_ALPAKA.hxx" +#include "Not_FromONNX_GPU_ALPAKA.hxx" + +#include "GNN_model_FromONNX_GPU_ALPAKA.hxx" + +#include +#include +#include +#include "gtest/gtest.h" + +constexpr float DEFAULT_TOLERANCE = 1e-3f; + +using Idx = std::size_t; +using Dim = alpaka::DimInt<1>; +using Ext1D = alpaka::Vec; + +class SofieAlpakaTest : public ::testing::Test { +protected: + // Shared devices and platforms + alpaka::PlatformCpu hostPlatform; + alpaka::DevCpu host; + alpaka::PlatformCudaRt platform; + alpaka::DevCudaRt device; + alpaka::Queue queue; + + SofieAlpakaTest() + : hostPlatform{} + , 
host(alpaka::getDevByIdx(hostPlatform, 0u)) + , platform{} + , device(alpaka::getDevByIdx(platform, 0u)) + , queue(device) + { + } + + void SetUp() override { + cudaDeviceSynchronize(); + } + + void TearDown() override { + alpaka::wait(queue); + cudaDeviceSynchronize(); + } + + ~SofieAlpakaTest() override { + cudaDeviceSynchronize(); + } +}; + + +TEST_F(SofieAlpakaTest, Linear64) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{6400})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 6400; ++i) { + A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{6400})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{640})); + + { + SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = Linear_64_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 640; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithLeakyRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, + -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, + 0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793 + }); + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < input.size(); ++i) { + A_ptr[i] = input[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithLeakyRelu::Session session; + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs; + + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithSigmoid) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{48})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 48; ++i) { + A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{48})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithSigmoid_ExpectedOutput::all_ones; + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, AddBroadcast1) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{5})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + auto B = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + float *B_ptr = reinterpret_cast(alpaka::getPtrNative(B)); + + std::vector A_vec({-0.78023305, -1.34029483, -3.01482951, 0.53641361, + -1.22594789}); + std::vector B_vec({1.0626695, 0.43842875, 1.22476468, 0.79763274, 0.98688211, + 0.25267614, 0.44874883, 0.31516773, -0.78771195, 0.64565664, + 0.50450593, 
-0.41265227, -0.22474539, -0.22362374, 0.00509674, + 0.16927211, 1.06756969, -0.81634773, 0.88467744, 0.78902059}); + + for (Idx i = 0; i < A_vec.size(); ++i) { + A_ptr[i] = A_vec[i]; + } + + for (Idx i = 0; i < B_vec.size(); ++i) { + B_ptr[i] = B_vec[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{5})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto B_d = alpaka::allocBuf(device, Ext1D::all(Idx{20})); + alpaka::memcpy(queue, B_d, B); + alpaka::wait(queue); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + + { + SOFIE_AddBroadcast1::Session session; + auto result = session.infer(A_d, B_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = AddBroadcast1_ExpectedOutput::output; + for (size_t i = 0; i < 20; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Transpose) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Input shape: (2, 1, 3, 4) -> 24 elements + constexpr Idx inputSize = 24; + // Output shape: (2, 3, 4, 1) -> 24 elements + constexpr Idx outputSize = 24; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + std::vector input_vec({ + // shape (2, 1, 3, 4) + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }); + + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_vec[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Transpose::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + 
cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + std::vector expected(outputSize); + std::vector inputShape = {2, 1, 3, 4}; + std::vector perm = {0, 2, 3, 1}; + std::vector outputShape = {2, 3, 4, 1}; + + std::vector inputStrides = {12, 12, 4, 1}; + std::vector outputStrides = {12, 4, 1, 1}; + + for (size_t i = 0; i < outputSize; ++i) + { + size_t remaining = i; + size_t inputIdx = 0; + for (size_t d = 0; d < 4; ++d) + { + size_t const coord = remaining / outputStrides[d]; + remaining = remaining - coord * outputStrides[d]; + inputIdx += coord * inputStrides[perm[d]]; + } + expected[i] = input_vec[inputIdx]; + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Concat0D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.40519865e+00, -2.87660856e-01}); + std::vector expected_output({ + 1.40519865e+00, -2.87660856e-01, + 1.40519865e+00, -2.87660856e-01 + }); + + // Host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + // Device input buffer + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host output buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected_output.size()})); + + { + SOFIE_Concat_0D::Session session("Concat_0D_FromONNX_GPU_ALPAKA.dat"); + + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + for (size_t i = 0; i < expected_output.size(); ++i) { + 
EXPECT_LE(std::abs(res_ptr[i] - expected_output[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, ScatterElements) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input (9, 0.f); + std::vector indices = { 1, 0, 2, 0, 2, 1 }; + std::vector updates = { 1.f, 1.1f, 1.2f, 2.f, 2.1f, 2.2f }; + std::vector correct = { 2.f, 1.1f, 0.f, 1.f, 0.f, 2.2f, 0.f, 2.1f, 1.2f }; + + // Allocate and fill host buffers + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + auto indices_h = alpaka::allocBuf(host, Ext1D::all(Idx{indices.size()})); + auto updates_h = alpaka::allocBuf(host, Ext1D::all(Idx{updates.size()})); + + float* input_ptr = reinterpret_cast (alpaka::getPtrNative(input_h)); + int64_t* indices_ptr = reinterpret_cast(alpaka::getPtrNative(indices_h)); + float* updates_ptr = reinterpret_cast (alpaka::getPtrNative(updates_h)); + + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + for (Idx i = 0; i < indices.size(); ++i) indices_ptr[i] = indices[i]; + for (Idx i = 0; i < updates.size(); ++i) updates_ptr[i] = updates[i]; + + // Allocate device buffers and copy + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + auto indices_d = alpaka::allocBuf(device, Ext1D::all(Idx{indices.size()})); + auto updates_d = alpaka::allocBuf(device, Ext1D::all(Idx{updates.size()})); + + alpaka::memcpy(queue, input_d, input_h); + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, updates_d, updates_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ScatterElements::Session session; + auto result = session.infer(input_d, indices_d, updates_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 9u); + for (size_t i = 0; i < correct.size(); ++i){ 
+ EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Split_0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 0 in 2 tensors {2,2,3} -> {1,2,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,4.,5.,6.}, {7.,8.,9.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_0::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 1 in 2 tensors {2,2,3} -> {2,1,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,7.,8.,9.}, {4.,5.,6.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i 
< input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_1::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 2 in 2 tensors {2,2,3} -> {2,2,2} and {2,2,1} + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,4.,5.,7.,8.,10.,11.}, {3.,6.,9.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // outputs have different sizes: {2,2,2}=8 and {2,2,1}=4 + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_2::Session session; + auto [result0, result1] = session.infer(input_d); 
+ alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Tile5D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input_data({ + 0.2386120855808258, 0.5549510717391968, -1.8190287351608276, 0.5724563598632812, -0.6596977710723877, + 0.17560836672782898, 0.7608169317245483, 0.08603227883577347, -0.049375515431165695, 0.2705111503601074, + 1.42119562625885, 0.032626643776893616, -1.212586522102356, -0.5129594802856445, -0.43296414613723755, + -0.1606937050819397, 1.1884371042251587, -0.662174642086029, -2.291109323501587, -0.6852569580078125, + 2.325223922729492, -0.19389064610004425, -0.5784135460853577, -0.39328137040138245, 0.2831517457962036, + 0.4496127665042877, -0.2029038816690445, 0.35477763414382935, 0.4266718924045563, 0.24683749675750732, + 1.90426504611969, -0.4861580729484558, 0.9139055013656616, -0.5031066536903381, 0.9583520293235779, + -0.23210509121418, 1.3183971643447876, 1.7042455673217773, -0.3201166093349457, -0.14444805681705475, + -0.8829464912414551, 1.725736141204834, 0.45657631754875183, 0.4920198321342468, -1.088847041130066, + 0.49437597393989563, -0.006085286382585764, 2.475630760192871, 0.12170185893774033, -0.8953945636749268, + 1.1430096626281738, 1.3278610706329346, 0.3076854348182678, 0.036237504333257675, 0.05180325731635094, + 0.2802475392818451, 0.5289335250854492, 0.9356630444526672, 0.7863689064979553, 0.4239695370197296, + 0.8723016977310181, -0.2248474359512329, 
0.3891502320766449, 0.5463842153549194, -0.7782878875732422, + -0.8570080399513245, -2.593783378601074, -0.11392943561077118, 0.5637082457542419, 2.075004816055298, + -1.0598397254943848, 1.0823975801467896 + }); + + const std::size_t inputSize = input_data.size(); + const std::size_t outputSize = sizeof(Tile5D_ExpectedOutput::output) / sizeof(float); + + // Allocate and fill host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_data[i]; + + // Copy to device + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Tile5D::Session session; + auto result = session.infer(input_d); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Tile5D_ExpectedOutput::output; + + EXPECT_EQ(outputSize, sizeof(Tile5D_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + 
SOFIE_GatherAxis0::Session session("GatherAxis0_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis0_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis1::Session session("GatherAxis1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis1_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis2::Session session("GatherAxis2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis2_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis3::Session session("GatherAxis3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis3_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis3_ExpectedOutput::output) / 
sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Gather2d) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 9; + const std::size_t outputSize = sizeof(Gather2d_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Gather2d::Session session("Gather2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Gather2d_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Gather2d_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherNegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 10; + const std::size_t outputSize = sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherNegativeIndices::Session 
session("GatherNegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherNegativeIndices_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandSameSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandSameSize::Session session("ExpandSameSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandSameSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandDiffSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandDiffSize_ExpectedOutput::output) 
/ sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandDiffSize::Session session("ExpandDiffSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandDiffSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherND_Ex1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {0.f, 3.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex1::Session session("GatherND_Ex1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 2u); + for (size_t i = 0; i < 
expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {2.f, 3.f, 0.f, 1.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex2::Session session("GatherND_Ex2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex3::Session session("GatherND_Ex3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, 
result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex4) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex4::Session session("GatherND_Ex4_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex5) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + 
SOFIE_GatherND_Ex5::Session session("GatherND_Ex5_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_NegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + std::vector expected = {6.f, 2.f, 4.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_NegativeIndices::Session session("GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 3u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Batch) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data(24); + std::iota(data.begin(), data.end(), 0.f); + std::vector expected = {4.f,5.f,6.f,7.f, 20.f,21.f,22.f,23.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) 
input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Batch::Session session("GatherND_Batch_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 8u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Equal) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + // Output is bool — allocate as bool buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Equal::Session session("Equal_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = Equal_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, LessOrEqual) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LessOrEqual::Session session("LessOrEqual_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = LessOrEqual_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GreaterOrEqual) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = 
sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GreaterOrEqual::Session session("GreaterOrEqual_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = GreaterOrEqual_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Greater) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < 
input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Greater::Session session("Greater_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = Greater_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Less) +{ + std::vector input1 = {1.0f, 2.0f, 3.0f}; + std::vector input2 = {4.0f, 2.0f, 6.0f}; + const std::size_t outputSize = sizeof(Less_ExpectedOutput::outputs) / sizeof(bool); + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + alpaka::memcpy(queue, input2_d, input2_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Less::Session session("Less_FromONNX_GPU_ALPAKA.dat"); + auto result = 
session.infer(input1_d, input2_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + bool* correct = Less_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice::input; + const std::size_t outputSize = sizeof(Slice::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice::Session session("Slice_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice::output; + EXPECT_EQ(outputSize, sizeof(Slice::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Default_Axis) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Default_Axis::input; + const std::size_t outputSize = sizeof(Slice_Default_Axis::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < 
input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Default_Axis::Session session("Slice_Default_Axis_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Default_Axis::output; + EXPECT_EQ(outputSize, sizeof(Slice_Default_Axis::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Default_Steps) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Default_Steps::input; + const std::size_t outputSize = sizeof(Slice_Default_Steps::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Default_Steps::Session session("Slice_Default_Steps_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Default_Steps::output; + EXPECT_EQ(outputSize, sizeof(Slice_Default_Steps::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + 
EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Slice_Neg) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = Slice_Neg::input; + const std::size_t outputSize = sizeof(Slice_Neg::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Slice_Neg::Session session("Slice_Neg_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Slice_Neg::output; + EXPECT_EQ(outputSize, sizeof(Slice_Neg::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Sin) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + -0.786738f, -0.197796f, -0.187787f, 0.142758f, + 0.876096f, -0.653239f, 0.145444f, -1.107658f, + 2.259171f, -0.947054f, -0.506689f, 1.801250f + }); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Sin::Session session; + auto result = session.infer(input_d); 
+ alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 12u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::sin(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Cos) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 1.152504f, -1.459324f, 0.691594f, 0.347690f, + -1.307323f, 1.832516f, -1.261772f, 0.014224f, + 1.311477f, 1.147405f, -0.567206f, -0.530606f + }); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Cos::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 12u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::cos(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Abs) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.f, -2.f, -3.f, 4.f, -5.f, 6.f}); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{input.size()})); + + { + SOFIE_Abs::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(input.size(), 6u); + for (size_t i = 0; i < input.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - std::abs(input[i])), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Sqrt) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.8344f, 0.4716f, 0.6226f, 0.8448f, 0.2483f, 0.9467f}); + const std::size_t outputSize = sizeof(Sqrt_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Sqrt::Session session("Sqrt_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Sqrt_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Sqrt_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Reciprocal) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.2691f, -1.2160f, 0.6393f, -0.4438f, 0.8065f, 0.2011f}); + const std::size_t outputSize = sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Reciprocal::Session session("Reciprocal_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Reciprocal_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Exp) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 1.46566453f, 0.63334515f, 2.4048165f, 0.54468453f, + -1.41271672f, -0.18609187f, 0.2754482f, 1.10615209f, + 0.88474389f, 0.47531232f + }); + const std::size_t outputSize = sizeof(Exp_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Exp::Session session("Exp_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Exp_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Exp_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Log) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.f, 2.f, 3.f, 4.f}); + const std::size_t outputSize = sizeof(Log_ExpectedOutput::outputs) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Log::Session session("Log_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Log_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Log_ExpectedOutput::outputs) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Neg) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + -1.9100f, 1.8811f, -1.7269f, -0.1094f, + -0.0145f, 0.2509f, 0.5893f, -2.2733f, + -0.7077f, 1.0645f, -0.8607f, 0.2085f + }); + const std::size_t outputSize = sizeof(Neg_ExpectedOutput::outputs) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = 
input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Neg::Session session("Neg_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Neg_ExpectedOutput::outputs; + EXPECT_EQ(outputSize, sizeof(Neg_ExpectedOutput::outputs) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Softplus) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.1,-0.2,0.3,-0.4,0.5,1.}); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + + { + SOFIE_Softplus::Session session("Softplus_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < input.size(); ++i){ + double exp_value = std::log(std::exp(input[i])+1); + EXPECT_LE(std::abs(res_ptr[i] - exp_value), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Where) +{ + std::vector input1 = {1.f, 2.f}; + std::vector input2 = {3.f, 4.f, 5.f, 6.f}; + std::vector cond_vec = {true, false, true}; + std::vector correct = {1.f, 
2.f, 5.f, 6.f, 1.f, 2.f}; + + auto input1_h = alpaka::allocBuf(host, Ext1D::all(Idx{input1.size()})); + float* in1_ptr = reinterpret_cast(alpaka::getPtrNative(input1_h)); + for (Idx i = 0; i < input1.size(); ++i) in1_ptr[i] = input1[i]; + + auto input1_d = alpaka::allocBuf(device, Ext1D::all(Idx{input1.size()})); + alpaka::memcpy(queue, input1_d, input1_h); + + auto input2_h = alpaka::allocBuf(host, Ext1D::all(Idx{input2.size()})); + float* in2_ptr = reinterpret_cast(alpaka::getPtrNative(input2_h)); + for (Idx i = 0; i < input2.size(); ++i) in2_ptr[i] = input2[i]; + + auto input2_d = alpaka::allocBuf(device, Ext1D::all(Idx{input2.size()})); + alpaka::memcpy(queue, input2_d, input2_h); + + auto cond_h = alpaka::allocBuf(host, Ext1D::all(Idx{cond_vec.size()})); + uint8_t* cond_ptr = reinterpret_cast(alpaka::getPtrNative(cond_h)); + for (Idx i = 0; i < cond_vec.size(); ++i) cond_ptr[i] = cond_vec[i]; + + auto cond_d = alpaka::allocBuf(device, Ext1D::all(Idx{cond_vec.size()})); + alpaka::memcpy(queue, cond_d, cond_h); + + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_Where::Session session("Where_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input1_d, input2_d, cond_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 6u); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceMean) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceMean_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < 
input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceMean::Session session("ReduceMean_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceMean_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ReduceMean_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceProd) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceProd_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceProd::Session session("ReduceProd_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceProd_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ReduceProd_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + 
EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceSum) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + std::vector correct = {24.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ReduceSum::Session session("ReduceSum_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 1u); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, ReduceSumSquare) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + std::vector correct = {38.f, 66.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ReduceSumSquare::Session session("ReduceSumSquare_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + 
alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < correct.size(); ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +// ReduceL2: input [1,2,3]={5,2,3,5,5,4}, reduce axis=1, keepdims=0 → [1,3] +// Expected: {sqrt(50), sqrt(29), 5.0} +TEST_F(SofieAlpakaTest, ReduceL2) +{ + constexpr float TOLERANCE = 1e-3f; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceL2_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceL2::Session session("ReduceL2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceL2_ExpectedOutput::output; + EXPECT_EQ(outputSize, 3u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; +} + +// ReduceL2Large: input [4,512], reduce axis=1, keepdims=0 → [4] +// Row i is filled with (i+1), so L2 norm = (i+1)*sqrt(512). +// This test exercises the 256-thread block reduction with reducedLength > BLOCK_SIZE. 
+TEST_F(SofieAlpakaTest, ReduceL2Large) +{ + constexpr float TOLERANCE = 1e-2f; // slightly looser: large sum, float accumulation + + constexpr std::size_t nrows = 4; + constexpr std::size_t ncols = 512; + const std::size_t inputSize = nrows * ncols; + const std::size_t outputSize = nrows; + + // Fill row i with value (i+1) + std::vector input(inputSize); + for (std::size_t r = 0; r < nrows; ++r) + for (std::size_t c = 0; c < ncols; ++c) + input[r * ncols + c] = static_cast(r + 1); + + // Expected L2 per row: sqrt(ncols) * (row+1) + const float sqrt512 = std::sqrt(static_cast(ncols)); + std::vector correct(nrows); + for (std::size_t r = 0; r < nrows; ++r) + correct[r] = static_cast(r + 1) * sqrt512; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ReduceL2Large::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(outputSize, nrows); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]) / correct[i], TOLERANCE) << "row=" << i; +} + +// ── ReduceMax: [1,2,3] axis=1 keepdims=0 (kLast path) ────────────────────── +TEST_F(SofieAlpakaTest, ReduceMax) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f}; + const std::size_t outputSize = sizeof(ReduceMax_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = 
reinterpret_cast(alpaka::getPtrNative(input_h)); + for (std::size_t i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + { + SOFIE_ReduceMax::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceMax_ExpectedOutput::output; + for (std::size_t i = 0; i < outputSize; ++i) + EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << " i=" << i; +} + +// ── ReduceMax_axis0: [3,4] axis=0 keepdims=0 (kFirst path) ───────────────── +TEST_F(SofieAlpakaTest, ReduceMax_axis0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // numpy default_rng(42).standard_normal((3,4)) — same seed/sequence as generator + const std::size_t inputSize = 12; + const std::size_t outputSize = sizeof(ReduceMax_axis0_ExpectedOutput::output) / sizeof(float); + float vals[] = { 0.30471709f, -1.03998411f, 0.75045121f, 0.94056469f, + -1.95103514f, -1.30217946f, 0.12784040f, -0.31624261f, + -0.01680116f, -0.85304391f, 0.87939799f, 0.77779192f}; + std::vector input(vals, vals + inputSize); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (std::size_t i = 0; i < inputSize; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + { + SOFIE_ReduceMax_axis0::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, 
result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceMax_axis0_ExpectedOutput::output; + for (std::size_t i = 0; i < outputSize; ++i) + EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << " i=" << i; +} + +// ── ReduceMax_mid: [2,3,4] axis=1 keepdims=0 (kMiddle path) ──────────────── +TEST_F(SofieAlpakaTest, ReduceMax_mid) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + const std::size_t inputSize = 24; // 2×3×4 + const std::size_t outputSize = sizeof(ReduceMax_mid_ExpectedOutput::output) / sizeof(float); + + // numpy default_rng(42).standard_normal((2,3,4)) — same seed/sequence as generator + float vals[] = { 0.06603070f, 1.12724125f, 0.46750933f, -0.85929245f, + 0.36875078f, -0.95888263f, 0.87845027f, -0.04992591f, + -0.18486236f, -0.68092954f, 1.22254133f, -0.15452948f, + -0.42832783f, -0.35213354f, 0.53230917f, 0.36544406f, + 0.41273260f, 0.43082100f, 2.14164758f, -0.40641502f, + -0.51224273f, -0.81377274f, 0.61597943f, 1.12897229f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (std::size_t i = 0; i < inputSize; ++i) input_ptr[i] = vals[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + { + SOFIE_ReduceMax_mid::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ReduceMax_mid_ExpectedOutput::output; + for (std::size_t i = 0; i < outputSize; ++i) + EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << " i=" << i; +} + +TEST_F(SofieAlpakaTest, ConvWithPadding) +{ + constexpr float TOLERANCE = 
DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithPadding::Session session("ConvWithPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithPadding_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 25; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithoutPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithoutPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithoutPadding::Session session("ConvWithoutPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + 
cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithoutPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_convNoPad = sizeof(ConvWithoutPadding_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_convNoPad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } + +} + + +TEST_F(SofieAlpakaTest, ConvWithAutopadSameLower) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(25); + std::iota(input.begin(), input.end(), 0.0f); + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithAutopadSameLower_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithAutopadSameLower::Session session("ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithAutopadSameLower_ExpectedOutput::all_ones; + + for (size_t i = 0; i < 9; ++i) { + std::cout << "res: " << res_ptr[i] << ", correct: " << correct[i] << std::endl; + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithStridesPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(35); + std::iota(input.begin(), 
input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithStridesPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithStridesPadding::Session session("ConvWithStridesPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = ConvWithStridesPadding_ExpectedOutput::all_ones; + constexpr size_t nOut_stridesPad = sizeof(ConvWithStridesPadding_ExpectedOutput::all_ones) / sizeof(float); + + for (size_t i = 0; i < nOut_stridesPad; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i; + } +} + + +TEST_F(SofieAlpakaTest, ConvWithStridesNoPadding) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Preparing the standard all-ones input + std::vector input(35); + std::iota(input.begin(), input.end(), 0.0f); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithStridesNoPadding_ExpectedOutput::all_ones) / sizeof(float)})); + + { + SOFIE_ConvWithStridesNoPadding::Session session("ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.dat"); + auto result = 
session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+
+    }
+
+    float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h));
+    float *correct = ConvWithStridesNoPadding_ExpectedOutput::all_ones;
+    constexpr size_t nOut_stridesNoPad = sizeof(ConvWithStridesNoPadding_ExpectedOutput::all_ones) / sizeof(float);
+
+    for (size_t i = 0; i < nOut_stridesNoPad; ++i) {
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+    }
+}
+
+
+// Runs the ConvWithAsymmetricPadding model on a 35-element ramp input and
+// compares every output element with the reference array.
+// NOTE(review): this comment used to read "Disables test (asymmetric padding
+// not supported)", but the test is registered with a plain TEST_F (no
+// DISABLED_ prefix) and therefore runs. Confirm whether asymmetric padding is
+// supported now or whether the test should really be disabled.
+TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding)
+{
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    // The input is the ramp 0, 1, ..., 34 written by std::iota (35 floats);
+    // it is not an all-ones tensor.
+    std::vector input(35);
+    std::iota(input.begin(), input.end(), 0.0f);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // Host buffer for the result, sized from the reference array.
+    auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{sizeof(ConvWithAsymmetricPadding_ExpectedOutput::all_ones) / sizeof(float)}));
+
+    {
+        // The session lives only inside this scope; the result is copied to
+        // the host buffer before the session is destroyed.
+        SOFIE_ConvWithAsymmetricPadding::Session session("ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+
+    }
+
+    float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h));
+    float *correct = ConvWithAsymmetricPadding_ExpectedOutput::all_ones;
+    // Element count comes from the reference array, matching the size used
+    // for result_h above.
+    constexpr size_t nOut_asymPad = sizeof(ConvWithAsymmetricPadding_ExpectedOutput::all_ones) / sizeof(float);
+
+    for (size_t i = 0; i < nOut_asymPad; ++i) {
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+    }
+}
+
+TEST_F(SofieAlpakaTest, BatchNormalization) 
+{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = { + 1.f, 2.f, 3.f, 4.f, // channel 0 + 5.f, 6.f, 7.f, 8.f // channel 1 + }; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_BatchNorm::Session session("BatchNorm_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.f + 1e-5f); + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - input[i] * inv_std), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, BatchNormalizationRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input = { + -1.f, 2.f, -3.f, 4.f, + 5.f, -6.f, 7.f, -8.f + }; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_BatchNormRelu::Session session("BatchNormRelu_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, 
result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.f + 1e-5f); + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) { + float expected = std::max(0.f, input[i] * inv_std); + EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, LayerNorm) +{ + constexpr float TOLERANCE = 1e-4f; + std::vector input = {1.f, 2.f, 3.f, 4.f, + 5.f, 6.f, 7.f, 8.f}; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNorm::Session session("LayerNorm_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + // Row 0: mean=2.5, std=sqrt(1.25+1e-5) ≈ 1.118034 + // Row 1: mean=6.5, std=sqrt(1.25+1e-5) ≈ 1.118034 + // Y[0] = (1-2.5)/1.118034 ≈ -1.3416 + // Y[1] = (2-2.5)/1.118034 ≈ -0.4472 + // Y[2] = (3-2.5)/1.118034 ≈ 0.4472 + // Y[3] = (4-2.5)/1.118034 ≈ 1.3416 + float inv_std = 1.f / std::sqrt(1.25f + 1e-5f); + std::vector expected = { + (1.f - 2.5f) * inv_std, (2.f - 2.5f) * inv_std, + (3.f - 2.5f) * inv_std, (4.f - 2.5f) * inv_std, + (5.f - 6.5f) * inv_std, (6.f - 6.5f) * inv_std, + (7.f - 6.5f) * inv_std, (8.f - 6.5f) * inv_std + }; + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i; +} + 
+TEST_F(SofieAlpakaTest, LayerNormScaleBias) +{ + constexpr float TOLERANCE = 1e-4f; + + std::vector input = {1.f, 2.f, 3.f, 4.f, + 5.f, 6.f, 7.f, 8.f}; + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNormScaleBias::Session session("LayerNormScaleBias_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + float inv_std = 1.f / std::sqrt(1.25f + 1e-5f); + std::vector expected = { + 2.f * (1.f - 2.5f) * inv_std + 1.f, 2.f * (2.f - 2.5f) * inv_std + 1.f, + 2.f * (3.f - 2.5f) * inv_std + 1.f, 2.f * (4.f - 2.5f) * inv_std + 1.f, + 2.f * (5.f - 6.5f) * inv_std + 1.f, 2.f * (6.f - 6.5f) * inv_std + 1.f, + 2.f * (7.f - 6.5f) * inv_std + 1.f, 2.f * (8.f - 6.5f) * inv_std + 1.f + }; + ASSERT_EQ(outputSize, 8u); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, LayerNorm3D) +{ + constexpr float TOLERANCE = 1e-4f; + + std::vector input(24); + std::iota(input.begin(), input.end(), 0.f); // 0..23 + const std::size_t outputSize = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, 
input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_LayerNorm3D::Session session("LayerNorm3D_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + auto compute_expected = [](std::vector row) { + float mean = 0.f; + for (float v : row) mean += v; + mean /= row.size(); + float var = 0.f; + for (float v : row) var += (v - mean) * (v - mean); + var /= row.size(); + float inv_std = 1.f / std::sqrt(var + 1e-5f); + std::vector out; + for (float v : row) out.push_back((v - mean) * inv_std); + return out; + }; + + std::vector row0(input.begin(), input.begin() + 12); + std::vector row1(input.begin() + 12, input.end()); + auto exp0 = compute_expected(row0); + auto exp1 = compute_expected(row1); + + ASSERT_EQ(outputSize, 24u); + for (size_t i = 0; i < 12; ++i) + EXPECT_LE(std::abs(res_ptr[i] - exp0[i]), TOLERANCE) << "row0 i=" << i; + for (size_t i = 0; i < 12; ++i) + EXPECT_LE(std::abs(res_ptr[12 + i] - exp1[i]), TOLERANCE) << "row1 i=" << i; +} + +TEST_F(SofieAlpakaTest, IsInf) +{ + // Input contains finite values, +inf, -inf; output is bool (uint8_t). 
+ float pos_inf = std::numeric_limits::infinity(); + float neg_inf = -std::numeric_limits::infinity(); + std::vector input = {1.0f, pos_inf, neg_inf, 0.0f, -1.0f, 2.0f, neg_inf, pos_inf}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_IsInf::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), std::isinf(input[i])) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, IsNaN) +{ + // Input contains finite values, +inf, and NaN; output is bool (uint8_t). 
+ float nan_val = std::numeric_limits::quiet_NaN(); + float pos_inf = std::numeric_limits::infinity(); + std::vector input = {1.0f, nan_val, 0.0f, pos_inf, nan_val, 2.0f, -1.0f, nan_val}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_IsNaN::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), std::isnan(input[i])) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, Clip) +{ + // Model clips to [-1.0, 1.0]. 
+ constexpr float TOLERANCE = DEFAULT_TOLERANCE; + constexpr float clip_min = -1.0f; + constexpr float clip_max = 1.0f; + + std::vector input = { + -2.0f, -1.5f, -1.0f, -0.5f, + 0.0f, 0.5f, 1.0f, 1.5f, + 2.0f, -0.3f, 0.7f, 1.2f + }; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_Clip::Session session("Clip_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 12u); + for (size_t i = 0; i < N; ++i) { + float expected = std::max(clip_min, std::min(clip_max, input[i])); + EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i; + } +} + +TEST_F(SofieAlpakaTest, Not) +{ + // Input and output are bool tensors (uint8_t on device). 
+ std::vector input = {1, 0, 1, 1, 0, 0, 1, 0}; + const std::size_t N = input.size(); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + uint8_t* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{N})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + + { + SOFIE_Not::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(N, 8u); + for (size_t i = 0; i < N; ++i) + EXPECT_EQ(static_cast(res_ptr[i]), !static_cast(input[i])) << "i=" << i; +} + +// GNN model: 3370 nodes (29 features each), 24126 edges (5 features each), +// edge_index shape [2, 24126]. Output: sigmoid score per edge in [0, 1]. 
+TEST_F(SofieAlpakaTest, GNN_model)
+{
+    // Smoke test: feeds constant inputs to the GNN model and checks only that
+    // every output value lies in [0, 1]; no reference values are compared.
+
+    // ---- sizes -------------------------------------------------------
+    constexpr Idx N_x   = 97730;   // 3370 nodes × 29 features
+    constexpr Idx N_ef  = 120630;  // 24126 edges × 5 features
+    constexpr Idx N_ei  = 48252;   // 2 rows × 24126 edges (int64)
+    constexpr Idx N_out = 24126;   // one sigmoid score per edge
+
+    // ---- host buffers -------------------------------------------------
+    auto x_h  = alpaka::allocBuf(host, Ext1D::all(Idx{N_x}));
+    auto ef_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_ef}));
+    auto ei_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_ei}));
+
+    // x and ef are filled through float pointers, ei through an int64_t pointer.
+    float*   x_ptr  = reinterpret_cast  (alpaka::getPtrNative(x_h));
+    float*   ef_ptr = reinterpret_cast  (alpaka::getPtrNative(ef_h));
+    int64_t* ei_ptr = reinterpret_cast(alpaka::getPtrNative(ei_h));
+
+    // Constant fill: every node/edge feature is 0.5 and every edge index is 0.
+    for (Idx i = 0; i < N_x;  ++i) x_ptr[i]  = 0.5f;
+    for (Idx i = 0; i < N_ef; ++i) ef_ptr[i] = 0.5f;
+    for (Idx i = 0; i < N_ei; ++i) ei_ptr[i] = 0;   // all self-loops on node 0
+
+    // ---- device buffers -----------------------------------------------
+    auto x_d  = alpaka::allocBuf(device, Ext1D::all(Idx{N_x}));
+    auto ef_d = alpaka::allocBuf(device, Ext1D::all(Idx{N_ef}));
+    auto ei_d = alpaka::allocBuf(device, Ext1D::all(Idx{N_ei}));
+
+    // Upload all three inputs, then wait once for the copies.
+    alpaka::memcpy(queue, x_d,  x_h);
+    alpaka::memcpy(queue, ef_d, ef_h);
+    alpaka::memcpy(queue, ei_d, ei_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N_out}));
+
+    {
+        SOFIE_GNN_model::Session session("GNN_model_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(x_d, ef_d, ei_d);
+        // NOTE(review): this waits on the session's own `queue` member, while
+        // the other tests in this file wait on the fixture `queue` after
+        // infer(). Confirm that the difference is intended.
+        alpaka::wait(session.queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h));
+    // N_out is the compile-time constant defined above, so this assertion
+    // restates the expected edge count rather than checking a runtime value.
+    ASSERT_EQ(N_out, 24126u);
+    // Range check only: each per-edge score must be within [0, 1].
+    for (Idx i = 0; i < N_out; ++i) {
+        EXPECT_GE(res_ptr[i], 0.0f) << "output[" << i << "] < 0";
+        EXPECT_LE(res_ptr[i], 1.0f) << "output[" << i << "] > 1";
+    }
+}
+
+// 
═══════════════════════════════════════════════════════════════════════════
+// Trilu operator tests
+// ═══════════════════════════════════════════════════════════════════════════
+
+// Helper: copy a host C-array into an Alpaka host buffer then to device.
+//
+// host, device, queue: the alpaka host device, CUDA device and queue used for
+//     the two allocations and the host-to-device copy.
+// src: pointer to the first of the n elements to upload.
+// n:   number of elements read from src and allocated in both buffers.
+// Returns the device buffer holding the copied elements. alpaka::wait(queue)
+// is called before returning, so the copy has been waited on (and the local
+// staging buffer is no longer needed) when the caller receives the result.
+template
+static alpaka::Buf
+makeDeviceBuf(alpaka::DevCpu const& host,
+              alpaka::DevCudaRt const& device,
+              alpaka::Queue& queue,
+              const T* src, std::size_t n)
+{
+    // Staging buffer on the host, filled element by element from src.
+    auto hbuf = alpaka::allocBuf(host, Ext1D::all(Idx{n}));
+    T* hp = reinterpret_cast(alpaka::getPtrNative(hbuf));
+    for (std::size_t i = 0; i < n; ++i) hp[i] = src[i];
+    auto dbuf = alpaka::allocBuf(device, Ext1D::all(Idx{n}));
+    alpaka::memcpy(queue, dbuf, hbuf);
+    alpaka::wait(queue);
+    return dbuf;
+}
+
+// ── Trilu_upper: 4×4, upper=1, k=0 ─────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_upper)
+{
+    constexpr std::size_t N = 16;  // 4×4
+    auto d_input = makeDeviceBuf(host, device, queue,
+                                 Trilu_upper_Input::data, N);
+
+    auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N}));
+    {
+        SOFIE_Trilu_upper::Session session;
+        auto result = session.infer(d_input);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    // Compare all N output elements with the reference array.
+    float* res = reinterpret_cast(alpaka::getPtrNative(result_h));
+    float* ref = Trilu_upper_ExpectedOutput::outputs;
+    for (std::size_t i = 0; i < N; ++i)
+        EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << " index=" << i;
+}
+
+// ── Trilu_lower: 4×4, upper=0, k=0 ─────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_lower)
+{
+    constexpr std::size_t N = 16;
+    auto d_input = makeDeviceBuf(host, device, queue,
+                                 Trilu_lower_Input::data, N);
+
+    auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N}));
+    {
+        SOFIE_Trilu_lower::Session session;
+        auto result = session.infer(d_input);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    float* res = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + float* ref = Trilu_lower_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << " index=" << i; +} + +// ── Trilu_k2: 3×5, upper=1, k=+2 ──────────────────────────────────────────── +TEST_F(SofieAlpakaTest, Trilu_k2) +{ + constexpr std::size_t N = 15; // 3×5 + auto d_input = makeDeviceBuf(host, device, queue, + Trilu_k2_Input::data, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Trilu_k2::Session session; + auto result = session.infer(d_input); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* ref = Trilu_k2_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << " index=" << i; +} + +// ── Trilu_kn1: 3×5, upper=0, k=-1 ──────────────────────────────────────────── +TEST_F(SofieAlpakaTest, Trilu_kn1) +{ + constexpr std::size_t N = 15; + auto d_input = makeDeviceBuf(host, device, queue, + Trilu_kn1_Input::data, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Trilu_kn1::Session session; + auto result = session.infer(d_input); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* ref = Trilu_kn1_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << " index=" << i; +} + +// ── Trilu_3D: 2×3×4, upper=1, k=0 (batched) ───────────────────────────────── +TEST_F(SofieAlpakaTest, Trilu_3D) +{ + constexpr std::size_t N = 24; // 2×3×4 + auto d_input = makeDeviceBuf(host, device, queue, + Trilu_3D_Input::data, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + 
SOFIE_Trilu_3D::Session session; + auto result = session.infer(d_input); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* ref = Trilu_3D_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << " index=" << i; +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Logic / Bitwise operator tests +// ═══════════════════════════════════════════════════════════════════════════ + +// ── Logic_And: 4×4 bool, And ──────────────────────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_And) +{ + constexpr std::size_t N = 16; // 4×4 + auto d_a = makeDeviceBuf(host, device, queue, + Logic_And_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_And_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_And::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + uint8_t* ref = Logic_And_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_Or: 4×4 bool, Or ───────────────────────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_Or) +{ + constexpr std::size_t N = 16; + auto d_a = makeDeviceBuf(host, device, queue, + Logic_Or_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_Or_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_Or::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + uint8_t* ref = Logic_Or_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_Xor: 4×4 bool, Xor ─────────────────────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_Xor) +{ + constexpr std::size_t N = 16; + auto d_a = makeDeviceBuf(host, device, queue, + Logic_Xor_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_Xor_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_Xor::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + uint8_t* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + uint8_t* ref = Logic_Xor_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_BitwiseAnd: 3×5 int32, BitwiseAnd ──────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_BitwiseAnd) +{ + constexpr std::size_t N = 15; // 3×5 + auto d_a = makeDeviceBuf(host, device, queue, + Logic_BitwiseAnd_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_BitwiseAnd_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_BitwiseAnd::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + int32_t* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + int32_t* ref = Logic_BitwiseAnd_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_BitwiseOr: 3×5 int32, BitwiseOr ────────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_BitwiseOr) +{ + constexpr std::size_t N = 15; + auto d_a = makeDeviceBuf(host, 
device, queue, + Logic_BitwiseOr_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_BitwiseOr_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_BitwiseOr::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + int32_t* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + int32_t* ref = Logic_BitwiseOr_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_BitwiseXor: 3×5 int32, BitwiseXor ──────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_BitwiseXor) +{ + constexpr std::size_t N = 15; + auto d_a = makeDeviceBuf(host, device, queue, + Logic_BitwiseXor_Input::data_a, N); + auto d_b = makeDeviceBuf(host, device, queue, + Logic_BitwiseXor_Input::data_b, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_BitwiseXor::Session session; + auto result = session.infer(d_a, d_b); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + int32_t* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + int32_t* ref = Logic_BitwiseXor_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} + +// ── Logic_BitwiseNot: 2×3×4 int32, BitwiseNot ────────────────────────────── +TEST_F(SofieAlpakaTest, Logic_BitwiseNot) +{ + constexpr std::size_t N = 24; // 2×3×4 + auto d_input = makeDeviceBuf(host, device, queue, + Logic_BitwiseNot_Input::data_a, N); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{N})); + { + SOFIE_Logic_BitwiseNot::Session session; + auto result = session.infer(d_input); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + int32_t* res = 
reinterpret_cast(alpaka::getPtrNative(result_h)); + int32_t* ref = Logic_BitwiseNot_ExpectedOutput::outputs; + for (std::size_t i = 0; i < N; ++i) + EXPECT_EQ(res[i], ref[i]) << " index=" << i; +} diff --git a/src/SOFIE_core/test/TestCustomModelsFromROOT.cxx b/test/TestCustomModelsFromROOT.cxx similarity index 100% rename from src/SOFIE_core/test/TestCustomModelsFromROOT.cxx rename to test/TestCustomModelsFromROOT.cxx diff --git a/src/SOFIE_core/test/TestSofieModels.cxx b/test/TestSofieModels.cxx similarity index 100% rename from src/SOFIE_core/test/TestSofieModels.cxx rename to test/TestSofieModels.cxx diff --git a/src/SOFIE_core/test/input_models/Abs.onnx b/test/input_models/Abs.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Abs.onnx rename to test/input_models/Abs.onnx diff --git a/src/SOFIE_core/test/input_models/Add.onnx b/test/input_models/Add.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Add.onnx rename to test/input_models/Add.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast1.onnx b/test/input_models/AddBroadcast1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast1.onnx rename to test/input_models/AddBroadcast1.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast2.onnx b/test/input_models/AddBroadcast2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast2.onnx rename to test/input_models/AddBroadcast2.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast3.onnx b/test/input_models/AddBroadcast3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast3.onnx rename to test/input_models/AddBroadcast3.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast4.onnx b/test/input_models/AddBroadcast4.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast4.onnx rename to test/input_models/AddBroadcast4.onnx diff --git 
a/src/SOFIE_core/test/input_models/AddBroadcast5.onnx b/test/input_models/AddBroadcast5.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast5.onnx rename to test/input_models/AddBroadcast5.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast6.onnx b/test/input_models/AddBroadcast6.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast6.onnx rename to test/input_models/AddBroadcast6.onnx diff --git a/src/SOFIE_core/test/input_models/AddBroadcast7.onnx b/test/input_models/AddBroadcast7.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AddBroadcast7.onnx rename to test/input_models/AddBroadcast7.onnx diff --git a/src/SOFIE_core/test/input_models/AvgPool.onnx b/test/input_models/AvgPool.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/AvgPool.onnx rename to test/input_models/AvgPool.onnx diff --git a/test/input_models/BatchNorm.onnx b/test/input_models/BatchNorm.onnx new file mode 100644 index 0000000..f03cd9a Binary files /dev/null and b/test/input_models/BatchNorm.onnx differ diff --git a/test/input_models/BatchNormRelu.onnx b/test/input_models/BatchNormRelu.onnx new file mode 100644 index 0000000..badf2c2 Binary files /dev/null and b/test/input_models/BatchNormRelu.onnx differ diff --git a/src/SOFIE_core/test/input_models/Cast.onnx b/test/input_models/Cast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Cast.onnx rename to test/input_models/Cast.onnx diff --git a/test/input_models/Clip.onnx b/test/input_models/Clip.onnx new file mode 100644 index 0000000..a91d748 Binary files /dev/null and b/test/input_models/Clip.onnx differ diff --git a/src/SOFIE_core/test/input_models/ComplexTopK.onnx b/test/input_models/ComplexTopK.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ComplexTopK.onnx rename to test/input_models/ComplexTopK.onnx diff --git a/src/SOFIE_core/test/input_models/Concat_0D.onnx 
b/test/input_models/Concat_0D.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Concat_0D.onnx rename to test/input_models/Concat_0D.onnx diff --git a/src/SOFIE_core/test/input_models/Constant.onnx b/test/input_models/Constant.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Constant.onnx rename to test/input_models/Constant.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTranspose1d.onnx b/test/input_models/ConvTranspose1d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTranspose1d.onnx rename to test/input_models/ConvTranspose1d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTranspose2d.onnx b/test/input_models/ConvTranspose2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTranspose2d.onnx rename to test/input_models/ConvTranspose2d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx b/test/input_models/ConvTransposeBias2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx rename to test/input_models/ConvTransposeBias2d.onnx diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx b/test/input_models/ConvTransposeBias2dBatched.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx rename to test/input_models/ConvTransposeBias2dBatched.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx b/test/input_models/ConvWithAsymmetricPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx rename to test/input_models/ConvWithAsymmetricPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx b/test/input_models/ConvWithAutopadSameLower.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx rename to test/input_models/ConvWithAutopadSameLower.onnx diff --git 
a/src/SOFIE_core/test/input_models/ConvWithPadding.onnx b/test/input_models/ConvWithPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithPadding.onnx rename to test/input_models/ConvWithPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx b/test/input_models/ConvWithStridesNoPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx rename to test/input_models/ConvWithStridesNoPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx b/test/input_models/ConvWithStridesPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx rename to test/input_models/ConvWithStridesPadding.onnx diff --git a/src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx b/test/input_models/ConvWithoutPadding.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx rename to test/input_models/ConvWithoutPadding.onnx diff --git a/src/SOFIE_core/test/input_models/Cos.onnx b/test/input_models/Cos.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Cos.onnx rename to test/input_models/Cos.onnx diff --git a/src/SOFIE_core/test/input_models/Div.onnx b/test/input_models/Div.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Div.onnx rename to test/input_models/Div.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_3.onnx b/test/input_models/Einsum_3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_3.onnx rename to test/input_models/Einsum_3.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_4.onnx b/test/input_models/Einsum_4.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_4.onnx rename to test/input_models/Einsum_4.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_dotprod.onnx b/test/input_models/Einsum_dotprod.onnx similarity index 
100% rename from src/SOFIE_core/test/input_models/Einsum_dotprod.onnx rename to test/input_models/Einsum_dotprod.onnx diff --git a/src/SOFIE_core/test/input_models/Einsum_matmul.onnx b/test/input_models/Einsum_matmul.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Einsum_matmul.onnx rename to test/input_models/Einsum_matmul.onnx diff --git a/src/SOFIE_core/test/input_models/Elu.onnx b/test/input_models/Elu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Elu.onnx rename to test/input_models/Elu.onnx diff --git a/src/SOFIE_core/test/input_models/Equal.onnx b/test/input_models/Equal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Equal.onnx rename to test/input_models/Equal.onnx diff --git a/src/SOFIE_core/test/input_models/Erf.onnx b/test/input_models/Erf.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Erf.onnx rename to test/input_models/Erf.onnx diff --git a/src/SOFIE_core/test/input_models/Exp.onnx b/test/input_models/Exp.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Exp.onnx rename to test/input_models/Exp.onnx diff --git a/src/SOFIE_core/test/input_models/ExpandDiffSize.onnx b/test/input_models/ExpandDiffSize.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ExpandDiffSize.onnx rename to test/input_models/ExpandDiffSize.onnx diff --git a/src/SOFIE_core/test/input_models/ExpandSameSize.onnx b/test/input_models/ExpandSameSize.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ExpandSameSize.onnx rename to test/input_models/ExpandSameSize.onnx diff --git a/src/SOFIE_core/test/input_models/EyeLike.onnx b/test/input_models/EyeLike.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/EyeLike.onnx rename to test/input_models/EyeLike.onnx diff --git a/test/input_models/GNN_model.onnx b/test/input_models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and 
b/test/input_models/GNN_model.onnx differ diff --git a/src/SOFIE_core/test/input_models/GRUBatchwise.onnx b/test/input_models/GRUBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUBatchwise.onnx rename to test/input_models/GRUBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/GRUBidirectional.onnx b/test/input_models/GRUBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUBidirectional.onnx rename to test/input_models/GRUBidirectional.onnx diff --git a/src/SOFIE_core/test/input_models/GRUDefaults.onnx b/test/input_models/GRUDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUDefaults.onnx rename to test/input_models/GRUDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/GRUInitialBias.onnx b/test/input_models/GRUInitialBias.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUInitialBias.onnx rename to test/input_models/GRUInitialBias.onnx diff --git a/src/SOFIE_core/test/input_models/GRUSeqLength.onnx b/test/input_models/GRUSeqLength.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GRUSeqLength.onnx rename to test/input_models/GRUSeqLength.onnx diff --git a/src/SOFIE_core/test/input_models/Gather2d.onnx b/test/input_models/Gather2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Gather2d.onnx rename to test/input_models/Gather2d.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis0.onnx b/test/input_models/GatherAxis0.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis0.onnx rename to test/input_models/GatherAxis0.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis1.onnx b/test/input_models/GatherAxis1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis1.onnx rename to test/input_models/GatherAxis1.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis2.onnx 
b/test/input_models/GatherAxis2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis2.onnx rename to test/input_models/GatherAxis2.onnx diff --git a/src/SOFIE_core/test/input_models/GatherAxis3.onnx b/test/input_models/GatherAxis3.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GatherAxis3.onnx rename to test/input_models/GatherAxis3.onnx diff --git a/test/input_models/GatherND_Batch.onnx b/test/input_models/GatherND_Batch.onnx new file mode 100644 index 0000000..4d146c6 Binary files /dev/null and b/test/input_models/GatherND_Batch.onnx differ diff --git a/test/input_models/GatherND_Ex1.onnx b/test/input_models/GatherND_Ex1.onnx new file mode 100644 index 0000000..bc1a910 Binary files /dev/null and b/test/input_models/GatherND_Ex1.onnx differ diff --git a/test/input_models/GatherND_Ex2.onnx b/test/input_models/GatherND_Ex2.onnx new file mode 100644 index 0000000..4cd511c Binary files /dev/null and b/test/input_models/GatherND_Ex2.onnx differ diff --git a/test/input_models/GatherND_Ex3.onnx b/test/input_models/GatherND_Ex3.onnx new file mode 100644 index 0000000..917008f Binary files /dev/null and b/test/input_models/GatherND_Ex3.onnx differ diff --git a/test/input_models/GatherND_Ex4.onnx b/test/input_models/GatherND_Ex4.onnx new file mode 100644 index 0000000..d3006a2 Binary files /dev/null and b/test/input_models/GatherND_Ex4.onnx differ diff --git a/test/input_models/GatherND_Ex5.onnx b/test/input_models/GatherND_Ex5.onnx new file mode 100644 index 0000000..be1ba0d Binary files /dev/null and b/test/input_models/GatherND_Ex5.onnx differ diff --git a/test/input_models/GatherND_NegativeIndices.onnx b/test/input_models/GatherND_NegativeIndices.onnx new file mode 100644 index 0000000..5fa05aa Binary files /dev/null and b/test/input_models/GatherND_NegativeIndices.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx b/test/input_models/GatherNegativeIndices.onnx similarity index 100% 
rename from src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx rename to test/input_models/GatherNegativeIndices.onnx diff --git a/src/SOFIE_core/test/input_models/Greater.onnx b/test/input_models/Greater.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Greater.onnx rename to test/input_models/Greater.onnx diff --git a/src/SOFIE_core/test/input_models/GreaterOrEqual.onnx b/test/input_models/GreaterOrEqual.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/GreaterOrEqual.onnx rename to test/input_models/GreaterOrEqual.onnx diff --git a/test/input_models/IsInf.onnx b/test/input_models/IsInf.onnx new file mode 100644 index 0000000..b47fe82 Binary files /dev/null and b/test/input_models/IsInf.onnx differ diff --git a/test/input_models/IsNaN.onnx b/test/input_models/IsNaN.onnx new file mode 100644 index 0000000..d1a6e05 Binary files /dev/null and b/test/input_models/IsNaN.onnx differ diff --git a/src/SOFIE_core/test/input_models/LSTMBatchwise.onnx b/test/input_models/LSTMBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMBatchwise.onnx rename to test/input_models/LSTMBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMBidirectional.onnx b/test/input_models/LSTMBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMBidirectional.onnx rename to test/input_models/LSTMBidirectional.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMDefaults.onnx b/test/input_models/LSTMDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMDefaults.onnx rename to test/input_models/LSTMDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMInitialBias.onnx b/test/input_models/LSTMInitialBias.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMInitialBias.onnx rename to test/input_models/LSTMInitialBias.onnx diff --git a/src/SOFIE_core/test/input_models/LSTMPeepholes.onnx 
b/test/input_models/LSTMPeepholes.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LSTMPeepholes.onnx rename to test/input_models/LSTMPeepholes.onnx diff --git a/test/input_models/LayerNorm.onnx b/test/input_models/LayerNorm.onnx new file mode 100644 index 0000000..97142e7 Binary files /dev/null and b/test/input_models/LayerNorm.onnx differ diff --git a/test/input_models/LayerNorm3D.onnx b/test/input_models/LayerNorm3D.onnx new file mode 100644 index 0000000..c29afc0 Binary files /dev/null and b/test/input_models/LayerNorm3D.onnx differ diff --git a/test/input_models/LayerNormScaleBias.onnx b/test/input_models/LayerNormScaleBias.onnx new file mode 100644 index 0000000..99ea540 Binary files /dev/null and b/test/input_models/LayerNormScaleBias.onnx differ diff --git a/src/SOFIE_core/test/input_models/LayerNormalization2d.onnx b/test/input_models/LayerNormalization2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LayerNormalization2d.onnx rename to test/input_models/LayerNormalization2d.onnx diff --git a/src/SOFIE_core/test/input_models/LayerNormalization4d.onnx b/test/input_models/LayerNormalization4d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LayerNormalization4d.onnx rename to test/input_models/LayerNormalization4d.onnx diff --git a/src/SOFIE_core/test/input_models/Less.onnx b/test/input_models/Less.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Less.onnx rename to test/input_models/Less.onnx diff --git a/src/SOFIE_core/test/input_models/LessOrEqual.onnx b/test/input_models/LessOrEqual.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LessOrEqual.onnx rename to test/input_models/LessOrEqual.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx b/test/input_models/LinearWithLeakyRelu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx rename to 
test/input_models/LinearWithLeakyRelu.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithSelu.onnx b/test/input_models/LinearWithSelu.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithSelu.onnx rename to test/input_models/LinearWithSelu.onnx diff --git a/src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx b/test/input_models/LinearWithSigmoid.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx rename to test/input_models/LinearWithSigmoid.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_16.onnx b/test/input_models/Linear_16.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_16.onnx rename to test/input_models/Linear_16.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_32.onnx b/test/input_models/Linear_32.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_32.onnx rename to test/input_models/Linear_32.onnx diff --git a/src/SOFIE_core/test/input_models/Linear_64.onnx b/test/input_models/Linear_64.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Linear_64.onnx rename to test/input_models/Linear_64.onnx diff --git a/src/SOFIE_core/test/input_models/Log.onnx b/test/input_models/Log.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Log.onnx rename to test/input_models/Log.onnx diff --git a/test/input_models/LogicModelGenerator.py b/test/input_models/LogicModelGenerator.py new file mode 100644 index 0000000..adb5b16 --- /dev/null +++ b/test/input_models/LogicModelGenerator.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Generate ONNX test models for the SOFIE Logic operators and write +the corresponding C++ reference headers. 
+ +Models created +────────────── + Logic_And.onnx bool input 4×4, And + Logic_Or.onnx bool input 4×4, Or + Logic_Xor.onnx bool input 4×4, Xor + Logic_BitwiseAnd.onnx int32 input 3×5, BitwiseAnd + Logic_BitwiseOr.onnx int32 input 3×5, BitwiseOr + Logic_BitwiseXor.onnx int32 input 3×5, BitwiseXor + Logic_BitwiseNot.onnx int32 input 2×3×4, BitwiseNot + +Usage: + cd /SOFIE/core/test/input_models + python3 LogicModelGenerator.py +""" + +import os +import numpy as np +import onnx +from onnx import helper, TensorProto, numpy_helper + +OUT_DIR = os.path.dirname(os.path.abspath(__file__)) +REF_DIR = os.path.join(OUT_DIR, "references") +os.makedirs(REF_DIR, exist_ok=True) + + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def ref_header(name, data, dtype_str="bool"): + """Emit a C++ header with the flattened expected output array.""" + flat = data.flatten() + if dtype_str == "bool": + # store as uint8 in C++ + vals = ", ".join(str(int(v)) for v in flat) + arr_type = "uint8_t" + else: + # int32 / int64 + vals = ", ".join(str(int(v)) for v in flat) + arr_type = dtype_str + return ( + f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"#include \n" + f"namespace {name}_ExpectedOutput {{\n" + f" static {arr_type} outputs[{flat.size}] = {{{vals}}};\n" + f"}} // namespace {name}_ExpectedOutput\n" + ) + + +def inp_header(name, data, dtype_str="bool"): + """Emit a C++ header for the input data.""" + flat = data.flatten() + if dtype_str == "bool": + vals = ", ".join(str(int(v)) for v in flat) + arr_type = "uint8_t" + else: + vals = ", ".join(str(int(v)) for v in flat) + arr_type = dtype_str + return ( + f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"#include \n" + f"namespace {name}_Input {{\n" + f" static {arr_type} data_a[{flat.size}] = {{{vals}}};\n" + f"}} // namespace 
{name}_Input\n" + ) + + +def inp_header2(name, data_a, data_b, dtype_str="bool"): + """Emit a C++ header with two input arrays (A and B).""" + flat_a = data_a.flatten() + flat_b = data_b.flatten() + if dtype_str == "bool": + arr_type = "uint8_t" + vals_a = ", ".join(str(int(v)) for v in flat_a) + vals_b = ", ".join(str(int(v)) for v in flat_b) + else: + arr_type = dtype_str + vals_a = ", ".join(str(int(v)) for v in flat_a) + vals_b = ", ".join(str(int(v)) for v in flat_b) + return ( + f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"#include \n" + f"namespace {name}_Input {{\n" + f" static {arr_type} data_a[{flat_a.size}] = {{{vals_a}}};\n" + f" static {arr_type} data_b[{flat_b.size}] = {{{vals_b}}};\n" + f"}} // namespace {name}_Input\n" + ) + + +# ───────────────────────────────────────────────────────────────────────────── +# Model builders +# ───────────────────────────────────────────────────────────────────────────── + +def make_binary_model(op_name, shape, onnx_dtype, name): + """Build a single-op binary model (A op B -> Y).""" + type_map = { + TensorProto.BOOL: "bool", + TensorProto.INT32: "int32", + TensorProto.INT64: "int64", + } + a = helper.make_tensor_value_info("input_a", onnx_dtype, shape) + b = helper.make_tensor_value_info("input_b", onnx_dtype, shape) + y = helper.make_tensor_value_info("output", onnx_dtype, shape) + + node = helper.make_node(op_name, inputs=["input_a", "input_b"], outputs=["output"]) + graph = helper.make_graph([node], name, [a, b], [y]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 18)]) + model.ir_version = 7 + onnx.checker.check_model(model) + return model + + +def make_unary_model(op_name, shape, onnx_dtype, name): + """Build a single-op unary model (X -> Y).""" + x = helper.make_tensor_value_info("input", onnx_dtype, shape) + y = helper.make_tensor_value_info("output", onnx_dtype, shape) + + node = helper.make_node(op_name, inputs=["input"], 
outputs=["output"]) + graph = helper.make_graph([node], name, [x], [y]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 18)]) + model.ir_version = 7 + onnx.checker.check_model(model) + return model + + +# ───────────────────────────────────────────────────────────────────────────── +# Test cases +# ───────────────────────────────────────────────────────────────────────────── + +rng = np.random.default_rng(42) + +BOOL_SHAPE = [4, 4] +INT_SHAPE = [3, 5] +INT3D_SHAPE = [2, 3, 4] + +# Bool inputs (stored as uint8 in SOFIE) +bool_a = rng.integers(0, 2, BOOL_SHAPE).astype(np.uint8) +bool_b = rng.integers(0, 2, BOOL_SHAPE).astype(np.uint8) + +# Int32 inputs +int_a = rng.integers(-100, 100, INT_SHAPE, dtype=np.int32) +int_b = rng.integers(-100, 100, INT_SHAPE, dtype=np.int32) + +int3d = rng.integers(-100, 100, INT3D_SHAPE, dtype=np.int32) + +BINARY_CASES = [ + # (model_name, onnx_op, onnx_dtype, shape, a, b, ref, dtype_str) + ("Logic_And", "And", TensorProto.BOOL, BOOL_SHAPE, bool_a, bool_b, + np.logical_and(bool_a, bool_b).astype(np.uint8), "bool"), + ("Logic_Or", "Or", TensorProto.BOOL, BOOL_SHAPE, bool_a, bool_b, + np.logical_or(bool_a, bool_b).astype(np.uint8), "bool"), + ("Logic_Xor", "Xor", TensorProto.BOOL, BOOL_SHAPE, bool_a, bool_b, + np.logical_xor(bool_a, bool_b).astype(np.uint8), "bool"), + ("Logic_BitwiseAnd","BitwiseAnd", TensorProto.INT32, INT_SHAPE, int_a, int_b, + (int_a & int_b).astype(np.int32), "int32_t"), + ("Logic_BitwiseOr", "BitwiseOr", TensorProto.INT32, INT_SHAPE, int_a, int_b, + (int_a | int_b).astype(np.int32), "int32_t"), + ("Logic_BitwiseXor","BitwiseXor", TensorProto.INT32, INT_SHAPE, int_a, int_b, + (int_a ^ int_b).astype(np.int32), "int32_t"), +] + +for (name, onnx_op, onnx_dtype, shape, a, b, ref, dtype_str) in BINARY_CASES: + model = make_binary_model(onnx_op, shape, onnx_dtype, name) + onnx_path = os.path.join(OUT_DIR, f"{name}.onnx") + onnx.save(model, onnx_path) + print(f"Saved {onnx_path}") + + ref_path = 
os.path.join(REF_DIR, f"{name}.ref.hxx") + with open(ref_path, "w") as f: + f.write(ref_header(name, ref, dtype_str)) + print(f" → reference {ref_path}") + + inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx") + with open(inp_path, "w") as f: + f.write(inp_header2(name, a, b, dtype_str)) + print(f" → input ref {inp_path}") + + +# Unary BitwiseNot +name = "Logic_BitwiseNot" +model = make_unary_model("BitwiseNot", INT3D_SHAPE, TensorProto.INT32, name) +onnx_path = os.path.join(OUT_DIR, f"{name}.onnx") +onnx.save(model, onnx_path) +print(f"Saved {onnx_path}") + +ref_not = (~int3d).astype(np.int32) +ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx") +with open(ref_path, "w") as f: + f.write(ref_header(name, ref_not, "int32_t")) +print(f" → reference {ref_path}") + +inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx") +with open(inp_path, "w") as f: + # Single-input version + flat = int3d.flatten() + vals = ", ".join(str(int(v)) for v in flat) + content = ( + f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"#include \n" + f"namespace {name}_Input {{\n" + f" static int32_t data_a[{flat.size}] = {{{vals}}};\n" + f"}} // namespace {name}_Input\n" + ) + f.write(content) +print(f" → input ref {inp_path}") + +print("\nAll Logic test models and references generated successfully.") diff --git a/test/input_models/Logic_And.onnx b/test/input_models/Logic_And.onnx new file mode 100644 index 0000000..ea0dbce Binary files /dev/null and b/test/input_models/Logic_And.onnx differ diff --git a/test/input_models/Logic_BitwiseAnd.onnx b/test/input_models/Logic_BitwiseAnd.onnx new file mode 100644 index 0000000..a7bf522 Binary files /dev/null and b/test/input_models/Logic_BitwiseAnd.onnx differ diff --git a/test/input_models/Logic_BitwiseNot.onnx b/test/input_models/Logic_BitwiseNot.onnx new file mode 100644 index 0000000..6ec0a35 Binary files /dev/null and b/test/input_models/Logic_BitwiseNot.onnx differ diff --git 
a/test/input_models/Logic_BitwiseOr.onnx b/test/input_models/Logic_BitwiseOr.onnx new file mode 100644 index 0000000..49ae37e Binary files /dev/null and b/test/input_models/Logic_BitwiseOr.onnx differ diff --git a/test/input_models/Logic_BitwiseXor.onnx b/test/input_models/Logic_BitwiseXor.onnx new file mode 100644 index 0000000..aba2037 Binary files /dev/null and b/test/input_models/Logic_BitwiseXor.onnx differ diff --git a/test/input_models/Logic_Or.onnx b/test/input_models/Logic_Or.onnx new file mode 100644 index 0000000..563e9ec Binary files /dev/null and b/test/input_models/Logic_Or.onnx differ diff --git a/test/input_models/Logic_Xor.onnx b/test/input_models/Logic_Xor.onnx new file mode 100644 index 0000000..c7067e1 Binary files /dev/null and b/test/input_models/Logic_Xor.onnx differ diff --git a/src/SOFIE_core/test/input_models/Max.onnx b/test/input_models/Max.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Max.onnx rename to test/input_models/Max.onnx diff --git a/src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx b/test/input_models/MaxMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx rename to test/input_models/MaxMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool1d.onnx b/test/input_models/MaxPool1d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool1d.onnx rename to test/input_models/MaxPool1d.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool2d.onnx b/test/input_models/MaxPool2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool2d.onnx rename to test/input_models/MaxPool2d.onnx diff --git a/src/SOFIE_core/test/input_models/MaxPool3d.onnx b/test/input_models/MaxPool3d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MaxPool3d.onnx rename to test/input_models/MaxPool3d.onnx diff --git 
a/src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx b/test/input_models/MeanMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx rename to test/input_models/MeanMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx b/test/input_models/MinMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx rename to test/input_models/MinMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/Mul.onnx b/test/input_models/Mul.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Mul.onnx rename to test/input_models/Mul.onnx diff --git a/src/SOFIE_core/test/input_models/Neg.onnx b/test/input_models/Neg.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Neg.onnx rename to test/input_models/Neg.onnx diff --git a/test/input_models/Not.onnx b/test/input_models/Not.onnx new file mode 100644 index 0000000..b29ca99 Binary files /dev/null and b/test/input_models/Not.onnx differ diff --git a/src/SOFIE_core/test/input_models/Pad.onnx b/test/input_models/Pad.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pad.onnx rename to test/input_models/Pad.onnx diff --git a/src/SOFIE_core/test/input_models/Pow.onnx b/test/input_models/Pow.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pow.onnx rename to test/input_models/Pow.onnx diff --git a/src/SOFIE_core/test/input_models/Pow_broadcast.onnx b/test/input_models/Pow_broadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Pow_broadcast.onnx rename to test/input_models/Pow_broadcast.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBatchwise.onnx b/test/input_models/RNNBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBatchwise.onnx rename to 
test/input_models/RNNBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBidirectional.onnx b/test/input_models/RNNBidirectional.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBidirectional.onnx rename to test/input_models/RNNBidirectional.onnx diff --git a/src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx b/test/input_models/RNNBidirectionalBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx rename to test/input_models/RNNBidirectionalBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/RNNDefaults.onnx b/test/input_models/RNNDefaults.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNDefaults.onnx rename to test/input_models/RNNDefaults.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSeqLength.onnx b/test/input_models/RNNSeqLength.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSeqLength.onnx rename to test/input_models/RNNSeqLength.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSequence.onnx b/test/input_models/RNNSequence.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSequence.onnx rename to test/input_models/RNNSequence.onnx diff --git a/src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx b/test/input_models/RNNSequenceBatchwise.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx rename to test/input_models/RNNSequenceBatchwise.onnx diff --git a/src/SOFIE_core/test/input_models/RandomNormal.onnx b/test/input_models/RandomNormal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RandomNormal.onnx rename to test/input_models/RandomNormal.onnx diff --git a/src/SOFIE_core/test/input_models/RandomUniform.onnx b/test/input_models/RandomUniform.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RandomUniform.onnx rename to test/input_models/RandomUniform.onnx 
diff --git a/src/SOFIE_core/test/input_models/RangeFloat.onnx b/test/input_models/RangeFloat.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RangeFloat.onnx rename to test/input_models/RangeFloat.onnx diff --git a/src/SOFIE_core/test/input_models/RangeInt.onnx b/test/input_models/RangeInt.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/RangeInt.onnx rename to test/input_models/RangeInt.onnx diff --git a/src/SOFIE_core/test/input_models/Reciprocal.onnx b/test/input_models/Reciprocal.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Reciprocal.onnx rename to test/input_models/Reciprocal.onnx diff --git a/test/input_models/ReduceL2.onnx b/test/input_models/ReduceL2.onnx new file mode 100644 index 0000000..1aadbc8 Binary files /dev/null and b/test/input_models/ReduceL2.onnx differ diff --git a/test/input_models/ReduceL2Large.onnx b/test/input_models/ReduceL2Large.onnx new file mode 100644 index 0000000..75d4fc7 Binary files /dev/null and b/test/input_models/ReduceL2Large.onnx differ diff --git a/test/input_models/ReduceMax.onnx b/test/input_models/ReduceMax.onnx new file mode 100644 index 0000000..fc837e8 Binary files /dev/null and b/test/input_models/ReduceMax.onnx differ diff --git a/test/input_models/ReduceMaxModelGenerator.py b/test/input_models/ReduceMaxModelGenerator.py new file mode 100644 index 0000000..305948f --- /dev/null +++ b/test/input_models/ReduceMaxModelGenerator.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +""" +Generate ONNX test models for the SOFIE ReduceMax operator and write +the corresponding C++ reference headers. 
+ +Models created +────────────── + ReduceMax.onnx [1,2,3] float, axis=1, keepdims=0 (kLast / matches ReduceMean) + ReduceMax_axis0.onnx [3,4] float, axis=0, keepdims=0 (kFirst) + ReduceMax_mid.onnx [2,3,4] float, axis=1, keepdims=0 (kMiddle) + +Usage: + cd /SOFIE/core/test/input_models + python3 ReduceMaxModelGenerator.py +""" + +import os +import numpy as np +import onnx +from onnx import helper, TensorProto + +OUT_DIR = os.path.dirname(os.path.abspath(__file__)) +REF_DIR = os.path.join(OUT_DIR, "references") +os.makedirs(REF_DIR, exist_ok=True) + + +def make_reducemax_model(input_shape, axes, keepdims, name): + """Build a single ReduceMax node model (opset 13, axes as attribute).""" + x = helper.make_tensor_value_info("input", TensorProto.FLOAT, input_shape) + # Compute output shape + out_shape = list(input_shape) + for ax in axes: + if keepdims: + out_shape[ax] = 1 + else: + out_shape[ax] = None # will be removed below + if not keepdims: + out_shape = [d for i, d in enumerate(out_shape) if i not in axes] + y = helper.make_tensor_value_info("output", TensorProto.FLOAT, out_shape) + + node = helper.make_node( + "ReduceMax", + inputs=["input"], + outputs=["output"], + axes=axes, + keepdims=keepdims, + ) + graph = helper.make_graph([node], name, [x], [y]) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) + model.ir_version = 7 + onnx.checker.check_model(model) + return model + + +def ref_header(name, data): + flat = data.flatten() + vals = ", ".join(f"{v:.8f}f" for v in flat) + return ( + f"// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"namespace {name}_ExpectedOutput {{\n" + f" static float output[{flat.size}] = {{{vals}}};\n" + f"}} // namespace {name}_ExpectedOutput\n" + ) + + +rng = np.random.default_rng(42) + +CASES = [ + # (name, input_shape, axes, keepdims, input_data) + # axis=1 on [1,2,3] → reduces last dim → kLast code path + ("ReduceMax", [1, 2, 3], [1], 0, np.array([[[5., 2., 3.], 
[5., 5., 4.]]], dtype=np.float32)), + # axis=0 on [3,4] → reduces first dim → kFirst code path + ("ReduceMax_axis0", [3, 4], [0], 0, rng.standard_normal((3, 4)).astype(np.float32)), + # axis=1 on [2,3,4] → reduces middle dim → kMiddle code path + ("ReduceMax_mid", [2, 3, 4], [1], 0, rng.standard_normal((2, 3, 4)).astype(np.float32)), +] + +for (name, shape, axes, keepdims, x) in CASES: + model = make_reducemax_model(shape, axes, keepdims, name) + onnx_path = os.path.join(OUT_DIR, f"{name}.onnx") + onnx.save(model, onnx_path) + print(f"Saved {onnx_path}") + + y = np.max(x, axis=tuple(axes), keepdims=bool(keepdims)) + ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx") + with open(ref_path, "w") as f: + f.write(ref_header(name, y)) + print(f" → reference {ref_path} shape={list(y.shape)}") + +print("\nAll ReduceMax test models and references generated successfully.") diff --git a/test/input_models/ReduceMax_axis0.onnx b/test/input_models/ReduceMax_axis0.onnx new file mode 100644 index 0000000..632fbab Binary files /dev/null and b/test/input_models/ReduceMax_axis0.onnx differ diff --git a/test/input_models/ReduceMax_mid.onnx b/test/input_models/ReduceMax_mid.onnx new file mode 100644 index 0000000..d49a222 Binary files /dev/null and b/test/input_models/ReduceMax_mid.onnx differ diff --git a/src/SOFIE_core/test/input_models/ReduceMean.onnx b/test/input_models/ReduceMean.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceMean.onnx rename to test/input_models/ReduceMean.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceProd.onnx b/test/input_models/ReduceProd.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceProd.onnx rename to test/input_models/ReduceProd.onnx diff --git a/src/SOFIE_core/test/input_models/ReduceSum.onnx b/test/input_models/ReduceSum.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceSum.onnx rename to test/input_models/ReduceSum.onnx diff --git 
a/src/SOFIE_core/test/input_models/ReduceSumSquare.onnx b/test/input_models/ReduceSumSquare.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ReduceSumSquare.onnx rename to test/input_models/ReduceSumSquare.onnx diff --git a/src/SOFIE_core/test/input_models/ScatterElements.onnx b/test/input_models/ScatterElements.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/ScatterElements.onnx rename to test/input_models/ScatterElements.onnx diff --git a/src/SOFIE_core/test/input_models/Shape.onnx b/test/input_models/Shape.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Shape.onnx rename to test/input_models/Shape.onnx diff --git a/src/SOFIE_core/test/input_models/Sin.onnx b/test/input_models/Sin.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Sin.onnx rename to test/input_models/Sin.onnx diff --git a/src/SOFIE_core/test/input_models/Slice.onnx b/test/input_models/Slice.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice.onnx rename to test/input_models/Slice.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx b/test/input_models/Slice_Default_Axis.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx rename to test/input_models/Slice_Default_Axis.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx b/test/input_models/Slice_Default_Steps.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx rename to test/input_models/Slice_Default_Steps.onnx diff --git a/src/SOFIE_core/test/input_models/Slice_Neg.onnx b/test/input_models/Slice_Neg.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Slice_Neg.onnx rename to test/input_models/Slice_Neg.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax1d.onnx b/test/input_models/Softmax1d.onnx similarity index 100% rename from 
src/SOFIE_core/test/input_models/Softmax1d.onnx rename to test/input_models/Softmax1d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax2d.onnx b/test/input_models/Softmax2d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax2d.onnx rename to test/input_models/Softmax2d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax3d.onnx b/test/input_models/Softmax3d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax3d.onnx rename to test/input_models/Softmax3d.onnx diff --git a/src/SOFIE_core/test/input_models/Softmax4d.onnx b/test/input_models/Softmax4d.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Softmax4d.onnx rename to test/input_models/Softmax4d.onnx diff --git a/test/input_models/Softplus.onnx b/test/input_models/Softplus.onnx new file mode 100644 index 0000000..2f6a69f --- /dev/null +++ b/test/input_models/Softplus.onnx @@ -0,0 +1,11 @@ +  onnx-example:S + +inputoutput"SoftplusAbsZ +input +  + +b +output +  + +B \ No newline at end of file diff --git a/src/SOFIE_core/test/input_models/Split_0.onnx b/test/input_models/Split_0.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_0.onnx rename to test/input_models/Split_0.onnx diff --git a/src/SOFIE_core/test/input_models/Split_1.onnx b/test/input_models/Split_1.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_1.onnx rename to test/input_models/Split_1.onnx diff --git a/src/SOFIE_core/test/input_models/Split_2.onnx b/test/input_models/Split_2.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Split_2.onnx rename to test/input_models/Split_2.onnx diff --git a/src/SOFIE_core/test/input_models/Sqrt.onnx b/test/input_models/Sqrt.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Sqrt.onnx rename to test/input_models/Sqrt.onnx diff --git a/src/SOFIE_core/test/input_models/Sub.onnx b/test/input_models/Sub.onnx similarity index 100% 
rename from src/SOFIE_core/test/input_models/Sub.onnx rename to test/input_models/Sub.onnx diff --git a/src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx b/test/input_models/SumMultidirectionalBroadcast.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx rename to test/input_models/SumMultidirectionalBroadcast.onnx diff --git a/src/SOFIE_core/test/input_models/Tanh.onnx b/test/input_models/Tanh.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Tanh.onnx rename to test/input_models/Tanh.onnx diff --git a/src/SOFIE_core/test/input_models/Tile5D.onnx b/test/input_models/Tile5D.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Tile5D.onnx rename to test/input_models/Tile5D.onnx diff --git a/src/SOFIE_core/test/input_models/TopK.onnx b/test/input_models/TopK.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/TopK.onnx rename to test/input_models/TopK.onnx diff --git a/test/input_models/Transpose.onnx b/test/input_models/Transpose.onnx new file mode 100644 index 0000000..0e08157 Binary files /dev/null and b/test/input_models/Transpose.onnx differ diff --git a/test/input_models/TriluModelGenerator.py b/test/input_models/TriluModelGenerator.py new file mode 100644 index 0000000..de110c4 --- /dev/null +++ b/test/input_models/TriluModelGenerator.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Generate ONNX test models for the SOFIE Trilu operator and write +the corresponding C++ reference headers. 
+ +Models created +────────────── + Trilu_upper.onnx 4×4 float, upper=1, k=0 (standard upper triangular) + Trilu_lower.onnx 4×4 float, upper=0, k=0 (standard lower triangular) + Trilu_k2.onnx 3×5 float, upper=1, k=2 (offset shifts diagonal up) + Trilu_kn1.onnx 3×5 float, upper=0, k=-1 (offset shifts diagonal down) + Trilu_3D.onnx 2×3×4 float, upper=1, k=0 (batched) + +All k values are embedded as ONNX Constant nodes (scalar int64) so they are +resolved statically by SOFIE. + +Usage: + cd /SOFIE/core/test/input_models + python3 TriluModelGenerator.py +""" + +import os +import numpy as np +import onnx +from onnx import helper, TensorProto, numpy_helper + +OUT_DIR = os.path.dirname(os.path.abspath(__file__)) +REF_DIR = os.path.join(OUT_DIR, "references") +os.makedirs(REF_DIR, exist_ok=True) + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def make_trilu_model(input_shape: list[int], + upper: int, + k: int, + name: str) -> onnx.ModelProto: + """Build an ONNX model with a single Trilu node. + + k is embedded as a Constant node so SOFIE can resolve it statically. 
+ """ + # Graph inputs + x = helper.make_tensor_value_info("input", TensorProto.FLOAT, input_shape) + + # Constant node for k + k_const_name = "k_const" + k_tensor = numpy_helper.from_array(np.array(k, dtype=np.int64), + name=k_const_name) + k_node = helper.make_node( + "Constant", + inputs=[], + outputs=[k_const_name], + value=k_tensor, + ) + + # Trilu node + trilu_node = helper.make_node( + "Trilu", + inputs=["input", k_const_name], + outputs=["output"], + upper=upper, + ) + + # Graph output (shape same as input) + y = helper.make_tensor_value_info("output", TensorProto.FLOAT, input_shape) + + graph = helper.make_graph( + nodes=[k_node, trilu_node], + name=name, + inputs=[x], + outputs=[y], + ) + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) + model.ir_version = 7 + onnx.checker.check_model(model) + return model + + +def ref_header(name: str, data: np.ndarray) -> str: + """Emit a C++ header with the flattened expected output array.""" + flat = data.flatten() + vals = ", ".join(f"{v:.8f}f" for v in flat) + return ( + f"// Auto-generated by TriluModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"namespace {name}_ExpectedOutput {{\n" + f" static float outputs[{flat.size}] = {{{vals}}};\n" + f"}} // namespace {name}_ExpectedOutput\n" + ) + + +def trilu_ref(x: np.ndarray, upper: int, k: int) -> np.ndarray: + """NumPy reference implementation of the Trilu operator.""" + if upper: + return np.triu(x, k=k).astype(np.float32) + else: + return np.tril(x, k=k).astype(np.float32) + + +# ───────────────────────────────────────────────────────────────────────────── +# Test cases +# ───────────────────────────────────────────────────────────────────────────── + +rng = np.random.default_rng(42) + +CASES = [ + # (name, shape, upper, k, input_values) + ("Trilu_upper", [4, 4], 1, 0, rng.standard_normal((4, 4)).astype(np.float32)), + ("Trilu_lower", [4, 4], 0, 0, rng.standard_normal((4, 4)).astype(np.float32)), + ("Trilu_k2", [3, 5], 1, 2, 
rng.standard_normal((3, 5)).astype(np.float32)), + ("Trilu_kn1", [3, 5], 0, -1, rng.standard_normal((3, 5)).astype(np.float32)), + ("Trilu_3D", [2, 3, 4], 1, 0, rng.standard_normal((2, 3, 4)).astype(np.float32)), +] + +for (name, shape, upper, k, x) in CASES: + # ── ONNX model ─────────────────────────────────────────────────────────── + model = make_trilu_model(shape, upper, k, name) + onnx_path = os.path.join(OUT_DIR, f"{name}.onnx") + onnx.save(model, onnx_path) + print(f"Saved {onnx_path}") + + # ── Reference output ───────────────────────────────────────────────────── + y = trilu_ref(x, upper, k) + ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx") + with open(ref_path, "w") as f: + f.write(ref_header(name, y)) + print(f" → reference {ref_path} shape={list(y.shape)}") + + # Also save the input so tests can reconstruct it. + inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx") + with open(inp_path, "w") as f: + flat_in = x.flatten() + vals_in = ", ".join(f"{v:.8f}f" for v in flat_in) + f.write( + f"// Auto-generated by TriluModelGenerator.py — DO NOT EDIT\n" + f"#pragma once\n" + f"namespace {name}_Input {{\n" + f" static float data[{flat_in.size}] = {{{vals_in}}};\n" + f"}} // namespace {name}_Input\n" + ) + print(f" → input ref {inp_path}") + +print("\nAll Trilu test models and references generated successfully.") diff --git a/test/input_models/Trilu_3D.onnx b/test/input_models/Trilu_3D.onnx new file mode 100644 index 0000000..0a17c1b Binary files /dev/null and b/test/input_models/Trilu_3D.onnx differ diff --git a/test/input_models/Trilu_k2.onnx b/test/input_models/Trilu_k2.onnx new file mode 100644 index 0000000..c484241 Binary files /dev/null and b/test/input_models/Trilu_k2.onnx differ diff --git a/test/input_models/Trilu_kn1.onnx b/test/input_models/Trilu_kn1.onnx new file mode 100644 index 0000000..c9865c3 Binary files /dev/null and b/test/input_models/Trilu_kn1.onnx differ diff --git a/test/input_models/Trilu_lower.onnx 
b/test/input_models/Trilu_lower.onnx new file mode 100644 index 0000000..9ac93d6 Binary files /dev/null and b/test/input_models/Trilu_lower.onnx differ diff --git a/test/input_models/Trilu_upper.onnx b/test/input_models/Trilu_upper.onnx new file mode 100644 index 0000000..637567b Binary files /dev/null and b/test/input_models/Trilu_upper.onnx differ diff --git a/src/SOFIE_core/test/input_models/Where.onnx b/test/input_models/Where.onnx similarity index 100% rename from src/SOFIE_core/test/input_models/Where.onnx rename to test/input_models/Where.onnx diff --git a/src/SOFIE_core/test/input_models/references/Add.ref.hxx b/test/input_models/references/Add.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Add.ref.hxx rename to test/input_models/references/Add.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx b/test/input_models/references/AddBroadcast1.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx rename to test/input_models/references/AddBroadcast1.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx b/test/input_models/references/AddBroadcast2.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx rename to test/input_models/references/AddBroadcast2.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx b/test/input_models/references/AddBroadcast3.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx rename to test/input_models/references/AddBroadcast3.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx b/test/input_models/references/AddBroadcast4.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx rename to test/input_models/references/AddBroadcast4.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx b/test/input_models/references/AddBroadcast5.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx rename to test/input_models/references/AddBroadcast5.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx b/test/input_models/references/AddBroadcast6.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx rename to test/input_models/references/AddBroadcast6.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx b/test/input_models/references/AddBroadcast7.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx rename to test/input_models/references/AddBroadcast7.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx b/test/input_models/references/AvgPool.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx rename to test/input_models/references/AvgPool.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Cast.ref.hxx b/test/input_models/references/Cast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Cast.ref.hxx rename to test/input_models/references/Cast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx b/test/input_models/references/ComplexTopK.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx rename to test/input_models/references/ComplexTopK.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Constant.ref.hxx b/test/input_models/references/Constant.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Constant.ref.hxx rename to test/input_models/references/Constant.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx b/test/input_models/references/ConvTranspose1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx rename to test/input_models/references/ConvTranspose1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx b/test/input_models/references/ConvTranspose2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx rename to test/input_models/references/ConvTranspose2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx b/test/input_models/references/ConvTranspose3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx rename to test/input_models/references/ConvTranspose3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx b/test/input_models/references/ConvTransposeBias2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx rename to test/input_models/references/ConvTransposeBias2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx b/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx rename to test/input_models/references/ConvTransposeBias2dBatched.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx b/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx rename to test/input_models/references/ConvWithAsymmetricPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx 
b/test/input_models/references/ConvWithAutopadSameLower.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx rename to test/input_models/references/ConvWithAutopadSameLower.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx b/test/input_models/references/ConvWithPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx rename to test/input_models/references/ConvWithPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx b/test/input_models/references/ConvWithStridesNoPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx rename to test/input_models/references/ConvWithStridesNoPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx b/test/input_models/references/ConvWithStridesPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx rename to test/input_models/references/ConvWithStridesPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx b/test/input_models/references/ConvWithoutPadding.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx rename to test/input_models/references/ConvWithoutPadding.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Div.ref.hxx b/test/input_models/references/Div.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Div.ref.hxx rename to test/input_models/references/Div.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Elu.ref.hxx b/test/input_models/references/Elu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Elu.ref.hxx rename to 
test/input_models/references/Elu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Equal.ref.hxx b/test/input_models/references/Equal.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Equal.ref.hxx rename to test/input_models/references/Equal.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Erf.ref.hxx b/test/input_models/references/Erf.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Erf.ref.hxx rename to test/input_models/references/Erf.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Exp.ref.hxx b/test/input_models/references/Exp.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Exp.ref.hxx rename to test/input_models/references/Exp.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx b/test/input_models/references/ExpandDiffSize.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx rename to test/input_models/references/ExpandDiffSize.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx b/test/input_models/references/ExpandSameSize.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx rename to test/input_models/references/ExpandSameSize.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx b/test/input_models/references/EyeLike.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx rename to test/input_models/references/EyeLike.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx b/test/input_models/references/GRUBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx rename to test/input_models/references/GRUBatchwise.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx b/test/input_models/references/GRUBidirectional.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx rename to test/input_models/references/GRUBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx b/test/input_models/references/GRUDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx rename to test/input_models/references/GRUDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx b/test/input_models/references/GRUInitialBias.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx rename to test/input_models/references/GRUInitialBias.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx b/test/input_models/references/GRUSeqLength.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx rename to test/input_models/references/GRUSeqLength.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx b/test/input_models/references/Gather2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx rename to test/input_models/references/Gather2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx b/test/input_models/references/GatherAxis0.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx rename to test/input_models/references/GatherAxis0.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx b/test/input_models/references/GatherAxis1.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx rename to 
test/input_models/references/GatherAxis1.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx b/test/input_models/references/GatherAxis2.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx rename to test/input_models/references/GatherAxis2.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx b/test/input_models/references/GatherAxis3.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx rename to test/input_models/references/GatherAxis3.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx b/test/input_models/references/GatherNegativeIndices.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx rename to test/input_models/references/GatherNegativeIndices.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Greater.ref.hxx b/test/input_models/references/Greater.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Greater.ref.hxx rename to test/input_models/references/Greater.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx b/test/input_models/references/GreaterOrEqual.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx rename to test/input_models/references/GreaterOrEqual.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx b/test/input_models/references/LSTMBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx rename to test/input_models/references/LSTMBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx b/test/input_models/references/LSTMBidirectional.ref.hxx similarity index 100% rename from 
src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx rename to test/input_models/references/LSTMBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx b/test/input_models/references/LSTMDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx rename to test/input_models/references/LSTMDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx b/test/input_models/references/LSTMInitialBias.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx rename to test/input_models/references/LSTMInitialBias.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx b/test/input_models/references/LSTMPeepholes.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx rename to test/input_models/references/LSTMPeepholes.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx b/test/input_models/references/LayerNormalization2d.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx rename to test/input_models/references/LayerNormalization2d.hxx diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx b/test/input_models/references/LayerNormalization4d.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx rename to test/input_models/references/LayerNormalization4d.hxx diff --git a/src/SOFIE_core/test/input_models/references/Less.ref.hxx b/test/input_models/references/Less.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Less.ref.hxx rename to test/input_models/references/Less.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx 
b/test/input_models/references/LessOrEqual.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx rename to test/input_models/references/LessOrEqual.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx b/test/input_models/references/LinearWithLeakyRelu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx rename to test/input_models/references/LinearWithLeakyRelu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx b/test/input_models/references/LinearWithSelu.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx rename to test/input_models/references/LinearWithSelu.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx b/test/input_models/references/LinearWithSigmoid.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx rename to test/input_models/references/LinearWithSigmoid.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx b/test/input_models/references/Linear_16.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx rename to test/input_models/references/Linear_16.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx b/test/input_models/references/Linear_32.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx rename to test/input_models/references/Linear_32.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx b/test/input_models/references/Linear_64.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx rename to test/input_models/references/Linear_64.ref.hxx diff --git 
a/src/SOFIE_core/test/input_models/references/Log.ref.hxx b/test/input_models/references/Log.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Log.ref.hxx rename to test/input_models/references/Log.ref.hxx diff --git a/test/input_models/references/Logic_And.ref.hxx b/test/input_models/references/Logic_And.ref.hxx new file mode 100644 index 0000000..02b36dc --- /dev/null +++ b/test/input_models/references/Logic_And.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_And_ExpectedOutput { + static uint8_t outputs[16] = {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0}; +} // namespace Logic_And_ExpectedOutput diff --git a/test/input_models/references/Logic_And_input.ref.hxx b/test/input_models/references/Logic_And_input.ref.hxx new file mode 100644 index 0000000..0caf6a7 --- /dev/null +++ b/test/input_models/references/Logic_And_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_And_Input { + static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1}; + static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0}; +} // namespace Logic_And_Input diff --git a/test/input_models/references/Logic_BitwiseAnd.ref.hxx b/test/input_models/references/Logic_BitwiseAnd.ref.hxx new file mode 100644 index 0000000..d8c16af --- /dev/null +++ b/test/input_models/references/Logic_BitwiseAnd.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseAnd_ExpectedOutput { + static int32_t outputs[15] = {14, 2, 5, 32, 66, 64, -47, 26, -92, 1, 40, 32, 32, 84, 48}; +} // namespace Logic_BitwiseAnd_ExpectedOutput diff --git a/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx b/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx new file mode 100644 index 0000000..99c049b --- /dev/null 
+++ b/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseAnd_Input { + static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11}; + static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48}; +} // namespace Logic_BitwiseAnd_Input diff --git a/test/input_models/references/Logic_BitwiseNot.ref.hxx b/test/input_models/references/Logic_BitwiseNot.ref.hxx new file mode 100644 index 0000000..a8de28f --- /dev/null +++ b/test/input_models/references/Logic_BitwiseNot.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseNot_ExpectedOutput { + static int32_t outputs[24] = {26, -94, 17, 34, -82, 25, 84, 6, -60, 62, 7, 74, -38, 4, 33, 54, -13, -34, -89, 12, 67, -67, -26, -41}; +} // namespace Logic_BitwiseNot_ExpectedOutput diff --git a/test/input_models/references/Logic_BitwiseNot_input.ref.hxx b/test/input_models/references/Logic_BitwiseNot_input.ref.hxx new file mode 100644 index 0000000..34408f7 --- /dev/null +++ b/test/input_models/references/Logic_BitwiseNot_input.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseNot_Input { + static int32_t data_a[24] = {-27, 93, -18, -35, 81, -26, -85, -7, 59, -63, -8, -75, 37, -5, -34, -55, 12, 33, 88, -13, -68, 66, 25, 40}; +} // namespace Logic_BitwiseNot_Input diff --git a/test/input_models/references/Logic_BitwiseOr.ref.hxx b/test/input_models/references/Logic_BitwiseOr.ref.hxx new file mode 100644 index 0000000..25ff8ac --- /dev/null +++ b/test/input_models/references/Logic_BitwiseOr.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseOr_ExpectedOutput { + static int32_t outputs[15] = {-18, 43, 
127, -69, -57, -27, -5, -1, -67, 59, -70, -14, -83, 94, -11}; +} // namespace Logic_BitwiseOr_ExpectedOutput diff --git a/test/input_models/references/Logic_BitwiseOr_input.ref.hxx b/test/input_models/references/Logic_BitwiseOr_input.ref.hxx new file mode 100644 index 0000000..c77c05e --- /dev/null +++ b/test/input_models/references/Logic_BitwiseOr_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseOr_Input { + static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11}; + static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48}; +} // namespace Logic_BitwiseOr_Input diff --git a/test/input_models/references/Logic_BitwiseXor.ref.hxx b/test/input_models/references/Logic_BitwiseXor.ref.hxx new file mode 100644 index 0000000..0885a6a --- /dev/null +++ b/test/input_models/references/Logic_BitwiseXor.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseXor_ExpectedOutput { + static int32_t outputs[15] = {-32, 41, 122, -101, -123, -91, 42, -27, 25, 58, -110, -46, -115, 10, -59}; +} // namespace Logic_BitwiseXor_ExpectedOutput diff --git a/test/input_models/references/Logic_BitwiseXor_input.ref.hxx b/test/input_models/references/Logic_BitwiseXor_input.ref.hxx new file mode 100644 index 0000000..dae7b9c --- /dev/null +++ b/test/input_models/references/Logic_BitwiseXor_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_BitwiseXor_Input { + static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11}; + static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48}; +} // namespace Logic_BitwiseXor_Input diff --git a/test/input_models/references/Logic_Or.ref.hxx 
b/test/input_models/references/Logic_Or.ref.hxx new file mode 100644 index 0000000..311de40 --- /dev/null +++ b/test/input_models/references/Logic_Or.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_Or_ExpectedOutput { + static uint8_t outputs[16] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +} // namespace Logic_Or_ExpectedOutput diff --git a/test/input_models/references/Logic_Or_input.ref.hxx b/test/input_models/references/Logic_Or_input.ref.hxx new file mode 100644 index 0000000..6d3bd98 --- /dev/null +++ b/test/input_models/references/Logic_Or_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_Or_Input { + static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1}; + static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0}; +} // namespace Logic_Or_Input diff --git a/test/input_models/references/Logic_Xor.ref.hxx b/test/input_models/references/Logic_Xor.ref.hxx new file mode 100644 index 0000000..af57d25 --- /dev/null +++ b/test/input_models/references/Logic_Xor.ref.hxx @@ -0,0 +1,6 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_Xor_ExpectedOutput { + static uint8_t outputs[16] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1}; +} // namespace Logic_Xor_ExpectedOutput diff --git a/test/input_models/references/Logic_Xor_input.ref.hxx b/test/input_models/references/Logic_Xor_input.ref.hxx new file mode 100644 index 0000000..51d62e3 --- /dev/null +++ b/test/input_models/references/Logic_Xor_input.ref.hxx @@ -0,0 +1,7 @@ +// Auto-generated by LogicModelGenerator.py — DO NOT EDIT +#pragma once +#include +namespace Logic_Xor_Input { + static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1}; + static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0}; +} // namespace 
Logic_Xor_Input diff --git a/src/SOFIE_core/test/input_models/references/Max.ref.hxx b/test/input_models/references/Max.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Max.ref.hxx rename to test/input_models/references/Max.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx rename to test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx b/test/input_models/references/MaxPool1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx rename to test/input_models/references/MaxPool1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx b/test/input_models/references/MaxPool2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx rename to test/input_models/references/MaxPool2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx b/test/input_models/references/MaxPool3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx rename to test/input_models/references/MaxPool3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx rename to test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx similarity index 100% 
rename from src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx rename to test/input_models/references/MinMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Mul.ref.hxx b/test/input_models/references/Mul.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Mul.ref.hxx rename to test/input_models/references/Mul.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Neg.ref.hxx b/test/input_models/references/Neg.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Neg.ref.hxx rename to test/input_models/references/Neg.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Pow.ref.hxx b/test/input_models/references/Pow.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Pow.ref.hxx rename to test/input_models/references/Pow.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx b/test/input_models/references/Pow_broadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx rename to test/input_models/references/Pow_broadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx b/test/input_models/references/RNNBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx rename to test/input_models/references/RNNBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx b/test/input_models/references/RNNBidirectional.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx rename to test/input_models/references/RNNBidirectional.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx b/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx similarity index 100% rename 
from src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx rename to test/input_models/references/RNNBidirectionalBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx b/test/input_models/references/RNNDefaults.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx rename to test/input_models/references/RNNDefaults.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx b/test/input_models/references/RNNSeqLength.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx rename to test/input_models/references/RNNSeqLength.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx b/test/input_models/references/RNNSequence.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx rename to test/input_models/references/RNNSequence.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx b/test/input_models/references/RNNSequenceBatchwise.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx rename to test/input_models/references/RNNSequenceBatchwise.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx b/test/input_models/references/RangeFloat.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx rename to test/input_models/references/RangeFloat.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx b/test/input_models/references/RangeInt.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx rename to test/input_models/references/RangeInt.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx 
b/test/input_models/references/Reciprocal.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx rename to test/input_models/references/Reciprocal.ref.hxx diff --git a/test/input_models/references/ReduceL2.ref.hxx b/test/input_models/references/ReduceL2.ref.hxx new file mode 100644 index 0000000..079b68b --- /dev/null +++ b/test/input_models/references/ReduceL2.ref.hxx @@ -0,0 +1,9 @@ +namespace ReduceL2_ExpectedOutput{ + // Input [1,2,3] = {5,2,3,5,5,4}, ReduceL2 over axis=1, keepdims=0 → shape [1,3] + // col0: sqrt(5^2+5^2)=sqrt(50), col1: sqrt(2^2+5^2)=sqrt(29), col2: sqrt(3^2+4^2)=5 + float output[] = { + 7.0710678118654755f, + 5.385164807134504f, + 5.0f + }; +} // namespace ReduceL2_ExpectedOutput diff --git a/test/input_models/references/ReduceMax.ref.hxx b/test/input_models/references/ReduceMax.ref.hxx new file mode 100644 index 0000000..b986048 --- /dev/null +++ b/test/input_models/references/ReduceMax.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT +#pragma once +namespace ReduceMax_ExpectedOutput { + static float output[3] = {5.00000000f, 5.00000000f, 4.00000000f}; +} // namespace ReduceMax_ExpectedOutput diff --git a/test/input_models/references/ReduceMax_axis0.ref.hxx b/test/input_models/references/ReduceMax_axis0.ref.hxx new file mode 100644 index 0000000..1d16a5a --- /dev/null +++ b/test/input_models/references/ReduceMax_axis0.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT +#pragma once +namespace ReduceMax_axis0_ExpectedOutput { + static float output[4] = {0.30471709f, -0.85304391f, 0.87939799f, 0.94056469f}; +} // namespace ReduceMax_axis0_ExpectedOutput diff --git a/test/input_models/references/ReduceMax_mid.ref.hxx b/test/input_models/references/ReduceMax_mid.ref.hxx new file mode 100644 index 0000000..4d88eaf --- /dev/null +++ b/test/input_models/references/ReduceMax_mid.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by 
ReduceMaxModelGenerator.py — DO NOT EDIT +#pragma once +namespace ReduceMax_mid_ExpectedOutput { + static float output[8] = {0.36875078f, 1.12724125f, 1.22254133f, -0.04992591f, 0.41273260f, 0.43082100f, 2.14164758f, 1.12897229f}; +} // namespace ReduceMax_mid_ExpectedOutput diff --git a/src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx b/test/input_models/references/ReduceMean.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx rename to test/input_models/references/ReduceMean.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx b/test/input_models/references/ReduceProd.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx rename to test/input_models/references/ReduceProd.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Shape.ref.hxx b/test/input_models/references/Shape.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Shape.ref.hxx rename to test/input_models/references/Shape.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice.ref.hxx b/test/input_models/references/Slice.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice.ref.hxx rename to test/input_models/references/Slice.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx b/test/input_models/references/Slice_Default_Axis.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx rename to test/input_models/references/Slice_Default_Axis.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx b/test/input_models/references/Slice_Default_Steps.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx rename to test/input_models/references/Slice_Default_Steps.ref.hxx diff 
--git a/src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx b/test/input_models/references/Slice_Neg.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx rename to test/input_models/references/Slice_Neg.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx b/test/input_models/references/Softmax1d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx rename to test/input_models/references/Softmax1d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx b/test/input_models/references/Softmax2d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx rename to test/input_models/references/Softmax2d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx b/test/input_models/references/Softmax3d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx rename to test/input_models/references/Softmax3d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx b/test/input_models/references/Softmax4d.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx rename to test/input_models/references/Softmax4d.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx b/test/input_models/references/Sqrt.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx rename to test/input_models/references/Sqrt.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Sub.ref.hxx b/test/input_models/references/Sub.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Sub.ref.hxx rename to test/input_models/references/Sub.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx 
b/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx rename to test/input_models/references/SumMultidirectionalBroadcast.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Tanh.ref.hxx b/test/input_models/references/Tanh.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Tanh.ref.hxx rename to test/input_models/references/Tanh.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx b/test/input_models/references/Tile5D.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx rename to test/input_models/references/Tile5D.ref.hxx diff --git a/src/SOFIE_core/test/input_models/references/TopK.ref.hxx b/test/input_models/references/TopK.ref.hxx similarity index 100% rename from src/SOFIE_core/test/input_models/references/TopK.ref.hxx rename to test/input_models/references/TopK.ref.hxx diff --git a/test/input_models/references/Trilu_3D.ref.hxx b/test/input_models/references/Trilu_3D.ref.hxx new file mode 100644 index 0000000..ac94083 --- /dev/null +++ b/test/input_models/references/Trilu_3D.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_3D_ExpectedOutput { + static float outputs[24] = {0.16275306f, 0.58622235f, 0.71122658f, 0.79334724f, 0.00000000f, -0.46235180f, 0.85797590f, -0.19130433f, 0.00000000f, 0.00000000f, -0.91945231f, 0.49716073f, 0.14242573f, 0.69048536f, -0.42725265f, 0.15853970f, 0.00000000f, -0.30934653f, 0.45677525f, -0.66192591f, 0.00000000f, 0.00000000f, -1.19583964f, 0.48697248f}; +} // namespace Trilu_3D_ExpectedOutput diff --git a/test/input_models/references/Trilu_3D_input.ref.hxx b/test/input_models/references/Trilu_3D_input.ref.hxx new file mode 100644 index 0000000..257553f --- /dev/null +++ 
b/test/input_models/references/Trilu_3D_input.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_3D_Input { + static float data[24] = {0.16275306f, 0.58622235f, 0.71122658f, 0.79334724f, -0.34872508f, -0.46235180f, 0.85797590f, -0.19130433f, -1.27568626f, -1.13328719f, -0.91945231f, 0.49716073f, 0.14242573f, 0.69048536f, -0.42725265f, 0.15853970f, 0.62559038f, -0.30934653f, 0.45677525f, -0.66192591f, -0.36305386f, -0.38173789f, -1.19583964f, 0.48697248f}; +} // namespace Trilu_3D_Input diff --git a/test/input_models/references/Trilu_k2.ref.hxx b/test/input_models/references/Trilu_k2.ref.hxx new file mode 100644 index 0000000..b8828e0 --- /dev/null +++ b/test/input_models/references/Trilu_k2.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_k2_ExpectedOutput { + static float outputs[15] = {0.00000000f, 0.00000000f, 0.61597943f, 1.12897229f, -0.11394746f, 0.00000000f, 0.00000000f, 0.00000000f, 0.74325418f, 0.54315424f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 0.87142879f}; +} // namespace Trilu_k2_ExpectedOutput diff --git a/test/input_models/references/Trilu_k2_input.ref.hxx b/test/input_models/references/Trilu_k2_input.ref.hxx new file mode 100644 index 0000000..7078b76 --- /dev/null +++ b/test/input_models/references/Trilu_k2_input.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_k2_Input { + static float data[15] = {-0.51224273f, -0.81377274f, 0.61597943f, 1.12897229f, -0.11394746f, -0.84015650f, -0.82448119f, 0.65059280f, 0.74325418f, 0.54315424f, -0.66550970f, 0.23216133f, 0.11668581f, 0.21868859f, 0.87142879f}; +} // namespace Trilu_k2_Input diff --git a/test/input_models/references/Trilu_kn1.ref.hxx b/test/input_models/references/Trilu_kn1.ref.hxx new file mode 100644 index 0000000..52fc33c --- /dev/null +++ b/test/input_models/references/Trilu_kn1.ref.hxx @@ 
-0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_kn1_ExpectedOutput { + static float outputs[15] = {0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, -1.45715582f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 1.49494135f, -0.86583114f, 0.00000000f, 0.00000000f, 0.00000000f}; +} // namespace Trilu_kn1_ExpectedOutput diff --git a/test/input_models/references/Trilu_kn1_input.ref.hxx b/test/input_models/references/Trilu_kn1_input.ref.hxx new file mode 100644 index 0000000..4852760 --- /dev/null +++ b/test/input_models/references/Trilu_kn1_input.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_kn1_Input { + static float data[15] = {0.22359554f, 0.67891353f, 0.06757907f, 0.28911939f, 0.63128823f, -1.45715582f, -0.31967121f, -0.47037265f, -0.63887787f, -0.27514225f, 1.49494135f, -0.86583114f, 0.96827835f, -1.68286979f, -0.33488503f}; +} // namespace Trilu_kn1_Input diff --git a/test/input_models/references/Trilu_lower.ref.hxx b/test/input_models/references/Trilu_lower.ref.hxx new file mode 100644 index 0000000..dcacea7 --- /dev/null +++ b/test/input_models/references/Trilu_lower.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_lower_ExpectedOutput { + static float outputs[16] = {0.36875078f, 0.00000000f, 0.00000000f, 0.00000000f, -0.18486236f, -0.68092954f, 0.00000000f, 0.00000000f, -0.42832783f, -0.35213354f, 0.53230917f, 0.00000000f, 0.41273260f, 0.43082100f, 2.14164758f, -0.40641502f}; +} // namespace Trilu_lower_ExpectedOutput diff --git a/test/input_models/references/Trilu_lower_input.ref.hxx b/test/input_models/references/Trilu_lower_input.ref.hxx new file mode 100644 index 0000000..ca44378 --- /dev/null +++ b/test/input_models/references/Trilu_lower_input.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace 
Trilu_lower_Input { + static float data[16] = {0.36875078f, -0.95888263f, 0.87845027f, -0.04992591f, -0.18486236f, -0.68092954f, 1.22254133f, -0.15452948f, -0.42832783f, -0.35213354f, 0.53230917f, 0.36544406f, 0.41273260f, 0.43082100f, 2.14164758f, -0.40641502f}; +} // namespace Trilu_lower_Input diff --git a/test/input_models/references/Trilu_upper.ref.hxx b/test/input_models/references/Trilu_upper.ref.hxx new file mode 100644 index 0000000..920861b --- /dev/null +++ b/test/input_models/references/Trilu_upper.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_upper_ExpectedOutput { + static float outputs[16] = {0.30471709f, -1.03998411f, 0.75045121f, 0.94056469f, 0.00000000f, -1.30217946f, 0.12784040f, -0.31624261f, 0.00000000f, 0.00000000f, 0.87939799f, 0.77779192f, 0.00000000f, 0.00000000f, 0.00000000f, -0.85929245f}; +} // namespace Trilu_upper_ExpectedOutput diff --git a/test/input_models/references/Trilu_upper_input.ref.hxx b/test/input_models/references/Trilu_upper_input.ref.hxx new file mode 100644 index 0000000..cae440f --- /dev/null +++ b/test/input_models/references/Trilu_upper_input.ref.hxx @@ -0,0 +1,5 @@ +// Auto-generated by TriluModelGenerator.py — DO NOT EDIT +#pragma once +namespace Trilu_upper_Input { + static float data[16] = {0.30471709f, -1.03998411f, 0.75045121f, 0.94056469f, -1.95103514f, -1.30217946f, 0.12784040f, -0.31624261f, -0.01680116f, -0.85304391f, 0.87939799f, 0.77779192f, 0.06603070f, 1.12724125f, 0.46750933f, -0.85929245f}; +} // namespace Trilu_upper_Input diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt new file mode 100644 index 0000000..36cfc55 --- /dev/null +++ b/utils/CMakeLists.txt @@ -0,0 +1,15 @@ +add_library(utils INTERFACE) + +target_include_directories(utils INTERFACE + $ + $ +) + +install(TARGETS utils + EXPORT SOFIETargets +) + +install( + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) diff --git 
a/utils/SOFIE/RTensor.hxx b/utils/SOFIE/RTensor.hxx new file mode 100644 index 0000000..db82dc9 --- /dev/null +++ b/utils/SOFIE/RTensor.hxx @@ -0,0 +1,628 @@ +#ifndef SOFIE_RTENSOR +#define SOFIE_RTENSOR + +#include +#include // std::size_t +#include +#include // std::runtime_error +#include // std::stringstream +#include // std::shared_ptr +#include // std::is_convertible +#include // std::reverse +#include // std::random_access_iterator_tag + +namespace SOFIE { + +/// Memory layout type +enum class MemoryLayout : uint8_t { + RowMajor = 0x01, + ColumnMajor = 0x02 +}; + +namespace Internal { + +/// \brief Get size of tensor from shape vector +/// \param[in] shape Shape vector +/// \return Size of contiguous memory +template +inline std::size_t GetSizeFromShape(const T &shape) +{ + if (shape.size() == 0) + return 0; + std::size_t size = 1; + for (auto &s : shape) + size *= s; + return size; +} + +/// \brief Compute strides from shape vector. +/// \param[in] shape Shape vector +/// \param[in] layout Memory layout +/// \return Size of contiguous memory +/// +/// This information is needed for the multi-dimensional indexing. 
See here: +/// https://en.wikipedia.org/wiki/Row-_and_column-major_order +/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html +template +inline std::vector ComputeStridesFromShape(const T &shape, MemoryLayout layout) +{ + const auto size = shape.size(); + T strides(size); + if (layout == MemoryLayout::RowMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[size - 1 - i] = 1; + } else { + strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1]; + } + } + } else if (layout == MemoryLayout::ColumnMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[i] = 1; + } else { + strides[i] = strides[i - 1] * shape[i - 1]; + } + } + } else { + std::stringstream ss; + ss << "Memory layout type is not valid for calculating strides."; + throw std::runtime_error(ss.str()); + } + return strides; +} + +/// \brief Compute indices from global index +/// \param[in] shape Shape vector +/// \param[in] idx Global index +/// \param[in] layout Memory layout +/// \return Indice vector +template +inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx) +{ + const auto size = shape.size(); + auto strides = ComputeStridesFromShape(shape, layout); + T indices(size); + auto r = idx; + for (std::size_t i = 0; i < size; i++) { + indices[i] = int(r / strides[i]); + r = r % strides[i]; + } + return indices; +} + +/// \brief Compute global index from indices +/// \param[in] strides Strides vector +/// \param[in] idx Indice vector +/// \return Global index +template +inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx) +{ + std::size_t globalIndex = 0; + const auto size = idx.size(); + for (std::size_t i = 0; i < size; i++) { + globalIndex += strides[size - 1 - i] * idx[size - 1 - i]; + } + return globalIndex; +} + +/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible +template 
+struct and_types : std::true_type { +}; + +template +struct and_types : std::integral_constant()> { +}; + +/// \brief Copy slice of a tensor recursively from here to there +/// \param[in] here Source tensor +/// \param[in] there Target tensor (slice of source tensor) +/// \param[in] mins Minimum of indices for each dimension +/// \param[in] maxs Maximum of indices for each dimension +/// \param[in] idx Current indices +/// \param[in] active Active index needed to stop the recursion +/// +/// Copy the content of a slice of a tensor from source to target. This is done +/// by recursively iterating over the ranges of the slice for each dimension. +template +void RecursiveCopy(const T &here, T &there, + const std::vector &mins, const std::vector &maxs, + std::vector idx, std::size_t active) +{ + const auto size = idx.size(); + for (std::size_t i = mins[active]; i < maxs[active]; i++) { + idx[active] = i; + if (active == size - 1) { + auto idxThere = idx; + for (std::size_t j = 0; j < size; j++) { + idxThere[j] -= mins[j]; + } + there(idxThere) = here(idx); + } else { + Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1); + } + } +} + +} // namespace SOFIE::Internal + +/// \class SOFIE::RTensor +/// \brief RTensor is a container with contiguous memory and shape information. +/// \tparam T Data-type of the tensor +/// +/// An RTensor is a vector-like container, which has additional shape information. +/// The elements of the multi-dimensional container can be accessed by their +/// indices in a coherent way without taking care about the one-dimensional memory +/// layout of the contiguous storage. This also allows to manipulate the shape +/// of the container without moving the actual elements in memory. Another feature +/// is that an RTensor can own the underlying contiguous memory but can also represent +/// only a view on existing data without owning it. 
+template > +class RTensor { +public: + // Typedefs + using Value_t = V; + using Shape_t = std::vector; + using Index_t = Shape_t; + using Slice_t = std::vector; + using Container_t = C; + +private: + Shape_t fShape; + Shape_t fStrides; + std::size_t fSize; + MemoryLayout fLayout; + Value_t *fData; + std::shared_ptr fContainer; + +protected: + void ReshapeInplace(const Shape_t &shape); + +public: + // Constructors + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + } + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] strides Strides vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + } + + /// \brief Construct a tensor owning externally provided data + /// \param[in] container Shared pointer to data container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(std::shared_ptr container, Shape_t shape, + MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fContainer(container) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fData = std::data(*fContainer); + } + + /// \brief Construct a tensor owning data initialized with new container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + 
RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout) + { + // TODO: Document how data pointer is determined using STL iterator interface. + // TODO: Sanitize given container type with type traits + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fContainer = std::make_shared(fSize); + fData = std::data(*fContainer); + } + + // Access elements + Value_t &operator()(const Index_t &idx); + const Value_t &operator() (const Index_t &idx) const; + template Value_t &operator()(Idx... idx); + template const Value_t &operator() (Idx... idx) const; + + // Access properties + std::size_t GetSize() const { return fSize; } + const Shape_t &GetShape() const { return fShape; } + const Shape_t &GetStrides() const { return fStrides; } + Value_t *GetData() { return fData; } + const Value_t *GetData() const { return fData; } + std::shared_ptr GetContainer() { return fContainer; } + const std::shared_ptr GetContainer() const { return fContainer; } + MemoryLayout GetMemoryLayout() const { return fLayout; } + bool IsView() const { return fContainer == nullptr; } + bool IsOwner() const { return !IsView(); } + + // Copy + RTensor Copy(MemoryLayout layout = MemoryLayout::RowMajor) const; + + // Transformations + RTensor Transpose() const; + RTensor Squeeze() const; + RTensor ExpandDims(int idx) const; + RTensor Reshape(const Shape_t &shape) const; + RTensor Resize(const Shape_t &shape); + RTensor Slice(const Slice_t &slice); + + // Iterator class + class Iterator { + private: + RTensor& fTensor; + Index_t::value_type fGlobalIndex; + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Value_t; + using difference_type = std::ptrdiff_t; + using pointer = Value_t *; + using reference = Value_t &; + + Iterator(RTensor& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {} + Iterator& operator++() { fGlobalIndex++; return *this; } 
+ Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; } + Iterator& operator--() { fGlobalIndex--; return *this; } + Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; } + Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); } + Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); } + difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); } + Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; } + Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; } + Value_t& operator*() + { + auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex); + return fTensor(idx); + } + bool operator==(const Iterator& rhs) const + { + if (fGlobalIndex == rhs.GetGlobalIndex()) return true; + return false; + } + bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }; + bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); } + bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); } + bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); } + bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); } + typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; }; + }; + + // Iterator interface + // TODO: Document that the iterator always iterates following the physical memory layout. 
+ Iterator begin() noexcept { + return Iterator(*this, 0); + } + Iterator end() noexcept { + return Iterator(*this, fSize); + } +}; + +/// \brief Reshape tensor in place +/// \param[in] shape Shape vector +/// Reshape tensor without changing the overall size +template +inline void RTensor::ReshapeInplace(const Shape_t &shape) +{ + const auto size = Internal::GetSizeFromShape(shape); + if (size != fSize) { + std::stringstream ss; + ss << "Cannot reshape tensor with size " << fSize << " into shape { "; + for (std::size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) { + ss << shape[i] << ", "; + } else { + ss << shape[i] << " }."; + } + } + throw std::runtime_error(ss.str()); + } + + // Compute new strides from shape + auto strides = Internal::ComputeStridesFromShape(shape, fLayout); + fShape = shape; + fStrides = strides; +} + + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline Value_t &RTensor::operator()(const Index_t &idx) +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline const Value_t &RTensor::operator() (const Index_t &idx) const +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +Value_t &RTensor::operator()(Idx... idx) +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +const Value_t &RTensor::operator() (Idx... 
idx) const +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Transpose +/// \returns New RTensor +/// The tensor is transposed by inverting the associated memory layout from row- +/// major to column-major and vice versa. Therefore, the underlying data is not +/// touched. +template +inline RTensor RTensor::Transpose() const +{ + MemoryLayout layout; + // Transpose by inverting memory layout + if (fLayout == MemoryLayout::RowMajor) { + layout = MemoryLayout::ColumnMajor; + } else if (fLayout == MemoryLayout::ColumnMajor) { + layout = MemoryLayout::RowMajor; + } else { + throw std::runtime_error("Memory layout is not known."); + } + + // Create copy of container + RTensor x(fData, fShape, fStrides, layout); + + // Reverse shape + std::reverse(x.fShape.begin(), x.fShape.end()); + + // Reverse strides + std::reverse(x.fStrides.begin(), x.fStrides.end()); + + return x; +} + +/// \brief Squeeze dimensions +/// \returns New RTensor +/// Squeeze removes the dimensions of size one from the shape. +template +inline RTensor RTensor::Squeeze() const +{ + // Remove dimensions of one and associated strides + Shape_t shape; + Shape_t strides; + for (std::size_t i = 0; i < fShape.size(); i++) { + if (fShape[i] != 1) { + shape.emplace_back(fShape[i]); + strides.emplace_back(fStrides[i]); + } + } + + // If all dimensions are 1, we need to keep one. + // This does not apply if the inital shape is already empty. Then, return + // the empty shape. + if (shape.size() == 0 && fShape.size() != 0) { + shape.emplace_back(1); + strides.emplace_back(1); + } + + // Create copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Expand dimensions +/// \param[in] idx Index in shape vector where dimension is added +/// \returns New RTensor +/// Inserts a dimension of one into the shape. 
+template +inline RTensor RTensor::ExpandDims(int idx) const +{ + // Compose shape vector with additional dimensions and adjust strides + const int len = fShape.size(); + auto shape = fShape; + auto strides = fStrides; + if (idx < 0) { + idx = len + 1 + idx; + } + if (idx < 0) { + throw std::runtime_error("Given negative index is invalid."); + } + else if (idx > len) { + throw std::runtime_error("Given index is invalid."); + } + shape.insert(shape.begin() + idx, 1); + strides = Internal::ComputeStridesFromShape(shape, fLayout); + + // Create view copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Reshape tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Reshape tensor without changing the overall size +template +inline RTensor RTensor::Reshape(const Shape_t &shape) const +{ + // Create copy, replace and return + RTensor x(*this); + x.ReshapeInplace(shape); + return x; +} + +/// \brief Resize tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Resize tensor into new shape +template +inline RTensor RTensor::Resize(const Shape_t &shape) +{ + // Create new tensor with the specified shape + RTensor x(shape, fLayout); + + // Copying contents from previous tensor + size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize(); + std::copy(this->GetData(), this->GetData() + n, x.GetData() ); + + return x; +} + +/// \brief Create a slice of the tensor +/// \param[in] slice Slice vector +/// \returns New RTensor +/// A slice is a subset of the tensor defined by a vector of pairs of indices. 
+template +inline RTensor RTensor::Slice(const Slice_t &slice) +{ + // Sanitize size of slice + const auto sliceSize = slice.size(); + const auto shapeSize = fShape.size(); + if (sliceSize != shapeSize) { + std::stringstream ss; + ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ")."; + throw std::runtime_error(ss.str()); + } + + // Sanitize slice indices + // TODO: Sanitize slice indices + /* + for (std::size_t i = 0; i < sliceSize; i++) { + } + */ + + // Convert -1 in slice to proper pair of indices + // TODO + + // Recompute shape and size + Shape_t shape(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + shape[i] = slice[i][1] - slice[i][0]; + } + auto size = Internal::GetSizeFromShape(shape); + + // Determine first element contributing to the slice and get the data pointer + Value_t *data; + Shape_t idx(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + idx[i] = slice[i][0]; + } + data = &operator()(idx); + + // Create copy and modify properties + RTensor x(*this); + x.fData = data; + x.fShape = shape; + x.fSize = size; + + // Squeeze tensor and return + return x.Squeeze(); +} + +/// Copy RTensor to new object +/// \param[in] layout Memory layout of the new RTensor +/// \returns New RTensor +/// The operation copies all elements of the current RTensor to a new RTensor +/// with the given layout contiguous in memory. Note that this copies by default +/// to a row major memory layout. 
+template +inline RTensor RTensor::Copy(MemoryLayout layout) const +{ + // Create new tensor with zeros owning the memory + RTensor r(fShape, layout); + + // Copy over the elements from this tensor + const auto mins = Shape_t(fShape.size()); + const auto maxs = fShape; + auto idx = mins; + Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0); + + return r; +} + +/// \brief Pretty printing +/// \param[in] os Output stream +/// \param[in] x RTensor +/// \return Modified output stream +template +std::ostream &operator<<(std::ostream &os, RTensor &x) +{ + const auto shapeSize = x.GetShape().size(); + if (shapeSize == 1) { + os << "{ "; + const auto size = x.GetSize(); + for (std::size_t i = 0; i < size; i++) { + os << x({i}); + if (i != size - 1) + os << ", "; + } + os << " }"; + } else if (shapeSize == 2) { + os << "{"; + const auto shape = x.GetShape(); + for (std::size_t i = 0; i < shape[0]; i++) { + os << " { "; + for (std::size_t j = 0; j < shape[1]; j++) { + os << x({i, j}); + if (j < shape[1] - 1) { + os << ", "; + } else { + os << " "; + } + } + os << "}"; + } + os << " }"; + } else { + os << "{ printing not yet implemented for this rank }"; + } + return os; +} + +} // namespace SOFIE + +namespace cling { +template +std::string printValue(SOFIE::RTensor *x) +{ + std::stringstream ss; + ss << *x; + return ss.str(); +} +} // namespace cling + +#endif // SOFIE_RTENSOR