diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..47452ee
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,158 @@
+name: Benchmark
+
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+    inputs:
+      warmup:
+        description: "Warmup iterations"
+        default: "5"
+      iterations:
+        description: "Timed iterations"
+        default: "100"
+
+concurrency:
+  group: benchmark-${{ github.ref }}
+  cancel-in-progress: true  # a newer run for the same ref cancels the one in flight
+
+env:
+  LCG_VIEW:     /cvmfs/sft.cern.ch/lcg/views/LCG_106a/x86_64-el9-gcc13-opt  # its setup.sh is sourced when present
+  CUDA_CVMFS:   /cvmfs/sft.cern.ch/lcg/contrib/cuda/12.4/x86_64-el9  # first location probed for nvcc
+  CUDA_ARCH:    "90"  # passed as CMAKE_CUDA_ARCHITECTURES and SOFIE_BENCHMARK_CUDA_ARCH
+  BUILD_TYPE:   Release
+  BENCH_WARMUP: ${{ github.event.inputs.warmup || '5' }}  # falls back to '5' when no dispatch input is set
+  BENCH_ITERS:  ${{ github.event.inputs.iterations || '100' }}  # falls back to '100' when no dispatch input is set
+  DEPS_CACHE:   /tmp/sofie-cmake-deps  # FETCHCONTENT_BASE_DIR of both builds; path handled by actions/cache
+
+jobs:
+  benchmark:
+    name: Benchmark Comparison (H100)
+    runs-on: ml4ep-h100
+    container: registry.cern.ch/ngt/lxplus-like:9
+    timeout-minutes: 120
+
+    steps:
+      - name: GPU check
+        run: nvidia-smi
+
+      - name: Setup build environment
+        run: |
+          set -euo pipefail
+          # Toolchain: source the CVMFS LCG view when available, otherwise install packages with dnf.
+          if [ -f "${{ env.LCG_VIEW }}/setup.sh" ]; then
+            set +u; source "${{ env.LCG_VIEW }}/setup.sh"; set -u  # -u is suspended only while sourcing
+          else
+            echo "LCG view not found — installing from dnf"
+            dnf install -y epel-release
+            dnf install -y cmake ninja-build gcc-c++ python3 git \
+                           protobuf-devel openblas-devel
+          fi
+          # CUDA: probe CVMFS, then /usr/local/cuda, else install toolkit 12.4 from the NVIDIA repo.
+          if [ -x "${{ env.CUDA_CVMFS }}/bin/nvcc" ]; then
+            echo "${{ env.CUDA_CVMFS }}/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=${{ env.CUDA_CVMFS }}" >> "$GITHUB_ENV"
+          elif [ -x /usr/local/cuda/bin/nvcc ]; then
+            echo "/usr/local/cuda/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV"
+          else
+            dnf config-manager --add-repo \
+              https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+            dnf install -y cuda-compiler-12-4 cuda-cudart-devel-12-4 cuda-libraries-devel-12-4
+            echo "/usr/local/cuda-12.4/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=/usr/local/cuda-12.4" >> "$GITHUB_ENV"
+          fi
+          # Persist this shell's environment (including anything setup.sh changed) for the following steps.
+          echo "PATH=$PATH"                               >> "$GITHUB_ENV"  # NOTE(review): nvcc is added via GITHUB_PATH above while PATH is overridden here — confirm both apply in this container job
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"    >> "$GITHUB_ENV"
+          echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}" >> "$GITHUB_ENV"
+
+      - name: Checkout PR branch
+        uses: actions/checkout@v4
+        with:
+          path: sofie-pr
+
+      - name: Checkout main branch
+        if: github.event_name == 'pull_request'
+        uses: actions/checkout@v4
+        with:
+          ref: main
+          path: sofie-main
+
+      - name: Cache FetchContent dependencies
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.DEPS_CACHE }}
+          key: cmake-deps-bench-${{ hashFiles('sofie-pr/benchmark/CMakeLists.txt') }}
+          restore-keys: cmake-deps-bench-
+
+      - name: Configure PR build
+        run: |
+          cmake -B sofie-pr/build -S sofie-pr \
+            -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \
+            -DSOFIE_WITH_ROOT=OFF \
+            -DSOFIE_BENCHMARK=ON \
+            -DSOFIE_BENCHMARK_BACKEND=CUDA \
+            "-DSOFIE_BENCHMARK_CUDA_ARCH=${{ env.CUDA_ARCH }}" \
+            "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \
+            "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}"
+
+      - name: Build PR benchmark
+        run: cmake --build sofie-pr/build --target sofie_benchmark -j$(nproc)
+
+      - name: Run PR benchmark
+        working-directory: sofie-pr/build/benchmark
+        run: |
+          # pipefail: otherwise tee's exit status masks a crashed benchmark; env vars keep inputs out of the script text.
+          set -o pipefail
+          ./sofie_benchmark -w "$BENCH_WARMUP" -n "$BENCH_ITERS" \
+            | tee benchmark_pr.txt
+
+      - name: Configure main build
+        if: github.event_name == 'pull_request'
+        run: |
+          cmake -B sofie-main/build -S sofie-main \
+            -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \
+            -DSOFIE_WITH_ROOT=OFF \
+            -DSOFIE_BENCHMARK=ON \
+            -DSOFIE_BENCHMARK_BACKEND=CUDA \
+            "-DSOFIE_BENCHMARK_CUDA_ARCH=${{ env.CUDA_ARCH }}" \
+            "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \
+            "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}"
+
+      - name: Build main benchmark
+        if: github.event_name == 'pull_request'
+        run: cmake --build sofie-main/build --target sofie_benchmark -j$(nproc)
+
+      - name: Run main benchmark
+        if: github.event_name == 'pull_request'
+        working-directory: sofie-main/build/benchmark
+        run: |
+          # pipefail: otherwise tee's exit status masks a crashed benchmark; env vars keep inputs out of the script text.
+          set -o pipefail
+          ./sofie_benchmark -w "$BENCH_WARMUP" -n "$BENCH_ITERS" \
+            | tee benchmark_main.txt
+
+      - name: Summarise PR vs main
+        if: github.event_name == 'pull_request'
+        run: |
+          {
+            echo "### Benchmark comparison: PR vs main"
+            echo ""
+            echo '```'
+            echo "── PR ──────────────────────────────────────────────────────────────"
+            cat sofie-pr/build/benchmark/benchmark_pr.txt
+            echo ""
+            echo "── main ────────────────────────────────────────────────────────────"
+            cat sofie-main/build/benchmark/benchmark_main.txt
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: benchmark-results-${{ github.run_id }}
+          path: |
+            sofie-pr/build/benchmark/benchmark_pr.txt
+            sofie-main/build/benchmark/benchmark_main.txt
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..475b782
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,113 @@
+name: Unit Tests
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+concurrency:
+  group: tests-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  LCG_VIEW:   /cvmfs/sft.cern.ch/lcg/views/LCG_106a/x86_64-el9-gcc13-opt
+  CUDA_CVMFS: /cvmfs/sft.cern.ch/lcg/contrib/cuda/12.4/x86_64-el9
+  CUDA_ARCH:  "90"
+  BUILD_TYPE: Release
+  DEPS_CACHE: /tmp/sofie-cmake-deps
+
+jobs:
+  gpu-tests:
+    name: GPU Unit Tests (NVIDIA/H100)
+    runs-on: ml4ep-h100
+    container: registry.cern.ch/ngt/lxplus-like:9
+    timeout-minutes: 60
+
+    steps:
+      - name: GPU check
+        run: nvidia-smi
+
+      - name: Setup build environment
+        run: |
+          set -euo pipefail
+
+          # LCG view (cmake, gcc-13, protobuf, openblas)
+          if [ -f "${{ env.LCG_VIEW }}/setup.sh" ]; then
+            set +u; source "${{ env.LCG_VIEW }}/setup.sh"; set -u
+          else
+            echo "LCG view not found — installing from dnf"
+            dnf install -y epel-release
+            dnf install -y cmake ninja-build gcc-c++ python3 git \
+                           protobuf-devel openblas-devel
+          fi
+
+          # CUDA toolkit (nvcc + headers)
+          if [ -x "${{ env.CUDA_CVMFS }}/bin/nvcc" ]; then
+            echo "${{ env.CUDA_CVMFS }}/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=${{ env.CUDA_CVMFS }}" >> "$GITHUB_ENV"
+          elif [ -x /usr/local/cuda/bin/nvcc ]; then
+            echo "/usr/local/cuda/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=/usr/local/cuda" >> "$GITHUB_ENV"
+          else
+            echo "nvcc not found — installing CUDA toolkit from NVIDIA repo"
+            dnf config-manager --add-repo \
+              https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+            dnf install -y cuda-compiler-12-4 cuda-cudart-devel-12-4 cuda-libraries-devel-12-4
+            echo "/usr/local/cuda-12.4/bin" >> "$GITHUB_PATH"
+            echo "CUDA_HOME=/usr/local/cuda-12.4" >> "$GITHUB_ENV"
+          fi
+
+          # GTest: try the gtest-devel, then googletest-devel packages; otherwise build v1.14.0 into /usr/local
+          dnf install -y gtest-devel 2>/dev/null || \
+          dnf install -y googletest-devel 2>/dev/null || (
+            cd /tmp  # the directory change stays inside this ( ... ) subshell
+            git clone --depth 1 -b v1.14.0 https://github.com/google/googletest.git
+            cmake -B gtest-build -S googletest \
+                  -DCMAKE_INSTALL_PREFIX=/usr/local -DBUILD_SHARED_LIBS=ON
+            cmake --build gtest-build -j$(nproc)
+            cmake --install gtest-build
+          )
+          # Persist this shell's PATH / LD_LIBRARY_PATH / CMAKE_PREFIX_PATH for the following steps.
+          echo "PATH=$PATH"                               >> "$GITHUB_ENV"
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}"    >> "$GITHUB_ENV"
+          echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}" >> "$GITHUB_ENV"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache FetchContent dependencies
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.DEPS_CACHE }}
+          key: cmake-deps-tests-${{ hashFiles('test/CMakeLists.txt') }}
+          restore-keys: cmake-deps-tests-
+
+      - name: Configure
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_BUILD_TYPE=${{ env.BUILD_TYPE }} \
+            -DSOFIE_WITH_ROOT=OFF \
+            -Dtesting=ON \
+            -DENABLE_ALPAKA_TESTS=ON \
+            -DALPAKA_BACKEND=cuda \
+            "-DCMAKE_CUDA_ARCHITECTURES=${{ env.CUDA_ARCH }}" \
+            "-DFETCHCONTENT_BASE_DIR=${{ env.DEPS_CACHE }}"
+
+      - name: Build tests
+        run: |
+          cmake --build build \
+            --target TestCustomModelsFromONNXForAlpakaCuda \
+            -j$(nproc)
+
+      - name: Run tests  # NOTE(review): only TestCustomModelsFromONNXForAlpakaCuda is built by the previous step — confirm ctest registers no other executables
+        working-directory: build
+        run: ctest --output-on-failure -j1  # -j1: tests run one at a time; failing tests print their output
+
+      - name: Upload test log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-log-${{ github.run_id }}
+          path: build/Testing/Temporary/LastTest.log
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c9bd226..16f4782 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,9 +4,16 @@ project(Sofie
     DESCRIPTION "SOFIE"
     LANGUAGES CXX)
 
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
 find_package(BLAS)
 if(NOT BLAS_FOUND)
-  message(WARNING "BLAS not found: TMVA-SOFIE will not be fully tested")
+  message(WARNING "BLAS not found: sofie will not be fully tested")
 endif()
 
 message(STATUS "Looking for Protobuf")
@@ -17,49 +24,101 @@ if(NOT Protobuf_FOUND)
 endif()
 if(NOT Protobuf_FOUND)
   if(fail-on-missing)
-    message(FATAL_ERROR "Protobuf libraries not found and they are required (tmva-sofie option enabled)")
+    message(FATAL_ERROR "Protobuf libraries not found and they are required (sofie option enabled)")
   else()
-    message(STATUS "Protobuf not found. Switching off tmva-sofie option")
+    message(STATUS "Protobuf not found. Switching off sofie option")
     message(FATAL_ERROR "SOFIE cannot be installed without Protobuf")
   endif()
 else()
   if(Protobuf_VERSION LESS 3.0)
     if(fail-on-missing)
-      message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (tmva-sofie option enabled)")
+      message(FATAL_ERROR "Protobuf libraries found but is less than the version required (3.0) (sofie option enabled)")
     else()
-      message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off tmva-sofie option")
+      message(STATUS "Protobuf found but its version is not high enough (>3.0). Switching off sofie option")
       message(FATAL_ERROR "SOFIE cannot be installed without Protobuf")
     endif()
   else()
     if(NOT TARGET protobuf::protoc)
       if(fail-on-missing)
-        message(FATAL_ERROR "Protobuf compiler not found (tmva-sofie option enabled)")
+        message(FATAL_ERROR "Protobuf compiler not found (sofie option enabled)")
       else()
-        message(STATUS "Protobuf compiler not found. Switching off tmva-sofie option")
+        message(STATUS "Protobuf compiler not found. Switching off sofie option")
         message(FATAL_ERROR "SOFIE cannot be installed without Protobuf")
       endif()
     endif()
   endif()
 endif()
 
-find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree)
-include(${ROOT_USE_FILE})
+option(SOFIE_WITH_ROOT "Enable ROOT support (required for .root weight files and ROOT serialization)" OFF)
+
+if(SOFIE_WITH_ROOT)
+  find_package(ROOT REQUIRED COMPONENTS Core TMVA Tree)  # REQUIRED: configuration stops here when ROOT is missing
+  if(ROOT_FOUND)
+    include(${ROOT_USE_FILE})
+    message(STATUS "ROOT found: enabling ROOT support in SOFIE")
+  else()  # not reachable while the find_package() call above is REQUIRED
+    message(FATAL_ERROR "SOFIE_WITH_ROOT is ON but ROOT was not found")
+  endif()
+else()  # default path: the option above is OFF unless the user enables it
+  message(STATUS "Building SOFIE without ROOT support (SOFIE_WITH_ROOT=OFF)")
+endif()
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-set(CMAKE_INSTALL_BINDIR "bin" CACHE PATH "user executables (bin)")
-set(CMAKE_INSTALL_INCLUDEDIR "include" CACHE PATH "header files")
-set(CMAKE_INSTALL_LIBDIR "lib" CACHE PATH "libraries")
 
 if(ccache)
   set(CMAKE_C_COMPILER_LAUNCHER ccache)
   set(CMAKE_CXX_COMPILER_LAUNCHER ccache)
 endif()
 
+option(testing "Build and run tests" OFF)
 if(testing)
-    find_package(GTest REQUIRED)
+  find_package(GTest REQUIRED)
   enable_testing()
 endif()
 
-include(cmake/modules/RoottestMacros.cmake)
+option(SOFIE_BENCHMARK "Build the SOFIE CUDA benchmark toolkit" OFF)
+
+if(SOFIE_WITH_ROOT AND ROOT_FOUND)
+  include(cmake/modules/RoottestMacros.cmake)
+else()
+  include(cmake/modules/SofieTestMacros.cmake)
+endif()
+
+add_subdirectory(utils)
+add_subdirectory(core)
+add_subdirectory(parsers)
+
+if(testing)
+  add_subdirectory(test)
+endif()
+
+if(SOFIE_BENCHMARK)
+  add_subdirectory(benchmark)
+endif()
+
+# ── Install cmake package config files ──────────────────────────────────────
+
+configure_package_config_file(  # expands cmake/SOFIEConfig.cmake.in into the build tree
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/SOFIEConfig.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE  # same directory the install() rules below use
+)
+
+write_basic_package_version_file(  # generates SOFIEConfigVersion.cmake next to the config file
+  ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake
+  VERSION ${PROJECT_VERSION}  # NOTE(review): empty unless project() declares a VERSION — confirm
+  COMPATIBILITY AnyNewerVersion  # an installed version equal to or newer than the requested one is accepted
+)
+
+install(
+  EXPORT SOFIETargets
+  FILE SOFIETargets.cmake
+  NAMESPACE SOFIE::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE
+)
 
-add_subdirectory(src)
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfig.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/SOFIEConfigVersion.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/SOFIE
+)
diff --git a/README.md b/README.md
index 97902f8..5c4042b 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,259 @@
 # SOFIE
-This is an experimental standalone version of SOFIE - a tool for Fast ML Inference within ROOT - the scientific data analysis framework.
 
-Since SOFIE is a part of ROOT and therefore needs to be built altogether, it takes quite a long time in its development and testing. This standalone version allows you to just build SOFIE with the pre-built binaries of ROOT- making the entire development process way faster.
+This is an experimental standalone version of **SOFIE** — a tool for Fast ML Inference
+within [ROOT](https://root.cern), the scientific data analysis framework.
 
+This standalone version is developed specifically for implementing and evaluating inference on
+**heterogeneous architectures** (CUDA GPUs, AMD GPUs via HIP/ROCm, CPUs) using the
+[Alpaka](https://github.com/alpaka-group/alpaka) portability layer.
+
+---
 
 ## Installation
 
-1. Getting a ROOT binary.  
-Download a pre-built binary of ROOT based on your architecture from [here](https://root.cern/install/).
+### Prerequisites
+
+- CMake ≥ 3.16 (≥ 3.18 for the benchmark toolkit)
+- C++20-capable compiler (GCC ≥ 11, Clang ≥ 14)
+- [Protocol Buffers](https://protobuf.dev/) ≥ 3.0 (for ONNX model parsing)
+- *(Optional)* ROOT ≥ 6.28 — only needed if using `.root` weight files or ROOT-based
+  serialization (`-DSOFIE_WITH_ROOT=ON`)
+- *(Optional for GPU testing/benchmarking)* CUDA Toolkit ≥ 11.8
+
+### 1. Clone and build
 
-2. Build standalone SOFIE
 ```bash
 git clone https://github.com/sanjibansg/SOFIE.git
 cd SOFIE
 mkdir build && cd build
-cmake -Dtesting=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
-cmake --build . --target install -j10
+cmake -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+cmake --build . --target install -j$(nproc)
+```
+
+ROOT support is off by default. To enable it (requires a ROOT installation):
+
+```bash
+cmake -DSOFIE_WITH_ROOT=ON -DCMAKE_INSTALL_PREFIX=../install ..
 ```
-The commands above should build the SOFIE standalone. To include it within the ROOT binary and run altogether, we need to source the shared libraries for `SOFIE_core` and `SOFIE_parsers`. Within the SOFIE repository we may call
+
+### 2. Source the environment (ROOT-integrated workflow only)
+
+If you need the SOFIE libraries to be accessible from within a ROOT session:
+
+```bash
+# Example — adjust the ROOT tarball name to match your download
+source root_v6.36.02.Linux-ubuntu24.04-x86_64-gcc13.3/root/bin/thisroot.sh
+source setup.sh   # adds SOFIE_core and SOFIE_parsers to LD_LIBRARY_PATH
+```
+
+This step is **not required** when building without ROOT
+(`-DSOFIE_WITH_ROOT=OFF`).
+
+---
+
+## Testing
+
+Unit and integration tests are enabled with `-Dtesting=ON` and require
+[GoogleTest](https://github.com/google/googletest).
+
+### CPU / default tests
+
+```bash
+cmake -Dtesting=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+cmake --build . -j$(nproc)
+ctest --output-on-failure
+```
+
+### GPU tests (Alpaka/CUDA)
+
+Alpaka-based GPU tests compile SOFIE-generated inference code as CUDA and verify
+correctness against reference outputs.  They require the CUDA Toolkit and a
+compatible NVIDIA GPU.
+
+```bash
+cmake -Dtesting=ON \
+      -DENABLE_ALPAKA_TESTS=ON \
+      -DALPAKA_BACKEND=cuda \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
+cmake --build . -j$(nproc)
+ctest --output-on-failure
+```
+
+| CMake flag | Default | Description |
+|---|---|---|
+| `-Dtesting=ON` | `OFF` | Enable the test suite |
+| `-DENABLE_ALPAKA_TESTS=ON` | `OFF` | Enable Alpaka GPU tests |
+| `-DALPAKA_BACKEND=<val>` | `cuda` | Alpaka backend: `cuda`, `hip`, `cpu`, `sycl` |
+
+The test executable is `TestCustomModelsFromONNXForAlpakaCuda`.  ONNX model files
+used as test inputs are located in `core/test/input_models/`.  Models with symbolic
+(dynamic) input dimensions are specialised by the emitter before testing.
+
+---
+
+## Benchmarking
+
+The benchmark toolkit (`benchmark/`) measures **inference latency and throughput** for
+ONNX models compiled by SOFIE and executed via Alpaka.  It supports an optional
+side-by-side comparison with **ONNX Runtime GPU**.
+
+### Supported backends
+
+| Backend | CMake value | Status |
+|---------|-------------|--------|
+| NVIDIA CUDA | `CUDA` (default) | Supported |
+| AMD HIP/ROCm | `HIP` | Planned |
+
+### Quick start
+
+```bash
+# Place .onnx models in benchmark/models/ first; set CUDA_ARCH for your GPU (e.g. 86 for RTX 30xx, 80 for A100)
+cmake -B build \
+      -DSOFIE_BENCHMARK=ON \
+      -DSOFIE_BENCHMARK_BACKEND=CUDA \
+      -DSOFIE_BENCHMARK_CUDA_ARCH=86 \
+      /path/to/SOFIE
+cmake --build build --target sofie_benchmark -j$(nproc)
+cd build/benchmark && ./sofie_benchmark
+```
+
+For a full reference of benchmark CMake flags, runtime options, the large-input
+cluster benchmark, and instructions for adding new backends, see
+[benchmark/README.md](benchmark/README.md).
+
+### Profiling
+
+Add `-DSOFIE_BENCHMARK_PROFILE=ON` to enable **per-operator GPU timing** and a
+**CPU/GPU memory breakdown** printed after each model's throughput line.
 
 ```bash
-source setup.sh
+cmake -B build \
+      -DSOFIE_BENCHMARK=ON \
+      -DSOFIE_BENCHMARK_PROFILE=ON \
+      /path/to/SOFIE
+cmake --build build --target sofie_benchmark -j$(nproc)
+cd build/benchmark && ./sofie_benchmark
+```
+
+> Profiling inserts `alpaka::wait(queue)` after each operator, which serialises
+> GPU execution.  Use a non-profile build for peak-throughput numbers.
+
+Profiling can also be enabled on a per-model basis outside the benchmark by
+passing `Options::kProfile` at code-generation time (see
+[Profiling in user code](#profiling-in-user-code) below).
+
+---
+
+## GPU Architecture Support
+
+SOFIE generates Alpaka-based inference code that is portable across GPU
+architectures:
+
+- **NVIDIA CUDA** — select the SM architecture with
+  `-DSOFIE_BENCHMARK_CUDA_ARCH=<sm>` (e.g. `75` for Turing, `86` for Ampere,
+  `90` for Hopper).
+- **AMD HIP/ROCm** — the Alpaka backend tag (`alpaka::TagGpuHipRt`) and the
+  `SOFIE_BACKEND_HIP` compile-time define are already wired in
+  `benchmark/src/BenchmarkBackend.hxx`; full build-system integration is in
+  progress.
+- **CPU** — a serial CPU Alpaka backend (`alpaka::TagCpuSerial`) is available as a
+  fallback for debugging and portability testing.
+
+---
 
+## Project Structure
+
+```
+SOFIE/
+├── core/           # Core SOFIE library (RModel, operators, code generators)
+│   └── test/       # Unit/integration tests
+├── parsers/        # ONNX → RModel parser
+├── benchmark/      # Latency / throughput benchmark toolkit
+│   ├── models/     # Place .onnx benchmark models here
+│   └── src/        # CMake-configured source templates
+├── utils/          # Utility targets
+└── cmake/          # CMake modules and config templates
 ```
-Now ROOT should also access the SOFIE libraries while it runs. This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull.
 
+---
+
+## Profiling in user code
+
+Both the CPU and GPU code generators accept `Options::kProfile` to embed
+per-operator timing and memory reporting directly in the generated session struct.
+
+### CPU inference
+
+```cpp
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/RModelParser_ONNX.hxx"
+
+SOFIE::RModelParser_ONNX parser;
+SOFIE::RModel model = parser.Parse("my_model.onnx");
+
+// Generate with profiling enabled
+model.Generate(SOFIE::Options::kProfile);
+model.OutputGenerated("MyModel.hxx");
+```
+
+The generated `Session` struct gains:
+
+| Method | Description |
+|--------|-------------|
+| `PrintProfilingResults(bool order=true)` | Per-operator mean ± stderr (µs), sorted by avg time |
+| `ResetProfilingResults()` | Clear accumulated timing data |
+| `GetOpAvgTime()` | `std::map<std::string, double>` of averages |
+| `GetOpVariance()` | `std::map<std::string, double>` of variances |
+
+```cpp
+#include "MyModel.hxx"
+SOFIE_MyModel::Session session("MyModel.dat");
+
+// Warmup
+for (int i = 0; i < 10; ++i) session.infer(input);
+session.ResetProfilingResults();
+
+// Timed runs
+for (int i = 0; i < 100; ++i) session.infer(input);
+session.PrintProfilingResults();
+```
+
+### GPU inference (Alpaka/CUDA)
+
+```cpp
+model.GenerateGPU_ALPAKA(SOFIE::Options::kProfile);
+model.OutputGenerated("MyModel_GPU_ALPAKA.hxx");
+```
+
+The generated GPU `Session` additionally provides:
+
+| Method | Description |
+|--------|-------------|
+| `PrintProfilingResults(bool order=true)` | Per-operator GPU wall-clock time (µs) with `alpaka::wait` sync |
+| `ResetProfilingResults()` | Clear accumulated timing data |
+| `GetOpAvgTime()` | `std::map<std::string, double>` of averages |
+| `PrintMemoryInfo()` | CPU/GPU memory breakdown (computed at code-gen time) |
+
+```cpp
+#include "MyModel_GPU_ALPAKA.hxx"
+SOFIE_MyModel::Session<AccTag> session("MyModel_GPU_ALPAKA.dat");
+
+for (int i = 0; i < 10; ++i) session.infer(input_d);  // warmup
+session.ResetProfilingResults();
+
+for (int i = 0; i < 100; ++i) session.infer(input_d);  // timed
+session.PrintProfilingResults();
+session.PrintMemoryInfo();
+```
+
+> **Timing accuracy:** `alpaka::wait(queue)` is called after each operator kernel
+> so the wall-clock measurement captures actual GPU execution time.  This
+> disables kernel pipelining; use a non-profile build for throughput measurement.
+
+---
 
-    
 ## Inspiration
-The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit).
+
+The standalone version of SOFIE is developed with inspiration from the standalone
+version of RooFit developed by Jonas Rembser, which can be found
+[here](https://github.com/guitargeek/roofit).
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 0000000..5ba3a9d
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,721 @@
+cmake_minimum_required(VERSION 3.18)
+include(FetchContent)
+
+################################################################################
+# SOFIE Alpaka Benchmark Toolkit
+#
+# Usage:
+#   cmake -Bbuild -DSOFIE_BENCHMARK=ON .
+#   cmake --build build --target sofie_benchmark
+#   cd build/benchmark && ./sofie_benchmark [options]
+#
+# To also benchmark with ONNX Runtime GPU:
+#   cmake -Bbuild -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_ORT=ON \
+#         [-DONNXRUNTIME_ROOT=/usr/local/onnxruntime] .
+#   ./sofie_benchmark --onnxruntime
+#
+# Place .onnx models in benchmark/models/ and re-run cmake to register them.
+################################################################################
+
+option(SOFIE_BENCHMARK_ORT
+    "Also benchmark ONNX Runtime GPU alongside SOFIE (requires ORT ≥ 1.18)"
+    OFF)
+
+option(SOFIE_BENCHMARK_PROFILE
+    "Enable per-operator GPU profiling (per-op timing + memory breakdown). \
+Mutually exclusive with throughput benchmarking — rebuild without this flag \
+to measure throughput."
+    OFF)
+
+################################################################################
+# Backend / architecture selection
+#
+# SOFIE_BENCHMARK_BACKEND applies to both throughput benchmarking and profiling.
+# Currently only CUDA is supported; setting any other value is a hard error.
+# When AMD GPU (HIP/ROCm) support is added, this option will accept "HIP".
+################################################################################
+
+set(SOFIE_BENCHMARK_BACKEND "CUDA" CACHE STRING
+    "Target accelerator backend for SOFIE benchmark and profiling (currently only CUDA)")
+set_property(CACHE SOFIE_BENCHMARK_BACKEND PROPERTY STRINGS CUDA)
+
+string(TOUPPER "${SOFIE_BENCHMARK_BACKEND}" _bench_backend)
+
+if(NOT _bench_backend STREQUAL "CUDA")
+    message(FATAL_ERROR
+        "SOFIE Benchmark: SOFIE_BENCHMARK_BACKEND='${SOFIE_BENCHMARK_BACKEND}' is not "
+        "supported.  Only 'CUDA' is currently implemented.  "
+        "AMD GPU (HIP/ROCm) support is planned for a future release.")
+endif()
+
+# Compile-time defines propagated to every benchmark translation unit.
+# These drive BenchmarkBackend.hxx type aliases and SOFIE_BENCH_DEVICE_SYNC().
+set(_SOFIE_BENCH_ALPAKA_DEFINE ALPAKA_ACC_GPU_CUDA_ENABLED)
+set(_SOFIE_BENCH_BACKEND_DEFINE SOFIE_BACKEND_CUDA)
+
+if(SOFIE_BENCHMARK_PROFILE)
+    message(STATUS "SOFIE Benchmark: profiling ENABLED  "
+                   "(backend = ${SOFIE_BENCHMARK_BACKEND}, throughput benchmarking disabled)")
+else()
+    message(STATUS "SOFIE Benchmark: backend = ${SOFIE_BENCHMARK_BACKEND}")
+endif()
+
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../core/inc
+    ${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc
+)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+################################################################################
+# Discover models
+################################################################################
+
+file(GLOB BENCHMARK_ONNX_MODELS
+    "${CMAKE_CURRENT_SOURCE_DIR}/models/*.onnx")
+
+# Exclude the base/template models used by specialize_models.py to generate
+# fixed-size variants (e.g. gnn_h32_k2_n100_e500.onnx).  The base models
+# have symbolic dim_param input dimensions that SOFIE cannot resolve during
+# code generation:
+#   • GetTensorShape() throws "unspecified dimension parameter" for input
+#     tensors whose shapes were not concretised by ONNX shape inference.
+#   • GetTensorShape() throws "is a dynamic tensor" when an operator tries
+#     to read a concrete size from an intermediate tensor that remained
+#     dynamic because its input shapes were unknown.
+# The concrete specialised variants produced by specialize_models.py are
+# benchmarked instead.
+list(FILTER BENCHMARK_ONNX_MODELS EXCLUDE REGEX
+    "/(gnn_h32_k2|gnn_h64_k4|punet_h32_k2_heads4_layers2|punet_h64_k4_heads4_layers2|transformer_d32_h2_L6_ff32)\\.onnx$")
+
+# Exclude models that exceed GPU memory (cudaErrorMemoryAllocation) or trigger
+# a cuBLAS EXECUTION_FAILED error (status 13) on the available hardware
+# (RTX 2070 SUPER, 8 GB VRAM).  They stay on disk for reference and are
+# compiled into sofie_benchmark_large (cluster target) instead.
+list(FILTER BENCHMARK_ONNX_MODELS EXCLUDE REGEX
+    "/(punet_h32_k2_heads4_layers2_n10000_e50000\
+|punet_h32_k2_heads4_layers2_n1000_e5000\
+|punet_h32_k2_heads4_layers2_n3000_e15000\
+|punet_h32_k2_heads4_layers2_n30000_e150000\
+|punet_h32_k2_heads4_layers2_n100000_e500000\
+|punet_h64_k4_heads4_layers2_n10000_e50000\
+|punet_h64_k4_heads4_layers2_n100_e500\
+|punet_h64_k4_heads4_layers2_n300_e1500\
+|punet_h64_k4_heads4_layers2_n3000_e15000\
+|punet_h64_k4_heads4_layers2_n30000_e150000\
+|punet_h64_k4_heads4_layers2_n100000_e500000\
+|gnn_h32_k2_n30000_e150000\
+|gnn_h32_k2_n100000_e500000\
+|gnn_h64_k4_n30000_e150000\
+|gnn_h64_k4_n100000_e500000\
+|transformer_L1000_B100\
+|transformer_L100_B100\
+|transformer_L8000_B1\
+|transformer_d32_h2_L6_ff32_n60_s60)\\.onnx$")
+
+if(NOT BENCHMARK_ONNX_MODELS)
+    message(STATUS
+        "SOFIE Benchmark: No .onnx models found in benchmark/models/. "
+        "Add ONNX models there and re-run cmake to enable benchmarking.")
+    return()
+endif()
+
+list(LENGTH BENCHMARK_ONNX_MODELS N_MODELS)
+message(STATUS "SOFIE Benchmark: Found ${N_MODELS} model(s) in benchmark/models/")
+
+################################################################################
+# Fetch sofieBLAS (moving `dev` branch, not pinned) and Alpaka (pinned commit), as in the test suite
+################################################################################
+
+FetchContent_Declare(
+    sofieBLAS
+    GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS
+    GIT_TAG        dev
+)
+FetchContent_MakeAvailable(sofieBLAS)
+
+FetchContent_Declare(
+    alpaka
+    GIT_REPOSITORY https://github.com/alpaka-group/alpaka
+    GIT_TAG        2fa91a34ed11b2076e474c5507d920e85cf9b79d
+)
+FetchContent_MakeAvailable(alpaka)
+
+################################################################################
+# Hardware toolkit setup — CUDA (the only supported backend for now)
+################################################################################
+
+enable_language(CUDA)
+find_package(CUDAToolkit REQUIRED)
+message(STATUS "SOFIE Benchmark: CUDA toolkit ${CUDAToolkit_VERSION}")
+
+################################################################################
+# Optional: ONNX Runtime GPU backend
+################################################################################
+
+set(SOFIE_ORT_FOUND FALSE)
+
+if(SOFIE_BENCHMARK_ORT)
+    # Use manual detection only — the installed ORT CMake config may reference
+    # a wrong lib path (e.g. lib64 vs lib) and raise a hard error even with QUIET.
+    # If ONNXRUNTIME_ROOT is provided, it is searched first; the fixed prefixes
+    # listed below are searched after it.  NO_DEFAULT_PATH keeps find_path() and
+    # find_library() from looking anywhere else; no find_package() is attempted.
+
+    set(_ort_search_roots "")
+    if(DEFINED ONNXRUNTIME_ROOT)
+        list(APPEND _ort_search_roots "${ONNXRUNTIME_ROOT}")
+    endif()
+    list(APPEND _ort_search_roots
+        /usr/local/onnxruntime /usr/local /usr /opt)
+
+    # Manual header + library search (reliable, no broken cmake-config risk)
+    find_path(ONNXRUNTIME_INCLUDE_DIR
+        NAMES onnxruntime_cxx_api.h
+        PATHS ${_ort_search_roots}
+        PATH_SUFFIXES include include/onnxruntime
+        NO_DEFAULT_PATH)
+
+    find_library(ONNXRUNTIME_LIBRARY
+        NAMES onnxruntime
+        PATHS ${_ort_search_roots}
+        PATH_SUFFIXES lib lib64
+        NO_DEFAULT_PATH)
+
+    if(ONNXRUNTIME_INCLUDE_DIR AND ONNXRUNTIME_LIBRARY)
+        set(SOFIE_ORT_FOUND TRUE)
+        add_library(onnxruntime::onnxruntime SHARED IMPORTED)
+        set_target_properties(onnxruntime::onnxruntime PROPERTIES
+            IMPORTED_LOCATION             "${ONNXRUNTIME_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${ONNXRUNTIME_INCLUDE_DIR}")
+        set(SOFIE_ORT_TARGET onnxruntime::onnxruntime)
+        message(STATUS "SOFIE Benchmark: ONNX Runtime found — ${ONNXRUNTIME_LIBRARY}")
+        message(STATUS "SOFIE Benchmark: ORT headers  — ${ONNXRUNTIME_INCLUDE_DIR}")
+    else()
+        message(WARNING
+            "SOFIE Benchmark: SOFIE_BENCHMARK_ORT=ON but ONNX Runtime "
+            "not found.  Set -DONNXRUNTIME_ROOT=<path> or install ORT.  "
+            "ORT benchmarking will be disabled.")
+    endif()
+endif()
+
+if(SOFIE_BENCHMARK_ORT AND NOT SOFIE_ORT_FOUND)
+    message(STATUS "SOFIE Benchmark: ORT benchmarking disabled (library not found)")
+endif()
+
+################################################################################
+# Build per-model strings for configure_file
+################################################################################
+
+# C++ snippet template, instantiated once per model by the loops below:
+#   @1@ is replaced by the .onnx path, @2@ by the model name.
+# A trailing backslash continues the quoted argument without adding a newline;
+# the explicit \n escapes supply the line breaks of the generated code, and
+# \\n yields a literal "\n" escape inside the generated C++ string literals.
+set(_EMIT_BLOCK
+"try {\n\
+    EmitBenchmarkModel(\"@1@\", \"@2@\", outDir);\n\
+} catch (const std::exception &e) {\n\
+    std::cerr << \"[ERROR] @2@: \" << e.what() << \"\\n\";\n\
+    ++failures;\n\
+} catch (...) {\n\
+    std::cerr << \"[ERROR] @2@: unknown exception\\n\";\n\
+    ++failures;\n\
+}\n\
+")
+
+# Accumulators filled by the loop below.  The BENCHMARK_* strings hold generated
+# C++ text; presumably the .in templates configured later reference them as
+# @VAR@ placeholders (the templates are not part of this file — verify there).
+set(BENCHMARK_EMIT_CAPTURES        "")
+set(BENCHMARK_BENCH_HEADERS        "")
+set(BENCHMARK_FWD_DECLS            "")
+set(BENCHMARK_SINGLE_MODEL_CASES   "")
+set(BENCHMARK_SPAWN_CALLS          "")
+# File lists: emitter outputs (custom-command OUTPUT) and per-model CUDA sources.
+set(GENERATED_HEADERS              "")
+set(BENCHMARK_MODEL_CU_SRCS        "")
+
+foreach(ONNX_FILE ${BENCHMARK_ONNX_MODELS})
+    # NAME_WE drops the directory and the longest extension, i.e. everything
+    # from the first '.' in the file name onwards.
+    get_filename_component(MODEL_NAME "${ONNX_FILE}" NAME_WE)
+
+    # C++-safe identifier: every character outside [A-Za-z0-9] becomes '_'.
+    string(REGEX REPLACE "[^A-Za-z0-9]" "_" MODEL_CPPNAME "${MODEL_NAME}")
+
+    # The two headers the emitter is expected to write for this model.
+    set(GEN_HXX   "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_GPU_ALPAKA.hxx")
+    set(GEN_BENCH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.hxx")
+    list(APPEND GENERATED_HEADERS "${GEN_HXX}" "${GEN_BENCH}")
+
+    # Instantiate the try/catch emit snippet for this model.
+    string(REPLACE "@1@" "${ONNX_FILE}"  _emit_cap "${_EMIT_BLOCK}")
+    string(REPLACE "@2@" "${MODEL_NAME}" _emit_cap "${_emit_cap}")
+    string(APPEND BENCHMARK_EMIT_CAPTURES "${_emit_cap}")
+
+    string(APPEND BENCHMARK_BENCH_HEADERS
+        "#include \"${MODEL_NAME}_bench.hxx\"\n")
+
+    # Forward declaration for the main TU (function defined in per-model .cu)
+    string(APPEND BENCHMARK_FWD_DECLS
+        "void Benchmark_${MODEL_CPPNAME}(int warmup, int iterations, const std::string& weightsDir);\n")
+
+    # Single-model dispatch: one if-branch per model (used in --single-model mode)
+    # The ORT call is compiled in only when SOFIE_BENCHMARK_ORT is defined and
+    # runs only when run_ort is set at run time.
+    string(APPEND BENCHMARK_SINGLE_MODEL_CASES
+        "    if (model == \"${MODEL_CPPNAME}\") {\n"
+        "        Benchmark_${MODEL_CPPNAME}(warmup, iterations, weightsDir);\n"
+        "#ifdef SOFIE_BENCHMARK_ORT\n"
+        "        if (run_ort) BenchmarkORT_GPU(\"${ONNX_FILE}\", \"${MODEL_NAME}\", warmup, iterations);\n"
+        "#endif\n"
+        "        return 0;\n"
+        "    }\n"
+    )
+
+    # Subprocess spawn: run this binary as a child with --single-model <name>.
+    # Each child gets a fresh CUDA context so GPU memory from the previous
+    # model is completely freed before the next one allocates.
+    # NOTE(review): std::system() returns an implementation-defined status, so
+    # the %d printed below is not necessarily the child's exit code; argv[0]
+    # is also passed to the shell unquoted — confirm both are acceptable.
+    string(APPEND BENCHMARK_SPAWN_CALLS
+        "    {\n"
+        "        std::string cmd = std::string(argv[0])\n"
+        "            + \" --single-model ${MODEL_CPPNAME}\"\n"
+        "            + commonArgs;\n"
+        "        int rc = std::system(cmd.c_str());\n"
+        "        if (rc != 0) {\n"
+        "            std::fprintf(stderr, \"[ERROR] ${MODEL_NAME}: subprocess exited %d\\n\", rc);\n"
+        "            ++totalFailed;\n"
+        "        }\n"
+        "    }\n"
+    )
+
+    # Per-model compilation unit: include one bench.hxx → one .cu file
+    # configure_file runs inside the loop, so the template sees this
+    # iteration's variables (presumably MODEL_NAME / MODEL_CPPNAME — verify
+    # against ModelBench.cu.in).
+    set(_model_cu "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.cu")
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/src/ModelBench.cu.in"
+        "${_model_cu}"
+        @ONLY
+    )
+    list(APPEND BENCHMARK_MODEL_CU_SRCS "${_model_cu}")
+endforeach()
+
+################################################################################
+# Configure emitter and runner sources
+#
+# Both calls use @ONLY, so only @VAR@ references are substituted and any
+# ${...} text inside the templates is left untouched.
+################################################################################
+
+# Emitter source covering every model collected by the loop above.
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkEmitter.cxx.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx"
+    @ONLY
+)
+
+# Main runner: only main() + forward declarations (no model headers).
+# Each model is compiled in its own _bench.cu TU (see BENCHMARK_MODEL_CU_SRCS).
+set(RUNNER_SRC "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_main.cpp")
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkRunner.cxx.in"
+    "${RUNNER_SRC}"
+    @ONLY
+)
+
+################################################################################
+# Emitter executable
+#
+# Host-only C++ program built from the configured BenchmarkEmitter_all.cxx.
+# It is executed during the build to write the SOFIE headers.
+################################################################################
+
+add_executable(sofie_benchmark_emitter "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_all.cxx")
+
+# Defines SOFIE_BENCHMARK_PROFILE only when the CMake variable of that name is true.
+target_compile_definitions(sofie_benchmark_emitter
+    PRIVATE $<$<BOOL:${SOFIE_BENCHMARK_PROFILE}>:SOFIE_BENCHMARK_PROFILE>)
+
+# Two warning classes are silenced for this target.
+target_compile_options(sofie_benchmark_emitter
+    PRIVATE -Wno-unused-parameter -Wno-array-bounds)
+
+# SOFIE core and parser headers live next to this directory.
+target_include_directories(sofie_benchmark_emitter
+    PRIVATE
+        "${CMAKE_CURRENT_SOURCE_DIR}/../core/inc"
+        "${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc")
+
+target_link_libraries(sofie_benchmark_emitter
+    PRIVATE SOFIE_core SOFIE_parsers protobuf::libprotobuf)
+
+################################################################################
+# Custom command: run emitter → generate inference + benchmark headers
+#
+# A single command declares every generated header as OUTPUT, so a change to
+# the emitter or to any .onnx file re-runs the emitter for all models.  The
+# emitter receives the output directory as its only argument.
+# NOTE(review): the README also lists <Model>_GPU_ALPAKA.dat as emitter output;
+# those files are not declared as OUTPUT here.
+################################################################################
+
+add_custom_command(
+    OUTPUT  ${GENERATED_HEADERS}
+    COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+    COMMAND "$<TARGET_FILE:sofie_benchmark_emitter>" "${CMAKE_CURRENT_BINARY_DIR}"
+    DEPENDS sofie_benchmark_emitter ${BENCHMARK_ONNX_MODELS}
+    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    COMMENT "SOFIE Benchmark: generating headers for ${N_MODELS} model(s)..."
+    VERBATIM
+)
+
+# Named target so other targets can depend on the generated headers.
+add_custom_target(sofie_benchmark_headers
+    DEPENDS ${GENERATED_HEADERS}
+)
+
+################################################################################
+# Benchmark runner (compiled as .cu, same as the test suite)
+#
+# One executable built from the C++ main runner plus one CUDA source per model.
+################################################################################
+
+# Mark every per-model bench.cu as CUDA so nvcc processes the device kernels.
+set_source_files_properties(${BENCHMARK_MODEL_CU_SRCS} PROPERTIES LANGUAGE CUDA)
+
+# The main runner is plain C++ (no CUDA kernels, just calls forward-declared fns).
+set_source_files_properties("${RUNNER_SRC}" PROPERTIES LANGUAGE CXX)
+
+add_executable(sofie_benchmark "${RUNNER_SRC}" ${BENCHMARK_MODEL_CU_SRCS})
+
+# The emitter-generated headers must exist before this target's sources compile.
+add_dependencies(sofie_benchmark sofie_benchmark_headers)
+
+# alpaka_SOURCE_DIR / sofieblas_SOURCE_DIR are set by FetchContent_MakeAvailable
+# (FetchContent lower-cases the declared name for these variables).
+target_include_directories(sofie_benchmark PRIVATE
+    "${CMAKE_CURRENT_BINARY_DIR}"                 # generated headers live here
+    "${CMAKE_CURRENT_SOURCE_DIR}/src"             # ONNXRuntimeBenchmark.hxx
+    "${alpaka_SOURCE_DIR}/include"
+    "${sofieblas_SOURCE_DIR}/include"
+    "${CUDAToolkit_INCLUDE_DIRS}"
+)
+
+# Compile for a single architecture set by default (saves memory and time).
+# The previous "70;75;80;86;89;90" multi-arch list caused nvcc to OOM-kill when
+# processing all models.  Resolution order:
+#   1. SOFIE_BENCHMARK_CUDA_ARCH, e.g.  cmake -DSOFIE_BENCHMARK_CUDA_ARCH="75;86" .
+#   2. CMAKE_CUDA_ARCHITECTURES, when it is set and non-empty
+#   3. "75" as the last-resort fallback
+# NOTE(review): nothing here queries the installed GPU, so the default is only
+# "native" if CMAKE_CUDA_ARCHITECTURES was set that way by the caller.
+if(NOT DEFINED SOFIE_BENCHMARK_CUDA_ARCH OR SOFIE_BENCHMARK_CUDA_ARCH STREQUAL "")
+    if(CMAKE_CUDA_ARCHITECTURES AND NOT CMAKE_CUDA_ARCHITECTURES STREQUAL "")
+        set(SOFIE_BENCHMARK_CUDA_ARCH "${CMAKE_CUDA_ARCHITECTURES}")
+    else()
+        set(SOFIE_BENCHMARK_CUDA_ARCH "75")   # RTX 2070 SUPER default
+    endif()
+endif()
+message(STATUS "SOFIE Benchmark: CUDA architectures = ${SOFIE_BENCHMARK_CUDA_ARCH}")
+
+# Whole-program device compilation (no relocatable device code), C++20 for CUDA
+# sources, and the binary placed directly in this directory's build tree.
+set_target_properties(sofie_benchmark PROPERTIES
+    CUDA_SEPARABLE_COMPILATION OFF
+    CUDA_ARCHITECTURES          "${SOFIE_BENCHMARK_CUDA_ARCH}"
+    CUDA_STANDARD               20
+    CUDA_STANDARD_REQUIRED      ON
+    RUNTIME_OUTPUT_DIRECTORY    "${CMAKE_CURRENT_BINARY_DIR}"
+)
+
+# _SOFIE_BENCH_ALPAKA_DEFINE / _SOFIE_BENCH_BACKEND_DEFINE are set outside this
+# section (presumably by the backend selection earlier in the file — verify).
+# SOFIE_BENCHMARK_ORT is defined only when ORT was actually found, not merely
+# requested.
+target_compile_definitions(sofie_benchmark PRIVATE
+    ${_SOFIE_BENCH_ALPAKA_DEFINE}
+    ${_SOFIE_BENCH_BACKEND_DEFINE}
+    ALPAKA_HAS_STD_ATOMIC_REF
+    $<$<BOOL:${SOFIE_ORT_FOUND}>:SOFIE_BENCHMARK_ORT>
+    $<$<BOOL:${SOFIE_BENCHMARK_PROFILE}>:SOFIE_BENCHMARK_PROFILE>
+)
+
+# Options are split per source language: the first group applies to CUDA
+# sources only, the second to C++ sources only.
+target_compile_options(sofie_benchmark PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:
+        --extended-lambda
+        --expt-relaxed-constexpr
+        --use_fast_math
+        -O1
+        -Wno-deprecated-gpu-targets
+        # Suppress "variable was declared but never referenced" (#177-D).
+        # Generated Expand/Where kernels compute per-dimension indices for
+        # broadcast dimensions; when a dimension has size 1 the index variable
+        # is always 0 and goes unused.  Without this suppress the CUDA device
+        # compiler counts them against its error-budget and stops compilation.
+        -diag-suppress 177
+        # Cap device register usage at 64 per thread.
+        # NOTE(review): the earlier wording referred to a "large all-models
+        # TU"; models are now compiled one TU each (BENCHMARK_MODEL_CU_SRCS),
+        # so confirm this cap is still wanted.
+        --maxrregcount=64
+    >
+    $<$<COMPILE_LANGUAGE:CXX>:
+        -O2
+        -fPIC
+    >
+)
+
+# Shared CUDA runtime and cuBLAS libraries; the ORT target is linked only when
+# detection succeeded.
+target_link_libraries(sofie_benchmark PRIVATE
+    SOFIE_core
+    CUDA::cudart
+    CUDA::cublas
+    CUDA::cublasLt
+    $<$<BOOL:${SOFIE_ORT_FOUND}>:${SOFIE_ORT_TARGET}>
+)
+
+# Configure-time summary; the wording depends on whether ORT was located.
+if(SOFIE_ORT_FOUND)
+    message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured "
+                   "(${N_MODELS} model(s), CUDA backend + ORT-GPU)")
+else()
+    message(STATUS "SOFIE Benchmark: target 'sofie_benchmark' configured "
+                   "(${N_MODELS} model(s), CUDA backend; "
+                   "re-configure with -DSOFIE_BENCHMARK_ORT=ON for ORT comparison)")
+endif()
+
+# Convenience CTest entry, registered only when the `testing` variable is true.
+# Runs a short pass (5 warm-up, 20 timed iterations) from the build directory —
+# presumably so the runner's default weights directory "." resolves to the
+# emitter output (verify against the runner's option handling).
+if(testing)
+    add_test(
+        NAME    SofieBenchmark
+        COMMAND sofie_benchmark --warmup 5 --iterations 20
+        WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+endif()
+
+################################################################################
+# Large-input benchmark (cluster GPUs: A100 / H100 / MI300X, ≥40 GB VRAM)
+#
+# Opt-in via SOFIE_BENCHMARK_LARGE (default OFF).  The sofie_benchmark_large
+# target is only created when at least one of the listed .onnx files exists in
+# models/; otherwise a warning is printed and no target is added.
+#
+# Enables:
+#  • All models excluded from sofie_benchmark due to OOM on ≤8 GB GPUs
+#    (punet large, transformer_L1000_B100, transformer_L8000_B1, ...)
+#  • New very-large GNN / PUNet variants: n=30 000 / n=100 000 nodes
+#  • transformer_d32_h2_L6_ff32_n60_s60  (max sequence length for this arch)
+#
+# Usage:
+#   cmake -Bbuild . -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_LARGE=ON \
+#         [-DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=80]   # 80=A100, 90=H100
+#   cmake --build build --target sofie_benchmark_large
+################################################################################
+
+option(SOFIE_BENCHMARK_LARGE
+    "Build sofie_benchmark_large for cluster GPUs (A100/H100, ≥40 GB VRAM)"
+    OFF)
+
+if(SOFIE_BENCHMARK_LARGE)
+
+    # CUDA architecture for the cluster.  Override on the command line:
+    #   cmake ... -DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=90   # H100
+    set(SOFIE_BENCHMARK_LARGE_CUDA_ARCH "80" CACHE STRING
+        "CUDA SM architecture(s) for sofie_benchmark_large (e.g. 80=A100, 90=H100)")
+
+    # -------------------------------------------------------------------------
+    # Exact list of large-input models to include.
+    # These are either previously OOM-excluded or brand-new (n=30k / n=100k).
+    # Names are file names without the .onnx extension.
+    # -------------------------------------------------------------------------
+    set(LARGE_MODEL_NAMES
+        # ── PUNet h32/k2  (OOM + new) ──────────────────────────────────────
+        "punet_h32_k2_heads4_layers2_n1000_e5000"
+        "punet_h32_k2_heads4_layers2_n3000_e15000"
+        "punet_h32_k2_heads4_layers2_n10000_e50000"
+        "punet_h32_k2_heads4_layers2_n30000_e150000"
+        "punet_h32_k2_heads4_layers2_n100000_e500000"
+        # ── PUNet h64/k4  (OOM + new) ──────────────────────────────────────
+        "punet_h64_k4_heads4_layers2_n100_e500"
+        "punet_h64_k4_heads4_layers2_n300_e1500"
+        "punet_h64_k4_heads4_layers2_n1000_e5000"
+        "punet_h64_k4_heads4_layers2_n3000_e15000"
+        "punet_h64_k4_heads4_layers2_n10000_e50000"
+        "punet_h64_k4_heads4_layers2_n30000_e150000"
+        "punet_h64_k4_heads4_layers2_n100000_e500000"
+        # ── GNN h32/k2  (new large) ────────────────────────────────────────
+        "gnn_h32_k2_n30000_e150000"
+        "gnn_h32_k2_n100000_e500000"
+        # ── GNN h64/k4  (new large) ────────────────────────────────────────
+        "gnn_h64_k4_n30000_e150000"
+        "gnn_h64_k4_n100000_e500000"
+        # ── Transformers (OOM on 8 GB) ──────────────────────────────────────
+        "transformer_L100_B100"
+        "transformer_L1000_B100"
+        "transformer_L8000_B1"
+        # ── Transformer d32 (max sequence length for this architecture) ─────
+        "transformer_d32_h2_L6_ff32_n60_s60"
+    )
+
+    # Build a regex that matches exactly the listed file names.  The leading
+    # '/' anchors the alternation at the start of the file name (file(GLOB)
+    # returns full paths), so e.g. "old_transformer_L8000_B1.onnx" is not
+    # picked up by accident; "\\.onnx$" anchors the end.
+    list(JOIN LARGE_MODEL_NAMES "|" _large_alternation)
+    set(_large_regex "/(${_large_alternation})\\.onnx$")
+
+    # Glob every model, then keep only the ones named in LARGE_MODEL_NAMES.
+    file(GLOB _ALL_LARGE_ONNX "${CMAKE_CURRENT_SOURCE_DIR}/models/*.onnx")
+    list(FILTER _ALL_LARGE_ONNX INCLUDE REGEX "${_large_regex}")
+
+    if(NOT _ALL_LARGE_ONNX)
+        message(WARNING
+            "SOFIE_BENCHMARK_LARGE=ON but none of the expected large-input .onnx "
+            "files were found in benchmark/models/.  Run specialize_models.py "
+            "(with the extended GNN_VARIANTS list) first, then re-run cmake.")
+    else()
+        # N_LARGE counts the listed models that are actually present on disk.
+        list(LENGTH _ALL_LARGE_ONNX N_LARGE)
+        message(STATUS
+            "SOFIE Benchmark Large: ${N_LARGE} large-input model(s), "
+            "CUDA arch = ${SOFIE_BENCHMARK_LARGE_CUDA_ARCH}")
+
+        # ------------------------------------------------------------------
+        # Build per-model strings (same pattern as the small benchmark above)
+        # Results are collected in LARGE_* variables first and copied into the
+        # BENCHMARK_* names right before each configure_file call below.
+        # ------------------------------------------------------------------
+        set(LARGE_EMIT_CAPTURES      "")
+        set(LARGE_FWD_DECLS          "")
+        set(LARGE_SINGLE_MODEL_CASES "")
+        set(LARGE_SPAWN_CALLS        "")
+        set(LARGE_GENERATED_HEADERS  "")
+        set(LARGE_MODEL_CU_SRCS      "")
+
+        foreach(ONNX_FILE ${_ALL_LARGE_ONNX})
+            # Model name = file name without directory/extension; the C++ name
+            # replaces every character outside [A-Za-z0-9] with '_'.
+            get_filename_component(MODEL_NAME "${ONNX_FILE}" NAME_WE)
+            string(REGEX REPLACE "[^A-Za-z0-9]" "_" MODEL_CPPNAME "${MODEL_NAME}")
+
+            # Headers the large emitter is expected to write for this model.
+            set(GEN_HXX   "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_GPU_ALPAKA.hxx")
+            set(GEN_BENCH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.hxx")
+            list(APPEND LARGE_GENERATED_HEADERS "${GEN_HXX}" "${GEN_BENCH}")
+
+            # Instantiate the shared _EMIT_BLOCK snippet (@1@ = path, @2@ = name).
+            string(REPLACE "@1@" "${ONNX_FILE}"  _ec "${_EMIT_BLOCK}")
+            string(REPLACE "@2@" "${MODEL_NAME}" _ec "${_ec}")
+            string(APPEND LARGE_EMIT_CAPTURES "${_ec}")
+
+            # Forward declaration consumed by the runner's main TU.
+            string(APPEND LARGE_FWD_DECLS
+                "void Benchmark_${MODEL_CPPNAME}(int warmup, int iterations, const std::string& weightsDir);\n")
+
+            # One dispatch branch per model for --single-model mode.  The ORT
+            # branch stays in the text but sofie_benchmark_large never defines
+            # SOFIE_BENCHMARK_ORT, so it is compiled out for that target.
+            string(APPEND LARGE_SINGLE_MODEL_CASES
+                "    if (model == \"${MODEL_CPPNAME}\") {\n"
+                "        Benchmark_${MODEL_CPPNAME}(warmup, iterations, weightsDir);\n"
+                "#ifdef SOFIE_BENCHMARK_ORT\n"
+                "        if (run_ort) BenchmarkORT_GPU(\"${ONNX_FILE}\", \"${MODEL_NAME}\", warmup, iterations);\n"
+                "#endif\n"
+                "        return 0;\n"
+                "    }\n"
+            )
+
+            # Re-invoke the binary as a child process for this model.
+            string(APPEND LARGE_SPAWN_CALLS
+                "    {\n"
+                "        std::string cmd = std::string(argv[0])\n"
+                "            + \" --single-model ${MODEL_CPPNAME}\"\n"
+                "            + commonArgs;\n"
+                "        int rc = std::system(cmd.c_str());\n"
+                "        if (rc != 0) {\n"
+                "            std::fprintf(stderr, \"[ERROR] ${MODEL_NAME}: subprocess exited %d\\n\", rc);\n"
+                "            ++totalFailed;\n"
+                "        }\n"
+                "    }\n"
+            )
+
+            # Per-model CUDA source.  The path scheme is the same one the small
+            # benchmark loop uses.
+            # NOTE(review): a model present in both lists would be configured
+            # to the same file twice — confirm the two model sets are disjoint.
+            set(_model_cu "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_NAME}_bench.cu")
+            configure_file(
+                "${CMAKE_CURRENT_SOURCE_DIR}/src/ModelBench.cu.in"
+                "${_model_cu}"
+                @ONLY
+            )
+            list(APPEND LARGE_MODEL_CU_SRCS "${_model_cu}")
+        endforeach()
+
+        # ------------------------------------------------------------------
+        # Configure emitter + runner sources for the large benchmark
+        #
+        # The templates are shared with the small benchmark, so the
+        # BENCHMARK_* variables are overwritten with the LARGE_* values before
+        # each configure_file call.  From here on those variables hold the
+        # large-benchmark text.
+        # ------------------------------------------------------------------
+        set(BENCHMARK_EMIT_CAPTURES      "${LARGE_EMIT_CAPTURES}")
+        configure_file(
+            "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkEmitter.cxx.in"
+            "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_large.cxx"
+            @ONLY
+        )
+
+        set(BENCHMARK_FWD_DECLS          "${LARGE_FWD_DECLS}")
+        set(BENCHMARK_SINGLE_MODEL_CASES "${LARGE_SINGLE_MODEL_CASES}")
+        set(BENCHMARK_SPAWN_CALLS        "${LARGE_SPAWN_CALLS}")
+        configure_file(
+            "${CMAKE_CURRENT_SOURCE_DIR}/src/BenchmarkRunner.cxx.in"
+            "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp"
+            @ONLY
+        )
+
+        # ------------------------------------------------------------------
+        # sofie_benchmark_large_emitter
+        # Host-only C++ program built from BenchmarkEmitter_large.cxx; run at
+        # build time to generate the hxx + dat files for the large models.
+        # ------------------------------------------------------------------
+        add_executable(sofie_benchmark_large_emitter "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkEmitter_large.cxx")
+
+        # SOFIE_BENCHMARK_PROFILE is defined only when that CMake variable is true.
+        target_compile_definitions(sofie_benchmark_large_emitter
+            PRIVATE $<$<BOOL:${SOFIE_BENCHMARK_PROFILE}>:SOFIE_BENCHMARK_PROFILE>)
+
+        # Two warning classes are silenced for this target.
+        target_compile_options(sofie_benchmark_large_emitter
+            PRIVATE -Wno-unused-parameter -Wno-array-bounds)
+
+        # SOFIE core and parser headers live next to this directory.
+        target_include_directories(sofie_benchmark_large_emitter
+            PRIVATE
+                "${CMAKE_CURRENT_SOURCE_DIR}/../core/inc"
+                "${CMAKE_CURRENT_SOURCE_DIR}/../parsers/inc")
+
+        target_link_libraries(sofie_benchmark_large_emitter
+            PRIVATE SOFIE_core SOFIE_parsers protobuf::libprotobuf)
+
+        # ------------------------------------------------------------------
+        # Custom command: run large emitter → large hxx / dat files
+        #
+        # One command declares all large headers as OUTPUT, so a change to the
+        # emitter or to any listed .onnx file regenerates every large model.
+        # Only the .hxx files are declared as OUTPUT; the .dat files are not.
+        # ------------------------------------------------------------------
+        add_custom_command(
+            OUTPUT  ${LARGE_GENERATED_HEADERS}
+            COMMAND "${CMAKE_COMMAND}" -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+            COMMAND "$<TARGET_FILE:sofie_benchmark_large_emitter>" "${CMAKE_CURRENT_BINARY_DIR}"
+            DEPENDS sofie_benchmark_large_emitter ${_ALL_LARGE_ONNX}
+            WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+            COMMENT "SOFIE Benchmark Large: generating headers for ${N_LARGE} large model(s)..."
+            VERBATIM
+        )
+
+        # Named target so sofie_benchmark_large can depend on the generated headers.
+        add_custom_target(sofie_benchmark_large_headers
+            DEPENDS ${LARGE_GENERATED_HEADERS}
+        )
+
+        # ------------------------------------------------------------------
+        # sofie_benchmark_large — the cluster benchmark binary
+        #
+        # Mirrors the sofie_benchmark target with three differences visible
+        # below: CUDA sources use -O2 instead of -O1, the CUDA libraries are
+        # linked statically, and ORT is never enabled.
+        # ------------------------------------------------------------------
+        # Per-model sources are compiled as CUDA, the runner as plain C++.
+        set_source_files_properties(${LARGE_MODEL_CU_SRCS} PROPERTIES LANGUAGE CUDA)
+        set_source_files_properties(
+            "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp"
+            PROPERTIES LANGUAGE CXX)
+
+        add_executable(sofie_benchmark_large
+            "${CMAKE_CURRENT_BINARY_DIR}/BenchmarkRunner_large.cpp"
+            ${LARGE_MODEL_CU_SRCS}
+        )
+        # The generated headers must exist before this target's sources compile.
+        add_dependencies(sofie_benchmark_large sofie_benchmark_large_headers)
+
+        # Generated headers, benchmark sources, fetched alpaka / sofieBLAS
+        # headers, and the CUDA toolkit headers.
+        target_include_directories(sofie_benchmark_large PRIVATE
+            "${CMAKE_CURRENT_BINARY_DIR}"
+            "${CMAKE_CURRENT_SOURCE_DIR}/src"
+            "${alpaka_SOURCE_DIR}/include"
+            "${sofieblas_SOURCE_DIR}/include"
+            "${CUDAToolkit_INCLUDE_DIRS}"
+        )
+
+        # Architecture comes from the dedicated SOFIE_BENCHMARK_LARGE_CUDA_ARCH
+        # cache variable, independent of the small benchmark's setting.
+        set_target_properties(sofie_benchmark_large PROPERTIES
+            CUDA_SEPARABLE_COMPILATION OFF
+            CUDA_ARCHITECTURES          "${SOFIE_BENCHMARK_LARGE_CUDA_ARCH}"
+            CUDA_STANDARD               20
+            CUDA_STANDARD_REQUIRED      ON
+            RUNTIME_OUTPUT_DIRECTORY    "${CMAKE_CURRENT_BINARY_DIR}"
+        )
+
+        target_compile_definitions(sofie_benchmark_large PRIVATE
+            ${_SOFIE_BENCH_ALPAKA_DEFINE}
+            ${_SOFIE_BENCH_BACKEND_DEFINE}
+            ALPAKA_HAS_STD_ATOMIC_REF
+            $<$<BOOL:${SOFIE_BENCHMARK_PROFILE}>:SOFIE_BENCHMARK_PROFILE>
+            # ORT intentionally excluded: libonnxruntime has no static lib,
+            # so the binary would fail on pods that lack the CUDA toolkit.
+        )
+
+        # First group applies to CUDA sources only, second to C++ sources only.
+        target_compile_options(sofie_benchmark_large PRIVATE
+            $<$<COMPILE_LANGUAGE:CUDA>:
+                --extended-lambda
+                --expt-relaxed-constexpr
+                --use_fast_math
+                -O2
+                -Wno-deprecated-gpu-targets
+                -diag-suppress 177
+                --maxrregcount=64
+            >
+            $<$<COMPILE_LANGUAGE:CXX>:
+                -O2
+                -fPIC
+            >
+        )
+
+        target_link_libraries(sofie_benchmark_large PRIVATE
+            SOFIE_core
+            CUDA::cudart_static      # statically embed libcudart — no .so needed on the pod
+            CUDA::cublas_static      # statically embed libcublas
+            CUDA::cublasLt_static    # statically embed libcublasLt
+            CUDA::culibos            # required companion for static cublas
+            # ORT intentionally excluded — no static libonnxruntime available
+        )
+
+        # Configure-time summary for the large target.
+        message(STATUS
+            "SOFIE Benchmark Large: target 'sofie_benchmark_large' configured "
+            "(${N_LARGE} large model(s), sm${SOFIE_BENCHMARK_LARGE_CUDA_ARCH})")
+    endif() # _ALL_LARGE_ONNX
+endif() # SOFIE_BENCHMARK_LARGE
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..b06a2bf
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,234 @@
+# SOFIE Benchmark for Inference on Heterogeneous Architectures
+
+Measures **inference latency and throughput** for ONNX models compiled by SOFIE and
+executed via [Alpaka](https://github.com/alpaka-group/alpaka).  Optionally runs the
+same models through **ONNX Runtime GPU** for a side-by-side comparison.
+
+---
+
+## Supported Backends
+
+| Backend | CMake value | Status |
+|---------|-------------|--------|
+| NVIDIA CUDA | `CUDA` (default) | Supported |
+| AMD HIP/ROCm | `HIP` | Planned — not yet implemented |
+
+The target architecture is selected with `-DSOFIE_BENCHMARK_BACKEND=<value>` at
+configure time.  Specifying any value other than `CUDA` is a **hard CMake error** until
+the corresponding backend is implemented.
+
+The generated inference code and timing harness are backend-agnostic: they use
+`sofie_bench::AccTag`, `sofie_bench::Platform`, `sofie_bench::Queue`, and the
+`SOFIE_BENCH_DEVICE_SYNC()` macro defined in `src/BenchmarkBackend.hxx`.  Only the
+low-level toolkit (CUDA vs HIP) needs to be swapped to add a new backend.
+
+---
+
+## Quick Start
+
+### 1. Add your models
+
+```
+benchmark/models/
+  GNN_model.onnx
+  simple_transformer.onnx
+  resnet50.onnx
+  ...
+```
+
+Re-run CMake after adding or removing files (it globs `models/*.onnx`).
+
+### 2. Configure
+
+```bash
+# SOFIE inference only — CUDA backend (default)
+cmake -B build -DSOFIE_BENCHMARK=ON /path/to/SOFIE
+
+# Explicitly name the backend (useful for CI or future HIP support)
+cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_BACKEND=CUDA /path/to/SOFIE
+
+# With ONNX Runtime GPU comparison
+cmake -B build \
+  -DSOFIE_BENCHMARK=ON \
+  -DSOFIE_BENCHMARK_ORT=ON \
+  -DONNXRUNTIME_ROOT=/path/to/onnxruntime \
+  /path/to/SOFIE
+
+# Override the CUDA SM architecture (default: CMAKE_CUDA_ARCHITECTURES if set, otherwise 75)
+cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_CUDA_ARCH="86" /path/to/SOFIE
+```
+
+| CMake flag | Default | Description |
+|---|---|---|
+| `-DSOFIE_BENCHMARK=ON` | — | Enable the benchmark suite |
+| `-DSOFIE_BENCHMARK_BACKEND=<val>` | `CUDA` | Target accelerator backend |
+| `-DSOFIE_BENCHMARK_CUDA_ARCH=<sm>` | `CMAKE_CUDA_ARCHITECTURES` if set, else `75` | CUDA SM architecture(s), e.g. `86` for RTX 30xx, `80` for A100 |
+| `-DSOFIE_BENCHMARK_ORT=ON` | `OFF` | Also benchmark ONNX Runtime GPU |
+| `-DONNXRUNTIME_ROOT=<path>` | — | Path for ORT headers/library |
+| `-DSOFIE_BENCHMARK_PROFILE=ON` | `OFF` | Enable per-operator GPU profiling instead of throughput benchmarking (see [Profiling](#profiling)) |
+| `-DSOFIE_BENCHMARK_LARGE=ON` | `OFF` | Build `sofie_benchmark_large` for cluster GPUs (A100/H100, ≥40 GB VRAM) |
+| `-DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=<sm>` | `80` | CUDA SM architecture for the large-input benchmark |
+
+> **Tested with ONNX Runtime 1.22.0 GPU**
+> (`onnxruntime-linux-x64-gpu-1.22.0`).  The CMake config bundled with some ORT
+> installations may reference an incorrect `lib64/` path — this toolkit uses manual
+> header/library detection to avoid that.
+
+### 3. Build
+
+```bash
+cmake --build build --target sofie_benchmark -j$(nproc)
+```
+
+This automatically:
+1. Builds **`sofie_benchmark_emitter`** — parses each `.onnx` and emits:
+   - `<Model>_GPU_ALPAKA.hxx` — SOFIE Alpaka inference code
+   - `<Model>_GPU_ALPAKA.dat` — serialized weights
+   - `<Model>_bench.hxx`      — timing wrapper `Benchmark_<Model>()`
+2. Builds **`sofie_benchmark`** — compiles all generated code and links the timing loop.
+
+### 4. Run
+
+```bash
+cd build/benchmark
+
+# SOFIE only (no ORT needed at runtime)
+./sofie_benchmark
+
+# SOFIE + ONNX Runtime GPU comparison
+LD_LIBRARY_PATH=/path/to/onnxruntime/lib:$LD_LIBRARY_PATH \
+./sofie_benchmark --onnxruntime
+```
+
+---
+
+## Runtime Options
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--warmup,     -w <N>` | 10  | Warm-up iterations (not timed) |
+| `--iterations, -n <N>` | 100 | Timed iterations |
+| `--weights-dir <path>` | `.` | Directory containing `.dat` weight files |
+| `--onnxruntime, --ort` | off | Run ONNX Runtime GPU benchmark after each SOFIE model |
+| `--help,       -h`     |     | Print this help and exit |
+
+---
+
+## Large-input Benchmark (`sofie_benchmark_large`)
+
+For cluster GPUs (A100/H100/MI300X with ≥40 GB VRAM) a separate target is available
+that includes models excluded from the default benchmark due to memory constraints on
+consumer cards (≤8 GB):
+
+```bash
+cmake -B build -DSOFIE_BENCHMARK=ON -DSOFIE_BENCHMARK_LARGE=ON \
+      -DSOFIE_BENCHMARK_LARGE_CUDA_ARCH=80   # 80=A100, 90=H100
+cmake --build build --target sofie_benchmark_large -j$(nproc)
+```
+
+The large-benchmark binary links the CUDA runtime and cuBLAS statically so it can run
+on cluster nodes where the CUDA toolkit is not installed system-wide.
+
+---
+
+## Profiling
+
+Profiling and throughput benchmarking are **mutually exclusive** builds.  Rebuild
+with `-DSOFIE_BENCHMARK_PROFILE=ON` to switch the binary into profiling mode: the
+timed H2D/inference/D2H loops are replaced by a profiling pass that measures
+per-operator GPU time and prints a CPU/GPU memory breakdown.  The target backend
+and CUDA architecture are controlled by the same `SOFIE_BENCHMARK_BACKEND` and
+`SOFIE_BENCHMARK_CUDA_ARCH` flags used for benchmarking.
+
+```bash
+cmake -B build \
+  -DSOFIE_BENCHMARK=ON \
+  -DSOFIE_BENCHMARK_PROFILE=ON \
+  /path/to/SOFIE
+cmake --build build --target sofie_benchmark -j$(nproc)
+cd build/benchmark && ./sofie_benchmark
+```
+
+In a profiling build, each model prints two blocks in place of the throughput table:
+
+**GPU Profiling Results** — per-operator wall-clock time (microseconds) measured
+with `std::chrono` and an `alpaka::wait(queue)` synchronisation point after every
+kernel.  Results are sorted by average time descending, with ± stderr over all
+timed iterations.  Warmup iterations are excluded (the session is reset before the
+timed runs start).
+
+```
+============================================================
+           GPU PROFILING RESULTS
+   (wall-clock with alpaka::wait synchronization)
+============================================================
+  MatMul_3                      : 142.718 +/- 0.412 us  (100 runs)
+  MatMul_1                      : 138.005 +/- 0.389 us  (100 runs)
+  LayerNorm_5                   :  23.441 +/- 0.201 us  (100 runs)
+  ...
+  Overall_Time                  : 847.332 +/- 1.104 us  (100 runs)
+============================================================
+```
+
+**Memory Usage Breakdown** — sizes computed at code-generation time from tensor
+shapes and types.  No runtime measurement is needed; the values are embedded
+as constants in the generated session code.
+
+```
+============================================================
+              MEMORY USAGE BREAKDOWN
+============================================================
+  CPU Memory:
+    Constant/embedded tensors : 0 bytes  (0.0000 MB)
+    Weight tensors            : 12582912 bytes  (12.000 MB)
+    Intermediate memory pool  : 0 bytes  (0.0000 MB)
+    Total CPU                 : 12582912 bytes  (12.000 MB)
+  GPU Memory (device buffers):
+    Weight device buffers     : 12582912 bytes  (12.000 MB)
+    Intermediate device bufs  : 4194304 bytes  (4.000 MB)
+    Total GPU                 : 16777216 bytes  (16.000 MB)
+============================================================
+```
+
+> **Note:** Profiling and benchmarking are mutually exclusive.  In a profiling
+> build the throughput table is not printed; in a benchmark build
+> `PrintProfilingResults` / `PrintMemoryInfo` are not called.  Rebuild without
+> `-DSOFIE_BENCHMARK_PROFILE=ON` to measure peak throughput.
+
+The same flag works for the large-input benchmark:
+
+```bash
+cmake -B build \
+  -DSOFIE_BENCHMARK=ON \
+  -DSOFIE_BENCHMARK_LARGE=ON \
+  -DSOFIE_BENCHMARK_PROFILE=ON \
+  /path/to/SOFIE
+cmake --build build --target sofie_benchmark_large -j$(nproc)
+```
+
+---
+
+## Re-running after adding models
+
+```bash
+cmake build
+cmake --build build --target sofie_benchmark -j$(nproc)
+```
+
+---
+
+## Adding a New Backend (HIP/ROCm)
+
+The benchmark infrastructure is designed so adding a new backend requires changes
+in only a few places:
+
+1. **`CMakeLists.txt`** — add `"HIP"` to the `SOFIE_BENCHMARK_BACKEND` allowed
+   values, call `enable_language(HIP)`, find `hip::host`, and set
+   `_SOFIE_BENCH_ALPAKA_DEFINE = ALPAKA_ACC_GPU_HIP_ENABLED` /
+   `_SOFIE_BENCH_BACKEND_DEFINE = SOFIE_BACKEND_HIP`.
+2. **`src/BenchmarkBackend.hxx`** — already contains the `SOFIE_BACKEND_HIP` branch
+   with `alpaka::TagGpuHipRt` aliases and `hipDeviceSynchronize()` sync macro.
+3. **`src/ModelBench.cu.in`** — rename to `.hip.in` (or use a common extension) and
+   configure the source file language property to `HIP`.
+4. **`src/ONNXRuntimeBenchmark.hxx`** — swap `OrtCUDAProviderOptions` for the ROCm
+   execution provider options if ORT comparison is desired on AMD hardware.
diff --git a/benchmark/models/.gitkeep b/benchmark/models/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/benchmark/models/GNN_model.onnx b/benchmark/models/GNN_model.onnx
new file mode 100644
index 0000000..833e34d
Binary files /dev/null and b/benchmark/models/GNN_model.onnx differ
diff --git a/benchmark/models/gnn_h32_k2.onnx b/benchmark/models/gnn_h32_k2.onnx
new file mode 100644
index 0000000..03e9f69
Binary files /dev/null and b/benchmark/models/gnn_h32_k2.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n100000_e500000.onnx b/benchmark/models/gnn_h32_k2_n100000_e500000.onnx
new file mode 100644
index 0000000..eb43d13
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n100000_e500000.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n10000_e50000.onnx b/benchmark/models/gnn_h32_k2_n10000_e50000.onnx
new file mode 100644
index 0000000..bf795eb
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n10000_e50000.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n1000_e5000.onnx b/benchmark/models/gnn_h32_k2_n1000_e5000.onnx
new file mode 100644
index 0000000..4ecab7b
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n1000_e5000.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n100_e500.onnx b/benchmark/models/gnn_h32_k2_n100_e500.onnx
new file mode 100644
index 0000000..aa94c75
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n100_e500.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n30000_e150000.onnx b/benchmark/models/gnn_h32_k2_n30000_e150000.onnx
new file mode 100644
index 0000000..c7d2a73
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n30000_e150000.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n3000_e15000.onnx b/benchmark/models/gnn_h32_k2_n3000_e15000.onnx
new file mode 100644
index 0000000..cd3c21b
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n3000_e15000.onnx differ
diff --git a/benchmark/models/gnn_h32_k2_n300_e1500.onnx b/benchmark/models/gnn_h32_k2_n300_e1500.onnx
new file mode 100644
index 0000000..2761c2a
Binary files /dev/null and b/benchmark/models/gnn_h32_k2_n300_e1500.onnx differ
diff --git a/benchmark/models/gnn_h64_k4.onnx b/benchmark/models/gnn_h64_k4.onnx
new file mode 100644
index 0000000..7de5594
Binary files /dev/null and b/benchmark/models/gnn_h64_k4.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n100000_e500000.onnx b/benchmark/models/gnn_h64_k4_n100000_e500000.onnx
new file mode 100644
index 0000000..379fe0c
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n100000_e500000.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n10000_e50000.onnx b/benchmark/models/gnn_h64_k4_n10000_e50000.onnx
new file mode 100644
index 0000000..177aa1c
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n10000_e50000.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n1000_e5000.onnx b/benchmark/models/gnn_h64_k4_n1000_e5000.onnx
new file mode 100644
index 0000000..f9f92ef
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n1000_e5000.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n100_e500.onnx b/benchmark/models/gnn_h64_k4_n100_e500.onnx
new file mode 100644
index 0000000..4496d2f
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n100_e500.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n30000_e150000.onnx b/benchmark/models/gnn_h64_k4_n30000_e150000.onnx
new file mode 100644
index 0000000..8da0743
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n30000_e150000.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n3000_e15000.onnx b/benchmark/models/gnn_h64_k4_n3000_e15000.onnx
new file mode 100644
index 0000000..16de0ed
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n3000_e15000.onnx differ
diff --git a/benchmark/models/gnn_h64_k4_n300_e1500.onnx b/benchmark/models/gnn_h64_k4_n300_e1500.onnx
new file mode 100644
index 0000000..3e6d355
Binary files /dev/null and b/benchmark/models/gnn_h64_k4_n300_e1500.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2.onnx b/benchmark/models/punet_h32_k2_heads4_layers2.onnx
new file mode 100644
index 0000000..a918af9
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx
new file mode 100644
index 0000000..0b6a9b9
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n100000_e500000.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx
new file mode 100644
index 0000000..419a6e8
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n10000_e50000.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx
new file mode 100644
index 0000000..fab0378
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n1000_e5000.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx
new file mode 100644
index 0000000..68f5278
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n100_e500.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx
new file mode 100644
index 0000000..be0835a
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n30000_e150000.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx
new file mode 100644
index 0000000..a4f0ef3
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n3000_e15000.onnx differ
diff --git a/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx b/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx
new file mode 100644
index 0000000..f1be9ad
Binary files /dev/null and b/benchmark/models/punet_h32_k2_heads4_layers2_n300_e1500.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2.onnx b/benchmark/models/punet_h64_k4_heads4_layers2.onnx
new file mode 100644
index 0000000..398e1a9
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx
new file mode 100644
index 0000000..882cba5
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n100000_e500000.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx
new file mode 100644
index 0000000..007f479
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n10000_e50000.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx
new file mode 100644
index 0000000..a6e583f
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n1000_e5000.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx
new file mode 100644
index 0000000..640d9fc
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n100_e500.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx
new file mode 100644
index 0000000..d8ef5ab
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n30000_e150000.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx
new file mode 100644
index 0000000..eca6217
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n3000_e15000.onnx differ
diff --git a/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx b/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx
new file mode 100644
index 0000000..b9dfa5e
Binary files /dev/null and b/benchmark/models/punet_h64_k4_heads4_layers2_n300_e1500.onnx differ
diff --git a/benchmark/models/simple_transformer.onnx b/benchmark/models/simple_transformer.onnx
new file mode 100644
index 0000000..1925d9d
Binary files /dev/null and b/benchmark/models/simple_transformer.onnx differ
diff --git a/benchmark/models/simple_transformer.onnx.data b/benchmark/models/simple_transformer.onnx.data
new file mode 100644
index 0000000..3f52857
Binary files /dev/null and b/benchmark/models/simple_transformer.onnx.data differ
diff --git a/benchmark/models/simple_transformer_300.onnx b/benchmark/models/simple_transformer_300.onnx
new file mode 100644
index 0000000..b32c59a
Binary files /dev/null and b/benchmark/models/simple_transformer_300.onnx differ
diff --git a/benchmark/models/simple_transformer_300.onnx.data b/benchmark/models/simple_transformer_300.onnx.data
new file mode 100644
index 0000000..d1a5ee9
Binary files /dev/null and b/benchmark/models/simple_transformer_300.onnx.data differ
diff --git a/benchmark/models/specialize_models.py b/benchmark/models/specialize_models.py
new file mode 100644
index 0000000..fae90ad
--- /dev/null
+++ b/benchmark/models/specialize_models.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+"""
+Specialize parametric ONNX models to static input shapes for SOFIE GPU benchmarking.
+
+For each source model, produces N variants with concrete (static) input dimensions,
+replacing all dim_param symbols with dim_value integers, then runs ONNX shape
+inference to propagate concrete shapes through the entire graph.
+
+Usage:
+    python3 specialize_models.py
+"""
+
+import copy
+import os
+import sys
+import onnx
+from onnx import shape_inference, TensorProto
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def set_static_inputs(model: onnx.ModelProto,
+                      shape_map: dict[str, list[int]]) -> onnx.ModelProto:
+    """
+    Return a deep copy of *model* with every dim_param in graph inputs replaced
+    by the concrete dim_value from *shape_map* {input_name: [d0, d1, ...]}.
+    Unmentioned inputs are left unchanged.
+    """
+    m = copy.deepcopy(model)
+    for inp in m.graph.input:
+        if inp.name not in shape_map:
+            continue
+        t = inp.type.tensor_type
+        new_dims = shape_map[inp.name]
+        for i, dim in enumerate(t.shape.dim):
+            dim.ClearField("dim_param")
+            dim.dim_value = new_dims[i]
+    return m
+
+
+def fix_output_shapes(model: onnx.ModelProto) -> onnx.ModelProto:
+    """
+    Run ONNX shape inference so all intermediate and output value_info entries
+    carry concrete shapes (where inferrable).
+    """
+    return shape_inference.infer_shapes(model, check_type=True,
+                                        strict_mode=False,
+                                        data_prop=True)
+
+
+def verify_no_dynamic_inputs(model: onnx.ModelProto, name: str) -> None:
+    """Warn if any graph input still has a dim_param after specialization."""
+    for inp in model.graph.input:
+        t = inp.type.tensor_type
+        for dim in t.shape.dim:
+            if dim.dim_param:
+                print(f"  [WARN] {name}: input '{inp.name}' still has "
+                      f"dim_param='{dim.dim_param}'")
+
+
+def patch_output_shapes(model: onnx.ModelProto,
+                        output_shape_map: dict[str, list[int | str]]) -> onnx.ModelProto:
+    """
+    Forcibly set output tensor shapes to concrete values where ONNX shape
+    inference could not propagate through ops like ScatterElements / Expand / Add.
+    *output_shape_map* maps output name → list of dim values (int = static,
+    str = leave dim_param unchanged).
+    """
+    m = copy.deepcopy(model)
+    for out in m.graph.output:
+        if out.name not in output_shape_map:
+            continue
+        t = out.type.tensor_type
+        new_dims = output_shape_map[out.name]
+        for i, dim in enumerate(t.shape.dim):
+            v = new_dims[i]
+            if isinstance(v, int):
+                dim.ClearField("dim_param")
+                dim.dim_value = v
+    return m
+
+
+def save(model: onnx.ModelProto, path: str) -> None:
+    onnx.save(model, path)
+    size_kb = os.path.getsize(path) / 1024
+    print(f"  → saved {os.path.basename(path)}  ({size_kb:.0f} KB)")
+
+
+# ---------------------------------------------------------------------------
+# Model specifications
+# ---------------------------------------------------------------------------
+
+BASE = os.path.dirname(os.path.abspath(__file__))
+
+# ── GNN family ──────────────────────────────────────────────────────────────
+# Inputs: node_features [n_nodes, 29]
+#         edge_features [n_edges,  5]
+#         edge_index    [n_edges,  2]  (int64)
+# Output: edge_scores   [n_edges,  1]
+#
+# Scale: 7 variants (5 standard + 2 large-input) with n_edges = 5 × n_nodes (realistic for tracking GNNs)
+GNN_VARIANTS = [
+    {"n_nodes": 100,    "n_edges": 500},
+    {"n_nodes": 300,    "n_edges": 1500},
+    {"n_nodes": 1000,   "n_edges": 5000},
+    {"n_nodes": 3000,   "n_edges": 15000},
+    {"n_nodes": 10000,  "n_edges": 50000},
+    # Large-input variants — cluster GPUs only (≥40 GB VRAM)
+    {"n_nodes": 30000,  "n_edges": 150000},
+    {"n_nodes": 100000, "n_edges": 500000},
+]
+
+GNN_MODELS = [
+    "gnn_h32_k2.onnx",
+    "gnn_h64_k4.onnx",
+    "punet_h32_k2_heads4_layers2.onnx",
+    "punet_h64_k4_heads4_layers2.onnx",
+]
+
+
+def gnn_shape_map(n_nodes: int, n_edges: int) -> dict[str, list[int]]:
+    return {
+        "node_features": [n_nodes, 29],
+        "edge_features": [n_edges, 5],
+        "edge_index":    [n_edges, 2],
+    }
+
+
+def gnn_output_shape_map(n_nodes: int, n_edges: int) -> dict[str, list[int]]:
+    # edge_scores output: [n_edges, 1] — patch for models where shape inference
+    # cannot trace through ScatterElements/Expand back to the output.
+    return {"edge_scores": [n_edges, 1]}
+
+
+def gnn_suffix(v: dict) -> str:
+    return f"n{v['n_nodes']}_e{v['n_edges']}"
+
+
+# ── Transformer ──────────────────────────────────────────────────────────────
+# Inputs: src [1, n_nodes,   3]  float32
+#         tgt [1, seq_length]    int64
+# Output: logits [1, seq_length, 132]  (batch dim is patched to 1 after shape inference)
+#
+# Constraint: tgt_positional_encoding.pe is [1, 60, 32]
+#             → seq_length must be ≤ 60.
+# Use the same value for n_nodes and seq_length (square attention pattern).
+TRANSFORMER_VARIANTS = [
+    {"n_nodes": 10, "seq_len": 10},
+    {"n_nodes": 20, "seq_len": 20},
+    {"n_nodes": 30, "seq_len": 30},
+    {"n_nodes": 40, "seq_len": 40},
+    {"n_nodes": 50, "seq_len": 50},
+    # Maximum allowed by the positional-encoding table [1, 60, 32]
+    {"n_nodes": 60, "seq_len": 60},
+]
+
+TRANSFORMER_MODELS = ["transformer_d32_h2_L6_ff32.onnx"]
+
+
+def transformer_shape_map(n_nodes: int, seq_len: int) -> dict[str, list[int]]:
+    return {
+        "src": [1, n_nodes, 3],
+        "tgt": [1, seq_len],
+    }
+
+
+def transformer_output_shape_map(n_nodes: int, seq_len: int) -> dict[str, list[int]]:
+    # logits output: [batch=1, seq_length, 132].  The batch dim is produced by a
+    # bias Add op whose shape inference leaves it as 'Addlogits_dim_0'.
+    return {"logits": [1, seq_len, 132]}
+
+
+def transformer_suffix(v: dict) -> str:
+    return f"n{v['n_nodes']}_s{v['seq_len']}"
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+def specialize_family(model_names: list[str],
+                      variants: list[dict],
+                      shape_map_fn,
+                      suffix_fn,
+                      output_shape_map_fn=None) -> None:
+    for mname in model_names:
+        src_path = os.path.join(BASE, mname)
+        if not os.path.exists(src_path):
+            print(f"[SKIP] {mname} — file not found")
+            continue
+
+        stem = mname.removesuffix(".onnx")
+        # Load with external data so weights are embedded in the new file
+        base_model = onnx.load(src_path, load_external_data=True)
+        print(f"\nSpecializing {mname}:")
+
+        for v in variants:
+            out_name = f"{stem}_{suffix_fn(v)}.onnx"
+            out_path = os.path.join(BASE, out_name)
+
+            smap = shape_map_fn(**v)
+            m = set_static_inputs(base_model, smap)
+            try:
+                m = fix_output_shapes(m)
+            except Exception as e:
+                print(f"  [WARN] shape inference failed for {out_name}: {e}")
+
+            # Manually patch outputs that inference couldn't resolve
+            if output_shape_map_fn is not None:
+                omap = output_shape_map_fn(**v)
+                m = patch_output_shapes(m, omap)
+
+            verify_no_dynamic_inputs(m, out_name)
+            save(m, out_path)
+
+
+def main() -> None:
+    print("=" * 60)
+    print("SOFIE benchmark model specialization")
+    print("=" * 60)
+
+    specialize_family(GNN_MODELS, GNN_VARIANTS, gnn_shape_map, gnn_suffix,
+                      output_shape_map_fn=gnn_output_shape_map)
+    specialize_family(TRANSFORMER_MODELS, TRANSFORMER_VARIANTS,
+                      transformer_shape_map, transformer_suffix,
+                      output_shape_map_fn=transformer_output_shape_map)
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/models/transformer_L1000.onnx.data b/benchmark/models/transformer_L1000.onnx.data
new file mode 100644
index 0000000..935826b
Binary files /dev/null and b/benchmark/models/transformer_L1000.onnx.data differ
diff --git a/benchmark/models/transformer_L1000_B1.onnx b/benchmark/models/transformer_L1000_B1.onnx
new file mode 100644
index 0000000..c0728dd
Binary files /dev/null and b/benchmark/models/transformer_L1000_B1.onnx differ
diff --git a/benchmark/models/transformer_L1000_B100.onnx b/benchmark/models/transformer_L1000_B100.onnx
new file mode 100644
index 0000000..669777d
Binary files /dev/null and b/benchmark/models/transformer_L1000_B100.onnx differ
diff --git a/benchmark/models/transformer_L1000_B100.onnx.data b/benchmark/models/transformer_L1000_B100.onnx.data
new file mode 100644
index 0000000..cb59778
Binary files /dev/null and b/benchmark/models/transformer_L1000_B100.onnx.data differ
diff --git a/benchmark/models/transformer_L100_B100.onnx b/benchmark/models/transformer_L100_B100.onnx
new file mode 100644
index 0000000..1af481f
Binary files /dev/null and b/benchmark/models/transformer_L100_B100.onnx differ
diff --git a/benchmark/models/transformer_L100_B100.onnx.data b/benchmark/models/transformer_L100_B100.onnx.data
new file mode 100644
index 0000000..08b324e
Binary files /dev/null and b/benchmark/models/transformer_L100_B100.onnx.data differ
diff --git a/benchmark/models/transformer_L8000_B1.onnx b/benchmark/models/transformer_L8000_B1.onnx
new file mode 100644
index 0000000..fdd7d69
Binary files /dev/null and b/benchmark/models/transformer_L8000_B1.onnx differ
diff --git a/benchmark/models/transformer_L8000_B1.onnx.data b/benchmark/models/transformer_L8000_B1.onnx.data
new file mode 100644
index 0000000..91daf1d
Binary files /dev/null and b/benchmark/models/transformer_L8000_B1.onnx.data differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32.onnx b/benchmark/models/transformer_d32_h2_L6_ff32.onnx
new file mode 100644
index 0000000..be64f93
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx
new file mode 100644
index 0000000..2d5a39f
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n10_s10.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx
new file mode 100644
index 0000000..3cb3b3a
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n20_s20.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx
new file mode 100644
index 0000000..5e869ea
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n30_s30.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx
new file mode 100644
index 0000000..0dfe632
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n40_s40.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx
new file mode 100644
index 0000000..05af836
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n50_s50.onnx differ
diff --git a/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx b/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx
new file mode 100644
index 0000000..3970cba
Binary files /dev/null and b/benchmark/models/transformer_d32_h2_L6_ff32_n60_s60.onnx differ
diff --git a/benchmark/src/BenchmarkBackend.hxx b/benchmark/src/BenchmarkBackend.hxx
new file mode 100644
index 0000000..c12fde3
--- /dev/null
+++ b/benchmark/src/BenchmarkBackend.hxx
@@ -0,0 +1,49 @@
+#pragma once
+// Backend type aliases and helpers — selected at compile time by CMake via
+// -DSOFIE_BACKEND_CUDA / -DSOFIE_BACKEND_HIP.
+// Every generated bench header and the runner use these so they stay free of
+// hard-coded backend-specific APIs (cuda_runtime.h, hip_runtime.h, …).
+
+#include <alpaka/alpaka.hpp>
+
+// ---- Per-backend runtime header and device-sync macro (an empty statement when no GPU backend is defined) ----
+#if defined(SOFIE_BACKEND_CUDA)
+#  include <cuda_runtime.h>
+#  define SOFIE_BENCH_DEVICE_SYNC() cudaDeviceSynchronize()
+#elif defined(SOFIE_BACKEND_HIP)
+#  include <hip/hip_runtime.h>
+#  define SOFIE_BENCH_DEVICE_SYNC() hipDeviceSynchronize()
+#else
+#  define SOFIE_BENCH_DEVICE_SYNC() do {} while (0)
+#endif
+
+namespace sofie_bench {
+
+// Index type plus one-dimensional dimension/extent types shared by all backends.
+using Idx  = std::size_t;
+using Dim1 = alpaka::DimInt<1>;
+using Ext1 = alpaka::Vec<Dim1, Idx>;
+
+// Backend-specific aliases; exactly one branch is compiled, chosen by the
+// SOFIE_BACKEND_* define.  Queue is expressed through the Device alias.
+#if defined(SOFIE_BACKEND_CUDA)
+
+    using AccTag   = alpaka::TagGpuCudaRt;
+    using Platform = alpaka::PlatformCudaRt;
+    using Device   = alpaka::DevCudaRt;
+    using Queue    = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+#elif defined(SOFIE_BACKEND_HIP)
+
+    using AccTag   = alpaka::TagGpuHipRt;
+    using Platform = alpaka::PlatformHipRt;
+    using Device   = alpaka::DevHipRt;
+    using Queue    = alpaka::Queue<Device, alpaka::NonBlocking>;
+
+#else  // CPU serial (default / fallback)
+
+    using AccTag   = alpaka::TagCpuSerial;
+    using Platform = alpaka::PlatformCpu;
+    using Device   = alpaka::DevCpu;
+    using Queue    = alpaka::Queue<Device, alpaka::Blocking>;
+
+#endif
+
+} // namespace sofie_bench
diff --git a/benchmark/src/BenchmarkEmitter.cxx.in b/benchmark/src/BenchmarkEmitter.cxx.in
new file mode 100644
index 0000000..f5d5435
--- /dev/null
+++ b/benchmark/src/BenchmarkEmitter.cxx.in
@@ -0,0 +1,309 @@
+// SOFIE Benchmark Emitter
+// Auto-configured by CMake — do not edit directly.
+// For each .onnx model in benchmark/models/ this binary generates:
+//   <ModelName>_GPU_ALPAKA.hxx  — SOFIE inference code
+//   <ModelName>_GPU_ALPAKA.dat  — serialized weights
+//   <ModelName>_bench.hxx       — timing function, following the same
+//                                 pattern as the unit tests
+
+#include "SOFIE/RModel_Base.hxx"
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/RModelParser_ONNX.hxx"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+using namespace SOFIE;
+
+// Extent used when sizing benchmark buffers: d.dim when it is positive,
+// otherwise 1.
+// NOTE(review): a symbolic (parametric) dimension is not treated specially
+// here — confirm what value Dim::dim holds in that case.
+static size_t resolveDim(const Dim &d) {
+    if (d.dim > 0)
+        return static_cast<size_t>(d.dim);
+    return 1u;
+}
+
+static int EmitBenchmarkModel(const std::string &onnxPath,
+                              const std::string &modelName,
+                              const std::string &outDir)
+{
+    std::cout << "[Benchmark] Processing: " << onnxPath << "\n";
+
+    RModelParser_ONNX parser;
+    RModel model = parser.Parse(onnxPath);
+
+    const auto &inputNames = model.GetInputTensorNames();
+    if (inputNames.empty()) {
+        std::cerr << "[WARN] " << modelName << " has no inputs – skipping.\n";
+        return 1;
+    }
+
+    // Map SOFIE tensor type to C++ type string
+    auto tensorTypeToCpp = [](ETensorType t) -> std::string {
+        switch (t) {
+            case ETensorType::FLOAT:  return "float";
+            case ETensorType::DOUBLE: return "double";
+            case ETensorType::INT32:  return "int32_t";
+            case ETensorType::INT64:  return "int64_t";
+            case ETensorType::UINT8:  return "uint8_t";
+            case ETensorType::INT8:   return "int8_t";
+            case ETensorType::UINT16: return "uint16_t";
+            case ETensorType::INT16:  return "int16_t";
+            case ETensorType::UINT32: return "uint32_t";
+            case ETensorType::UINT64: return "uint64_t";
+            case ETensorType::BOOL:   return "uint8_t";
+            default:                  return "float";
+        }
+    };
+
+    // Collect input metadata before code generation
+    struct TensorMeta {
+        std::string cppType;
+        size_t numElements;
+    };
+    std::vector<TensorMeta> inputs;
+    for (const auto &n : inputNames) {
+        TensorMeta m;
+        try { m.cppType = tensorTypeToCpp(model.GetTensorType(n)); }
+        catch (...) { m.cppType = "float"; }
+        m.numElements = 1;
+        try {
+            for (const auto &d : model.GetDimTensorShape(n))
+                m.numElements *= resolveDim(d);
+        } catch (...) {}
+        inputs.push_back(m);
+    }
+
+    // Generate SOFIE GPU/Alpaka inference code first — this calls Initialize()
+    // which runs shape inference on all operators.  Output tensor shapes are only
+    // available *after* Initialize(), so we must call GenerateGPU_ALPAKA() before
+    // querying output metadata.
+#ifdef SOFIE_BENCHMARK_PROFILE
+    model.GenerateGPU_ALPAKA(Options::kProfile);
+#else
+    model.GenerateGPU_ALPAKA();
+#endif
+
+    // Collect output metadata AFTER code generation so shapes are fully propagated.
+    const auto &outputNames = model.GetOutputTensorNames();
+    std::vector<TensorMeta> outputs;
+    for (const auto &n : outputNames) {
+        TensorMeta m;
+        try { m.cppType = tensorTypeToCpp(model.GetTensorType(n)); }
+        catch (...) { m.cppType = "float"; }
+        m.numElements = 1;
+        try {
+            for (const auto &d : model.GetDimTensorShape(n))
+                m.numElements *= resolveDim(d);
+        } catch (...) {}
+        outputs.push_back(m);
+    }
+
+    std::string hxxPath  = outDir + "/" + modelName + "_GPU_ALPAKA.hxx";
+    std::string benchPath = outDir + "/" + modelName + "_bench.hxx";
+
+    model.OutputGenerated(hxxPath);
+
+    // Sanitize model name into a valid C++ identifier
+    std::string cppName = modelName;
+    for (char &c : cppName)
+        if (!std::isalnum(static_cast<unsigned char>(c))) c = '_';
+
+    // Build "session.infer(input_d_0, input_d_1, ...)"
+    std::ostringstream inferCall;
+    inferCall << "session.infer(";
+    for (size_t i = 0; i < inputs.size(); ++i) {
+        if (i) inferCall << ", ";
+        inferCall << "input_d_" << i;
+    }
+    inferCall << ")";
+
+    std::ofstream bench(benchPath);
+    if (!bench.is_open()) {
+        std::cerr << "[ERROR] Cannot open " << benchPath << "\n";
+        return 1;
+    }
+
+    bench
+        << "// Auto-generated benchmark for model: " << modelName << "\n"
+        << "// DO NOT EDIT — regenerated by the SOFIE benchmark emitter.\n"
+        << "// Backend is selected at compile time via -DSOFIE_BACKEND_CUDA / -DSOFIE_BACKEND_HIP\n"
+        << "// (see BenchmarkBackend.hxx for the sofie_bench:: type aliases).\n"
+        << "#pragma once\n\n"
+        << "#include \"" << modelName << "_GPU_ALPAKA.hxx\"\n"
+        << "#include \"BenchmarkBackend.hxx\"\n"
+        << "#include <chrono>\n"
+        << "#include <iostream>\n"
+        << "#include <random>\n"
+        << "#include <string>\n\n"
+        // Not inline: each model is compiled in its own _bench.cu TU and
+        // called from the main TU via a forward declaration.  Using inline
+        // here would require all 32 models to share one translation unit
+        // which OOM-kills the CUDA compiler.
+        << "void Benchmark_" << cppName
+        << "(int warmup, int iterations, const std::string& weightsDir) {\n"
+        << "    using namespace sofie_bench;\n\n"
+        << "    // ---- Device/host setup (mirrors unit-test pattern) ----\n"
+        << "    alpaka::PlatformCpu hostPlatform{};\n"
+        << "    auto host = alpaka::getDevByIdx(hostPlatform, 0u);\n"
+        << "    Platform platform{};\n"
+        << "    auto device = alpaka::getDevByIdx(platform, 0u);\n"
+        << "    Queue queue{device};\n\n"
+        << "    std::mt19937 rng(42);\n"
+        << "    std::uniform_real_distribution<float> fdist(-1.0f, 1.0f);\n\n";
+
+
+    for (size_t i = 0; i < inputs.size(); ++i) {
+        const std::string &T = inputs[i].cppType;
+        const size_t       N = inputs[i].numElements;
+        bench
+            << "    // Input " << i << ": " << T << "[" << N << "]\n"
+            << "    auto input_h_" << i << " = alpaka::allocBuf<" << T
+            << ", Idx>(host, Ext1::all(Idx{" << N << "}));\n"
+            << "    {\n"
+            << "        auto *p = reinterpret_cast<" << T
+            << "*>(alpaka::getPtrNative(input_h_" << i << "));\n";
+        if (T == "float" || T == "double") {
+            bench
+                << "        for (size_t k = 0; k < " << N
+                << "; ++k) p[k] = static_cast<" << T << ">(fdist(rng));\n";
+        } else {
+            bench
+                << "        std::fill(p, p + " << N
+                << ", static_cast<" << T << ">(0));\n";
+        }
+        bench
+            << "    }\n"
+            << "    auto input_d_" << i << " = alpaka::allocBuf<" << T
+            << ", Idx>(device, Ext1::all(Idx{" << N << "}));\n"
+            << "    alpaka::memcpy(queue, input_d_" << i
+            << ", input_h_" << i << ");\n\n";
+    }
+    bench << "    alpaka::wait(queue);\n\n";
+
+    bench
+        << "    // ---- Create session (loads weights) ----\n"
+        << "    std::string weightFile = weightsDir + \"/"
+        << modelName << "_GPU_ALPAKA.dat\";\n"
+        << "    SOFIE_" << cppName
+        << "::Session<AccTag> session(weightFile);\n\n";
+
+
+    bench << "    // ---- One priming inference to obtain output buffer handle(s) ----\n"
+          << "    auto out_d = " << inferCall.str() << ";\n"
+          << "    alpaka::wait(session.queue);\n\n";
+
+    for (size_t i = 0; i < outputs.size(); ++i) {
+        const std::string &T = outputs[i].cppType;
+        const size_t       N = outputs[i].numElements;
+        bench
+            << "    // Output " << i << ": " << T << "[" << N << "]\n"
+            << "    auto out_h_" << i << " = alpaka::allocBuf<" << T
+            << ", Idx>(host, Ext1::all(Idx{" << N << "}));\n";
+    }
+    bench << "\n";
+
+    auto outBufExpr = [&](size_t i) -> std::string {
+        if (outputs.size() == 1) return "out_d";
+        return "out_d[" + std::to_string(i) + "]";
+    };
+
+    bench << "    // ---- Warmup (not timed) ----\n"
+          << "    for (int w = 0; w < warmup; ++w) {\n";
+    // H2D: inputs host → device
+    for (size_t i = 0; i < inputs.size(); ++i)
+        bench << "        alpaka::memcpy(queue, input_d_" << i
+              << ", input_h_" << i << ");\n";
+    bench << "        alpaka::wait(queue);\n";
+    // Inference
+    bench << "        " << inferCall.str() << ";\n"
+          << "        alpaka::wait(session.queue);\n";
+    // D2H: outputs device → host
+    for (size_t i = 0; i < outputs.size(); ++i)
+        bench << "        alpaka::memcpy(queue, out_h_" << i
+              << ", " << outBufExpr(i) << ");\n";
+    bench << "        alpaka::wait(queue);\n"
+          << "    }\n"
+          << "    cudaDeviceSynchronize();\n\n";
+
+    // ---- Profiling path (mutually exclusive with benchmark path) ----
+    bench << "#ifdef SOFIE_BENCHMARK_PROFILE\n"
+          << "    // ---- Profiling: reset warmup data, run iterations, print results ----\n"
+          << "    session.ResetProfilingResults();\n"
+          << "    for (int _i = 0; _i < iterations; ++_i)\n"
+          << "        " << inferCall.str() << ";\n"
+          << "    alpaka::wait(session.queue);\n"
+          << "    std::printf(\"%s\\n\", std::string(60, '-').c_str());\n"
+          << "    std::printf(\"Model: " << modelName << "\\n\");\n"
+          << "    session.PrintProfilingResults();\n"
+          << "    session.PrintMemoryInfo();\n"
+          << "#else\n";
+
+    // ---- Benchmark path ----
+    bench
+        << "    // ---- Timed input transfer (H2D: host -> device) ----\n"
+        << "    auto t0_in = std::chrono::high_resolution_clock::now();\n"
+        << "    for (int _i = 0; _i < iterations; ++_i) {\n";
+    for (size_t i = 0; i < inputs.size(); ++i)
+        bench << "        alpaka::memcpy(queue, input_d_" << i
+              << ", input_h_" << i << ");\n";
+    bench << "        alpaka::wait(queue);\n"
+          << "    }\n"
+          << "    SOFIE_BENCH_DEVICE_SYNC();\n"
+          << "    auto t1_in = std::chrono::high_resolution_clock::now();\n\n";
+
+    bench
+        << "    // ---- Timed inference ----\n"
+        << "    auto t0_infer = std::chrono::high_resolution_clock::now();\n"
+        << "    for (int _i = 0; _i < iterations; ++_i)\n"
+        << "        " << inferCall.str() << ";\n"
+        << "    alpaka::wait(session.queue);\n"
+        << "    SOFIE_BENCH_DEVICE_SYNC();\n"
+        << "    auto t1_infer = std::chrono::high_resolution_clock::now();\n\n";
+
+    bench
+        << "    // ---- Timed output transfer (D2H: device -> host) ----\n"
+        << "    auto t0_out = std::chrono::high_resolution_clock::now();\n"
+        << "    for (int _i = 0; _i < iterations; ++_i) {\n";
+    for (size_t i = 0; i < outputs.size(); ++i)
+        bench << "        alpaka::memcpy(queue, out_h_" << i
+              << ", " << outBufExpr(i) << ");\n";
+    bench << "        alpaka::wait(queue);\n"
+          << "    }\n"
+          << "    SOFIE_BENCH_DEVICE_SYNC();\n"
+          << "    auto t1_out = std::chrono::high_resolution_clock::now();\n\n";
+
+    bench
+        << "    double avg_infer_ms  = std::chrono::duration<double, std::milli>"
+           "(t1_infer - t0_infer).count() / iterations;\n"
+        << "    double avg_in_ms     = std::chrono::duration<double, std::milli>"
+           "(t1_in    - t0_in   ).count() / iterations;\n"
+        << "    double avg_out_ms    = std::chrono::duration<double, std::milli>"
+           "(t1_out   - t0_out  ).count() / iterations;\n"
+        << "    double throughput    = (avg_infer_ms > 0.0) ? 1000.0 / avg_infer_ms : 0.0;\n\n"
+        << "    std::printf(\"%-40s  %12.4f  %14.4f  %15.4f  %16.1f\\n\",\n"
+        << "                \"" << modelName << "\",\n"
+        << "                avg_infer_ms, avg_in_ms, avg_out_ms, throughput);\n"
+        << "#endif  // SOFIE_BENCHMARK_PROFILE\n"
+        << "}\n";
+
+    bench.close();
+
+    std::cout << "[Benchmark] Wrote: " << hxxPath  << "\n"
+              << "            Wrote: " << benchPath << "\n";
+    return 0;
+}
+
+// Entry point of the benchmark emitter.
+//
+// Requires the output directory as the first command-line argument; when it
+// is missing, a usage line is written to stderr and the process returns 1.
+// Otherwise the exit status is 0 when `failures` is still zero at the end and
+// 1 when it is not.
+int main(int argc, char *argv[]) {
+    if (argc < 2) {
+        std::cerr << "Usage: sofie_benchmark_emitter <output_dir>\n";
+        return 1;
+    }
+    std::string outDir = argv[1];
+    int failures = 0;
+
+    // NOTE(review): the token on the next line looks like a configure-time
+    // placeholder.  The code substituted for it presumably reads `outDir` and
+    // increments `failures` — confirm against the build script that expands it.
+@BENCHMARK_EMIT_CAPTURES@
+
+    // Report how many failures were counted and turn that into the exit code.
+    std::cout << "[Benchmark Emitter] Done — " << failures << " failure(s).\n";
+    return failures == 0 ? 0 : 1;
+}
diff --git a/benchmark/src/BenchmarkRunner.cxx.in b/benchmark/src/BenchmarkRunner.cxx.in
new file mode 100644
index 0000000..1848680
--- /dev/null
+++ b/benchmark/src/BenchmarkRunner.cxx.in
@@ -0,0 +1,140 @@
+// SOFIE Alpaka Benchmark Runner — main TU
+// Auto-configured by CMake — do not edit directly.
+//
+// Execution model:
+//   Normal mode  — iterates over all models, spawning ONE subprocess per model
+//                  so each model gets a fresh CUDA context and all GPU memory
+//                  is freed when the subprocess exits.  This avoids the
+//                  cudaErrorMemoryAllocation that occurs when a large model's
+//                  CUDA allocator cache is not returned to the OS between runs.
+//   Single-model mode (--single-model <name>) — called BY the parent; runs
+//                  exactly one model then exits.
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <sstream>
+
+// Optional ONNX Runtime GPU comparison
+#ifdef SOFIE_BENCHMARK_ORT
+#include "ONNXRuntimeBenchmark.hxx"
+#endif
+
+// Forward-declare one per-model benchmark function (defined in <model>_bench.cu)
+@BENCHMARK_FWD_DECLS@
+
+// ---------------------------------------------------------------------------
+// Dispatch table: called when --single-model <name> is given.
+//
+// The body is mostly a placeholder that is filled in when this template is
+// configured (the file header states it is auto-configured by CMake).
+// NOTE(review): the generated cases presumably compare `model` against each
+// known name and return that model's benchmark status, using `warmup`,
+// `iterations`, `weightsDir` and `run_ort` — confirm against the CMake code
+// that produces the substitution.
+//
+// When control reaches the end of the generated cases, the model name is
+// reported on stderr as unknown and 1 is returned.
+// ---------------------------------------------------------------------------
+static int run_single_model(const std::string& model,
+                             int warmup, int iterations,
+                             const std::string& weightsDir,
+                             bool run_ort)
+{
+@BENCHMARK_SINGLE_MODEL_CASES@
+    // No generated case returned: the requested name is not a known model.
+    std::fprintf(stderr, "Unknown model: %s\n", model.c_str());
+    return 1;
+}
+
+// Benchmark runner entry point.
+//
+// Parses the command line and then either
+//   * runs exactly one model in this process when --single-model <name> was
+//     given (the exit status is whatever run_single_model returns), or
+//   * prints the report header and hands over to the generated per-model
+//     spawn code (the BENCHMARK_SPAWN_CALLS placeholder below).
+//
+// Exit status in orchestrator mode: 0 when `totalFailed` stayed at zero,
+// 1 otherwise.  --help returns 0.  An invalid numeric option returns 2.
+int main(int argc, char *argv[]) {
+    int warmup     = 10;
+    int iterations = 100;
+    std::string weightsDir = ".";
+    bool run_ort   = false;
+    std::string singleModel;
+
+    // Strict decimal parser for the iteration counts.  std::stoi would throw
+    // (and, uncaught, abort the process) on input such as "-n abc", and would
+    // silently accept trailing garbage such as "10x".  The upper bound keeps
+    // the value safely inside the range of int.
+    auto parseCount = [](const std::string &option, const char *text,
+                         long minValue, int &out) -> bool {
+        const long kMaxCount = 1000000000L;
+        char *end = nullptr;
+        const long value = std::strtol(text, &end, 10);
+        if (end == text || *end != '\0' || value < minValue || value > kMaxCount) {
+            std::fprintf(stderr,
+                "Error: invalid value '%s' for %s (expected an integer in [%ld, %ld]).\n",
+                text, option.c_str(), minValue, kMaxCount);
+            return false;
+        }
+        out = static_cast<int>(value);
+        return true;
+    };
+
+    for (int i = 1; i < argc; ++i) {
+        const std::string a = argv[i];
+        const bool hasValue = (i + 1 < argc);
+        if ((a == "--warmup" || a == "-w") && hasValue) {
+            // Zero warm-up iterations is legitimate.
+            if (!parseCount(a, argv[++i], 0, warmup)) return 2;
+        } else if ((a == "--iterations" || a == "-n") && hasValue) {
+            // The iteration count is used as a divisor for the averages, so it
+            // must be at least 1.
+            if (!parseCount(a, argv[++i], 1, iterations)) return 2;
+        } else if (a == "--weights-dir" && hasValue) {
+            weightsDir = argv[++i];
+        } else if (a == "--single-model" && hasValue) {
+            singleModel = argv[++i];
+        } else if (a == "--onnxruntime" || a == "--ort") {
+            run_ort = true;
+        } else if (a == "--help" || a == "-h") {
+#ifdef SOFIE_BENCHMARK_PROFILE
+            std::cout <<
+                "Usage: sofie_benchmark [options]  [PROFILING MODE]\n"
+                "  Per-operator GPU timing (alpaka::wait per op) + memory breakdown.\n"
+                "  Throughput benchmarking is disabled in this build.\n"
+                "  Rebuild without -DSOFIE_BENCHMARK_PROFILE=ON for throughput numbers.\n\n"
+                "  --warmup,      -w <N>   Warmup iterations  (default: 10)\n"
+                "  --iterations,  -n <N>   Profiling runs      (default: 100)\n"
+                "  --weights-dir    <DIR>  SOFIE .dat files    (default: .)\n"
+                "  --single-model   <N>   Run one model (internal)\n"
+                ;
+#else
+            std::cout <<
+                "Usage: sofie_benchmark [options]\n"
+                "  --warmup,      -w <N>   Warmup iterations       (default: 10)\n"
+                "  --iterations,  -n <N>   Timed iterations         (default: 100)\n"
+                "  --weights-dir    <DIR>  SOFIE .dat weight files  (default: .)\n"
+#ifdef SOFIE_BENCHMARK_ORT
+                "  --onnxruntime, --ort    Also run ONNX Runtime GPU comparison\n"
+#else
+                "  --onnxruntime, --ort    (not available; rebuild with -DSOFIE_BENCHMARK_ORT=ON)\n"
+#endif
+                "  Rebuild with -DSOFIE_BENCHMARK_PROFILE=ON for per-operator profiling.\n"
+                "  --single-model   <N>   Run exactly one model by C++ name (internal)\n"
+                ;
+#endif
+            return 0;
+        } else {
+            // Previously dropped without a word; keep going, but say so, so a
+            // typo or a missing value does not go unnoticed.
+            std::fprintf(stderr,
+                "Warning: ignoring unrecognised or incomplete option '%s'.\n",
+                a.c_str());
+        }
+    }
+
+    // -----------------------------------------------------------------------
+    // Single-model mode: called as a subprocess, run one model and exit.
+    // All GPU memory is freed when this process exits.
+    // -----------------------------------------------------------------------
+    if (!singleModel.empty())
+        return run_single_model(singleModel, warmup, iterations, weightsDir, run_ort);
+
+    // -----------------------------------------------------------------------
+    // Orchestrator mode: spawn one child process per model so every model
+    // starts with a fresh CUDA context (avoids cudaErrorMemoryAllocation
+    // caused by the CUDA allocator retaining freed pages between models).
+    // -----------------------------------------------------------------------
+#ifndef SOFIE_BENCHMARK_ORT
+    if (run_ort) {
+        std::fprintf(stderr,
+            "Warning: --onnxruntime requested but this binary was built without "
+            "ORT support.  Rebuild with -DSOFIE_BENCHMARK_ORT=ON.\n");
+        run_ort = false;
+    }
+#endif
+
+#ifdef SOFIE_BENCHMARK_PROFILE
+    std::printf("=== SOFIE Alpaka Profiler ===\n");
+    std::printf("Backend: @SOFIE_BENCHMARK_BACKEND@  |  Warmup: %d  |  Iterations: %d\n\n",
+                warmup, iterations);
+#else
+    std::printf("=== SOFIE Alpaka Benchmark ===\n");
+    std::printf("Backend: @SOFIE_BENCHMARK_BACKEND@  |  Warmup: %d  |  Iterations: %d", warmup, iterations);
+#ifdef SOFIE_BENCHMARK_ORT
+    if (run_ort) std::printf("  |  ORT-GPU: ON");
+#endif
+    std::printf("\n\n");
+
+    std::printf("%-40s  %12s  %14s  %15s  %16s\n",
+                "Model", "infer(ms)", "in_xfer(ms)", "out_xfer(ms)", "Throughput(inf/s)");
+    std::printf("%s\n", std::string(103, '-').c_str());
+#endif  // SOFIE_BENCHMARK_PROFILE
+    // Flush before any child process writes to the same stdout.
+    std::fflush(stdout);
+
+    // The weights directory ends up between double quotes in a command
+    // string.  Escape the characters that stay special inside double quotes
+    // so a path containing them cannot end the argument early or trigger
+    // expansion.  For ordinary paths the resulting text is unchanged.
+    // NOTE(review): assumes the generated spawn code runs the command through
+    // a POSIX shell (e.g. std::system) — confirm.
+    auto escapeForDoubleQuotes = [](const std::string &raw) {
+        std::string escaped;
+        escaped.reserve(raw.size());
+        for (const char c : raw) {
+            if (c == '"' || c == '\\' || c == '$' || c == '`') escaped += '\\';
+            escaped += c;
+        }
+        return escaped;
+    };
+
+    // Build the common argument suffix forwarded to every subprocess.
+    std::ostringstream common;
+    common << " -w " << warmup
+           << " -n " << iterations
+           << " --weights-dir \"" << escapeForDoubleQuotes(weightsDir) << "\"";
+    if (run_ort) common << " --ort";
+    std::string commonArgs = common.str();
+
+    int totalFailed = 0;
+@BENCHMARK_SPAWN_CALLS@
+
+    if (totalFailed > 0)
+        std::fprintf(stderr, "\n%d model(s) failed.\n", totalFailed);
+
+    return (totalFailed > 0) ? 1 : 0;
+}
diff --git a/benchmark/src/BenchmarkUtils.hxx b/benchmark/src/BenchmarkUtils.hxx
new file mode 100644
index 0000000..9dda93a
--- /dev/null
+++ b/benchmark/src/BenchmarkUtils.hxx
@@ -0,0 +1,167 @@
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace sofie_bench {
+
+/// Options controlling a benchmark run, filled in from the command line by
+/// ParseArgs().  The initialisers are the values used when an option is absent.
+struct BenchmarkConfig {
+    int         warmupIter    = 10;      // --warmup / -w
+    int         benchIter     = 100;     // --iterations / -n
+    int         deviceId      = 0;       // --device / -d
+    float       tolerance     = 1e-3f;   // --tolerance / -t
+    bool        validateOrt   = false;   // --validate-ort
+    std::string weightsDir    = ".";     // --weights-dir
+    bool        csvOutput     = false;   // --csv
+    bool        verbose       = false;   // --verbose / -v
+};
+
+/// One row of the report, as consumed by PrintResult() and PrintSummary().
+struct BenchmarkResult {
+    std::string modelName;
+    size_t      inputElements   = 0;
+    size_t      outputElements  = 0;
+    float       avgInferMs      = 0.0f;  // per-inference average (chrono)
+    float       throughput      = 0.0f;  // inferences / second
+    float       weightMemMB     = 0.0f;  // device memory for model weights
+    float       runtimeMemMB    = 0.0f;  // device memory for intermediates
+    bool        ortRan          = false; // when false the ORT columns print N/A
+    bool        ortMatch        = false; // printed as PASS / FAIL when ortRan
+    float       ortMaxDiff      = -1.0f; // printed next to PASS / FAIL when ortRan
+    bool        skipped         = false; // when true only the skip line is printed
+    std::string skipReason;
+};
+
+/// Build a BenchmarkConfig from the command line.
+///
+/// Options that take a value are only recognised when a following argument
+/// exists; anything unrecognised is ignored.  Numeric values are converted
+/// with std::stoi / std::stof, which throw std::invalid_argument or
+/// std::out_of_range on malformed input.  --help / -h prints the option list
+/// and terminates the process with status 0.
+///
+/// NOTE(review): std::exit is declared in <cstdlib>, which this header does
+/// not include directly.
+inline BenchmarkConfig ParseArgs(int argc, char *argv[]) {
+    BenchmarkConfig cfg;
+    for (int i = 1; i < argc; ++i) {
+        std::string arg = argv[i];
+        if ((arg == "--warmup" || arg == "-w") && i + 1 < argc)
+            cfg.warmupIter = std::stoi(argv[++i]);
+        else if ((arg == "--iterations" || arg == "-n") && i + 1 < argc)
+            cfg.benchIter = std::stoi(argv[++i]);
+        else if ((arg == "--device" || arg == "-d") && i + 1 < argc)
+            cfg.deviceId = std::stoi(argv[++i]);
+        else if ((arg == "--tolerance" || arg == "-t") && i + 1 < argc)
+            cfg.tolerance = std::stof(argv[++i]);
+        else if (arg == "--validate-ort")
+            cfg.validateOrt = true;
+        else if ((arg == "--weights-dir") && i + 1 < argc)
+            cfg.weightsDir = argv[++i];
+        else if (arg == "--csv")
+            cfg.csvOutput = true;
+        else if (arg == "--verbose" || arg == "-v")
+            cfg.verbose = true;
+        else if (arg == "--help" || arg == "-h") {
+            std::cout << "SOFIE Alpaka Benchmark\n\n"
+                      << "Options:\n"
+                      << "  --warmup,     -w <N>   Warmup iterations (default: 10)\n"
+                      << "  --iterations, -n <N>   Benchmark iterations (default: 100)\n"
+                      << "  --device,     -d <ID>  Device index (default: 0)\n"
+                      << "  --tolerance,  -t <F>   ONNXRuntime diff tolerance (default: 1e-3)\n"
+                      << "  --validate-ort         Compare SOFIE outputs to ONNXRuntime\n"
+                      << "  --weights-dir <path>   Directory containing .dat weight files (default: .)\n"
+                      << "  --csv                  Print results in CSV format\n"
+                      << "  --verbose,    -v       Verbose output\n";
+            std::exit(0);
+        }
+    }
+    return cfg;
+}
+
+/// Print the "Device: <name>" line.
+inline void PrintDeviceInfo(const std::string &deviceName) {
+    std::cout << "Device: " << deviceName << "\n";
+}
+
+/// Print the banner, the run parameters and the column header (CSV header
+/// line when cfg.csvOutput is set, aligned text columns otherwise).
+///
+/// Formatting is done on a local stream so that no manipulator state is left
+/// behind on std::cout and the tolerance is always shown with default
+/// floating-point formatting, whatever was printed before.
+inline void PrintHeader(const BenchmarkConfig &cfg, const std::string &deviceName = "") {
+    std::cout << "\n=== SOFIE Alpaka Benchmark ===\n";
+    if (!deviceName.empty())
+        PrintDeviceInfo(deviceName);
+
+    std::ostringstream out;
+    out << "Warmup: " << cfg.warmupIter
+        << "  |  Iterations: " << cfg.benchIter;
+    if (cfg.validateOrt)
+        out << "  |  ONNXRuntime validation ON (tol=" << cfg.tolerance << ")";
+    out << "\n\n";
+
+    if (cfg.csvOutput) {
+        out << "Model,InputElems,OutputElems,AvgInferMs,Throughput(inf/s),"
+               "WeightMem(MB),RuntimeMem(MB),OrtMatch,OrtMaxDiff\n";
+    } else {
+        out << std::left
+            << std::setw(30) << "Model"
+            << std::setw(12) << "Input"
+            << std::setw(12) << "Output"
+            << std::setw(14) << "Avg(ms)"
+            << std::setw(16) << "Throughput(i/s)"
+            << std::setw(12) << "ORT Check"
+            << "\n";
+        // 96 = 30 + 12 + 12 + 14 + 16 + 12, the sum of the column widths.
+        out << std::string(96, '-') << "\n";
+    }
+    std::cout << out.str();
+}
+
+/// Print one result row in the format selected by cfg.csvOutput.
+///
+/// Skipped results produce a "[SKIPPED: reason]" line in text mode and no
+/// output at all in CSV mode.
+///
+/// The row is assembled on a local stream: std::fixed and std::setprecision
+/// are sticky, and applying them to std::cout used to (a) leak into whatever
+/// the caller printed next and (b) make the CSV OrtMaxDiff column inherit the
+/// two-decimal fixed format of the preceding memory column, so a typical
+/// mismatch such as 5e-4 was written as "0.00".
+inline void PrintResult(const BenchmarkResult &r, const BenchmarkConfig &cfg) {
+    std::ostringstream out;
+
+    if (r.skipped) {
+        if (!cfg.csvOutput) {
+            out << std::left << std::setw(30) << r.modelName
+                << "  [SKIPPED: " << r.skipReason << "]\n";
+            std::cout << out.str();
+        }
+        return;
+    }
+
+    if (cfg.csvOutput) {
+        out << r.modelName << ","
+            << r.inputElements << ","
+            << r.outputElements << ","
+            << std::fixed << std::setprecision(4) << r.avgInferMs << ","
+            << std::fixed << std::setprecision(1) << r.throughput << ","
+            << std::fixed << std::setprecision(2) << r.weightMemMB << ","
+            << std::fixed << std::setprecision(2) << r.runtimeMemMB << ",";
+        if (r.ortRan)
+            // Back to default float formatting so small differences keep
+            // their significant digits.
+            out << (r.ortMatch ? "PASS" : "FAIL") << ","
+                << std::defaultfloat << std::setprecision(6) << r.ortMaxDiff;
+        else
+            out << "N/A,N/A";
+        out << "\n";
+    } else {
+        std::string ortStr = "N/A";
+        if (r.ortRan) {
+            std::ostringstream oss;
+            oss << (r.ortMatch ? "PASS" : "FAIL")
+                << "(d=" << std::scientific << std::setprecision(1) << r.ortMaxDiff << ")";
+            ortStr = oss.str();
+        }
+        out << std::left
+            << std::setw(30) << r.modelName
+            << std::setw(12) << r.inputElements
+            << std::setw(12) << r.outputElements
+            << std::setw(14) << std::fixed << std::setprecision(4) << r.avgInferMs
+            << std::setw(16) << std::fixed << std::setprecision(1) << r.throughput
+            << std::setw(12) << ortStr
+            << "\n";
+    }
+    std::cout << out.str();
+}
+
+/// Print the closing summary line (text mode only; nothing in CSV mode):
+/// number of models benchmarked, number skipped, the mean of the per-model
+/// average inference times, and the number of ORT mismatches.
+inline void PrintSummary(const std::vector<BenchmarkResult> &results, const BenchmarkConfig &cfg) {
+    if (cfg.csvOutput) return;
+
+    int ran = 0, skipped = 0, ortFail = 0;
+    float totalMs = 0.0f;
+    for (const auto &r : results) {
+        if (r.skipped) { ++skipped; continue; }
+        ++ran;
+        totalMs += r.avgInferMs;
+        if (r.ortRan && !r.ortMatch) ++ortFail;
+    }
+
+    // Local stream: keeps std::fixed / precision away from std::cout.
+    std::ostringstream out;
+    out << "\n" << std::string(96, '=') << "\n";
+    out << "Summary: " << ran << " model(s) benchmarked";
+    if (skipped) out << ", " << skipped << " skipped";
+    if (ran > 0)  out << ", avg inference " << std::fixed << std::setprecision(4) << (totalMs / ran) << " ms";
+    if (ortFail)  out << ", " << ortFail << " ORT mismatch(es)";
+    out << "\n";
+    std::cout << out.str();
+}
+
+} // namespace sofie_bench
diff --git a/benchmark/src/ModelBench.cu.in b/benchmark/src/ModelBench.cu.in
new file mode 100644
index 0000000..4717407
--- /dev/null
+++ b/benchmark/src/ModelBench.cu.in
@@ -0,0 +1,5 @@
+// Per-model compilation unit for: @MODEL_NAME@
+// Each model is compiled in isolation so the GPU template instantiation
+// for one model's Session<tagAcc> does not share a translation unit with
+// every other model's (which would OOM the CUDA compiler on the all-in-one TU).
+#include "@MODEL_NAME@_bench.hxx"
diff --git a/benchmark/src/ONNXRuntimeBenchmark.hxx b/benchmark/src/ONNXRuntimeBenchmark.hxx
new file mode 100644
index 0000000..9d3ddd3
--- /dev/null
+++ b/benchmark/src/ONNXRuntimeBenchmark.hxx
@@ -0,0 +1,210 @@
+// SOFIE Benchmark — ONNX Runtime GPU backend
+// Generic benchmark: loads any ONNX model, introspects shapes, runs with the
+// CUDA ExecutionProvider.  Float inputs are filled with uniform random values;
+// integer inputs are zeroed (safe for index tensors like edge_index).
+//
+// Data stays on the HOST side of the ORT API (ORT handles H↔D transfers
+// internally) — this measures end-to-end latency from the application's
+// perspective.  Use the optional IOBinding path (--ort-device-io, WIP) to
+// measure pure GPU compute time comparable to the SOFIE numbers.
+#pragma once
+
+#include <onnxruntime_cxx_api.h>
+#include <cuda_runtime.h>
+
+#include <chrono>
+#include <cstdio>
+#include <cstring>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace sofie_ort_bench_detail {
+
+/// Number of elements described by @p shape.
+/// Every non-positive extent (the -1 dynamic marker, but also 0) counts as 1,
+/// so an empty shape yields 1.
+inline std::size_t shapeToSize(const std::vector<int64_t>& shape) {
+    std::size_t count = 1;
+    for (std::size_t axis = 0; axis < shape.size(); ++axis) {
+        const int64_t extent = shape[axis];
+        if (extent <= 0)
+            continue;  // contributes a factor of 1
+        count *= static_cast<std::size_t>(extent);
+    }
+    return count;
+}
+
+/// Short display label for an ONNX tensor element type.
+/// Types without an entry in the table are reported as "other".
+inline const char* ortTypeName(ONNXTensorElementDataType t) {
+    struct Label {
+        ONNXTensorElementDataType type;
+        const char*               text;
+    };
+    static const Label kLabels[] = {
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,  "float32"},
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, "float64"},
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32,  "int32"},
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64,  "int64"},
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8,  "uint8"},
+        {ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL,   "bool"},
+    };
+    for (const Label& entry : kLabels) {
+        if (entry.type == t)
+            return entry.text;
+    }
+    return "other";
+}
+
+} // namespace sofie_ort_bench_detail
+
+/// Run @p model_path through ONNX Runtime's CUDAExecutionProvider and print
+/// one result line to stdout:
+///
+///     <model_name> [ORT-GPU]  avg <ms> ms  (<inf/s> inf/s)
+///
+/// Inputs are created on the host from the model's declared input types and
+/// shapes: negative (dynamic) dimensions are replaced by 1, float/double
+/// inputs are filled with uniform random values in [-1, 1) from a fixed seed,
+/// and integer / bool inputs are zero-filled.
+///
+/// @param model_path   Full path to the .onnx file.
+/// @param model_name   Display name shown in the result line (typically the stem).
+/// @param warmup       Number of warm-up iterations (not timed).
+/// @param iterations   Number of timed iterations.  When it is not positive
+///                     no timed run happens and the average is reported as 0.
+/// @param device_id    CUDA device index (default 0).
+/// @param verbose      If true, print per-input shape/type information.
+///
+/// @throws std::runtime_error  for an input element type that is not handled
+///                             below, or when cudaDeviceSynchronize reports
+///                             an error.
+inline void BenchmarkORT_GPU(const std::string& model_path,
+                              const std::string& model_name,
+                              int warmup,
+                              int iterations,
+                              int  device_id = 0,
+                              bool verbose   = false)
+{
+    using namespace sofie_ort_bench_detail;
+
+    // Synchronise with the device and surface any error instead of silently
+    // timing a failed run.
+    auto syncDevice = [](const char* stage) {
+        const cudaError_t status = cudaDeviceSynchronize();
+        if (status != cudaSuccess) {
+            throw std::runtime_error(
+                std::string("BenchmarkORT_GPU: cudaDeviceSynchronize failed after ") +
+                stage + ": " + cudaGetErrorString(status));
+        }
+    };
+
+    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "sofie_ort_bench");
+
+    Ort::SessionOptions opts;
+    opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
+    opts.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+
+    OrtCUDAProviderOptions cuda_opts{};
+    cuda_opts.device_id                 = device_id;
+    cuda_opts.arena_extend_strategy     = 0;   // kNextPowerOfTwo
+    cuda_opts.gpu_mem_limit             = SIZE_MAX;
+    cuda_opts.cudnn_conv_algo_search    = OrtCudnnConvAlgoSearchExhaustive;
+    cuda_opts.do_copy_in_default_stream = 1;
+    opts.AppendExecutionProvider_CUDA(cuda_opts);
+
+    Ort::Session session(env, model_path.c_str(), opts);
+    Ort::AllocatorWithDefaultOptions alloc;
+    Ort::MemoryInfo mem_cpu =
+        Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+
+    const std::size_t num_inputs = session.GetInputCount();
+
+    // The name strings own the storage that input_names_ptr points into, and
+    // the per-type data vectors own the buffers wrapped by the tensors below;
+    // all of them are sized up front and must outlive the Run() calls.
+    std::vector<std::string>      input_names_str(num_inputs);
+    std::vector<const char*>      input_names_ptr(num_inputs);
+    std::vector<std::vector<int64_t>> input_shapes(num_inputs);
+    std::vector<ONNXTensorElementDataType> input_types(num_inputs);
+
+    std::vector<std::vector<float>>   float_data(num_inputs);
+    std::vector<std::vector<double>>  double_data(num_inputs);
+    std::vector<std::vector<int64_t>> int64_data(num_inputs);
+    std::vector<std::vector<int32_t>> int32_data(num_inputs);
+    std::vector<std::vector<uint8_t>> uint8_data(num_inputs);
+
+    // Fixed seed: every run feeds the same pseudo-random inputs.
+    std::mt19937 rng(42);
+    std::uniform_real_distribution<float> fdist(-1.f, 1.f);
+
+    std::vector<Ort::Value> input_tensors;
+    input_tensors.reserve(num_inputs);
+
+    for (std::size_t i = 0; i < num_inputs; ++i) {
+        auto name_ptr = session.GetInputNameAllocated(i, alloc);
+        input_names_str[i] = name_ptr.get();
+        input_names_ptr[i] = input_names_str[i].c_str();
+
+        auto info = session.GetInputTypeInfo(i);
+        auto tinfo = info.GetTensorTypeAndShapeInfo();
+        input_types[i]  = tinfo.GetElementType();
+        input_shapes[i] = tinfo.GetShape();
+
+        // Pin dynamic dimensions to 1 so a concrete tensor can be created.
+        for (auto& d : input_shapes[i]) if (d < 0) d = 1;
+
+        std::size_t n = shapeToSize(input_shapes[i]);
+
+        if (verbose) {
+            std::printf("  Input %-2zu  %-20s  type=%-8s  numel=%zu\n",
+                i, input_names_str[i].c_str(),
+                ortTypeName(input_types[i]), n);
+        }
+
+        switch (input_types[i]) {
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: {
+                float_data[i].resize(n);
+                for (auto& v : float_data[i]) v = fdist(rng);
+                input_tensors.push_back(Ort::Value::CreateTensor<float>(
+                    mem_cpu, float_data[i].data(), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: {
+                double_data[i].resize(n, 0.0);
+                for (auto& v : double_data[i])
+                    v = static_cast<double>(fdist(rng));
+                input_tensors.push_back(Ort::Value::CreateTensor<double>(
+                    mem_cpu, double_data[i].data(), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: {
+                int64_data[i].assign(n, 0);
+                input_tensors.push_back(Ort::Value::CreateTensor<int64_t>(
+                    mem_cpu, int64_data[i].data(), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: {
+                int32_data[i].assign(n, 0);
+                input_tensors.push_back(Ort::Value::CreateTensor<int32_t>(
+                    mem_cpu, int32_data[i].data(), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: {
+                uint8_data[i].assign(n, 0);
+                input_tensors.push_back(Ort::Value::CreateTensor<uint8_t>(
+                    mem_cpu, uint8_data[i].data(), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: {
+                // Bool inputs reuse the byte storage (all zero, i.e. false).
+                uint8_data[i].assign(n, 0);
+                input_tensors.push_back(Ort::Value::CreateTensor<bool>(
+                    mem_cpu,
+                    reinterpret_cast<bool*>(uint8_data[i].data()), n,
+                    input_shapes[i].data(), input_shapes[i].size()));
+                break;
+            }
+            default:
+                throw std::runtime_error(
+                    std::string("BenchmarkORT_GPU: unsupported input type for ") +
+                    input_names_str[i]);
+        }
+    }
+
+    const std::size_t num_outputs = session.GetOutputCount();
+    std::vector<std::string> output_names_str(num_outputs);
+    std::vector<const char*> output_names_ptr(num_outputs);
+    for (std::size_t i = 0; i < num_outputs; ++i) {
+        auto ptr = session.GetOutputNameAllocated(i, alloc);
+        output_names_str[i] = ptr.get();
+        output_names_ptr[i] = output_names_str[i].c_str();
+    }
+
+    Ort::RunOptions run_opts;
+
+    // Warm-up (not timed).
+    for (int w = 0; w < warmup; ++w) {
+        session.Run(run_opts,
+                    input_names_ptr.data(),  input_tensors.data(),  num_inputs,
+                    output_names_ptr.data(), num_outputs);
+    }
+    syncDevice("warm-up");
+
+    // Timed runs: wall-clock time around the whole loop, device included.
+    auto t0 = std::chrono::high_resolution_clock::now();
+    for (int it = 0; it < iterations; ++it) {
+        session.Run(run_opts,
+                    input_names_ptr.data(),  input_tensors.data(),  num_inputs,
+                    output_names_ptr.data(), num_outputs);
+    }
+    syncDevice("timed runs");
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    // Guard the division: with no timed iterations there is no average, and
+    // dividing by zero would print inf/nan.
+    const double total_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
+    double avg_ms     = (iterations > 0) ? total_ms / iterations : 0.0;
+    double throughput = (avg_ms > 0.0) ? 1000.0 / avg_ms : 0.0;
+
+    std::string label = std::string(model_name) + " [ORT-GPU]";
+    std::printf("%-30s  avg %8.4f ms  (%8.1f inf/s)\n",
+                label.c_str(), avg_ms, throughput);
+}
diff --git a/check_style.sh b/check_style.sh
new file mode 100644
index 0000000..22a56e4
--- /dev/null
+++ b/check_style.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Reformat (clang-format, in place) and lint (clang-tidy) every C/C++ source
+# and header found under the include and tests directories.
+#
+# The script stops at the first failing command (set -e), with one deliberate
+# exception: clang-tidy findings never fail the run.
+set -e
+
+# Directories
+SRC_DIR="./include"
+TEST_DIR="./tests"
+
+echo "📝 Discovering source/header files..."
+
+# One path per line; the tests build directory is pruned from the search.
+FILES=$(find "$SRC_DIR" "$TEST_DIR" \
+    -path "$TEST_DIR/build" -prune -o \
+    -type f \( \
+        -name '*.cpp' -o -name '*.cc' -o -name '*.cxx' -o \
+        -name '*.h' -o -name '*.hpp' -o -name '*.hxx' -o -name '*.hh' \
+    \) -print)
+
+if [ -z "$FILES" ]; then
+    echo "⚠️ No files found to process."
+    exit 0
+fi
+
+echo "🎯 Files to check:"
+echo "$FILES"
+
+# The list is read line by line instead of being expanded unquoted: an
+# unquoted expansion is word-split and glob-expanded, which breaks on paths
+# containing spaces or wildcard characters.  The list is fed through file
+# descriptor 3 so the tools keep their own stdin.
+echo "🎨 Running clang-format..."
+while IFS= read -r file <&3; do
+    echo "Formatting $file"
+    clang-format -i "$file"
+done 3<<< "$FILES"
+
+echo "🔍 Running clang-tidy..."
+while IFS= read -r file <&3; do
+    echo "Linting $file"
+    # Lint findings are advisory: "|| true" keeps set -e from aborting.
+    clang-tidy "$file" --extra-arg=-std=c++20 -- -I"$SRC_DIR" || true
+done 3<<< "$FILES"
+
+echo "✅ Formatting and linting complete."
diff --git a/cmake/SOFIEConfig.cmake.in b/cmake/SOFIEConfig.cmake.in
new file mode 100644
index 0000000..94ebc4a
--- /dev/null
+++ b/cmake/SOFIEConfig.cmake.in
@@ -0,0 +1,13 @@
+@PACKAGE_INIT@
+
+# Package configuration file for SOFIE, loaded by find_package(SOFIE).
+# The first line is the initialisation block that CMake's
+# configure_package_config_file() fills in; it also provides
+# check_required_components(), used at the end of this file.
+
+include(CMakeFindDependencyMacro)
+
+# Dependencies are re-found in the consuming project before the imported
+# targets are loaded.
+find_dependency(Protobuf)
+
+# ROOT is only looked up when the value substituted into this condition at
+# configure time is true.
+if(@SOFIE_WITH_ROOT@)
+  find_dependency(ROOT COMPONENTS Core TMVA Tree)
+endif()
+
+# Imported targets exported alongside this file.
+include("${CMAKE_CURRENT_LIST_DIR}/SOFIETargets.cmake")
+
+check_required_components(SOFIE)
diff --git a/cmake/modules/SofieTestMacros.cmake b/cmake/modules/SofieTestMacros.cmake
new file mode 100644
index 0000000..1f4d235
--- /dev/null
+++ b/cmake/modules/SofieTestMacros.cmake
@@ -0,0 +1,73 @@
+# Fallback test macros used when ROOT is not available.
+# These provide the same interface as ROOTTEST_GENERATE_EXECUTABLE and
+# ROOTTEST_ADD_TEST from RoottestMacros.cmake but without requiring ROOT.
+
+# ROOTTEST_GENERATE_EXECUTABLE(<executable> <source>...
+#     [LIBRARIES <lib>...] [COMPILE_FLAGS <flag>...] [DEPENDS <target>...]
+#     [FIXTURES_SETUP <name>...] [FIXTURES_CLEANUP <name>...]
+#     [FIXTURES_REQUIRED <name>...] [RESOURCE_LOCK <name>])
+#
+# Declares <executable> from the remaining (unparsed) arguments, excluded from
+# the default build, and registers a CTest test named "<executable>-build"
+# that builds the target.  The fixture options and the resource lock are
+# attached to that build test.
+#
+# This is a macro, so the ARG_* and _sofie_* variables it sets are visible in
+# the caller's scope.
+macro(ROOTTEST_GENERATE_EXECUTABLE executable)
+  cmake_parse_arguments(ARG "" "RESOURCE_LOCK"
+    "LIBRARIES;COMPILE_FLAGS;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED"
+    ${ARGN})
+
+  add_executable(${executable} EXCLUDE_FROM_ALL ${ARG_UNPARSED_ARGUMENTS})
+  set_target_properties(${executable} PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+  if(ARG_DEPENDS)
+    add_dependencies(${executable} ${ARG_DEPENDS})
+  endif()
+
+  if(ARG_LIBRARIES)
+    target_link_libraries(${executable} ${ARG_LIBRARIES})
+  endif()
+
+  if(ARG_COMPILE_FLAGS)
+    # COMPILE_FLAGS accepts several values, but the property is one
+    # space-separated string.  Passing the list unquoted hands
+    # set_target_properties() stray arguments as soon as there are two flags,
+    # so join the list first.
+    string(REPLACE ";" " " _sofie_compile_flags "${ARG_COMPILE_FLAGS}")
+    set_target_properties(${executable} PROPERTIES
+      COMPILE_FLAGS "${_sofie_compile_flags}")
+  endif()
+
+  set(_sofie_build_test ${executable}-build)
+  add_test(NAME ${_sofie_build_test}
+    COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target ${executable})
+
+  if(ARG_FIXTURES_SETUP)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP})
+  endif()
+  if(ARG_FIXTURES_CLEANUP)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP})
+  endif()
+  if(ARG_FIXTURES_REQUIRED)
+    set_property(TEST ${_sofie_build_test} PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED})
+  endif()
+  if(ARG_RESOURCE_LOCK)
+    # Previously parsed but dropped; honour it so callers can serialise
+    # build tests that share a resource.
+    set_property(TEST ${_sofie_build_test} PROPERTY RESOURCE_LOCK ${ARG_RESOURCE_LOCK})
+  endif()
+endmacro()
+
+# ROOTTEST_ADD_TEST(<testname> (EXEC <cmd>... | COMMAND <cmd>...)
+#     [WORKING_DIR <dir>] [TIMEOUT <seconds>] [RESOURCE_LOCK <name>]
+#     [FIXTURES_SETUP <name>...] [FIXTURES_CLEANUP <name>...]
+#     [FIXTURES_REQUIRED <name>...] [ENVIRONMENT <var=value>...])
+#
+# Registers a CTest test that runs the given command.  EXEC takes precedence
+# over COMMAND; giving neither is a fatal error.  The test runs in WORKING_DIR
+# when it is given and in the current binary directory otherwise.
+#
+# NOTE(review): DEPENDS and PROPERTIES are accepted for interface
+# compatibility but are not applied by this fallback — confirm whether any
+# caller relies on them.
+function(ROOTTEST_ADD_TEST testname)
+  cmake_parse_arguments(ARG ""
+    "WORKING_DIR;TIMEOUT;RESOURCE_LOCK"
+    "EXEC;COMMAND;DEPENDS;FIXTURES_SETUP;FIXTURES_CLEANUP;FIXTURES_REQUIRED;ENVIRONMENT;PROPERTIES"
+    ${ARGN})
+
+  if(ARG_EXEC)
+    set(_cmd ${ARG_EXEC})
+  elseif(ARG_COMMAND)
+    set(_cmd ${ARG_COMMAND})
+  else()
+    message(FATAL_ERROR "ROOTTEST_ADD_TEST: must specify EXEC or COMMAND")
+  endif()
+
+  # WORKING_DIR used to be parsed and then ignored; fall back to the binary
+  # directory only when the caller did not ask for a specific one.
+  if(ARG_WORKING_DIR)
+    set(_workdir ${ARG_WORKING_DIR})
+  else()
+    set(_workdir ${CMAKE_CURRENT_BINARY_DIR})
+  endif()
+
+  add_test(NAME ${testname} COMMAND ${_cmd}
+    WORKING_DIRECTORY ${_workdir})
+
+  if(ARG_FIXTURES_SETUP)
+    set_property(TEST ${testname} PROPERTY FIXTURES_SETUP ${ARG_FIXTURES_SETUP})
+  endif()
+  if(ARG_FIXTURES_CLEANUP)
+    set_property(TEST ${testname} PROPERTY FIXTURES_CLEANUP ${ARG_FIXTURES_CLEANUP})
+  endif()
+  if(ARG_FIXTURES_REQUIRED)
+    set_property(TEST ${testname} PROPERTY FIXTURES_REQUIRED ${ARG_FIXTURES_REQUIRED})
+  endif()
+  if(ARG_ENVIRONMENT)
+    set_property(TEST ${testname} PROPERTY ENVIRONMENT ${ARG_ENVIRONMENT})
+  endif()
+  if(ARG_TIMEOUT)
+    set_property(TEST ${testname} PROPERTY TIMEOUT ${ARG_TIMEOUT})
+  endif()
+  if(ARG_RESOURCE_LOCK)
+    # Previously parsed and then ignored.
+    set_property(TEST ${testname} PROPERTY RESOURCE_LOCK ${ARG_RESOURCE_LOCK})
+  endif()
+endfunction()
diff --git a/src/SOFIE_core/CMakeLists.txt b/core/CMakeLists.txt
similarity index 72%
rename from src/SOFIE_core/CMakeLists.txt
rename to core/CMakeLists.txt
index 7297957..a99f6d4 100644
--- a/src/SOFIE_core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -17,6 +17,8 @@ set(sources_headers
    SOFIE/OperatorList.hxx
    SOFIE/RModel_Base.hxx
    SOFIE/RModel.hxx
+   SOFIE/RModelProfiler.hxx
+   SOFIE/RModelProfilerGPU.hxx
    SOFIE/ROperator.hxx
    SOFIE/ROperator_BasicUnary.hxx
    SOFIE/ROperator_BasicBinary.hxx
@@ -76,6 +78,9 @@ list(TRANSFORM sources_headers PREPEND "inc/")
 set(sources_cxx
     src/RModel_Base.cxx
     src/RModel.cxx
+    src/RModelProfiler.cxx
+    src/RModelProfilerGPU.cxx
+    src/RModel_ALPAKA.cxx
     src/RModel_GNN.cxx
     src/RModel_GraphIndependent.cxx
     src/RFunction.cxx
@@ -86,24 +91,33 @@ set(sources_cxx
 )
 
 target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx})
-target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc)
-target_link_libraries(SOFIE_core PUBLIC
-    Tree
-    Core
-    RIO
+target_include_directories(SOFIE_core PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
+target_link_libraries(SOFIE_core PUBLIC utils)
 
-ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers}
-    LINKDEF inc/LinkDef.h
-    MODULE SOFIE_core
-    OPTIONS --deep
-)
+if(SOFIE_WITH_ROOT AND ROOT_FOUND)
+  # Optional ROOT support: expose SOFIE_SUPPORT_ROOT_BINARY and link the ROOT libraries.
+  target_compile_definitions(SOFIE_core PUBLIC SOFIE_SUPPORT_ROOT_BINARY)
+  target_link_libraries(SOFIE_core PUBLIC Tree Core RIO)
+  ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers}
+      LINKDEF inc/LinkDef.h
+      MODULE SOFIE_core
+      OPTIONS --deep
+  )
+
+  # Install the dictionary files alongside libSOFIE_core: same libdir (it can be lib64).
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm
+                ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap
+          DESTINATION ${CMAKE_INSTALL_LIBDIR})
+endif()
 
 install(TARGETS SOFIE_core
-        LIBRARY DESTINATION lib
+  EXPORT SOFIETargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/"
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
 )
-install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include")
 
-if(testing)
-  add_subdirectory(test)
-endif()
diff --git a/src/SOFIE_core/README.md b/core/README.md
similarity index 96%
rename from src/SOFIE_core/README.md
rename to core/README.md
index 033cad4..b0a50a1 100644
--- a/src/SOFIE_core/README.md
+++ b/core/README.md
@@ -12,10 +12,10 @@ This is a new development in TMVA and is currently in early experimental stage.
 
 ## Installation
 
-Build ROOT with the cmake option tmva-sofie enabled.
+Build ROOT with the cmake option sofie enabled.
 
 ```bash
-cmake ../root -Dtmva-sofie=ON
+cmake ../root -Dsofie=ON
 make -j8
 ```
 
@@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. With SOFIE, the user get
 From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model:
 
 ```c++
-using namespace TMVA::Experimental;
 SOFIE::RModelParser_ONNX parser;
 SOFIE::RModel model = parser.Parse(“./example_model.onnx”);
 model.Generate();
@@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t
 
 Here is the updated list of supported ONNX operators. You can obtain this list by doing
 ```cpp
-using namespace TMVA::Experimental;
 SOFIE::RModelParser_ONNX parser;
 std::vector<std::string> supportedOperators = parser.GetRegisteredOperators();
 ```
@@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types:
 
 You can also check your model whether all operators are implemented by doing the following:
 ```c++
-using namespace TMVA::Experimental;
 SOFIE::RModelParser_ONNX parser;
 parser.CheckModel("example_model.ONNX");
 ```
diff --git a/src/SOFIE_core/inc/LinkDef.h b/core/inc/LinkDef.h
similarity index 100%
rename from src/SOFIE_core/inc/LinkDef.h
rename to core/inc/LinkDef.h
diff --git a/src/SOFIE_core/inc/SOFIE/FunctionList.hxx b/core/inc/SOFIE/FunctionList.hxx
similarity index 100%
rename from src/SOFIE_core/inc/SOFIE/FunctionList.hxx
rename to core/inc/SOFIE/FunctionList.hxx
diff --git a/src/SOFIE_core/inc/SOFIE/OperatorList.hxx b/core/inc/SOFIE/OperatorList.hxx
similarity index 100%
rename from src/SOFIE_core/inc/SOFIE/OperatorList.hxx
rename to core/inc/SOFIE/OperatorList.hxx
diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/core/inc/SOFIE/RFunction.hxx
similarity index 98%
rename from src/SOFIE_core/inc/SOFIE/RFunction.hxx
rename to core/inc/SOFIE/RFunction.hxx
index 53c30e3..f79691a 100644
--- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx
+++ b/core/inc/SOFIE/RFunction.hxx
@@ -3,6 +3,7 @@
 
 #include "SOFIE/RModel_Base.hxx"
 #include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
 
 #include <memory>
 #include <string>
diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx b/core/inc/SOFIE/RFunction_MLP.hxx
similarity index 90%
rename from src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx
rename to core/inc/SOFIE/RFunction_MLP.hxx
index 8dfc0e1..d9f8626 100644
--- a/src/SOFIE_core/inc/SOFIE/RFunction_MLP.hxx
+++ b/core/inc/SOFIE/RFunction_MLP.hxx
@@ -15,7 +15,7 @@ enum class Activation {
 
 class RFunction_MLP: public RFunction_Update {
 private:
-    Int_t fNumLayers;           // Number of Layers in MLP
+    int fNumLayers;             // Number of Layers in MLP (plain int: same type as the former ROOT Int_t)
     Activation fActivationFunction;
     bool  fActivateFinal;       // if True, fActivationFunction is applied as the activation for the last layer
     std::vector<std::string> fKernelTensors;
@@ -23,7 +23,7 @@ private:
 
 public:
     virtual ~RFunction_MLP() {}
-    RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN);
+    RFunction_MLP(FunctionTarget target, int numLayers, Activation activation_function=Activation::RELU, bool activate_final=false, GraphType gType=GraphType::GNN);
 
     void Initialize();
 
diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx b/core/inc/SOFIE/RFunction_Mean.hxx
similarity index 100%
rename from src/SOFIE_core/inc/SOFIE/RFunction_Mean.hxx
rename to core/inc/SOFIE/RFunction_Mean.hxx
diff --git a/src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx b/core/inc/SOFIE/RFunction_Sum.hxx
similarity index 100%
rename from src/SOFIE_core/inc/SOFIE/RFunction_Sum.hxx
rename to core/inc/SOFIE/RFunction_Sum.hxx
diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/core/inc/SOFIE/RModel.hxx
similarity index 60%
rename from src/SOFIE_core/inc/SOFIE/RModel.hxx
rename to core/inc/SOFIE/RModel.hxx
index 79541af..8153408 100644
--- a/src/SOFIE_core/inc/SOFIE/RModel.hxx
+++ b/core/inc/SOFIE/RModel.hxx
@@ -10,20 +10,33 @@ namespace SOFIE {
 
 class RModel final : public RModel_Base {
 
+   friend class RModelProfiler;
+   friend class RModelProfilerGPU;
+
 private:
    bool fIsInitialized = false;
    bool fIsSubGraph = false;
+   bool fUseVDT = false;
+   bool fProfile = false;
    int fVerbose = 0;
    int fBatchSize = -1;
    long fReadPos = 0;  // reading file position
+   size_t fConstantTensorSize = 0; // size  (in Bytes) of the allocated constant tensors
+   size_t fWeightsTensorSize = 0;  // size  (in Bytes) of the allocated weight tensors
+   size_t fOtherTensorSize = 0;    // size  (in Bytes) of intermediate tensors which are not managed by the memory pool
+
+   OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended;
 
    std::unordered_map<std::string, InputTensorInfo> fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs?
    std::unordered_map<std::string, TensorInfo> fReadyInputTensorInfos; // input tensors where shape is full defined
    std::unordered_map<std::string, InitializedTensor> fInitializedTensors;
    std::unordered_map<std::string, TensorInfo> fIntermediateTensorInfos;
    std::unordered_map<std::string, DynamicTensorInfo> fDynamicTensorInfos;
+   std::unordered_map<std::string, std::pair<std::vector<Dim>, bool>> fShapeTensors; // constant tensors describing a shape
+   std::unordered_map<std::string, std::string> fAliasTensors; // alias tensors (name -> original tensor name)
    std::unordered_map<std::string, std::string>
       fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value
+   std::vector<std::string> fDimShapeNames; // parameter names used to define the shapes
    std::vector<std::string> fOutputTensorNames;
    std::vector<std::string> fInputTensorNames; // input tensor names using ONNX order
 
@@ -38,6 +51,30 @@ private:
    MemoryPoolInfo fIntermediateMemoryInfo;    ///<!  intermediate memory info (transient)
    std::unordered_map<std::string_view, size_t> fIntermediateTensorFrequencyLookup;    ///<!  lookup table for intermediate tensor frequency (transient)
 
+   std::string fExtraCodeForDimShapes; // extra code needed for initialization of dynamic parameters (e.g. number of non zero elements in NonZero operator)
+
+   // GPU ALPAKA elementwise kernel fusion state (transient, computed in GenerateGPU_ALPAKA)
+   struct EltwiseFusionGroup {
+      std::vector<size_t> opIndices; ///< consecutive op indices forming this group
+      std::string inputTensor;       ///< input tensor name of the first op
+      std::string outputTensor;      ///< output tensor name of the last op
+      size_t numElements = 0;
+      bool isFused() const { return opIndices.size() > 1; }
+      std::string suffix() const {
+         std::string s;
+         for (auto i : opIndices) s += "_" + std::to_string(i);
+         return s;
+      }
+   };
+   std::vector<EltwiseFusionGroup> fEltwiseFusionGroups; ///<!
+   std::unordered_map<size_t, size_t> fOpToFusionGroupIdx; ///<!  op_idx -> fusion group index
+   std::set<std::string> fFusionIntermediateTensors;        ///<!  intermediate tensors whose alloc is skipped
+   std::set<size_t>      fSkipOperators;                    ///<!  ops swallowed by a preceding fusion (e.g. GEMM+LeakyReLU)
+   void ComputeEltwiseFusionGroups();
+   /// GPU-only pass: fuse GEMM→LeakyReLU (and GEMM→ReLU where not already
+   /// handled by the ONNX parser) into a single in-place kernel sequence.
+   void FuseGemmActivations_GPU();
+
 public:
    // Rule of five: explicitly define move semantics, disallow copy
    RModel(RModel &&other);
@@ -58,9 +95,14 @@ public:
 
    int Verbose() const { return fVerbose;}
 
-   const std::vector<size_t> &GetTensorShape(std::string name) const;
-   std::vector<Dim> GetDynamicTensorShape(std::string name) const;
-   const ETensorType &GetTensorType(std::string name) const;
+   std::vector<size_t> GetTensorShape(const std::string & name) const;
+   std::vector<Dim> GetDimTensorShape(const std::string & name) const;
+   ETensorType GetTensorType(std::string name) const;
+   std::vector<Dim> GetDynamicTensorShape(const std::string & name) const ;
+
+   // get the values for the tensor representing a shape
+   const std::vector<Dim> & GetShapeTensorValues(const std::string & tensor_name) const;
+
 
    bool CheckIfTensorAlreadyExist(std::string tensor_name);
    void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<Dim> shape);
@@ -81,6 +123,7 @@ public:
       size_t length = ConvertShapeToLength(shape);
       std::shared_ptr<void> data_ptr(malloc(length * sizeof(T)), free);
       std::memcpy(data_ptr.get(), (void*) data, length * sizeof(T));
+      std::cout<<"Length of constant tensor "<<name<<" added: "<<length<<std::endl;
       AddConstantTensor(name, GetTemplatedType<T>(T()), shape, data_ptr);
    }
    // for boolean can be more convenient passing an std::vector
@@ -102,6 +145,12 @@ public:
       AddInitializedTensor(tensor_name,  GetTemplatedType(T()), shape, data);
    }
 
+   void AddShapeTensor(const std::string & name, const std::vector<Dim> & shapeValues, bool scalar = false);
+   void AddAliasTensor(const std::string & name, const std::string & origin);
+   bool IsAliasTensor(const std::string & tensor_name) const;
+
+   void AddExtraCodeForDimShapes(const std::string & code) { fExtraCodeForDimShapes += code; }
+
    // add and initialize subgraph to the model
    void InitializeSubGraph(std::shared_ptr<RModel>  graph);
 
@@ -118,13 +167,15 @@ public:
    bool IsDimInputTensor(const std::string &name) const;
    // check if tensor is a fully specified input tensor
    bool IsReadyInputTensor(const std::string &name) const;
+   /// check if a tensor is a shape tensor
+   bool IsShapeTensor(const std::string & name) const;
 
    // Add intermediate tensor
    void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<Dim> dim_shape);
    void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape);
    // Add an intermediate dynamic tensor
    void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector<Dim> shape);
-
+   void AddShapeParam(const std::string & name, size_t def_value = 0);
    void AddInputTensorName(std::string name);
    void AddOutputTensorNameList(std::vector<std::string> output_tensor_names);
    void
@@ -132,6 +183,9 @@ public:
    void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape,
                                 std::shared_ptr<void> data);
    std::shared_ptr<void> GetInitializedTensorData(std::string tensor_name);
+   void RemoveInitializedTensor(std::string tensor_name);
+   template<class T>
+   std::vector<T> GetTensorData(const std::string & name);
 
    void Initialize(int batchSize = -1, bool verbose = false);
    void Initialize(const std::map<std::string,size_t> & inputParams, bool verbose = false);
@@ -141,40 +195,77 @@ public:
    {
       Generate(static_cast<std::underlying_type_t<Options>>(options), batchSize, pos, verbose);
    }
+   void GenerateGPU_ALPAKA(std::underlying_type_t<Options> options, int batchSize = -1, bool verbose = false);
+   void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false)
+   {
+      GenerateGPU_ALPAKA(static_cast<std::underlying_type_t<Options>>(options), batchSize, verbose);
+   }
    // generate the infer function signature. If isdecl= false generate the calling infer function
    // used to infer the sub-graphs
    std::string GenerateInferSignature(bool isdecl = true);
 
+   // generate the infer function signature for inference on ALPAKA. If isdecl= false generate the calling infer function
+   // used to infer the sub-graphs
+   std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true);
+
+   // generate the _infer_impl signature using ViewPlainPtr types instead of Buf types
+   std::string GenerateImplSignature_GPU_ALPAKA(bool isdecl = true);
+
+   void RemoveIntermediateTensor(const std::string& tensor_name){
+      fIntermediateTensorInfos.erase(tensor_name);
+   }
+
    // calculate total intermediate memory and position intermediate tensor addresses
-   std::string AllocateIntermediateMemory(std::span<const std::string_view> op_output_tensors);
-   void CheckAndFlushIntermediateMemory(std::span<const std::string_view> op_output_tensors, const size_t& op_idx);
+   std::string AllocateIntermediateMemory(std::span<const std::string> op_output_tensors);
+   void CheckAndFlushIntermediateMemory(std::span<const std::string> op_output_tensors, const size_t& op_idx);
 
 protected:
    // internal functions
    // generate code for the initialized tensors
    void GenerateInitializedTensorInfo();
+
+   void GenerateInitializedTensorInfo_GPU_ALPAKA(); 
    // generate code for the intermediate tensors
    void GenerateIntermediateTensorInfo();
+
+   // generate code for the temporary initialized tensors containers
+   void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA();
+
    // generate code for the dynamic tensors
    void GenerateDynamicTensorInfo();
+
+   void GenerateDynamicTensorInfo_GPU_ALPAKA();
    // generate code for declarations needed by operators
    void GenerateOperatorDeclarations();
    // generate code for inference
    void GenerateOutput();
+
+   void GenerateOutput_GPU_ALPAKA();
+
+   void MoveInitializedTensorsToBuffers_ALPAKA();
    // generate code for initializing memory pool for intermediate tensors
    void GenerateIntermediateMemoryPool();
    // Generate all session code
    void GenerateSessionCode();
+   void GenerateSessionCode_GPU_ALPAKA();
+   void GenerateGPU_ALPAKA_Buffers();
+
+   void CheckAndFuseOperators();
+   bool IsInputTensorShapeParam(std::string const &paramName) const;
+   std::vector<std::string> CollectTensorMemberNames(const std::string &input);
+   void GenerateRequiredInputTensorInfo();
 
 public:
    const std::vector<std::string> &GetInputTensorNames() const { return fInputTensorNames; }
    const std::vector<std::string> &GetOutputTensorNames() const { return fOutputTensorNames; }
+   const std::vector<std::string> & GetDimShapeNames() const { return fDimShapeNames; }
 
    void ReadInitializedTensorsFromFile(long);
    long WriteInitializedTensorsToFile(std::string filename = "");
 
-   void PrintIntermediateTensors();
-   void PrintOutputTensors();
+   void PrintIntermediateTensors() const;
+   void PrintOutputTensors() const;
+   void PrintSummary() const;
    void OutputGenerated(std::string filename = "", bool append = false);
    std::vector<std::string> GetOutputTensorNames() { return fOutputTensorNames; }
    void SetFilename(std::string filename) { fName = filename; }
@@ -185,24 +276,46 @@ public:
          //a view only
          T obj;
          if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()){
-            throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n");
+            throw std::runtime_error("sofie: initialized tensor with name " + tensor_name + " already exists \n");
          }
          InitializedTensor new_tensor_ {GetTemplatedType(obj), new_tensor.GetShape() ,
       static_cast<void>(new_tensor.GetData())}; fInitializedTensors[tensor_name] = new_tensor_;
       }
    */
 
-   void PrintRequiredInputTensors();
-   void PrintInitializedTensors();
-   void PrintDynamicTensors();
+   void PrintRequiredInputTensors() const;
+   void PrintInitializedTensors() const;
+   void PrintDynamicTensors() const;
    void HeadInitializedTensors(std::string name, int n_print = 50);
 
    bool UseSession() const { return fUseSession; }
-
+   void SetUseVDT(bool on) {
+      fUseVDT = on;
+   }
+   bool UseVDT() const { return fUseVDT;}
+   
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
    // Use the ClassDef macro to allow definition of custom streaming
    ClassDefNV(RModel, 3);
+#endif
+
 };
 
+template<class T>
+inline std::vector<T> RModel::GetTensorData(const std::string & name) {
+   // data exist only for initialized tensors; every other name yields an empty vector
+   if (!IsInitializedTensor(name)) return {};
+   const T * first = static_cast<const T*>(GetInitializedTensorData(name).get());
+   return std::vector<T>(first, first + ConvertShapeToLength(GetTensorShape(name)));
+}
+
+template<>
+inline std::vector<Dim> RModel::GetTensorData<Dim>(const std::string & name) {
+   // Dim values are looked up among the shape tensors; a copy of the stored values is returned
+   return IsShapeTensor(name) ? GetShapeTensorValues(name) : std::vector<Dim>{};
+}
+
+
 } // namespace SOFIE
 
 #endif // SOFIE_RMODEL
diff --git a/core/inc/SOFIE/RModelProfiler.hxx b/core/inc/SOFIE/RModelProfiler.hxx
new file mode 100644
index 0000000..93e05f7
--- /dev/null
+++ b/core/inc/SOFIE/RModelProfiler.hxx
@@ -0,0 +1,34 @@
+#ifndef SOFIE_RMODELPROFILER
+#define SOFIE_RMODELPROFILER
+
+#include "SOFIE/RModel.hxx"
+
+namespace SOFIE {
+
+/// \class RModelProfiler
+/// \brief Generates profiled inference code for an RModel (CPU path).
+///
+/// Instruments the generated C++ code to measure per-operator execution time
+/// using std::chrono. Activated when RModel::Generate is called with Options::kProfile.
+class RModelProfiler {
+   // All entry points are static; RModel declares this class a friend (see RModel.hxx).
+public:
+   static void AddNeededStdLibs(RModel &model);
+   static std::string GenerateSessionMembers();
+   static std::string GenerateUtilityFunctions();
+   static std::string GenerateBeginInferCode();
+   static std::string GenerateOperatorCode(ROperator &op, size_t op_idx);
+   static std::string GenerateEndInferCode();
+
+   // Not instantiable: the default constructor, copy and move operations are all deleted.
+   RModelProfiler() = delete;
+   ~RModelProfiler() = default;
+   RModelProfiler(const RModelProfiler &) = delete;
+   RModelProfiler(RModelProfiler &&) = delete;
+   RModelProfiler &operator=(const RModelProfiler &) = delete;
+   RModelProfiler &operator=(RModelProfiler &&) = delete;
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_RMODELPROFILER
diff --git a/core/inc/SOFIE/RModelProfilerGPU.hxx b/core/inc/SOFIE/RModelProfilerGPU.hxx
new file mode 100644
index 0000000..bc4aab2
--- /dev/null
+++ b/core/inc/SOFIE/RModelProfilerGPU.hxx
@@ -0,0 +1,52 @@
+#ifndef SOFIE_RMODELPROFILERGPU
+#define SOFIE_RMODELPROFILERGPU
+
+#include <string>
+#include <cstddef>
+#include "SOFIE/RModel.hxx"
+
+namespace SOFIE {
+
+/// \class RModelProfilerGPU
+/// \brief Generates profiled inference code for the GPU/Alpaka path.
+///
+/// Instruments the generated C++ code to measure per-operator GPU execution time
+/// using std::chrono + alpaka::wait for synchronization, and reports CPU/GPU memory usage.
+/// Activated when RModel::GenerateGPU_ALPAKA is called with Options::kProfile.
+class RModelProfilerGPU {
+   // All entry points are static; RModel declares this class a friend (see RModel.hxx).
+public:
+   static void AddNeededStdLibs(RModel &model);
+   static std::string GenerateSessionMembers();
+   static std::string GenerateUtilityFunctions();
+
+   // Memory info: CPU and GPU tensor sizes (in bytes) computed at code-gen time.
+   struct MemoryInfo {
+      // CPU-side
+      size_t constantTensorBytes = 0;   // tensors embedded as C++ arrays (IsConstantTensor)
+      size_t weightTensorBytes = 0;     // tensors loaded from .dat into temporary CPU vectors
+      size_t intermediateCPUBytes = 0;  // intermediate tensor pool (0 in GPU path)
+      // GPU-side
+      size_t weightDeviceBytes = 0;     // ALL initialized tensor device buffers (const + weights)
+      size_t intermediateGPUBytes = 0;  // intermediate device buffers (excl. fused intermediates)
+   };
+
+   static MemoryInfo ComputeMemoryInfo(const RModel &model);
+   static std::string GenerateMemoryReport(const MemoryInfo &info);
+
+   static std::string GenerateBeginInferCode();
+   static std::string GenerateOperatorCode(ROperator &op, size_t op_idx);
+   static std::string GenerateEndInferCode();
+   // Not instantiable: the default constructor, copy and move operations are all deleted.
+   RModelProfilerGPU() = delete;
+   ~RModelProfilerGPU() = default;
+
+   RModelProfilerGPU(const RModelProfilerGPU &) = delete;
+   RModelProfilerGPU(RModelProfilerGPU &&) = delete;
+   RModelProfilerGPU &operator=(const RModelProfilerGPU &) = delete;
+   RModelProfilerGPU &operator=(RModelProfilerGPU &&) = delete;
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_RMODELPROFILERGPU
diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/core/inc/SOFIE/RModel_Base.hxx
similarity index 54%
rename from src/SOFIE_core/inc/SOFIE/RModel_Base.hxx
rename to core/inc/SOFIE/RModel_Base.hxx
index f8a9d34..b598652 100644
--- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx
+++ b/core/inc/SOFIE/RModel_Base.hxx
@@ -12,8 +12,10 @@
 #include <fstream>
 #include <sstream>
 #include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
+
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
 #include "TBuffer.h"
+#endif
 
 
 namespace SOFIE {
@@ -25,12 +27,29 @@ enum class Options {
    kRootBinaryWeightFile = 0x4,
    kGNN = 0x8,
    kGNNComponent = 0x10,
+   kProfile = 0x20,
+};
+
+// Optimization levels inspired by ONNXRuntime.
+// We only get Operator Fusion with the Basic, and
+// memory reuse with Extended. kExtended is enabled
+// by default
+enum class OptimizationLevel {
+   kBasic = 0x0,
+   kExtended = 0x1,
 };
 
 enum class WeightFileType { None, RootBinary, Text };
 
-std::underlying_type_t<Options> operator|(Options opA, Options opB);
-std::underlying_type_t<Options> operator|(std::underlying_type_t<Options> opA, Options opB);
+
+inline std::underlying_type_t<Options> operator|(Options opA, Options opB) {
+    return static_cast<std::underlying_type_t<Options>>(opA) |
+           static_cast<std::underlying_type_t<Options>>(opB);
+}
+
+inline std::underlying_type_t<Options> operator|(std::underlying_type_t<Options> opA, Options opB) {
+    return opA | static_cast<std::underlying_type_t<Options>>(opB);
+}
 
 class RModel_Base {
 
@@ -53,6 +72,45 @@ protected:
    bool fIsGNN = false;
    bool fIsGNNComponent = false;
 
+   // Generate the C++ code declaring and initializing a constant tensor, i.e. a tensor which is
+   // not part of the weight file and can be created from the Constant operator.
+   // Up to 100 elements: emitted as a plain array "tensor_<name>[length]"; larger tensors become a
+   // std::vector "fTensor_<name>" (fill-constructed when all values are equal) plus a data pointer.
+   // Pure string generation: the function writes nothing to stdout.
+   template <typename T>
+   std::string GenerateConstantTensorCode(const std::pair<std::string, InitializedTensor> &t)
+   {
+      std::stringstream strs;
+      std::string type = ConvertTypeToString(t.second.type());
+      size_t length = ConvertShapeToLength(t.second.shape());
+      // avoid using stack sizes for constant tensors to reduce compilation time
+      const bool allocateOnStack = (length <= 100);
+
+      const T *data = t.second.data<T>();
+
+      // for non stack allocation check if all values are the same
+      bool sameData = false;
+      if (!allocateOnStack && length > 1) {
+         size_t idx = 1;
+         do {
+            sameData = (data[idx] == data[idx - 1]);
+            idx++;
+         } while (sameData && idx < length);
+      }
+      if (allocateOnStack) {
+         strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n";
+      } else {
+         strs << "std::vector<" << type << "> fTensor_" << t.first << " = ";
+         if (sameData)
+            strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n";
+         else {
+            strs << ConvertValuesToString(length, data) << ";\n";
+         }
+         strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n";
+      }
+      return strs.str();
+   }
+
 public:
    /**
        Default constructor. Needed to allow serialization of ROOT objects. See
@@ -73,15 +131,15 @@ public:
    }
    void AddNeededStdLib(std::string libname)
    {
-      if (fAllowedStdLib.find(libname) != fAllowedStdLib.end()) {
-         fNeededStdLib.insert(libname);
-      }
+      // if the library is already in the set, insert does nothing
+      fNeededStdLib.insert(libname);
    }
    void AddNeededCustomHeader(std::string filename)
    {
        fCustomOpHeaders.insert(filename);
    }
    void GenerateHeaderInfo(std::string &hgname);
+   void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname);
    void PrintGenerated() { std::cout << fGC; }
 
    std::string ReturnGenerated() { return fGC; }
diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx b/core/inc/SOFIE/RModel_GNN.hxx
similarity index 94%
rename from src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx
rename to core/inc/SOFIE/RModel_GNN.hxx
index 558f82c..93bb092 100644
--- a/src/SOFIE_core/inc/SOFIE/RModel_GNN.hxx
+++ b/core/inc/SOFIE/RModel_GNN.hxx
@@ -66,7 +66,7 @@ struct GNN_Init {
          break;
       }
       default: {
-         throw std::runtime_error("TMVA SOFIE: Invalid Update function supplied for creating GNN function block.");
+         throw std::runtime_error("SOFIE: Invalid Update function supplied for creating GNN function block.");
       }
       }
    }
@@ -88,7 +88,7 @@ struct GNN_Init {
          break;
       }
       default: {
-         throw std::runtime_error("TMVA SOFIE: Invalid Aggregate function supplied for creating GNN function block.");
+         throw std::runtime_error("SOFIE: Invalid Aggregate function supplied for creating GNN function block.");
       }
       }
    }
diff --git a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx b/core/inc/SOFIE/RModel_GraphIndependent.hxx
similarity index 96%
rename from src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx
rename to core/inc/SOFIE/RModel_GraphIndependent.hxx
index 407c645..dfade7f 100644
--- a/src/SOFIE_core/inc/SOFIE/RModel_GraphIndependent.hxx
+++ b/core/inc/SOFIE/RModel_GraphIndependent.hxx
@@ -49,7 +49,7 @@ struct GraphIndependent_Init {
       }
       default: {
          throw std::runtime_error(
-            "TMVA SOFIE: Invalid Update function supplied for creating GraphIndependent function block.");
+            "SOFIE: Invalid Update function supplied for creating GraphIndependent function block.");
       }
       }
    }
diff --git a/core/inc/SOFIE/ROperator.hxx b/core/inc/SOFIE/ROperator.hxx
new file mode 100644
index 0000000..c24fd70
--- /dev/null
+++ b/core/inc/SOFIE/ROperator.hxx
@@ -0,0 +1,133 @@
+#ifndef SOFIE_ROPERATOR
+#define SOFIE_ROPERATOR
+
+#include <vector>
+#include <set>
+#include <memory>
+
+#include "SOFIE/SOFIE_common.hxx"
+
+
+namespace SOFIE{
+
+class RModel;
+
+enum class OperatorKind {
+   GEMM = 0,
+   LAYERNORM = 1,
+   RELU = 2,
+   CONSTANT = 3,
+   CONSTANTOFSHAPE = 4,
+   UNDEFINED = 5,
+   CONV=6,
+   BATCHNORM=7,
+   CAST=8,
+   COMPARISON=9,
+   EINSUM=10,
+   ELU=11,
+   SIGMOID=12,
+   TANH=13,
+   SOFTMAX=14,
+   LEAKYRELU=15,
+   UNARY_RECIPROCAL=16,
+   UNARY_SQRT=17,
+   UNARY_NEG=18,
+   UNARY_EXP=19,
+   UNARY_LOG=20,
+   UNARY_SIN=21,
+   UNARY_COS=22,
+   UNARY_ABS=23,
+   CLIP=24,
+   NOT=25
+};
+
+inline const char* toString(OperatorKind kind) {
+   // Enumerator names indexed by enumerator value (OperatorKind is contiguous from 0 to NOT);
+   // every kind gets its own name instead of most of them being reported as "UNKNOWN".
+   static constexpr const char* kNames[] = {
+      "GEMM", "LAYERNORM", "RELU", "CONSTANT", "CONSTANTOFSHAPE", "UNDEFINED", "CONV", "BATCHNORM",
+      "CAST", "COMPARISON", "EINSUM", "ELU", "SIGMOID", "TANH", "SOFTMAX", "LEAKYRELU",
+      "UNARY_RECIPROCAL", "UNARY_SQRT", "UNARY_NEG", "UNARY_EXP", "UNARY_LOG", "UNARY_SIN",
+      "UNARY_COS", "UNARY_ABS", "CLIP", "NOT"};
+   static_assert(sizeof(kNames) / sizeof(kNames[0]) == static_cast<size_t>(OperatorKind::NOT) + 1,
+                 "kNames must list every OperatorKind enumerator in declaration order");
+   const auto idx = static_cast<size_t>(kind);
+   return idx < sizeof(kNames) / sizeof(kNames[0]) ? kNames[idx] : "UNKNOWN"; // out-of-range value
+}
+
+inline std::set<OperatorKind> FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM};
+
+class ROperator{
+   // Abstract interface of a SOFIE operator: Initialize() and Generate() are pure virtual,
+   // every other virtual hook has an empty default that subclasses override when needed.
+public:
+   virtual std::vector<std::string> GetBlasRoutines() { return {}; }
+   virtual std::vector<std::string> GetStdLibs() { return {}; }
+   virtual std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>>) { return {}; };
+   virtual std::vector<ETensorType> TypeInference(std::vector<ETensorType>) { return {}; };
+   virtual void Initialize(RModel&) = 0;
+   virtual std::string Generate(std::string OpName) = 0;  //expect unique opName for each operator within the same RModel
+   virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the same RModel
+   // generate initialization code for session constructor
+   virtual std::string GenerateInitCode() { return "";}
+   virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";};
+   // generate some specific declaration code for Session
+   virtual std::string GenerateDeclCode() { return "";}
+   // generate session data members specific to operator
+   virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; }
+   virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; }
+   virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; }
+   virtual std::string Header() { return "";}
+   virtual std::string GetFusableOutputTensorName() { return "";}
+   virtual std::string GetBlasConfig() { return ""; }
+   virtual void UpdateFusableTensorName(std::string, const std::function<void(const std::string&)>& removal_func){ return;};
+
+   // Elementwise kernel fusion interface (the defaults below mark the operator as not elementwise)
+   virtual bool IsElementwise() const { return false; }
+   // Returns the C++ expression applying this op to inputVar (a local T variable) for fused kernel generation
+   virtual std::string GetElementwiseExpr(const std::string& /*inputVar*/) const { return ""; }
+
+   //virtual void Forward_reference() = 0;
+   //virtual void Forward_blas() = 0;
+   virtual ~ROperator(){}
+
+   std::string fName = "UnnamedOperator";   ///< operator name, publicly writable and returned by Name()
+   const std::string &Name() const { return fName; }
+
+protected:
+   OperatorKind fKind = OperatorKind::UNDEFINED;   ///< kind tag returned by GetKind()
+   size_t fOpOrder = 0;                            ///< order value stored by RegisterOperatorOrder()
+   const std::string SP = "   ";    ///< space used to correctly indent the generated C++ code
+   bool fUseSession = false;        ///< flag to identify if using the session class
+   bool fIsOutputConstant = false;  ///< flag to identify if operator has a constant output (no need to generate code)
+   bool fIsOutputParamShape = false;     ///< flag to identify if the output represents a parametric shape (can be known at compile time)
+
+   mutable std::vector<std::string> fInputTensorNames;
+   mutable std::vector<std::string> fOutputTensorNames;
+
+public:
+   std::span<const std::string> GetOpInputTensors() const {
+      return fInputTensorNames;
+   }
+
+   std::span<const std::string> GetOpOutputTensors() const {
+      return fOutputTensorNames;
+   }
+
+   OperatorKind GetKind() const { return fKind; }
+   bool IsOutputConstant() const { return fIsOutputConstant; }
+
+   void RegisterOperatorOrder(const size_t ord){
+      fOpOrder = ord;
+   }
+   size_t GetOpOrder(){
+      return fOpOrder;
+   }
+
+};
+
+
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR
diff --git a/core/inc/SOFIE/ROperator_BasicBinary.hxx b/core/inc/SOFIE/ROperator_BasicBinary.hxx
new file mode 100644
index 0000000..9a1a963
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_BasicBinary.hxx
@@ -0,0 +1,589 @@
+#ifndef SOFIE_SOFIE_ROperator_BasicBinary
+#define SOFIE_SOFIE_ROperator_BasicBinary
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+namespace SOFIE {
+
+/// Identifiers of the element-wise binary operators implemented by ROperator_BasicBinary.
+enum EBasicBinaryOperator {
+   Add,
+   Sub,
+   Mul,
+   Div,
+   Pow,
+   Mod,
+   FMod
+};
+
+/// Compile-time description of a binary operator. Every specialization provides:
+///  - Name(): the operator name (used in messages and in generated identifiers),
+///  - Op(t1, t2): the C++ source text of the operation applied to the two operand expressions,
+///  - Func(t1, t2): the operation evaluated directly on two values of type T.
+/// The primary template is empty, so an operator without a specialization has none of these members.
+template <typename T, EBasicBinaryOperator Op1>
+struct BinaryOperatorTrait {};
+
+/// Addition: "t1 + t2".
+template <typename T>
+struct BinaryOperatorTrait<T, Add> {
+   static std::string Name() { return "Add"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return t1 + " + " + t2; }
+   static T Func(T t1, T t2) { return t1 + t2; }
+};
+
+/// Subtraction: "t1 - t2".
+template <typename T>
+struct BinaryOperatorTrait<T, Sub> {
+   static std::string Name() { return "Sub"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return t1 + " - " + t2; }
+   static T Func(T t1, T t2) { return t1 - t2; }
+};
+
+/// Multiplication: "t1 * t2".
+template <typename T>
+struct BinaryOperatorTrait<T, Mul> {
+   static std::string Name() { return "Mul"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return t1 + " * " + t2; }
+   static T Func(T t1, T t2) { return t1 * t2; }
+};
+
+/// Division: "t1 / t2" (integer division when T is an integral type).
+template <typename T>
+struct BinaryOperatorTrait<T, Div> {
+   static std::string Name() { return "Div"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return t1 + " / " + t2; }
+   static T Func(T t1, T t2) { return t1 / t2; }
+};
+
+/// Power: "std::pow(t1,t2)".
+template <typename T>
+struct BinaryOperatorTrait<T, Pow> {
+   static std::string Name() { return "Pow"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return "std::pow(" + t1 + "," + t2 + ")"; }
+   static T Func(T t1, T t2) { return std::pow(t1, t2); }
+};
+
+/// Integer remainder: "(t1 % t2)". Func uses operator %, so it is only valid for integral T.
+template <typename T>
+struct BinaryOperatorTrait<T, Mod> {
+   static std::string Name() { return "Mod"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return "(" + t1 + " % " + t2 + ")"; }
+   static T Func(T t1, T t2) { return t1 % t2; }
+};
+
+/// Floating-point remainder: "std::fmod(t1,t2)".
+template <typename T>
+struct BinaryOperatorTrait<T, FMod> {
+   static std::string Name() { return "FMod"; }
+   static std::string Op(const std::string &t1, const std::string &t2) { return "std::fmod(" + t1 + "," + t2 + ")"; }
+   static T Func(T t1, T t2) { return std::fmod(t1, t2); }
+};
+
+template <typename T, EBasicBinaryOperator Op>
+class ROperator_BasicBinary final : public ROperator {
+private:
+   int fBroadcastFlag = 0;
+   std::string fNA;
+   std::string fNB;
+   std::string fNBroadcastedA;
+   std::string fNBroadcastedB;
+   std::string fNY;
+
+   std::vector<size_t> fShapeA;
+   std::vector<size_t> fShapeB;
+   std::vector<size_t> fShapeY;
+
+   std::vector<Dim> fDimShapeA;
+   std::vector<Dim> fDimShapeB;
+   std::vector<Dim> fDimShapeY;
+
+public:
+   /// Default constructor: all tensor names and shapes are left empty.
+   ROperator_BasicBinary() = default;
+   /// Builds the operator from the names of its two inputs and of its output.
+   /// Each name is stored after passing through UTILITY::Clean_name, and the cleaned names are
+   /// recorded in the input/output tensor-name lists of the base class.
+   ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY)
+      : fNA(UTILITY::Clean_name(nameA)),
+        fNB(UTILITY::Clean_name(nameB)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames = std::vector<std::string>{fNA, fNB};
+      fOutputTensorNames = std::vector<std::string>{fNY};
+   }
+
+   /// Type of the output given the input types: the input vector is returned unchanged.
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override
+   {
+      return input;
+   }
+
+   /// Shape of the output tensor given the shapes of the input tensors.
+   /// No broadcasting is applied here: the inputs are assumed to have the same shape and the
+   /// single output shape is a copy of the first input shape.
+   /// \param input shapes of the input tensors; must contain at least one shape
+   /// \return a vector of size 1 holding input[0]
+   /// \throws std::runtime_error if input is empty (previously this read input[0] out of bounds)
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override
+   {
+      if (input.empty()) {
+         throw std::runtime_error("SOFIE Binary Op ShapeInference needs at least one input shape");
+      }
+      // assume now inputs have same shape (no broadcasting)
+      return std::vector<std::vector<size_t>>(1, input.front());
+   }
+
+   /// Resolves the input shapes, computes the broadcast output shape and registers the output tensor in the model.
+   ///
+   /// Depending on the inputs one of four things happens:
+   ///  1. both inputs have known shapes and are constant tensors: they are broadcast if needed, the operation is
+   ///     evaluated here with BinaryOperatorTrait::Func, the result is added as a constant tensor, the inputs
+   ///     actually used are flagged as not writable and fIsOutputConstant is set;
+   ///  2. known shapes of rank <= 1, type of A is INT64 and at least one input is a shape tensor (the other being
+   ///     a shape tensor or an initialized tensor): the result is computed as a shape tensor of Dim values
+   ///     (parametric values are combined textually with BinaryOperatorTrait::Op) and fIsOutputConstant is set;
+   ///  3. any other case with known shapes: an intermediate tensor of shape fShapeY is added;
+   ///  4. at least one input has a dynamic shape: the Dim shapes are broadcast and an intermediate tensor of
+   ///     shape fDimShapeY is added.
+   ///
+   /// \param model model that owns the tensors; the output tensor (and broadcast copies of constants) are added to it
+   /// \throws std::runtime_error if one of the two input tensors is not found in the model
+   void Initialize(RModel &model) override
+   {
+      // input must be a graph input, or already initialized intermediate tensor
+      if (!model.CheckIfTensorAlreadyExist(fNA)) {
+         throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNA + " is not found in model");
+      }
+      if (!model.CheckIfTensorAlreadyExist(fNB)) {
+         throw std::runtime_error(std::string("SOFIE Binary Op Input Tensor ") + fNB + " is not found in model");
+      }
+      // bit 0 (value 1): A is dynamic, bit 1 (value 2): B is dynamic
+      int dynamicInputs = 0;
+      if (model.IsDynamicTensor(fNA)) {
+         fDimShapeA = model.GetDynamicTensorShape(fNA);
+         dynamicInputs |= 1;
+      } else {
+         fShapeA = model.GetTensorShape(fNA);
+         fDimShapeA = ConvertShapeToDim(fShapeA);
+      }
+      if (model.IsDynamicTensor(fNB)) {
+         dynamicInputs |= 2;
+         fDimShapeB = model.GetDynamicTensorShape(fNB);
+      } else {
+         fShapeB = model.GetTensorShape(fNB);
+         fDimShapeB = ConvertShapeToDim(fShapeB);
+      }
+      if (dynamicInputs & 1 && model.Verbose())
+         std::cout << BinaryOperatorTrait<T, Op>::Name() << " : input " << fNA << " is dynamic "
+                   << ConvertDimShapeToString(fDimShapeA) << std::endl;
+      if (dynamicInputs & 2 && model.Verbose())
+         std::cout << BinaryOperatorTrait<T, Op>::Name() << " : input " << fNB << " is dynamic "
+                   << ConvertDimShapeToString(fDimShapeB) << std::endl;
+
+      // check if need to broadcast at initialization time if shapes are known and different
+      // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done)
+      // case of known shapes
+      // if shapes are known find the output shape from broadcasting
+      if (dynamicInputs == 0) {
+         auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB);
+         fBroadcastFlag = ret.first;
+         fShapeY = ret.second;
+         auto  lengthY = ConvertShapeToLength(fShapeY);
+         if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) {
+            bool broadcast = fBroadcastFlag > 0;
+            if (broadcast) {
+               // Y is the common shape of A and B
+               bool broadcastA = fBroadcastFlag & 2;
+               bool broadcastB = fBroadcastFlag & 1;
+               // Broadcast A to Y
+               if (broadcastA) {
+                  fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY;
+                  auto data = model.GetInitializedTensorData(fNA);
+                  std::shared_ptr<void> broadcastedData(
+                     UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeA, fShapeY),
+                     std::default_delete<T[]>());
+                  if (model.Verbose())
+                     std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : "
+                               << ConvertValuesToString(ConvertShapeToLength(fShapeY),
+                                                        static_cast<T *>(broadcastedData.get()))
+                               << std::endl;
+                  // Update the data and the shape of A
+                  model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData);
+                  fShapeA = fShapeY;
+                  fDimShapeA = ConvertShapeToDim(fShapeA);
+               }
+               // Broadcast B to Y
+               if (broadcastB) {
+                  fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY;
+                  auto data = model.GetInitializedTensorData(fNB);
+                  if (model.Verbose())
+                     std::cout << "data B " << ConvertShapeToString(fShapeB) << " : "
+                               << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast<T *>(data.get()))
+                               << std::endl;
+                  std::shared_ptr<void> broadcastedData(
+                     UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeB, fShapeY),
+                     std::default_delete<T[]>());
+                  // do not update tensor B but add broadcasted one (since it can be input to some other operators)
+                  if (model.Verbose())
+                     std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : "
+                               << ConvertValuesToString(ConvertShapeToLength(fShapeY),
+                                                        static_cast<T *>(broadcastedData.get()))
+                               << std::endl;
+                  model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData);
+                  fShapeB = fShapeY;
+                  fDimShapeB = ConvertShapeToDim(fShapeB);
+               }
+            } else {
+               fShapeY = fShapeA;
+            }
+            // tensors are constant: perform here the binary operation
+
+            // use the broadcast copy of an input when one was created above, otherwise the input itself
+            const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA;
+            const std::string &nameB = fNBroadcastedB.empty() ? fNB : fNBroadcastedB;
+            auto dataA = static_cast<T *>(model.GetInitializedTensorData(nameA).get());
+            auto dataB = static_cast<T *>(model.GetInitializedTensorData(nameB).get());
+            std::vector<T> dataY(lengthY);
+            for (size_t i = 0; i < dataY.size(); i++) {
+               dataY[i] = BinaryOperatorTrait<T, Op>::Func(dataA[i], dataB[i]);
+            }
+            model.AddConstantTensor<T>(fNY, fShapeY, dataY.data());
+            // flag tensors to not be written in the generated code or weight file
+            model.SetNotWritableInitializedTensor(nameA);
+            model.SetNotWritableInitializedTensor(nameB);
+            fIsOutputConstant = true;
+            if (model.Verbose()) {
+               std::cout << BinaryOperatorTrait<T, Op>::Name() << " : " << fNA << "  " << ConvertShapeToString(fShapeA)
+                         << " , " << fNB << "  " << ConvertShapeToString(fShapeB) << " ---> " << fNY << "  "
+                         << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl;
+            }
+         } else if (((model.IsShapeTensor(fNA) && model.IsShapeTensor(fNB)) ||
+                    (model.IsShapeTensor(fNA) && model.IsInitializedTensor(fNB)) ||
+                    (model.IsShapeTensor(fNB) && model.IsInitializedTensor(fNA)))
+                     && (fShapeA.size() <=1 && fShapeB.size() <=1 &&  model.GetTensorType(fNA) == ETensorType::INT64)) {
+            // case of shape tensors ( tensors are of rank 0 or 1  )
+            std::vector<Dim> dimValA;
+            std::vector<Dim> dimValB;
+            if (model.IsShapeTensor(fNA))
+               dimValA = model.GetShapeTensorValues(fNA);
+            if (model.IsShapeTensor(fNB))
+               dimValB = model.GetShapeTensorValues(fNB);
+            // adjust for broadcasting - repeat values until it reaches shapes of Y
+            if (!fShapeY.empty() && fShapeY[0] > 1) {
+               if (dimValA.size() == 1) dimValA = std::vector<Dim>( fShapeY[0], dimValA[0]);
+               if (dimValB.size() == 1) dimValB = std::vector<Dim>( fShapeY[0], dimValB[0]);
+            }
+
+            // read the int64 data of an initialized tensor into lengthY Dim values (a scalar is repeated)
+            auto convertDataToDim = [&](const std::string & name, const std::vector<size_t> & shape, std::vector<Dim> & dimValues) {
+               auto data = static_cast<int64_t *>(model.GetInitializedTensorData(name).get());
+               dimValues.resize(lengthY);
+               for (size_t i = 0; i < lengthY; i++) {
+                  if (!shape.empty() && lengthY == shape[0])
+                     dimValues[i] = Dim{ static_cast<size_t>(data[i])};
+                  else // case dataA is a scalar
+                     dimValues[i] = Dim{ static_cast<size_t>(data[0])};
+               }
+            };
+            if (model.IsInitializedTensor(fNA)) {
+               convertDataToDim(fNA,fShapeA,dimValA);
+            } else if (model.IsInitializedTensor(fNB)) {
+               convertDataToDim(fNB,fShapeB,dimValB);
+            }
+
+            //perform binary operations on shape tensors
+            std::vector<Dim> dimValY(lengthY);
+            for (size_t i = 0; i < lengthY; i++) {
+               if (!dimValA[i].isParam && !dimValB[i].isParam) {
+                  // both values are numeric: evaluate the operation now
+                  size_t d = BinaryOperatorTrait<size_t, Op>::Func(dimValA[i].dim, dimValB[i].dim);
+                  dimValY[i] = Dim{d};
+               } else {
+                  // at least one value is parametric: keep the operation as an expression string
+                  auto res =  BinaryOperatorTrait<T, Op>::Op(dimValA[i].GetVal(), dimValB[i].GetVal());
+                  dimValY[i] = Dim{res, static_cast<size_t>(-1)};
+               }
+            }
+            model.AddShapeTensor(fNY,dimValY, fShapeY.empty()); // cannot be a  scalar
+            if (model.Verbose()) {
+               std::cout << BinaryOperatorTrait<T, Op>::Name() << " : " << fNA << "  " << ConvertShapeToString(fShapeA)
+                         << " , " << fNB << "  " << ConvertShapeToString(fShapeB) << " ---> " << fNY << "  "
+                         << ConvertShapeToString(fShapeY) << " : " << ConvertDimShapeToString(dimValY) << " (shape)" <<  std::endl;
+            }
+            // no code needs to be generated (flag this as a constant output tensor)
+            fIsOutputConstant = true;
+
+         } else {
+            // case of defined and non-constant tensors
+            model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY);
+            if (model.Verbose()) {
+               std::cout << BinaryOperatorTrait<T, Op>::Name() << " : " << fNA << "  " << ConvertShapeToString(fShapeA)
+                         << " , " << fNB << "  " << ConvertShapeToString(fShapeB) << " ---> " << fNY << "  "
+                         << ConvertShapeToString(fShapeY) << std::endl;
+            }
+            // we convert non-dim shapes to Dim shapes
+            fDimShapeY = ConvertShapeToDim(fShapeY);
+         }
+      } else {
+         // case A or B have dynamic shapes. We need to broadcast if shape are not same
+         auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB);
+         fBroadcastFlag = ret.first;
+         fDimShapeY = ret.second;
+         // case of all parametric shapes and MultiDirectionalBroadcastShape  return the max of the 2
+         // need to do before we declare the output tensor shape and the broadcasted ones
+         if (ret.first & 4) {
+            // check if one of the parameter is an input dimension
+            // define function to find this
+            auto IsInputDimParam = [&](const std::string &p) {
+               auto inputNames = model.GetInputTensorNames();
+               for (auto &input : inputNames) {
+                  for (auto &i_s : model.GetDimTensorShape(input)) {
+                     if (i_s.isParam && i_s.param == p)
+                        return true;
+                  }
+               }
+               return false;
+            };
+            for (size_t i = 0; i < fDimShapeY.size(); i++) {
+               auto &s = fDimShapeY[i];
+               if (s.isParam && s.param.find("std::max") != std::string::npos) {
+                  if (IsInputDimParam(fDimShapeA[i].param)) {
+                     // case dim is 1 we indicate that the input parameter is equal to 1
+                     if (fDimShapeA[i].dim != 1)
+                        s = fDimShapeA[i];
+                     else
+                        s = fDimShapeB[i];
+                  } else if (IsInputDimParam(fDimShapeB[i].param)) {
+                     if (fDimShapeB[i].dim != 1)
+                        s = fDimShapeB[i];
+                     else
+                        s = fDimShapeA[i];
+                  }
+               }
+            }
+         }
+
+         model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY);
+         if (model.Verbose()) {
+            std::cout << BinaryOperatorTrait<T, Op>::Name() << " : " << ConvertDimShapeToString(fDimShapeA) << " , "
+                      << ConvertDimShapeToString(fDimShapeB) << " --> " << ConvertDimShapeToString(fDimShapeY) << std::endl;
+         }
+      }
+   }
+
+   /// This operator emits no initialization code: the returned string is always empty.
+   std::string GenerateInitCode() override
+   {
+      return std::string();
+   }
+
+   /// Generates the C++ code that evaluates the operator element by element.
+   ///
+   /// The emitted code consists of:
+   ///  - a header comment with the operator name and the output shape;
+   ///  - when bit 4 of fBroadcastFlag is set, run-time checks that throw std::runtime_error if the
+   ///    parametric dimensions of A and B cannot be broadcast;
+   ///  - one nested for-loop per output dimension that is not 1, around a single assignment
+   ///    tensor_Y[idxY] = Op(tensor_A[idxA], tensor_B[idxB]); dimensions equal to 1 are left out of the
+   ///    index expressions, which is how broadcasting is realised.
+   ///
+   /// \param opName operator name; it is prefixed with "op_" and used in the header comment and error messages
+   /// \return the generated code, or an empty string if the output is constant (nothing to compute at run time)
+   std::string Generate(std::string opName) override
+   {
+
+      if (fIsOutputConstant)
+         return "";
+
+      opName = "op_" + opName;
+
+      std::stringstream out;
+      out << SP << "\n//------ " << opName << "  " << BinaryOperatorTrait<T, Op>::Name() << " --> "
+          << ConvertDimShapeToString(fDimShapeY) << "\n";
+
+      // we need to check if we can broadcast (case flag has bit 4 set)
+
+      if (fBroadcastFlag & 4) {
+         // need to check if shapes are the same
+         auto lengthA = ConvertDimShapeToLength(fDimShapeA);
+         auto lengthB = ConvertDimShapeToLength(fDimShapeB);
+         out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n";
+         // check if A->B or B->A
+         // bool broadcastable = true;
+         // NOTE(review): fDimShapeA/fDimShapeB are indexed with the index of fDimShapeY, i.e. all three
+         // shapes are assumed to have the same rank here - confirm.
+         for (size_t i = 0; i < fDimShapeY.size(); i++) {
+            if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 &&
+                fDimShapeB[i].isParam) {
+               // B->A B[i] needs to be 1
+               out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n";
+               out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator "
+                   << opName << "\");\n";
+            }
+            // NOTE(review): the else-if below is attached to this second if only, so it can also fire
+            // after the B->A check above was emitted - confirm this is intended.
+            if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 &&
+                fDimShapeA[i].isParam) {
+               // A-> B A[i] needs to be 1
+               out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n";
+               out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator "
+                   << opName << "\");\n";
+            } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) {
+               // both shapes are parametric and we broadcast to maximum
+               // we allocate here output vector
+               // NOTE(review): once A != B holds, (A != 1 || B != 1) is always true, so the emitted check
+               // throws whenever the two dimensions differ. Left unchanged on purpose: the index
+               // expressions built below do not special-case a parametric dimension that is 1 only at
+               // run time - confirm before relaxing this to "&&".
+               out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i]
+                   << " != 1 || " << fDimShapeB[i] << " != 1))\n";
+               out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in operator " << opName
+                   << "\");\n";
+            }
+         }
+         out << SP << "}\n";
+      }
+
+      auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA);
+      auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB);
+      auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY);
+
+      // Flat index expressions for A, B and Y. A dimension equal to 1 contributes nothing; the loop
+      // variable of an input dimension is the one of the output dimension it is right-aligned with.
+      std::string compute_idx_A, compute_idx_B, compute_idx_Y;
+      if (fDimShapeA.empty() ||
+          std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) {
+         compute_idx_A = "0";
+      } else {
+         for (size_t i = 0; i < fDimShapeA.size(); ++i) {
+            if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1")
+               continue;
+            compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size()));
+            if (stridesA[i].GetVal() != "1")
+               compute_idx_A += " * " + stridesA[i].GetVal();
+            compute_idx_A += " + ";
+         }
+         // remove last 3 character " + "
+         for (int j = 0; j < 3; j++)
+            compute_idx_A.pop_back();
+      }
+      if (fDimShapeB.empty() ||
+          std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) {
+         compute_idx_B = "0";
+      } else {
+         for (size_t i = 0; i < fDimShapeB.size(); ++i) {
+            if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1")
+               continue;
+            compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size()));
+            if (stridesB[i].GetVal() != "1")
+               compute_idx_B += " * " + stridesB[i].GetVal();
+            compute_idx_B += " + ";
+         }
+          // remove last 3 character " + "
+         for (int j = 0; j < 3; j++)
+            compute_idx_B.pop_back();
+      }
+      // number of for-loops opened so far; it also drives the indentation of the emitted lines
+      int nloop = 0;
+      if (fDimShapeY.empty() ||
+          std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) {
+         compute_idx_Y = "0";
+      } else {
+         for (size_t i = 0; i < fDimShapeY.size(); ++i) {
+            if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") {
+               nloop++;
+               for (int j = 0; j < nloop; j++) out << SP;
+               out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i]
+                   << "; ++idx_" << i << "){\n";
+               compute_idx_Y += "idx_" + std::to_string(i);
+               if (stridesY[i].GetVal() != "1")
+                  compute_idx_Y += " * " + stridesY[i].GetVal();
+               compute_idx_Y += " + ";
+            }
+         }
+         // remove last 3 characters " + "
+         for (int j = 0; j < 3; j++)
+            compute_idx_Y.pop_back();
+      }
+      // the single assignment executed in the innermost loop
+      for (int j = 0; j < nloop + 1; j++) out << SP;
+      out << "tensor_" << fNY << "[" << compute_idx_Y << "] = "
+          << BinaryOperatorTrait<T, Op>::Op("tensor_" + fNA + "[" + compute_idx_A + "]",
+                                            "tensor_" + fNB + "[" + compute_idx_B + "]")
+          << " ;\n";
+
+      // close the loops, innermost first
+      for (int i = nloop; i > 0; i--) {
+         for (int j = 0; j < i; j++) out << SP;
+         out << "}\n";
+      }
+      return out.str();
+   }
+
+   /// Generates the source of the alpaka kernel functor "Binary<opName><OperatorName>Kernel".
+   /// The emitted operator() reads one global thread index idx and, if idx is below the number of
+   /// output elements, writes C[idx] = Op(A[indexA], B[indexB]).
+   /// Only the static shapes fShapeA, fShapeB and fShapeY are used here.
+   /// NOTE(review): the general broadcast case indexes the output strides with the input's dimension
+   /// index, i.e. it assumes the input shape has the same rank as fShapeY - confirm.
+   /// \param opName operator name used to build the kernel struct name
+   /// \return the kernel source, or an empty string if the output is constant
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) {
+      if (fIsOutputConstant)
+         return "";
+
+      const std::string opLabel = BinaryOperatorTrait<T, Op>::Name();
+
+      // kernel header up to the bounds check on the thread index
+      std::string kernel = "\n//------ " + opName + "_" + opLabel + "_KERNEL_ALPAKA\n";
+      kernel += SP + "struct Binary" + opName + opLabel + "Kernel {\n";
+      kernel += SP + SP + "template<typename TAcc, typename T>\n";
+      kernel += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C) const {\n";
+      kernel += SP + SP + SP + "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      kernel += SP + SP + SP + "if (idx < " + std::to_string(ConvertShapeToLength(fShapeY)) + ") {\n";
+
+      // input strides, with the stride of every dimension of size 1 set to zero (broadcast dimension)
+      auto stridesA = UTILITY::ComputeStrideFromShape(fShapeA);
+      auto stridesB = UTILITY::ComputeStrideFromShape(fShapeB);
+      for (size_t d = 0; d < stridesA.size(); ++d) {
+         if (fShapeA[d] == 1)
+            stridesA[d] = 0;
+      }
+      for (size_t d = 0; d < stridesB.size(); ++d) {
+         if (fShapeB[d] == 1)
+            stridesB[d] = 0;
+      }
+      auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY);
+
+      // Builds the expression giving the flat index into one input for the output index "idx".
+      auto makeIndex = [&](const std::vector<size_t> &shape, const auto &strides) -> std::string {
+         // all strides zero: the input is a single broadcast value
+         if (std::all_of(strides.begin(), strides.end(), [](const auto &s) { return s == 0; }))
+            return "0";
+         // same shape as the output: direct index
+         if (shape == fShapeY)
+            return "idx";
+         // general broadcast case: decompose idx into per-dimension coordinates
+         std::string expr;
+         std::string rest = "idx";
+         for (size_t d = 0; d < shape.size(); ++d) {
+            const std::string strideOut = std::to_string(stridesY[d]);
+            const std::string coord = "(int)(" + rest + " / " + strideOut + ")";
+            expr += coord + " * " + std::to_string(strides[d]) + " + ";
+            rest += " - (" + coord + " * " + strideOut + ")";
+         }
+         // drop the trailing " + "
+         if (!expr.empty())
+            expr.erase(expr.size() - 3);
+         return expr;
+      };
+
+      const std::string indexA = makeIndex(fShapeA, stridesA);
+      const std::string indexB = makeIndex(fShapeB, stridesB);
+
+      kernel += "C[idx] = " + BinaryOperatorTrait<T, Op>::Op("A[" + indexA + "]", "B[" + indexB + "]") + ";\n";
+      kernel += "}\n}\n};\n";
+      return kernel;
+   }
+
+   /// Generates the declaration of the kernel object, "Binary<OpName><OperatorName>Kernel binary<OpName>Kernel;".
+   /// \return the declaration line, or an empty string if the output is constant
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) {
+      if (fIsOutputConstant)
+         return "";
+
+      std::string decl(SP);
+      decl += "Binary";
+      decl += OpName;
+      decl += BinaryOperatorTrait<T, Op>::Name();
+      decl += "Kernel binary";
+      decl += OpName;
+      decl += "Kernel;\n";
+      return decl;
+   }
+
+   /// Generates the host code that launches the kernel: it defines the work division for the
+   /// output length, creates the kernel task on the device buffers of A, B and Y and enqueues it.
+   /// \param OpName operator name used for the kernel object and the task variable
+   /// \return the launch code, or an empty string if the output is constant
+   /// \throws std::runtime_error if fDimShapeY is empty (operator not initialized)
+   std::string Generate_GPU_ALPAKA(std::string OpName) {
+      if (fIsOutputConstant)
+         return "";
+
+      if (fDimShapeY.empty()) {
+         throw std::runtime_error("SOFIE Operator Basic Binary called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fDimShapeY);
+
+      // names of the generated variables, all suffixed with the output tensor name
+      const std::string perThread = "elementsPerThread_" + fNY;
+      const std::string perGrid = "elementsPerGrid_" + fNY;
+      const std::string workDiv = "workDiv_" + fNY;
+      const std::string task = "task_" + OpName;
+
+      out << "\n//------ " << OpName << "_ALPAKA\n";
+      out << SP << "auto const " << perThread << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const " << perGrid << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const " << workDiv << " = sofie_workdiv(" << perGrid << ");\n";
+      out << SP << "auto " << task << " = alpaka::createTaskKernel<Acc>(" << workDiv;
+      out << ", binary" << OpName << "Kernel";
+      out << ", alpaka::getPtrNative(deviceBuf_" << fNA << ")";
+      out << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")";
+      out << ", alpaka::getPtrNative(deviceBuf_" << fNY << "));\n";
+      out << SP << "alpaka::enqueue(queue, " << task << ");\n";
+      return out.str();
+   }
+
+   /// Standard library headers needed by the code generated for this operator.
+   /// Pow emits std::pow and FMod emits std::fmod, both declared in <cmath>; the other operators
+   /// only emit built-in arithmetic and need no header.
+   std::vector<std::string> GetStdLibs() override
+   {
+      if (Op == EBasicBinaryOperator::Pow || Op == EBasicBinaryOperator::FMod) {
+         return {std::string("cmath")};
+      }
+      return {};
+   }
+
+   
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROperator_BasicBinary
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx b/core/inc/SOFIE/ROperator_BasicNary.hxx
similarity index 85%
rename from src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx
rename to core/inc/SOFIE/ROperator_BasicNary.hxx
index cbe0497..928ab1c 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicNary.hxx
+++ b/core/inc/SOFIE/ROperator_BasicNary.hxx
@@ -81,10 +81,10 @@ private:
 
    std::vector<std::string> fNInputs;
    std::string fNY;
-   std::vector<std::vector<size_t>> fShapeInputs;
+   std::vector<std::vector<Dim>> fShapeInputs;
 
    std::vector<std::string> fNBroadcastedInputs;
-   std::vector<size_t> fShapeY;
+   std::vector<Dim> fShapeY;
 
    bool fBroadcast = false;
 
@@ -119,18 +119,24 @@ public:
    void Initialize(RModel& model) override {
       for (auto &it : fNInputs) {
          if (!model.CheckIfTensorAlreadyExist(it)) {
-            throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model");
+            throw std::runtime_error("SOFIE BasicNary Op Input Tensor " + it + " is not found in model");
          }
-         fShapeInputs.push_back(model.GetTensorShape(it));
+         fShapeInputs.push_back(model.GetDimTensorShape(it));
+      }
+      // Find the common output shape by pairwise multidirectional broadcast
+      fShapeY = fShapeInputs[0];
+      for (size_t i = 1; i < fShapeInputs.size(); i++) {
+         auto shapeA = fShapeY;
+         auto shapeB = fShapeInputs[i];
+         auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB);
+         fShapeY = ret.second;
       }
-      // Find the common shape of the input tensors
-      fShapeY = UTILITY::MultidirectionalBroadcastShape(fShapeInputs);
       model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY);
       // Broadcasting
       size_t N = fNInputs.size();
       fNBroadcastedInputs.reserve(N);
       for (size_t i = 0; i < N; i++) {
-         if (!UTILITY::AreSameShape(model.GetTensorShape(fNInputs[i]), fShapeY)) {
+         if (!UTILITY::AreSameShape(fShapeInputs[i], fShapeY)) {
             fBroadcast = true;
             std::string name = "Broadcasted"  + fNInputs[i];
             model.AddIntermediateTensor(name, model.GetTensorType(fNInputs[0]), fShapeY);
@@ -145,18 +151,18 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE BasicNary called to Generate without being initialized first");
       }
       std::stringstream out;
-      size_t length = ConvertShapeToLength(fShapeY);
+      std::string length = ConvertDimShapeToLength(fShapeY);
       out << SP << "\n//------ BasicNary operator\n";
       if (fBroadcast) {
          for (size_t i = 0; i < fNInputs.size(); i++) {
             if (fNBroadcastedInputs[i] != fNInputs[i]) {
-               out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertShapeToString(fShapeY) << "\n";
+               out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertDimShapeToString(fShapeY) << "\n";
                out << SP << SP << "{\n";
-               out << SP << SP << SP << fType << "* data = SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertShapeToString(fShapeInputs[i]);
-               out << ", " << ConvertShapeToString(fShapeY) << ");\n";
+               out << SP << SP << SP << fType << "* data = SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertDimShapeToString(fShapeInputs[i]);
+               out << ", " << ConvertDimShapeToString(fShapeY) << ");\n";
                out << SP << SP << SP << "std::copy(data, data + " << length << ", " << fNBroadcastedInputs[i] << ");\n";
                out << SP << SP << SP << "delete[] data;\n";
                out << SP << SP << "}\n";
diff --git a/core/inc/SOFIE/ROperator_BasicUnary.hxx b/core/inc/SOFIE/ROperator_BasicUnary.hxx
new file mode 100644
index 0000000..dfe6714
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_BasicUnary.hxx
@@ -0,0 +1,207 @@
+#ifndef SOFIE_ROPERATOR_BASIC_UNARY
+#define SOFIE_ROPERATOR_BASIC_UNARY
+
+#include <SOFIE/ROperator.hxx>
+#include <SOFIE/RModel.hxx>
+#include <SOFIE/SOFIE_common.hxx>
+
+
+namespace SOFIE {
+
+enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs, kSoftplus, kAtan, kFloor };
+
+// Code-generation traits: Name() labels the operator in generated comments/identifiers, Op(X) returns the C++ expression text applied to operand text X.
+template <typename T, EBasicUnaryOperator Op>
+struct UnaryOpTraits {}; // deliberately empty: each supported operator provides a specialization below
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kReciprocal> {
+   static std::string Name() { return "Reciprocal"; }
+   static std::string Op(const std::string &X) { return "1/(" + X + ")"; } // parenthesised so a compound operand such as "a+b" is divided as a whole
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kSqrt> {
+   static std::string Name() { return "Sqrt"; }
+   static std::string Op(const std::string &X) { return "std::sqrt(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kNeg> {
+   static std::string Name() { return "Neg"; }
+   static std::string Op(const std::string &X) { return "-(" + X + ")"; } // parenthesised so negating an operand that starts with '-' never forms the token "--"
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kExp> {
+   static std::string Name() { return "Exp"; }
+   static std::string Op(const std::string &X) { return "std::exp(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kLog> {
+   static std::string Name() { return "Log"; }
+   static std::string Op(const std::string &X) { return "std::log(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kSin> {
+   static std::string Name() { return "Sin"; }
+   static std::string Op(const std::string &X) { return "std::sin(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kCos> {
+   static std::string Name() { return "Cos"; }
+   static std::string Op(const std::string &X) { return "std::cos(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kAbs> {
+   static std::string Name() { return "Abs"; }
+   static std::string Op(const std::string &X) { return "std::abs(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kSoftplus> {
+   static std::string Name() { return "Softplus"; }
+   static std::string Op(const std::string &X) { return "std::log(std::exp(" + X + ") + 1)"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kAtan> {
+   static std::string Name() { return "Atan"; }
+   static std::string Op(const std::string &X) { return "std::atan(" + X + ")"; }
+};
+
+template <typename T>
+struct UnaryOpTraits<T, EBasicUnaryOperator::kFloor> {
+   static std::string Name() { return "Floor"; }
+   static std::string Op(const std::string &X) { return "std::floor(" + X + ")"; }
+};
+
+template <typename T, EBasicUnaryOperator Op>
+class ROperator_BasicUnary final : public ROperator {
+private:
+   std::string fNX;
+   std::string fNY;
+
+   std::vector<Dim> fShapeX;
+   std::vector<Dim> fShapeY;
+
+public:
+   ROperator_BasicUnary() {}
+
+   ROperator_BasicUnary(std::string nameX, std::string nameY) // stores the sanitised tensor names and tags the operator kind
+      : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      switch (Op) { // Op is a template argument, so every instance of one instantiation takes the same branch
+         case EBasicUnaryOperator::kReciprocal:
+            fKind = OperatorKind::UNARY_RECIPROCAL;
+            break;
+         case EBasicUnaryOperator::kSqrt:
+            fKind = OperatorKind::UNARY_SQRT;
+            break;
+         case EBasicUnaryOperator::kNeg:
+            fKind = OperatorKind::UNARY_NEG;
+            break;
+         case EBasicUnaryOperator::kExp:
+            fKind = OperatorKind::UNARY_EXP;
+            break;
+         case EBasicUnaryOperator::kLog:
+            fKind = OperatorKind::UNARY_LOG;
+            break;
+         case EBasicUnaryOperator::kSin:
+            fKind = OperatorKind::UNARY_SIN;
+            break;
+         case EBasicUnaryOperator::kCos:
+            fKind = OperatorKind::UNARY_COS;
+            break;
+         case EBasicUnaryOperator::kAbs:
+            fKind = OperatorKind::UNARY_ABS;
+            break;
+         default: break; // kSoftplus, kAtan, kFloor: no OperatorKind is assigned here, fKind keeps its prior value (NOTE(review): add kinds if they exist)
+      }
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override { return input; }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
+
+   void Initialize(RModel& model) override { // checks the input tensor exists, then registers the output with the same type and shape
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         throw std::runtime_error("SOFIE " + UnaryOpTraits<T, Op>::Name() + " op Input Tensor " + fNX + " is not found in model");
+      }
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeY = fShapeX; // element-wise operator: the output shape is a copy of the input shape
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+   }
+
+   // Returns the CPU source for this operator: a flat loop writing Op(tensor_<fNX>[i]) into tensor_<fNY>[i].
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      std::stringstream out;
+      // The space after "Operator" keeps the generated comment readable ("Operator Sqrt op_3", not "OperatorSqrt op_3").
+      out << SP << "\n//---- Operator " << UnaryOpTraits<T, Op>::Name() << " " << OpName << "\n";
+      std::string length = ConvertDimShapeToLength(fShapeX);
+      out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n";
+      out << SP << SP << "tensor_" << fNY << "[i] = " << UnaryOpTraits<T, Op>::Op("tensor_" + fNX + "[i]") << ";\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*OpName*/) override { // returns the source of the kernel functor struct Unary<Name>Kernel
+      if (fIsOutputConstant)
+         return ""; // constant outputs get no kernel
+      // The struct name depends only on the operator type, not on the operator index (OpName is unused).
+      std::string op;
+      op = "\n//------ " + UnaryOpTraits<T, Op>::Name() + "_KERNEL_ALPAKA\n";
+      op += SP + "struct Unary" + UnaryOpTraits<T, Op>::Name() + "Kernel{\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * data, T * output, std::size_t const length) const {\n";
+      op += SP + SP + SP + "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (idx < length) {\n"; // emitted bounds check: threads with idx >= length write nothing
+      op += SP + SP + SP + "output[idx] = " +UnaryOpTraits<T, Op>::Op("data[idx]") + ";\n";
+      op += SP + SP + "}\n"; // closes the emitted if
+      op += SP + "}\n};\n"; // closes the emitted operator() and then the struct
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*OpName*/) override { // declares the functor instance <Name>Kernel; nothing for constant outputs, whose struct is never emitted
+      return fIsOutputConstant ? std::string() : SP + "Unary" + UnaryOpTraits<T, Op>::Name() + "Kernel " + UnaryOpTraits<T, Op>::Name() + "Kernel;\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override { // returns the host-side source that creates and enqueues the kernel task
+      OpName = "op_" + OpName;
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShapeX); // element-count expression, emitted as text
+      out << "\n//------ "+OpName+"_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_"<<fNY<<" = Vec::all(static_cast<Idx>(1));\n"; // declared in the generated code but not referenced by the lines emitted below
+      out << SP << "auto const elementsPerGrid_"<<fNY<<" = Vec::all(Idx{"<< length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; // sofie_workdiv is a helper defined outside this header
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+         << ", " << UnaryOpTraits<T, Op>::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX // <Name>Kernel is the instance declared by Generate_GPU_Kernel_Definitions_ALPAKA
+         << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), " << length << ");\n";
+      out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+   std::vector<std::string> GetStdLibs() override {
+      // Every operator except Reciprocal ("1/x") and Neg ("-x") generates a call to a
+      // std:: math function (sqrt, exp, log, sin, cos, abs, atan, floor), so the
+      // generated code needs <cmath> for all of them, not only Sqrt/Exp/Log.
+      const bool usesCmath = Op != EBasicUnaryOperator::kReciprocal && Op != EBasicUnaryOperator::kNeg;
+      return usesCmath ? std::vector<std::string>{ "cmath" } : std::vector<std::string>{};
+   }
+
+   bool IsElementwise() const override { return !fIsOutputConstant; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return UnaryOpTraits<T, Op>::Op(v);
+   }
+};
+
+} // namespace SOFIE
+
+#endif
diff --git a/core/inc/SOFIE/ROperator_Basic_Is.hxx b/core/inc/SOFIE/ROperator_Basic_Is.hxx
new file mode 100644
index 0000000..fabe976
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Basic_Is.hxx
@@ -0,0 +1,145 @@
+#ifndef SOFIE_ROPERATOR_BASIC_IS
+#define SOFIE_ROPERATOR_BASIC_IS
+
+#include <SOFIE/ROperator.hxx>
+#include <SOFIE/RModel.hxx>
+#include <SOFIE/SOFIE_common.hxx>
+#include <cmath>
+
+namespace SOFIE {
+
+enum class EBasicIsOperator { kIsInf, kIsInfPos, kIsInfNeg, kIsNaN };
+
+// Code-generation traits: Name() labels the test in generated comments/identifiers, Op(x) returns the boolean C++ expression text for operand text x.
+template <EBasicIsOperator Op>
+struct IsOpTraits {}; // deliberately empty: each supported test provides a specialization below
+
+template<>
+struct IsOpTraits<EBasicIsOperator::kIsInf> {
+   static std::string Name() { return "IsInf"; }
+   static std::string Op(const std::string &x) { return "std::isinf(" + x + ")"; }
+};
+
+template<>
+struct IsOpTraits<EBasicIsOperator::kIsInfPos> {
+   static std::string Name() { return "IsInfPos"; }
+   static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && (" + x + ") > 0)"; } // operand parenthesised so a compound x is compared as a whole
+};
+
+template<>
+struct IsOpTraits<EBasicIsOperator::kIsInfNeg> {
+   static std::string Name() { return "IsInfNeg"; }
+   static std::string Op(const std::string &x) { return "(std::isinf(" + x + ") && (" + x + ") < 0)"; } // operand parenthesised so a compound x is compared as a whole
+};
+
+template<>
+struct IsOpTraits<EBasicIsOperator::kIsNaN> {
+   static std::string Name() { return "IsNaN"; }
+   static std::string Op(const std::string &x) { return "std::isnan(" + x + ")"; }
+};
+
+
+template <EBasicIsOperator Op>
+class ROperator_Basic_Is final : public ROperator {
+private:
+   std::string fNX;
+   std::string fNY;
+
+   std::vector<Dim> fShapeX;
+   std::vector<Dim> fShapeY;
+
+public:
+   ROperator_Basic_Is() {}
+
+   ROperator_Basic_Is(std::string nameX, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   void Initialize(RModel& model) override { // checks the input tensor exists, then registers a BOOL output of the same shape
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         throw std::runtime_error("SOFIE " + IsOpTraits<Op>::Name() + " op Input Tensor " + fNX + " is not found in model");
+      }
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeY = fShapeX; // element-wise test: the output shape is a copy of the input shape
+      model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY);
+   }
+
+   // Returns the CPU source for this operator: a flat loop storing the test result for tensor_<fNX>[i] into tensor_<fNY>[i].
+   std::string Generate(std::string opName) override {
+      const std::string label = "op_" + opName;
+      const std::string inputElem = "tensor_" + fNX + "[i]";
+      const auto nElements = ConvertDimShapeToLength(fShapeX);
+      std::stringstream code;
+      code << SP << "\n//---- Operator " << IsOpTraits<Op>::Name() << " " << label << "\n";
+      code << SP << "for (size_t i = 0; i < " << nElements << "; i++) {\n"
+           << SP << SP << "tensor_" << fNY << "[i] = " << IsOpTraits<Op>::Op(inputElem) << ";\n"
+           << SP << "}\n";
+      return code.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override // returns the source of the kernel functor struct Is<Name>Kernel
+   {
+      if (fIsOutputConstant)
+         return ""; // constant outputs get no kernel
+      // The struct name depends only on the test type, not on the operator index (opName is unused).
+      std::string op;
+      op  = "\n//------ " + IsOpTraits<Op>::Name() + "_KERNEL_ALPAKA\n";
+      op += SP + "struct Is" + IsOpTraits<Op>::Name() + "Kernel {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      // Output is uint8_t (bool storage), input is T (float/double).
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const & acc,\n";
+      op += SP + SP + SP + "T const * data,\n";
+      op += SP + SP + SP + "uint8_t * output,\n";
+      op += SP + SP + SP + "std::size_t const length) const\n";
+      op += SP + SP + "{\n";
+      op += SP + SP + SP + "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (idx < length) {\n"; // emitted bounds check: threads with idx >= length write nothing
+      op += SP + SP + SP + SP + "output[idx] = static_cast<uint8_t>(" + IsOpTraits<Op>::Op("data[idx]") + ");\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { // declares the functor instance <Name>Kernel
+      if (fIsOutputConstant) return ""; // Generate_GPU_Kernel_ALPAKA emits no struct in this case, so there is nothing to instantiate
+      return SP + "Is" + IsOpTraits<Op>::Name() + "Kernel " + IsOpTraits<Op>::Name() + "Kernel;\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string opName) override // returns the host-side source that creates and enqueues the kernel task
+   {
+      opName = "op_" + opName;
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShapeX); // element-count expression, emitted as text
+      // The lines below are emitted verbatim into the generated inference code.
+      out << "\n//------ " << opName << "_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n"; // declared in the generated code but not referenced by the lines emitted below
+      out << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; // sofie_workdiv is a helper defined outside this header
+      out << SP << "auto task_" << opName
+          << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+          << ", " << IsOpTraits<Op>::Name() << "Kernel" // instance declared by Generate_GPU_Kernel_Definitions_ALPAKA
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", " << length << ");\n";
+      out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() const override { return !fIsOutputConstant; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return IsOpTraits<Op>::Op(v);
+   }
+
+   std::vector<std::string> GetStdLibs() override {
+      return { std::string("cmath") };
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_BASIC_IS
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/core/inc/SOFIE/ROperator_BatchNormalization.hxx
similarity index 65%
rename from src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
rename to core/inc/SOFIE/ROperator_BatchNormalization.hxx
index a27cea4..8bc3b3d 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx
+++ b/core/inc/SOFIE/ROperator_BatchNormalization.hxx
@@ -1,9 +1,9 @@
 #ifndef SOFIE_ROPERATOR_BatchNormalization
 #define SOFIE_ROPERATOR_BatchNormalization
 
-#include "SOFIE_common.hxx"
-#include "ROperator.hxx"
-#include "RModel.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
 
 
 #include <cmath>
@@ -59,7 +59,7 @@ public:
       }
       else{
 	      throw
-		      std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a BatchNormalization operator");
+		      std::runtime_error("SOFIE Encountered unsupported type parsing a BatchNormalization operator");
       }
    }
 
@@ -72,12 +72,12 @@ public:
    std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
       if (input.size() != 5 ) {
          throw
-         std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference need 5 input tensors");
+         std::runtime_error("SOFIE BatchNormalization Op Shape inference need 5 input tensors");
       }
       for(size_t i = 0; i < input.size(); i++) {
          if (input[i].size() != 4) {
             throw
-            std::runtime_error("TMVA SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions");
+            std::runtime_error("SOFIE BatchNormalization Op Shape inference only accept tensor with 4 dimensions");
          }
       }
 
@@ -88,30 +88,30 @@ public:
    void Initialize(RModel& model) override {
       if (!model.CheckIfTensorAlreadyExist(fNX)) {
          throw
-            std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model");
+            std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNX + " fnx is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNScale)) {
 	     throw
-            std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNScale + " fns is not found in model");
+            std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNScale + " fns is not found in model");
       }
 	  if (!model.CheckIfTensorAlreadyExist(fNB)) {
          throw
-            std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model");
+            std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNB + " fnb is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNMean)) {
          throw
-            std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model");
+            std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNMean + " fnm is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNVar)) {
          throw
-            std::runtime_error("TMVA SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model");
+            std::runtime_error("SOFIE BatchNormalization op Input Tensor " + fNVar + " fnv is not found in model");
       }
 
       fShapeX = model.GetTensorShape(fNX);
 
       if (fShapeX.size() <  2 || fShapeX.size() > 4) {
          throw
-            std::runtime_error("TMVA SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX));
+            std::runtime_error("SOFIE BatchNormalization Op input tensor " + fNX + " fnx has wrong shape : " + ConvertShapeToString(fShapeX));
       }
 
       fShapeScale = model.GetTensorShape(fNScale);
@@ -185,7 +185,7 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShapeX.empty()){
-         throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Batch Normalization called to Generate without being initialized first");
       }
 
       std::stringstream out;
@@ -227,6 +227,80 @@ public:
       return out.str();
    }
 
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { // returns the source of the per-operator kernel functor struct BatchNormKernel_op_<opName>
+      opName = "op_" + opName;
+      if (fShapeX.empty())
+         throw std::runtime_error("SOFIE BatchNormalization called to Generate without being initialized first");
+      // The element count is a run-time kernel argument, so no shape-dependent value is computed here.
+      // NOTE(review): the emitted body indexes scale/bias/mean by the flat element index and never reads the
+      // variance - presumably Initialize() pre-broadcasts them and folds the variance into scale; confirm.
+      std::string kname = "BatchNormKernel_" + opName;
+      std::string op;
+      op  = "\n//------ BATCHNORM_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ X,\n";
+      op += SP + SP + SP + "T const* __restrict__ scale,\n";
+      op += SP + SP + SP + "T const* __restrict__ bias,\n";
+      op += SP + SP + SP + "T const* __restrict__ mean,\n";
+      op += SP + SP + SP + "T* __restrict__ Y,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t i = global_thread_idx; i < totalElements; i += grid_thread_extent) {\n";
+
+      op += SP + SP + SP + SP + "T val = (X[i] - mean[i]) * scale[i] + bias[i];\n";
+      // The fused activation is chosen at code-generation time: RELU clamps at zero, anything else stores val unchanged.
+      if (fActivation == EActivationType::RELU)
+         op += SP + SP + SP + SP + "Y[i] = val > static_cast<T>(0) ? val : static_cast<T>(0);\n";
+      else
+         op += SP + SP + SP + SP + "Y[i] = val;\n";
+
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   // Returns the declaration of this operator's kernel functor instance (type BatchNormKernel_op_N, variable batchNormKernel_op_N).
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      const std::string suffix = "op_" + opName;
+      return SP + "BatchNormKernel_" + suffix + " batchNormKernel_" + suffix + ";\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string opName) override { // returns the host-side source that creates and enqueues the batch-norm kernel task
+      opName = "op_" + opName;
+      if (fShapeX.empty())
+         throw std::runtime_error("SOFIE BatchNormalization called to Generate without being initialized first");
+
+      std::size_t totalElements = ConvertShapeToLength(fShapeY); // baked into the generated code as a literal
+      std::string kname = "batchNormKernel_" + opName; // instance name declared by Generate_GPU_Kernel_Definitions_ALPAKA
+
+      std::stringstream out;
+      out << "\n//------ BATCHNORM_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n"; // declared in the generated code but not referenced by the lines emitted below
+      out << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << totalElements << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"; // sofie_workdiv is a helper defined outside this header
+      // Kernel arguments are emitted in the order of the functor's parameters: X, scale, bias, mean, Y, element count.
+      out << SP << "auto task_" << fNY << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+         << ", " << kname
+         << ", alpaka::getPtrNative(deviceBuf_" << fNX     << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNScale << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNB     << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNMean  << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNY     << ")"
+         << ", static_cast<Idx>(" << totalElements << "));\n";
+      out << SP <<"alpaka::enqueue(queue, task_" << fNY << ");\n";
+      // Note: the task variable is named after the output tensor (task_<fNY>), not after the operator index.
+      return out.str();
+   }
+
    std::vector<std::string> GetBlasRoutines() override { return { std::string("Copy"), std::string("Axpy") }; }
 };
 
diff --git a/core/inc/SOFIE/ROperator_Cast.hxx b/core/inc/SOFIE/ROperator_Cast.hxx
new file mode 100644
index 0000000..3571e39
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Cast.hxx
@@ -0,0 +1,175 @@
+#ifndef SOFIE_ROPERATOR_Cast
+#define SOFIE_ROPERATOR_Cast
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+template <typename In>
+std::vector<int64_t> convertToInt64(const In* src, size_t n) { // copies n elements of src into a new vector, converting each with static_cast<int64_t>
+   std::vector<int64_t> dst(n);
+   for (size_t i = 0; i < n; ++i) // plain loop: this header does not include <algorithm>, so std::transform is avoided
+      dst[i] = static_cast<int64_t>(src[i]);
+   return dst;
+}
+
+
+class ROperator_Cast final : public ROperator
+{
+
+private:
+
+   std::string fNX;
+   std::string fNY;
+   std::vector<Dim> fShape;
+   ETensorType fType;
+
+public:
+   ROperator_Cast(){}
+   ROperator_Cast(ETensorType type,std::string nameX, std::string nameY):
+      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)),
+      fType(type)
+   {
+      fKind = OperatorKind::CAST;
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; //suggest copy to compiler
+      return ret;
+   }
+
+   void Initialize(RModel& model) override {
+      // input must be a graph input, or already initialized intermediate tensor
+      if (model.CheckIfTensorAlreadyExist(fNX) == false){
+        throw std::runtime_error("SOFIE Cast Op Input Tensor is not found in model");
+      }
+      fShape = model.GetDimTensorShape(fNX);
+      // should we add a check if the same type
+      auto inputType = model.GetTensorType(fNX);
+      if (model.IsInitializedTensor(fNX)) {
+         fIsOutputConstant = true;
+         auto inputData = model.GetInitializedTensorData(fNX);
+         if (fType == ETensorType::INT64) { // only casts to INT64 are folded into a constant tensor at initialisation time
+            size_t length = ConvertShapeToLength(fShape);
+            std::vector<int64_t> convertedData;
+            if (inputType == ETensorType::FLOAT) {
+               convertedData = convertToInt64(static_cast<const float*>(inputData.get()), length);
+            } else if (inputType == ETensorType::DOUBLE) {
+               convertedData = convertToInt64(static_cast<const double*>(inputData.get()), length);
+            } else if (inputType == ETensorType::INT32) {
+               convertedData = convertToInt64(static_cast<const int32_t*>(inputData.get()), length);
+            } else if (inputType == ETensorType::INT64) {
+               convertedData = convertToInt64(static_cast<const int64_t*>(inputData.get()), length);
+            } else {
+               fIsOutputConstant = false; // other source types: reading the buffer as int64_t could run past its end, so cast at run time instead
+            }
+            if (fIsOutputConstant) {
+               model.AddConstantTensor<int64_t>(fNY, ConvertShapeToInt(fShape), convertedData.data());
+               model.SetNotWritableInitializedTensor(fNX);
+            }
+         }
+         else
+            fIsOutputConstant = false;
+      } else if (model.IsShapeTensor(fNX) && fType == ETensorType::INT64) {
+         auto shapeData = model.GetShapeTensorValues(fNX);
+         model.AddShapeTensor(fNY, shapeData, fShape.size() == 0);
+         fIsOutputConstant = true;
+      }
+      if (!fIsOutputConstant)
+         model.AddIntermediateTensor(fNY, fType, fShape);
+      if (model.Verbose()) {
+         std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << ConvertTypeToString(fType);
+         std::cout << (fType == ETensorType::BOOL ? " (converted from BOOL) " : "") << " for " << fNY << " shape " << ConvertDimShapeToString(fShape);
+         std::cout << (fIsOutputConstant ? " (constant) " : "") << std::endl;
+      }
+   }
+
+
+   std::string Generate(std::string opName) override {
+      // Returns the CPU source casting tensor_<fNX> element by element into tensor_<fNY>;
+      // output shape can be empty if is a scalar.
+      // Only the header comment is returned when the output is a constant.
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShape);
+
+      out << "\n//------ CAST " << opName << " ---> " << fNY << "  " << ConvertDimShapeToString(fShape) << "\n";
+       // no generated code for constant outputs
+      if (fIsOutputConstant) return out.str();
+      // size_t index: matches the other operators' generated loops and avoids comparing a signed index with the length expression
+      out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
+
+      // need to handle bool case separately since casting to uint8 will not give right result
+      if (fType == ETensorType::BOOL)
+         out << SP << SP << "tensor_" << fNY << "[id] = (tensor_" << fNX << "[id] != 0) ? 1 : 0;\n";
+      else
+         out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< ConvertTypeToString(fType) << ">(tensor_" << fNX << "[id]);\n";
+
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { // returns the source of the per-operator kernel functor struct CastKernel<opName>
+      if (fIsOutputConstant) return "";
+      // BOOL targets mirror the CPU path in Generate(): any non-zero value maps to 1, which a plain static_cast would not do (0.5 -> 0).
+      const std::string rhs = (fType == ETensorType::BOOL) ? "(src[i] != 0) ? 1 : 0" : "static_cast<DstT>(src[i])";
+      std::string op = "\n//------ CAST_KERNEL_ALPAKA\n" + SP + "struct CastKernel" + opName + "{\n";
+      op += SP + SP + "template<typename TAcc, typename SrcT, typename DstT>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n";
+      op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n";
+      op += SP + SP + SP + "dst[i] = " + rhs + ";\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n};\n";
+      return op;
+   }
+
+   // Use a per-operator variable name so that multiple Cast operators with
+   // different source/destination types in the same model each get their own
+   // distinct member variable (the struct type is already per-op: CastKernelN).
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      // A constant output has no kernel struct, hence no instance to declare either.
+      return fIsOutputConstant ? std::string() : SP + "CastKernel" + opName + " castKernel_" + opName + ";\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override { // returns the host-side source that creates and enqueues the cast kernel task
+      if (fIsOutputConstant) return "";
+      // Save the raw operator index before building the "op_N" prefix so the
+      // variable name matches the one declared in Generate_GPU_Kernel_Definitions_ALPAKA.
+      std::string varName = "castKernel_" + OpName;
+      OpName = "op_" + OpName;
+      // An empty fShape is not treated as "uninitialised" here: Generate() and Initialize() both
+      // accept an empty shape as a scalar tensor, so throwing on it would reject valid scalar casts.
+      // NOTE(review): assumes ConvertDimShapeToLength yields a one-element length for an empty shape - confirm.
+
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ CAST_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_"<<fNY<<" = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"<<fNY<<" = Vec::all(Idx{"<< length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY << ", " << varName << ", alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << ")); \n";
+      out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+   
+   // Cast changes the data type, so it cannot participate in the single-type-T
+   // FusedEltwiseKernel (which reads input and writes output as the same T).
+   // Returning false here routes Cast through its own Generate_GPU_ALPAKA path,
+   // which correctly uses separate SrcT and DstT device buffers.
+   bool IsElementwise() const override { return false; }
+
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_Cast
diff --git a/core/inc/SOFIE/ROperator_Clip.hxx b/core/inc/SOFIE/ROperator_Clip.hxx
new file mode 100644
index 0000000..4a92afb
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Clip.hxx
@@ -0,0 +1,376 @@
+#ifndef SOFIE_ROPERATOR_CLIP
+#define SOFIE_ROPERATOR_CLIP
+
+#include "SOFIE_common.hxx"
+#include "ROperator.hxx"
+#include "RModel.hxx"
+
+#include <iomanip>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace SOFIE {
+
+// ---------------------------------------------------------------------------
+// ROperator_Clip
+//
+// ONNX spec: Y = min(max_val, max(min_val, X))  element-wise (max wins if min > max)
+//
+// The min and max bounds are optional in the ONNX spec:
+//   - if fNMin is empty  → no lower clipping  (effectively -inf)
+//   - if fNMax is empty  → no upper clipping  (effectively +inf)
+//
+// Bounds can be provided either as:
+//   (a) initializer / constant tensors (scalar, shape []),
+//   (b) runtime input tensors          (read in the generated CPU code; rejected by Generate_GPU_ALPAKA),
+//   (c) compile-time float literals    (via the fMin / fMax attributes).
+//
+// The implementation follows the Selu operator style exactly:
+//   - static shape stored in fShape
+//   - dynamic shape stored in fDimShape
+//   - a flat loop over all elements in Generate()
+// ---------------------------------------------------------------------------
+
+template <typename T>
+class ROperator_Clip final : public ROperator {
+private:
+
+   // Tensor names
+   std::string fNX;       // input data
+   std::string fNY;       // output
+   std::string fNMin;     // optional: tensor name for min bound
+   std::string fNMax;     // optional: tensor name for max bound
+
+
+   // Static shape (non-dynamic path, mirrors Selu)
+   std::vector<size_t> fShape;
+
+   // Dynamic shape (Dim-aware, for dynamic input tensors)
+   std::vector<Dim> fDimShape;
+   bool fIsDynamic = false;
+
+   // Compile-time bound values — used when bounds are constant tensors
+   // Initialised to the ONNX defaults (no clipping)
+   T fMin =  std::numeric_limits<T>::lowest();   // -inf equivalent
+   T fMax =  std::numeric_limits<T>::max();      //  +inf equivalent
+
+   // Flags indicating whether each bound is:
+   //   - absent (no input provided)
+   //   - a constant resolved at Initialize time
+   //   - a runtime tensor that must be read in the generated code
+   bool fHasMin         = false;
+   bool fHasMax         = false;
+   bool fMinIsConstant  = false;
+   bool fMaxIsConstant  = false;
+
+public:
+   // Default constructor: no tensor names are set and both bounds stay absent.
+   ROperator_Clip() {}
+
+   // Tensor-bound constructor (X, Y, min, max); an empty min/max name means that bound is absent.
+   ROperator_Clip(std::string nameX,
+                  std::string nameY,
+                  std::string nameMin = "",
+                  std::string nameMax = "")
+      : fNX(UTILITY::Clean_name(nameX)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      // Bound names are cleaned only when supplied; otherwise they stay empty.
+      if (!nameMin.empty()) fNMin = UTILITY::Clean_name(nameMin);
+      if (!nameMax.empty()) fNMax = UTILITY::Clean_name(nameMax);
+      fKind = OperatorKind::CLIP;
+      fOutputTensorNames = { fNY };
+      fInputTensorNames  = { fNX };
+      for (const std::string * bound : { &fNMin, &fNMax })
+         if (!bound->empty()) fInputTensorNames.push_back(*bound);
+   }
+
+   // Scalar-bound constructor: both bounds are given by value, so they are marked present and constant.
+   ROperator_Clip(std::string nameX,
+                  std::string nameY,
+                  T minVal,
+                  T maxVal)
+      : fNX(UTILITY::Clean_name(nameX)),
+        fNY(UTILITY::Clean_name(nameY)),
+        fMin(minVal), fMax(maxVal)
+   {
+      fHasMin = fHasMax = true;
+      fMinIsConstant = fMaxIsConstant = true;
+      fKind = OperatorKind::CLIP;
+      fOutputTensorNames = { fNY };
+      fInputTensorNames  = { fNX };
+   }
+
+
+   // -----------------------------------------------------------------------
+   // Validates the input tensor, records its shape, resolves the optional
+   // bounds and registers the output with the input's type and shape.
+   // Throws std::runtime_error if the input tensor is not known to the model.
+   void Initialize(RModel& model) override
+   {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error(
+            "SOFIE Clip Op Input Tensor " + fNX + " is not found in model");
+
+      // Shape bookkeeping: fDimShape is filled in both cases, fShape only for
+      // statically shaped inputs.
+      if (!model.IsDynamicTensor(fNX)) {
+         fShape    = model.GetTensorShape(fNX);
+         fDimShape = ConvertShapeToDim(fShape);
+      } else {
+         fIsDynamic = true;
+         fDimShape  = model.GetDynamicTensorShape(fNX);
+      }
+
+      // A bound counts as present when it is named and known to the model.  If
+      // it is also an initialized tensor, its first element is read now and the
+      // tensor is flagged through SetNotWritableInitializedTensor; otherwise the
+      // value is left to be read by the generated code.
+      // NOTE(review): a named bound that the model does not know is ignored.
+      auto resolveBound = [&model](const std::string & name, T & value,
+                                   bool & present, bool & constant) {
+         if (name.empty() || !model.CheckIfTensorAlreadyExist(name))
+            return;
+         present = true;
+         if (!model.IsInitializedTensor(name))
+            return;
+         value    = static_cast<T*>(model.GetInitializedTensorData(name).get())[0];
+         constant = true;
+         model.SetNotWritableInitializedTensor(name);
+      };
+      resolveBound(fNMin, fMin, fHasMin, fMinIsConstant);
+      resolveBound(fNMax, fMax, fHasMax, fMaxIsConstant);
+
+      // Output mirrors the input's element type and (static or dynamic) shape.
+      const auto outputType = model.GetTensorType(fNX);
+      if (fIsDynamic)
+         model.AddIntermediateTensor(fNY, outputType, fDimShape);
+      else
+         model.AddIntermediateTensor(fNY, outputType, fShape);
+
+      if (model.Verbose()) {
+         // Constant bounds are printed by value, runtime bounds by tensor name.
+         auto describe = [](bool constant, T value, const std::string & name) {
+            return constant ? std::to_string(value) : name + "(runtime)";
+         };
+         std::cout << "Clip : " << fNX << " " << ConvertShapeToString(fShape);
+         if (fHasMin)
+            std::cout << "  min=" << describe(fMinIsConstant, fMin, fNMin);
+         if (fHasMax)
+            std::cout << "  max=" << describe(fMaxIsConstant, fMax, fNMax);
+         std::cout << " --> " << fNY << "\n";
+      }
+
+      // Standard headers registered with the model for the generated code.
+      model.AddNeededStdLib("algorithm");
+      model.AddNeededStdLib("limits");
+   }
+
+
+   // -----------------------------------------------------------------------
+   // GPU ALPAKA
+   // -----------------------------------------------------------------------
+
+   // Emits this operator's ALPAKA kernel functor.  The struct name embeds
+   // opName, so every Clip in a model gets its own type; bounds and element
+   // count are kernel arguments and each in-range thread clamps one element.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override
+   {
+      const std::string in2 = SP + SP;
+      const std::string in3 = SP + SP + SP;
+      std::stringstream code;
+      code << "\n//------ CLIP_KERNEL_ALPAKA op_" << opName << "\n"
+           << "struct ClipKernel_op_" << opName << " {\n"
+           << SP << "template<typename TAcc, typename T>\n"
+           << SP << "ALPAKA_FN_ACC void operator()(\n"
+           << in2 << "TAcc const & acc,\n"
+           << in2 << "T const * __restrict__ data,\n"
+           << in2 << "T * __restrict__ out,\n"
+           << in2 << "std::size_t numElements,\n"
+           << in2 << "T minVal,\n"
+           << in2 << "T maxVal) const\n"
+           << SP << "{\n"
+           << in2 << "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n"
+           << in2 << "if (idx < numElements) {\n"
+           << in3 << "T val = data[idx];\n"
+           << in3 << "val = val < minVal ? minVal : val;\n"
+           << in3 << "out[idx] = val > maxVal ? maxVal : val;\n"
+           << in2 << "}\n"
+           << SP << "}\n"
+           << "};\n";
+      return code.str();
+   }
+
+   // Declares the kernel functor instance "clipKernel_op_<opName>" of the
+   // per-operator type "ClipKernel_op_<opName>".
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override
+   {
+      return "ClipKernel_op_" + opName + " clipKernel_op_" + opName + ";\n";
+   }
+
+   // Emits the host-side launch of this operator's kernel: work division,
+   // task creation (input/output device buffers, element count, both bounds)
+   // and enqueue.  Throws std::runtime_error when no shape has been recorded
+   // yet or when a bound is a runtime (non-constant) tensor.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override
+   {
+      // Kernel instance name, built from the raw OpName exactly as in
+      // Generate_GPU_Kernel_Definitions_ALPAKA; the task name adds "op_".
+      const std::string kernelVar = "clipKernel_op_" + OpName;
+      const std::string opTag     = "op_" + OpName;
+
+      if (fShape.empty() && fDimShape.empty())
+         throw std::runtime_error(
+            "SOFIE Operator Clip called to Generate_GPU_ALPAKA without being initialized first");
+
+      const std::string typeName = TensorType<T>::Name();
+      const std::string length   = ConvertDimShapeToLength(fDimShape);
+
+      // Kernel argument for one bound, always wrapped in a cast to T:
+      //   constant -> literal, absent -> numeric_limits extreme,
+      //   runtime tensor -> std::runtime_error(unsupported).
+      auto boundArg = [&](bool isConstant, bool isPresent, T value,
+                          const char * extreme, const char * unsupported) {
+         if (!isConstant && isPresent)
+            throw std::runtime_error(unsupported);
+         const std::string expr = isConstant
+            ? ToStringHighPrec(value)
+            : "std::numeric_limits<" + typeName + ">::" + extreme;
+         return "static_cast<" + typeName + ">(" + expr + ")";
+      };
+      const std::string castMin = boundArg(fMinIsConstant, fHasMin, fMin, "lowest()",
+         "SOFIE Clip GPU ALPAKA: runtime (non-constant) min bound is not supported in GPU path");
+      const std::string castMax = boundArg(fMaxIsConstant, fHasMax, fMax, "max()",
+         "SOFIE Clip GPU ALPAKA: runtime (non-constant) max bound is not supported in GPU path");
+
+      std::stringstream out;
+      out << "\n//------ CLIP_GPU_ALPAKA " << opTag << "\n"
+          << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n"
+          << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << length << "});\n"
+          << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n"
+          << SP << "auto task_" << opTag
+          << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY << ", " << kernelVar
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast<std::size_t>(" << length << ")"
+          << ", " << castMin << ", " << castMax << ");\n"
+          << SP << "alpaka::enqueue(queue, task_" << opTag << ");\n";
+
+      return out.str();
+   }
+   // Reports Clip as element-wise; GetElementwiseExpr supplies the per-element expression.
+   bool IsElementwise() const override { return true; }
+
+   // Returns the expression clipping element v as min(max_bound, max(min_bound, v)):
+   // ONNX defines Clip as Min(Max(x, min), max), so the upper bound wins when min > max.
+   // Constant bounds become literals, runtime bounds tensor_<name>[0]; absent bounds emit no call.
+   std::string GetElementwiseExpr(const std::string& v) const override
+   {
+      std::string expr = v;
+      if (fHasMin || fMinIsConstant) {
+         const std::string minExpr = fMinIsConstant ? ToStringHighPrec(fMin) : "tensor_" + fNMin + "[0]";
+         expr = "std::max(" + minExpr + ", " + expr + ")";
+      }
+      if (fHasMax || fMaxIsConstant) {
+         const std::string maxExpr = fMaxIsConstant ? ToStringHighPrec(fMax) : "tensor_" + fNMax + "[0]";
+         expr = "std::min(" + maxExpr + ", " + expr + ")";
+      }
+      return expr;
+   }
+   // Returns the output tensor name (fNY).
+   std::string GetFusableOutputTensorName() override { return fNY; }
+   // Passes the current input and output names to removal_func, then renames both to fusable_tensor_name.
+   void UpdateFusableTensorName(std::string fusable_tensor_name,
+                                 const std::function<void(const std::string&)>& removal_func) override
+   {
+      removal_func(fNX);
+      removal_func(fNY);
+      fNX = fusable_tensor_name;
+      fNY = fusable_tensor_name;
+      fInputTensorNames[0]  = fNX;   // refreshed from the members after the rename
+      fOutputTensorNames[0] = fNY;
+   }
+
+   // -----------------------------------------------------------------------
+   // Generate
+   //
+   // Emits the CPU inference code: one flat loop that stores the clipped
+   // value of every input element into the output tensor.
+   // OpName only appears in the emitted banner comment.
+   //
+   // The per-element expression is min(max_bound, max(min_bound, x)): the
+   // lower bound is applied first and the upper bound last.  ONNX defines
+   // Clip as Min(Max(x, min), max), so the upper bound must win when
+   // min > max; the ALPAKA kernel of this operator clamps in the same order.
+   // For min <= max the result is the same as with the opposite nesting.
+   //
+   // Bound expressions, by priority:
+   //    1. compile-time constant value  -> literal
+   //    2. runtime input tensor         -> tensor_<name>[0]  (scalar)
+   //    3. bound not provided           -> no call is emitted for it
+   //
+   // Throws std::runtime_error when no shape has been recorded yet.
+   // -----------------------------------------------------------------------
+   std::string Generate(std::string OpName) override
+   {
+      OpName = "op_" + OpName;
+
+      if (fShape.empty() && fDimShape.empty())
+         throw std::runtime_error(
+            "SOFIE Operator Clip called to Generate without being initialized first");
+
+      std::stringstream out;
+      out << SP << "\n//------ CLIP " << OpName << "\n";
+
+      // Element count as an expression usable in the generated code.
+      const std::string length = ConvertDimShapeToLength(fDimShape);
+
+      // Element accessors used inside the emitted loop.
+      const std::string input  = "tensor_" + fNX + "[id]";
+      const std::string output = "tensor_" + fNY + "[id]";
+
+      // Lower bound first ...
+      std::string clipped = input;
+      if (fHasMin || fMinIsConstant) {
+         const std::string minExpr =
+            fMinIsConstant ? ToStringHighPrec(fMin) : "tensor_" + fNMin + "[0]";
+         clipped = "std::max(" + minExpr + ", " + clipped + ")";
+      }
+
+      // ... then the upper bound, so that it takes precedence if min > max.
+      if (fHasMax || fMaxIsConstant) {
+         const std::string maxExpr =
+            fMaxIsConstant ? ToStringHighPrec(fMax) : "tensor_" + fNMax + "[0]";
+         clipped = "std::min(" + maxExpr + ", " + clipped + ")";
+      }
+
+      // The index is emitted as size_t rather than int so that tensors with
+      // more than INT_MAX elements are still traversed correctly.
+      out << SP << "for (size_t id = 0; id < " << length << " ; id++) {\n";
+      out << SP << SP << output << " = " << clipped << ";\n";
+      out << SP << "}\n";
+
+      return out.str();
+   }
+
+
+private:
+
+   // Helper: render val as a C++ literal of type T with round-trip precision.
+   // A '.' is appended only to floating-point text that has neither '.' nor an
+   // exponent, so "1e+10" is no longer turned into the ill-formed "1e+10.";
+   // integral T is left as-is.  NOTE(review): inf/nan still give invalid text.
+   std::string ToStringHighPrec(T val) const {
+      std::ostringstream ss;
+      ss << std::setprecision(std::numeric_limits<T>::max_digits10) << val;
+      if (!std::numeric_limits<T>::is_integer && ss.str().find_first_of(".eE") == std::string::npos) ss << ".";
+      if (std::is_same<T, float>::value) ss << "f";
+      return ss.str();
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_CLIP
\ No newline at end of file
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx b/core/inc/SOFIE/ROperator_Comparision.hxx
similarity index 57%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx
rename to core/inc/SOFIE/ROperator_Comparision.hxx
index 7648a9a..1e02d53 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx
+++ b/core/inc/SOFIE/ROperator_Comparision.hxx
@@ -1,4 +1,3 @@
-
 #ifndef SOFIE_ROperator_Comparision
 #define SOFIE_ROperator_Comparision
 
@@ -73,30 +72,26 @@ public:
    ROperator_Comparision(){}
    ROperator_Comparision(const std::string & nameX1, const std::string & nameX2, const std::string & nameY):
       fNX1(UTILITY::Clean_name(nameX1)), fNX2(UTILITY::Clean_name(nameX2)), fNY(UTILITY::Clean_name(nameY)){
+         fKind = OperatorKind::COMPARISON;
          fInputTensorNames = { fNX1, fNX2 };
-         
-         // output will be a boolean vector so should not be considered for memory optimized pool
          fOutputTensorNames = { fNY };
       }
 
-   // type of output given input
    std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
       return input;
    }
 
-   // shape of output tensors given input tensors
    std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; // return vector size 1 with first input
+      auto ret = input;
       return ret;
    }
 
    void Initialize(RModel& model) override {
-      // input must be a graph input, or already initialized intermediate tensor
       if (!model.CheckIfTensorAlreadyExist(fNX1)){
-         throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model");
+         throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX1 + "is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNX2)) {
-         throw std::runtime_error(std::string("TMVA SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model");
+         throw std::runtime_error(std::string("SOFIE Comparision Op Input Tensor ") + fNX2 + "is not found in model");
       }
       fShapeX1 = model.GetTensorShape(fNX1);
       fShapeX2 = model.GetTensorShape(fNX2);
@@ -104,38 +99,34 @@ public:
       fTensorType2 = model.GetTensorType(fNX2);
       bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2);
       if (broadcast) {
-         // Y is the common shape of A and B
-         fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2);
+         // ONNX comparison ops support multidirectional broadcasting (numpy semantics):
+         // both inputs can be broadcast to the common output shape.
+         auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeX1, fShapeX2);
+         fShapeY = ret.second;
          bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY);
          bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY);
-         // Broadcast A to Y
          if (broadcastX1) {
             if (model.IsInitializedTensor(fNX1)) {
                auto data = model.GetInitializedTensorData(fNX1);
                std::shared_ptr<void> broadcastedData(
                   UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeX1, fShapeY),
                   std::default_delete<T[]>());
-               // Update the data and the shape of A
                model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData);
                fShapeX1 = fShapeY;
             } else {
-               // Add an intermediate tensor for broadcasting A
                fNBroadcastedX1 = "Broadcasted" + fNX1;
                model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY);
             }
          }
-         // Broadcast B to Y
          if (broadcastX2) {
             if (model.IsInitializedTensor(fNX2)) {
                auto data = model.GetInitializedTensorData(fNX2);
                std::shared_ptr<void> broadcastedData(
                   UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeX2, fShapeY),
                   std::default_delete<T[]>());
-               // Update the data and the shape of B
                model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData);
                fShapeX2 = fShapeY;
             } else {
-               // Add an intermediate tensor for broadcasting B
                fNBroadcastedX2 = "Broadcasted" + fNX2;
                model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY);
             }
@@ -143,8 +134,7 @@ public:
       } else {
          fShapeY = fShapeX1;
       }
-      // case of constant tensors
-      if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2) ) {
+      if (model.IsInitializedTensor(fNX1) && model.IsInitializedTensor(fNX2)) {
          fIsOutputConstant = true;
          auto data1 = static_cast<T *>(model.GetInitializedTensorData(fNX1).get());
          auto data2 = static_cast<T *>(model.GetInitializedTensorData(fNX2).get());
@@ -158,9 +148,8 @@ public:
                << ConvertValuesToString(length,outData) << std::endl;
          delete [] outData;
       } else {
-         model.AddIntermediateTensor(fNY, ETensorType::BOOL , fShapeY);
+         model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY);
       }
-      // check if this is not output operators to add a specific line for definining the tensor_xxx variable
       const auto & outputTensorNames = model.GetOutputTensorNames();
       fIsModelOutput = false;
       if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end())
@@ -170,14 +159,12 @@ public:
    std::string Generate(std::string OpName) override {
       if (fIsOutputConstant) return "";
       OpName = "op_" + OpName;
-
-     if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate without being initialized first");
+      if (fShapeY.empty()) {
+         throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first");
       }
       std::stringstream out;
       out << SP << "\n//------ " << ComparisionTrait<T,Op>::Name() << "\n";
       size_t length = ConvertShapeToLength(fShapeY);
-      // Broadcast A if it's uninitialized
       if (!fNBroadcastedX1.empty()) {
          std::string type1 = ConvertTypeToString(fTensorType1);
          out << SP << "// Broadcasting uninitialized tensor " << fNX1 << "\n";
@@ -187,7 +174,6 @@ public:
          out << SP << SP << "delete[] data;\n";
          out << SP << "}\n";
       }
-      // Broadcast B if it's uninitialized
       if (!fNBroadcastedX2.empty()) {
          std::string type2 = ConvertTypeToString(fTensorType2);
          out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n";
@@ -199,14 +185,126 @@ public:
       }
       const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1;
       const std::string& nameX2 = fNBroadcastedX2.empty()? fNX2 : fNBroadcastedX2;
-
       out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
       out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait<T,Op>::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") <<  " ;\n";
       out << SP << "}\n";
-      // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector
       if (!fIsModelOutput)
          out << SP << "const std::vector<bool> & tensor_" << fNY << " = fTensor_" << fNY << ";\n";
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) return "";
+      opName = "op_" + opName;
+      if (fShapeY.empty())
+         throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first");
+      // Emits the kernel functor "ComparisonKernel_op_<N>"; the shapes and strides known here are written out as literals.
+      const std::size_t D = fShapeY.size();
+      std::size_t totalElements = ConvertShapeToLength(fShapeY);   // NOTE(review): not referenced again by the generator code below
 
+      std::vector<size_t> shapeX1_padded(D, 1);   // input shapes right-aligned to rank D, leading dims = 1
+      std::vector<size_t> shapeX2_padded(D, 1);
+      {
+         size_t off1 = D - fShapeX1.size();
+         for (size_t i = 0; i < fShapeX1.size(); ++i)
+            shapeX1_padded[off1 + i] = fShapeX1[i];
+         size_t off2 = D - fShapeX2.size();
+         for (size_t i = 0; i < fShapeX2.size(); ++i)
+            shapeX2_padded[off2 + i] = fShapeX2[i];
+      }
+      // Strides of the padded input shapes and of the output shape.
+      auto stridesX1 = UTILITY::ComputeStrideFromShape(shapeX1_padded);
+      auto stridesX2 = UTILITY::ComputeStrideFromShape(shapeX2_padded);
+      auto stridesY  = UTILITY::ComputeStrideFromShape(fShapeY);
+
+      std::string type1  = ConvertTypeToString(fTensorType1);
+      std::string type2  = ConvertTypeToString(fTensorType2);
+      std::string kname  = "ComparisonKernel_" + opName;
+      std::string opname = ComparisionTrait<T, Op>::Name();
+
+      std::string op;
+      op  = "\n//------ " + opname + "_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + type1 + " const* __restrict__ x1,\n";
+      op += SP + SP + SP + type2 + " const* __restrict__ x2,\n";
+      op += SP + SP + SP + "uint8_t* __restrict__ output,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+      // Body: each thread starts at its global index and advances by the grid-wide thread count.
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+      // Split the flat output index into one coordinate per output dimension.
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+             + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % "
+             + std::to_string(fShapeY[d]) + "u;\n";
+      }
+      op += "\n";
+      // Offset into x1: dimensions of extent 1 contribute 0 (broadcast), the others coordinate * stride.
+      op += SP + SP + SP + SP + "std::size_t const x1_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         if (shapeX1_padded[d] == 1)
+            op += SP + SP + SP + SP + SP + "0u";
+         else
+            op += SP + SP + SP + SP + SP
+                + "out_" + std::to_string(d)
+                + " * " + std::to_string(stridesX1[d]) + "u";
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+      // Offset into x2, built the same way.
+      op += SP + SP + SP + SP + "std::size_t const x2_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         if (shapeX2_padded[d] == 1)
+            op += SP + SP + SP + SP + SP + "0u";
+         else
+            op += SP + SP + SP + SP + SP
+                + "out_" + std::to_string(d)
+                + " * " + std::to_string(stridesX2[d]) + "u";
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+      // Store the comparison result for this element.
+      op += SP + SP + SP + SP + "output[elem_idx] = "+ ComparisionTrait<T,Op>::Op("x1[x1_idx]" , "x2[x2_idx]") + " ;\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      // An output flagged as constant gets no kernel instance declaration.
+      if (fIsOutputConstant) return "";
+      const std::string suffix = "op_" + opName;
+      return SP + "ComparisonKernel_" + suffix + " comparisonKernel_" + suffix + ";\n";
+   }
+
+   // Emits the host-side launch: work division sized by the output element
+   // count, task creation with both input buffers and the output buffer, enqueue.
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) return "";   // constant output: nothing to launch
+      if (fShapeY.empty())
+         throw std::runtime_error("SOFIE Comparision Op called to Generate without being initialized first");
+
+      const std::string tag = "op_" + opName;
+      const std::size_t numElements = ConvertShapeToLength(fShapeY);
+
+      std::stringstream out;
+      out << "\n//------ " << ComparisionTrait<T,Op>::Name() << "_GPU_ALPAKA\n"
+          << SP << "auto const elementsPerThread_" << tag << " = Vec::all(static_cast<Idx>(1));\n"
+          << SP << "auto const elementsPerGrid_"   << tag << " = Vec::all(Idx{" << numElements << "});\n"
+          << SP << "auto const workDiv_" << tag << " = sofie_workdiv(elementsPerGrid_" << tag << ");\n"
+          << SP << "auto task_" << tag << " = alpaka::createTaskKernel<Acc>(workDiv_" << tag
+          << ", comparisonKernel_" << tag
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX1 << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX2 << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast<Idx>(" << numElements << "));\n"
+          << SP << "alpaka::enqueue(queue, task_" << tag << ");\n";
       return out.str();
    }
 
diff --git a/core/inc/SOFIE/ROperator_Concat.hxx b/core/inc/SOFIE/ROperator_Concat.hxx
new file mode 100644
index 0000000..36ede27
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Concat.hxx
@@ -0,0 +1,505 @@
+#ifndef SOFIE_ROPERATOR_Concat
+#define SOFIE_ROPERATOR_Concat
+
+
+ #include "SOFIE/SOFIE_common.hxx"
+ #include "SOFIE/ROperator.hxx"
+ #include "SOFIE/RModel.hxx"
+
+ #include <sstream>
+ #include <algorithm>
+ #include <iterator>
+ #include <iomanip>
+ #include <limits>
+
+ namespace SOFIE{
+
+     class ROperator_Concat final : public ROperator
+     {
+     private:
+         int fAxis=0;
+         int fnewAxis=0;
+         std::vector<std::string> fInputs;
+         std::string fOutput;
+         std::vector<Dim>fOutputShape;
+         std::vector<Dim> fOutputShapeData; // in case output is a shape tensor we store here the output shape value data (can be parametric)
+         std::vector<std::vector<Dim>> fInputShapes;
+         ETensorType fInputType = ETensorType::UNDEFINED;
+
+     public:
+
+         ROperator_Concat(){}
+         ROperator_Concat(std::vector<std::string> inputs, int axis, int newAxis, std::string output):
+         fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) {
+            fInputs.reserve(inputs.size());
+            for (std::size_t i = 0; i < inputs.size(); ++i)
+               fInputs.emplace_back(UTILITY::Clean_name(inputs[i]));
+            // fInputTensorNames holds views into the cleaned names, whose storage is owned by fInputs
+            fInputTensorNames.resize(fInputs.size());
+            for (std::size_t i = 0; i < fInputs.size(); ++i)
+               fInputTensorNames[i] = std::string_view(fInputs[i]);
+            fOutputTensorNames = { fOutput };
+         }
+
+         std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+             return input; // the input types are handed back unchanged
+         }
+
+         // Compute the output shape from integer input shapes; Initialize() calls it when no input tensor is dynamic.
+         std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> inputs) override {
+             std::vector<std::vector<size_t>> ret(1); // one entry: the shape of the single output
+            // a negative axis counts from the end: it is normalised with the rank of the first input (fAxis is updated in place)
+            if (fAxis<0) {
+               fAxis = inputs[0].size()+fAxis;
+            }
+            if (fAxis < 0 || fAxis >= (int) inputs[0].size())
+               throw std::runtime_error("SOFIE Concat Op - invalid axis value ");
+
+            int concat_dim=0; // running sum of the input sizes along fAxis
+            // case of Concat (fnewAxis == 0) and not ConcatFromSequence
+            if(fnewAxis == 0){
+               for (size_t i = 0; i < inputs.size(); i++) {
+                  if (i > 0 && inputs[i].size() != inputs[i - 1].size()) // ranks must agree with the previous input
+                     throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " +
+                                              ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1]));
+                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
+                     if ((int)iaxis == fAxis)
+                        concat_dim += inputs[i][iaxis];
+                     else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis]) // every other axis must match the previous input
+                        throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " +
+                                                 ConvertShapeToString(inputs[i]) + " and " +
+                                                 ConvertShapeToString(inputs[i - 1]));
+                  }
+               }
+
+               // output shape: the shape of the first input with the fAxis entry replaced by the summed size
+               ret[0] = inputs[0];
+               ret[0][fAxis] = concat_dim;
+            }
+            std::vector<int> stack; // size along fAxis of each input, filled only when fnewAxis == 1
+            // case ConcatFromSequence (fnewAxis == 1): same rank and off-axis checks as above
+            if(fnewAxis == 1){
+               for(size_t i = 0; i < inputs.size(); i++) {
+                  if (i > 0 && inputs[i].size() != inputs[i-1].size() )
+                  throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " +
+                     ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1]));
+                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
+                     if ((int) iaxis == fAxis)
+                        stack.push_back(inputs[i][iaxis]);
+                     else
+                     if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis])
+                        throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " +
+                        ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1]));
+                  }
+
+               }
+               for(auto it:stack) // NOTE(review): the result holds only the per-input sizes along fAxis - confirm this is the intended stacked shape
+               ret[0].push_back(it);
+            }
+
+            return ret;
+         }
+
+         // Compute the output shape from Dim (possibly parametric) input shapes; Initialize() calls it when an input tensor is dynamic.
+         std::vector<Dim> ShapeInference(const std::vector<std::vector<Dim>> & inputs, const RModel & model) {
+            std::vector<Dim> ret(inputs[0].size()); // output has the rank of the first input
+            // a negative axis counts from the end: it is normalised with the rank of the first input (fAxis is updated in place)
+            if (fAxis<0) {
+               fAxis = inputs[0].size()+fAxis;
+            }
+            if (fAxis < 0 || fAxis >= (int) inputs[0].size())
+               throw std::runtime_error("SOFIE Concat Op - invalid axis value ");
+
+            Dim concat_dim; // accumulated size along fAxis: an integer sum or a symbolic "a + b" expression
+            if(fnewAxis == 0){
+               for (size_t i = 0; i < inputs.size(); i++) {
+                  if (i > 0 && inputs[i].size() != inputs[i - 1].size()) // ranks must agree with the previous input
+                     throw std::runtime_error("SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " +
+                                              ConvertDimShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDimShapeToString(inputs[i - 1]));
+                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
+                     if ((int)iaxis == fAxis) {
+                        // concatenation axis: integer sizes are summed, anything parametric becomes a "a + b" expression
+                        if (concat_dim.param.empty() && concat_dim.dim == 0) // nothing accumulated yet
+                           concat_dim = inputs[i][iaxis];
+                        else if (inputs[i][iaxis].isParam || concat_dim.isParam) {
+                           concat_dim =
+                              Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(),
+                                 static_cast<size_t>(-1)}; // dim == size_t(-1) marks the value as an expression (tested below)
+                        } else {
+                           concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim };
+                        }
+                     }
+                     else if (i == 0) {
+                        ret[iaxis] = inputs[i][iaxis]; // the first input seeds every non-concatenated axis
+                     }
+                     else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) {
+                        throw std::runtime_error("SOFIE Concat Op - input tensors have wrong shapes " +
+                                                 ConvertDimShapeToString(inputs[i]) + " and " +
+                                                 ConvertDimShapeToString(inputs[i - 1]));
+                     }
+                     else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){
+                        // an integer size replaces a parametric one stored so far
+                        ret[iaxis] = inputs[i][iaxis];
+                     }
+                     else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) {
+                        // both parametric: keep the parameter that comes first in model.GetDimShapeNames()
+                        auto & dimNames = model.GetDimShapeNames();
+                        auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param);
+                        auto p2 = std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param);
+                        if (p1 < p2) ret[iaxis] = inputs[i][iaxis];
+                     }
+
+                  }
+                  // wrap an expression in parentheses (this runs once per input, so the parentheses nest)
+                  if (concat_dim.isParam && concat_dim.dim == static_cast<size_t>(-1))
+                     concat_dim =  Dim{ std::string("(") + concat_dim.GetVal() +  std::string(")"), concat_dim.dim };
+               }
+
+               // output shape for concatenated axis
+               ret[fAxis] = concat_dim;
+
+            }
+            // case of stacking (not supported yet)
+            // here we need to check that input shapes are the same
+            // for example for fAxis == 0
+            // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....]
+            if(fnewAxis == 1){
+               throw std::runtime_error("SOFIE Concat Op - stacking (i.e. COncatFromSequence with new_axis=1) is not supported ");
+            }
+            return ret;
+         }
+
+         void Initialize(RModel& model) override {
+            // Validates the inputs, infers fOutputShape and registers the output tensor with the model.
+            // The output becomes, in order of preference:
+            //  - a constant tensor, when every input value is known now (INT64 data joined along axis 0);
+            //  - a shape tensor, when inputs are shape/initialized tensors and some shape value is not an integer;
+            //  - an intermediate tensor computed by the generated code, in every other case.
+            // Throws std::runtime_error if an input tensor is not found in the model.
+            std::vector<std::vector<size_t>> inputIntShapes;
+            for (auto &it : fInputs) {
+               if (model.CheckIfTensorAlreadyExist(it) == false) {
+                  throw std::runtime_error("SOFIE Concat Op Input Tensor " + it + " is not found in model");
+               }
+               fInputShapes.push_back(model.GetDimTensorShape(it));
+               if (!model.IsDynamicTensor(it)) {
+                  inputIntShapes.push_back(ConvertShapeToInt(fInputShapes.back()));
+               }
+            }
+
+            if (inputIntShapes.size() == fInputs.size()) {
+               // if all input shapes are static we can compute output shape at initialization time
+               auto outputIntShape = ShapeInference(inputIntShapes)[0];
+               fOutputShape = ConvertShapeToDim(outputIntShape);
+               if (model.Verbose())
+                  std::cout << "Initialize Concat operator with defined inputs shapes, "
+                           << "output has shape " << ConvertShapeToString(outputIntShape) << std::endl;
+            } else {
+               // if at least one input shape is dynamic we need to compute output shape using the symbolic expression for the dimensions
+               fOutputShape = ShapeInference(fInputShapes, model);
+               if (model.Verbose())
+                  std::cout << "Initialize Concat operator with dynamic inputs shapes, "
+                           << "output has shape " << ConvertDimShapeToString(fOutputShape) << std::endl;
+            }
+
+            // The folding below copies the inputs back to back as int64_t values, which is a valid
+            // concatenation only for INT64 data joined along axis 0 (contiguous memory). For any other
+            // type or axis the output is left to the generated code (fAxis was normalised by ShapeInference).
+            const bool canFold = model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0;
+            fIsOutputConstant = canFold;
+            bool isOutputShape = canFold;
+            if (canFold) {
+               for (auto &input : fInputs) {
+                  bool isKnownNow = model.IsInitializedTensor(input) || model.IsShapeTensor(input);
+                  if (model.IsDynamicTensor(input) || !isKnownNow) {
+                     // dynamic or standard intermediate tensor: its values exist only at run time
+                     fIsOutputConstant = false;
+                     isOutputShape = false;
+                     break;
+                  }
+                  if (model.IsInitializedTensor(input)) continue;
+                  // shape tensor: the output is a shape tensor too, and is constant only if every value is an integer
+                  auto shapeData = model.GetShapeTensorValues(input);
+                  if (ConvertShapeToInt(shapeData).size() != shapeData.size())
+                     fIsOutputConstant = false;
+               }
+            }
+
+            if (fIsOutputConstant) {
+               // fold the output now: the input values are joined back to back into one constant tensor
+               auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible
+               std::vector<int64_t> outputData(ConvertShapeToLength(outputShape));
+               size_t offset = 0;
+               for (auto &input : fInputs) {
+                  if (!model.IsInitializedTensor(input)) {
+                     // fully defined shape tensor: not an initialized tensor, so read its values instead of a data buffer
+                     auto shapeValues = ConvertShapeToInt(model.GetShapeTensorValues(input));
+                     std::copy(shapeValues.begin(), shapeValues.end(), outputData.begin() + offset);
+                     offset += shapeValues.size();
+                     continue;
+                  }
+                  auto inputData = static_cast<int64_t *>(model.GetInitializedTensorData(input).get());
+                  auto inputShape = model.GetTensorShape(input); // shape is not dynamic if it is constant
+                  size_t inputLength = ConvertShapeToLength(inputShape);
+                  std::copy(inputData, inputData + inputLength, outputData.begin() + offset);
+                  offset += inputLength;
+                  // the data of the input tensor don't need to be written in the generated code and data file
+                  model.SetNotWritableInitializedTensor(input);
+               }
+               model.AddConstantTensor<int64_t>(fOutput, outputShape, outputData.data());
+               if (model.Verbose()) {
+                  std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : "
+                            << ConvertValuesToString(outputData) << " (constant)" << std::endl;
+               }
+            } else if (isOutputShape) {
+               // some shape value is not an integer: build the output as a shape tensor of Dim values
+               auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible
+               if (outputShape.size() != 1)
+                  throw std::runtime_error("SOFIE Concat Op - output shape for shape tensor must have rank 1");
+               // output shape is a rank 1 tensor with size equal to the output rank
+               std::vector<Dim> outputData(outputShape[0]);
+               size_t offset = 0;
+               for (auto &input : fInputs) {
+                  std::vector<Dim> inputData;
+                  auto inputShape = model.GetTensorShape(input);         // shape is not dynamic
+                  size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar
+                  if (model.IsShapeTensor(input)) {
+                     inputData = model.GetShapeTensorValues(input);
+                  } else if (model.IsInitializedTensor(input)) {
+                     inputData.resize(inputLength);
+                     auto intData = static_cast<int64_t *>(model.GetInitializedTensorData(input).get());
+                     for (size_t i = 0; i < inputData.size(); i++)
+                        inputData[i] = Dim{static_cast<size_t>(intData[i])};
+                  } else {
+                     // this should not happen
+                     throw std::runtime_error("SOFIE Concat Operator- invalid tensor input " + input +
+                                              " for shape output type");
+                  }
+                  std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset);
+                  offset += inputLength;
+               }
+               // add output tensor
+               model.AddShapeTensor(fOutput, outputData, false); // cannot be a  scalar
+               fOutputShapeData = outputData;
+               if (model.Verbose()) {
+                  std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : "
+                            << ConvertDimShapeToString(outputData) << " (shape)" << std::endl;
+               }
+               fIsOutputParamShape = true;
+            }
+            // general case: the output is an intermediate tensor of the type of the first input
+            if (!fIsOutputConstant && !fIsOutputParamShape) {
+               fInputType = model.GetTensorType(fInputs[0]);
+               model.AddIntermediateTensor(fOutput, fInputType, fOutputShape);
+               if (model.Verbose()) {
+                  std::cout << "Concat ---> " << fOutput << " " <<  ConvertDimShapeToString(fOutputShape) << std::endl;
+               }
+            }
+         }
+
+         std::string Generate(std::string opName) override { // returns the C++ source text emitted for this Concat
+            opName = "op_" + opName;
+            std::stringstream out;
+            out<<"\n//--------- Concat " << opName << " --> " << fOutput << "  " << ConvertDimShapeToString(fOutputShape) << "\n";
+
+            if (fIsOutputConstant) return out.str(); // constant output: only the header comment is emitted
+
+            if (fIsOutputParamShape) {
+               // output is a shape tensor: emit one assignment per element, taking the values stored by Initialize()
+               out << "// output is a shape tensor defined by the concatenation of the input shapes\n";
+               for (int i = 0; i < static_cast<int>(fOutputShape
+                  [0].dim); i++) {
+                  out << SP << "tensor_" << fOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n";
+               }
+               return out.str();
+            }
+            // special case when memory is contiguous: fAxis == 0, or every dimension of the first input before fAxis equals 1
+            bool hasShapeOnes = true;
+            for(int i = 0; i<fAxis; ++i){
+               if(fInputShapes[0][i].dim !=1){
+                  hasShapeOnes = false;
+                  break;
+               }
+            }
+            if (fAxis == 0 || hasShapeOnes) {
+               std::string offset; // textual sum of the lengths of the inputs already copied
+               for(size_t i=0; i<fInputs.size(); ++i) {
+                  auto length = ConvertDimShapeToLength(fInputShapes[i]);
+                  out << SP << "SOFIE::Copy(tensor_" << fOutput;
+                  if (i > 0)
+                     out << offset;
+                  offset += " + " + length;
+                  out << ", " << "tensor_" << fInputs[i] << ", " + length << ");\n";
+               }
+            }
+            else {
+               // general case: loops over the dimensions before fAxis, then one copy loop per input
+               std::vector<Dim> outStride = UTILITY::ComputeStrideFromShape(fOutputShape);
+               std::vector<std::vector<Dim>> inStrides(fInputs.size());
+               int idx = 0;
+               for ( auto &s : inStrides) {
+                  s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]);
+                  idx++;
+               }
+               for (int i = 0; i < fAxis; ++i) {
+                  // open one generated loop (index i<n>) per dimension before fAxis
+                  out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n";
+               }
+
+               out << SP << SP << SP << "int idxOut = "; // flat output offset of the current (i0, i1, ...) position
+               for (int k = 0; k < fAxis; k++) {
+                  if (k > 0) out << " + ";
+                  out << outStride[k].GetVal() << "*i" << k;
+               }
+               out << ";\n";
+
+               for (size_t j = 0; j < fInputs.size(); j++) {
+                  if (j>0) // advance the output offset past the elements copied from the previous input
+                  out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n";
+                  out << SP << SP << SP << "int idxIn" << j <<" = ";
+                  for (int k = 0; k < fAxis; k++) {
+                     if (k > 0) out << " + ";
+                     out << inStrides[j][k].GetVal() << "*i" << k;
+                  }
+                  out << ";\n";
+                  out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n";
+                  out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n";
+                  out << SP << SP << SP << "}\n";
+               // the generated loop above copies inStrides[j][fAxis-1] consecutive elements of input j
+               }
+                for (int i = 0; i < fAxis; ++i) { // close the generated loops opened above
+                    out << SP << "}\n";
+                }
+            }
+
+            return out.str();
+         }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { // returns the source of struct ConcatKernel_op_<name>
+      if (fIsOutputConstant || fIsOutputParamShape) return ""; // output already resolved by Initialize(): no kernel
+      opName = "op_" + opName;
+      if (fOutputShape.empty())
+         throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first");
+      // D: rank of the output; Nin: number of concatenated inputs
+      const std::size_t D   = fOutputShape.size();
+      const std::size_t Nin = fInputs.size();
+
+      auto outStrides = UTILITY::ComputeStrideFromShape(fOutputShape);
+      // prefix[k]: sum of the sizes along fAxis of the inputs that precede input k
+      std::vector<std::size_t> prefix(Nin);
+      prefix[0] = 0;
+      for (std::size_t k = 1; k < Nin; ++k) // NOTE(review): std::stoul throws on non-numeric text - presumably parametric sizes are unsupported here, confirm
+         prefix[k] = prefix[k - 1] + std::stoul(fInputShapes[k - 1][fAxis].GetVal());
+
+      std::vector<std::vector<Dim>> inStrides(Nin);
+      for (std::size_t k = 0; k < Nin; ++k)
+         inStrides[k] = UTILITY::ComputeStrideFromShape(fInputShapes[k]);
+      // kernel signature: accelerator, array of Nin input pointers, output pointer, element count
+      std::string op;
+      op  = "\n//------ CONCAT_KERNEL_ALPAKA\n";
+      op += SP + "struct ConcatKernel_" + opName + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "std::array<T const*, " + std::to_string(Nin) + "> inputs,\n";
+      op += SP + SP + SP + "T* output,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+      // each thread starts at its global index and advances by the total number of threads in the grid
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "std::size_t remaining;\n";
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+      // split the flat index into one coordinate out_<d> per output dimension, using the output strides
+      op += SP + SP + SP + SP + "remaining = elem_idx;\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         std::string stride_val = outStrides[d].GetVal();
+         op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+               + " = remaining / " + stride_val + "u;\n";
+         op += SP + SP + SP + SP + "remaining -= out_" + std::to_string(d)
+               + " * " + stride_val + "u;\n";
+      }
+      op += "\n";
+      // chosen: number of inputs whose end offset along fAxis is <= the coordinate, i.e. the index of the source input
+      op += SP + SP + SP + SP + "std::size_t chosen = 0;\n";
+      for (std::size_t k = 0; k < Nin; ++k) {
+         std::size_t end_k = prefix[k] + std::stoul(fInputShapes[k][fAxis].GetVal());
+         op += SP + SP + SP + SP + "chosen += static_cast<std::size_t>("
+               + std::to_string(end_k) + "u <= out_" + std::to_string(fAxis) + ");\n";
+      }
+      op += "\n";
+      // output_idx: the flat output index rebuilt from the coordinates and the output strides
+      op += SP + SP + SP + SP + "std::size_t const output_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + SP + "out_" + std::to_string(d)
+               + " * " + outStrides[d].GetVal() + "u";
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+      // input_idx: one term per input, each multiplied by (chosen == k) so only the selected input contributes
+      op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+      for (std::size_t k = 0; k < Nin; ++k) {
+         op += SP + SP + SP + SP + SP + "(chosen == " + std::to_string(k) + "u) * (\n";
+         for (std::size_t d = 0; d < D; ++d) {
+               std::string coord = (d == static_cast<std::size_t>(fAxis)) // along fAxis the coordinate is shifted by prefix[k]
+                  ? ("(out_" + std::to_string(d) + " - " + std::to_string(prefix[k]) + "u)")
+                  : ("out_" + std::to_string(d));
+               op += SP + SP + SP + SP + SP + SP + coord
+                  + " * " + inStrides[k][d].GetVal() + "u";
+               op += (d + 1 < D) ? " +\n" : "\n";
+         }
+         op += SP + SP + SP + SP + SP + ")";
+         op += (k + 1 < Nin) ? " +\n" : ";\n\n";
+      }
+      // the copy itself, then close the loop, operator() and the struct
+      op += SP + SP + SP + SP + "output[output_idx] = inputs[chosen][input_idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      // Declares the kernel functor object, unless the output is a constant or a shape tensor (no kernel then).
+      const bool needsKernel = !(fIsOutputConstant || fIsOutputParamShape);
+      return needsKernel ? SP + "ConcatKernel_op_" + opName + " concatKernel_op_" + opName + ";\n" : std::string();
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      // Emits the host code that collects the input device pointers and enqueues the concat kernel task.
+      if (fIsOutputConstant || fIsOutputParamShape) return "";
+      if (fOutputShape.empty())
+         throw std::runtime_error("SOFIE Operator Concat called to Generate without being initialized first");
+      const std::string tag = "op_" + OpName;   // suffix shared by every emitted identifier
+      const auto nElements = ConvertDimShapeToLength(fOutputShape);
+      std::string elemType;                     // element type spelled in the generated pointer array
+      if (fInputType == ETensorType::FLOAT)
+         elemType = "float";
+      else if (fInputType == ETensorType::INT64)
+         elemType = "int64_t";
+      else
+         throw std::runtime_error("Data type for Concat operator is not yet supported.");
+      std::stringstream out;
+      out << "\n//------ CONCAT_GPU_ALPAKA\n";
+      out << SP << "std::array<const " << elemType << " *, " << fInputs.size() << "> input_ptrs_" << tag << " = {";
+      const char * separator = "";
+      for (const auto & name : fInputs) {
+         out << separator << "alpaka::getPtrNative(deviceBuf_" << name << ")";
+         separator = ", ";
+      }
+      out << "};\n";
+      out << SP << "auto const elementsPerThread_" << tag << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << tag << " = Vec::all(Idx{" << nElements << "});\n";
+      out << SP << "auto const workDiv_" << tag << " = sofie_workdiv(elementsPerGrid_" << tag << ");\n";
+      out << SP << "auto task_" << tag << " = alpaka::createTaskKernel<Acc>(workDiv_" << tag << ", concatKernel_" << tag
+          << ", input_ptrs_" << tag << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast<Idx>(" << nElements << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << tag << ");\n";
+      return out.str();
+   }
+
+ };
+ }//SOFIE
+
+
+ #endif //SOFIE_ROPERATOR_Concat
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/core/inc/SOFIE/ROperator_Constant.hxx
similarity index 64%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx
rename to core/inc/SOFIE/ROperator_Constant.hxx
index 0d08432..4fea387 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx
+++ b/core/inc/SOFIE/ROperator_Constant.hxx
@@ -18,6 +18,7 @@ private:
    std::string fNX;
    std::string fNY;
    std::vector<size_t> fShape;
+   std::vector<Dim> fDimShape;  // used for dynamic ConstantOfShape
    std::vector<T> fValues;
    std::string fAttrType;
    bool fIsConstantOfShape = false;
@@ -52,15 +53,35 @@ public:
          // case of ConstantOfShape (since no inputs in case of Constant operator)
          fIsConstantOfShape  = true;
          if (model.CheckIfTensorAlreadyExist(fNX) == false){
-           throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor is not found in model");
+           throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor is not found in model");
+         }
+         if (model.IsShapeTensor(fNX)) {
+            // Input is a shape tensor (symbolic dimensions) — output will be a dynamic tensor
+            // whose shape is determined at runtime from the symbolic values.
+            const auto & dimVals = model.GetShapeTensorValues(fNX);
+            std::vector<Dim> outShape;
+            for (const auto & d : dimVals)
+               outShape.push_back(d);
+            if (fValues.size() != 1)
+               throw std::runtime_error("SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size()));
+            // Register as a dynamic intermediate tensor — values will be filled at runtime
+            model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), outShape);
+            // Store shape for code generation (use fShape for rank, values = 0 for symbolic dims)
+            fShape.resize(outShape.size());
+            for (size_t i = 0; i < outShape.size(); i++)
+               fShape[i] = outShape[i].isParam ? 0 : outShape[i].dim;
+            // Store symbolic lengths/shape for Generate()
+            fDimShape = outShape;
+            fIsOutputConstant = false;  // cannot be constant since shape is dynamic
+            return;
          }
          // get output shape from input values:
-         // can work only if input is a constant or initialized tensor (or dynamic one)
+         // can work only if input is a constant or initialized tensor
          auto dptr = model.GetInitializedTensorData(fNX);
          auto input_tensor = static_cast<int64_t *>(dptr.get());
          auto input_shape = model.GetTensorShape(fNX);
          if (input_shape.size() > 1 )
-            throw std::runtime_error("TMVA SOFIE ConstantOfShape Op Input Tensor has invalid shape");
+            throw std::runtime_error("SOFIE ConstantOfShape Op Input Tensor has invalid shape");
          if (input_tensor != nullptr && !input_shape.empty()) {
             fShape = std::vector<size_t> (input_shape[0]);
             for (size_t i = 0; i < fShape.size(); i++)
@@ -70,7 +91,7 @@ public:
 
          length = ConvertShapeToLength(fShape);
          if (fValues.size() != 1)
-            throw std::runtime_error("TMVA SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size()));
+            throw std::runtime_error("SOFIE ConstantOfShape Op value Tensor has invalid size " + std::to_string(fValues.size()));
 
          T value = fValues[0];
          fValues = std::vector<T>(length, value);
@@ -80,7 +101,7 @@ public:
          // in case of standard constant the shape is provided as input
          length = ConvertShapeToLength(fShape);
          if (length != fValues.size())
-            throw std::runtime_error("TMVA SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) +
+            throw std::runtime_error("SOFIE Constant Op has invalid shape : " + ConvertShapeToString(fShape) +
                                  " with " + std::to_string(fValues.size()) + " values");
       }
 
@@ -101,6 +122,11 @@ public:
       // no code to generate here. Tensor are defined in Session constructor
       return "//---------------------------------------\n";
    }
+
+   std::string Generate_GPU_ALPAKA(std::string /* OpName */) override {
+      // No code to generate here: the tensors are defined in the Session constructor, so only a separator comment is returned.
+      return "//---------------------------------------\n"; // NOTE(review): the shape-tensor ConstantOfShape path says values are "filled at runtime" - confirm where
+   }
 };
 
 }//SOFIE
diff --git a/core/inc/SOFIE/ROperator_Conv.hxx b/core/inc/SOFIE/ROperator_Conv.hxx
new file mode 100644
index 0000000..835a0ff
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Conv.hxx
@@ -0,0 +1,999 @@
+#ifndef SOFIE_SOFIE_ROPERATOR_CONV
+#define SOFIE_SOFIE_ROPERATOR_CONV
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <memory>
+#include <sstream>
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include <cassert>
+
+
+namespace SOFIE {
+
+template<typename T>
+class ROperator_Conv final : public ROperator
+{
+private:
+   // true when the 1D bias has to be broadcast to the output shape (decided in Initialize)
+   bool fBroadcastBias = false;
+
+   // ---- operator attributes (normalised to 3 spatial axes by DoShapeInference)
+   std::string fAttrAutopad;
+   std::vector<size_t> fAttrDilations;
+   // 0 means "not set": DoShapeInference then derives the group from the input/weight channels.
+   // Initialised here so that a default-constructed operator never holds an indeterminate value.
+   size_t fAttrGroup = 0;
+   std::vector<size_t> fAttrKernelShape;
+   std::vector<size_t> fAttrPads;
+   std::vector<size_t> fAttrStrides;
+
+   // ---- tensor names: data (X), weights (W), optional bias (B), output (Y)
+   std::string fNX;
+   std::string fNW;
+   std::string fNB;
+   std::string fNY;
+
+   // names of the work tensors registered in Initialize: <X>_f and <X>_xcol
+   std::string convK;
+   std::string imcol;
+
+   // ---- tensor shapes (X and Y may contain parametric dimensions)
+   std::vector<Dim> fShapeX;
+   std::vector<size_t> fShapeW;
+   std::vector<size_t> fShapeB;
+   std::vector<Dim> fShapeY;
+
+   std::string fType;
+
+   // dimension of the convolution (rank of X minus 2, set in Initialize);
+   // initialised to 0 so it is never read as an indeterminate value
+   size_t fDim = 0;
+
+
+public:
+
+   // Default constructor: leaves all names, shapes and attributes empty.
+   ROperator_Conv() {}
+
+   // Constructor for a convolution that has a bias input.
+   // Tensor names are sanitised with UTILITY::Clean_name before being stored.
+   // Throws std::runtime_error when T is not float.
+   ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
+      size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
+      std::vector<size_t> strides, std::string nameX, std::string nameW,
+      std::string nameB, std::string nameY):
+      fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
+      fAttrPads(pads), fAttrStrides(strides),
+      fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)),
+      fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY))
+   {
+      // only the float specialisation is supported
+      if (!std::is_same<T, float>::value)
+         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
+      fType = "float";
+
+      fKind = OperatorKind::CONV;
+      fInputTensorNames = { fNX, fNB };
+      fOutputTensorNames = { fNY };
+   }
+
+   // Constructor for a convolution without bias: the bias name stays empty.
+   // Tensor names are sanitised with UTILITY::Clean_name before being stored.
+   // Throws std::runtime_error when T is not float.
+   ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
+      size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
+      std::vector<size_t> strides, std::string nameX, std::string nameW,
+      std::string nameY):
+      fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
+      fAttrPads(pads), fAttrStrides(strides),
+      fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY))
+   {
+      // only the float specialisation is supported
+      if (!std::is_same<T, float>::value)
+         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
+      fType = "float";
+
+      fKind = OperatorKind::CONV;
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   // The output has the same element type as the first entry of the input type list.
+   // The list is expected to be non-empty (no check is performed).
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return std::vector<ETensorType>{ input.front() };
+   }
+
+   // Compute the output shape of the convolution and normalise the operator attributes.
+   //
+   // input  : shape of the data tensor, N x C x D1 [x D2 [x D3]] (may contain parametric dims)
+   // weight : shape of the weight tensor, M x C/group x k1 [x k2 [x k3]]
+   // returns: N x M x O1 [x O2 [x O3]]
+   //
+   // Requires fDim to be already set (done in Initialize).
+   // Side effects - the attribute members are rewritten so that later code can index 3 axes:
+   //   - fAttrGroup       : derived from the channel counts when it was 0
+   //   - fAttrDilations   : 3 entries, unused axes set to 1
+   //   - fAttrKernelShape : 3 entries holding the *dilated* kernel extent k + (d-1)*(k-1);
+   //                        calling this function twice would therefore dilate twice
+   //   - fAttrStrides     : 3 entries (unused axes set to 1) when fDim < 3
+   //   - fAttrPads        : 6 entries, layout {begin_1..begin_fDim, end_1..end_fDim, 0...}
+   //
+   // Throws std::runtime_error on rank mismatch, unsupported parametric shapes,
+   // unknown auto_pad value, or a kernel that does not fit in the padded input.
+   std::vector<Dim> DoShapeInference(const std::vector<Dim> & input, const std::vector<size_t> & weight) {
+      // shape of convolution input has to be (according to ONNX): N x C x H x W
+      // Where N : batch size, C : input  channels, H : input height, W : input width
+
+      if (input.size() -2 != fDim) {
+         throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid input ");
+      }
+      if (weight.size() -2 != fDim) {
+         throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid weights ");
+      }
+      if (fAttrGroup == 0 && input[1].isParam)
+         throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without group attr");
+      if (fAttrKernelShape.empty()) {
+         if (input[2].isParam || (fDim > 1 && input[3].isParam) || (fDim > 2 && input[4].isParam))
+            throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without kernel attr");
+      }
+
+      // group not provided: derive it from input channels / channels per group in the weights
+      if (fAttrGroup == 0) {
+         fAttrGroup = input[1].dim / weight[1];
+      }
+
+      // kernel shape: taken from the attribute when present, otherwise from the weight tensor
+      size_t k1 = ((fAttrKernelShape.empty())? weight[2] : fAttrKernelShape[0]);
+      size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? weight[3] : fAttrKernelShape[1]) : 1;
+      size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? weight[4] : fAttrKernelShape[2]) : 1;
+
+      // position in fAttrPads of the "end" padding of the 1st, 2nd and 3rd spatial axis
+      // (pads are stored as all begin values followed by all end values)
+      size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1;
+      size_t i2 = (fDim > 2) ? 4 : 3;
+      size_t i3 = 5;
+
+      if (fAttrDilations.empty()) {
+         fAttrDilations = {1, 1, 1};
+      }
+      // always keep 3 dilation values; axes that are not used get the neutral value 1
+      // (a plain resize(3) would fill them with 0)
+      fAttrDilations.resize(3, 1);
+
+      // Shape of the (dilated) kernel
+      fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1),
+                          k2 + (fAttrDilations[1] - 1) * (k2 - 1),
+                          k3 + (fAttrDilations[2] - 1) * (k3 - 1)};
+
+      if (fAttrStrides.empty()) {
+         fAttrStrides = {1, 1, 1};
+      }
+      if (fDim < 3)
+         fAttrStrides.resize(3, 1);
+
+      if (fAttrAutopad == "NOTSET") {
+         if (fAttrPads.empty()) {
+            // ONNX: when pads is not present the padding defaults to 0 on every axis
+            fAttrPads.assign(6, 0);
+         }
+      } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") {
+         for (size_t d = 0; d < fDim; ++d) {
+            if (input[d + 2].isParam)
+               throw std::runtime_error(
+                  "TMVA SOFIE Conv Op: SAME padding with parametric input shape is not supported");
+         }
+         // ONNX SAME padding: total_pad = max(0, (ceil(in/stride)-1)*stride + kernel - in)
+         // SAME_UPPER places extra padding at end, SAME_LOWER at beginning
+         fAttrPads.assign(6, 0);
+         for (size_t d = 0; d < fDim; ++d) {
+            size_t inSize = input[d + 2].dim;
+            size_t stride_d = fAttrStrides[d];
+            size_t outSize = (inSize + stride_d - 1) / stride_d;
+            int totalPad = std::max(0, (int)((outSize - 1) * stride_d + fAttrKernelShape[d]) - (int)inSize);
+            if (fAttrAutopad == "SAME_UPPER") {
+               fAttrPads[d] = (size_t)(totalPad / 2);
+               fAttrPads[d + fDim] = (size_t)(totalPad - totalPad / 2);
+            } else {
+               fAttrPads[d] = (size_t)(totalPad - totalPad / 2);
+               fAttrPads[d + fDim] = (size_t)(totalPad / 2);
+            }
+         }
+      } else if (fAttrAutopad != "VALID") {
+         throw
+            std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad");
+      }
+      // make sure pads is a vector of size 6 for every dimension, so that the accesses below
+      // (and in the code generation) are always in range, e.g. for VALID with no pads given
+      fAttrPads.resize(6, 0);
+
+      Dim input1 = input[2];
+      Dim input2 = (fDim > 1) ? input[3] : Dim{1};
+      Dim input3 = (fDim > 2) ? input[4] : Dim{1};
+
+      size_t pad1 = fAttrPads[0] + fAttrPads[i1];
+
+      // function to get output dimension of convolution given input:
+      //   out = (in + pad - kernel) / stride + 1
+      // where pad is the total (begin + end) padding and kernel the dilated kernel extent.
+      // For a parametric input the result is returned as an expression string.
+      auto computeOutput = [&](Dim inputDim, size_t kernel, size_t pad, size_t stride) {
+         if (!inputDim.isParam) {
+            // guard against unsigned underflow of (in + pad - kernel)
+            if (inputDim.dim + pad < kernel)
+               throw std::runtime_error("TMVA SOFIE Conv Op - kernel is larger than the padded input");
+            size_t outSize = (inputDim.dim + pad - kernel) / stride + 1;
+            return Dim{outSize};
+         }
+         // signed difference: pad - kernel is usually negative
+         int64_t padMinusKernel = static_cast<int64_t>(pad) - static_cast<int64_t>(kernel);
+         if (stride == 1) {
+            if (padMinusKernel + 1 == 0)
+               // output is same as input
+               return inputDim;
+            std::string outStr = "(" + inputDim.param + "+" + std::to_string(padMinusKernel + 1) + ")";
+            return Dim{ outStr, static_cast<size_t>(-1)};
+         }
+         // general case (stride not 1): ((in + pad - kernel)/stride+1)
+         std::string outStr = "((" + inputDim.param + "+" + std::to_string(padMinusKernel) + ")/"
+                           + std::to_string(stride) + "+1)";
+         return Dim{ outStr, static_cast<size_t>(-1)};
+      };
+
+      Dim output1 = computeOutput(input1, fAttrKernelShape[0], pad1, fAttrStrides[0]);
+
+      Dim batch_size = input[0];        // first element in input tensor
+      Dim output_channels = Dim{weight[0]};   // first element in weight tensor
+
+      std::vector<Dim> ret({ batch_size, output_channels, output1 });
+
+      if (fDim == 1)
+         return ret;
+
+      size_t pad2 = fAttrPads[1] + fAttrPads[i2];
+      Dim output2 = computeOutput(input2, fAttrKernelShape[1], pad2, fAttrStrides[1]);
+
+      // output is N x M x O1 x O2
+      ret.push_back(output2);
+      if (fDim == 2)
+         return ret;
+
+      size_t pad3 = fAttrPads[2] + fAttrPads[i3];
+      Dim output3 = computeOutput(input3, fAttrKernelShape[2], pad3, fAttrStrides[2]);
+
+      // output is N x M x O1 x O2 x O3
+      ret.push_back(output3);
+      return ret;
+   }
+
+   // Validate the input tensors against the model, infer the output shape and register
+   // the output tensor plus the two work tensors (<X>_f and <X>_xcol) in the model.
+   //
+   // When a bias is present and its shape differs from the output shape without the batch
+   // dimension, broadcasting is flagged (fBroadcastBias); if the model does not use a
+   // Session the bias data are broadcast right here and the initialized tensor is updated.
+   //
+   // Throws std::runtime_error when a tensor is missing or has an unsupported shape.
+   void Initialize(RModel& model) override {
+      fUseSession = model.UseSession();
+
+      // ---- data tensor X
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model");
+      fShapeX = model.GetDimTensorShape(fNX);
+      const auto rankX = fShapeX.size();
+      if (rankX < 3 || rankX > 5) {
+         std::cout << fNX << " : " << ConvertDimShapeToString(fShapeX) << std::endl;
+         throw std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
+      }
+      fDim = rankX - 2;
+
+      // ---- weight tensor W
+      if (!model.CheckIfTensorAlreadyExist(fNW))
+         throw std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
+      fShapeW = model.GetTensorShape(fNW);
+      const auto rankW = fShapeW.size();
+      if (rankW < 3 || rankW > 5) {
+         std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
+         throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions");
+      }
+
+      // ---- output tensor Y
+      fShapeY = DoShapeInference(fShapeX, fShapeW);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+
+      // ---- optional bias tensor B
+      if (!fNB.empty()) {
+         if (!model.CheckIfTensorAlreadyExist(fNB))
+            throw std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model");
+         fShapeB = model.GetTensorShape(fNB);
+         if (fShapeB.size() != 1)
+            throw std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)");
+
+         // shape the bias must have to be added to one batch entry of the output
+         std::vector<Dim> biasTarget(fShapeY.begin() + 1, fShapeY.end());
+         auto biasDims = model.GetDimTensorShape(fNB);
+         if (!UTILITY::AreSameShape(biasDims, biasTarget)) {
+            auto biasData = model.GetInitializedTensorData(fNB);
+            if (fShapeB.empty())
+               throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape");
+            // the bias length must match the number of filters, i.e. the second output dimension
+            if (!(biasDims[0] == fShapeY[1]))
+               throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " +
+                                           ConvertShapeToString(fShapeB));
+            if (fType != "float")
+               throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported");
+
+            fBroadcastBias = true;
+            if (!fUseSession) {
+               // no Session: broadcast now, view the bias as (C, 1, ..., 1)
+               std::vector<size_t> biasView(fDim + 1, 1);
+               biasView[0] = fShapeB[0];
+               auto biasTargetInt = ConvertShapeToInt(biasTarget);
+               std::shared_ptr<void> broadcastData(
+                  UTILITY::UnidirectionalBroadcast(static_cast<float *>(biasData.get()), biasView, biasTargetInt),
+                  std::default_delete<float[]>());
+               model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), biasTargetInt, broadcastData);
+               fShapeB = model.GetTensorShape(fNB);
+            }
+         }
+      }
+
+      // ---- size of one output channel (product of the spatial output dims);
+      // it is an expression when the output shape is parametric
+      std::vector<Dim> spatialDims(fShapeY.begin()+2, fShapeY.end());
+      std::vector<size_t> spatialInts = ConvertShapeToInt(spatialDims);
+      Dim channelDim;
+      if (!spatialInts.empty()) {
+         size_t channelLength = ConvertShapeToLength(spatialInts);
+         channelDim = Dim{ channelLength };
+      } else {
+         auto channelExpr = ConvertDimShapeToLength(spatialDims); // size/channel = D * H * W
+         channelDim = Dim{ channelExpr, static_cast<size_t>(-1)};
+      }
+
+      // ---- (dilated) kernel size over the used spatial axes
+      size_t kernelSize = 1;
+      for (size_t axis = 0; axis < fDim; ++axis)
+         kernelSize *= fAttrKernelShape[axis];
+
+      // ---- work tensors: vectorised kernels (<X>_f) and unrolled input (<X>_xcol)
+      std::vector<size_t> kernelMatShape = {fShapeW[0], fShapeW[1], kernelSize};
+      std::vector<Dim> colMatShape = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim };
+      model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), kernelMatShape );
+      model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), colMatShape );
+      convK = fNX +"_f";
+      imcol = fNX +"_xcol";
+      // the work tensors are listed both as outputs and as inputs of this operator
+      fOutputTensorNames.emplace_back(convK);
+      fOutputTensorNames.emplace_back(imcol);
+      fInputTensorNames.emplace_back(convK);
+      fInputTensorNames.emplace_back(imcol);
+
+      if (model.Verbose()) {
+         std::cout << "Conv - " << fDim << "  " << fNX << " : " << ConvertDimShapeToString(fShapeX)
+                  << " --> " << fNY << " : " << ConvertDimShapeToString(fShapeY) << std::endl;
+      }
+   }
+
+   // Generate the initialization code that broadcasts the 1D bias to the shape of one
+   // batch entry of the output (fShapeY without its first dimension).
+   //
+   // Returns an empty string when no broadcasting was flagged in Initialize.
+   // The emitted block replaces the content of fTensor_<B> with the broadcast data and
+   // re-points tensor_<B> to it. For a parametric output shape the block is guarded by a
+   // run-time check on the target length.
+   std::string GenerateInitCode() override {
+      std::stringstream out;
+      if (!fBroadcastBias)
+         return out.str();
+
+      // bias (a 1D tensor) viewed as (C, 1, ..., 1)
+      std::vector<size_t> shape(fDim + 1, 1);
+      shape[0] = fShapeB[0];
+      // target of the broadcast: the output shape without the batch dimension.
+      // This is the same target used by Initialize when broadcasting without a Session, and
+      // it matches the number of elements ("length") copied below; broadcasting to the full
+      // fShapeY would allocate batch-size times more data than is ever copied.
+      std::vector<Dim> targetShape(fShapeY.begin() + 1, fShapeY.end());
+      out << "//--- broadcast bias tensor " << fNB << " for Conv op if needed \n";
+      // in case of dynamic tensors check needs to be done at run time
+      bool isOutDynamic = ConvertShapeToInt(targetShape).empty();
+      auto length = ConvertDimShapeToLength(targetShape);
+      // a separate scope is emitted to avoid defining unique operator temp variables
+      if (isOutDynamic)
+         out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n";
+      else
+         out << SP << "{\n";
+      out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"
+          << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertDimShapeToString(targetShape) << ");\n";
+      out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n";
+      out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n";
+      out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n";
+      out << SP << SP << "delete[] data;\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   // Emit the C++ inference code for this Conv node (CPU path).
+   //
+   // The generated code
+   //   1. copies the weights tensor_<W> into tensor_<X>_f using the dilated kernel strides,
+   //   2. loops over the batch entries (and over the groups when fAttrGroup != 1) and, for each,
+   //      calls SOFIE::UTILITY::Im2col / Im2col_3d to fill tensor_<X>_xcol and then
+   //      SOFIE::Gemm_Call to write into tensor_<Y>.
+   //
+   // OpName : operator identifier; "op_" is prepended and the result is used as prefix of the
+   //          emitted scalar variables (<OpName>_m, _n, _k, _alpha, _beta, ...).
+   // Returns the generated code as a string.
+   // Throws std::runtime_error when the shapes are empty, i.e. Initialize was not called.
+   //
+   // NOTE: this method is not free of side effects: asymmetric paddings in fAttrPads are
+   // replaced by their average, and for 1D fAttrPads[1] and fAttrStrides[1] are overwritten.
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+
+      if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
+         throw
+            std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
+      }
+
+      std::stringstream out;
+      auto bsize = fShapeX[0];
+      // kernel / input / output extents; axes not present for the given fDim are set to 1
+      size_t kDepth = (fDim > 2) ?  fShapeW[2] : 1;  // kernel depth
+      size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1;  // kernel height
+      size_t kWidth = fShapeW[fDim+1]; // kernel width
+      auto iDepth = (fDim > 2) ?  fShapeX[2] : Dim{1};  // input depth
+      auto iHeight = (fDim > 1) ? fShapeX[fDim] : Dim{1}; // input height
+      auto iWidth = fShapeX[fDim+1]; // input width
+      auto oDepth = (fDim > 2) ? fShapeY[2] : Dim{1}; // output depth
+      auto oHeight = (fDim > 1) ? fShapeY[fDim] : Dim{1};  // output height
+      auto oWidth = fShapeY[fDim+1]; // output width
+      // total output size for a channel
+      auto outputChannelStride = ConvertDimShapeToLength(std::vector<Dim>{oDepth, oHeight, oWidth}); // size of channel = D * H * W
+      auto outputBatchStride =  ConvertDimShapeToLength(std::vector<Dim>{fShapeY[1] , oDepth, oHeight, oWidth}); // size of C * D * H * W
+      // input size
+      auto inputChannelStride = ConvertDimShapeToLength(std::vector<Dim>{iDepth, iHeight, iWidth});
+      auto inputBatchStride =  ConvertDimShapeToLength(std::vector<Dim>{fShapeX[1] , iDepth, iHeight, iWidth}); // size of C * D * H * W
+
+      out << "\n//----  operator Conv " << OpName << "\n";
+
+      // vectorize the (dilated)convolution kernels into a matrix
+      // no need to transpose the matrix
+      // to fix for 1d and 3d
+
+      // positions of the depth, height and width axes inside the 3-entry attribute vectors
+      // (fAttrDilations, fAttrKernelShape) for the current fDim
+      size_t id = (fDim > 2) ? fDim-3 : 2;
+      size_t ih = (fDim > 1) ? fDim-2 : 1;
+      size_t iw = fDim-1;
+
+      // strides in the original weight tensor (no suffix) and in the dilated
+      // destination tensor_<X>_f ("Dil" suffix)
+      size_t wstrideDil = fAttrDilations[iw];
+      size_t hstride = kWidth;
+      size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw];  // stride dilated in the height
+      size_t dstride = kHeight * kWidth;
+      size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
+      size_t icstride = kHeight * kWidth * kDepth;
+      size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
+      size_t ocstride = fShapeW[1] * icstride;
+      size_t ocstrideDil = fShapeW[1] * icstrideDil;
+
+      // emit the nested loops (oc, ic, [kd], [kh], kw) copying W into tensor_<X>_f
+      out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n";
+      out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n";
+      if (fDim > 2)
+         out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
+      if (fDim > 1)
+         out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
+      out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";
+
+      out << SP << SP << SP << SP << SP << "tensor_" <<fNX <<  "_f[oc * "
+          << ocstrideDil << " + ic * " << icstrideDil;
+      if (fDim > 2) out << " + kd * " << dstrideDil;
+      if (fDim > 1) out << " + kh * " << hstrideDil;
+      out << " + kw * " << wstrideDil  << "  ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride;
+      if (fDim > 2) out << " + kd * " << dstride;
+      if (fDim > 1) out << " + kh * " << hstride;
+      out  << " + kw ];\n";
+
+      // close the loops opened above
+      out << SP << SP << SP << SP << "}\n";
+      if (fDim > 1) out << SP << SP << SP << "}\n";
+      if (fDim > 2) out << SP << SP << SP << "}\n";
+      out << SP << SP << "}\n";
+      out << SP << "}\n";
+
+      // emit the GEMM parameters: m = output channel size, n = output channels,
+      // k = input channels per group * dilated kernel size
+      //out << SP << "char " << OpName << "_transA = 'T';\n";
+      out << SP << "char " << OpName << "_transA = 'N';\n";
+      out << SP << "char " << OpName << "_transB = 'N';\n";
+      out << SP << "int " << OpName << "_m = " << outputChannelStride << ";\n"; // output h*w
+      assert(fShapeY[1] == fShapeW[0]);
+      //assert(fShapeW[1] == fShapeX[1] / fAttrGroup);
+      out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels
+      out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n";
+      out << SP << "float " << OpName << "_alpha = 1.0;\n";
+      if (fNB != "")
+         out << SP << "float " << OpName << "_beta = 1.0;\n";
+      else // when bias is not present beta needs to be equal to zero to avoid re-using previous results in output tensor
+         out << SP << "float " << OpName << "_beta = 0.0;\n";
+
+
+      // Loop on batch size
+      out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";
+
+      // IM2COL: Unroll the input tensor
+      // order input data as  (e.g. kernel 2x2)  and (xa,ya) is channel 1 and (xb,yb) is channel 2
+      //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
+      //   (xa2,...xak+1,ya1,...yak)(......)
+      // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows.
+      // By doing this one has consecutive memory reads and writes
+      // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w)
+
+      // Padding normalisation: the im2col calls below only receive one padding value per axis,
+      // so a begin/end pair that differs is replaced by its (integer) average after printing a
+      // warning. This modifies the fAttrPads member.
+      if (fDim ==1) {
+         if (fAttrPads[0] != fAttrPads[1] ) {
+            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
+                      << std::endl;
+            fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
+         }
+         // 1D: neutral values for the second axis
+         fAttrPads[1] = 0;
+         fAttrStrides[1] = 1;
+      }
+      if (fDim == 2) {
+         if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
+            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
+            fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
+            fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
+         }
+      }
+      if (fDim == 3) {
+         if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
+            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
+            fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
+            fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
+            fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
+         }
+      }
+      out << SP << SP << "size_t out_offset = n * " << outputBatchStride  << ";\n";
+
+      if (fAttrGroup == 1) {
+         // single group: one im2col + one GEMM per batch entry
+         out << SP << SP << "size_t x_offset = n * " << inputBatchStride << ";\n";
+         // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y,  output_h *
+         // output_w)
+         if (fDim < 3) {
+            out << SP << SP << "SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
+                << " + x_offset,"
+                //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
+                //  dilation_w,
+                //
+                << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
+            if (fDim == 1)
+               // 1D is emitted as a 2D call with height 1: the single axis goes in the "w" slots
+               out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
+                   << fAttrDilations[0];
+            else // dim ==2
+               out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
+                   << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
+                   << fAttrDilations[1];
+            out << "," << "tensor_" <<fNX << "_xcol);\n\n ";
+         } else {
+            // 3d im2col
+            out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
+                << " + x_offset,"
+                //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
+                //  dilation_d, dilation_h, dilation_w,
+                //
+                << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << ","
+                << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << ","
+                << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << ","
+                << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
+                << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ","
+                << "tensor_" << fNX << "_xcol);\n\n ";
+         }
+         // BLAS
+         // the last argument is the bias tensor, or nullptr when the operator has no bias
+         out << SP << "SOFIE::Gemm_Call("
+             << "tensor_" << fNY << " + out_offset, false, false, "
+             << OpName << "_m, " << OpName << "_n, " << OpName << "_k, "
+             << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f, "
+             << OpName << "_beta, ";
+         if (fNB != "")
+            out << "tensor_" << fNB;
+         else
+            out << "nullptr";
+         out << ");\n";
+
+
+         // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
+         //     << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName
+         //     << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
+         // out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
+         //     << " + out_offset, &" << OpName << "_m);\n";
+      } else {
+         // case of group convolution
+         // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each
+         // group)
+         // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
+         out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
+         // per-group offsets in the input (x_offset) and in the output / bias (g_offset);
+         // the out_offset emitted here is declared inside the group loop scope
+         out << SP << SP << "size_t x_offset = n * " << inputBatchStride << " + g * "
+             << fShapeW[1] << " * " << inputChannelStride << ";\n ";
+         out << SP << SP << "size_t g_offset = g * " << fShapeW[0] << " * (" << outputChannelStride << ") / " << fAttrGroup << ";\n ";
+         out << SP << SP << "size_t out_offset = n * " << outputBatchStride << " + g_offset;\n";
+
+         if (fDim < 3) {
+            out << SP << SP << "SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
+                << " + x_offset,"
+                //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
+                //  dilation_w,
+                //
+                << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
+            if (fDim == 1)
+               out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
+                   << fAttrDilations[0];
+            else // dim ==2
+               out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
+                   << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
+                   << fAttrDilations[1];
+            out << ", tensor_" << fNX << "_xcol);\n\n ";
+         } else {
+            // 3d im2col
+            out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
+                << " + x_offset,"
+                //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
+                //  dilation_d, dilation_h, dilation_w,
+                //
+                << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << ","
+                << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1]
+                << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2]
+                << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX
+                << "_xcol);\n\n ";
+         }
+
+         // BLAS
+         // n must be divided by the number of groups
+         out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n";
+         // offset g must be  g * k * n
+         out << SP << SP << SP << "size_t offset_f = g * "
+             << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup
+             << ";\n";
+
+         // GEMM for the current group: kernels start at offset_f, bias (if any) at g_offset
+         out << SP << "SOFIE::Gemm_Call("
+             << "tensor_" << fNY << " + out_offset, false, false, "
+             << OpName << "_m, " << OpName << "_n, " << OpName << "_k, "
+             << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, tensor_" << fNX << "_f + offset_f, "
+             << OpName << "_beta, ";
+         if (fNB != "")
+            out << "tensor_" << fNB << " + g_offset";
+         else
+            out << "nullptr";
+         out << ");\n";
+         out << SP << SP << "}\n"; // end of group loop
+      }
+      out << SP << "}\n"; // end of batch size loop
+
+      return out.str();
+      }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty())
+         throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
+
+      size_t oDepth  = (fDim > 2) ? fShapeY[2].dim      : 1;
+      size_t oHeight = (fDim > 1) ? fShapeY[fDim].dim   : 1;
+      size_t oWidth  = fShapeY[fDim + 1].dim;
+      size_t iDepth  = (fDim > 2) ? fShapeX[2].dim      : 1;
+      size_t iHeight = (fDim > 1) ? fShapeX[fDim].dim   : 1;
+      size_t iWidth  = fShapeX[fDim + 1].dim;
+      size_t kHeight = (fDim > 1) ? fShapeW[fDim]   : 1;
+      size_t kWidth  = fShapeW[fDim + 1];
+      size_t kDepth  = (fDim > 2) ? fShapeW[2]      : 1;
+
+      size_t kernelSize  = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2];
+      size_t colRows     = fShapeW[1] * kernelSize;
+      size_t colCols     = oDepth * oHeight * oWidth;
+      size_t colElements = colRows * colCols;
+      size_t outChannels = fShapeW[0];
+      size_t spatialSize = oDepth * oHeight * oWidth;
+
+      // Strides for weight vectorisation
+      size_t id = (fDim > 2) ? fDim - 3 : 2;
+      size_t ih = (fDim > 1) ? fDim - 2 : 1;
+      size_t iw = fDim - 1;
+      size_t wstrideDil  = fAttrDilations[iw];
+      size_t hstrideDil  = fAttrDilations[ih]  * fAttrKernelShape[iw];
+      size_t dstrideDil  = fAttrDilations[id]  * fAttrKernelShape[ih] * fAttrKernelShape[iw];
+      size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
+      size_t ocstrideDil = fShapeW[1] * icstrideDil;
+      size_t hstride     = kWidth;
+      size_t dstride     = kHeight * kWidth;
+      size_t icstride    = kHeight * kWidth * kDepth;
+      size_t ocstride    = fShapeW[1] * icstride;
+      size_t wTotalElements = ConvertShapeToLength(fShapeW);
+
+      std::string op;
+
+      // Kernel 1: Weight vectorisation — reorder W into _f with dilation layout
+      // Each thread handles one output element of _f
+      std::string wKname = "WeightVecKernel_" + opName;
+      op  = "\n//------ WEIGHT_VEC_KERNEL_ALPAKA (Conv " + opName + ")\n";
+      op += SP + "struct " + wKname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ W,\n";
+      op += SP + SP + SP + "T* __restrict__ f,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // Decompose elem_idx into (oc, ic, kd, kh, kw) using compile-time strides
+      op += SP + SP + SP + SP + "std::size_t const oc    = elem_idx / " + std::to_string(ocstride) + "u;\n";
+      op += SP + SP + SP + SP + "std::size_t const oc_rem = elem_idx % " + std::to_string(ocstride) + "u;\n";
+      op += SP + SP + SP + SP + "std::size_t const ic    = oc_rem / " + std::to_string(icstride) + "u;\n";
+      op += SP + SP + SP + SP + "std::size_t const ic_rem = oc_rem % " + std::to_string(icstride) + "u;\n";
+      if (fDim > 2) {
+         op += SP + SP + SP + SP + "std::size_t const kd = ic_rem / " + std::to_string(kHeight * kWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = (ic_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n";
+      } else if (fDim > 1) {
+         op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = ic_rem / " + std::to_string(kWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = ic_rem % " + std::to_string(kWidth) + "u;\n\n";
+      } else {
+         op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = ic_rem;\n\n";
+      }
+
+      // Compute destination index in _f (dilated layout)
+      op += SP + SP + SP + SP + "std::size_t const f_idx =\n";
+      op += SP + SP + SP + SP + SP + "oc * " + std::to_string(ocstrideDil) + "u +\n";
+      op += SP + SP + SP + SP + SP + "ic * " + std::to_string(icstrideDil) + "u";
+      if (fDim > 2) op += " +\n" + SP + SP + SP + SP + SP + "kd * " + std::to_string(dstrideDil) + "u";
+      if (fDim > 1) op += " +\n" + SP + SP + SP + SP + SP + "kh * " + std::to_string(hstrideDil) + "u";
+      op += " +\n" + SP + SP + SP + SP + SP + "kw * " + std::to_string(wstrideDil) + "u;\n\n";
+
+      op += SP + SP + SP + SP + "f[f_idx] = W[elem_idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n\n";
+
+      // Kernel 2: Im2Col
+      std::string im2colKname = "Im2ColKernel_" + opName;
+      op += SP + "//------ IM2COL_KERNEL_ALPAKA (Conv " + opName + ")\n";
+      op += SP + "struct " + im2colKname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ input,\n";
+      op += SP + SP + SP + "T* __restrict__ col,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      op += SP + SP + SP + SP + "std::size_t const col_row = elem_idx / " + std::to_string(colCols) + "u;\n";
+      op += SP + SP + SP + SP + "std::size_t const col_col = elem_idx % " + std::to_string(colCols) + "u;\n\n";
+
+      op += SP + SP + SP + SP + "std::size_t const ic    = col_row / " + std::to_string(kernelSize) + "u;\n";
+      op += SP + SP + SP + SP + "std::size_t const k_rem = col_row % " + std::to_string(kernelSize) + "u;\n";
+      if (fDim > 2) {
+         op += SP + SP + SP + SP + "std::size_t const kd = k_rem / " + std::to_string(kHeight * kWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = (k_rem / " + std::to_string(kWidth) + "u) % " + std::to_string(kHeight) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n";
+      } else if (fDim > 1) {
+         op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = k_rem / " + std::to_string(kWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = k_rem % " + std::to_string(kWidth) + "u;\n\n";
+      } else {
+         op += SP + SP + SP + SP + "std::size_t const kd = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kh = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const kw = k_rem;\n\n";
+      }
+
+      if (fDim > 2) {
+         op += SP + SP + SP + SP + "std::size_t const od = col_col / " + std::to_string(oHeight * oWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const oh = (col_col / " + std::to_string(oWidth) + "u) % " + std::to_string(oHeight) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n";
+      } else if (fDim > 1) {
+         op += SP + SP + SP + SP + "std::size_t const od = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const oh = col_col / " + std::to_string(oWidth) + "u;\n";
+         op += SP + SP + SP + SP + "std::size_t const ow = col_col % " + std::to_string(oWidth) + "u;\n\n";
+      } else {
+         op += SP + SP + SP + SP + "std::size_t const od = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const oh = 0u;\n";
+         op += SP + SP + SP + SP + "std::size_t const ow = col_col;\n\n";
+      }
+
+      // Depth: trivially 0 for fDim < 3 (od=kd=0 always); pads[0] is height-begin for 2D, so
+      // applying it here would make id_in negative and zero the whole output.
+      if (fDim >= 3) {
+         op += SP + SP + SP + SP + "int64_t const id_in = static_cast<int64_t>(od * " + std::to_string(fAttrStrides[0])
+            + "u + kd * " + std::to_string(fAttrDilations[0]) + "u) - " + std::to_string(fAttrPads[0]) + ";\n";
+      } else {
+         op += SP + SP + SP + SP + "int64_t const id_in = 0;\n";
+      }
+      // Height: for fDim==3 the height dim is at strides/pads index 1; for fDim==2 it is at index 0.
+      // For fDim==1 oh=kh=0 so ih_in=0.
+      {
+         size_t const hIdx = (fDim > 2) ? 1 : 0;
+         if (fDim >= 2) {
+            op += SP + SP + SP + SP + "int64_t const ih_in = static_cast<int64_t>(oh * " + std::to_string(fAttrStrides[hIdx])
+               + "u + kh * " + std::to_string(fAttrDilations[hIdx]) + "u) - " + std::to_string(fAttrPads[hIdx]) + ";\n";
+         } else {
+            op += SP + SP + SP + SP + "int64_t const ih_in = 0;\n";
+         }
+      }
+      // Width: fAttrStrides/Dilations/Pads are ordered [d,h,w] so width is at index fDim-1.
+      {
+         size_t const wIdx = fDim - 1;
+         op += SP + SP + SP + SP + "int64_t const iw_in = static_cast<int64_t>(ow * " + std::to_string(fAttrStrides[wIdx])
+            + "u + kw * " + std::to_string(fAttrDilations[wIdx]) + "u) - " + std::to_string(fAttrPads[wIdx]) + ";\n\n";
+      }
+
+      op += SP + SP + SP + SP + "bool const in_bounds =\n";
+      op += SP + SP + SP + SP + SP + "id_in >= 0 && id_in < " + std::to_string(iDepth)  + " &&\n";
+      op += SP + SP + SP + SP + SP + "ih_in >= 0 && ih_in < " + std::to_string(iHeight) + " &&\n";
+      op += SP + SP + SP + SP + SP + "iw_in >= 0 && iw_in < " + std::to_string(iWidth)  + ";\n\n";
+
+      op += SP + SP + SP + SP + "if (in_bounds) {\n";
+      op += SP + SP + SP + SP + SP + "std::size_t const in_idx =\n";
+      op += SP + SP + SP + SP + SP + SP + "ic * " + std::to_string(iDepth * iHeight * iWidth) + "u +\n";
+      op += SP + SP + SP + SP + SP + SP + "static_cast<std::size_t>(id_in) * " + std::to_string(iHeight * iWidth) + "u +\n";
+      op += SP + SP + SP + SP + SP + SP + "static_cast<std::size_t>(ih_in) * " + std::to_string(iWidth) + "u +\n";
+      op += SP + SP + SP + SP + SP + SP + "static_cast<std::size_t>(iw_in);\n";
+      op += SP + SP + SP + SP + SP + "col[elem_idx] = input[in_idx];\n";
+      op += SP + SP + SP + SP + "} else {\n";
+      op += SP + SP + SP + SP + SP + "col[elem_idx] = static_cast<T>(0);\n";
+      op += SP + SP + SP + SP + "}\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n\n";
+
+      // Kernel 3: Bias broadcast (only if bias present)
+      if (!fNB.empty()) {
+         std::string biasKname = "BiasBroadcastKernel_" + opName;
+         op += SP + "//------ BIAS_BROADCAST_KERNEL_ALPAKA (Conv " + opName + ")\n";
+         op += SP + "struct " + biasKname + " {\n";
+         op += SP + SP + "template<typename TAcc, typename T>\n";
+         op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+         op += SP + SP + SP + "TAcc const& acc,\n";
+         op += SP + SP + SP + "T const* __restrict__ bias,\n";
+         op += SP + SP + SP + "T* __restrict__ output,\n";
+         op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+         op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+         op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+         op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n";
+         op += SP + SP + SP + SP + "std::size_t const channel = elem_idx / " + std::to_string(spatialSize) + "u;\n";
+         op += SP + SP + SP + SP + "output[elem_idx] = bias[channel];\n";
+         op += SP + SP + SP + "}\n";
+         op += SP + SP + "}\n";
+         op += SP + "};\n\n";
+      }
+
+      return op;
+   }
+
+   /*! \brief Emit the declarations of the kernel functor instances used by this Conv operator
+    *
+    *  One instance is declared for the weight-vectorisation kernel and one for the im2col
+    *  kernel; a bias-broadcast instance is added only when a bias tensor name is set.
+    *  \param opName operator identifier, prefixed with "op_" to form the type and instance names
+    *  \return the generated declarations, one per line
+    */
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      const std::string tag = "op_" + opName;
+      // Builds "<typeStem><tag> <varStem><tag>;" preceded by one indentation unit.
+      auto declare = [&](const std::string &typeStem, const std::string &varStem) {
+         return SP + typeStem + tag + " " + varStem + tag + ";\n";
+      };
+      std::string decls = declare("WeightVecKernel_", "weightVecKernel_");
+      decls += declare("Im2ColKernel_", "im2colKernel_");
+      if (!fNB.empty()) {
+         decls += declare("BiasBroadcastKernel_", "biasBroadcastKernel_");
+      }
+      return decls;
+   }
+
+   /*! \brief Generate the host-side inference code of this Conv operator for the alpaka back end
+    *
+    *  The emitted code
+    *   1. launches weightVecKernel_<op> once, passing the weight buffer and the `convK` buffer,
+    *   2. loops over the batch (and over the groups when fAttrGroup > 1),
+    *   3. launches im2colKernel_<op> to fill the shared `imcol` buffer,
+    *   4. when a bias is present, launches biasBroadcastKernel_<op> on the output slice, and
+    *   5. calls blas.matmul with beta = 1 (bias present) or beta = 0 (no bias).
+    *
+    *  For a grouped convolution every group owns fShapeW[0] / fAttrGroup output channels;
+    *  the per-group GEMM size and the bias, output and `convK` offsets are all derived from
+    *  that per-group channel count.
+    *
+    *  \param opName operator identifier, prefixed with "op_" to form the kernel instance names
+    *  \return the generated C++ source
+    *  \throws std::runtime_error if the shapes are not initialised or if the number of output
+    *          channels is not a multiple of the group attribute
+    */
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty() || fShapeW.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first");
+      // The output channels are split evenly between the groups, so the division must be exact.
+      if (fAttrGroup == 0 || fShapeW[0] % fAttrGroup != 0)
+         throw std::runtime_error("SOFIE Conv Op: output channels " + std::to_string(fShapeW[0]) +
+                                  " are not divisible by group " + std::to_string(fAttrGroup));
+
+      size_t bsize       = fShapeX[0].dim;
+      size_t oDepth      = (fDim > 2) ? fShapeY[2].dim    : 1;
+      size_t oHeight     = (fDim > 1) ? fShapeY[fDim].dim : 1;
+      size_t oWidth      = fShapeY[fDim + 1].dim;
+      size_t iDepth      = (fDim > 2) ? fShapeX[2].dim    : 1;
+      size_t iHeight     = (fDim > 1) ? fShapeX[fDim].dim : 1;
+      size_t iWidth      = fShapeX[fDim + 1].dim;
+      size_t outChannels = fShapeW[0];
+      // NOTE(review): assumes fAttrKernelShape always holds three entries — confirm against Initialize.
+      size_t kernelSize  = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2];
+      // gemm dimensions computed from shape members
+      size_t gemm_n      = outChannels;                   // output channels (all groups)
+      size_t gemm_k      = fShapeW[1] * kernelSize;       // input channels/group * kernel volume
+      size_t gemm_m      = oDepth * oHeight * oWidth;     // output spatial size per channel
+      size_t colElements = gemm_k * gemm_m;   // colRows * colCols
+      size_t wTotal      = ConvertShapeToLength(fShapeW);
+
+      // Grouped convolution: each group computes groupOutChannels output channels, so the
+      // per-group GEMM, bias slice, output slice and _f slice are all sized with it.
+      size_t groupOutChannels = gemm_n / fAttrGroup;
+      size_t groupFOffset     = groupOutChannels * gemm_k;  // elements of _f per group
+
+      std::stringstream out;
+      out << "\n//------ CONV_GPU_ALPAKA\n";
+
+      // -----------------------------------------------------------------------
+      // Step 1: Weight vectorisation kernel — runs once, fully on GPU
+      // -----------------------------------------------------------------------
+      out << SP << "// Step 1: vectorise W -> _f on GPU (once per infer call)\n";
+      out << SP << "{\n";
+      out << SP << SP << "auto const elementsPerThread_wv = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << SP << "auto const elementsPerGrid_wv   = Vec::all(Idx{" << wTotal << "});\n";
+      out << SP << SP << "auto const workDiv_wv = sofie_workdiv(elementsPerGrid_wv);\n";
+      out << SP << SP << "alpaka::exec<Acc>(queue, workDiv_wv, weightVecKernel_" << opName
+         << ", alpaka::getPtrNative(deviceBuf_" << fNW << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << convK << ")"
+         << ", static_cast<Idx>(" << wTotal << "));\n";
+      out << SP << SP << "alpaka::wait(queue);\n";
+      out << SP << "}\n\n";
+
+      // -----------------------------------------------------------------------
+      // Step 2: Batch loop
+      // -----------------------------------------------------------------------
+      out << SP << "for (std::size_t n = 0; n < " << bsize << "; n++) {\n\n";
+      out << SP << SP << "std::size_t const x_offset   = n * "
+         << fShapeX[1].dim * iDepth * iHeight * iWidth << "u;\n";
+      out << SP << SP << "std::size_t const out_offset = n * "
+         << fShapeY[1].dim * gemm_m << "u;\n\n";
+
+      // -----------------------------------------------------------------------
+      // Step 3 + 4: Im2Col then GEMM — structure differs for grouped vs non-grouped
+      // -----------------------------------------------------------------------
+      if (fAttrGroup == 1) {
+         // Non-grouped: single im2col per batch, then GEMM
+         out << SP << SP << "// Step 3: im2col\n";
+         out << SP << SP << "{\n";
+         out << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast<Idx>(1));\n";
+         out << SP << SP << SP << "auto const elementsPerGrid_im2col   = Vec::all(Idx{" << colElements << "});\n";
+         out << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n";
+         out << SP << SP << SP << "alpaka::exec<Acc>(queue, workDiv_im2col, im2colKernel_" << opName
+            << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + x_offset"
+            << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+            << ", static_cast<Idx>(" << colElements << "));\n";
+         out << SP << SP << SP << "alpaka::wait(queue);\n";
+         out << SP << SP << "}\n\n";
+
+         if (!fNB.empty()) {
+            size_t biasElements = gemm_n * gemm_m;
+            out << SP << SP << "// Step 4a: broadcast bias into output slice\n";
+            out << SP << SP << "{\n";
+            out << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast<Idx>(1));\n";
+            out << SP << SP << SP << "auto const elementsPerGrid_bias   = Vec::all(Idx{" << biasElements << "});\n";
+            out << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n";
+            out << SP << SP << SP << "alpaka::exec<Acc>(queue, workDiv_bias, biasBroadcastKernel_" << opName
+               << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")"
+               << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset"
+               << ", static_cast<Idx>(" << biasElements << "));\n";
+            out << SP << SP << SP << "alpaka::wait(queue);\n";
+            out << SP << SP << "}\n\n";
+            out << SP << SP << "// Step 4b: GEMM beta=1 accumulates onto bias-initialised output\n";
+            out << SP << SP << "blas.matmul('n', 'n', "
+               << gemm_m << ", " << gemm_n << ", " << gemm_k
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+               << ", alpaka::getPtrNative(deviceBuf_" << convK << ")"
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n";
+         } else {
+            out << SP << SP << "// Step 4: GEMM beta=0 (no bias)\n";
+            out << SP << SP << "blas.matmul('n', 'n', "
+               << gemm_m << ", " << gemm_n << ", " << gemm_k
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+               << ", alpaka::getPtrNative(deviceBuf_" << convK << ")"
+               << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + out_offset);\n\n";
+         }
+         // Wait for GEMM to finish before next batch overwrites the shared _xcol buffer.
+         out << SP << SP << "alpaka::wait(queue);\n\n";
+
+      } else {
+         // Grouped convolution: im2col and GEMM per group with group-adjusted input pointer.
+         // Each group processes fShapeW[1] input channels starting at g * fShapeW[1] and
+         // writes groupOutChannels output channels starting at g * groupOutChannels.
+         out << SP << SP << "for (std::size_t g = 0; g < " << fAttrGroup << "; g++) {\n\n";
+         out << SP << SP << SP << "std::size_t const g_in_offset  = x_offset   + g * "
+               << fShapeW[1] * iDepth * iHeight * iWidth << "u;\n";
+         out << SP << SP << SP << "std::size_t const g_out_offset = out_offset + g * "
+               << groupOutChannels * gemm_m << "u;\n";
+         out << SP << SP << SP << "std::size_t const f_offset     = g * " << groupFOffset << "u;\n\n";
+
+         out << SP << SP << SP << "// im2col for group g (reads only this group's input channels)\n";
+         out << SP << SP << SP << "{\n";
+         out << SP << SP << SP << SP << "auto const elementsPerThread_im2col = Vec::all(static_cast<Idx>(1));\n";
+         out << SP << SP << SP << SP << "auto const elementsPerGrid_im2col   = Vec::all(Idx{" << colElements << "});\n";
+         out << SP << SP << SP << SP << "auto const workDiv_im2col = sofie_workdiv(elementsPerGrid_im2col);\n";
+         out << SP << SP << SP << SP << "alpaka::exec<Acc>(queue, workDiv_im2col, im2colKernel_" << opName
+            << ", alpaka::getPtrNative(deviceBuf_" << fNX << ") + g_in_offset"
+            << ", alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+            << ", static_cast<Idx>(" << colElements << "));\n";
+         out << SP << SP << SP << SP << "alpaka::wait(queue);\n";
+         out << SP << SP << SP << "}\n\n";
+
+         if (!fNB.empty()) {
+            // Only this group's channels are initialised: groupOutChannels rows of gemm_m elements.
+            size_t groupBiasElements = groupOutChannels * gemm_m;
+            out << SP << SP << SP << "// Broadcast group bias\n";
+            out << SP << SP << SP << "{\n";
+            out << SP << SP << SP << SP << "auto const elementsPerThread_bias = Vec::all(static_cast<Idx>(1));\n";
+            out << SP << SP << SP << SP << "auto const elementsPerGrid_bias   = Vec::all(Idx{" << groupBiasElements << "});\n";
+            out << SP << SP << SP << SP << "auto const workDiv_bias = sofie_workdiv(elementsPerGrid_bias);\n";
+            out << SP << SP << SP << SP << "alpaka::exec<Acc>(queue, workDiv_bias, biasBroadcastKernel_" << opName
+               << ", alpaka::getPtrNative(deviceBuf_" << fNB << ") + g * " << groupOutChannels
+               << ", alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset"
+               << ", static_cast<Idx>(" << groupBiasElements << "));\n";
+            out << SP << SP << SP << SP << "alpaka::wait(queue);\n";
+            out << SP << SP << SP << "}\n\n";
+            out << SP << SP << SP << "blas.matmul('n', 'n', "
+               << gemm_m << ", " << groupOutChannels << ", " << gemm_k
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+               << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset"
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n";
+         } else {
+            out << SP << SP << SP << "blas.matmul('n', 'n', "
+               << gemm_m << ", " << groupOutChannels << ", " << gemm_k
+               << ", 1.0f, alpaka::getPtrNative(deviceBuf_" << imcol << ")"
+               << ", alpaka::getPtrNative(deviceBuf_" << convK << ") + f_offset"
+               << ", 0.0f, alpaka::getPtrNative(deviceBuf_" << fNY << ") + g_out_offset);\n\n";
+         }
+         // Wait for GEMM to finish before next group's im2col overwrites the shared _xcol buffer.
+         out << SP << SP << SP << "alpaka::wait(queue);\n\n";
+         out << SP << SP << "}\n"; // end group loop
+      }
+
+      out << SP << "}\n"; // end batch loop
+      return out.str();
+   }
+
+   /*! \brief Returns the blas routines needed to compile the generated code
+    */
+   std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
+
+
+   /*! \brief Describe the GEMM issued by the code from Generate_GPU_ALPAKA
+    *
+    *  The dimensions are computed the same way as in Generate_GPU_ALPAKA so that the
+    *  descriptor matches the emitted blas.matmul call; for a grouped convolution n is the
+    *  number of output channels of a single group.
+    *  \return the string "m, n, k, lda, ldb, ldc, 'n', 'n'"
+    */
+   std::string GetBlasConfig(){
+      size_t oDepth_  = (fDim > 2) ? fShapeY[2].dim    : 1;
+      size_t oHeight_ = (fDim > 1) ? fShapeY[fDim].dim : 1;
+      size_t oWidth_  = fShapeY[fDim + 1].dim;
+      size_t kSize_   = fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2];
+      // Per-group output channels; equal to fShapeW[0] when the convolution is not grouped.
+      size_t gemm_n_  = (fAttrGroup > 1) ? fShapeW[0] / fAttrGroup : fShapeW[0];
+      size_t gemm_k_  = fShapeW[1] * kSize_;
+      size_t gemm_m_  = oDepth_ * oHeight_ * oWidth_;
+      auto lda = std::to_string(gemm_m_);  // ld for xcol^T (gemm_m×gemm_k col-major)
+      auto ldb = std::to_string(gemm_k_);  // ld for xf^T   (gemm_k×gemm_n col-major)
+      auto ldc = std::to_string(gemm_m_);  // ld for y^T    (gemm_m×gemm_n col-major)
+      return std::to_string(gemm_m_) + ", " + std::to_string(gemm_n_) + ", " + std::to_string(gemm_k_) + ", " + lda + ", " + ldb + ", " + ldc + ", 'n', 'n'";
+   }
+
+};
+
+} // namespace SOFIE
+
+#endif
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/core/inc/SOFIE/ROperator_ConvTranspose.hxx
similarity index 95%
rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx
rename to core/inc/SOFIE/ROperator_ConvTranspose.hxx
index 0467385..5a4acf3 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx
+++ b/core/inc/SOFIE/ROperator_ConvTranspose.hxx
@@ -1,9 +1,9 @@
 #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX
 #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX
 
-#include <SOFIE/SOFIE_common.hxx>
-#include <SOFIE/ROperator.hxx>
-#include <SOFIE/RModel.hxx>
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
 
 #include <memory>
 #include <sstream>
@@ -88,7 +88,7 @@ public:
       if (std::is_same<T, float>::value) {
          fType = "float";
       } else {
-         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
+         throw std::runtime_error("SOFIE Encountered unsupported type parsing a Conv operator");
       }
    }
 
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc b/core/inc/SOFIE/ROperator_ConvTranspose.icc
similarity index 93%
rename from src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc
rename to core/inc/SOFIE/ROperator_ConvTranspose.icc
index 3a52796..52b6b3e 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.icc
+++ b/core/inc/SOFIE/ROperator_ConvTranspose.icc
@@ -105,22 +105,22 @@ void ROperator_ConvTranspose<T>::Initialize(RModel& model){
 
    fUseSession = model.UseSession();
    if (!model.CheckIfTensorAlreadyExist(fNX)) {
-      throw std::runtime_error("TMVA SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model");
+      throw std::runtime_error("SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model");
    }
    fShapeX = model.GetTensorShape(fNX);
    if (fShapeX.size() < 3 || fShapeX.size() > 5) {
       std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
-      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input data tensor" + fNX +
+      throw std::runtime_error("SOFIE Conv Transpose Op input data tensor" + fNX +
                                " is not of 3,4 or 5 dimensions");
    }
    fDim = fShapeX.size() - 2;
    if (!model.CheckIfTensorAlreadyExist(fNW)) {
-      throw std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
+      throw std::runtime_error("SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
    }
    fShapeW = model.GetTensorShape(fNW);
    if (fShapeW.size() < 3 || fShapeW.size() > 5) {
       std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
-      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input weight tensor" + fNW +
+      throw std::runtime_error("SOFIE Conv Transpose Op input weight tensor" + fNW +
                                " is not of 3,4 or 5 dimensions");
    }
    fShapeY = ShapeInference({fShapeX, fShapeW})[0];
@@ -128,11 +128,11 @@ void ROperator_ConvTranspose<T>::Initialize(RModel& model){
    model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
    if (fNB != "") {
       if (!model.CheckIfTensorAlreadyExist(fNB)) {
-         throw std::runtime_error("TMVA SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model");
+         throw std::runtime_error("SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model");
       }
       fShapeB = model.GetTensorShape(fNB);
       if (fShapeB.size() < 1)
-            throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has empty shape");
+            throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has empty shape");
 
       size_t bsize = ConvertShapeToLength(fShapeB);
       size_t ysize = ConvertShapeToLength(fShapeY);
@@ -143,13 +143,13 @@ void ROperator_ConvTranspose<T>::Initialize(RModel& model){
          // we assume bias tensor size is equal to number of filters that is the second dimension in
          // the output tensor
          if (bsize != fShapeY[1] )
-            throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has wrong shape: " +
+            throw std::runtime_error("SOFIE ConvTrans op: Bias Tensor has wrong shape: " +
                                      ConvertShapeToString(fShapeB));
 
          auto original_data = model.GetInitializedTensorData(fNB);
 
          if (fType != "float")
-            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported");
+            throw std::runtime_error("SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported");
          // here the acual broadcasting
          if (!fUseSession) {
             // Broadcast B from M to N x M x Od x Oh x Ow
@@ -170,7 +170,7 @@ void ROperator_ConvTranspose<T>::Initialize(RModel& model){
       else {
          // bias tensor is already correct shape, no need to broadcast
          if (fShapeY != fShapeB)
-            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" +
+            throw std::runtime_error("SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape" +
                ConvertShapeToString(fShapeB));
          fNBroadcastedB = fNB;
       }
@@ -218,7 +218,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
    OpName = "op_" + OpName;
 
    if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
-      throw std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
+      throw std::runtime_error("SOFIE Conv Op called to Generate without being initialized first");
    }
 
    std::stringstream out;
@@ -331,7 +331,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
    // Resulting matrix op_xcol is (output channels * filter_h * filter_w , output_h * output_w)
    if (fDim == 1) {
       if (fAttrPads[0] != fAttrPads[1]) {
-         std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
+         std::cout << "SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
                    << std::endl;
          fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
       }
@@ -339,7 +339,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
    }
    if (fDim == 2) {
       if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
-         std::cout << "TMVA SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assume an average padding "
+         std::cout << "SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assume an average padding "
                    << std::endl;
          fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
          fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
@@ -347,7 +347,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
    }
    if (fDim == 3) {
       if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
-         std::cout << "TMVA SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assume an average padding "
+         std::cout << "SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assume an average padding "
                    << std::endl;
          fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
          fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
@@ -385,7 +385,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
          out << ", tensor_" << fNY << " + out_offset);\n\n ";
       } else {
          // 3d : needs a col2im for 3d
-         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");
+         throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported");
          out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
              << " + x_offset,"
              //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
@@ -436,7 +436,7 @@ std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
          out << ", tensor_" << fNY << " + out_offset);\n\n ";
       } else {
          // 3d im2col
-         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");
+         throw std::runtime_error("SOFIE 3D Conv Transpose not yet supported");
 
          out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
              << " + x_offset,"
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx b/core/inc/SOFIE/ROperator_Custom.hxx
similarity index 92%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx
rename to core/inc/SOFIE/ROperator_Custom.hxx
index c24d329..fb618d4 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Custom.hxx
+++ b/core/inc/SOFIE/ROperator_Custom.hxx
@@ -48,13 +48,13 @@ public:
 
       for(auto& it:fInputNames){
         if (model.CheckIfTensorAlreadyExist(it) == false){
-         throw std::runtime_error("TMVA SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model");
+         throw std::runtime_error("SOFIE Custom " + fOpName + " Op Input Tensor " + it + " is not found in model");
         }
         fInputSizes.push_back(ConvertShapeToLength(model.GetTensorShape(it)));
       }
 
       if(fOutputNames.size() != fOutputShapes.size()){
-        throw std::runtime_error("TMVA SOFIE Custom "+ fOpName + " Op was not intialized with the names/shapes of all the output tensors");
+        throw std::runtime_error("SOFIE Custom "+ fOpName + " Op was not intialized with the names/shapes of all the output tensors");
       }
 
       for(long unsigned int i=0; i<fOutputNames.size(); ++i){
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx b/core/inc/SOFIE/ROperator_Einsum.hxx
similarity index 95%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx
rename to core/inc/SOFIE/ROperator_Einsum.hxx
index e9b555b..c43ea31 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx
+++ b/core/inc/SOFIE/ROperator_Einsum.hxx
@@ -41,12 +41,13 @@ public:
    ROperator_Einsum(const std::string & equation, const std::vector<std::string> & namesX, const std::string & nameY):
       fNInputs(namesX.size()), fNY(UTILITY::Clean_name(nameY))
    {
+      fKind = OperatorKind::EINSUM;
       for (size_t i = 0; i < namesX.size(); i++)
          fNInputs[i] = UTILITY::Clean_name(namesX[i]);
 
       // parse teh equations to find labels
       if (!ParseEquation(equation))
-         throw std::runtime_error("TMVA SOFIE Einsum Op: Error parsing the equation " + equation);
+         throw std::runtime_error("SOFIE Einsum Op: Error parsing the equation " + equation);
 
       fInputTensorNames.resize(fNInputs.size());
       std::transform(fNInputs.begin(), fNInputs.end(), fInputTensorNames.begin(),
@@ -128,7 +129,7 @@ public:
       std::map<char, int> labelsMap;
       for ( auto & name : fNInputs) {
          if (!model.CheckIfTensorAlreadyExist(name))
-            throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor ") + name + "is not found in model");
+            throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor ") + name + "is not found in model");
 
          // if (model.IsDynamicTensor(name) || model.IsDimInputTensor(name) ) {
          //    // not yet supported
@@ -140,7 +141,7 @@ public:
          std::string labels = fInputLabels[i];
          for (size_t j = 0; j < shape.size(); j++) {
             if (j >= labels.length()) {
-               throw std::runtime_error(std::string("TMVA SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape));
+               throw std::runtime_error(std::string("SOFIE Einsum Op Input Tensor has invalid label or shape ") + labels + " " + ConvertShapeToString(shape));
             }
             labelsMap[labels[j]] = shape[j];
          }
@@ -149,7 +150,7 @@ public:
       // get output shape from label maps
       for (char l : fOutputLabels) {
          if (labelsMap.count(l) == 0)
-            throw std::runtime_error(std::string("TMVA SOFIE Einsum Op : output label ") + std::string(&l) + " is not present in inputs");
+            throw std::runtime_error(std::string("SOFIE Einsum Op : output label ") + std::string(&l) + " is not present in inputs");
          fShapeY.push_back(labelsMap[l]);
       }
       // we need to get the labels we are going to sum
@@ -209,7 +210,7 @@ public:
       opName = "op_" + opName;
 
       if (fShapeY.size() != fOutputLabels.length()) {
-         throw std::runtime_error("TMVA SOFIE Einsum Op called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Einsum Op called to Generate without being initialized first");
       }
 
       // function to write compute expression index from strides
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx b/core/inc/SOFIE/ROperator_Elu.hxx
similarity index 81%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx
rename to core/inc/SOFIE/ROperator_Elu.hxx
index 34e18a6..6588b61 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx
+++ b/core/inc/SOFIE/ROperator_Elu.hxx
@@ -19,7 +19,7 @@ private:
    float falpha= 1.0; //default value
    std::string fNX;
    std::string fNY;
-   std::vector<size_t> fShape;
+   std::vector<Dim> fShape;
    std::string fType;
 
 public:
@@ -27,6 +27,7 @@ public:
    ROperator_Elu(float alpha,std::string nameX, std::string nameY):
    falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
    {
+      fKind = OperatorKind::ELU;
       fInputTensorNames = { fNX };
       fOutputTensorNames = { fNY };
       
@@ -34,7 +35,7 @@ public:
          fType = "float";
       }
 		else{
-			throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Elu operator");
+			throw std::runtime_error("SOFIE Encountered unsupported type parsing a Elu operator");
 		}
    }
 
@@ -49,9 +50,9 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Elu Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE Elu Op Input Tensor is not found in model");
       }
-      fShape = model.GetTensorShape(fNX);
+      fShape = model.GetDimTensorShape(fNX);
       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
    }
 
@@ -59,10 +60,10 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator Elu called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Elu called to Generate without being initialized first");
       }
       std::stringstream out;
-      size_t length = ConvertShapeToLength(fShape);
+      std::string length = ConvertDimShapeToLength(fShape);
 
       out << SP << "float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits<float>::max_digits10) << falpha << ";\n";
 
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx b/core/inc/SOFIE/ROperator_Erf.hxx
similarity index 93%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx
rename to core/inc/SOFIE/ROperator_Erf.hxx
index 72f8cc5..6a51864 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Erf.hxx
+++ b/core/inc/SOFIE/ROperator_Erf.hxx
@@ -17,7 +17,7 @@ private:
 
    std::string fNX;
    std::string fNY;
-   std::vector<size_t> fShape;
+   std::vector<Dim> fShape;
 
 public:
    ROperator_Erf(){}
@@ -41,7 +41,7 @@ public:
       if (model.CheckIfTensorAlreadyExist(fNX) == false){
         throw std::runtime_error("SOFIE SOFIE Erf Op Input Tensor is not found in model");
       }
-      fShape = model.GetTensorShape(fNX);
+      fShape = model.GetDimTensorShape(fNX);
       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
    }
 
@@ -52,7 +52,7 @@ public:
          throw std::runtime_error("SOFIE SOFIE Erf operator called to Generate without being initialized first");
       }
       std::stringstream out;
-      size_t length = ConvertShapeToLength(fShape);
+      std::string length = ConvertDimShapeToLength(fShape);
       out << "\n//------ ERF\n";
       out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
       out << SP << SP << "tensor_" << fNY << "[id] = std::erf(tensor_" << fNX << "[id]);\n";
diff --git a/core/inc/SOFIE/ROperator_Expand.hxx b/core/inc/SOFIE/ROperator_Expand.hxx
new file mode 100644
index 0000000..95955ed
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Expand.hxx
@@ -0,0 +1,347 @@
+#ifndef SOFIE_ROperator_Expand
+#define SOFIE_ROperator_Expand
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+template<typename T>
+class ROperator_Expand final : public ROperator{
+private:
+
+   std::vector<Dim> fShapeX;
+   std::vector<size_t> fShape;
+   std::vector<Dim> fShapeY;
+   std::vector<Dim> fShapeDim;
+
+   std::string fNX;
+   std::string fNShape;
+   std::string fNY;
+   std::string fType;
+
+   bool fInitialized = false;
+   bool fInitializedShape = false;
+   bool fInitBroadcast = false;
+
+public:
+   // Default constructor: all tensor names are left empty.
+   ROperator_Expand() {}
+   // Construct from the data, shape and output tensor names, each passed through UTILITY::Clean_name.
+   ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY))
+   { fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; }
+
+
+   // Resolve the target shape, compute the broadcast output shape fShapeY and register fNY in the model.
+   // Throws std::runtime_error when the input tensor is missing or the requested shape is invalid.
+   void Initialize(RModel& model) override {
+      // input must be a graph input, or already initialized intermediate tensor
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+        throw std::runtime_error("SOFIE Expand Op Input Tensor " + fNX + " is not found in model");
+      }
+      fShapeX = model.GetDimTensorShape(fNX);
+      if (model.IsInitializedTensor(fNShape)) {
+         fInitializedShape = true;
+         int64_t *shapeData =
+           static_cast<int64_t *>(model.GetInitializedTensorData(fNShape).get());
+         fShape = model.GetTensorShape(fNShape);
+         if (fShape.size() != 1) {
+            throw std::runtime_error("SOFIE - Expand operator shape must be a 1d tensor.");
+         }
+         size_t N = fShape[0];
+         // negative entries cannot be stored in a size_t shape: reject them explicitly
+         for (size_t i = 0; i < N; i++) {
+            if ( shapeData[i] < 0)
+               throw std::runtime_error("SOFIE - Expand: invalid shape value " + std::to_string(shapeData[i]));
+         }
+         std::vector<size_t> shape(shapeData, shapeData + N);
+         fShapeDim = ConvertShapeToDim(shape);
+      } else if (model.IsShapeTensor(fNShape)) {
+         // case input shape is a shape tensor
+         fShapeDim = model.GetShapeTensorValues(fNShape);
+         fInitializedShape = true;
+      } else {
+         // shape values are not known here: one shape parameter per entry, so the shape tensor must be 1d
+         auto shapeOfInputShape = model.GetTensorShape(fNShape);
+         if (shapeOfInputShape.size() != 1)
+            throw std::runtime_error("SOFIE - Expand operator shape must be a 1d tensor.");
+         fShapeDim.resize(shapeOfInputShape[0]);
+         for (size_t i = 0; i < fShapeDim.size(); i++) {
+            fShapeDim[i] = Dim{std::string("v_") + fNShape + "_" + std::to_string(i)};
+            model.AddShapeParam(fShapeDim[i].param);
+         }
+      }
+      // Y is the common shape of fShapeX and shape
+      auto ret  = SOFIE::UTILITY::MultidirectionalBroadcastShape(fShapeX, fShapeDim);
+      fShapeY = ret.second;
+      fInitialized = model.IsInitializedTensor(fNX) && fInitializedShape;
+      std::vector<size_t> shapeX;
+      std::vector<size_t> shapeY;
+      // case shape tensor and input shape are known
+      if (!model.IsDynamicTensor(fNX) && !model.IsDimInputTensor(fNX) && fInitializedShape) {
+         shapeX = ConvertShapeToInt(fShapeX);
+         shapeY = ConvertShapeToInt(fShapeY);
+         if (!UTILITY::AreSameShape(shapeX, shapeY))
+            fInitBroadcast = true;
+      }
+      if (fInitialized) {
+         // cannot have Dim initialized tensors
+         assert(!shapeX.empty() && !shapeY.empty());
+         // Broadcast X to the common shape shapeY
+         // If X is an initialized tensor (constant)
+         auto data = model.GetInitializedTensorData(fNX);
+         if (fInitBroadcast) {
+            std::shared_ptr<void> broadcastedData(
+               UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), shapeX, shapeY),
+               std::default_delete<T[]>());
+            // Update the data and the shape of X
+            model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData);
+            fShapeX = fShapeY;
+            // need to set as a not writable tensor
+            model.SetNotWritableInitializedTensor(fNX);
+            data = broadcastedData;
+         }
+         if (fInitBroadcast || model.IsConstantTensor(fNX)) {
+            fIsOutputConstant = true; // constant output in this case
+            model.AddConstantTensor(fNY, model.GetTensorType(fNX), shapeY, data);
+            fOutputTensorNames.pop_back();
+         } else {
+            model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), shapeY);
+         }
+      } else {
+         // X or the target shape is not initialized: register Y as an intermediate tensor with its Dim shape
+         model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+      }
+      fType = ConvertTypeToString(model.GetTensorType(fNX));
+      if (model.Verbose()) {
+         std::cout << "Expand - input " << fNX << " shape " << ConvertDimShapeToString(fShapeX) << " --> " << fNY << " shape "
+                  << ConvertDimShapeToString(fShapeY) << (fIsOutputConstant ? ConvertValuesToString(model.GetTensorData<T>(fNY)) + " (constant)" : "") << std::endl;
+      }
+
+      if (fInitializedShape && model.IsInitializedTensor(fNShape)) {
+         // Shape values are fully consumed into fShapeY/fShapeDim at generation time —
+         // no device buffer needed for fNShape for Heterogeneous inference
+         model.SetNotWritableInitializedTensor(fNShape);
+      }
+   }
+
+   // Emit the init-time copy of an initialized, non-constant X into Y; returns "" otherwise.
+   std::string GenerateInitCode() override {
+      // nothing to copy for a constant output, an uninitialized input, or an input broadcast in Initialize
+      if (fIsOutputConstant || !fInitialized || fInitBroadcast) return "";
+      const auto nElements = ConvertDimShapeToLength(fShapeY);
+      std::stringstream code;
+      code << "// Copying initialized tensor " << fNX << " to " << fNY << "\n";
+      code << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << nElements << ", tensor_" << fNY << ");\n";
+      return code.str();
+   }
+
+   // Emit the inference code: declare runtime shape parameters, then fill the output tensor from X.
+   std::string Generate(std::string opName) override {
+      if (fIsOutputConstant) return "";
+      opName = "op_" + opName;
+      if (fShapeY.empty())
+         throw std::runtime_error("SOFIE Expand Op called to Generate without being initialized first");
+      std::stringstream out;
+      out << SP << "\n//------ Expand " << opName << " --> " << ConvertDimShapeToString(fShapeY) << "\n";
+      // need to declare shape parameters for non initialized shapes
+      if (!fInitializedShape) {
+         for (size_t i = 0; i < fShapeDim.size(); i++)
+            out << SP << "size_t " << fShapeDim[i] << " = " << "tensor_" << fNShape << "[" << i << "];\n";
+      }
+      if (fInitialized) return out.str(); // an initialized X is copied by GenerateInitCode
+      if (fShapeX != fShapeY) {
+         out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n";
+         out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNX << ", " << ConvertDimShapeToString(fShapeX) << ", " << ConvertDimShapeToString(fShapeY) << ", tensor_"<<fNY<<");\n";
+      } else { // identical shapes: Y still has to receive the values of X (the GPU path copies as well)
+         out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << ConvertDimShapeToLength(fShapeY) << ", tensor_" << fNY << ");\n";
+      }
+      return out.str();
+   }
+
+// Generate the ALPAKA kernel functor that broadcasts X into Y when both shapes are static.
+// Returns "" when no kernel is needed: constant output, initialized input, parametric
+// dimensions, or identical shapes. Throws std::runtime_error if fShapeY is empty.
+// The emitted functor takes the input/output pointers and the number of output elements.
+std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+    if (fIsOutputConstant) return "";
+    if (fInitialized) return "";
+
+    opName = "op_" + opName;
+    if (fShapeY.empty())
+        throw std::runtime_error("SOFIE Expand Op called to Generate without being initialized first");
+
+    // Can only generate a static kernel if all dimensions are concrete values
+    auto isStatic = [](const std::vector<Dim>& shape) {
+        return std::all_of(shape.begin(), shape.end(),
+                           [](const Dim& d){ return !d.isParam; });
+    };
+    if (!isStatic(fShapeX) || !isStatic(fShapeY)) return "";
+
+    // Check if broadcast is actually needed (the rank or at least one extent differs)
+    bool needsBroadcast = (fShapeX.size() != fShapeY.size());
+    if (!needsBroadcast) {
+        for (size_t i = 0; i < fShapeX.size(); ++i)
+            if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; }
+    }
+    if (!needsBroadcast) return ""; // same static shape — just a memcpy
+
+    const std::size_t D = fShapeY.size();
+
+    // Left-pad fShapeX with dim=1 entries to match rank of fShapeY
+    std::vector<size_t> shapeX_padded(D, 1);
+    size_t offset = D - fShapeX.size();
+    for (size_t i = 0; i < fShapeX.size(); ++i)
+        shapeX_padded[offset + i] = fShapeX[i].dim;
+
+    std::vector<size_t> shapeY_int(D);
+    for (size_t i = 0; i < D; ++i)
+        shapeY_int[i] = fShapeY[i].dim;
+
+    auto stridesX = UTILITY::ComputeStrideFromShape(shapeX_padded);
+    auto stridesY = UTILITY::ComputeStrideFromShape(shapeY_int);
+
+    std::string kname = "ExpandKernel_" + opName;
+
+    std::string op;
+    op  = "\n//------ EXPAND_KERNEL_ALPAKA\n";
+    op += SP + "struct " + kname + " {\n";
+    op += SP + SP + "template<typename TAcc, typename T>\n";
+    op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+    op += SP + SP + SP + "TAcc const& acc,\n";
+    op += SP + SP + SP + "T const* __restrict__ input,\n";
+    op += SP + SP + SP + "T* __restrict__ output,\n";
+    op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+    op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+    op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+    op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+    op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+    // Decompose output linear index using compile-time output strides
+    for (std::size_t d = 0; d < D; ++d) {
+        op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+            + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % "
+            + std::to_string(shapeY_int[d]) + "u;\n";
+    }
+    op += "\n";
+
+    // Input index: broadcast dims (shapeX_padded[d]==1) contribute 0 —
+    // compiler eliminates zero terms entirely, no runtime branch
+    op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+    for (std::size_t d = 0; d < D; ++d) {
+        if (shapeX_padded[d] == 1) {
+            op += SP + SP + SP + SP + SP + "0u";
+        } else {
+            op += SP + SP + SP + SP + SP
+                + "out_" + std::to_string(d)
+                + " * " + std::to_string(stridesX[d]) + "u";
+        }
+        op += (d + 1 < D) ? " +\n" : ";\n\n";
+    }
+
+    op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n";
+    op += SP + SP + SP + "}\n";   // end grid-stride loop
+    op += SP + SP + "}\n";        // end operator()
+    op += SP + "};\n";            // end struct
+
+    return op;
+}
+
+// Emit the declaration of the kernel functor instance that Generate_GPU_ALPAKA launches.
+// Returns "" when no instance is declared: constant output, initialized input,
+// parametric dimensions, or identical static shapes.
+std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+    if (fIsOutputConstant || fInitialized) return "";
+
+    // Any parametric dimension rules out the static kernel
+    const auto hasParam = [](const Dim& d){ return d.isParam; };
+    const bool dynamicX = std::any_of(fShapeX.begin(), fShapeX.end(), hasParam);
+    const bool dynamicY = std::any_of(fShapeY.begin(), fShapeY.end(), hasParam);
+    if (dynamicX || dynamicY) return "";
+
+    // A kernel instance is only needed when the rank or at least one extent differs
+    bool shapesDiffer = (fShapeX.size() != fShapeY.size());
+    for (size_t k = 0; !shapesDiffer && k < fShapeX.size(); ++k)
+        shapesDiffer = (fShapeX[k].dim != fShapeY[k].dim);
+    if (!shapesDiffer) return "";
+
+    // Type name is ExpandKernel_op_<name>, instance name is expandKernel_op_<name>
+    const std::string tag = "op_" + opName;
+    return SP + "ExpandKernel_" + tag + " expandKernel_" + tag + ";\n";
+}
+
+// Emit the inference-time device code for Expand. Depending on the state set by
+// Initialize this is nothing, a device-to-device copy (equal static shapes) or
+// the launch of the expand kernel (static broadcast).
+// Throws std::runtime_error if fShapeY is empty or a shape has parametric dimensions.
+std::string Generate_GPU_ALPAKA(std::string opName) override {
+    if (fIsOutputConstant) return "";
+    if (fShapeY.empty())
+        throw std::runtime_error("SOFIE Operator Expand called to Generate without being initialized first");
+
+    // GenerateInitCode already handled the copy — nothing to do at inference time
+    if (fInitialized && !fInitBroadcast) return "";
+
+    // A shape with at least one parametric dimension is treated as dynamic;
+    // no copy or kernel launch is generated for it
+    const auto hasParam = [](const Dim& d){ return d.isParam; };
+    const bool dynamicShapes = std::any_of(fShapeX.begin(), fShapeX.end(), hasParam)
+                            || std::any_of(fShapeY.begin(), fShapeY.end(), hasParam);
+    if (dynamicShapes) {
+        // Dynamic shapes — not yet supported on GPU, throw a clear error
+        throw std::runtime_error(
+            "SOFIE Expand GPU: dynamic shapes are not yet supported for GPU inference. "
+            "Tensor " + fNX + " has a dynamic shape.");
+    }
+
+    const std::string tag = "op_" + opName;
+    std::stringstream code;
+    code << "\n//------ EXPAND_GPU_ALPAKA\n";
+
+    // Static shapes: a broadcast is needed when the rank or any extent differs
+    bool shapesDiffer = (fShapeX.size() != fShapeY.size());
+    for (size_t k = 0; !shapesDiffer && k < fShapeX.size(); ++k)
+        shapesDiffer = (fShapeX[k].dim != fShapeY[k].dim);
+
+    if (!shapesDiffer) {
+        // Same static shape — device-to-device copy
+        code << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY
+             << ", deviceBuf_" << fNX << ");\n";
+        code << SP << "alpaka::wait(queue);\n";
+        return code.str();
+    }
+
+    // Static broadcast — collect the concrete output extents; their product is
+    // passed to the kernel as the number of output elements
+    std::vector<size_t> extentsY;
+    extentsY.reserve(fShapeY.size());
+    for (const Dim& d : fShapeY)
+        extentsY.push_back(d.dim);
+    const std::size_t totalElements = ConvertShapeToLength(extentsY);
+    const std::string kernelInstance = "expandKernel_" + tag;
+
+    // elementsPerThread_ is emitted as 1; the work division itself is computed by
+    // sofie_workdiv in the generated code
+    code << SP << "auto const elementsPerThread_" << tag << " = Vec::all(static_cast<Idx>(1));\n";
+    code << SP << "auto const elementsPerGrid_"   << tag << " = Vec::all(Idx{" << totalElements << "});\n";
+    code << SP << "auto const workDiv_" << tag << " = sofie_workdiv(elementsPerGrid_" << tag << ");\n";
+    code << SP << "auto task_" << tag << " = alpaka::createTaskKernel<Acc>(workDiv_" << tag
+         << ", " << kernelInstance
+         << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+         << ", static_cast<Idx>(" << totalElements << "));\n";
+    code << SP << "alpaka::enqueue(queue, task_" << tag << ");\n";
+
+    return code.str();
+}
+};
+}//SOFIE
+
+#endif //SOFIE_ROperator_Expand
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx b/core/inc/SOFIE/ROperator_EyeLike.hxx
similarity index 89%
rename from src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx
rename to core/inc/SOFIE/ROperator_EyeLike.hxx
index 8e94e1c..91103ef 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_EyeLike.hxx
+++ b/core/inc/SOFIE/ROperator_EyeLike.hxx
@@ -40,11 +40,11 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not found in model");
       }
       fShape = model.GetTensorShape(fNX);
       if (fShape.size() != 2)
-         throw std::runtime_error("TMVA SOFIE EyeLike Op Input Tensor is not of rank 2");
+         throw std::runtime_error("SOFIE EyeLike Op Input Tensor is not of rank 2");
 
       if(fdtype){
         ETensorType extractedType = static_cast<ETensorType>(fdtype);
@@ -59,7 +59,7 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShape.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator EyeLike called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator EyeLike called to Generate without being initialized first");
       }
       auto length = ConvertShapeToLength(fShape);
       auto stride = SOFIE::UTILITY::ComputeStrideFromShape(fShape);
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx b/core/inc/SOFIE/ROperator_GRU.hxx
similarity index 92%
rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx
rename to core/inc/SOFIE/ROperator_GRU.hxx
index bb1a74e..037e016 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx
+++ b/core/inc/SOFIE/ROperator_GRU.hxx
@@ -11,7 +11,6 @@
 #include <string>
 #include <vector>
 
-
 namespace SOFIE {
 
 /*! \brief Gated Recurrent Unit operator
@@ -91,7 +90,7 @@ template <typename T> class ROperator_GRU final : public ROperator {
          fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)),
          fNInitial_h(UTILITY::Clean_name(nameInitial_h)),
          fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) {
-      
+
       fInputTensorNames = { fNX, fNW, fNR };
       if (!fNB.empty()){
         fInputTensorNames.emplace_back(fNB);
@@ -115,7 +114,7 @@ template <typename T> class ROperator_GRU final : public ROperator {
          fType = "float";
       } else {
          throw std::runtime_error(
-             "TMVA SOFIE Encountered unsupported type parsing a GRU operator");
+             "SOFIE Encountered unsupported type parsing a GRU operator");
       }
    }
 
@@ -123,39 +122,34 @@ template <typename T> class ROperator_GRU final : public ROperator {
     *
     * \param input type of the input tensors
     */
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> /*input*/);
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> /*input*/) override;
 
    /*! \brief Infers the shape of the output tensors
     *
     * \param input shape of the input tensors
     */
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> /*input*/);
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> /*input*/) override;
 
    /*! \brief Initialize the model
     *
     * \param model Model
     */
-   void Initialize(RModel &);
+   void Initialize(RModel &) override;
 
    /*! \brief Generate the inference code
     *
     * \param OpName name of the operator
     */
-   std::string Generate(std::string /*OpName*/);
-
-   /*! \brief Generate the code for the Session internal data vectors
-    *
-    * \param opName name of the operator
-    */
-   std::string GenerateSessionMembersCode(std::string opName);
+   std::string Generate(std::string /*OpName*/) override;
 
    /*! \brief Returns the blas routines needed to compile the generated code
     */
-   std::vector<std::string> GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; }
+   std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
 };
 
 } // namespace SOFIE
 
+
 // Implementation of the ROperator_GRU class
 #include "SOFIE/ROperator_GRU.icc"
 
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/core/inc/SOFIE/ROperator_GRU.icc
similarity index 93%
rename from src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc
rename to core/inc/SOFIE/ROperator_GRU.icc
index f3813c2..f24460c 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc
+++ b/core/inc/SOFIE/ROperator_GRU.icc
@@ -38,33 +38,33 @@ void ROperator_GRU<T>::Initialize(RModel& model){
    fUseSession = model.UseSession();
    // Check the input and output tensors
    if (!model.CheckIfTensorAlreadyExist(fNX)) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + "  is not found in model.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + "  is not found in model.");
    }
    fShapeX = model.GetTensorShape(fNX);
    if (fShapeX.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNX + " is not of 3 dimensions.");
    }
    if (!model.CheckIfTensorAlreadyExist(fNW)) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + "  is not found in model.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + "  is not found in model.");
    }
    fShapeW = model.GetTensorShape(fNW);
    if (fShapeW.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNW + " is not of 3 dimensions.");
    }
    if (!model.CheckIfTensorAlreadyExist(fNR)) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + "  is not found in model.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNR + "  is not found in model.");
    }
    fShapeR = model.GetTensorShape(fNR);
    if (fShapeR.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions.");
+      throw std::runtime_error("SOFIE GRU Op input tensor " + fNR + " is not of 3 dimensions.");
    }
    if (!fNB.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNB)) {
-         throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not  found in model.");
+         throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not  found in model.");
       }
       fShapeB = model.GetTensorShape(fNB);
       if (fShapeB.size() != 2 && fShapeB.size() != 4) {
-         throw std::runtime_error("TMVA SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions.");
+         throw std::runtime_error("SOFIE GRU op input tensor " + fNB + " is not of 2 or 4 dimensions.");
       }
       if (fShapeB.size() == 2) {
          // Broadcasting the bias
@@ -99,25 +99,25 @@ void ROperator_GRU<T>::Initialize(RModel& model){
    }
    if (!fNSequence_lens.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) {
-         throw std::runtime_error("TMVA SOFIE GRU Op input tensor " +
+         throw std::runtime_error("SOFIE GRU Op input tensor " +
                                   fNSequence_lens +
                                   "is not found in model.");
       }
       fShapeSequence_lens = model.GetTensorShape(fNSequence_lens);
       if (fShapeSequence_lens.size() != 1) {
-         throw std::runtime_error("TMVA SOFIE GRU Op input tensor " +
+         throw std::runtime_error("SOFIE GRU Op input tensor " +
                                   fNSequence_lens +
                                   " is not of 1 dimension.");
       }
    }
    if (!fNInitial_h.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) {
-         throw std::runtime_error("TMVA SOFIE GRU Op input tensor " +
+         throw std::runtime_error("SOFIE GRU Op input tensor " +
                                   fNInitial_h + " is not found in model.");
       }
       fShapeInitial_h = model.GetTensorShape(fNInitial_h);
       if (fShapeInitial_h.size() != 3) {
-         throw std::runtime_error("TMVA SOFIE GRU Op input tensor " +
+         throw std::runtime_error("SOFIE GRU Op input tensor " +
                                   fNInitial_h + " is not of 3 dimensions.");
       }
    }
@@ -141,7 +141,7 @@ void ROperator_GRU<T>::Initialize(RModel& model){
           activation != "ScaledTanh" && activation != "HardSigmoid" &&
           activation != "Elu" && activation != "Softsign" &&
           activation != "Softplus") {
-         throw std::runtime_error("TMVA SOFIE - Activation function " +
+         throw std::runtime_error("SOFIE - Activation function " +
                                   activation + " not implemented");
       }
    }
@@ -150,22 +150,22 @@ void ROperator_GRU<T>::Initialize(RModel& model){
        fAttrDirection != "reverse" &&
        fAttrDirection != "bidirectional") {
       throw std::runtime_error(
-          "TMVA SOFIE - Invalid GRU direction fAttrDirection = " +
+          "SOFIE - Invalid GRU direction fAttrDirection = " +
           fAttrDirection);
    }
    if (3 * fAttrHiddenSize != fShapeW[1]) {
       throw std::runtime_error(
-          "TMVA SOFIE - fAttrHiddenSize must be equal to " +
+          "SOFIE - fAttrHiddenSize must be equal to " +
           std::to_string(fShapeW[1] / 3));
    }
    if (fAttrLayout > 1) {
-      throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = " +
+      throw std::runtime_error("SOFIE - Layout fAttrLayout = " +
                                std::to_string(fAttrLayout) +
                                " must be 0 (timewise) or 1 (batchwise)");
    }
    if (fAttrLinearBeforeReset > 1) {
       throw std::runtime_error(
-         "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrLinearBeforeReset)
+         "SOFIE - fAttrInputForget = " + std::to_string(fAttrLinearBeforeReset)
          + " must be 0 or 1.");
    }
    if (fAttrActivations.empty()) {
@@ -175,51 +175,45 @@ void ROperator_GRU<T>::Initialize(RModel& model){
          fAttrActivations = {"Sigmoid", "Tanh"};
       }
    }
-}
 
-// generate code for Session data members (e.g. internal vectors)
-template <typename T>
-std::string ROperator_GRU<T>::GenerateSessionMembersCode(std::string opName)
-{
-   opName = "op_" + opName;
-   std::stringstream out;
+   // To get unique intermediate tensor names, we add the name of the input
+   // tensor. One might also consider using the index of the operator in the
+   // RModel, but this information is not available in the current scope.
+   std::string opName = "op_gru_" + fNX;
 
    size_t num_directions = fShapeW[0];
    size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1];
    size_t batch_size = (fAttrLayout == 0) ? fShapeX[1] : fShapeX[0];
    size_t input_size = fShapeX[2];
 
+   auto declareVector = [&](std::string const &name, std::size_t n){
+      std::string fullName = opName + "_" + name;
+      model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector<std::size_t>{n});
+   };
+
    if (fAttrLayout != 0) {
-      out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">("
-          << seq_length * batch_size * input_size << ");\n";
-      out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">("
-          << num_directions * batch_size * fAttrHiddenSize << ");\n";
-      out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">("
-          << num_directions * batch_size * fAttrHiddenSize << ");\n";
+      declareVector("input", seq_length * batch_size * input_size);
+      declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize);
+      declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize);
    }
    // Set the feedforward
    size_t ff_size = seq_length * batch_size * fAttrHiddenSize;
-   out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n";
-   out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n";
-   out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n";
+   declareVector("f_update_gate", ff_size);
+   declareVector("f_reset_gate", ff_size);
+   declareVector("f_hidden_gate", ff_size);
    // gate results
    size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize;
-   out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << fType << ">(" << hs_size << ");\n";
-   out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n";
-   out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n";
+   declareVector("update_gate", hs_size);
+   declareVector("reset_gate", hs_size);
+   declareVector("hidden_gate", hs_size);
 
    // feedback
-   out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">("
-       << batch_size * fAttrHiddenSize << ");\n";
+   declareVector("feedback", batch_size * fAttrHiddenSize);
 
    // hiddden state
    if (fAttrLayout != 0 || fNY.empty()) {
-      out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n";
+      declareVector("hidden_state", hs_size);
    }
-
-   out << "\n";
-
-   return out.str();
 }
 
 
@@ -234,12 +228,14 @@ auto ROperator_GRU<T>::Generate(std::string OpName)
    size_t input_size = fShapeX[2];
    size_t num_directions = fShapeW[0];
 
+   auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; };
+
    // set the input
    if (fAttrLayout == 0) {
-      out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n";
+      out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n";
    } else {
       if (fUseSession) {
-         out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n";
+         out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n";
       } else {
          out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n";
       }
@@ -261,8 +257,7 @@ auto ROperator_GRU<T>::Generate(std::string OpName)
                 << fNInitial_h << ";\n";
       } else {
          if (fUseSession) {
-            out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName
-                << "_initial_hidden_state.data();\n";
+            out << SP << fType << " * " << OpName << "_initial_hidden_state = " << getVec("initial_hidden_state") << ";\n";
          } else {
             out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size *
                 fAttrHiddenSize << "];\n";
@@ -283,9 +278,9 @@ auto ROperator_GRU<T>::Generate(std::string OpName)
    // Set the feedforward
    size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize;
    if (fUseSession) {
-      out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n";
-      out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n";
-      out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n";
+      out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n";
+      out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n";
+      out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n";
    } else {
       out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n";
       out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n";
@@ -294,9 +289,9 @@ auto ROperator_GRU<T>::Generate(std::string OpName)
    // Set the gates
    size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize;
    if (fUseSession) {
-      out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n";
-      out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n";
-      out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n";
+      out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n";
+      out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n";
+      out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n";
    } else {
       out << SP << fType << " " << OpName << "_update_gate[" << hidden_state_size << "] = {0};\n";
       out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n";
@@ -307,14 +302,14 @@ auto ROperator_GRU<T>::Generate(std::string OpName)
       out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n";
    } else {
       if (fUseSession) {
-         out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n";
+         out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n";
       } else {
          out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n";
       }
    }
 
    if (fUseSession) {
-      out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n";
+      out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n";
    } else {
       out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n";
    }
diff --git a/core/inc/SOFIE/ROperator_Gather.hxx b/core/inc/SOFIE/ROperator_Gather.hxx
new file mode 100644
index 0000000..3c16f18
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Gather.hxx
@@ -0,0 +1,400 @@
+#ifndef SOFIE_ROPERATOR_GATHER
+#define SOFIE_ROPERATOR_GATHER
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+
+namespace SOFIE{
+
+class ROperator_Gather final : public ROperator
+{
+private:
+
+   int64_t fAttrAxis = 0;
+
+   std::string fNX;
+   std::string fNIndices;
+   std::string fNY;
+
+   std::vector<Dim> fShapeX;
+   std::vector<Dim> fShapeIndices;
+   std::vector<Dim> fShapeY;
+
+   std::vector<int64_t> fIndices;  // indices vector in case they are known at initialization
+
+   std::string fType;
+
+public:
+   ROperator_Gather(){}  // default: axis stays 0 and all tensor names stay empty
+   ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY):  // attrAxis may be negative; Initialize() adds the input rank to it
+      fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), fNY(UTILITY::Clean_name(nameY)) {  // names are stored after UTILITY::Clean_name
+         fInputTensorNames = { fNX, fNIndices };  // inputs: data tensor first, indices tensor second
+         fOutputTensorNames = { fNY };  // single output tensor
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {  // returns the received type list unchanged
+      return input;
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      // Shapes are forwarded untouched; fShapeY is derived from the Dim shapes in Initialize().
+      return input;
+   }
+
+   // Resolves shapes, validates the axis, folds constant cases and registers the output tensor.
+   // Throws std::runtime_error when the data tensor is unknown or the axis is outside [-r, r).
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("SOFIE Gather Op Input Tensor " + fNX + " is not found in model");
+      fShapeX = model.GetDimTensorShape(fNX);
+      if (model.Verbose())
+         std::cout << "Gather - initial shape " << ConvertDimShapeToString(fShapeX) << " shape of indices "
+               << ConvertDimShapeToString(model.GetDimTensorShape(fNIndices)) << std::endl;
+      //  fShapeIndices can be  dynamic
+      fShapeIndices = model.GetDimTensorShape(fNIndices);
+      size_t q = fShapeIndices.size();
+      // Axis in range [0, r) where r=rank(X)
+      size_t r = fShapeX.size();
+      // Normalize a negative axis, then reject anything outside [0, r): the code below
+      // indexes fShapeX[fAttrAxis] and sizes the output shape as q + r - 1.
+      if (fAttrAxis < 0) fAttrAxis += int64_t(r);
+      if (fAttrAxis < 0 || size_t(fAttrAxis) >= r)
+         throw std::runtime_error("SOFIE Gather Op axis is out of range for input tensor " + fNX);
+
+      // case indices tensor is initialized
+      if (model.IsInitializedTensor(fNIndices)) {
+          // empty shape Indices is a scalar value for the indices
+         size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices));
+         int64_t* indicesData = static_cast<int64_t*>(model.GetInitializedTensorData(fNIndices).get());
+         // update indices data in case of negative dim values
+         for (size_t i = 0; i < indicesLength; i++) {
+            // move this at generation time?
+            if (!fShapeX[fAttrAxis].isParam) {
+               if (indicesData[i] < 0) {
+                  indicesData[i] += fShapeX[fAttrAxis].dim;
+               }
+            }
+         }
+         // Save in a vector gather Indices of size q
+         fIndices = std::vector<int64_t>(indicesData, indicesData + indicesLength);
+      }
+      // Output shape
+      if (model.Verbose())
+         std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertDimShapeToString(fShapeIndices) << std::endl;
+      if (fShapeY.empty()) {
+         fShapeY.resize(q + r - 1);
+         if (fAttrAxis > 0) {
+            // Copy shape of X[0, ..., axis-1) to Shape of Y[0, ..., axis-1)
+            std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin());
+         }
+         // Set shape of Y[axis, ..., axis + q)
+         for (size_t i = 0; i < q; i++) {
+            fShapeY[fAttrAxis + i] = Dim{ fShapeIndices[i]};
+         }
+         // Copy shape of X[axis + 1, ..., r) to shape of Y[axis + q, ... q + r - 1)
+         std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q);
+      }
+      // case input is known (type is an integer) and the known indices are a scalar or a 1D vector
+      if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) {
+         auto shapeX = ConvertShapeToInt(fShapeX);  // we assume model is not dynamic
+         auto shapeY = ConvertShapeToInt(fShapeY);
+         if (model.GetTensorType(fNX) == ETensorType::INT64) {
+            auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNX).get());
+            // r == 1 and q <= 1: Y holds one value per index, so fold all of them (not only the first)
+            std::vector<int64_t> outputData(fIndices.size());
+            for (size_t i = 0; i < fIndices.size(); i++) outputData[i] = inputData[fIndices[i]];
+            model.AddConstantTensor(fNY, shapeY, outputData.data());
+            if (model.Verbose())
+               std::cout << "Gather: " << fNX << " " << ConvertShapeToString(shapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(shapeY)
+                   << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl;
+            fIsOutputConstant = true;
+         }
+      }
+      // case input is a shape tensor  (r is == 1 by definition) and indices are known
+      else if (model.IsShapeTensor(fNX) && q <=1  && fIndices.size() > 0) {
+         auto inputData = model.GetShapeTensorValues(fNX);
+         // only fIndices[0] is used here. NOTE(review): a 1D indices tensor with more than one entry is not covered by this branch - confirm it cannot occur
+         std::vector<Dim> outputData(1);
+         outputData[0] = inputData[fIndices[0]];
+         if (outputData[0].isParam) {
+            fIsOutputConstant = true;
+            // shapeY can be scalar or vector of size1
+            model.AddShapeTensor(fNY, outputData, fShapeY.size() == 0);
+            if (model.Verbose())
+               std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY)
+                   << " and values " << ConvertDimShapeToString(outputData) << " (shape) " << std::endl;
+         } else {
+            int64_t value = static_cast<int64_t>(outputData[0].dim);
+            auto shapeY = ConvertShapeToInt(fShapeY);
+            model.AddConstantTensor(fNY, shapeY, &value);
+            fIsOutputConstant = true;
+            if (model.Verbose())
+               std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY)
+                   << " and values {" << value <<  "} (constant) " << std::endl;
+         }
+      }
+      if (!fIsOutputConstant) {
+         // Add output tensor
+         model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+         fType = ConvertTypeToString(model.GetTensorType(fNX));
+         if (model.Verbose())
+               std::cout <<  "Gather: input " << fNX << " " << ConvertDimShapeToString(fShapeX) << " indices " << fNIndices << ConvertDimShapeToString(fShapeIndices)
+                         << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << std::endl;
+      }
+   }
+
+   std::string Generate(std::string opName) override {  // returns the generated C++ source of the CPU Gather loop nest for this operator
+      opName = "op_" + opName;
+      std::stringstream out;
+      out << "//--------- Gather " << opName << " --> " << fNY << "  " << ConvertDimShapeToString(fShapeY) << "\n";
+      if (fIsOutputConstant) {
+         // no code to generate here for constant output. Tensor output is defined in Session constructor
+         out << "//--------------------(constant)----------\n";
+         return out.str();
+      }
+      // The rank of the output is q + r - 1, where r is the rank of X
+      size_t r = fShapeX.size();
+      // q is the rank of the indices tensor
+      size_t q = fShapeIndices.size();
+      // Strides
+      auto stridesX = UTILITY::ComputeStrideFromShape(fShapeX);
+      auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY);
+      auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices);
+
+      // case fIndices is not known we need to correct for negative axis indices at run-time
+      if (fIndices.empty()) {
+         auto indicesLength = ConvertDimShapeToLength(fShapeIndices);
+         out << SP << "// correct in case of negative gather indices\n";
+         out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n";
+         out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n";
+         out << SP << SP << SP <<  "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n";  // the emitted code rewrites the indices tensor in place
+         out << SP << "}\n";
+      }
+
+      // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}]
+      // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1)
+      // iterate in [0 ... axis) [0 ... q) [axis ... r - 1)
+      // for j_0, j_1, ..., j_{axis-1}
+
+      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
+         std::string index = "j_" + std::to_string(j);
+         for (size_t k = 0; k <= j; k++) out << SP;  // indent by nesting depth
+         out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n";
+      }
+      // for i_0, i_1, ..., i_{q - 1}
+      for (size_t i = 0; i < q; i++) {
+         std::string index = "i_" + std::to_string(i);
+         for (size_t k = 0; k <= i + fAttrAxis; k++) out << SP;
+         out << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n";
+      }
+      // for j_axis, j_{axis + 1}, ..., j_{r - 1}
+      for (size_t j = fAttrAxis; j + 1 < r; j++) {
+         std::string index = "j_" + std::to_string(q+j); // annotate index using output axis
+         for (size_t k = 0; k <= q + j; k++) out << SP;
+         out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n";
+      }
+
+      // add a scope for local variables in case none of the loops above were emitted
+      if (fAttrAxis == 0 && q == 0 && r <= 1)
+         out << SP << "{   // scalar case \n";
+
+      // output index: sum of every loop variable times its stride in Y
+      for (size_t k = 0; k < q + r; k++) out << SP;
+      out << "size_t y_index = ";
+      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
+         if (j > 0) out << " + ";
+         out << "j_" << j;
+         if (stridesY[j].dim != 1) out << " * " << stridesY[j];  // a stride whose dim is 1 is not written out
+      }
+      for (size_t i = 0; i < q; i++) {
+         if (fAttrAxis + i > 0) out << " + ";
+         out << "i_" << i;
+         if (stridesY[fAttrAxis + i].dim != 1) out << " * " << stridesY[fAttrAxis + i];
+      }
+      for (size_t j = fAttrAxis; j + 1 < r; j++) {
+         if (j + q > 0) out << " + ";
+         out << "j_" << q+j;
+         if (stridesY[q+j].dim != 1) out << " * " << stridesY[q+j];
+      }
+      // empty case: no loop variable was emitted, so the index is the literal 0
+      if (fAttrAxis == 0 && q == 0 && r <= 1)
+         out << "0";
+      out << ";\n";
+
+      // flat index into the indices tensor
+      for (size_t k = 0; k < q + r; k++) out << SP;
+      out << "size_t i_index = ";
+      for (size_t i = 0; i < q; i++) {
+         if (i > 0) out << " + ";
+         out << "i_" << i;
+         if (stridesIndices[i].dim != 1) out << " * " << stridesIndices[i];
+      }
+      // empty case: scalar indices tensor
+      if (q == 0)
+         out << "0";
+      out << ";\n";
+
+      // k: the gathered position along the axis, read from the indices tensor
+      for (size_t k = 0; k < q + r; k++) out << SP;
+      out << "size_t k = static_cast<size_t>(" << "tensor_" << fNIndices << "[i_index]" << ");\n";  // no range check is emitted for k
+      // flat index into the input X
+      for (size_t k = 0; k < q + r; k++) out << SP;
+      out << "size_t x_index = k";
+      if (stridesX[fAttrAxis].dim != 1) out << " * " << stridesX[fAttrAxis];
+      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
+         out << " + ";
+         out << " j_" << j;
+         if (stridesX[j].dim != 1) out << " * " << stridesX[j];
+      }
+      // for input corresponding stride is axis+1,.... r
+      // loop is on j from fAttrAxis, so consider stridesX[j+1]
+      for (size_t j = fAttrAxis; j+1 < r; j++) {
+         out << " + ";
+         out << " j_" << q+j;
+         if (stridesX[j+1].dim != 1) out << " * " << stridesX[j+1];
+      }
+      out << ";\n";
+      for (size_t k = 0; k < q + r; k++) out << SP;
+      out << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n";
+
+      // close the q + r - 1 loops opened above, innermost first
+      for (size_t j = q+r-1; j > 0; j--) {
+         for (size_t k = 0; k <j; k++) out << SP;
+         out << "}\n";
+      }
+      // close empty scope if it was opened
+      if (q == 0 && fAttrAxis == 0 && r <= 1)
+         out << SP << "}   // close Gather scope for scalar case \n";
+
+
+      return out.str();
+   }
+
+std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {  // returns the source of an alpaka kernel functor for this Gather, or "" when the output is constant
+    if (fIsOutputConstant) return "";
+    opName = "op_" + opName;
+    if (fShapeY.empty())
+        throw std::runtime_error("SOFIE Gather Op called to Generate without being initialized first");
+
+    const std::size_t D  = fShapeY.size();   // output rank = q + r - 1
+    const std::size_t r  = fShapeX.size();
+    const std::size_t q  = fShapeIndices.size();
+
+    auto stridesY       = UTILITY::ComputeStrideFromShape(fShapeY);
+    auto stridesX       = UTILITY::ComputeStrideFromShape(fShapeX);
+    auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices);
+
+    std::string kname = "GatherKernel_" + opName;
+
+    std::string op;
+    op  = "\n//------ GATHER_KERNEL_ALPAKA\n";
+    op += SP + "struct " + kname + " {\n";
+    op += SP + SP + "template<typename TAcc, typename T>\n";
+    op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+    op += SP + SP + SP + "TAcc const& acc,\n";
+    op += SP + SP + SP + "T const* __restrict__ input,\n";
+    op += SP + SP + SP + "int64_t const* __restrict__ indices,\n";
+    op += SP + SP + SP + "T* __restrict__ output,\n";
+    op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+    op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+    op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+    op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+    op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";  // each thread starts at its own index and advances by the total thread count
+
+    for (std::size_t d = 0; d < D; ++d) {  // split the flat output index into one coordinate per output dimension
+        op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+            + " = (elem_idx / " + stridesY[d].GetVal() + "u) % "  // NOTE(review): the appended "u" presumably requires GetVal() to return a numeric literal - confirm for parametric dims
+            + fShapeY[d].GetVal() + "u;\n";
+    }
+    op += "\n";
+
+    // Output dims [axis ... axis+q) correspond to the indices tensor dims [0 ... q)
+    // so i_index = sum over i in [0,q): out_{axis+i} * stridesIndices[i]
+    if (q == 0) {
+        op += SP + SP + SP + SP + "std::size_t const i_index = 0u;\n";
+    } else {
+        op += SP + SP + SP + SP + "std::size_t const i_index =\n";
+        for (std::size_t i = 0; i < q; ++i) {
+            op += SP + SP + SP + SP + SP
+                + "out_" + std::to_string(fAttrAxis + i)
+                + " * " + stridesIndices[i].GetVal() + "u";
+            op += (i + 1 < q) ? " +\n" : ";\n";
+        }
+    }
+    op += "\n";
+
+    op += SP + SP + SP + SP + "int64_t k = indices[i_index];\n";
+    op += SP + SP + SP + SP + "if (k < 0) k += " + fShapeX[fAttrAxis].GetVal() + ";\n";  // wrap a negative index once by the axis length
+    op += SP + SP + SP + SP + "if (k < 0) k = 0;\n";  // still negative: clamp to the first element
+    op += SP + SP + SP + SP + "if (k >= static_cast<int64_t>(" + fShapeX[fAttrAxis].GetVal() + ")) "  // too large: clamp to the last element; Generate() emits no such clamp
+        + "k = static_cast<int64_t>(" + fShapeX[fAttrAxis].GetVal() + ") - 1;\n\n";
+
+    // x_index = k * stridesX[axis]
+    //         + sum over j in [0, axis):   out_j          * stridesX[j]
+    //         + sum over j in [axis+1, r): out_{j-1+q}    * stridesX[j]
+    // (the dims after axis in Y are shifted by q-1 relative to X)
+    op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+    op += SP + SP + SP + SP + SP + "static_cast<std::size_t>(k) * " + stridesX[fAttrAxis].GetVal() + "u";
+    for (std::size_t j = 0; j < static_cast<std::size_t>(fAttrAxis); ++j) {
+        op += " +\n" + SP + SP + SP + SP + SP
+            + "out_" + std::to_string(j) + " * " + stridesX[j].GetVal() + "u";
+    }
+    for (std::size_t j = fAttrAxis + 1; j < r; ++j) {
+        // in Y, the coord for X's dim j lives at output dim q + j - 1
+        op += " +\n" + SP + SP + SP + SP + SP
+            + "out_" + std::to_string(q + j - 1) + " * " + stridesX[j].GetVal() + "u";
+    }
+    op += ";\n\n";
+
+    op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n";
+    op += SP + SP + SP + "}\n";  // closes the element loop
+    op += SP + SP + "}\n";  // closes operator()
+    op += SP + "};\n";  // closes the kernel struct
+
+    return op;
+}
+
+std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+    // A constant output has no kernel, so there is no kernel object to declare.
+    if (fIsOutputConstant) return "";
+    const std::string prefixed = "op_" + opName;
+    return SP + "GatherKernel_" + prefixed + " gatherKernel_" + prefixed + ";\n";
+}
+
+std::string Generate_GPU_ALPAKA(std::string opName) override {
+    // Emits the host-side launch code for the Gather kernel; empty when the output is constant.
+    if (fIsOutputConstant) return "";
+    if (fShapeY.empty())
+        throw std::runtime_error("SOFIE Gather Op called to Generate without being initialized first");
+
+    const std::string tag = "op_" + opName;
+    const auto nElements = ConvertDimShapeToLength(fShapeY);
+
+    std::stringstream code;
+    code << "\n//------ GATHER_GPU_ALPAKA\n";
+    code << SP << "auto const elementsPerThread_" << tag << " = Vec::all(static_cast<Idx>(1));\n";
+    code << SP << "auto const elementsPerGrid_" << tag << " = Vec::all(Idx{" << nElements << "});\n";
+    code << SP << "auto const workDiv_" << tag << " = sofie_workdiv(elementsPerGrid_" << tag << ");\n";
+    // Kernel arguments, in order: input buffer, indices buffer, output buffer, element count.
+    code << SP << "auto task_" << tag << " = alpaka::createTaskKernel<Acc>(workDiv_" << tag << ", gatherKernel_" << tag;
+    for (const std::string &tensor : {fNX, fNIndices, fNY})
+        code << ", alpaka::getPtrNative(deviceBuf_" << tensor << ")";
+    code << ", static_cast<Idx>(" << nElements << "));\n";
+    code << SP << "alpaka::enqueue(queue, task_" << tag << ");\n";
+
+    return code.str();
+}
+
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_GATHER
diff --git a/core/inc/SOFIE/ROperator_GatherND.hxx b/core/inc/SOFIE/ROperator_GatherND.hxx
new file mode 100644
index 0000000..ffcdab8
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_GatherND.hxx
@@ -0,0 +1,297 @@
+#ifndef SOFIE_ROPERATOR_GATHERND
+#define SOFIE_ROPERATOR_GATHERND
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <numeric>
+
+namespace SOFIE {
+
+/// Code generator for the ONNX GatherND operator.
+///
+/// With r = rank(data), q = rank(indices), b = batch_dims and
+/// n = indices_shape[-1], every n-element tuple stored along the last axis of
+/// `indices` addresses a slice of `data`. The output shape is
+///    data[0..b-1] + indices[b..q-2] + data[b+n..r-1].
+/// All shapes are static, so every stride and extent is written into the
+/// generated code as a literal.
+class ROperator_GatherND final : public ROperator
+{
+private:
+
+   int64_t fBatchDims = 0; // number of leading dimensions shared by data and indices
+
+   std::string fNData;     // cleaned name of the data tensor
+   std::string fNIndices;  // cleaned name of the indices tensor
+   std::string fNY;        // cleaned name of the output tensor
+
+   std::vector<size_t> fShapeData;
+   std::vector<size_t> fShapeIndices;
+   std::vector<size_t> fShapeY;
+
+   std::string fType;      // element type of the data tensor as returned by ConvertTypeToString
+
+   /// Validate the data / indices shapes against fBatchDims and return the
+   /// output shape. Throws std::runtime_error on any inconsistency.
+   std::vector<size_t> ComputeOutputShape(const std::vector<size_t> &shapeData,
+                                          const std::vector<size_t> &shapeIndices) const
+   {
+      const size_t r = shapeData.size();
+      const size_t q = shapeIndices.size();
+      // a negative batch_dims wraps around to a huge value and is rejected by the range check below
+      const size_t b = static_cast<size_t>(fBatchDims);
+
+      if (r < 1)
+         throw std::runtime_error("SOFIE GatherND: data rank must be >= 1");
+      if (q < 1)
+         throw std::runtime_error("SOFIE GatherND: indices rank must be >= 1");
+      if (b >= std::min(q, r))
+         throw std::runtime_error("SOFIE GatherND: batch_dims must be < min(q, r)");
+
+      // only read the last dimension once q >= 1 is established (back() on an empty vector is undefined)
+      const size_t last_idx_dim = shapeIndices.back();
+      if (last_idx_dim > r - b)
+         throw std::runtime_error("SOFIE GatherND: indices_shape[-1] must be <= r - batch_dims");
+
+      for (size_t i = 0; i < b; ++i) {
+         if (shapeData[i] != shapeIndices[i])
+            throw std::runtime_error("SOFIE GatherND: first batch_dims dimensions of data and indices must match");
+      }
+
+      // Output shape: batch_dims + indices[b..q-2] + data[b + last_idx_dim .. r-1]
+      // rank = b + (q - b - 1) + (r - b - last_idx_dim)
+      std::vector<size_t> shapeY;
+      shapeY.reserve((q - 1) + (r - b - last_idx_dim));
+      shapeY.insert(shapeY.end(), shapeData.begin(), shapeData.begin() + b);
+      shapeY.insert(shapeY.end(), shapeIndices.begin() + b, shapeIndices.end() - 1);
+      shapeY.insert(shapeY.end(), shapeData.begin() + b + last_idx_dim, shapeData.end());
+      return shapeY;
+   }
+
+   /// Throw when a Generate* method is called before Initialize().
+   /// The data shape is used as the marker because a valid output may be a
+   /// scalar, i.e. fShapeY may legitimately be empty after initialization.
+   void CheckInitialized() const {
+      if (fShapeData.empty())
+         throw std::runtime_error("SOFIE GatherND called to Generate without being initialized first");
+   }
+
+public:
+   ROperator_GatherND() {}
+
+   /// \param batchDims   ONNX `batch_dims` attribute
+   /// \param nameData    name of the data tensor
+   /// \param nameIndices name of the (int64) indices tensor
+   /// \param nameY       name of the output tensor
+   ROperator_GatherND(int64_t batchDims,
+                      std::string nameData,
+                      std::string nameIndices,
+                      std::string nameY)
+      : fBatchDims(batchDims),
+        fNData(UTILITY::Clean_name(nameData)),
+        fNIndices(UTILITY::Clean_name(nameIndices)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNData, fNIndices };
+      fOutputTensorNames = { fNY };
+   }
+
+   /// The output has the element type of the data tensor (first input).
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return { input[0] };
+   }
+
+   /// Output shape for input = { data shape, indices shape }.
+   /// Throws std::runtime_error when fewer than two shapes are given or when
+   /// the shapes are not compatible with batch_dims.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      if (input.size() < 2)
+         throw std::runtime_error("SOFIE GatherND: shape inference needs the data and the indices shapes");
+      return { ComputeOutputShape(input[0], input[1]) };
+   }
+
+   /// Read the input shapes from the model, validate them and register the
+   /// output tensor. Members are only updated once validation has succeeded.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNData))
+         throw std::runtime_error("SOFIE GatherND: data tensor " + fNData + " not found in model");
+      if (!model.CheckIfTensorAlreadyExist(fNIndices))
+         throw std::runtime_error("SOFIE GatherND: indices tensor " + fNIndices + " not found in model");
+
+      const std::vector<size_t> shapeData    = model.GetTensorShape(fNData);
+      const std::vector<size_t> shapeIndices = model.GetTensorShape(fNIndices);
+      const std::vector<size_t> shapeY       = ComputeOutputShape(shapeData, shapeIndices);
+
+      fShapeData    = shapeData;
+      fShapeIndices = shapeIndices;
+      fShapeY       = shapeY;
+
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNData), fShapeY);
+      fType = ConvertTypeToString(model.GetTensorType(fNData));
+
+      if (model.Verbose())
+         std::cout << "GatherND: data " << ConvertShapeToString(fShapeData)
+                   << " indices " << ConvertShapeToString(fShapeIndices)
+                   << " batch_dims=" << fBatchDims
+                   << " -> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl;
+   }
+
+   /// Generate the CPU loop: one iteration per output element, which decodes
+   /// the output coordinates, reads the index tuple and copies the addressed
+   /// data element. The index tuple is unrolled at generation time so that
+   /// the emitted code only contains literals and the tensor_<name> pointers.
+   std::string Generate(std::string opName) override {
+      opName = "op_" + opName;
+      CheckInitialized();
+
+      const size_t r = fShapeData.size();
+      const size_t q = fShapeIndices.size();
+      const size_t b = static_cast<size_t>(fBatchDims);
+      const size_t last_idx_dim = fShapeIndices.back();
+
+      const auto stridesData    = UTILITY::ComputeStrideFromShape(fShapeData);
+      const auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices);
+      const auto stridesY       = UTILITY::ComputeStrideFromShape(fShapeY);
+
+      const size_t totalOutput = ConvertShapeToLength(fShapeY);
+      const size_t Dy = fShapeY.size();
+
+      std::stringstream out;
+      out << SP << "//--------- GatherND operator " << opName << "\n";
+
+      out << SP << "for (size_t out_idx = 0; out_idx < " << totalOutput << "; out_idx++) {\n";
+
+      // decode the flat output index into the coordinates oy_0 .. oy_{Dy-1}
+      if (Dy > 0)
+         out << SP << SP << "size_t rem = out_idx;\n";
+      for (size_t d = 0; d < Dy; ++d) {
+         out << SP << SP << "size_t oy_" << d << " = rem / " << stridesY[d] << ";\n";
+         out << SP << SP << "rem %= " << stridesY[d] << ";\n";
+      }
+
+      // offset of the index tuple: output dims 0..q-2 coincide with indices dims 0..q-2
+      out << SP << SP << "size_t idx_base = 0;\n";
+      for (size_t i = 0; i + 1 < q; ++i)
+         out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n";
+
+      // batch part of the data offset
+      out << SP << SP << "size_t data_idx = 0;\n";
+      for (size_t i = 0; i < b; ++i)
+         out << SP << SP << "data_idx += oy_" << i << " * " << stridesData[i] << ";\n";
+
+      // index tuple: element k addresses data axis b + k, negative values count from the end of that axis
+      for (size_t k = 0; k < last_idx_dim; ++k) {
+         const size_t data_axis = b + k;
+         out << SP << SP << "{\n";
+         out << SP << SP << SP << "int64_t idx_val = tensor_" << fNIndices
+             << "[idx_base + " << k * stridesIndices[q - 1] << "];\n";
+         out << SP << SP << SP << "if (idx_val < 0) idx_val += " << fShapeData[data_axis] << ";\n";
+         out << SP << SP << SP << "data_idx += static_cast<size_t>(idx_val) * "
+             << stridesData[data_axis] << ";\n";
+         out << SP << SP << "}\n";
+      }
+
+      // Accumulate trailing data dims from output coords
+      // Y dims [q-1 .. ] correspond to data dims [b + last_idx_dim .. r-1]
+      const size_t y_trailing_start = q - 1;
+      for (size_t i = b + last_idx_dim; i < r; ++i) {
+         const size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim));
+         out << SP << SP << "data_idx += oy_" << oy_dim << " * " << stridesData[i] << ";\n";
+      }
+
+      out << SP << SP << "tensor_" << fNY << "[out_idx] = tensor_" << fNData << "[data_idx];\n";
+      out << SP << "}\n";
+
+      return out.str();
+   }
+
+   /// Generate the alpaka kernel functor `GatherNDKernel_op_<name>`. Each
+   /// thread walks the output elements starting at its global index and
+   /// advancing by the total number of threads in the grid.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      CheckInitialized();
+
+      const size_t r = fShapeData.size();
+      const size_t q = fShapeIndices.size();
+      const size_t b = static_cast<size_t>(fBatchDims);
+      const size_t last_idx_dim = fShapeIndices.back();
+
+      const auto stridesData    = UTILITY::ComputeStrideFromShape(fShapeData);
+      const auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices);
+      const auto stridesY       = UTILITY::ComputeStrideFromShape(fShapeY);
+
+      const size_t Dy = fShapeY.size();
+
+      // indentation levels of the generated code
+      const std::string ind1 = SP;
+      const std::string ind2 = ind1 + SP;
+      const std::string ind3 = ind2 + SP;
+      const std::string ind4 = ind3 + SP;
+      const std::string ind5 = ind4 + SP;
+
+      const std::string kname = "GatherNDKernel_" + opName;
+
+      std::string op;
+      op  = "\n//------ GATHERND_KERNEL_ALPAKA\n";
+      op += ind1 + "struct " + kname + " {\n";
+      op += ind2 + "template<typename TAcc, typename T>\n";
+      op += ind2 + "ALPAKA_FN_ACC void operator()(\n";
+      op += ind3 + "TAcc const& acc,\n";
+      op += ind3 + "T const* __restrict__ data,\n";
+      op += ind3 + "int64_t const* __restrict__ indices,\n";
+      op += ind3 + "T* __restrict__ output,\n";
+      op += ind3 + "std::size_t const totalElements) const {\n\n";
+
+      op += ind3 + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += ind3 + "if (global_thread_idx >= totalElements) return;\n";
+      op += ind3 + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += ind3 + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // output coordinates of the current element
+      for (size_t d = 0; d < Dy; ++d) {
+         op += ind4 + "std::size_t const oy_" + std::to_string(d)
+             + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % "
+             + std::to_string(fShapeY[d]) + "u;\n";
+      }
+      op += "\n";
+
+      // emits "<decl> = oy_0 * s0 + oy_1 * s1 ... ;" over the first `count` output coordinates,
+      // or "<decl> = 0u;" when there is no term
+      auto emitOffsetSum = [&](const std::string &decl, size_t count, const std::vector<size_t> &strides) {
+         op += ind4 + decl + " =\n";
+         for (size_t i = 0; i < count; ++i) {
+            op += ind5 + (i == 0 ? "" : "+ ")
+                + "oy_" + std::to_string(i) + " * " + std::to_string(strides[i]) + "u\n";
+         }
+         if (count == 0) op += ind5 + "0u\n";
+         op += ind5 + ";\n\n";
+      };
+
+      // batch dims: oy_0..oy_{b-1} * stridesIndices[0..b-1]
+      // outer idx dims: oy_b..oy_{q-2} * stridesIndices[b..q-2]  (q==1: single index tuple)
+      emitOffsetSum("std::size_t const idx_base", q - 1, stridesIndices);
+      // batch part of the data offset
+      emitOffsetSum("std::size_t data_idx", b, stridesData);
+
+      op += ind4 + "// Read " + std::to_string(last_idx_dim) + "-element index tuple\n";
+      for (size_t k = 0; k < last_idx_dim; ++k) {
+         const size_t data_axis = b + k;
+         op += ind4 + "{\n";
+         op += ind5 + "int64_t idx_val = indices[idx_base + " + std::to_string(k) + "u];\n";
+         // negative indices count from the end of the addressed data axis
+         op += ind5 + "if (idx_val < 0) idx_val += " + std::to_string(fShapeData[data_axis]) + ";\n";
+         op += ind5 + "data_idx += static_cast<std::size_t>(idx_val) * "
+             + std::to_string(stridesData[data_axis]) + "u;\n";
+         op += ind4 + "}\n";
+      }
+      op += "\n";
+
+      // trailing data dims [b + last_idx_dim .. r-1] map to output dims [q-1 .. ]
+      const size_t y_trailing_start = q - 1;
+      for (size_t i = b + last_idx_dim; i < r; ++i) {
+         const size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim));
+         op += ind4 + "data_idx += oy_" + std::to_string(oy_dim)
+             + " * " + std::to_string(stridesData[i]) + "u;\n";
+      }
+      op += "\n";
+
+      op += ind4 + "output[elem_idx] = data[data_idx];\n";
+      op += ind3 + "}\n";
+      op += ind2 + "}\n";
+      op += ind1 + "};\n";
+
+      return op;
+   }
+
+   /// Emit the declaration of the kernel functor instance
+   /// `GatherNDKernel_op_<name> gatherNDKernel_op_<name>;`.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      std::string kname = "GatherNDKernel_" + opName;
+      return SP + kname + " gatherNDKernel_" + opName + ";\n";
+   }
+
+   /// Emit the host code that launches the kernel on `queue` and then waits
+   /// for the queue.
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      CheckInitialized();
+
+      std::size_t totalElements = ConvertShapeToLength(fShapeY);
+      std::string kname = "gatherNDKernel_" + opName;
+
+      std::stringstream out;
+      out << "\n//------ GATHERND_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"   << opName << " = Vec::all(Idx{" << totalElements << "});\n";
+      out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      out << SP << "alpaka::exec<Acc>(queue, workDiv_" << opName
+          << ", " << kname
+          << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast<Idx>(" << totalElements << "));\n";
+      out << SP <<"alpaka::wait(queue);\n";
+      return out.str();
+   }
+};
+
+} // SOFIE
+
+#endif // SOFIE_ROPERATOR_GATHERND
diff --git a/core/inc/SOFIE/ROperator_Gemm.hxx b/core/inc/SOFIE/ROperator_Gemm.hxx
new file mode 100644
index 0000000..eecb33b
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Gemm.hxx
@@ -0,0 +1,860 @@
+#ifndef SOFIE_ROPERATOR_GEMM
+#define SOFIE_ROPERATOR_GEMM
+
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <algorithm>
+#include <iterator>
+#include <iomanip>
+#include <limits>
+#include <cassert>
+
+
+namespace SOFIE{
+
+
+   template <typename T>
+   class ROperator_Gemm final : public ROperator
+   {
+
+   private:
+      bool fIsDynamic = false;
+      bool fBroadcastBias = false;
+      bool fCheckBiasShapeAtRuntime = false; // flag to identify the need to do a run time check of bias shape compatibility in case of dynamic shapes and uni-directional broadcasting
+
+      float fAttrAlpha = 1.0;
+      float fAttrBeta = 1.0;
+      int_t fAttrTransA = 0;
+      int_t fAttrTransB = 0;
+
+      std::string fNA;
+      std::string fNB;
+      std::string fNC = "";
+      std::string fNY;
+      std::string fType;
+      EActivationType fActivation;
+      float fLeakyReluAlpha = 0.01f;   // used when fActivation == LEAKYRELU
+      std::vector<Dim> fShapeA;
+      std::vector<Dim> fShapeB;
+      std::vector<size_t> fShapeC;
+      std::vector<Dim> fDimShapeC;
+      std::vector<Dim> fShapeY;
+      RModel * fModel = nullptr;
+
+   public:
+
+      /// Default constructor. fActivation has no default member initializer, so it is
+      /// set here to avoid reading an indeterminate value later on.
+      ROperator_Gemm() : fActivation(EActivationType::UNDEFINED) {}
+      /// Constructor for an operator with the two inputs A and B and no bias tensor.
+      /// Tensor names are stored in their cleaned form; only T = float is accepted.
+      ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameY, EActivationType activation=EActivationType::UNDEFINED):
+         fAttrAlpha(alpha),
+         fAttrBeta(beta),
+         fAttrTransA(transA),
+         fAttrTransB(transB),
+         fNA(UTILITY::Clean_name(nameA)),
+         fNB(UTILITY::Clean_name(nameB)),
+         fNY(UTILITY::Clean_name(nameY)),
+         fType("float"),
+         fActivation(activation)
+      {
+         static_assert(std::is_same_v<T, float>,
+                  "TMVA::SOFIE - Unsupported type parsing a Gemm operator");
+         fKind = OperatorKind::GEMM;
+         fOutputTensorNames = { fNY };
+         fInputTensorNames = { fNA, fNB };
+      }
+
+      /// Constructor for an operator with the inputs A and B plus the bias tensor C.
+      /// Tensor names are stored in their cleaned form.
+      ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED):
+         fAttrAlpha(alpha),
+         fAttrBeta(beta),
+         fAttrTransA(transA),
+         fAttrTransB(transB),
+         fNA(UTILITY::Clean_name(nameA)),
+         fNB(UTILITY::Clean_name(nameB)),
+         fNC(UTILITY::Clean_name(nameC)),
+         fNY(UTILITY::Clean_name(nameY)),
+         fType("float"),
+         fActivation(activation)
+      {
+         fKind = OperatorKind::GEMM;
+         fOutputTensorNames = { fNY };
+         fInputTensorNames = {fNA, fNB, fNC};
+      }
+
+      /// The output tensor has the element type of the first input (A).
+      std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+         return std::vector<ETensorType>{ input[0] };
+      }
+
+      /// Shape inference shared by the static (U = size_t) and the dynamic (U = Dim) code paths.
+      ///
+      /// \param input shapes of A and B, optionally followed by the shape of C; every shape
+      ///              must have at least two dimensions
+      /// \return the shape of C when three shapes are given; otherwise the (broadcast) leading
+      ///         dimensions followed by the two matrix dimensions (rows of op(A), columns of op(B))
+      /// \throws std::runtime_error for a wrong number of inputs, for an input of rank < 2 or
+      ///         for leading dimensions that cannot be combined
+      template <typename U>
+      std::vector<U> DoShapeInference(const std::vector<std::vector<U>> & input){
+         // A and B are mandatory, C is optional
+         if (input.size() < 2 || input.size() > 3)
+            throw std::runtime_error("SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor");
+         // accept tensor with input dimensions > 2
+         // example: A = (d1,d2,...,N1,N2)  B = (d1,d2,...,N2,N3)    --> Y = (d1,d2,..,N1,N3)
+         for (auto& i: input){
+            if (i.size() < 2){
+               throw std::runtime_error("SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions");
+            }
+         }
+
+         // when there are 3 inputs shape of Y is the one of C
+         if (input.size() == 3){
+            //shape of C is shape of Y
+            return input[2];
+         }
+
+         const std::vector<U> & shapeA = input[0];
+         const std::vector<U> & shapeB = input[1];
+
+         // the last two dimensions of each input are the matrix dimensions; the rank check above
+         // guarantees that both ranges are valid, also when A and B have different ranks
+         std::vector<U> s_a(shapeA.end() - 2, shapeA.end());
+         std::vector<U> s_b(shapeB.end() - 2, shapeB.end());
+         // reverse in case of transpose
+         if (fAttrTransA){
+            std::reverse(s_a.begin(), s_a.end());
+         }
+         if (fAttrTransB){
+            std::reverse(s_b.begin(), s_b.end());
+         }
+         std::vector<U> s_y;
+         s_y.reserve(shapeA.size());
+         if (shapeA.size() > 2 && shapeB.size() == shapeA.size()) {
+            // in case of dim > 2 first dimensions are equal to the input ones not
+            // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4))
+            // here could probably use the Broadcasting function  UTILITY::MultidirectionalBroadcastShape
+            for (size_t i = 0; i < shapeA.size()-2; i++) {
+               Dim valueA = shapeA[i];
+               Dim valueB = shapeB[i];
+               if (valueA.GetVal() == valueB.GetVal()) {
+                  s_y.push_back(shapeA[i]);
+                  continue;
+               }
+               if (valueB.GetVal() == "1")
+                  s_y.push_back(shapeA[i]);
+               else if (valueA.GetVal() == "1")
+                  s_y.push_back(shapeB[i]);
+               else if (!valueA.isParam && !valueB.isParam)
+                  // two different fixed values, none of them 1
+                  throw std::runtime_error("SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and "
+                     + valueB.GetVal());
+               else if (valueA.isParam && valueB.isParam){
+                  // check which parameter is first in RModel list
+                  if (!fModel)
+                     throw std::runtime_error("SOFIE Gemm Op - cannot compare the shape parameters " + valueA.GetVal()
+                        + " and " + valueB.GetVal() + " before the operator is initialized");
+                  auto & dimNames = fModel->GetDimShapeNames();
+                  auto p1 = std::find(dimNames.begin(), dimNames.end(), valueA.param);
+                  auto p2 = std::find(dimNames.begin(), dimNames.end(), valueB.param);
+                  if (p1 < p2) s_y.push_back(shapeA[i]);
+                  else  s_y.push_back(shapeB[i]);
+               }
+               else
+                  // exactly one of the two is a parameter: keep the fixed value
+                  s_y.push_back(valueA.isParam ? shapeB[i] : shapeA[i]);
+            }
+         }
+
+         s_y.push_back(s_a[0]);
+         s_y.push_back(s_b[1]);
+         return s_y;
+      }
+
+      /// Static shape inference: returns a one-element list holding the output shape
+      /// computed by DoShapeInference<size_t>.
+      std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+         return std::vector<std::vector<size_t>>{ DoShapeInference<size_t>(input) };
+      }
+      /// Shape inference on Dim shapes (which may hold named parameters); forwards to DoShapeInference<Dim>.
+      std::vector<Dim> DynamicShapeInference(const std::vector<std::vector<Dim>> & input){
+         std::vector<Dim> shapeY = DoShapeInference<Dim>(input);
+         return shapeY;
+      }
+
+
+
+      /// Resolve the shapes of A, B (and C when present), compute the output shape, decide
+      /// whether the bias must be broadcast and register the output tensor in the model.
+      /// Throws std::runtime_error when an input tensor is unknown to the model or when the
+      /// bias cannot be broadcast uni-directionally to the output shape.
+      void Initialize(RModel& model) override {
+         //TODO: propagate A or B as specified by ONNX standard
+         fModel = &model;
+
+         const bool hasBias = !fNC.empty();
+
+         // inputs must be graph inputs or already initialized intermediate tensors
+         if (!model.CheckIfTensorAlreadyExist(fNA) || !model.CheckIfTensorAlreadyExist(fNB)) {
+            throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model");
+         }
+         if (hasBias && !model.CheckIfTensorAlreadyExist(fNC)) {
+            throw std::runtime_error("SOFIE Gemm Op Input Tensor " + fNC + " is not found in model");
+         }
+
+         // shape of A
+         if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA)) {
+            fShapeA = model.GetDynamicTensorShape(fNA);
+            fIsDynamic = true;
+         } else {
+            fShapeA = ConvertShapeToDim(model.GetTensorShape(fNA));
+         }
+         // a 1-d A gets a leading 1, which is removed again from the output shape below
+         const bool prependOne = (fShapeA.size() == 1);
+         if (prependOne) {
+            fShapeA.insert(fShapeA.begin(), Dim(1));
+         }
+
+         // shape of B
+         if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) {
+            fShapeB = model.GetDynamicTensorShape(fNB);
+            fIsDynamic = true;
+         } else {
+            fShapeB = ConvertShapeToDim(model.GetTensorShape(fNB));
+         }
+         // a 1-d B gets a trailing 1, which is removed again from the output shape below
+         const bool appendOne = (fShapeB.size() == 1);
+         if (appendOne) {
+            fShapeB.push_back(Dim(1));
+         }
+
+         // MatMul case with stacked matrices (see numpy.matmul): the input of lower rank
+         // is padded with leading 1s so that both have the same rank
+         const size_t rankA = fShapeA.size();
+         const size_t rankB = fShapeB.size();
+         if (rankA < rankB) {
+            fShapeA.insert(fShapeA.begin(), rankB - rankA, Dim(1));
+         } else if (rankB < rankA) {
+            fShapeB.insert(fShapeB.begin(), rankA - rankB, Dim(1));
+         }
+
+         fShapeY = DynamicShapeInference({fShapeA, fShapeB});
+         std::vector<size_t> shapeY = ConvertShapeToInt(fShapeY);
+
+         // bias is normally not dynamic (not support it for time being)
+         if (hasBias) {
+            if (model.IsDynamicTensor(fNC)) {
+               fDimShapeC = model.GetDynamicTensorShape(fNC);
+            } else {
+               fShapeC = model.GetTensorShape(fNC);
+               fDimShapeC = ConvertShapeToDim(fShapeC);
+            }
+            // for dynamic outputs broadcasting is always needed;
+            // otherwise it is needed whenever the shapes differ (also in length)
+            const bool broadcast_needed = (fIsDynamic && shapeY.empty()) || (fShapeC != shapeY);
+
+            if (broadcast_needed) {
+               fBroadcastBias = true;
+               // check if broadcasting is compatible and note that prepend 1 to shapeC
+               auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, fDimShapeC);
+               // return flag must not have bit equal to 2 since this is a unidirectional broadcast of C->Y
+               const bool needsBroadcastOfY = ((r.first & 2) == 2);
+               if (needsBroadcastOfY) {
+                  throw std::runtime_error("SOFIE Gemm Op - bias tensor of shape " + ConvertDimShapeToString(fDimShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY));
+               }
+               if (r.first == 4) {
+                  // we need to do a run time check of bias shape if it is compatible
+                  fCheckBiasShapeAtRuntime = true;
+               }
+               fShapeC = ConvertShapeToInt(fDimShapeC);
+            }
+         }
+
+         // remove appended or prepended value of 1 in Y
+         // NOTE(review): in the dynamic case the padded dimension is removed from fShapeY itself
+         // while fShapeA / fShapeB keep it, and Generate() rejects dimA != dimY - confirm that
+         // dynamic 1-d inputs are handled as intended.
+         if (prependOne) {
+            if (fIsDynamic)
+               fShapeY.erase(fShapeY.begin());
+            else
+               shapeY.erase(shapeY.begin());
+         }
+         if (appendOne) {
+            if (fIsDynamic)
+               fShapeY.pop_back();
+            else
+               shapeY.pop_back();
+         }
+
+         // register the output tensor
+         if (fIsDynamic)
+            model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY);
+         else
+            model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY);
+
+         if (model.Verbose()){
+            std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape ";
+            const std::string shapeText = fIsDynamic ? ConvertDimShapeToString(fShapeY) : ConvertShapeToString(shapeY);
+            std::cout << shapeText << std::endl;
+         }
+
+         model.AddNeededStdLib("algorithm");
+      }
+
+      std::string Generate(std::string opName) override {
+         opName = "op_" + opName;
+
+         // if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) {
+         //    throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first");
+         // }
+         std::stringstream out;
+         out << "\n//--------- Gemm " << opName << " " << ConvertDimShapeToString(fShapeA) << " * " << ConvertDimShapeToString(fShapeB)
+             << " -> " << ConvertDimShapeToString(fShapeY) << "\n";
+         // need to consider case A and B have dim > 2 (for MatMul)
+         int64_t dimA = fShapeA.size();
+         int64_t dimB = fShapeB.size();
+         int64_t dimY = fShapeY.size();
+         int64_t dimC = fDimShapeC.size();
+         if (dimA != dimB || dimA != dimY || (fBroadcastBias && dimC != dimY)) {
+             std::cout << " shape A " << ConvertDimShapeToString(fShapeA)
+                       << " shape B " << ConvertDimShapeToString(fShapeB)
+                       << " shape C " << ConvertDimShapeToString(fDimShapeC)
+                       << " shape Y " << ConvertDimShapeToString(fShapeY) << std::endl;
+             throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output");
+         }
+         auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal());
+         auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal());
+         auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal());
+         // size of A: if (transposeA) is m*k else k*m
+         // size of B  n*k
+         std::vector<Dim> sY = {fShapeY[dimY-2], fShapeY[dimY-1]};
+         // extra dimensions in case of stacked MatMul
+         std::vector<Dim> sExtraY;
+         for (int64_t i = 0; i < dimY-2; i++) {
+            sExtraY.push_back(fShapeY[i]);
+         }
+         auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation
+         auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul)
+         std::string lengthExtra_C;
+         std::vector<Dim> sExtraC;
+         std::vector<Dim> sC;
+         bool haveExtraC = false;
+         if (dimC > 2) {
+            sC = {fDimShapeC[dimC-2], fDimShapeC[dimC-1]};
+            for (int64_t i = 0; i < dimC-2; i++) {
+               sExtraC.push_back(fDimShapeC[i]);
+            }
+            lengthExtra_C = ConvertDimShapeToLength(sExtraC);
+            if (lengthExtra_C != "1") haveExtraC = true;
+         } else if (dimC > 0) {
+            for (int64_t i = 0; i < dimC; i++) {
+               sC.push_back(fDimShapeC[i]);
+            }
+         }
+
+         // case bias is present
+         if (!fNC.empty()){
+             // when the 2 last dims of bias and Y are not compatible we need to perform a run time broadcast
+            if (sC != sY) fBroadcastBias = true;
+            if (!fBroadcastBias) {
+               // add a check in case broadcasting was not needed or done outside of session
+               // C should have smaller dimension of Y
+               if (!fIsDynamic) {
+                  if ((std::stoi(lengthGemm) != std::stoi(ConvertDimShapeToLength(sC))) ||
+                      ( haveExtraC &&  std::stoi(lengthExtra_Y) != std::stoi(lengthExtra_C)))
+                     throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor " + fNC + " has not correct size "
+                            + ConvertShapeToString(fShapeC) + " output length " + lengthGemm);
+               } else {
+                  // add a dynamic check (C should not be a dynamic tensor)
+                  out << SP << "assert(" << lengthGemm << " == " <<  ConvertDimShapeToLength(sC) << ");\n";
+                  if (haveExtraC) out << SP << "assert(" << lengthExtra_Y << " == " <<  lengthExtra_C << ");\n";
+               }
+            }
+         } else {
+            fBroadcastBias = false;
+            //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use
+            // the previous result
+            if (fAttrBeta != 0) {
+               // some model don't have bias but Beta is not zero - force it to zero
+               fAttrBeta = 0;
+               std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n";
+            }
+         }
+
+         // include MatMul case where we stack the Gemm operations
+         // exclude case where we have only 1's in the additional dims
+         bool doStackMul = dimY > 2 && ( fIsDynamic  || std::stoi(lengthExtra_Y) > 1);
+         // compute input offset for stack multiplications
+         std::string lengthExtra_A;
+         std::string lengthExtra_B;
+         std::string increment_A;
+         std::string increment_B;
+
+         if (doStackMul) {
+            std::vector<Dim> sA(fShapeA.begin(), fShapeA.begin()+dimA-2);
+            std::vector<Dim> sB(fShapeB.begin(), fShapeB.begin()+dimB-2);
+            std::vector<Dim> mA = {fShapeA[dimA-2], fShapeA[dimA-1]};
+            std::vector<Dim> mB = {fShapeB[dimB-2], fShapeB[dimB-1]};
+            lengthExtra_A = ConvertDimShapeToLength(sA);
+            lengthExtra_B = ConvertDimShapeToLength(sB);
+            // if A ( b, m, k) and B (b, k, n) these are the strides of A and B ( m*k for A and n*k for B )
+            increment_A = ConvertDimShapeToLength(mA);
+            increment_B = ConvertDimShapeToLength(mB);
+         }
+         bool extraA = (doStackMul && lengthExtra_A != "1");
+         bool extraB = (doStackMul && lengthExtra_B != "1");
+         bool extraC = (doStackMul && haveExtraC && !fBroadcastBias);
+         // run time check for bias broadcasting
+         std::string biasShapeType = opName + "_biasShapeType";
+         if (fBroadcastBias && fCheckBiasShapeAtRuntime) {
+            // create a flag according to bias shape:
+            // = 1 for (1,Y2)
+            // = 2 for (Y1,1)
+            // = 3 for a scalar
+            out << SP << "int " << biasShapeType << " = 0;\n";
+            // case vector of columns
+            if (sC[0].GetVal() != "1" && sC[1].GetVal() != sY[1].GetVal())
+               out << SP << "if (" << sC[0] << " == 1 && " << sC[1] << " == " << sY[1] << ")\n";
+            else if (sC[0].GetVal() == "1")
+               out << SP << "if (" << sC[1] << " == " << sY[1] << ")\n";
+            else if (sC[1].GetVal() == sY[1].GetVal())
+               out << SP << "if (" << sC[0] << " == 1)\n";
+
+            out << SP << SP << biasShapeType << " = 1;\n";
+
+            // case vector of rows
+            if (sC[1].GetVal() != "1" && sC[0].GetVal() != sY[0].GetVal())
+               out << SP << "else if (" << sC[1] << " == 1 && " << sC[0] << " == " << sY[0] << ")\n";
+            else if (sC[1].GetVal() == "1")
+                out << SP << "else if (" << sC[0] << " == " << sY[0] << ")\n";
+            else if (sC[0].GetVal() == sY[0].GetVal())
+               out << SP << "else if (" << sC[1] << " == 1)\n";
+
+            out << SP << SP << biasShapeType << " = 2;\n";
+
+            // case scalar
+            if (sC[0].GetVal() != "1" && sC[1].GetVal() != "1")
+               out << SP << "else if (" << sC[0] << " == 1 && " << sC[1] << " == 1 )\n";
+            else if (sC[0].GetVal() == "1")
+               out << SP << "else if (" << sC[1] << " == 1)\n";
+            else if (sC[1].GetVal() == "1")
+               out << SP << "else if (" << sC[0] << " == 1)\n";
+            out << SP << SP << biasShapeType << " = 3;\n";
+            out << SP << "else\n";
+            out << SP << SP << "throw std::runtime_error(\"SOFIE Gemm Op - bias tensor "
+                                 << ConvertDimShapeToString(fDimShapeC) << " cannot be broadcasted to "
+                                 << ConvertDimShapeToString(fShapeY) << "\");\n";
+         }
+         auto SP2 = SP;
+         if (doStackMul) {
+            out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations
+            if (extraA)
+               out << SP << "size_t " << opName << "_A_offset = 0;\n";
+            if (extraB)
+               out << SP << "size_t " << opName << "_B_offset = 0;\n";
+            if (extraC)
+               out << SP << "size_t " << opName << "_C_offset = 0;\n";
+            out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n";
+            SP2 += SP;
+         }
+         // do the bias broadcasting at run time by
+         // initializing output Y vector with bias values
+         if (fBroadcastBias) {
+
+            fAttrBeta = 1.;
+
+            // loop on first output dimension
+            out << SP2 << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n";
+            out << SP2 << SP << "size_t y_index = ";
+            if (doStackMul) // add offset in case of stack multiplications (not sure if bias is present in these cases)
+               out <<  opName << "_y_offset + ";
+            if (sY[1].GetVal() != "1")
+               out << sY[1] << " * j;\n";
+            else
+               out << "j;\n";
+
+            std::string prefix = SP2 + SP + "SOFIE::";
+            std::string target = "tensor_" + fNY;
+            if (sC.size() != 2) {
+               throw std::runtime_error("SOFIE Gemm Op - invalid rank for bias tensor " + ConvertDimShapeToString(fDimShapeC) + ConvertDimShapeToString(sC));
+            } if (sC[0].GetVal() == "1" && sC[1].GetVal() == sY[1].GetVal()) {
+               out << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n";
+            } else if (sC[1].GetVal() == "1" && sC[0].GetVal() == sY[0].GetVal()) {
+               out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n";
+            } else if (sC[0].GetVal() == "1" && sC[1].GetVal() == "1") {
+               // scalar case
+               out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n";
+            } else if (fCheckBiasShapeAtRuntime) {
+               // in the generic dynamic case we check at run time that bias is compatible
+               // we check that bias[0] = 1 or equal to SY[0] and that bias[1] = 1 or equal to SY[1]
+               // tbd: this run-time check could be moved outside the loop for better run time efficiency
+               out << SP2 << SP << "if (" << biasShapeType << " == 1)\n";   // case vector of columns
+               out << SP << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n";
+               out << SP2 << SP << "else if (" << biasShapeType << " == 2)\n";  // case vector of rows
+               out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n";
+               out << SP2 << SP << "else \n";  // scalar case
+               out << SP << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n";
+            } else {
+               throw std::runtime_error("SOFIE Gemm Op - invalid shape for bias tensor " + ConvertDimShapeToString(fDimShapeC));
+            }
+
+            out << SP2 << "}\n";
+         }
+
+         if (fType == "float"){
+
+            out << SP2 << "SOFIE::Gemm_Call(" << "tensor_" << fNY;
+             if (doStackMul) out << " + " << opName << "_y_offset";
+            out <<   ", "
+             << (fAttrTransB ? "true, " : "false, ")
+             << (fAttrTransA ? "true, " : "false, ")
+             << n << ", " << m << ", " << k << ", ";
+            out << std::setprecision(std::numeric_limits<float>::max_digits10) << fAttrAlpha << ", tensor_" << fNB;
+            if (extraB) out << " + " << opName << "_B_offset";
+            out << ", tensor_" << fNA;
+            if (extraA) out << " + " << opName << "_A_offset";
+            out << ", " << std::setprecision(std::numeric_limits<float>::max_digits10) << fAttrBeta << ",";
+            // in the case of bias and no broadcasting needed - I need to add bias as an extra tensor in Gemm call
+            if (!fNC.empty() && !fBroadcastBias) {
+               out << "tensor_" << fNC;
+               if (extraC) {
+                  out << " + " << opName << "_C_offset";
+               }
+            } else {
+               out << "nullptr";
+            }
+            out << ");\n";
+
+         }
+
+         if (doStackMul) {
+            out << SP << SP <<  opName << "_y_offset += " << lengthGemm << ";\n";
+            if (lengthExtra_A != "1")
+               out << SP << SP << opName << "_A_offset += " << increment_A << ";\n";
+            if (lengthExtra_B != "1")
+               out << SP << SP << opName << "_B_offset += " << increment_B << ";\n";
+            if (extraC)
+               // increment_C is lengthGemm
+               out << SP << SP << opName << "_C_offset += " << lengthGemm << ";\n";
+            out << SP << "}\n"; // end of loop on the stacked multiplication
+         }
+
+         // fuse activation with GEMM output (in-place on fNY)
+         if (fActivation == EActivationType::RELU) {
+               out << SP << "//--- applying RELU to output\n";
+               std::string tnsr = "tensor_" + fNY;
+               std::string reluSize = ConvertDimShapeToLength(fShapeY);
+               out << SP << "SOFIE::Relu(" << tnsr << ", " << tnsr << ", " << reluSize << ");\n";
+         } else if (fActivation == EActivationType::LEAKYRELU) {
+               out << SP << "//--- applying LEAKYRELU to output (in-place)\n";
+               std::string tnsr = "tensor_" + fNY;
+               std::string reluSize = ConvertDimShapeToLength(fShapeY);
+               out << SP << "{\n";
+               out << SP << SP << "constexpr float lrelu_alpha = " << std::setprecision(std::numeric_limits<float>::max_digits10) << fLeakyReluAlpha << "f;\n";
+               out << SP << SP << "for (size_t _i = 0; _i < " << reluSize << "; ++_i)\n";
+               out << SP << SP << SP << tnsr << "[_i] = " << tnsr << "[_i] >= 0.f ? " << tnsr << "[_i] : lrelu_alpha * " << tnsr << "[_i];\n";
+               out << SP << "}\n";
+         }
+
+         return out.str();
+      }
+
+      /// Generate the C++ source that runs this Gemm / stacked MatMul on a GPU through
+      /// the ALPAKA backend and the sofieBLAS `blas` object.
+      ///
+      /// Exactly one kind of BLAS call is emitted:
+      ///   - blas.gemmStridedBatched   : static stacked MatMul, B varies per batch, no bias
+      ///   - blas.gemm / blas.gemmrelu : a bias tensor is present (gemmrelu when RELU is fused)
+      ///   - blas.matmul               : no bias tensor
+      /// Static stacked products whose B operand is shared by the whole batch are collapsed
+      /// into a single call; the remaining stacked cases are wrapped in an emitted `for` loop.
+      /// A fused LEAKYRELU is emitted as a separate in-place kernel after the BLAS call.
+      ///
+      /// Side effects: resets fBroadcastBias (and fAttrBeta when non-zero) if no bias is present.
+      ///
+      /// \param opName operator name; "op_" is prepended to form the emitted identifiers
+      /// \return the generated code
+      /// \throws std::runtime_error if the operator was not initialized, if A, B and Y have
+      ///         different ranks, or if a static non-broadcast bias does not match the output size
+      std::string Generate_GPU_ALPAKA(std::string opName) override {
+         opName = "op_" + opName;
+
+         if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fDimShapeC.empty())) {
+            throw std::runtime_error("SOFIE Gemm Op called to Generate without being initialized first");
+         }
+         std::stringstream out;
+         out << "\n//--------- Gemm_GPU_ALPAKA\n";
+         // Note: alpaka::wait(queue) intentionally removed here.
+         // Operations are enqueued asynchronously on the Alpaka queue's CUDA
+         // stream.  Synchronisation only happens once per inference at the
+         // alpaka::wait(queue) call in _infer_impl's tail and at the
+         // cudaDeviceSynchronize in the benchmark harness.  Adding a wait
+         // before every GEMM stalls the CPU<->GPU pipeline and is the primary
+         // cause of SOFIE being slower than ONNXRuntime.
+         out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n";
+         out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n";
+         // need to consider case A and B have dim > 2 (for MatMul)
+         int64_t dimA = fShapeA.size();
+         int64_t dimB = fShapeB.size();
+         int64_t dimY = fShapeY.size();
+         if (dimA != dimB || dimA != dimY) {
+             throw std::runtime_error("SOFIE Gemm(MatMul) has invalid shape for inputs or output");
+         }
+         auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal());
+         auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal());
+         auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal());
+         std::vector<Dim> sY = {fShapeY[dimY-2], fShapeY[dimY-1]};
+         // extra dimensions in case of stacked MatMul
+         std::vector<Dim> sA;
+         for (int64_t i = 0; i < dimY-2; i++) {
+            sA.push_back(fShapeY[i]);
+         }
+         auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation
+         auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul)
+
+         out << SP << "int " << opName << "_m = " << m << ";\n";
+         out << SP << "int " << opName << "_n = " << n << ";\n";
+         out << SP << "int " << opName << "_k = " << k << ";\n";
+         out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits<float>::max_digits10) << fAttrAlpha << ";\n";
+
+         // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface
+         out << SP << "float " << opName << "_beta = 0;\n";
+
+         // case bias is present
+         if (!fNC.empty()){
+            if (!fBroadcastBias) {
+               // add a check in case broadcasting was not needed or done outside of session
+               // C should have same size as Y
+               if (!fIsDynamic) {
+                  if (std::stoi(lengthGemm) != static_cast<int>(ConvertShapeToLength(fShapeC)))
+                     throw std::runtime_error("SOFIE Gemm Op " + opName + " Bias tensor has not correct size "
+                            + ConvertDimShapeToString(fDimShapeC) + " output length " + lengthGemm);
+               } else {
+                  // add a dynamic check (C should equal output size)
+                  out << SP << "assert(" << lengthGemm << " == " <<  ConvertDimShapeToLength(fDimShapeC) << ");\n";
+               }
+            }
+         } else {
+            fBroadcastBias = false;
+            //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use
+            // the previous result
+            if (fAttrBeta != 0) {
+               // some model don't have bias but Beta is not zero - force it to zero
+               fAttrBeta = 0;
+               std::cout << "WARNING: SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero - force it to zero\n";
+            }
+         }
+
+         // include MatMul case where we stack the Gemm operations
+         // exclude case where we have only 1's in the additional dims
+         bool doStackMul = dimY > 2 && ( fIsDynamic  || std::stoi(lengthExtra) > 1);
+
+         // Compute per-iteration strides for each buffer when stacking.
+         // m/n/k are std::string from Dim::GetVal(); stoi() is only used for static shapes.
+         size_t strideA = 0, strideB = 0, strideY = 0, strideC = 0;
+         // GPU optimisation flags (static shapes only):
+         //   batchCollapseB  — strideB==0: B is the shared weight, so replace the N-iteration
+         //                     loop with a single GEMM whose batch dimension is folded into
+         //                     the "n_sofie" parameter (n_sofie = m_onnx * N).
+         //   useSBatched     — B varies per batch AND no bias: use blas.gemmStridedBatched
+         //                     so that all N GEMMs are issued in one call.
+         //                     (Only the no-bias case is routed here; ops with a bias use the
+         //                      gemm/gemmrelu calls below.)
+         bool batchCollapseB = false;
+         bool useSBatched    = false;
+         if (doStackMul && !fIsDynamic) {
+            // An operand whose leading dims (beyond the 2 matrix dims) are all 1 is shared by
+            // every stacked GEMM.  Its stride must then be 0 so each iteration reads the same
+            // slice; using the full matrix size would step past the end of the buffer.
+            bool aLeadingDimsAllOne = true;
+            for (int64_t i = 0; i < dimA - 2; i++) {
+               if (fShapeA[i].dim != 1) { aLeadingDimsAllOne = false; break; }
+            }
+            bool bLeadingDimsAllOne = true;
+            for (int64_t i = 0; i < dimB - 2; i++) {
+               if (fShapeB[i].dim != 1) { bLeadingDimsAllOne = false; break; }
+            }
+            strideA = aLeadingDimsAllOne ? 0
+                                         : static_cast<size_t>(std::stoi(m)) * static_cast<size_t>(std::stoi(k));
+            strideB = bLeadingDimsAllOne ? 0
+                                         : static_cast<size_t>(std::stoi(n)) * static_cast<size_t>(std::stoi(k));
+            strideY = static_cast<size_t>(std::stoi(m)) * static_cast<size_t>(std::stoi(n));
+            strideC = !fNC.empty() ? static_cast<size_t>(std::stoi(lengthGemm)) : 0;
+
+            batchCollapseB = (strideB == 0);
+            useSBatched    = !batchCollapseB && fNC.empty();
+         }
+
+         // Emit the loop only for the serial fallback path (dynamic shapes, or static
+         // shapes where B varies per iteration AND a bias is present).  Dynamic stacked
+         // shapes always end up here because both flags above stay false for them.
+         // NOTE(review): for dynamic shapes no per-iteration pointer offset is added below,
+         // so every emitted iteration addresses the same buffers, and the emitted
+         // <op>_yoffset variable is never used — confirm whether offsets are still to be done.
+         bool useSerialLoop = doStackMul && !batchCollapseB && !useSBatched;
+         if (useSerialLoop) {
+            out << SP << "size_t " << opName << "_yoffset = 0;\n";
+            out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n";
+         }
+
+         // Use getPtrNative() for all args so the raw-pointer overload is selected
+         // regardless of whether each buffer is a BufXxx or ViewPlainPtr.
+         // For the loop path, add per-iteration offsets; for the collapsed/batched
+         // paths, use base pointers (the whole contiguous tensor is processed at once).
+         std::string pA = "alpaka::getPtrNative(deviceBuf_" + fNA + ")";
+         std::string pB = "alpaka::getPtrNative(deviceBuf_" + fNB + ")";
+         std::string pY = "alpaka::getPtrNative(deviceBuf_" + fNY + ")";
+         if (useSerialLoop && !fIsDynamic) {
+            // a zero stride means the operand is shared: its pointer stays at the base
+            if (strideA > 0) pA += " + i * " + std::to_string(strideA);
+            if (strideB > 0) pB += " + i * " + std::to_string(strideB);
+            pY += " + i * " + std::to_string(strideY);
+         }
+
+         if (useSBatched) {
+            // ----------------------------------------------------------------
+            // gemmStridedBatched: B varies per batch (e.g. per attention head)
+            // and there is no bias, so all N GEMMs are issued in one call.
+            //
+            // sofieBLAS convention (column-major transpose trick):
+            //   transa_sofie = transB_onnx,  transb_sofie = transA_onnx
+            //   m_sofie      = n_onnx,        n_sofie      = m_onnx
+            //   A_sofie      = fNB,           B_sofie      = fNA
+            //   lda = m_sofie if transa_sofie='n', k       if 't'
+            //   ldb = k       if transb_sofie='n', n_sofie if 't'
+            //   ldc = m_sofie
+            // The leading dimensions follow the transpose flags, matching what
+            // GetBlasConfig() registers for the non-batched calls.
+            // ----------------------------------------------------------------
+            size_t m_sofie    = static_cast<size_t>(std::stoi(n));   // ONNX n
+            size_t n_sofie    = static_cast<size_t>(std::stoi(m));   // ONNX m
+            size_t k_val      = static_cast<size_t>(std::stoi(k));
+            size_t lda        = fAttrTransB ? k_val   : m_sofie;
+            size_t ldb        = fAttrTransA ? n_sofie : k_val;
+            size_t ldc        = m_sofie;
+            size_t strideLhs  = strideB;   // stride per batch for fNB (non-zero on this path)
+            size_t strideRhs  = strideA;   // stride per batch for fNA (0 when A is shared)
+            size_t strideOut  = strideY;   // stride per batch for fNY
+            size_t batchCount = static_cast<size_t>(std::stoi(lengthExtra));
+            out << SP << "blas.gemmStridedBatched("
+                << opName << "_transB, " << opName << "_transA, "
+                << m_sofie << ", " << n_sofie << ", " << k_val << ", "
+                << opName << "_alpha, "
+                << "alpaka::getPtrNative(deviceBuf_" << fNB << "), "
+                << lda << ", " << strideLhs << ", "
+                << "alpaka::getPtrNative(deviceBuf_" << fNA << "), "
+                << ldb << ", " << strideRhs << ", "
+                << opName << "_beta, "
+                << "alpaka::getPtrNative(deviceBuf_" << fNY << "), "
+                << ldc << ", " << strideOut << ", "
+                << batchCount << ");\n";
+         } else if (!fNC.empty()) {
+            // ----------------------------------------------------------------
+            // GEMM with bias:  Y = alpha * op(A) * op(B) + bias
+            // cuBLAS is column-major so we swap A↔B and transA↔transB
+            // (row-major C=A*B  ↔  col-major C^T = B^T * A^T).
+            // A fused RELU selects blas.gemmrelu instead of blas.gemm.
+            //
+            // For batch-collapse (batchCollapseB), use m*batchCount so that all
+            // stacked rows are processed by a single call instead of N calls.
+            // ----------------------------------------------------------------
+            std::string call_m = batchCollapseB
+               ? std::to_string(static_cast<size_t>(std::stoi(m)) * static_cast<size_t>(std::stoi(lengthExtra)))
+               : (opName + "_m");
+
+            std::string pC = "alpaka::getPtrNative(deviceBuf_" + fNC + ")";
+            if (useSerialLoop && !fIsDynamic) {
+               // a broadcast bias is shared by all iterations, so it gets no offset
+               if (!fBroadcastBias && strideC > 0)
+                  pC += " + i * " + std::to_string(strideC);
+            }
+            if (fActivation == EActivationType::RELU) {
+               out << SP << "blas.gemmrelu("
+                   << opName << "_transB, " << opName << "_transA, "
+                   << opName << "_n, "      << call_m << ", "
+                   << opName << "_k, "      << opName << "_alpha, "
+                   << pB << ", " << pA << ", "
+                   << opName << "_beta, " << pC << ", " << pY << ");\n";
+            } else {
+               out << SP << "blas.gemm("
+                   << opName << "_transB, " << opName << "_transA, "
+                   << opName << "_n, "      << call_m << ", "
+                   << opName << "_k, "      << opName << "_alpha, "
+                   << pB << ", " << pA << ", "
+                   << opName << "_beta, " << pC << ", " << pY << ");\n";
+            }
+         } else {
+            // ----------------------------------------------------------------
+            // Pure MatMul (no bias):  Y = alpha * op(A) * op(B)
+            // This covers:
+            //   • Scaled Dot-Product Attention:  softmax(QK^T/√d) @ V
+            //   • Any other no-bias matrix multiplication
+            // For batch-collapse, use m*batchCount for the same reason as above.
+            // NOTE(review): a fused RELU is only honoured on the bias path (gemmrelu);
+            // nothing applies it here or on the strided-batched path — confirm that the
+            // fusion pass never sets RELU on a Gemm without bias.
+            // ----------------------------------------------------------------
+            std::string call_m = batchCollapseB
+               ? std::to_string(static_cast<size_t>(std::stoi(m)) * static_cast<size_t>(std::stoi(lengthExtra)))
+               : (opName + "_m");
+
+            out << SP << "blas.matmul("
+                << opName << "_transB, " << opName << "_transA, "
+                << opName << "_n, "      << call_m << ", "
+                << opName << "_k, "      << opName << "_alpha, "
+                << pB << ", " << pA << ", "
+                << opName << "_beta, "  << pY << ");\n";
+         }
+
+         if (useSerialLoop) {
+            out << SP << "}\n"; // end of loop on the stacked multiplication
+         }
+
+         // GEMM+LeakyReLU fusion (GPU): no LeakyReLU variant of the BLAS call is used,
+         // so we emit an in-place ALPAKA kernel immediately after the GEMM.
+         // It reads and writes the same output device buffer.
+         if (fActivation == EActivationType::LEAKYRELU) {
+            std::string numElem = ConvertDimShapeToLength(fShapeY);
+            out << SP << "//--- GEMM+LeakyReLU in-place fusion\n";
+            out << SP << "{\n";
+            out << SP << SP << "constexpr float " << opName << "_lrelu_alpha = "
+                << std::setprecision(std::numeric_limits<float>::max_digits10)
+                << fLeakyReluAlpha << "f;\n";
+            out << SP << SP << "auto const elementsPerThread_lrelu_" << opName
+                << " = Vec::all(static_cast<Idx>(1));\n";
+            out << SP << SP << "auto const elementsPerGrid_lrelu_" << opName
+                << " = Vec::all(Idx{" << numElem << "});\n";
+            out << SP << SP << "auto const workDiv_lrelu_" << opName
+                << " = sofie_workdiv(elementsPerGrid_lrelu_" << opName << ");\n";
+            // In-place: input and output pointer are the same device buffer.
+            out << SP << SP << "auto task_lrelu_" << opName
+                << " = alpaka::createTaskKernel<Acc>(workDiv_lrelu_" << opName
+                << ", leakyReluKernel"
+                << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+                << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+                << ", static_cast<Idx>(" << numElem << ")"
+                << ", static_cast<float>(" << opName << "_lrelu_alpha));\n";
+            out << SP << SP << "alpaka::enqueue(queue, task_lrelu_" << opName << ");\n";
+            out << SP << "}\n";
+         }
+
+         return out.str();
+      }
+
+      /// BLAS routine names this operator reports: "Gemm" followed by "Gemv".
+      std::vector<std::string> GetBlasRoutines() override {
+         std::vector<std::string> routines;
+         routines.emplace_back("Gemm");
+         routines.emplace_back("Gemv");
+         return routines;
+      }
+      /// Name of the output tensor (fNY), returned by value.
+      std::string GetFusableOutputTensorName() override {
+         const std::string &outputName = this->fNY;
+         return outputName;
+      }
+
+      /// Replace the output tensor name of this operator.
+      /// \param fusable_tensor_name new name stored in fNY and in the first output-name slot
+      /// \param removal_func callback invoked with the current output name before it is replaced
+      void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function<void(const std::string&)>& removal_func){
+         // report the name that is about to disappear
+         removal_func(this->fNY);
+         // fNY is assigned first; the output-name slot is then refreshed from the member
+         this->fNY = fusable_tensor_name;
+         this->fOutputTensorNames[0] = this->fNY;
+      }
+
+      // --- Activation fusion accessors (used by FuseGemmActivations_GPU) ---
+      /// Activation currently stored in fActivation.
+      EActivationType GetActivationType() const {
+         return this->fActivation;
+      }
+      /// Store the activation to fuse and its slope parameter.
+      /// \param act   activation type written to fActivation
+      /// \param alpha written to fLeakyReluAlpha; only meaningful for LEAKYRELU
+      void SetActivation(EActivationType act, float alpha = 0.f) {
+         this->fLeakyReluAlpha = alpha;
+         this->fActivation     = act;
+      }
+
+      /// Build the comma-separated layout string "n, m, k, ldb, lda, ldc, transB, transA"
+      /// for the BLAS call emitted by Generate_GPU_ALPAKA.
+      ///
+      /// For static stacked shapes (rank > 2, batch length > 1):
+      ///   - B shared across the batch: m is replaced by m * batch length
+      ///   - B varies and there is no bias: an empty string is returned
+      ///   - otherwise: the per-iteration layout is returned
+      std::string GetBlasConfig(){
+         const int64_t rankA = fShapeA.size();
+         const int64_t rankB = fShapeB.size();
+         const int64_t rankY = fShapeY.size();
+
+         // GEMM extents, picked according to the transpose attributes
+         auto rowsM  = fShapeA[fAttrTransA ? rankA-1 : rankA-2].GetVal();
+         auto colsN  = fShapeB[fAttrTransB ? rankB-2 : rankB-1].GetVal();
+         auto depthK = fShapeA[fAttrTransA ? rankA-2 : rankA-1].GetVal();
+
+         auto leadA = fAttrTransA ? rowsM  : depthK;
+         auto leadB = fAttrTransB ? depthK : colsN;
+         auto leadC = colsN;
+
+         std::string flags = fAttrTransB ? "'t'" : "'n'";
+         flags += ", ";
+         flags += fAttrTransA ? "'t'" : "'n'";
+
+         // everything that follows the "m" entry is the same for all layouts
+         const std::string tail = ", " + depthK + ", " + leadB + ", " + leadA + ", " + leadC + ", " + flags;
+
+         if (!fIsDynamic && rankY > 2) {
+            // product of the leading (batch) dimensions of the output
+            std::vector<Dim> batchDims;
+            for (int64_t d = 0; d + 2 < rankY; ++d)
+               batchDims.push_back(fShapeY[d]);
+            const auto batchLen = ConvertDimShapeToLength(batchDims);
+
+            if (std::stoi(batchLen) > 1) {
+               // B is shared when every one of its leading dimensions equals 1
+               int64_t lead = 0;
+               while (lead < rankB - 2 && fShapeB[lead].dim == 1)
+                  ++lead;
+               const bool sharedB = (lead >= rankB - 2);
+
+               if (sharedB) {
+                  // batch-collapse: one GEMM covering the whole batch
+                  return colsN + ", " + std::to_string(std::stoi(rowsM) * std::stoi(batchLen)) + tail;
+               }
+               if (fNC.empty()) {
+                  // strided-batched path: no layout string
+                  return "";
+               }
+               // bias present and B varies: per-iteration layout below
+            }
+         }
+
+         return colsN + ", " + rowsM + tail;
+      }
+   };
+
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_GEMM
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx b/core/inc/SOFIE/ROperator_Identity.hxx
similarity index 66%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx
rename to core/inc/SOFIE/ROperator_Identity.hxx
index efb6b14..43688cf 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Identity.hxx
+++ b/core/inc/SOFIE/ROperator_Identity.hxx
@@ -41,7 +41,7 @@ public:
    void Initialize(RModel& model) override {
        //input must be a graph input, or already initialized intermediate tensor
       if (model.CheckIfTensorAlreadyExist(fNX) == false){
-        throw std::runtime_error("TMVA SOFIE Identity Op Input Tensor is not found in model");
+        throw std::runtime_error("SOFIE Identity Op Input Tensor is not found in model");
       }
       fShape = model.GetTensorShape(fNX);
       if (model.IsInitializedTensor(fNX)) {
@@ -77,7 +77,7 @@ public:
       if (fIsOutputConstant || fIsInputInitialized) return "";
       OpName = "op_" + OpName;
       if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator Identity called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Identity called to Generate without being initialized first");
       }
       std::stringstream out;
       out << "\n//------ IDENTITY\n";
@@ -86,6 +86,31 @@ public:
       return out.str();
    }
 
+   std::string GenerateInitCode_GPU_ALPAKA() override {
+      // Only an initialized input produces init code: a copy from the X device buffer
+      // into the Y device buffer. Otherwise the returned string is empty.
+      std::stringstream code;
+      if (fIsInputInitialized) {
+         code << "\n//------ IDENTITY (init)\n"
+              << SP << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY
+              << ", deviceBuf_" << fNX << ");\n";
+      }
+      return code.str();
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      // No inference-time code for constant outputs or initialized inputs.
+      const bool nothingToEmit = fIsOutputConstant || fIsInputInitialized;
+      if (nothingToEmit) {
+         return "";
+      }
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Identity called to Generate_GPU_ALPAKA without being initialized first");
+      }
+      // Emit a copy from the X device buffer to the Y device buffer, followed by a queue wait.
+      std::stringstream code;
+      code << "\n//------ IDENTITY\n"
+           << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"
+           << SP << "alpaka::wait(queue);\n";
+      return code.str();
+   }
+
 };
 
 }//SOFIE
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx b/core/inc/SOFIE/ROperator_LSTM.hxx
similarity index 98%
rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx
rename to core/inc/SOFIE/ROperator_LSTM.hxx
index 5bfd4e3..69fb7a2 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.hxx
+++ b/core/inc/SOFIE/ROperator_LSTM.hxx
@@ -106,7 +106,7 @@ template <typename T> class ROperator_LSTM final : public ROperator {
          fType = "float";
       } else {
          throw std::runtime_error(
-             "TMVA SOFIE Encountered unsupported type parsing a LSTM operator");
+             "SOFIE Encountered unsupported type parsing a LSTM operator");
       }
       
       fInputTensorNames = { fNX, fNW, fNR };
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/core/inc/SOFIE/ROperator_LSTM.icc
similarity index 97%
rename from src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc
rename to core/inc/SOFIE/ROperator_LSTM.icc
index bec7760..2fb390d 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc
+++ b/core/inc/SOFIE/ROperator_LSTM.icc
@@ -1,7 +1,6 @@
 #ifndef SOFIE_ROPERATOR_LSTM_I
 #define SOFIE_ROPERATOR_LSTM_I
 
-
 namespace SOFIE {
 
 template<typename T>
@@ -41,33 +40,33 @@ auto ROperator_LSTM<T>::Initialize(RModel& model)
    fUseSession = model.UseSession();
    // Check the input and output tensors
    if (!model.CheckIfTensorAlreadyExist(fNX)) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + "  is not found in model.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + "  is not found in model.");
 	}
 	fShapeX = model.GetTensorShape(fNX);
 	if (fShapeX.size() != 3) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNX + " is not of 3 dimensions.");
 	}
 	if (!model.CheckIfTensorAlreadyExist(fNW)) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNW + "  is not found in model.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + "  is not found in model.");
 	}
 	fShapeW = model.GetTensorShape(fNW);
 	if (fShapeW.size() != 3) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNW + " is not of 3 dimensions.");
 	}
 	if (!model.CheckIfTensorAlreadyExist(fNR)) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + "  is not found in model.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + "  is not found in model.");
 	}
 	fShapeR = model.GetTensorShape(fNR);
 	if (fShapeR.size() != 3) {
-		throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " + fNR + " is not of 3 dimensions.");
+		throw std::runtime_error("SOFIE LSTM Op input tensor " + fNR + " is not of 3 dimensions.");
 	}
 	if (!fNB.empty()) {
 		if (!model.CheckIfTensorAlreadyExist(fNB)) {
-			throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not  found in model.");
+			throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not  found in model.");
 		}
 		fShapeB = model.GetTensorShape(fNB);
 		if (fShapeB.size() != 2 && fShapeB.size() != 5) {
-			throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions.");
+			throw std::runtime_error("SOFIE LSTM op input tensor " + fNB + " is not of 2 or 5 dimensions.");
 		}
 		if (fShapeB.size() == 2) {
 			// Broadcasting the bias
@@ -104,46 +103,46 @@ auto ROperator_LSTM<T>::Initialize(RModel& model)
 	}
 	if (!fNSequence_lens.empty()) {
 		if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) {
-			throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+			throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                   fNSequence_lens +
                                   "is not found in model.");
       }
       fShapeSequence_lens = model.GetTensorShape(fNSequence_lens);
       if (fShapeSequence_lens.size() != 1) {
-         throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+         throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                   fNSequence_lens +
                                   " is not of 1 dimension.");
       }
    }
    if (!fNInitial_h.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) {
-        throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+        throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                  fNInitial_h + " is not found in model.");
       }
       fShapeInitial_h = model.GetTensorShape(fNInitial_h);
       if (fShapeInitial_h.size() != 3) {
-        throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+        throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                  fNInitial_h + " is not of 3 dimensions.");
       }
    }
    if (!fNInitial_c.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNInitial_c)) {
-         throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+         throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                   fNInitial_c + " is not found in model.");
       }
       fShapeInitial_c = model.GetTensorShape(fNInitial_c);
       if (fShapeInitial_c.size() != 3) {
-         throw std::runtime_error("TMVA SOFIE LSTM Op input tensor " +
+         throw std::runtime_error("SOFIE LSTM Op input tensor " +
                                   fNInitial_c + " is not of 3 dimensions.");
       }
    }
    if (!fNP.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNP)) {
-         throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not  found in model.");
+         throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not  found in model.");
       }
       fShapeP = model.GetTensorShape(fNP);
       if (fShapeP.size() != 2 && fShapeP.size() != 4) {
-         throw std::runtime_error("TMVA SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions.");
+         throw std::runtime_error("SOFIE LSTM op input tensor " + fNP + " is not of 2 or 4 dimensions.");
       }
       if (fShapeP.size() == 2) {
          // Broadcasting the weight for peepholes
@@ -197,28 +196,28 @@ auto ROperator_LSTM<T>::Initialize(RModel& model)
           activation != "ScaledTanh" && activation != "HardSigmoid" &&
           activation != "Elu" && activation != "Softsign" &&
           activation != "Softplus") {
-         throw std::runtime_error("TMVA SOFIE - Activation function " +
+         throw std::runtime_error("SOFIE - Activation function " +
                                  activation + " not implemented");
       }
 	}
    if (fAttrDirection != "forward" && fAttrDirection != "backward" &&
        fAttrDirection != "bidirectional") {
       throw std::runtime_error(
-          "TMVA SOFIE - Invalid LSTM direction fAttrDirection = " +
+          "SOFIE - Invalid LSTM direction fAttrDirection = " +
           fAttrDirection);
    }
    if (4 * fAttrHiddenSize != fShapeW[1]) {
       throw std::runtime_error(
-          "TMVA SOFIE - fAttrHiddenSize must be equal to " +
+          "SOFIE - fAttrHiddenSize must be equal to " +
           std::to_string(fShapeW[1] / 4));
    }
    if (fAttrInputForget > 1) {
       throw std::runtime_error(
-         "TMVA SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget)
+         "SOFIE - fAttrInputForget = " + std::to_string(fAttrInputForget)
          + " must be 0 or 1.");
    }
    if (fAttrLayout > 1) {
-      throw std::runtime_error("TMVA SOFIE - Layout fAttrLayout = " +
+      throw std::runtime_error("SOFIE - Layout fAttrLayout = " +
                                std::to_string(fAttrLayout) +
                                " must be 0 (timewise) or 1 (batchwise)");
    }
@@ -291,7 +290,7 @@ auto ROperator_LSTM<T>::Generate(std::string OpName)
 
    // set the input
    if (fAttrLayout == 0) {
-      out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n";
+      out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n";
    } else {
       if (fUseSession)
          out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n";
diff --git a/core/inc/SOFIE/ROperator_LayerNormalization.hxx b/core/inc/SOFIE/ROperator_LayerNormalization.hxx
new file mode 100644
index 0000000..dbf113a
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_LayerNormalization.hxx
@@ -0,0 +1,732 @@
+#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION
+#define SOFIE_ROPERATOR_LAYERNORMALIZATION
+
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+#include <sstream>
+#include <string>
+
+namespace SOFIE {
+
+template <typename T>
+class ROperator_LayerNormalization : public ROperator {
+private:
+   bool fCastToFloat = false;  // set in Initialize when stash type is 1 and X is not FLOAT: the generated code then computes in float (cast of X not implemented yet)
+   int fAttrAxis; // axis attribute as passed in; a negative value is counted from the last dimension
+   float fAttrEpsilon; // epsilon attribute, added to the variance inside the square root
+   size_t fAttrStashType; // stash type attribute; 1 selects FLOAT as the type of the Mean / InvStdDev outputs
+   // Names of the inputs (X, Scale, optional B) and of the outputs (Y, optional Mean and InvStdDev); empty = absent
+   std::string fNX;
+   std::string fNScale;
+   std::string fNB;
+   std::string fNY;
+   std::string fNMean;
+   std::string fNInvStdDev;
+   // NOTE(review): the names below are only referenced from commented-out code in Initialize; fNBroadcastedB is read by GenerateInitCode
+   std::string fNCastedX;
+   std::string fNNormalizedX;
+   std::string fNBroadcastedB;
+   // Tensor shapes; Initialize pads the Scale and B shapes in front with 1s up to the rank of X
+   std::vector<Dim> fShapeX;
+   std::vector<Dim> fShapeScale;
+   std::vector<Dim> fShapeB;
+   std::vector<Dim> fShapeY;
+   std::vector<Dim> fShapeMean;
+   std::vector<Dim> fShapeInvStdDev;
+
+   size_t fAxis; // axis in [0, size)
+   size_t fSize; // rank (number of dimensions) of the input X
+   // size_t fAxisDim;
+
+   std::vector<Dim> fNormalizedShape;  // shape from X[ axis,...,N-1]
+   std::vector<Dim> fAxesShape;        // shape from X[0,..,axis-1]
+   // lengths in string format
+   std::string fLength; // Length of the input
+   std::string fNormalizedLength; // length of fNormalizedShape (number of elements normalized together)
+   std::string fAxesLength; // length of fAxesShape (number of mean / inverse std-dev values)
+
+   std::string fType; // type name used in the generated code; forced to "float" when fCastToFloat is set
+
+public:
+   // Default constructor: scalar members get the ONNX defaults (axis -1, epsilon 1e-5, stash type 1) instead of staying indeterminate.
+   ROperator_LayerNormalization() : fAttrAxis(-1), fAttrEpsilon(1e-5f), fAttrStashType(1), fAxis(0), fSize(0) {}
+
+   // Builds the operator from its attributes and tensor names (all names go through UTILITY::Clean_name).
+   // nameB, nameMean and nameInvStdDev are optional: an empty name is left out of the input / output lists.
+   ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX,
+                                const std::string &nameScale, const std::string &nameB, const std::string &nameY,
+                                const std::string &nameMean, const std::string &nameInvStdDev)
+      : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)),
+        fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)),
+        fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev))
+   {
+      fInputTensorNames = { fNX, fNScale };
+      if (!fNB.empty())
+         fInputTensorNames.emplace_back(fNB);
+
+      fOutputTensorNames = { fNY };
+      if (!fNMean.empty())
+         fOutputTensorNames.emplace_back(fNMean);
+      if (!fNInvStdDev.empty())
+         fOutputTensorNames.emplace_back(fNInvStdDev);
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override { return input; } // identity: the input shapes are returned unchanged
+   // TypeInference is the identity as well: the input types are returned unchanged.
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
+
+   // Validates the input tensor X, registers Y and the optional Mean / InvStdDev outputs in the model, and
+   // precomputes the axis split, the string lengths and the rank-padded Scale / B shapes used by the code
+   // generators. Throws std::runtime_error if X is missing, the axis is out of range or Scale / B mismatch X.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found.");
+      }
+      bool isDynamic = model.IsDynamicTensor(fNX);
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeY = fShapeX;
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+      // Type of the output
+      fType = ConvertTypeToString(model.GetTensorType(fNX));
+      // Rank (number of dimensions) of the input
+      fSize = fShapeX.size();
+      // Axis in [0, size)
+      fAxis = (fAttrAxis < 0) ? fSize + fAttrAxis : fAttrAxis;
+      // An axis below -rank wraps around to a huge unsigned value, so this single test rejects both ends of the range
+      if (fAxis >= fSize) {
+         throw std::runtime_error("TMVA::SOFIE - LayerNormalization - axis " + std::to_string(fAttrAxis) + " is out of range for tensor " + fNX + " of rank " + std::to_string(fSize));
+      }
+      // Shape of fShapeX[0, ..., fAxis)
+      fAxesShape = std::vector<Dim>(fShapeX.begin(), fShapeX.begin() + fAxis);
+      // Length of the axes
+      fAxesLength = ConvertDimShapeToLength(fAxesShape);
+      // Shape of fShapeX[fAxis, ..., fSize)
+      fNormalizedShape = std::vector<Dim>(fShapeX.begin() + fAxis, fShapeX.end());
+      // Length of the normalized axis
+      fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape);
+      // length of the input
+      fLength = ConvertDimShapeToLength(fShapeX);
+      // Type of mean and std
+      ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX);
+      // Mean
+      if (!fNMean.empty()) {
+         // cannot use initializer list with one element since it is ambiguous
+         if (isDynamic)
+            // add size_t(-1) to indicate that shape is an expression
+            model.AddIntermediateTensor(fNMean, type, std::vector<Dim>(1,Dim{fAxesLength,std::size_t(-1)}));
+         else
+            model.AddIntermediateTensor(fNMean, type, std::vector<size_t>(1,std::stoi(fAxesLength)));
+      }
+      // Inverse Standard Deviation
+      if (!fNInvStdDev.empty()) {
+         if (isDynamic)
+            model.AddIntermediateTensor(fNInvStdDev, type, std::vector<Dim>(1,Dim{fAxesLength,std::size_t(-1)}));
+         else
+            model.AddIntermediateTensor(fNInvStdDev, type, std::vector<size_t>(1,std::stoi(fAxesLength)));
+      }
+      // if mean and stdev are not empty they are not defined in the output list
+      // Cast X to float
+      if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) {
+         fCastToFloat = true;
+         fType = "float";
+         // fNCastedX = "Casted" + fNX;
+         // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX);
+         // fNNormalizedX = "Normalized" + fNX;
+         // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX);
+      }
+      // scale shape
+      fShapeScale = model.GetDimTensorShape(fNScale);
+      // prepend 1s to the scale shape until it has the rank of X
+      size_t dimScale = fShapeScale.size();
+      if (dimScale < fSize) {
+         for (size_t i = 0; i < fSize-dimScale; i++)
+            fShapeScale.insert(fShapeScale.begin(), Dim{1});
+      }
+      // every scale dimension must be 1 (broadcast) or equal to the matching dimension of X
+      for (size_t i = 0; i < fSize; i++) {
+         if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i])
+            throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale));
+      }
+      if (!fNB.empty()) {
+         fShapeB = model.GetDimTensorShape(fNB);
+         // prepend 1s to the bias shape until it has the rank of X
+         size_t dimB = fShapeB.size();
+         if (dimB < fSize) {
+            for (size_t i = 0; i < fSize-dimB; i++)
+               fShapeB.insert(fShapeB.begin(), Dim{1});
+         }
+         for (size_t i = 0; i < fSize; i++) {
+            if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i])
+               throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeB));
+         }
+      }
+      // NOTE: the bias is not pre-broadcast to the shape of X here, so fNBroadcastedB stays empty and
+      // GenerateInitCode() emits nothing. Generate() instead addresses B through an index expression
+      // that skips the dimensions of extent 1. Registering a "Broadcasted" + fNB intermediate tensor
+      // of shape fShapeX would be the alternative if an explicit broadcast copy is ever needed.
+
+      model.AddNeededStdLib("cmath");
+   }
+
+   // Emits the initialisation code that broadcasts the bias into tensor_<fNBroadcastedB>.
+   // Returns an empty string when no broadcast tensor name is set.
+   std::string GenerateInitCode() override {
+      if (fNBroadcastedB.empty()) return std::string();
+      std::stringstream code;
+      code << SP << "// Broadcasting the bias of LayerNormalization op\n"
+           << SP << "{\n"
+           << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", "
+           << ConvertDimShapeToString(fShapeB) << ", " << ConvertDimShapeToString(fShapeX) << ");\n"
+           << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"
+           << SP << "delete[] data;\n"
+           << SP << "}\n";
+      return code.str();
+   }
+
+   // Generates the CPU inference code: for each index over the outer axes [0, fAxis) the emitted loops compute the mean
+   // and the inverse standard deviation over the normalized axes [fAxis, fSize), store them in the Mean / InvStdDev
+   // tensors when requested and write Y = Scale * invStdDev * (X - mean) [+ B]. Throws if Initialize() was not run first.
+   std::string Generate(std::string opName) override
+   {
+      opName = "op_" + opName;
+      if (fShapeX.empty()) {
+         throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName +
+                                  " called to generate without being initialized first.");
+      }
+
+      std::stringstream out;
+      out << "//---- Layer Normalization  operator " << opName << "\n";
+
+      // Extents of all the axes of X, as strings usable in the generated loop bounds
+      std::vector<std::string> inputShape(fSize);
+      for (size_t i = 0; i < fSize; i++) {
+         inputShape[i] = fShapeX[i].GetVal();
+      }
+
+      // Flat index into X and Y (the stride factor is omitted for the last axis)
+      auto strides = UTILITY::ComputeStrideFromShape(fShapeX);
+      std::string inputIndex = "axis_0 * " + strides[0].GetVal();
+      for (size_t i = 1; i < fSize; i++) {
+         inputIndex += " + axis_" + std::to_string(i);
+         if (i < fSize-1) inputIndex += " * " + strides[i].GetVal();
+      }
+
+      // Flat index into Scale: axes of extent 1 are broadcast and contribute no term
+      auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale);
+      std::string scaleIndex;
+      for (size_t i = 0; i < fSize; i++) {
+         if (fShapeScale[i].dim != 1) {
+            if (!scaleIndex.empty()) scaleIndex += " + ";
+            scaleIndex += "axis_" + std::to_string(i);
+            if ( scaleStrides[i].dim != 1) scaleIndex +=  " * " + scaleStrides[i].GetVal();
+         }
+      }
+      if (scaleIndex.empty()) scaleIndex = "0";
+
+      // Flat index into B, built like the Scale index; skipped without a bias because fShapeB is then empty
+      std::string biasIndex;
+      if (!fNB.empty()) {
+         auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB);
+         for (size_t i = 0; i < fSize; i++) {
+            if (fShapeB[i].dim != 1) {
+               if (!biasIndex.empty()) biasIndex += " + ";
+               biasIndex += "axis_" + std::to_string(i);
+               if ( biasStrides[i].dim != 1) biasIndex +=  " * " + biasStrides[i].GetVal();
+            }
+         }
+         if (biasIndex.empty()) biasIndex = "0";
+      }
+
+      // Flat index into Mean / InvStdDev; with fAxis == 0 there are no outer axes, so the only element is at index 0
+      std::string axesIndex = "0";
+      if (fAxis > 0) {
+         auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape);
+         axesIndex = "axis_0 * " + axesStrides[0].GetVal();
+         for (size_t i = 1; i < fAxis; i++) {
+            axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal();
+         }
+      }
+
+      // compute mean and std-dev. Save in tensors if requested
+      out << SP << "// Compute the mean\n";
+      // Loop over all the outer dims in [0, fAxis)
+      for (size_t i = 0; i < fAxis; i++) {
+         std::string iIdx = "axis_" + std::to_string(i);
+         out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
+                      << "; " << iIdx << "++) {\n";
+      }
+      out << SP << SP << fType << " mean = 0.;\n";
+      // loop over the normalized dimensions (fAxis,....,N-1)
+      for (size_t j = fAxis; j < fSize; j++) {
+         std::string jIdx = "axis_" + std::to_string(j);
+         out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
+                         << "; " << jIdx << "++) {\n";
+      }
+      out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n";
+      for (size_t j = fAxis; j < fSize; j++) {
+         out << SP << SP << "}\n";
+      }
+      out << SP << SP << "mean  /= " << fType << "(" << fNormalizedLength << ");\n";
+
+      out << SP << "// Compute the inverse Standard Deviation\n";
+      // Set sum = 0
+      out << SP << SP << fType << " sum = 0.;\n";
+      // loop again over the normalized dimensions [fAxis, fSize)
+      for (size_t j = fAxis; j < fSize; j++) {
+         std::string jIdx = "axis_" + std::to_string(j);
+         out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
+                          << "; " << jIdx << "++){\n";
+      }
+      // tmp is declared with the same type as mean and sum, so a non-float computation type is not narrowed to float
+      out << SP << SP << SP << fType << " tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n";
+      out << SP << SP << SP << "sum += tmp*tmp;\n";
+      for (size_t j = fAxis; j < fSize; j++) {
+         out << SP << SP << "}\n";
+      }
+      out << SP << SP << fType << " invStdDev = 1 / std::sqrt(";
+      out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n";
+
+      // set output mean and invStdDev if requested
+      if (!fNMean.empty())
+         out << SP << SP <<  "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n";
+      if (!fNInvStdDev.empty())
+         out << SP << SP <<  "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n";
+
+      // scale and add bias
+      out << SP << "// Y = Scale o InvStdDev (X - Mean)\n";
+      for (size_t j = fAxis; j < fSize; j++) {
+         std::string jIdx = "axis_" + std::to_string(j);
+         out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx
+             << "++){\n";
+      }
+      out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale;
+      out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)";
+      // add the bias if present, addressed through its own index expression
+      if (!fNB.empty())
+         out << " + tensor_" << fNB << "[" << biasIndex << "]";
+      out << ";\n";
+      // close loops on normalizing dim  [..,fAxis,...fSize-1]
+      for (size_t j = fAxis; j < fSize; j++) {
+         out << SP << SP << "}\n";
+      }
+      // close loops on the outer dimensions [0,...,fAxis)
+      for (size_t i = 0; i < fAxis; i++) {
+         out << SP << "}\n";
+      }
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty())
+         throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first");
+
+      // -----------------------------------------------------------------------
+      // Parallel block-per-row strategy (for static normalizedLength ≤ 1024):
+      //   • One block per row (axes element).
+      //   • blockSize = next power-of-2 ≥ normalizedLength, capped at 1024.
+      //   • Each thread loads one element, two shared-memory tree reductions
+      //     compute mean then variance; final pass normalises in parallel.
+      // This replaces the previous single-thread-per-row serial scan.
+      // For dynamic shapes or normalizedLength > 1024, fall back to the original
+      // serial kernel (one thread per row, explicit loops).
+      // -----------------------------------------------------------------------
+
+      // Determine whether we can use the parallel path
+      size_t normLenVal = 0;
+      bool canParallel = false;
+      try {
+         normLenVal = std::stoul(fNormalizedLength);
+         canParallel = (normLenVal > 0 && normLenVal <= 1024);
+      } catch (...) {}
+
+      // Compute blockSize = next power-of-2 >= normLenVal
+      size_t blockSize = 1;
+      if (canParallel) {
+         while (blockSize < normLenVal) blockSize <<= 1;
+      }
+
+      // Each thread handles one "row" — one element of the axes dims [0..axis)
+      // and iterates over all normalized dims [axis..size)
+      // axesLength = product of fShapeX[0..axis)
+      // normalizedLength = product of fShapeX[axis..size)
+      // totalElements = axesLength (one thread per row)
+
+      std::vector<std::string> inputShape(fSize);
+      for (size_t i = 0; i < fSize; i++)
+         inputShape[i] = fShapeX[i].GetVal();
+
+      auto strides      = UTILITY::ComputeStrideFromShape(fShapeX);
+      auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale);
+      auto biasStrides  = (!fNB.empty()) ? UTILITY::ComputeStrideFromShape(fShapeB)
+                                          : std::vector<Dim>{};
+      auto axesStrides  = UTILITY::ComputeStrideFromShape(fAxesShape);
+
+      // Build index expressions reusing the same logic as Generate()
+      // input index: axis_0*stride0 + axis_1*stride1 + ... + norm_0*stride_axis + ...
+      // For the kernel we decompose the linear thread index into axis coords,
+      // then loop over normalized dims inside the kernel.
+
+      std::string kname = "LayerNormKernel_" + opName;
+      std::string op;
+      op  = "\n//------ LAYERNORM_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ X,\n";
+      op += SP + SP + SP + "T const* __restrict__ scale,\n";
+      if (!fNB.empty())
+         op += SP + SP + SP + "T const* __restrict__ bias,\n";
+      if (!fNMean.empty())
+         op += SP + SP + SP + "T* __restrict__ out_mean,\n";
+      if (!fNInvStdDev.empty())
+         op += SP + SP + SP + "T* __restrict__ out_invstd,\n";
+      op += SP + SP + SP + "T* __restrict__ Y,\n";
+      op += SP + SP + SP + "std::size_t const axesLength) const {\n\n";
+
+      if (canParallel) {
+         // ---------------------------------------------------------------
+         // PARALLEL PATH: one block per row, blockSize threads per block.
+         // Each thread handles one element in the normalised dimension.
+         // Two shared-memory tree reductions compute mean then variance.
+         // ---------------------------------------------------------------
+         std::string bs = std::to_string(blockSize);
+         std::string nl = fNormalizedLength; // e.g. "64"
+         std::string eps = std::to_string(fAttrEpsilon);
+
+         op += SP + SP + SP + "// Block-parallel LayerNorm: one block per row, "
+               + bs + " threads per block, " + nl + " active.\n";
+         op += SP + SP + SP + "auto& shmem = alpaka::declareSharedVar<T[" + bs + "], __COUNTER__>(acc);\n";
+         op += SP + SP + SP + "auto const row = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0];\n";
+         op += SP + SP + SP + "auto const tid = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];\n";
+         op += SP + SP + SP + "if (row >= axesLength) return;\n\n";
+
+         // --- Decompose row into axis-dim coordinates (same logic as serial path) ---
+         if (fAxis > 0) {
+            for (size_t i = 0; i < fAxis; ++i) {
+               op += SP + SP + SP + "std::size_t const axis_" + std::to_string(i)
+                  + " = (row / " + axesStrides[i].GetVal() + "u) % "
+                  + inputShape[i] + "u;\n";
+            }
+            op += "\n";
+         }
+
+         // Base input offset for this row
+         op += SP + SP + SP + "std::size_t const row_base =\n";
+         if (fAxis == 0) {
+            op += SP + SP + SP + SP + "0u;\n\n";
+         } else {
+            for (size_t i = 0; i < fAxis; ++i) {
+               op += SP + SP + SP + SP + "axis_" + std::to_string(i)
+                  + " * " + strides[i].GetVal() + "u";
+               op += (i + 1 < fAxis) ? " +\n" : ";\n\n";
+            }
+         }
+
+         // Map thread id → index within normalised dims.
+         // For each normalised dim j, the "within-norm" stride is the product of
+         // dimensions after it: normInnerStrides[j-fAxis] computed at code-gen time.
+         // Then:  norm_offset = sum_j( (tid / normInnerStride[j]) % dim[j] * stride[j] )
+         // For the common 1D normalised case this simplifies to: norm_offset = tid * stride[fAxis]
+
+         // Build the norm-dim strides (strides within the flattened normalised space)
+         auto normShape = fNormalizedShape;  // dims [fAxis .. fSize-1]
+         auto normInner = UTILITY::ComputeStrideFromShape(normShape);
+
+         op += SP + SP + SP + "bool const in_range = (tid < " + nl + "u);\n";
+         op += SP + SP + SP + "std::size_t norm_offset = 0u;\n";
+         op += SP + SP + SP + "std::size_t s_norm_offset = 0u;\n";
+         if (!fNB.empty())
+            op += SP + SP + SP + "std::size_t b_norm_offset = 0u;\n";
+         op += SP + SP + SP + "if (in_range) {\n";
+
+         if (fSize - fAxis == 1) {
+            // Single normalised dim — simplest case
+            op += SP + SP + SP + SP + "norm_offset = tid * " + strides[fAxis].GetVal() + "u;\n";
+            if (fShapeScale[fAxis].dim != 1)
+               op += SP + SP + SP + SP + "s_norm_offset = tid * " + scaleStrides[fAxis].GetVal() + "u;\n";
+            if (!fNB.empty() && fShapeB[fAxis].dim != 1)
+               op += SP + SP + SP + SP + "b_norm_offset = tid * " + biasStrides[fAxis].GetVal() + "u;\n";
+         } else {
+            // Multi-dim normalised space
+            op += SP + SP + SP + SP + "std::size_t norm_rem = tid;\n";
+            for (size_t j = fAxis; j < fSize; ++j) {
+               size_t ji = j - fAxis;
+               op += SP + SP + SP + SP + "{ std::size_t nj = norm_rem / " + normInner[ji].GetVal() + "u;"
+                  + " norm_rem %= " + normInner[ji].GetVal() + "u;"
+                  + " norm_offset += nj * " + strides[j].GetVal() + "u;";
+               if (fShapeScale[j].dim != 1)
+                  op += " s_norm_offset += nj * " + scaleStrides[j].GetVal() + "u;";
+               if (!fNB.empty() && fShapeB[j].dim != 1)
+                  op += " b_norm_offset += nj * " + biasStrides[j].GetVal() + "u;";
+               op += " }\n";
+            }
+         }
+         op += SP + SP + SP + "}\n\n";
+
+         op += SP + SP + SP + "std::size_t const norm_idx = row_base + norm_offset;\n";
+         op += SP + SP + SP + "T const val = in_range ? X[norm_idx] : static_cast<T>(0);\n\n";
+
+         // --- Pass 1: parallel mean ---
+         op += SP + SP + SP + "// Pass 1: compute mean via shared-memory tree reduction\n";
+         op += SP + SP + SP + "shmem[tid] = val;\n";
+         op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n";
+         size_t half = blockSize / 2;
+         while (half > 0) {
+            op += SP + SP + SP + "if (tid < " + std::to_string(half) + "u) shmem[tid] += shmem[tid + " + std::to_string(half) + "u];\n";
+            op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n";
+            half >>= 1;
+         }
+         op += SP + SP + SP + "T const mean = shmem[0] / static_cast<T>(" + nl + ");\n";
+         op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n\n";
+
+         // --- Pass 2: parallel variance ---
+         op += SP + SP + SP + "// Pass 2: compute variance\n";
+         op += SP + SP + SP + "T const diff = val - mean;\n";
+         op += SP + SP + SP + "shmem[tid] = in_range ? diff * diff : static_cast<T>(0);\n";
+         op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n";
+         half = blockSize / 2;
+         while (half > 0) {
+            op += SP + SP + SP + "if (tid < " + std::to_string(half) + "u) shmem[tid] += shmem[tid + " + std::to_string(half) + "u];\n";
+            op += SP + SP + SP + "alpaka::syncBlockThreads(acc);\n";
+            half >>= 1;
+         }
+         op += SP + SP + SP + "T const invStdDev = static_cast<T>(1) / alpaka::math::sqrt(acc,"
+               " shmem[0] / static_cast<T>(" + nl + ") + static_cast<T>(" + eps + "));\n\n";
+
+         // Save mean/invstd if requested
+         if (!fNMean.empty())
+            op += SP + SP + SP + "if (tid == 0u) out_mean[row] = mean;\n";
+         if (!fNInvStdDev.empty())
+            op += SP + SP + SP + "if (tid == 0u) out_invstd[row] = invStdDev;\n";
+         op += "\n";
+
+         // --- Pass 3: normalise, scale, bias ---
+         op += SP + SP + SP + "// Pass 3: normalize + scale + bias\n";
+         op += SP + SP + SP + "if (in_range) {\n";
+
+         // scale base (axis contribution)
+         op += SP + SP + SP + SP + "std::size_t const scale_base =\n";
+         {
+            bool any = false;
+            for (size_t i = 0; i < fAxis; ++i) {
+               if (fShapeScale[i].dim != 1) {
+                  op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i)
+                     + " * " + scaleStrides[i].GetVal() + "u";
+                  if (any) op += " +\n";
+                  any = true;
+               }
+            }
+            if (!any) op += SP + SP + SP + SP + SP + "0u";
+            op += ";\n";
+         }
+         op += SP + SP + SP + SP + "T out_val = scale[scale_base + s_norm_offset] * invStdDev * (val - mean);\n";
+
+         if (!fNB.empty()) {
+            op += SP + SP + SP + SP + "std::size_t const bias_base =\n";
+            bool any = false;
+            for (size_t i = 0; i < fAxis; ++i) {
+               if (fShapeB[i].dim != 1) {
+                  op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i)
+                     + " * " + biasStrides[i].GetVal() + "u";
+                  if (any) op += " +\n";
+                  any = true;
+               }
+            }
+            if (!any) op += SP + SP + SP + SP + SP + "0u";
+            op += ";\n";
+            op += SP + SP + SP + SP + "out_val += bias[bias_base + b_norm_offset];\n";
+         }
+
+         op += SP + SP + SP + SP + "Y[norm_idx] = out_val;\n";
+         op += SP + SP + SP + "}\n";  // end in_range
+
+      } else {
+         // ---------------------------------------------------------------
+         // SERIAL PATH (dynamic shapes or normalizedLength > 1024):
+         // one thread per row, explicit loops over normalized dims.
+         // ---------------------------------------------------------------
+         op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op += SP + SP + SP + "if (global_thread_idx >= axesLength) return;\n";
+         op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+         op += SP + SP + SP + "for (std::size_t row = global_thread_idx; row < axesLength; row += grid_thread_extent) {\n\n";
+
+         if (fAxis > 0) {
+            for (size_t i = 0; i < fAxis; ++i) {
+               op += SP + SP + SP + SP + "std::size_t const axis_" + std::to_string(i)
+                  + " = (row / " + axesStrides[i].GetVal() + "u) % "
+                  + inputShape[i] + "u;\n";
+            }
+            op += "\n";
+         }
+
+         op += SP + SP + SP + SP + "std::size_t const row_base =\n";
+         if (fAxis == 0) {
+            op += SP + SP + SP + SP + SP + "0u;\n\n";
+         } else {
+            for (size_t i = 0; i < fAxis; ++i) {
+               op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i)
+                  + " * " + strides[i].GetVal() + "u";
+               op += (i + 1 < fAxis) ? " +\n" : ";\n\n";
+            }
+         }
+
+         op += SP + SP + SP + SP + "std::size_t const scale_base =\n";
+         { bool any = false;
+           for (size_t i = 0; i < fAxis; ++i) {
+              if (fShapeScale[i].dim != 1) {
+                 op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i)
+                    + " * " + scaleStrides[i].GetVal() + "u";
+                 if (any) op = " +\n" + op; any = true;
+              }
+           }
+           if (!any) op += SP + SP + SP + SP + SP + "0u";
+           op += ";\n\n";
+         }
+
+         if (!fNB.empty()) {
+            op += SP + SP + SP + SP + "std::size_t const bias_base =\n";
+            bool any = false;
+            for (size_t i = 0; i < fAxis; ++i) {
+               if (fShapeB[i].dim != 1) {
+                  op += SP + SP + SP + SP + SP + "axis_" + std::to_string(i)
+                     + " * " + biasStrides[i].GetVal() + "u";
+                  if (any) op = " +\n" + op; any = true;
+               }
+            }
+            if (!any) op += SP + SP + SP + SP + SP + "0u";
+            op += ";\n\n";
+         }
+
+         op += SP + SP + SP + SP + "T mean = static_cast<T>(0);\n";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j)
+               + " = 0; n_" + std::to_string(j) + " < " + inputShape[j]
+               + "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "mean += X[norm_idx];\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n";
+         op += SP + SP + SP + SP + "mean /= static_cast<T>(" + fNormalizedLength + ");\n\n";
+
+         op += SP + SP + SP + SP + "T sum = static_cast<T>(0);\n";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j)
+               + " = 0; n_" + std::to_string(j) + " < " + inputShape[j]
+               + "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "T tmp = X[norm_idx] - mean;\n";
+         op += SP + SP + SP + SP + SP + "sum += tmp * tmp;\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n";
+         op += SP + SP + SP + SP + "T const invStdDev = static_cast<T>(1) / "
+            "alpaka::math::sqrt(acc, sum / static_cast<T>(" + fNormalizedLength
+            + ") + static_cast<T>(" + std::to_string(fAttrEpsilon) + "));\n\n";
+
+         if (!fNMean.empty())
+            op += SP + SP + SP + SP + "out_mean[row] = mean;\n";
+         if (!fNInvStdDev.empty())
+            op += SP + SP + SP + SP + "out_invstd[row] = invStdDev;\n";
+         op += "\n";
+
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += SP + SP + SP + SP + "for (std::size_t n_" + std::to_string(j)
+               + " = 0; n_" + std::to_string(j) + " < " + inputShape[j]
+               + "u; n_" + std::to_string(j) + "++) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const norm_idx = row_base";
+         for (size_t j = fAxis; j < fSize; ++j)
+            op += " + n_" + std::to_string(j) + " * " + strides[j].GetVal() + "u";
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const s_idx = scale_base";
+         for (size_t j = fAxis; j < fSize; ++j) {
+            if (fShapeScale[j].dim != 1)
+               op += " + n_" + std::to_string(j) + " * " + scaleStrides[j].GetVal() + "u";
+         }
+         op += ";\n";
+         op += SP + SP + SP + SP + SP + "T val = scale[s_idx] * invStdDev * (X[norm_idx] - mean);\n";
+         if (!fNB.empty()) {
+            op += SP + SP + SP + SP + SP + "std::size_t const b_idx = bias_base";
+            for (size_t j = fAxis; j < fSize; ++j) {
+               if (fShapeB[j].dim != 1)
+                  op += " + n_" + std::to_string(j) + " * " + biasStrides[j].GetVal() + "u";
+            }
+            op += ";\n";
+            op += SP + SP + SP + SP + SP + "val += bias[b_idx];\n";
+         }
+         op += SP + SP + SP + SP + SP + "Y[norm_idx] = val;\n";
+         for (size_t j = fAxis; j < fSize; ++j) op += SP + SP + SP + SP + "}\n";
+
+         op += SP + SP + SP + "}\n";  // end row loop
+      }
+
+      op += SP + SP + "}\n";    // end operator()
+      op += SP + "};\n";        // end struct
+      return op;
+   }
+
+   // Emits the declaration of this operator's kernel functor instance.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      std::string const suffix = "op_" + opName;
+      return SP + "LayerNormKernel_" + suffix + " layerNormKernel_" + suffix + ";\n";
+   }
+
+   // Emits the host-side launch code for this operator's LayerNorm kernel: a work
+   // division followed by alpaka::exec. Throws std::runtime_error when fShapeX is empty.
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeX.empty())
+         throw std::runtime_error("TMVA::SOFIE LayerNormalization called to Generate without being initialized first");
+      std::string axesLengthStr = fAxesLength;
+      std::string kname = "layerNormKernel_" + opName;
+      // NOTE(review): std::stoul stops at the first non-digit, so a length such as "8*N" would parse as 8 - confirm.
+      // Determine parallel vs serial (same logic as kernel generation)
+      size_t normLenVal2 = 0;
+      bool canParallel2 = false;
+      try { normLenVal2 = std::stoul(fNormalizedLength); canParallel2 = (normLenVal2 > 0 && normLenVal2 <= 1024); }
+      catch (...) {}   // length is not numeric: canParallel2 stays false and the serial branch is emitted
+      size_t blockSize2 = 1;
+      if (canParallel2) { while (blockSize2 < normLenVal2) blockSize2 <<= 1; }   // smallest power of two >= normLenVal2
+      // Kernel arguments in order: X, scale, [B], [Mean], [InvStdDev], Y, axes length (optional ones only when named).
+      std::string args =
+         "alpaka::getPtrNative(deviceBuf_" + fNX + "), "
+         + "alpaka::getPtrNative(deviceBuf_" + fNScale + ")";
+      if (!fNB.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNB + ")";
+      if (!fNMean.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNMean + ")";
+      if (!fNInvStdDev.empty())
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNInvStdDev + ")";
+      args += ", alpaka::getPtrNative(deviceBuf_" + fNY + ")";
+      args += ", static_cast<Idx>(" + axesLengthStr + ")";
+      // Emit the work division first, then the kernel launch that uses it.
+      std::stringstream out;
+      out << "\n//------ LAYERNORM_GPU_ALPAKA\n";
+      if (canParallel2) {
+         // Parallel: one block per row, blockSize2 threads per block
+         out << SP << "alpaka::WorkDivMembers<Dim, Idx> workDiv_" << opName << "(\n";
+         out << SP << SP << "Vec::all(Idx{" << axesLengthStr << "}),\n";    // numBlocks = rows
+         out << SP << SP << "Vec::all(Idx{" << blockSize2 << "u}),\n";     // threads/block
+         out << SP << SP << "Vec::all(Idx{1u}));\n";
+      } else {
+         // Serial fallback: normal sofie_workdiv
+         out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << axesLengthStr << "});\n";
+         out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      }
+      out << SP << "alpaka::exec<Acc>(queue, workDiv_" << opName
+         << ", " << kname << ", " << args << ");\n";
+      return out.str();
+   }
+
+   std::vector<std::string> GetBlasRoutines() override { return std::vector<std::string>{"Axpy"}; }   // one entry: "Axpy"
+
+   std::vector<std::string> GetStdLibs() override { return std::vector<std::string>{"cmath"}; }   // one entry: "cmath"
+};
+
+} // namespace SOFIE
+
+#endif
diff --git a/core/inc/SOFIE/ROperator_LeakyRelu.hxx b/core/inc/SOFIE/ROperator_LeakyRelu.hxx
new file mode 100644
index 0000000..9eb15c1
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_LeakyRelu.hxx
@@ -0,0 +1,144 @@
+#ifndef SOFIE_ROPERATOR_LeakyRelu
+#define SOFIE_ROPERATOR_LeakyRelu
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+/// LeakyRelu activation: y = x where x >= 0 and alpha * x elsewhere. Only T = float is accepted.
+template <typename T>
+class ROperator_LeakyRelu final : public ROperator
+{
+private:
+   float falpha = 0.01f;      // slope applied to negative inputs (default value)
+   std::string fNX;           // cleaned input tensor name
+   std::string fNY;           // cleaned output tensor name
+   std::vector<Dim> fShape;   // input shape, set by Initialize()
+   std::string fType;         // element type name, set to "float" by the named constructor
+
+   // Formats alpha with max_digits10 significant digits so the emitted literal
+   // round-trips to the same float (std::to_string keeps only six decimals).
+   std::string AlphaLiteral() const {
+      std::stringstream ss;
+      ss << std::setprecision(std::numeric_limits<float>::max_digits10) << falpha;
+      return ss.str();
+   }
+
+public:
+   ROperator_LeakyRelu(){}
+   /// Stores alpha and the cleaned tensor names; throws std::runtime_error unless T is float.
+   ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY):
+   falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      fKind = OperatorKind::LEAKYRELU;
+      if (!std::is_same<T, float>::value)
+         throw std::runtime_error("SOFIE Encountered unsupported type parsing a Leaky Relu operator");
+      fType = "float";
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   /// Output types equal the input types.
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   /// Output shapes equal the input shapes.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      return input;
+   }
+
+   /// Validates the input tensor and registers the output with the input's type and shape.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))   // input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("SOFIE Leaky Relu Op Input Tensor is not found in model");
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+   }
+
+   /// Emits the CPU loop that applies the activation element by element.
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty())
+         throw std::runtime_error("SOFIE Operator Leaky Relu called to Generate without being initialized first");
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << SP << "constexpr float " << OpName << "_alpha = " << AlphaLiteral() << ";\n";
+      out << "\n//------ LEAKY RELU\n";
+      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   /// Emits the kernel functor; its text does not depend on the operator name.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ LEAKY_RELU_KERNEL_ALPAKA\n";
+      op += "struct LeakyReluKernel {\n";
+      op += SP + "template<typename TAcc, typename T>\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n";
+      op += SP + SP + "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + "if (idx < numElements) {\n";
+      op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : alpha * data[idx];\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+
+   /// Emits the declaration of the kernel functor instance used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return "LeakyReluKernel leakyReluKernel;\n";
+   }
+
+   /// Emits the launch code. Generated locals are suffixed with the operator name,
+   /// because a tensor name can be shared by several operators in one generated scope.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty())
+         throw std::runtime_error("SOFIE Operator LeakyRelu called to Generate without being initialized first");
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n";
+      out << SP << "constexpr float " << OpName << "_alpha = " << AlphaLiteral() << ";\n";
+      out << SP << "auto const elementsPerThread_" << OpName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << OpName << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << OpName << " = sofie_workdiv(elementsPerGrid_" << OpName << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << OpName
+         << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX
+         << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << "), " << OpName << "_alpha);\n";
+      out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+   /// Alpha accessor — used by the GEMM+LeakyReLU fusion pass.
+   float GetAlpha() const { return falpha; }
+
+   bool IsElementwise() const override { return true; }
+   /// Returns the activation as a C++ expression in v, with the same alpha literal as Generate().
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "((" + v + " >= 0) ? " + v + " : float(" + AlphaLiteral() + ") * " + v + ")";
+   }
+
+   std::string GetFusableOutputTensorName() override { return fNY; }
+
+   /// Makes the operator read and write fusable_tensor_name; the previous names go to removal_func.
+   void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function<void(const std::string&)>& removal_func){
+      removal_func(fNX);
+      removal_func(fNY);
+      fNX = fusable_tensor_name;
+      fNY = fusable_tensor_name;
+      fInputTensorNames[0] =  fNX;
+      fOutputTensorNames[0] = fNY;
+   }
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_LeakyRelu
diff --git a/core/inc/SOFIE/ROperator_Logic.hxx b/core/inc/SOFIE/ROperator_Logic.hxx
new file mode 100644
index 0000000..3f98e94
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Logic.hxx
@@ -0,0 +1,336 @@
+#ifndef SOFIE_ROPERATOR_LOGIC
+#define SOFIE_ROPERATOR_LOGIC
+
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace SOFIE {
+
+enum class ELogicBinaryOp {   // selects the LogicBinaryTrait specialization used by ROperator_LogicBinary
+   // logical (bool / uint8)
+   And,          // kernel name prefix "AndKernel"
+   Or,           // kernel name prefix "OrKernel"
+   Xor,          // kernel name prefix "XorKernel"
+   // bitwise (integer types)
+   BitwiseAnd,   // kernel name prefix "BitwiseAndKernel"
+   BitwiseOr,    // kernel name prefix "BitwiseOrKernel"
+   BitwiseXor,   // kernel name prefix "BitwiseXorKernel"
+};
+
+
+template <typename T, ELogicBinaryOp Op>
+struct LogicBinaryTrait {};   // primary template is empty; the per-operation members live in the specializations
+
+/// Logical AND: a non-zero operand counts as true; the output tensor type is BOOL.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::And> {
+   static std::string Name() { return "And"; }
+   static std::string KernelName() { return Name() + "Kernel"; }
+   // C++ source text applying the operation to the operand expressions a and b.
+   static std::string Expr(const std::string &a, const std::string &b) { return std::string("(").append(a).append(" && ").append(b).append(")"); }
+   static T Eval(T a, T b) { return static_cast<T>(a && b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+/// Logical OR: a non-zero operand counts as true; the output tensor type is BOOL.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::Or> {
+   static std::string Name() { return "Or"; }
+   static std::string KernelName() { return Name() + "Kernel"; }
+   // C++ source text applying the operation to the operand expressions a and b.
+   static std::string Expr(const std::string &a, const std::string &b) { return std::string("(").append(a).append(" || ").append(b).append(")"); }
+   static T Eval(T a, T b) { return static_cast<T>(a || b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+/// Logical XOR: operands are normalised with '!' first so that any non-zero value
+/// counts as true, matching how && and || treat operands in the And/Or traits.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::Xor> {
+   static std::string Name()   { return "Xor"; }
+   static std::string KernelName() { return "XorKernel"; }
+   static std::string Expr(const std::string &a, const std::string &b) { return "(!" + a + " != !" + b + ")"; }
+   static T Eval(T a, T b) { return static_cast<T>(!a != !b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return ETensorType::BOOL; }
+};
+
+/// Bitwise AND; the output tensor type is the one GetTemplatedType reports for T.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::BitwiseAnd> {
+   static std::string Name() { return "BitwiseAnd"; }
+   static std::string KernelName() { return Name() + "Kernel"; }
+   // C++ source text applying the operation to the operand expressions a and b.
+   static std::string Expr(const std::string &a, const std::string &b) { return std::string("(").append(a).append(" & ").append(b).append(")"); }
+   static T Eval(T a, T b) { return static_cast<T>(a & b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+
+/// Bitwise OR; the output tensor type is the one GetTemplatedType reports for T.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::BitwiseOr> {
+   static std::string Name() { return "BitwiseOr"; }
+   static std::string KernelName() { return Name() + "Kernel"; }
+   // C++ source text applying the operation to the operand expressions a and b.
+   static std::string Expr(const std::string &a, const std::string &b) { return std::string("(").append(a).append(" | ").append(b).append(")"); }
+   static T Eval(T a, T b) { return static_cast<T>(a | b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+
+/// Bitwise XOR; the output tensor type is the one GetTemplatedType reports for T.
+template <typename T>
+struct LogicBinaryTrait<T, ELogicBinaryOp::BitwiseXor> {
+   static std::string Name() { return "BitwiseXor"; }
+   static std::string KernelName() { return Name() + "Kernel"; }
+   // C++ source text applying the operation to the operand expressions a and b.
+   static std::string Expr(const std::string &a, const std::string &b) { return std::string("(").append(a).append(" ^ ").append(b).append(")"); }
+   static T Eval(T a, T b) { return static_cast<T>(a ^ b); }   // applies the operation to one element pair
+   static ETensorType OutputType() { return GetTemplatedType(T()); }
+};
+
+/// Element-wise binary logic operator; both inputs are read with the same flat index (no broadcasting).
+template <typename T, ELogicBinaryOp Op>
+class ROperator_LogicBinary final : public ROperator {
+private:
+   std::string fNA;           // cleaned name of the first input
+   std::string fNB;           // cleaned name of the second input
+   std::string fNY;           // cleaned name of the output
+   std::vector<Dim> fShape;   // shape of input A, also used for the output
+
+   using Trait = LogicBinaryTrait<T, Op>;
+
+public:
+   ROperator_LogicBinary() {}
+
+   ROperator_LogicBinary(std::string nameA, std::string nameB, std::string nameY)
+      : fNA(UTILITY::Clean_name(nameA)),
+        fNB(UTILITY::Clean_name(nameB)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNA, fNB };
+      fOutputTensorNames = { fNY };
+   }
+
+   // ── Type / shape inference ────────────────────────────────────────────────
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> /*input*/) override {
+      return { Trait::OutputType() };
+   }
+
+   /// The output takes the shape of the first input; throws if fewer than two shapes are given.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      if (input.size() < 2)
+         throw std::runtime_error("SOFIE " + Trait::Name() + " ShapeInference requires 2 inputs");
+      return { input[0] };
+   }
+
+   /// Registers the output tensor; when both inputs are constant the result is computed here.
+   /// Throws std::runtime_error if an input is missing or the constant inputs differ in length.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNA))
+         throw std::runtime_error("SOFIE " + Trait::Name() + ": input A '" + fNA + "' not found in model");
+      if (!model.CheckIfTensorAlreadyExist(fNB))
+         throw std::runtime_error("SOFIE " + Trait::Name() + ": input B '" + fNB + "' not found in model");
+
+      fShape = model.GetDimTensorShape(fNA);
+      if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) {
+         // Use the integer shape of A (constant tensors are assumed to have no symbolic dimensions).
+         auto const shapeA = model.GetTensorShape(fNA);
+         auto const length = ConvertShapeToLength(shapeA);
+         // Guard the element-wise loop below against reading past the end of B.
+         if (ConvertShapeToLength(model.GetTensorShape(fNB)) != length)
+            throw std::runtime_error("SOFIE " + Trait::Name() + ": constant inputs '" + fNA + "' and '" + fNB +
+                                     "' have different lengths (broadcasting is not supported)");
+         auto dataA  = static_cast<T*>(model.GetInitializedTensorData(fNA).get());
+         auto dataB  = static_cast<T*>(model.GetInitializedTensorData(fNB).get());
+         std::vector<T> dataY(length);
+         for (size_t i = 0; i < length; ++i)
+            dataY[i] = Trait::Eval(dataA[i], dataB[i]);
+         // Keep the full shape of A instead of flattening the result to one dimension.
+         model.AddConstantTensor<T>(fNY, shapeA, dataY.data());
+         fIsOutputConstant = true;
+      } else {
+         model.AddIntermediateTensor(fNY, Trait::OutputType(), fShape);
+      }
+
+      if (model.Verbose()) {
+         std::cout << Trait::Name() << " : " << fNA << " , " << fNB
+                   << " -> " << fNY << " " << ConvertDimShapeToString(fShape)
+                   << (fIsOutputConstant ? " [constant-folded]" : "") << std::endl;
+      }
+   }
+
+   /// Emits the CPU loop. The length string is streamed verbatim: it may be a symbolic
+   /// expression for dynamic shapes, so no integer-literal suffix is appended to it.
+   std::string Generate(std::string /*OpName*/) override {
+      if (fIsOutputConstant) return "";
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ " << Trait::Name() << "\n";
+      out << SP << "for (std::size_t id = 0; id < " << length << "; ++id) {\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = "
+          << Trait::Expr("tensor_" + fNA + "[id]", "tensor_" + fNB + "[id]") << ";\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   /// Emits a per-operator kernel functor in which a thread handles the element at its global index.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      OpName = "op_" + OpName;
+      std::stringstream op;
+      op << "\n//------ " << Trait::Name() << "_KERNEL_ALPAKA\n";
+      op << "struct " << Trait::KernelName() << "_" << OpName << " {\n";
+      op << SP << "template<typename TAcc, typename T>\n";
+      op << SP << "ALPAKA_FN_ACC void operator()(TAcc const& acc, "
+               << "T const* __restrict__ A, T const* __restrict__ B, "
+               << "T* __restrict__ C, std::size_t const N) const {\n";
+      op << SP << SP << "auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op << SP << SP << "if (idx >= N) return;\n";
+      op << SP << SP << "C[idx] = " << Trait::Expr("A[idx]", "B[idx]") << ";\n";
+      op << SP << "}\n";
+      op << "};\n";
+      return op.str();
+   }
+
+   /// Emits the declaration of the kernel functor instance used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      std::string clean = "op_" + OpName;
+      return SP + Trait::KernelName() + "_" + clean + " logic_" + clean + "Kernel;\n";
+   }
+
+   /// Emits the work division, the kernel task and its enqueue call (nothing when constant-folded).
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";
+      std::string cleanOp = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ " << Trait::Name() << "_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << cleanOp
+          << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+          << ", logic_" << cleanOp << "Kernel"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNA << ")" << ", alpaka::getPtrNative(deviceBuf_" << fNB << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" << ", static_cast<Idx>(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << cleanOp << ");\n";
+      return out.str();
+   }
+
+   // NOTE(review): the expression below uses v for both operands, so it is not the binary operation; confirm no fusion pass relies on it.
+   bool IsElementwise() const override { return !fIsOutputConstant; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return Trait::Expr(v, v);   // not really meaningful for binary, but satisfy interface
+   }
+};
+
+/// Bitwise NOT operator: every output element is `~` applied to the matching input
+/// element. The output tensor is registered with the input's type and shape.
+template <typename T>
+class ROperator_BitwiseNot final : public ROperator {
+private:
+   std::string fNX;           // cleaned input tensor name
+   std::string fNY;           // cleaned output tensor name
+   std::vector<Dim> fShape;   // input shape, set by Initialize()
+
+public:
+   ROperator_BitwiseNot() {}
+
+   ROperator_BitwiseNot(std::string nameX, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      return input;
+   }
+
+   /// Throws std::runtime_error if the input tensor is unknown to the model.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("SOFIE BitwiseNot: input tensor '" + fNX + "' not found in model");
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+      if (model.Verbose())
+         std::cout << "BitwiseNot: " << fNX << " -> " << fNY
+                   << " " << ConvertDimShapeToString(fShape) << std::endl;
+   }
+
+   /// Emits the CPU loop. The length string is streamed verbatim: it may be a symbolic
+   /// expression for dynamic shapes, so no integer-literal suffix is appended to it.
+   std::string Generate(std::string /*OpName*/) override {
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ BITWISE_NOT\n";
+      out << SP << "for (std::size_t id = 0; id < " << length << "; ++id) {\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = ~tensor_" << fNX << "[id];\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   /// Emits a per-operator kernel functor; `~` promotes small integers, so the result is cast back to T.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      std::stringstream op;
+      op << "\n//------ BITWISE_NOT_KERNEL_ALPAKA\n";
+      op << "struct BitwiseNotKernel_" << OpName << " {\n";
+      op << SP << "template<typename TAcc, typename T>\n";
+      op << SP << "ALPAKA_FN_ACC void operator()(TAcc const& acc, "
+               << "T const* __restrict__ input, T* __restrict__ output, "
+               << "std::size_t const N) const {\n";
+      op << SP << SP << "auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op << SP << SP << "if (idx >= N) return;\n";
+      op << SP << SP << "output[idx] = static_cast<T>(~input[idx]);\n";
+      op << SP << "}\n";
+      op << "};\n";
+      return op.str();
+   }
+
+   /// Emits the declaration of the kernel functor instance used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      std::string clean = "op_" + OpName;
+      return SP + "BitwiseNotKernel_" + clean + " bitwiseNotKernel_" + clean + ";\n";
+   }
+
+   /// Emits the work division, the kernel task and its enqueue call.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      std::string cleanOp = "op_" + OpName;
+      auto length = ConvertDimShapeToLength(fShape);
+      std::stringstream out;
+      out << "\n//------ BITWISE_NOT_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << cleanOp
+          << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+          << ", bitwiseNotKernel_" << cleanOp
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", static_cast<Idx>(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << cleanOp << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() const override { return true; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "~" + v;
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_LOGIC
diff --git a/core/inc/SOFIE/ROperator_Not.hxx b/core/inc/SOFIE/ROperator_Not.hxx
new file mode 100644
index 0000000..4e42eca
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Not.hxx
@@ -0,0 +1,112 @@
+#ifndef TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT
+#define TMVA_EXPERIMENTAL_SOFIE_ROPERATOR_NOT
+
+#include <SOFIE/ROperator.hxx>
+#include <SOFIE/RModel.hxx>
+#include <SOFIE/SOFIE_common.hxx>
+
+
+namespace SOFIE {
+
+
+/// Logical NOT operator: every output element is `!` applied to the matching input
+/// element. The output tensor is registered with the input's type and shape.
+class ROperator_Not final : public ROperator {
+private:
+   std::string fNX;            // cleaned input tensor name
+   std::string fNY;            // cleaned output tensor name
+   std::vector<Dim> fShapeX;   // input shape, set by Initialize()
+   std::vector<Dim> fShapeY;   // output shape, a copy of fShapeX
+
+public:
+   ROperator_Not() {}
+
+   ROperator_Not(std::string nameX, std::string nameY)
+      : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+   {
+      fKind = OperatorKind::NOT;
+      fInputTensorNames =  { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   /// Throws std::runtime_error if the input tensor is unknown to the model.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found.");
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeY = fShapeX;
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+   }
+
+   /// Emits the CPU loop negating the input element by element.
+   std::string Generate(std::string opName) override {
+      std::string const label = "op_" + opName;
+      auto const count = ConvertDimShapeToLength(fShapeX);
+      std::stringstream out;
+      out << SP << "\n//---- Operator Not  " << label << "\n";
+      out << SP << "for (size_t i = 0; i < " << count << "; i++) {\n";
+      out << SP << SP << "tensor_" << fNY << "[i] = !tensor_" << fNX << "[i];\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   /// Emits the kernel functor; its text does not depend on the operator name.
+   /// Nothing is emitted when the output is flagged as constant.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      if (fIsOutputConstant) return "";
+
+      std::stringstream ks;
+      ks << "\n//------  NOT_KERNEL_ALPAKA\n";
+      ks << SP << "struct NotKernel {\n";
+      ks << SP << SP << "template<typename TAcc, typename T>\n";
+      ks << SP << SP << "ALPAKA_FN_ACC void operator()(\n";
+      ks << SP << SP << SP << "TAcc const & acc,\n";
+      ks << SP << SP << SP << "T const * data,\n";
+      ks << SP << SP << SP << "T * output,\n";
+      ks << SP << SP << SP << "std::size_t const length) const\n";
+      ks << SP << SP << "{\n";
+      ks << SP << SP << SP << "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      ks << SP << SP << SP << "if (idx < length) {\n";
+      ks << SP << SP << SP << SP << "output[idx] = !data[idx];\n";
+      ks << SP << SP << SP << "}\n";
+      ks << SP << SP << "}\n";
+      ks << SP << "};\n";
+      return ks.str();
+   }
+
+   /// Emits the declaration of the kernel functor instance used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return SP + "NotKernel notKernel;\n";
+   }
+
+   /// Emits the work division, the kernel task and its enqueue call; generated
+   /// locals are suffixed with the output tensor name or the operator name.
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      std::string const label = "op_" + opName;
+      auto const count = ConvertDimShapeToLength(fShapeX);
+      std::stringstream out;
+
+      out << "\n//------ " << label << "_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"   << fNY << " = Vec::all(Idx{" << count << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << label
+          << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+          << ", notKernel"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+          << ", " << count << ");\n";
+      out << SP << "alpaka::enqueue(queue, task_" << label << ");\n";
+      return out.str();
+   }
+
+   bool IsElementwise() const override { return !fIsOutputConstant; }
+   /// Returns the negation of the expression v as C++ source text.
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "!" + v;
+   }
+};
+
+} // namespace SOFIE
+
+#endif
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx b/core/inc/SOFIE/ROperator_Pad.hxx
similarity index 89%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx
rename to core/inc/SOFIE/ROperator_Pad.hxx
index dae3a5b..04365d8 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Pad.hxx
+++ b/core/inc/SOFIE/ROperator_Pad.hxx
@@ -61,13 +61,13 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Pad Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE Pad Op Input Tensor is not found in model");
       }
 
       fInputShape = model.GetTensorShape(fNX);
 
       if (fMode != EMode::kConstant) {
-         throw std::runtime_error("TMVA SOFIE Pad Op supports now only Constant mode");
+         throw std::runtime_error("SOFIE Pad Op supports now only Constant mode");
       }
 
       // get pads data
@@ -75,7 +75,7 @@ public:
       if (model.IsInitializedTensor(fNP)) {
          padsData = static_cast<int64_t*>(model.GetInitializedTensorData(fNP).get());
       } else {
-         throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Pads data");
+         throw std::runtime_error("SOFIE Pad Op supports now only initialized Pads data");
       }
       // get constant value
       fConstantValue = 0;
@@ -84,7 +84,7 @@ public:
             T * cData = static_cast<T*>(model.GetInitializedTensorData(fNCV).get());
             fConstantValue = cData[0];
          } else {
-            throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Constant Value  data");
+            throw std::runtime_error("SOFIE Pad Op supports now only initialized Constant Value  data");
          }
       }
       std::vector<int64_t> axes;
@@ -103,10 +103,10 @@ public:
                for (size_t i = 0; i < nax; i++)
                   axes[i] = data[i];
             }  else {
-               throw std::runtime_error("TMVA SOFIE Pad Op invalid input Axes type");
+               throw std::runtime_error("SOFIE Pad Op invalid input Axes type");
             }
          } else {
-            throw std::runtime_error("TMVA SOFIE Pad Op supports now only initialized Axes data");
+            throw std::runtime_error("SOFIE Pad Op supports now only initialized Axes data");
          }
       }
 
@@ -127,7 +127,7 @@ public:
             fPads[i].second = padsData[axesSize + i];
             int64_t outDim = static_cast<int64_t>(fOutputShape[i]) + fPads[i].first + fPads[i].second;
             if (outDim < 0)
-               throw std::runtime_error("TMVA SOFIE Pad Op : invalid Pads values");
+               throw std::runtime_error("SOFIE Pad Op : invalid Pads values");
             fOutputShape[i] = outDim;
          }
       }
@@ -149,7 +149,7 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fOutputShape.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator Pad called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Pad called to Generate without being initialized first");
       }
       std::stringstream out;
       auto inputStride = UTILITY::ComputeStrideFromShape(fInputShape);
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx b/core/inc/SOFIE/ROperator_Pool.hxx
similarity index 95%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx
rename to core/inc/SOFIE/ROperator_Pool.hxx
index e6fbc25..8e11271 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Pool.hxx
+++ b/core/inc/SOFIE/ROperator_Pool.hxx
@@ -76,7 +76,7 @@ public:
          fType = "float";
       } else {
          throw
-            std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Pool operator");
+            std::runtime_error("SOFIE Encountered unsupported type parsing a Pool operator");
       }
       fInputTensorNames = { fNX };
       fOutputTensorNames = { fNY };
@@ -94,19 +94,19 @@ public:
       // Where N is batch size, C : input  channels, H : input height, W = input width
       // or it can be [N, C, F1,F2,....FN] . Minimum dimension is 3
       if (input.size() != 1 ) {
-         throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference need 1 input tensor");
+         throw std::runtime_error("SOFIE" + Name() + "Op Shape inference need 1 input tensor");
       }
       if (input[0].size() < 3) {
-         throw std::runtime_error("TMVA SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions");
+         throw std::runtime_error("SOFIE" + Name() + "Op Shape inference only accept tensor with at least 3 dimensions");
       }
       // support only input tensors with dim = 3,4,5
       if (input[0].size() < 3 || input[0].size() >  5) {
-         throw std::runtime_error("TMVA SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported");
+         throw std::runtime_error("SOFIE" + Name() + "Op : tensors with dimension " + std::to_string(input[0].size()) + " are not yet supported");
       }
 
       if (input[0].size() -2 != fDim) {
          throw
-            std::runtime_error("TMVA SOFIE Pool Op Shape inference - invalid inputs ");
+            std::runtime_error("SOFIE Pool Op Shape inference - invalid inputs ");
       }
        // kernel shape
       size_t k1 = ((fAttrKernelShape.empty())? input[0][2] : fAttrKernelShape[0]);
@@ -156,7 +156,7 @@ public:
          }
       } else if (fAttrAutopad != "VALID") {
          throw
-            std::runtime_error("TMVA SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad);
+            std::runtime_error("SOFIE" + Name() + "Op invalid Autopad value : " + fAttrAutopad);
       }
       // to be sure pad is vector of size 6
       if (fDim < 3) fAttrPads.resize(6, 0);
@@ -204,13 +204,13 @@ public:
 
       if (!model.CheckIfTensorAlreadyExist(fNX)) {
          throw
-            std::runtime_error("TMVA SOFIE Pool op Input Tensor " + fNX + " is not found in model");
+            std::runtime_error("SOFIE Pool op Input Tensor " + fNX + " is not found in model");
       }
       fShapeX = model.GetTensorShape(fNX);
       if (fShapeX.size() < 3 || fShapeX.size()  > 5) {
          std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
          throw
-            std::runtime_error("TMVA SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
+            std::runtime_error("SOFIE Pool Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
       }
        fDim = fShapeX.size() - 2;
       // case of GlobalAveragePool. It is a pool case with kernel shape == image shape
@@ -267,7 +267,7 @@ public:
       OpName = "op_" + OpName;
 
       if (fShapeX.empty() || fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Pool Op called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Pool Op called to Generate without being initialized first");
       }
 
       std::stringstream out;
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx b/core/inc/SOFIE/ROperator_RNN.hxx
similarity index 98%
rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx
rename to core/inc/SOFIE/ROperator_RNN.hxx
index aed7bc1..3a0f58f 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.hxx
+++ b/core/inc/SOFIE/ROperator_RNN.hxx
@@ -91,7 +91,7 @@ template <typename T> class ROperator_RNN final : public ROperator {
          fType = "float";
       } else {
          throw std::runtime_error(
-             "TMVA SOFIE Encountered unsupported type parsing a RNN operator");
+             "SOFIE Encountered unsupported type parsing a RNN operator");
       }
 
       fInputTensorNames = { fNX, fNW, fNR };
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/core/inc/SOFIE/ROperator_RNN.icc
similarity index 96%
rename from src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc
rename to core/inc/SOFIE/ROperator_RNN.icc
index c03c1c2..467fda8 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc
+++ b/core/inc/SOFIE/ROperator_RNN.icc
@@ -1,7 +1,6 @@
 #ifndef SOFIE_ROPERATOR_RNN_I
 #define SOFIE_ROPERATOR_RNN_I
 
-
 namespace SOFIE {
 
 template <typename T>
@@ -39,40 +38,40 @@ auto ROperator_RNN<T>::Initialize(RModel& model)
    fUseSession = model.UseSession();
    // Check the input and output tensors
    if (!model.CheckIfTensorAlreadyExist(fNX)) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNX +
                                "  is not found in model.");
    }
    fShapeX = model.GetTensorShape(fNX);
    if (fShapeX.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNX +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNX +
                                " is not of 3 dimensions.");
    }
    if (!model.CheckIfTensorAlreadyExist(fNW)) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNW +
                                "  is not found in model.");
    }
    fShapeW = model.GetTensorShape(fNW);
    if (fShapeW.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNW +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNW +
                                " is not of 3 dimensions.");
    }
    if (!model.CheckIfTensorAlreadyExist(fNR)) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNR +
                                "  is not found in model.");
    }
    fShapeR = model.GetTensorShape(fNR);
    if (fShapeR.size() != 3) {
-      throw std::runtime_error("TMVA SOFIE RNN Op input tensor " + fNR +
+      throw std::runtime_error("SOFIE RNN Op input tensor " + fNR +
                                " is not of 3 dimensions.");
    }
    if (!fNB.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNB)) {
-         throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB +
+         throw std::runtime_error("SOFIE RNN op input tensor " + fNB +
                                   " is not  found in model.");
       }
       fShapeB = model.GetTensorShape(fNB);
       if (fShapeB.size() != 2 && fShapeB.size() != 4) {
-         throw std::runtime_error("TMVA SOFIE RNN op input tensor " + fNB +
+         throw std::runtime_error("SOFIE RNN op input tensor " + fNB +
                                   " is not of 2 or 4 dimensions.");
       }
       if (fShapeB.size() == 2) {
@@ -112,23 +111,23 @@ auto ROperator_RNN<T>::Initialize(RModel& model)
    }
    if (!fNSequence_lens.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNSequence_lens)) {
-         throw std::runtime_error("TMVA SOFIE RNN Op input tensor " +
+         throw std::runtime_error("SOFIE RNN Op input tensor " +
                                   fNSequence_lens + "is not found in model.");
       }
       fShapeSequence_lens = model.GetTensorShape(fNSequence_lens);
       if (fShapeSequence_lens.size() != 1) {
-         throw std::runtime_error("TMVA SOFIE RNN Op input tensor " +
+         throw std::runtime_error("SOFIE RNN Op input tensor " +
                                   fNSequence_lens + " is not of 1 dimension.");
       }
    }
    if (!fNInitial_h.empty()) {
       if (!model.CheckIfTensorAlreadyExist(fNInitial_h)) {
-         throw std::runtime_error("TMVA SOFIE RNN Op input tensor " +
+         throw std::runtime_error("SOFIE RNN Op input tensor " +
                                   fNInitial_h + " is not found in model.");
       }
       fShapeInitial_h = model.GetTensorShape(fNInitial_h);
       if (fShapeInitial_h.size() != 3) {
-         throw std::runtime_error("TMVA SOFIE RNN Op input tensor " +
+         throw std::runtime_error("SOFIE RNN Op input tensor " +
                                   fNInitial_h + " is not of 3 dimensions.");
       }
    }
@@ -153,24 +152,24 @@ auto ROperator_RNN<T>::Initialize(RModel& model)
           activation != "ScaledTanh" && activation != "HardSigmoid" &&
           activation != "Elu" && activation != "Softsign" &&
           activation != "Softplus") {
-         throw std::runtime_error("TMVA SOFIE - Activation function " +
+         throw std::runtime_error("SOFIE - Activation function " +
                                   activation + " not implemented");
       }
    }
    if (fAttrDirection != "forward" && fAttrDirection != "backward" &&
        fAttrDirection != "bidirectional") {
       throw std::runtime_error(
-          "TMVA SOFIE - Invalid RNN direction fAttrDirection = " +
+          "SOFIE - Invalid RNN direction fAttrDirection = " +
           fAttrDirection);
    }
    if (fAttrHiddenSize != fShapeW[1]) {
       throw std::runtime_error(
-          "TMVA SOFIE - fAttrHiddenSize must be equal to " +
+          "SOFIE - fAttrHiddenSize must be equal to " +
           std::to_string(fShapeW[1]));
    }
    if (fAttrLayout > 1) {
       throw std::runtime_error(
-          "TMVA SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) +
+          "SOFIE - Layout fAttrLayout = " + std::to_string(fAttrLayout) +
           " must be 0 (timewise) or 1 (batchwise)");
    }
    if (fAttrActivations.empty()) {
@@ -230,7 +229,7 @@ auto ROperator_RNN<T>::Generate(std::string OpName)
    // set the input
    if (fAttrLayout == 0) {
       if (fType == "float") {
-         out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n";
+         out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n";
       }
    } else {
       if (fUseSession)
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx b/core/inc/SOFIE/ROperator_Random.hxx
similarity index 95%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx
rename to core/inc/SOFIE/ROperator_Random.hxx
index cde08b5..0de1cd9 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Random.hxx
+++ b/core/inc/SOFIE/ROperator_Random.hxx
@@ -121,13 +121,13 @@ public:
       if (fUseROOT) {
          if (fMode == kNormal) {
             if (fParams.count("mean") == 0 || fParams.count("scale") == 0)
-               throw std::runtime_error("TMVA SOFIE RandomNormal op : no mean or scale are defined");
+               throw std::runtime_error("SOFIE RandomNormal op : no mean or scale are defined");
             float mean = fParams["mean"];
             float scale = fParams["scale"];
             out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Gaus(" << mean << "," << scale << ");\n";
          } else if (fMode == kUniform) {
             if (fParams.count("high") == 0 || fParams.count("low") == 0)
-              throw std::runtime_error("TMVA SOFIE RandomUniform op : no low or high are defined");
+              throw std::runtime_error("SOFIE RandomUniform op : no low or high are defined");
             float high = fParams["high"];
             float low = fParams["low"];
             out << SP << SP << "tensor_" << fNY << "[i] = fRndmEngine->Uniform(" << low << "," << high << ");\n";
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/core/inc/SOFIE/ROperator_Range.hxx
similarity index 84%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx
rename to core/inc/SOFIE/ROperator_Range.hxx
index 8af272d..8ea17d9 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx
+++ b/core/inc/SOFIE/ROperator_Range.hxx
@@ -8,7 +8,6 @@
 #include <sstream>
 #include <algorithm>
 
-
 namespace SOFIE{
 
 template <typename T>
@@ -51,15 +50,15 @@ public:
        //input must be a graph input, or already initialized intermediate tensor
       if (!model.CheckIfTensorAlreadyExist(fNStart)) {
          throw
-            std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNStart + "is not found in model");
+            std::runtime_error("SOFIE Range Op Input Tensor " + fNStart + "is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNLimit)) {
          throw
-            std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNLimit + "is not found in model");
+            std::runtime_error("SOFIE Range Op Input Tensor " + fNLimit + "is not found in model");
       }
       if (!model.CheckIfTensorAlreadyExist(fNDelta)) {
          throw
-            std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model");
+            std::runtime_error("SOFIE Range Op Input Tensor " + fNDelta + "is not found in model");
       }
       ETensorType type = ConvertStringToType(fType);
       if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) {
@@ -67,7 +66,7 @@ public:
          T * limit = static_cast<T*>(model.GetInitializedTensorData(fNLimit).get());
          T * delta = static_cast<T*>(model.GetInitializedTensorData(fNDelta).get());
          if (!start || !delta || !limit)
-            std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data");
+            throw std::runtime_error("SOFIE Range Op Input Tensor has invalid input data");
          T a = *start;
          T b = *limit;
          T d = *delta;
@@ -89,9 +88,9 @@ public:
          model.AddDynamicTensor(fNOutput, type, fShape);
       }
       if (model.Verbose()) {
-         std::cout << "Range -> output is " << fNOutput << " ";
-         if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl;
-         else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl;
+         std::cout << "Range -> output is " << fNOutput << " : " << ConvertDimShapeToString(fShape);
+         if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData<T>(fNOutput));
+         std::cout << std::endl;
       }
    }
 
@@ -103,7 +102,7 @@ public:
 
       OpName = "op_" + OpName;
       if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Range operator called to Generate without being initialized first");
       }
 
       std::string sizeName = fShape[0].param;
@@ -121,5 +120,5 @@ public:
 };
 
 }//SOFIE
-
+   
 #endif //SOFIE_ROPERATOR_RANGE
diff --git a/core/inc/SOFIE/ROperator_Reduce.hxx b/core/inc/SOFIE/ROperator_Reduce.hxx
new file mode 100644
index 0000000..f3e7170
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Reduce.hxx
@@ -0,0 +1,475 @@
+#ifndef SOFIE_ROPERATOR_Reduce
+#define SOFIE_ROPERATOR_Reduce
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <memory>
+#include <sstream>
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include <cassert>
+
+
+namespace SOFIE{
+
+enum EReduceOpMode { ReduceMean, ReduceSum, ReduceSumSquare, ReduceProd, ReduceL2, ReduceMax, InvalidReduceOp }; // reduction kind, passed as template argument Op of ROperator_Reduce
+
+template <typename T, EReduceOpMode Op>
+class ROperator_Reduce final : public ROperator
+{
+private:
+    /* Attributes*/
+    int fkeepdims = 1; //default value
+    std::vector<int64_t> fAttrAxes;
+    EReduceOpMode fReduceOpMode;
+    std::string fNX;
+    std::string fNAxes;
+    std::string fNY;
+    std::vector<size_t> fShapeX;
+    std::vector<size_t> fShapeY;
+    std::vector<size_t> fShapeYNotPruned; // needed for fKeepdims=0
+
+
+public:
+
+   std::string Name() { // printable name of the reduction stored in fReduceOpMode
+      if (fReduceOpMode == ReduceMean)           return "ReduceMean";
+      else if (fReduceOpMode == ReduceSumSquare) return "ReduceSumSquare";
+      else if (fReduceOpMode == ReduceProd)      return "ReduceProd";
+      else if (fReduceOpMode == ReduceSum)       return "ReduceSum";
+      else if (fReduceOpMode == ReduceL2)        return "ReduceL2";
+      else if (fReduceOpMode == ReduceMax)       return "ReduceMax";
+      return "Invalid"; // fReduceOpMode matched none of the modes above (e.g. InvalidReduceOp)
+   }
+
+   std::vector<std::string> GetStdLibs() override { // standard headers requested by this operator, depending on the mode
+      if (fReduceOpMode == ReduceL2)
+         return { std::string("cmath") };   // the code emitted for ReduceL2 calls std::sqrt
+      if (fReduceOpMode == ReduceMax)
+         return { std::string("limits") };  // the code emitted for ReduceMax uses std::numeric_limits
+      return {};
+   }
+
+   ROperator_Reduce() : fReduceOpMode(Op) {} // mode always comes from the template argument, so Name()/GetStdLibs() never read an indeterminate value
+   ROperator_Reduce(int keepdims, std::vector<int64_t> attrAxes, std::string nameX, std::string nameAxes, std::string nameY):
+   fkeepdims(keepdims), fAttrAxes(attrAxes), fNX(UTILITY::Clean_name(nameX)), fNAxes(UTILITY::Clean_name(nameAxes)), fNY(UTILITY::Clean_name(nameY)) {
+      fReduceOpMode = Op;
+      // the axes tensor name may be empty; it is registered as an input only when it is not
+      fInputTensorNames = { fNX };
+      if(!fNAxes.empty()){
+         fInputTensorNames.emplace_back(fNAxes);
+      }
+
+      fOutputTensorNames = { fNY };
+   }
+
+   // Output element types: the input types are returned unchanged.
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // Output shape for input[0]: reduced axes become 1, or are removed when fkeepdims == 0. Also normalises fAttrAxes in place and sets fShapeYNotPruned.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; // work on a copy; only ret[0] is modified below
+      auto & outputShape = ret[0];
+      for (size_t j = 0; j < fAttrAxes.size(); j++) {
+         if (fAttrAxes[j] < 0) fAttrAxes[j] += outputShape.size(); // a negative axis counts from the end; the member itself is updated
+         if (fAttrAxes[j] < 0 || (size_t) fAttrAxes[j] >= outputShape.size() )
+            throw std::runtime_error("SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j]));
+         // a reduced dimension has extent 1 in the output
+         outputShape[fAttrAxes[j]] = 1;
+      }
+      fShapeYNotPruned = outputShape; // shape with the reduced axes kept as 1, saved before any pruning
+      // fkeepdims == 0: drop the reduced axes; a sorted copy of the axes is used so indices can be shifted as entries are erased
+      if (fkeepdims == 0) {
+         auto ax = fAttrAxes;
+         std::sort(ax.begin(), ax.end());
+         for (size_t j = 0; j < ax.size(); j++) {
+            // erase the reduced dimension, but never the last remaining one
+            if (outputShape.size() > 1) {
+               outputShape.erase(outputShape.begin() + ax[j]);
+               for (size_t k = j+1; k < ax.size(); k++)
+                  ax[k] -= 1;  // the remaining axes move down by one after the erase
+            }
+         }
+      }
+      return ret;
+   }
+   void Initialize(RModel& model) override {
+      // Resolves the input shape and the reduction axes, then registers the output tensor fNY in the model.
+      fUseSession = model.UseSession();
+
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         // input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("SOFIE Reduce Op Input Tensor " + fNX + " is not found in model");
+      }
+      fShapeX = model.GetTensorShape(fNX);
+      // axes given as a tensor: its initialized data is read as int64_t values and replaces fAttrAxes
+      if (!fNAxes.empty()) {
+         auto ax_shptr = model.GetInitializedTensorData(fNAxes);
+         auto ax_ptr = static_cast<int64_t *>(ax_shptr.get());
+         auto ax_shape = model.GetTensorShape(fNAxes);
+         size_t ax_length = ConvertShapeToLength(ax_shape);
+         fAttrAxes = std::vector<int64_t>(ax_ptr, ax_ptr+ax_length);
+      } else if (fAttrAxes.empty()) {
+         // neither an axes tensor nor axes attribute: reduce over every axis of the input
+         fAttrAxes.resize(fShapeX.size());
+         for (size_t i = 0; i < fAttrAxes.size(); i++)
+            fAttrAxes[i] = i;
+      }
+      // find shape of Y (this also normalises fAttrAxes and fills fShapeYNotPruned) and add it to the intermediate tensors
+      fShapeY = ShapeInference({fShapeX})[0];
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+      if (model.Verbose()){
+         std::cout << Name() << " : " << fNX << " -> " << fNY << " shape " << ConvertShapeToString(fShapeY) << std::endl;
+      }
+      model.AddNeededStdLib("algorithm"); // some of the code emitted by Generate calls std::fill
+   }
+
+   std::string Generate(std::string opName) override { // builds and returns the C++ source text of the reduction loops
+      opName = "op_" + opName;
+      if (fShapeX.empty() || fShapeY.empty()) {
+         throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first");
+      }
+
+      size_t inputLength = SOFIE::ConvertShapeToLength(fShapeX);
+      size_t outputLength = SOFIE::ConvertShapeToLength(fShapeY);
+
+      auto inputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeX);
+      // strides of the output shape before pruning (reduced axes kept with extent 1)
+      auto outputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeYNotPruned);
+
+      // Index arithmetic used by the general (kMiddle) branch below:
+      // a flat index i is decomposed per axis as
+      // i0 =  i / stride0  % shape0; i1 = i / stride1 % shape1 and so on
+      // and we have for the inverse
+      // i = i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3 ....
+
+      // dividing by the last stride s[n-1] would not be needed since it is 1 by definition
+
+      std::stringstream out;
+      out << "\n//----  operator " << Name() << "  " << opName << "\n";
+      // Classify the reduced axes: all trailing (kLast), all leading (kFirst) or anything else (kMiddle); the first two get a faster implementation
+      enum EReduceDim {kFirst, kLast, kMiddle};
+      EReduceDim reduceDims = kLast;
+      int kmin = fShapeX.size()-fAttrAxes.size();
+      for (int k = fShapeX.size()-1; k >= kmin; k--) {
+         // axis k is among the last fAttrAxes.size() axes but is not reduced -> not kLast
+         if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
+            reduceDims = kMiddle;
+            break;
+         }
+      }
+      if (reduceDims == kMiddle) {
+         reduceDims = kFirst;
+         // check if the reduced axes are the leading ones
+         for (size_t k = 0; k < fAttrAxes.size(); k++) {
+            // axis k is among the first fAttrAxes.size() axes but is not reduced -> not kFirst
+            if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
+               reduceDims = kMiddle;
+               break;
+            }
+         }
+      }
+      size_t reducedLength = inputLength / outputLength;
+      if (reduceDims == kLast) {
+         //std::cout << "reduction for operator " << opName << " is last" << std::endl;
+         // reduced elements of output i are contiguous in the input (index i * reducedLength + j),
+         // so a single pass is emitted: outer loop on the outputs, inner loop on the reduced elements;
+         // each output is initialised right before its inner loop
+
+         // outer loop on the output elements
+         out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n";
+         // initial value of the accumulator, then inner loop on the reduced elements
+         if (fReduceOpMode == ReduceProd)
+            out << SP << SP << "tensor_" << fNY << "[i] = 1;\n";
+         else if (fReduceOpMode == ReduceMax)
+            out << SP << SP << "tensor_" << fNY << "[i] = std::numeric_limits<float>::lowest();\n";
+         else
+            out << SP << SP << "tensor_" << fNY << "[i] = 0;\n";
+         out << SP << SP << "for (size_t j = 0; j < " << reducedLength << "; j++) {\n";
+
+         if (fReduceOpMode == ReduceProd)
+            out << SP << SP << SP <<  "tensor_" << fNY << "[i] *= tensor_" << fNX << "[i * " << reducedLength << " + j];\n";
+         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
+            out << SP << SP << SP <<  "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j];\n";
+         else if(fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2)
+            out << SP << SP << SP <<  "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j] * tensor_"
+                                    << fNX << "[i * " << reducedLength << " + j];\n";
+         else if (fReduceOpMode == ReduceMax)
+            out << SP << SP << SP << "if (tensor_" << fNX << "[i * " << reducedLength << " + j] > tensor_" << fNY << "[i])\n"
+                << SP << SP << SP << SP << "tensor_" << fNY << "[i] = tensor_" << fNX << "[i * " << reducedLength << " + j];\n";
+         out << SP << SP << "}\n"; // end j loop
+         if(fReduceOpMode == ReduceMean)
+            out << SP << SP << "tensor_" << fNY << "[i] /= static_cast<float>(" << reducedLength << ");\n";
+         else if (fReduceOpMode == ReduceL2)
+            out << SP << SP << "tensor_" << fNY << "[i] = std::sqrt(tensor_" << fNY << "[i]);\n";
+
+         out << SP << "}\n"; // end i loop
+      } else if (reduceDims == kFirst) {
+         //std::cout << "reduction for operator " << opName << " is first" << std::endl;
+         // reduced axes are the leading ones: input element (i, j) is read at index i * outputLength + j
+         // reset output tensors
+         if (fReduceOpMode == ReduceProd)
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n";
+         else if (fReduceOpMode == ReduceMax)
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength
+                      << ", std::numeric_limits<float>::lowest());\n";
+         else
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 0);\n";
+
+         out << SP << "for (size_t i = 0; i < " << reducedLength << "; i++) {\n";
+         out << SP << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n";
+
+         if (fReduceOpMode == ReduceProd)
+            out << SP << SP << SP << "tensor_" << fNY << "[j] *= tensor_" << fNX << "[i * " << outputLength << " + j];\n";
+         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
+            out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j];\n";
+         else if(fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2)
+            out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j] * tensor_"
+                                    << fNX << "[i * " << outputLength << " + j];\n";
+         else if (fReduceOpMode == ReduceMax)
+            out << SP << SP << SP << "if (tensor_" << fNX << "[i * " << outputLength << " + j] > tensor_" << fNY << "[j])\n"
+                << SP << SP << SP << SP << "tensor_" << fNY << "[j] = tensor_" << fNX << "[i * " << outputLength << " + j];\n";
+         out << SP << SP << "}\n"; // end j loop
+         out << SP  << "}\n"; // end i loop
+         if(fReduceOpMode == ReduceMean) {
+            out << SP  << "for (size_t j = 0; j < " << outputLength << "; j++) {\n";
+            out << SP << SP << "tensor_" << fNY << "[j] /= static_cast<float>(" << reducedLength << ");\n";
+            out << SP << "}\n"; // end j loop
+         } else if (fReduceOpMode == ReduceL2) {
+            out << SP  << "for (size_t j = 0; j < " << outputLength << "; j++) {\n";
+            out << SP << SP << "tensor_" << fNY << "[j] = std::sqrt(tensor_" << fNY << "[j]);\n";
+            out << SP << "}\n"; // end j loop
+         }
+      }
+      else
+      { // general case: the output index is rebuilt from the coordinates of the non-reduced axes
+         //std::cout << "reduction for operator " << opName << " is middle" << std::endl;
+         // reset output tensors
+         if (fReduceOpMode == ReduceProd)
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n";
+         else if (fReduceOpMode == ReduceMax)
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength
+                      << ", std::numeric_limits<float>::lowest());\n";
+         else
+            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ",0);\n";
+
+         out << SP << "for (size_t i = 0; i < " << inputLength << "; i++) {\n";
+
+         size_t dim = fShapeX.size(); // this is the input dimension (e.g. 2, 3 or 4 or more)
+
+         // emit the computation of the output index for input element i
+         out << SP << SP << "size_t outputIndex = 0;\n";
+         for (size_t k = 0; k < dim; k++) {
+            if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
+               // only the non-reduced axes contribute to the output index
+               out << SP << SP << "size_t i_" << k << " = i / " << inputStrides[k] << " % " << fShapeX[k] << ";\n";
+               out << SP << SP << "outputIndex += i_" << k << " * " << outputStrides[k] << ";\n";
+            }
+         }
+         // now emit the accumulation of input element i into its output element
+         out << SP << SP << "// compute reduction....\n";
+         if (fReduceOpMode == ReduceProd)
+            out << SP << SP << "tensor_" << fNY << "[outputIndex] *= tensor_" << fNX << "[i];\n";
+         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
+            out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i];\n";
+         else if (fReduceOpMode == ReduceSumSquare || fReduceOpMode == ReduceL2) {
+            out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i] * tensor_" << fNX
+                << "[i];\n";
+         } else if (fReduceOpMode == ReduceMax) {
+            out << SP << SP << "if (tensor_" << fNX << "[i] > tensor_" << fNY << "[outputIndex])\n";
+            out << SP << SP << SP << "tensor_" << fNY << "[outputIndex] = tensor_" << fNX << "[i];\n";
+         }
+         out << SP << "}\n"; // end loop on input elements
+         // post-processing pass: divide by the element count for the mean, take the square root for L2
+         if (fReduceOpMode == ReduceMean) {
+            out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n";
+            out << SP << SP << "tensor_" << fNY << "[i] /= static_cast<float>(" << reducedLength << ");\n";
+            out << SP << "}\n";
+         } else if (fReduceOpMode == ReduceL2) {
+            out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n";
+            out << SP << SP << "tensor_" << fNY << "[i] = std::sqrt(tensor_" << fNY << "[i]);\n";
+            out << SP << "}\n";
+         }
+      }
+
+      return out.str();
+   }
+
+   // ---------------------------------------------------------------------------
+   // GPU kernel: one block per output element, 256 threads cooperatively reduce
+   // the slice via shared-memory tree reduction.
+   // This replaces the previous naive "one thread per output element" approach
+   // which serialised the entire reduction loop inside a single thread.
+   // ---------------------------------------------------------------------------
+   // Builds the source text of the alpaka kernel functor `ReduceKernel_<Name>_<output>`.
+   // In the emitted operator() each block handles one output element: every thread
+   // accumulates the slice elements r = thread_id, thread_id + 256, ..., the partial
+   // results are merged through a 256-slot shared-memory tree and thread 0 stores the
+   // final value (after the mean / sqrt post-processing where the mode requires it).
+   // Throws std::runtime_error when the operator shapes have not been set.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      if (fShapeX.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first");
+
+      const std::size_t Dx            = fShapeX.size();
+      const auto inputStrides         = UTILITY::ComputeStrideFromShape(fShapeX);
+      const auto outputStrides        = UTILITY::ComputeStrideFromShape(fShapeYNotPruned);
+      const std::size_t inputLength   = ConvertShapeToLength(fShapeX);
+      const std::size_t outputLength  = ConvertShapeToLength(fShapeY);
+      const std::size_t reducedLength = inputLength / outputLength;
+
+      // isReduced[d] tells whether axis d is listed in fAttrAxes; redAxes holds the
+      // reduced axes in increasing order. The mask replaces repeated linear searches.
+      std::vector<bool> isReduced(Dx, false);
+      std::vector<std::size_t> redAxes;
+      for (std::size_t d = 0; d < Dx; ++d) {
+         if (std::find(fAttrAxes.begin(), fAttrAxes.end(), (int64_t)d) != fAttrAxes.end()) {
+            isReduced[d] = true;
+            redAxes.push_back(d);
+         }
+      }
+
+      // Row-major strides used to split the flat reduction index r into per-axis
+      // coordinates: redStrides[i] = product of fShapeX[redAxes[j]] for j > i.
+      std::vector<std::size_t> redStrides(redAxes.size(), 1);
+      for (int ri = (int)redAxes.size() - 2; ri >= 0; --ri)
+         redStrides[ri] = redStrides[ri + 1] * fShapeX[redAxes[ri + 1]];
+
+      // Indentation prefixes of the emitted code.
+      const std::string ind2 = SP + SP;
+      const std::string ind3 = ind2 + SP;
+      const std::string ind4 = ind3 + SP;
+      const std::string ind5 = ind4 + SP;
+
+      const std::string kname = "ReduceKernel_" + Name() + "_" + fNY;
+
+      std::string op;
+      op  = "\n//------ " + Name() + "_KERNEL_ALPAKA (block parallel reduction)\n";
+      op += SP + "struct " + kname + " {\n";
+      op += ind2 + "template<typename TAcc, typename T>\n";
+      op += ind2 + "ALPAKA_FN_ACC void operator()(\n";
+      op += ind3 + "TAcc const& acc,\n";
+      op += ind3 + "T const* __restrict__ input,\n";
+      op += ind3 + "T* __restrict__ output,\n";
+      op += ind3 + "std::size_t const reducedLength,\n";
+      op += ind3 + "std::size_t const outputLength) const {\n\n";
+
+      // ---- shared memory: 256 slots, the same constant used as thread stride below ----
+      op += ind3 + "auto& shmem = alpaka::declareSharedVar<T[256], __COUNTER__>(acc);\n\n";
+
+      // ---- block/thread addressing: block index selects the output element ----
+      op += ind3 + "auto const out_idx   = alpaka::getIdx<alpaka::Grid,  alpaka::Blocks  >(acc)[0];\n";
+      op += ind3 + "auto const thread_id = alpaka::getIdx<alpaka::Block, alpaka::Threads >(acc)[0];\n";
+      op += ind3 + "if (out_idx >= outputLength) return;\n\n";
+
+      // ---- decode the coordinates of the non-reduced axes from out_idx ----
+      for (std::size_t d = 0; d < Dx; ++d) {
+         if (isReduced[d])
+            continue;
+         op += ind3 + "std::size_t const oy_" + std::to_string(d)
+               + " = (out_idx / " + std::to_string(outputStrides[d]) + "u) % "
+               + std::to_string(fShapeYNotPruned[d]) + "u;\n";
+      }
+      op += "\n";
+
+      // ---- per-thread partial accumulation, starting from the neutral element ----
+      std::string startVal;
+      if (Op == ReduceProd)       startVal = "static_cast<T>(1)";
+      else if (Op == ReduceMax)   startVal = "std::numeric_limits<T>::lowest()";
+      else                        startVal = "static_cast<T>(0)";
+      op += ind3 + "T partial = " + startVal + ";\n";
+      op += ind3 + "for (std::size_t r = thread_id; r < reducedLength; r += 256u) {\n";
+
+      // Decode the flat reduction index r into one coordinate per reduced axis.
+      for (std::size_t ri = 0; ri < redAxes.size(); ++ri) {
+         const std::size_t rd = redAxes[ri];
+         op += ind4 + "std::size_t const r_" + std::to_string(rd)
+               + " = (r / " + std::to_string(redStrides[ri]) + "u) % "
+               + std::to_string(fShapeX[rd]) + "u;\n";
+      }
+
+      // Flat input index: sum over all axes of coordinate * input stride.
+      op += ind4 + "std::size_t const in_idx =\n";
+      for (std::size_t d = 0; d < Dx; ++d) {
+         const std::string coord = (isReduced[d] ? "r_" : "oy_") + std::to_string(d);
+         op += ind5 + coord + " * " + std::to_string(inputStrides[d]) + "u";
+         op += (d + 1 < Dx) ? " +\n" : ";\n";
+      }
+
+      // Accumulation step of the selected reduction mode.
+      if (Op == ReduceProd)
+         op += ind4 + "partial *= input[in_idx];\n";
+      else if (Op == ReduceSum || Op == ReduceMean)
+         op += ind4 + "partial += input[in_idx];\n";
+      else if (Op == ReduceSumSquare || Op == ReduceL2)
+         op += ind4 + "partial += input[in_idx] * input[in_idx];\n";
+      else if (Op == ReduceMax)
+         op += ind4 + "if (input[in_idx] > partial) partial = input[in_idx];\n";
+
+      op += ind3 + "}\n\n"; // end thread-stride loop
+
+      // ---- publish the partial result and synchronise the block ----
+      op += ind3 + "shmem[thread_id] = partial;\n";
+      op += ind3 + "alpaka::syncBlockThreads(acc);\n\n";
+
+      // ---- binary tree reduction over the 256 shared-memory slots ----
+      op += ind3 + "for (std::size_t s = 128u; s > 0u; s >>= 1u) {\n";
+      op += ind4 + "if (thread_id < s) {\n";
+      if (Op == ReduceProd)
+         op += ind5 + "shmem[thread_id] *= shmem[thread_id + s];\n";
+      else if (Op == ReduceMax)
+         op += ind5 + "if (shmem[thread_id + s] > shmem[thread_id]) shmem[thread_id] = shmem[thread_id + s];\n";
+      else
+         op += ind5 + "shmem[thread_id] += shmem[thread_id + s];\n";
+      op += ind4 + "}\n";
+      op += ind4 + "alpaka::syncBlockThreads(acc);\n";
+      op += ind3 + "}\n\n";
+
+      // ---- thread 0 applies the post-processing and writes the result ----
+      op += ind3 + "if (thread_id == 0u) {\n";
+      op += ind4 + "T result = shmem[0];\n";
+      if (Op == ReduceMean)
+         op += ind4 + "result /= static_cast<T>(" + std::to_string(reducedLength) + "u);\n";
+      else if (Op == ReduceL2)
+         op += ind4 + "result = std::sqrt(result);\n";
+      op += ind4 + "output[out_idx] = result;\n";
+      op += ind3 + "}\n";
+
+      op += ind2 + "}\n"; // end operator()
+      op += SP + "};\n";  // end struct
+      return op;
+   }
+
+   // Emits the declaration of the kernel functor instance, i.e. the line
+   // "ReduceKernel_<Name>_<output> reduceKernel_<Name>_<output>;" indented by SP.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      const std::string suffix = Name() + "_" + fNY;
+      std::string decl = SP;
+      decl += "ReduceKernel_" + suffix;
+      decl += " reduceKernel_" + suffix;
+      decl += ";\n";
+      return decl;
+   }
+
+   // Emits the launch code of the reduction kernel: a work division with one block
+   // per output element and 256 threads per block, followed by the alpaka::exec call
+   // that passes the input/output device pointers and the two lengths.
+   // Throws std::runtime_error when the operator shapes have not been set.
+   std::string Generate_GPU_ALPAKA(std::string /*opName*/) override {
+      if (fShapeX.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Reduce Op called to Generate without being initialized first");
+
+      const std::size_t nInput   = ConvertShapeToLength(fShapeX);
+      const std::size_t nOutput  = ConvertShapeToLength(fShapeY);
+      const std::size_t nReduced = nInput / nOutput;
+
+      const std::string nOutputStr = std::to_string(nOutput);
+      const std::string functor    = "reduceKernel_" + Name() + "_" + fNY;
+      const std::string workDiv    = "workDiv_" + fNY;
+      const std::string indent2    = SP + SP;
+
+      std::string code = "\n//------ " + Name() + "_GPU_ALPAKA\n";
+
+      // Work division: grid extent, block extent, elements per thread.
+      code += SP + "alpaka::WorkDivMembers<Dim, Idx> " + workDiv + "(\n";
+      code += indent2 + "Vec::all(Idx{" + nOutputStr + "u}),\n";
+      code += indent2 + "Vec::all(Idx{256u}),\n";
+      code += indent2 + "Vec::all(Idx{1u}));\n";
+
+      // Kernel launch on the session queue.
+      code += SP + "alpaka::exec<Acc>(queue, " + workDiv;
+      code += ", " + functor;
+      code += ", alpaka::getPtrNative(deviceBuf_" + fNX + ")";
+      code += ", alpaka::getPtrNative(deviceBuf_" + fNY + ")";
+      code += ", static_cast<std::size_t>(" + std::to_string(nReduced) + "u)";
+      code += ", static_cast<std::size_t>(" + nOutputStr + "u));\n";
+
+      return code;
+   }
+
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROPERATOR_Reduce
+
diff --git a/core/inc/SOFIE/ROperator_Relu.hxx b/core/inc/SOFIE/ROperator_Relu.hxx
new file mode 100644
index 0000000..96d5931
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Relu.hxx
@@ -0,0 +1,130 @@
+#ifndef SOFIE_ROPERATOR_RELU
+#define SOFIE_ROPERATOR_RELU
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+// Code generator for the element-wise Relu operator: the output tensor receives
+// the input value where it is positive and zero elsewhere. The class emits a CPU
+// loop (Generate) and an alpaka kernel plus its launch code (the *_ALPAKA methods).
+template <typename T>
+class ROperator_Relu final : public ROperator
+{
+
+private:
+
+   std::string fNX;          // cleaned name of the input tensor
+   std::string fNY;          // cleaned name of the output tensor
+   std::vector<Dim> fShape;  // input shape, set in Initialize (output has the same shape)
+
+public:
+   ROperator_Relu(){}
+   // Stores the cleaned tensor names and registers them as the operator's
+   // single input and single output.
+   ROperator_Relu(std::string nameX, std::string nameY):
+      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
+         fKind = OperatorKind::RELU;
+         fInputTensorNames = { fNX };
+         fOutputTensorNames = { fNY };
+      }
+
+   // The output type equals the input type.
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // The output shape equals the input shape.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; //suggest copy to compiler
+      return ret;
+   }
+
+   // Reads the input shape from the model and registers the output as an
+   // intermediate tensor with the same type and shape.
+   // Throws std::runtime_error when the input tensor is unknown to the model.
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("SOFIE Relu Op Input Tensor " + fNX + " is not found in model");
+      }
+
+      fShape = model.GetDimTensorShape(fNX);
+
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+      if (model.Verbose()) {
+         std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl;
+      }
+   }
+
+
+   // Emits the CPU implementation: a loop over all elements applying Relu.
+   // Throws std::runtime_error when Initialize has not set the shape.
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ RELU\n";
+      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   // Emits the `ReluKernel` functor: one thread per element, guarded by the
+   // element count. Marked `override` like the other generator hooks of this class.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ RELU_KERNEL_ALPAKA\n";
+      op += "struct ReluKernel {\n";
+      op += SP + "template<typename TAcc, typename T>\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n";
+      op += SP + SP + SP + "auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (idx < numElements) {\n";
+      // both branches are written as T so the expression matches GetElementwiseExpr
+      op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : T(0);\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+
+   // Emits the declaration of the kernel functor instance used by the launch code.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return SP + "ReluKernel reluKernel;\n";
+   }
+
+   // Emits the launch code: work division over all elements, kernel task creation
+   // with the input/output device pointers, and the enqueue on the session queue.
+   // Throws std::runtime_error when Initialize has not set the shape.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Relu called to Generate without being initialized first");
+      }
+
+      std::stringstream out;
+      auto length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ RELU_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_"<<fNY<<" = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"<<fNY<<" = Vec::all(Idx{"<< length << "});\n";
+      out << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+         << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX
+         << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+   // Name of the tensor produced by this operator.
+   std::string GetFusableOutputTensorName() override {
+         return fNY;
+   }
+
+   bool IsElementwise() const override { return true; }
+   // Relu written as an expression over `v`, for callers that inline it.
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "(" + v + ") >= T(0) ? (" + v + ") : T(0)";
+   }
+
+   // Points both input and output at `fusable_tensor_name` after passing the old
+   // names to `removal_func`.
+   // NOTE(review): unlike the neighbouring hooks this method is not marked
+   // `override`; confirm whether the base class declares it virtual.
+   void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function<void(const std::string&)>& removal_func){
+      removal_func(fNX);
+      removal_func(fNY);
+      fNX = fusable_tensor_name;
+      fNY = fusable_tensor_name;
+      fInputTensorNames[0] =  fNX;
+      fOutputTensorNames[0] = fNY;
+   }
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_RELU
diff --git a/core/inc/SOFIE/ROperator_Reshape.hxx b/core/inc/SOFIE/ROperator_Reshape.hxx
new file mode 100644
index 0000000..4393b32
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Reshape.hxx
@@ -0,0 +1,449 @@
+#ifndef SOFIE_ROPERATOR_RESHAPE
+#define SOFIE_ROPERATOR_RESHAPE
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <cassert>
+#include <cctype>
+#include <sstream>
+#include <algorithm>
+
+
+namespace SOFIE{
+
+// Operator flavours handled by ROperator_Reshape; the stored mode selects the
+// branch taken in ROperator_Reshape::DoShapeInference.
+enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze };
+
+
+class ROperator_Reshape final : public ROperator
+{
+
+private:
+
+   bool fVerbose = false;
+   bool fDimInput = false;
+   bool fDynamicShape = false;
+   ReshapeOpMode fOpMode = Reshape;   // type of Reshape operator
+
+   int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape
+   int fAxis = 1;      // (for Flatten)
+
+   std::string fNData;        // input data tensor name
+   std::string fNInput2;       // reshape or axes tensor name depending on operator
+   std::string fNOutput;            // output tensor name
+   std::vector<Dim> fShapeInput;    // input shape data
+   std::vector<Dim> fShapeOutput;   // output shape data
+   std::vector<Dim> fOutputShapeData; // in case output is a shape tensor we store here the shape value data (can be parametric)
+   std::vector<int64_t> fAttrAxes;  // axes attributes (provided for all version of Squeeze/Unsqueeze)
+   std::vector<int64_t> fShape;     // shape tensor values provided for Reshape for int shapes4
+
+public:
+
+   // Returns the operator name matching fOpMode, or "" for a value outside the enum.
+   std::string Name() const {
+      switch (fOpMode) {
+      case Reshape:   return "Reshape";
+      case Flatten:   return "Flatten";
+      case Squeeze:   return "Squeeze";
+      case Unsqueeze: return "Unsqueeze";
+      }
+      return "";
+   }
+
+   ROperator_Reshape(){}
+   // General constructor. `attr_value` is stored as fAllowZero for Reshape and as
+   // fAxis for Flatten; it is ignored for the other modes. `nameInput2` may be
+   // empty, in which case the operator has a single input tensor.
+   ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput)
+      : fOpMode(opMode),
+        fNData(UTILITY::Clean_name(nameData)),
+        fNInput2(UTILITY::Clean_name(nameInput2)),
+        fNOutput(UTILITY::Clean_name(nameOutput))
+   {
+      switch (opMode) {
+      case Reshape: fAllowZero = attr_value; break;
+      case Flatten: fAxis = attr_value; break;
+      default: break;
+      }
+
+      fOutputTensorNames = { fNOutput };
+      if (fNInput2.empty())
+         fInputTensorNames = { fNData };
+      else
+         fInputTensorNames = { fNData, fNInput2 };
+   }
+
+   // Constructor for Squeeze/Unsqueeze when the axes are passed as attribute values
+   // instead of a second input tensor (the case of old ONNX versions, < 10).
+   // Only the data tensor is registered as input.
+   ROperator_Reshape(ReshapeOpMode opMode, std::vector<int64_t> attrAxes, std::string nameData, std::string nameOutput)
+      : fOpMode(opMode),
+        fNData(UTILITY::Clean_name(nameData)),
+        fNOutput(UTILITY::Clean_name(nameOutput)),
+        fAttrAxes(attrAxes)
+   {
+      assert(fOpMode == Squeeze || fOpMode == Unsqueeze);
+      fOutputTensorNames = { fNOutput };
+      fInputTensorNames = { fNData };
+   }
+
+
+   // output shape
+   // Computes the output shape for the current operator mode.
+   //  - Reshape  : `target_shape` is the requested shape. A 0 entry is kept when
+   //               fAllowZero is set and otherwise replaced by the input dimension at
+   //               the same index; a single -1 entry is inferred from the remaining
+   //               dimensions (as an integer, or as a parametric expression).
+   //  - Flatten  : two dimensions, the products of the input dims before / from fAxis.
+   //  - Squeeze  : removes the axes in fAttrAxes (or every dim equal to 1 if empty).
+   //  - Unsqueeze: inserts dims of size 1 at the output positions listed in fAttrAxes.
+   // `target_shape` is only read in Reshape mode.
+   // Throws std::runtime_error on inconsistent shapes or out-of-range axes.
+   std::vector<Dim> DoShapeInference(const std::vector<Dim> & input_shape, const std::vector<Dim> & target_shape)  {
+      if (fOpMode == Reshape) {
+         // correct the provided shape (here we have the value) for 0 or -1
+         // the target_shape can be a scalar in case of not present shape input tensor
+         std::vector<Dim> output_shape = target_shape;
+         bool hasMinusOne = false;
+         bool hasZero = false;
+         for (size_t i = 0; i < output_shape.size(); i++) {
+            if (output_shape[i].isParam)
+               continue;
+            if (output_shape[i].dim == 0) {
+               hasZero = true;
+               if (fAllowZero) {
+                  // allowzero: the zero is an explicit (empty) dimension
+                  output_shape[i] = Dim{0};
+               } else {
+                  // default: a zero means "copy the input dimension at this index"
+                  if (i > 0 && output_shape.size() != input_shape.size())
+                     std::cout << "WARNING: TMVA Reshape Op : output shape has zero value at index " << i <<
+                               " but input shape has a different rank than output shape" << std::endl;
+                  if (i >= input_shape.size())
+                     throw std::runtime_error("TMVA Reshape Op : output shape has zero value at index " + std::to_string(i) +
+                           " but input shape does not have corresponding index");
+                  output_shape[i] = input_shape[i];
+               }
+            } else if (output_shape[i].dim == static_cast<size_t>(-1)) {
+               hasMinusOne = true;
+            }
+         }
+         // An explicit zero dimension makes the -1 dimension undetermined. Without
+         // allowzero the zero has been replaced above, so the combination is valid.
+         if (fAllowZero && hasZero && hasMinusOne) {
+            throw std::runtime_error("TMVA Reshape Op : zero value in shape is not allowed when there is also a -1 in shape");
+         }
+         // now case of -1 in shape - we can infer the value of -1 from all other values
+         for (size_t i = 0; i < output_shape.size(); i++) {
+            if (output_shape[i].isParam || output_shape[i].dim != static_cast<size_t>(-1))
+               continue;
+
+            auto tmp = output_shape;
+            tmp.erase(tmp.begin() + i); // erase -1 value to compute the length of the other dimensions
+            auto tmp_length = ConvertDimShapeToLength(tmp);
+            auto input_length = ConvertDimShapeToLength(input_shape);
+            if (fVerbose)
+               std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length "
+                         << input_length << " to " << tmp_length << std::endl;
+
+            // length of the known output dimensions, when it is a plain integer
+            const bool restIsInt = IsInteger(tmp_length);
+            const size_t restLen = restIsInt ? static_cast<size_t>(std::stoull(tmp_length)) : 0;
+
+            if (restIsInt && IsInteger(input_length)) {
+               if (restLen == 0)
+                  throw std::runtime_error("TMVA Reshape Op : cannot infer the -1 dimension when the other dimensions have zero length");
+               output_shape[i] = Dim{static_cast<size_t>(std::stoull(input_length)) / restLen};
+            }
+            else if (restIsInt && restLen == 1) {
+               output_shape[i] = Dim{input_length, static_cast<size_t>(-1)};
+            }
+            else {
+               // input_length is an expression such as "N*12": try to cancel the integer
+               // tmp_length against exactly one of its '*'-separated factors
+               bool canSimplify = false;
+               std::vector <Dim> reduced_input;
+               if (restIsInt && restLen != 0) {
+                  std::stringstream ss(input_length);
+                  std::string token;
+                  while(getline(ss, token, '*'))
+                  {
+                     // remove any whitespace
+                     token.erase(std::remove_if(token.begin(), token.end(),
+                                                [](unsigned char x) { return std::isspace(x); }), token.end());
+                     if (!canSimplify && token == tmp_length) {
+                        // factor equal to tmp_length: it cancels completely
+                        canSimplify = true;
+                        continue;
+                     }
+                     if (IsInteger(token)) {
+                        size_t factor = static_cast<size_t>(std::stoull(token));
+                        if (!canSimplify && (factor % restLen) == 0) {
+                           factor /= restLen;
+                           canSimplify = true;
+                        }
+                        // factors that are not used for the cancellation are kept as they are
+                        reduced_input.push_back(Dim{factor});
+                     } else {
+                        reduced_input.push_back(Dim{token});
+                     }
+                  }
+               }
+               if (canSimplify) {
+                  // if length contains * we need to add some brackets
+                  std::string res_shape = ConvertDimShapeToLength(reduced_input);
+                  if (res_shape.find('*') != std::string::npos)
+                     output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast<size_t>(-1)};
+                  else
+                     output_shape[i] = Dim{res_shape};
+               } else {
+                  output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast<size_t>(-1)};
+               }
+            }
+
+            break; // cannot have more than -1
+         }
+
+         if (fVerbose)
+            std::cout << "Reshape: correct output shape  to " << ConvertDimShapeToString(output_shape) << std::endl;
+
+         if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) {
+            throw std::runtime_error("TMVA Reshape Op : Invalid  shapes : " + ConvertDimShapeToString(input_shape) +
+                                     ConvertDimShapeToString(output_shape));
+         }
+         return output_shape;
+
+      } else if (fOpMode == Flatten) {
+         // flatten case
+         if (fAxis < 0)
+            fAxis += input_shape.size();
+         // the iterators below are only valid for 0 <= fAxis <= rank
+         if (fAxis < 0 || static_cast<size_t>(fAxis) > input_shape.size())
+            throw std::runtime_error("TMVA Flatten Op : axis value is out of range for input shape " +
+                                     ConvertDimShapeToString(input_shape));
+         auto s1 = std::vector<Dim>(input_shape.begin(), input_shape.begin() + fAxis);
+         auto s2 = std::vector<Dim>(input_shape.begin() + fAxis, input_shape.end());
+         auto l1 = ConvertDimShapeToLength(s1);
+         auto l2 = ConvertDimShapeToLength(s2);
+         std::vector<Dim> newShape = {Dim{l1}, Dim{l2}};
+         return newShape;
+      } else if (fOpMode == Squeeze) {
+         // squeeze
+         // assume no axis is provided - remove all axes with value equal to 1
+         auto output_shape = input_shape;
+         if (fAttrAxes.empty()) {
+            size_t i = 0;
+            while (i < output_shape.size()) {
+               if (output_shape[i] == Dim{1}) {
+                  output_shape.erase(output_shape.begin() + i);
+               } else {
+                  i++;
+               }
+            }
+         } else {
+            auto axes = fAttrAxes;
+            const int64_t rank = static_cast<int64_t>(input_shape.size());
+            for (size_t i = 0; i < axes.size(); i++) {
+               if (axes[i] < 0)
+                  axes[i] += rank;
+               // reject axes outside the shape before indexing with them
+               if (axes[i] < 0 || axes[i] >= rank || !(output_shape[axes[i]] == Dim{1}))
+                  throw std::runtime_error("TMVA Squeeze Op : Invalid  axis value " + std::to_string(axes[i]) +
+                                           " for " + ConvertDimShapeToString(output_shape));
+            }
+            // erase from the highest axis down so that the remaining indices stay valid
+            std::sort(axes.begin(), axes.end(), std::greater<int64_t>());
+            for (auto & axis : axes) {
+               output_shape.erase(output_shape.begin() + axis);
+            }
+         }
+         return output_shape;
+      }
+      else if (fOpMode == Unsqueeze) {
+         // unsqueeze
+         assert(!fAttrAxes.empty());
+         auto output_shape = input_shape;
+         // output rank
+         const int64_t r = static_cast<int64_t>(input_shape.size() + fAttrAxes.size());
+         // axes are positions in the output shape: map negative values to [0, r) and
+         // insert in increasing order, so the result does not depend on the given order
+         std::vector<int64_t> axes = fAttrAxes;
+         for (auto &a : axes) {
+            if (a < -r || a > r - 1)
+               throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range");
+            if (a < 0)
+               a += r;
+         }
+         std::sort(axes.begin(), axes.end());
+         for (auto a : axes) {
+            // can only fail for repeated axes; keeps the insert position valid
+            if (static_cast<size_t>(a) > output_shape.size())
+               throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range");
+            output_shape.insert(output_shape.begin() + a, Dim{1});
+         }
+         return output_shape;
+      }
+      throw std::runtime_error("TMVA Reshape Op : Invalid ReshapeOpMode");
+   }
+
+   // Resolves the output shape and registers the output tensor in the model.
+   // The target shape / axes come from, in this order: the optional second input
+   // (initialized tensor, shape tensor, or a tensor only known at run time), the
+   // axes attribute, or nothing at all (Flatten and Squeeze).
+   // The output is registered as a constant tensor when the data input is an
+   // initialized INT64 tensor, as a shape tensor when the data input is a shape
+   // tensor and the output rank is at most 1, and as an intermediate tensor otherwise.
+   // Throws std::runtime_error when an input tensor is missing or the inputs are invalid.
+   void Initialize(RModel& model) override {
+
+      fVerbose = model.Verbose();
+      if (fVerbose)
+         std::cout << "initialize reshape op type " << fOpMode << " -  for input " << fNData
+                   << " to shape given by " << fNInput2 << std::endl;
+
+      if (model.CheckIfTensorAlreadyExist(fNData) == false) {
+          // input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + "  is not found in model");
+      }
+      fShapeInput = model.GetDimTensorShape(fNData);
+      fDimInput = model.IsDynamicTensor(fNData);
+      // check if optional tensor exists defining shape or axes
+      if (!fNInput2.empty()) {
+         if (model.CheckIfTensorAlreadyExist(fNInput2)) {
+            if (model.IsInitializedTensor(fNInput2)) {
+               // assume input shape is an initialized tensor
+               // NOTE(review): the data is read as int64_t without checking the tensor type
+               auto dptr = model.GetInitializedTensorData(fNInput2);
+               auto values = static_cast<int64_t *>(dptr.get());
+               auto vec = model.GetTensorShape(fNInput2);
+               size_t n = 1;
+               if (vec.size() > 0)
+                  n = vec[0]; // size of shape input tensor
+               // copy values in fShape vector or fAttrAxes
+               if (fOpMode == Reshape)
+                  fShape = std::vector<int64_t>(values, values + n);
+               else
+                  fAttrAxes = std::vector<int64_t>(values, values + n);
+
+               std::vector<Dim> targetShape(fShape.begin(),fShape.end());
+               fShapeOutput = DoShapeInference(fShapeInput, targetShape);
+               // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed
+               model.SetNotWritableInitializedTensor(fNInput2);
+            } else if (model.IsShapeTensor(fNInput2)) {
+               auto shapeData = model.GetShapeTensorValues(fNInput2);
+               fShapeOutput = DoShapeInference(fShapeInput, shapeData);
+               if (model.Verbose())
+                  std::cout << "Reshape op - get output shape from shape tensor " << fNInput2 << " with value " << ConvertDimShapeToString(shapeData) << std::endl;
+            } else {
+               // we cannot get shape at initialization time but at run-time
+               fDynamicShape = true;
+               // size of shape output is given by size of shape input tensor
+               if (model.IsDynamicTensor(fNInput2)) {
+                  throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " cannot have dynamic shape");
+               }
+               auto shapeInput2 = model.GetTensorShape(fNInput2);
+               // a rank-0 tensor carries a single value, as in the initialized-tensor
+               // branch above; this also avoids indexing an empty shape vector
+               const size_t nDims = shapeInput2.empty() ? 1 : shapeInput2[0];
+               fShapeOutput.resize(nDims);
+               // every output dimension becomes a run-time parameter s_<output>_<i>
+               for (size_t i = 0; i < fShapeOutput.size(); i++) {
+                  fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)};
+               }
+            }
+         } else {
+            throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " is not found in model");
+         }
+      } else if (!fAttrAxes.empty()) {
+         // case fNShape is empty and axes are provided as attributes (e.g. for Unsqueeze)
+         fShapeOutput = DoShapeInference(fShapeInput, std::vector<Dim>{});
+      } else if (fOpMode == Flatten || fOpMode == Squeeze) {
+         fShapeOutput = DoShapeInference(fShapeInput, std::vector<Dim>{});
+      } else {
+         throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data");
+      }
+      // check if output is constant or not
+      if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) {
+         fIsOutputConstant = true;
+         auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNData).get());
+         auto o_shape = ConvertShapeToInt(fShapeOutput);
+         if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) )
+            throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths");
+         model.AddConstantTensor<int64_t>(fNOutput, o_shape, inputData);
+         if (model.Verbose()) {
+            std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " -->  " << fNOutput << " (constant) " << ConvertDimShapeToString(fShapeOutput)  << " : " <<
+            ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl;
+         }
+      }
+      // for input shape tensors we can have it if output shape is size==1 or a scalar
+      else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) {
+         // not sure if we ever end-up here - maybe reshaping from scalar to vector or viceversa
+         fIsOutputParamShape = true;
+         fOutputShapeData = model.GetShapeTensorValues(fNData);
+         model.AddShapeTensor(fNOutput, fOutputShapeData);
+         if (model.Verbose()) {
+            std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " -->  " << fNOutput << " (shape) " << ConvertDimShapeToString(fShapeOutput)  << " : " <<
+            ConvertDimShapeToString(fOutputShapeData) << std::endl;
+         }
+      }
+      else {
+         // non-constant case
+         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput);
+         if (model.Verbose())
+            std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " -->  "<< fNOutput << "  " << ConvertDimShapeToString(fShapeOutput)  << std::endl;
+      }
+   }
+
+   // Emits the CPU code of the operator.
+   //  - constant output      : only the header comment (nothing to compute)
+   //  - shape-tensor output  : one assignment per stored shape value
+   //  - otherwise            : optional run-time definition of the dynamic output
+   //                           dimensions, a run-time length check when the two length
+   //                           expressions differ, and a std::copy of the input data
+   std::string Generate(std::string opName) override {
+
+      std::stringstream out;
+      // Name() yields the same text as the mode for the four operator flavours
+      out << SP << "///--------" << Name() << " operator " << opName << " --> " << ConvertDimShapeToString(fShapeOutput) << "\n";
+
+      if (fIsOutputConstant) return out.str();  //no op for constant tensors
+
+      if (fIsOutputParamShape) {
+         out << "//----------------output is a shape tensor----------\n";
+         // iterate over the stored values themselves: fShapeOutput may be empty
+         // (scalar output), so fShapeOutput[0] is not a safe loop bound
+         for (size_t i = 0; i < fOutputShapeData.size(); i++) {
+            out << SP << "tensor_" << fNOutput << "[" << i << " ] = " << fOutputShapeData[i].GetVal() << ";\n";
+         }
+         return out.str();
+      }
+
+      // in case of dynamic output shape we need to set the shape value from input shape tensor
+      // and take care of the zero values
+      if (fDynamicShape) {
+         for (size_t i = 0; i < fShapeOutput.size(); i++) {
+            // since fNInput2 values are int64_t, should we check if they are negative?
+            out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n";
+            // the input dimension can only be substituted when the input has index i
+            if (!fAllowZero && i < fShapeInput.size())
+               out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) "
+                         <<  fShapeOutput[i].param << " = " <<  fShapeInput[i] << ";\n";
+         }
+      }
+
+      // output of reshape is same as input
+      auto lengthOut = ConvertDimShapeToLength(fShapeOutput);
+      auto lengthIn = ConvertDimShapeToLength(fShapeInput);
+      if (lengthOut != lengthIn) {
+         // the expressions differ textually: the check needs to be done at run-time
+         out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n";
+         out << SP << SP << "throw std::runtime_error(\"SOFIE Reshape " << opName << " output length "
+             << lengthOut << " is different than input one " << lengthIn << "\");\n";
+      }
+
+
+      out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput
+          << ");\n";
+      return out.str();
+   }
+
+// Emits the GPU (alpaka) code of the operator.
+//  - constant output     : nothing
+//  - shape-tensor output : host-side assignments of the stored shape values
+//  - otherwise           : a view named deviceBuf_<output> that aliases the device
+//                          pointer of the input buffer, so no data is copied
+std::string Generate_GPU_ALPAKA(std::string opName) override {
+    if (fIsOutputConstant) return "";
+
+    opName = "op_" + opName;
+
+    if (fIsOutputParamShape) {
+        // shape tensor output: fill host-side tensor values, no device copy needed.
+        // Iterate over the stored values themselves: fShapeOutput may be empty
+        // (scalar output), so fShapeOutput[0] is not a safe loop bound.
+        std::stringstream out;
+        for (size_t i = 0; i < fOutputShapeData.size(); i++) {
+            out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i].GetVal() << ";\n";
+        }
+        return out.str();
+    }
+
+    // Name() yields the same text as the mode for the four operator flavours
+    const std::string opType = Name();
+
+    std::stringstream out;
+    out << SP << "///------- " << opType << " operator " << opName << "\n";
+
+    if (fDynamicShape) {
+        // NOTE(review): the length expressions below use the s_<output>_<i> symbols;
+        // Generate() declares them for the CPU path but this method does not emit
+        // them - confirm they are defined elsewhere in the generated GPU code.
+        auto lengthOut = ConvertDimShapeToLength(fShapeOutput);
+        auto lengthIn  = ConvertDimShapeToLength(fShapeInput);
+        if (lengthOut != lengthIn) {
+            out << SP << "if (" << lengthOut << " != " << lengthIn << ")\n";
+            out << SP << SP << "throw std::runtime_error(\"SOFIE " << opType
+                << " Op : output length is different from input length\");\n";
+        }
+    }
+
+    // Reshape / Flatten / Squeeze / Unsqueeze only reinterpret the same memory.
+    // Instead of a device copy the generated code creates a view over the pointer
+    // of the source buffer, so later getPtrNative() calls on the output name return
+    // the device pointer of the input.
+    auto outputLength = ConvertDimShapeToLength(fShapeOutput);
+    out << SP << "auto deviceBuf_" << fNOutput
+        << " = alpaka::createView(devAcc, alpaka::getPtrNative(deviceBuf_" << fNData
+        << "), static_cast<Idx>(" << outputLength << "));\n";
+
+    return out.str();
+}
+
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROPERATOR_RESHAPE
diff --git a/core/inc/SOFIE/ROperator_ScatterElements.hxx b/core/inc/SOFIE/ROperator_ScatterElements.hxx
new file mode 100644
index 0000000..3cedaa7
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_ScatterElements.hxx
@@ -0,0 +1,469 @@
+#ifndef SOFIE_ROperator_ScatterElements
+#define SOFIE_ROperator_ScatterElements
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+
+/// Code generator for the ONNX ScatterElements operator.
+///
+/// The output Y starts as a copy of the data input X; every element of the
+/// update tensor U is then written (or reduced with add / mul / max / min)
+/// into Y at the position obtained by replacing the coordinate along fAxis
+/// with the value read from the index tensor I.
+///
+/// Three generators are provided: a plain CPU loop nest (Generate), a GPU
+/// kernel based on atomics, and - for "add" with a constant index tensor in a
+/// supported layout - an atomic-free "segmented add" GPU kernel that relies on
+/// index buffers pre-sorted once at session construction.
+class ROperator_ScatterElements final : public ROperator{
+private:
+
+   // Scatter axis. Initialize() maps a negative value to its positive
+   // equivalent and rejects anything outside [0, rank).
+   int64_t fAxis = 0;
+
+   std::string fNX;         // data input tensor
+   std::string fNI;         // index tensor
+   std::string fNU;         // update tensor
+   std::string fNY;         // output tensor
+   std::string fReduction;  // "", "none", "add", "mul", "max" or "min"
+
+   // True only when the atomic-free segmented-add GPU kernel is used.  This
+   // requires reduction "add", an index tensor that is a constant/initialized
+   // tensor (so it can be pre-sorted at model load time) and the layout
+   // checked by SegmentedLayoutSupported().  Every other case uses the
+   // atomic kernel.
+   bool fUseSegmentedReduction = false;
+
+   std::vector<Dim> fShapeX;
+   std::vector<Dim> fShapeI;
+   std::vector<Dim> fShapeY;
+
+   // Build the C++ expression combining the current output value t1 with the
+   // update value t2.  Possibilities are: none (default), add, mul, max, min.
+   // Throws std::runtime_error for any other reduction attribute.
+   std::string ReductionFunction(const std::string & t1, const std::string & t2 ) {
+      const std::string & name = fReduction;
+      if (name.empty() || name == "none")
+         return t2;
+      if (name == "add")
+         return t1 + " + " + t2;
+      if (name == "mul")
+         return t1 + " * " + t2;
+      if (name == "max")
+         return "std::max(" + t1 + "," + t2 + ")";
+      if (name == "min")
+         return "std::min(" + t1 + "," + t2 + ")";
+      throw std::runtime_error("SOFIE ScatterElements : invalid reduction attribute");
+   }
+
+   // The segmented-add kernel addresses Y as [row][feature], sorts whole rows
+   // of I and embeds the number of output rows as a literal.  It is therefore
+   // only valid for rank-2 tensors scattered along axis 0, with static extents
+   // and the same (non-zero) feature extent in I and Y.
+   bool SegmentedLayoutSupported() const {
+      if (fAxis != 0 || fShapeI.size() != 2 || fShapeY.size() != 2)
+         return false;
+      for (const auto & d : fShapeI)
+         if (d.isParam) return false;
+      for (const auto & d : fShapeY)
+         if (d.isParam) return false;
+      // a zero feature extent would make the generated init code divide by zero
+      return fShapeI[1].dim != 0 && fShapeI[1].dim == fShapeY[1].dim;
+   }
+
+public:
+   ROperator_ScatterElements(){}
+   ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY,
+                           int axis, std::string reduction):
+      fAxis(axis),
+      fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)),
+      fNY(UTILITY::Clean_name(nameY)),
+      fReduction(reduction)
+      {
+         fInputTensorNames = { fNX, fNI, fNU };
+         fOutputTensorNames = { fNY };
+      }
+
+   // type of output given input
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // shape of output tensors given input tensors: one output with the shape of the first input
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = std::vector<std::vector<size_t>>(1, input[0]); // return vector size 1 with first input
+      return ret;
+   }
+
+   // Validate the inputs, normalise the axis, register the output tensor and
+   // decide whether the segmented-add GPU path can be used.
+   // Throws std::runtime_error on a missing input, a rank mismatch or an
+   // out-of-range axis.
+   void Initialize(RModel& model) override {
+      // input must be a graph input, or already initialized intermediate tensor
+      auto requireTensor = [&model](const std::string & name) {
+         if (!model.CheckIfTensorAlreadyExist(name))
+            throw std::runtime_error(std::string("SOFIE ScatterElements Op Input Tensor ") + name + " is not found in model");
+      };
+      requireTensor(fNX);
+      requireTensor(fNI);
+      requireTensor(fNU);
+      //tbd check for constant tensors
+
+      fShapeX = model.GetDimTensorShape(fNX);
+      fShapeI = model.GetDimTensorShape(fNI);
+      auto shapeU = model.GetDimTensorShape(fNU);
+      if (shapeU.size() != fShapeI.size())
+         throw std::runtime_error(std::string("SOFIE ScatterElements - update tensor has invalid rank")) ;
+      if (fShapeX.size() == 0)
+         throw std::runtime_error(std::string("SOFIE ScatterElements - input tensor has zero rank  ")) ;
+      if (fShapeX.size() != fShapeI.size())
+         throw std::runtime_error(std::string("SOFIE ScatterElements - index tensor has invalid rank  ")) ;
+
+      // fAxis later indexes fShapeY and the stride vectors: it must be in range
+      const int64_t rank = static_cast<int64_t>(fShapeX.size());
+      if (fAxis < 0) fAxis += rank;
+      if (fAxis < 0 || fAxis >= rank)
+         throw std::runtime_error("SOFIE ScatterElements - axis attribute is out of range");
+
+      // assume output shape is identical to input shape
+      fShapeY = fShapeX;
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
+
+      // For "add" reduction, only use the atomic-free segmented path when the
+      // index tensor is a static (constant/initialized) tensor and the tensor
+      // layout is one the segmented kernel can address.  Otherwise the
+      // atomic kernel is used.
+      fUseSegmentedReduction =
+         fReduction == "add" && model.IsInitializedTensor(fNI) && SegmentedLayoutSupported();
+      if (fUseSegmentedReduction) {
+         // Convert Dim-based shape to size_t shape for registration.
+         std::vector<size_t> shapeI_static;
+         for (const auto& d : fShapeI)
+            shapeI_static.push_back(d.dim ? d.dim : 1);
+         // Several ScatterElements ops may share one index tensor: register
+         // the helper tensors only once.
+         if (!model.CheckIfTensorAlreadyExist(fNI + "_sortedI"))
+            model.AddIntermediateTensor(fNI + "_sortedI",  ETensorType::INT32, shapeI_static);
+         if (!model.CheckIfTensorAlreadyExist(fNI + "_sortPerm"))
+            model.AddIntermediateTensor(fNI + "_sortPerm", ETensorType::INT32, shapeI_static);
+         // std::iota and std::stable_sort used in the generated init code.
+         model.AddNeededStdLib("numeric");
+         model.AddNeededStdLib("algorithm");
+      }
+   }
+
+   // The CPU implementation needs no initialisation code.
+   std::string GenerateInitCode() override {
+      std::stringstream out;
+      return out.str();
+   }
+
+   // -----------------------------------------------------------------------
+   // GenerateInitCode_GPU_ALPAKA - returns code meant to run once at session
+   // set-up; empty unless the segmented-add path is active.
+   //
+   // For "add" scatter, we fill two device buffers:
+   //
+   //   deviceBuf_<I>_sortedI   : int32_t[|I|] - the index values (negative
+   //                             values wrapped), rows ordered by the index
+   //                             found in feature column 0.
+   //   deviceBuf_<I>_sortPerm  : int32_t[|I|] - argsort of I (maps sorted
+   //                             position back to the original update position).
+   //
+   // Both are computed on the host and uploaded.  The generated code throws
+   // std::runtime_error when the indices of one row differ between features,
+   // because the kernel only inspects feature column 0 of a row.
+   //
+   // NOTE(review): if two operators share the index tensor but have outputs
+   // with a different number of rows, negative indices are wrapped with the
+   // extent of whichever operator uploads last - confirm this cannot occur.
+   // -----------------------------------------------------------------------
+   std::string GenerateInitCode_GPU_ALPAKA() override {
+      if (!fUseSegmentedReduction) return "";   // only static-index models use segmented path
+
+      const std::string totalElements = ConvertDimShapeToLength(fShapeI);
+      // Feature dimension = last dim of I (the non-axis stride).
+      const std::string numFeatures = fShapeI.back().GetVal();
+      // Extent of Y along the scatter axis, used to wrap negative indices.
+      const std::string numOutputRows = fShapeY[fAxis].GetVal();
+
+      // names of the generated locals, all suffixed with the index tensor name
+      const std::string nElem      = "_nElem_" + fNI;
+      const std::string nFeat      = "_nFeat_" + fNI;
+      const std::string nRows      = "_nRows_" + fNI;
+      const std::string hI         = "_hI_" + fNI;
+      const std::string axisIdx    = "_axisIdx_" + fNI;
+      const std::string sortedI    = "_hostSortedI_" + fNI;
+      const std::string sortPerm   = "_hostSortPerm_" + fNI;
+      const std::string rowOrder   = "_rowOrder_" + fNI;
+      const std::string bufSortedI = "_hBufSortedI_" + fNI;
+      const std::string bufPerm    = "_hBufSortPerm_" + fNI;
+
+      std::stringstream out;
+      out << "\n// --- ScatterElements sorted-index init for segmented-add ---\n";
+      out << "{\n";
+      out << SP << "// Build host-side argsort of the index tensor " << fNI << "\n";
+      out << SP << "// along scatter axis " << fAxis << " so inference can use\n";
+      out << SP << "// the atomic-free segmented-add kernel.\n";
+      out << SP << "const std::size_t " << nElem << " = " << totalElements << ";\n";
+      out << SP << "const std::size_t " << nFeat << " = " << numFeatures << ";\n";
+      out << SP << "const std::size_t " << nRows << " = " << nElem << " / " << nFeat << ";\n";
+
+      // Retrieve the host pointer for the index tensor.
+      out << SP << "auto* " << hI << " = tensor_" << fNI << ";\n";
+
+      // Negative indices count from the end of the axis, as in the other code paths.
+      out << SP << "// axis index stored at a flat position, negative values wrapped\n";
+      out << SP << "auto " << axisIdx << " = [&](std::size_t _pos) {\n";
+      out << SP << SP << "int64_t _v = static_cast<int64_t>(" << hI << "[_pos]);\n";
+      out << SP << SP << "return _v < 0 ? _v + static_cast<int64_t>(" << numOutputRows << ") : _v;\n";
+      out << SP << "};\n";
+
+      // Build a sorted permutation (argsort of row-axis indices).
+      out << SP << "std::vector<int32_t> " << sortedI << "(" << nElem << ");\n";
+      out << SP << "std::vector<int32_t> " << sortPerm << "(" << nElem << ");\n";
+      out << SP << "// argsort rows by axis index value\n";
+      out << SP << "std::vector<std::size_t> " << rowOrder << "(" << nRows << ");\n";
+      out << SP << "std::iota(" << rowOrder << ".begin(), " << rowOrder << ".end(), 0);\n";
+      out << SP << "std::stable_sort(" << rowOrder << ".begin(), " << rowOrder << ".end(),\n";
+      out << SP << SP << "[&](std::size_t a, std::size_t b){\n";
+      out << SP << SP << SP << "return " << axisIdx << "(a * " << nFeat << ") < "
+          << axisIdx << "(b * " << nFeat << ");\n";
+      out << SP << SP << "});\n";
+      out << SP << "for (std::size_t _r = 0; _r < " << nRows << "; ++_r) {\n";
+      out << SP << SP << "std::size_t _src = " << rowOrder << "[_r];\n";
+      out << SP << SP << "for (std::size_t _f = 0; _f < " << nFeat << "; ++_f) {\n";
+      // the kernel finds a row's segment from feature column 0 only
+      out << SP << SP << SP << "if (" << axisIdx << "(_src * " << nFeat << " + _f) != "
+          << axisIdx << "(_src * " << nFeat << "))\n";
+      out << SP << SP << SP << SP << "throw std::runtime_error(\"SOFIE ScatterElements : "
+          << "segmented add requires the same index for all features of a row\");\n";
+      out << SP << SP << SP << sortedI << "[_r * " << nFeat << " + _f] = "
+          << "static_cast<int32_t>(" << axisIdx << "(_src * " << nFeat << " + _f));\n";
+      out << SP << SP << SP << sortPerm << "[_r * " << nFeat << " + _f] = "
+          << "static_cast<int32_t>(_src * " << nFeat << " + _f);\n";
+      out << SP << SP << "}\n";
+      out << SP << "}\n";
+
+      // Allocate host staging buffers and upload.
+      out << SP << "auto " << bufSortedI
+          << " = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{" << nElem << "}));\n";
+      out << SP << "auto " << bufPerm
+          << " = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{" << nElem << "}));\n";
+      out << SP << "std::copy(" << sortedI << ".begin(), " << sortedI << ".end(), "
+          << "alpaka::getPtrNative(" << bufSortedI << "));\n";
+      out << SP << "std::copy(" << sortPerm << ".begin(), " << sortPerm << ".end(), "
+          << "alpaka::getPtrNative(" << bufPerm << "));\n";
+      out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNI << "_sortedI, " << bufSortedI << ");\n";
+      out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNI << "_sortPerm, " << bufPerm << ");\n";
+      // The staging buffers are local to this block: make sure both copies
+      // have completed before they are released.
+      out << SP << "alpaka::wait(queue);\n";
+      out << "}\n";
+      return out.str();
+   }
+
+   // Generate the CPU implementation: copy X into Y, then loop over every
+   // element of the index tensor and apply the (optionally reduced) update.
+   // Throws std::runtime_error when called before Initialize().
+   std::string Generate(std::string opName) override {
+
+      if (fIsOutputConstant) return "";
+
+      if (fShapeY.empty()) {
+         throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      out << SP << "\n//-------- ScatterElements  --- " << opName << "\n";
+
+      auto strideY = UTILITY::ComputeStrideFromShape(fShapeY);
+      auto strideI = UTILITY::ComputeStrideFromShape(fShapeI);
+
+      std::string length = ConvertDimShapeToLength(fShapeY);
+
+      // function to write compute expression for global index from Dim-based strides
+      auto tensorIndex = [](const std::vector<Dim> & stride, const std::vector<std::string> & idx) {
+         std::stringstream strst;
+         int dims = idx.size();
+         assert (dims == (int) stride.size());
+         for (int i = 0; i < dims; i++) {
+            std::string sv = stride[i].GetVal();
+            if (sv != "1")
+               strst << sv << "*" << idx[i];
+            else
+               strst << idx[i];
+            if (i < dims-1)
+               strst << " + ";
+         }
+         return strst.str();
+      };
+
+
+      // copy first input in output (maybe can be avoided??)
+      out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n";
+
+      // loop on tensor rank: one nested for-loop per dimension of the index tensor
+      int dims = fShapeY.size();
+      std::vector<std::string> idx(dims);
+      for (int i = 0; i < dims; i++) {
+         idx[i] = std::string("i") + std::to_string(i);
+         for (int j = 0; j <= i; j++) out << SP;
+         out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i].GetVal() << "; " << idx[i] << "++) {\n";
+      }
+      // correct index for specific axis
+      for (int j = 0; j <= dims; j++) out << SP;
+      out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n";
+      for (int j = 0; j <= dims; j++) out << SP;
+      out << "int iAxis = tensor_" << fNI << "[updateIndex];\n";
+      for (int j = 0; j <= dims; j++) out << SP;
+      out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis].GetVal() << ";\n";
+      // from here on the coordinate along the scatter axis comes from the index tensor
+      idx[fAxis] = "iAxis";
+      for (int j = 0; j <= dims; j++) out << SP;
+      out << "int  outIndex = " << tensorIndex(strideY, idx) << ";\n";
+      for (int j = 0; j <= dims; j++) out << SP;
+      out << "tensor_" << fNY << "[outIndex] = "
+         << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n";
+
+      // close the loop nest
+      for (int i = dims; i > 0; i--) {
+         for (int j = 0; j < i; j++) out << SP;
+         out << "}\n";
+      }
+      return out.str();
+   }
+
+   // -----------------------------------------------------------------------
+   // Generate_GPU_Kernel_ALPAKA
+   //
+   // When fUseSegmentedReduction is set we emit a *segmented* kernel: the
+   // index tensor was sorted by row at init time, so all updates targeting
+   // the same output row are contiguous.  Each loop iteration handles one
+   // (output row, feature) pair: it binary-searches the first sorted row
+   // with that index, sums the matching updates serially and writes the
+   // result with a single non-atomic store.
+   //
+   // The sorted buffers (deviceBuf_<fNI>_sortedI / _sortPerm, both int32)
+   // are filled by the code from GenerateInitCode_GPU_ALPAKA.
+   //
+   // All other cases use a kernel with one update element per loop
+   // iteration; the reductions are applied with atomic operations.
+   // Throws std::runtime_error when called before Initialize() or for an
+   // unknown reduction attribute.
+   // -----------------------------------------------------------------------
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeY.empty()) {
+         throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first");
+      }
+
+      // ---- segmented-add path (only when index tensor is static/constant) ----
+      if (fUseSegmentedReduction) {
+         // Number of output rows along the scatter axis.
+         std::string numOutputRows = fShapeY[fAxis].GetVal();
+
+         std::string op;
+         op  = "\n//------ SCATTERELEMENTS_SEGMENTED_ADD_KERNEL_ALPAKA\n";
+         op += "// One thread per output-row × feature column.\n";
+         op += "// Reads updates in sorted order — no atomics needed.\n";
+         op += SP + "struct ScatterElementsKernel_" + opName + " {\n";
+         op += SP + SP + "template<typename TAcc, typename T>\n";
+         op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+         op += SP + SP + SP + "TAcc const& acc,\n";
+         op += SP + SP + SP + "T* Y,\n";
+         // int32: must match the element type of deviceBuf_<I>_sortedI
+         op += SP + SP + SP + "int32_t const* I_sorted,\n";   // axis index, sorted
+         op += SP + SP + SP + "T const* U,\n";
+         op += SP + SP + SP + "int32_t const* sortPerm,\n";   // argsort of I
+         op += SP + SP + SP + "std::size_t const totalUpdates,\n";
+         op += SP + SP + SP + "std::size_t const numFeatures) const {\n\n";
+
+         op += SP + SP + SP + "// Each thread processes one (output_row, feature) pair.\n";
+         op += SP + SP + SP + "auto const tid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op += SP + SP + SP + "auto const stride = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+         op += SP + SP + SP + "// Total work = numOutputRows * numFeatures (= size of Y)\n";
+         op += SP + SP + SP + "std::size_t const totalWork = " + numOutputRows + " * numFeatures;\n";
+         op += SP + SP + SP + "for (std::size_t t = tid; t < totalWork; t += stride) {\n";
+         op += SP + SP + SP + SP + "std::size_t const out_row = t / numFeatures;\n";
+         op += SP + SP + SP + SP + "std::size_t const feat    = t % numFeatures;\n";
+         op += SP + SP + SP + SP + "// Binary-search for the first sorted index == out_row.\n";
+         op += SP + SP + SP + SP + "std::size_t lo = 0, hi = totalUpdates;\n";
+         op += SP + SP + SP + SP + "while (lo < hi) {\n";
+         op += SP + SP + SP + SP + SP + "std::size_t mid = (lo + hi) / 2;\n";
+         op += SP + SP + SP + SP + SP + "if (static_cast<std::size_t>(I_sorted[mid * numFeatures]) < out_row) lo = mid + 1;\n";
+         op += SP + SP + SP + SP + SP + "else hi = mid;\n";
+         op += SP + SP + SP + SP + "}\n";
+         op += SP + SP + SP + SP + "T acc_val = Y[out_row * numFeatures + feat];\n";
+         op += SP + SP + SP + SP + "for (std::size_t k = lo; k < totalUpdates; ++k) {\n";
+         op += SP + SP + SP + SP + SP + "if (static_cast<std::size_t>(I_sorted[k * numFeatures]) != out_row) break;\n";
+         op += SP + SP + SP + SP + SP + "std::size_t const perm_k = static_cast<std::size_t>(sortPerm[k * numFeatures + feat]);\n";
+         op += SP + SP + SP + SP + SP + "acc_val += U[perm_k];\n";
+         op += SP + SP + SP + SP + "}\n";
+         op += SP + SP + SP + SP + "Y[out_row * numFeatures + feat] = acc_val;\n";
+         op += SP + SP + SP + "}\n";
+         op += SP + SP + "}\n";
+         op += SP + "};\n";
+         return op;
+      }
+
+      // ---- atomic kernel (every case not handled by the segmented path) ----
+      const std::size_t D = fShapeI.size();
+
+      auto strideY = UTILITY::ComputeStrideFromShape(fShapeY);
+      auto strideI = UTILITY::ComputeStrideFromShape(fShapeI);
+
+      std::string op;
+      op  = "\n//------ SCATTERELEMENTS_KERNEL_ALPAKA\n";
+      op += SP + "struct ScatterElementsKernel_" + opName + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T* Y,\n";
+      op += SP + SP + SP + "int64_t const* I,\n";
+      op += SP + SP + SP + "T const* U,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // decompose the flat index of I into one coordinate per dimension
+      op += SP + SP + SP + SP + "std::size_t remaining = elem_idx;\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + "std::size_t const idx_" + std::to_string(d)
+               + " = remaining / " + strideI[d].GetVal() + ";\n";
+         op += SP + SP + SP + SP + "remaining -= idx_" + std::to_string(d)
+               + " * " + strideI[d].GetVal() + ";\n";
+      }
+      op += "\n";
+
+      op += SP + SP + SP + SP + "int64_t iAxis = I[elem_idx];\n";
+      op += SP + SP + SP + SP + "if (iAxis < 0) iAxis += " + fShapeY[fAxis].GetVal() + ";\n\n";
+
+      // flat index into Y: the scatter-axis coordinate is taken from I
+      op += SP + SP + SP + SP + "std::size_t const out_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         std::string coord = (d == (std::size_t)fAxis)
+               ? "static_cast<std::size_t>(iAxis)"
+               : "idx_" + std::to_string(d);
+         op += SP + SP + SP + SP + SP + coord + " * " + strideY[d].GetVal();
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+
+      if (fReduction.empty() || fReduction == "none") {
+         op += SP + SP + SP + SP + "Y[out_idx] = U[elem_idx];\n";
+      } else if (fReduction == "add") {
+         // used whenever "add" cannot take the segmented path
+         op += SP + SP + SP + SP + "alpaka::atomicAdd(acc, &Y[out_idx], U[elem_idx]);\n";
+      } else if (fReduction == "mul") {
+         // NOTE(review): alpaka's documented atomic operations do not appear
+         // to include atomicMul - confirm this compiles for the targeted
+         // back-ends or replace it with an atomicCas loop.
+         op += SP + SP + SP + SP + "alpaka::atomicMul(acc, &Y[out_idx], U[elem_idx]);\n";
+      } else if (fReduction == "max") {
+         op += SP + SP + SP + SP + "alpaka::atomicMax(acc, &Y[out_idx], U[elem_idx]);\n";
+      } else if (fReduction == "min") {
+         op += SP + SP + SP + SP + "alpaka::atomicMin(acc, &Y[out_idx], U[elem_idx]);\n";
+      } else {
+         // same diagnostic as the CPU path instead of a kernel that does nothing
+         throw std::runtime_error("SOFIE ScatterElements : invalid reduction attribute");
+      }
+
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   // Declare the kernel functor instance used by Generate_GPU_ALPAKA.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      return SP + "ScatterElementsKernel_" + opName + " scatterElementsKernel_" + opName + ";\n";
+   }
+
+   // Generate the inference-time GPU code: copy X into Y, then enqueue the
+   // kernel chosen in Initialize().
+   // Throws std::runtime_error when called before Initialize().
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeY.empty()) {
+         throw std::runtime_error("SOFIE ScatterElements Op called to Generate without being initialized first");
+      }
+
+      const std::string totalElements = ConvertDimShapeToLength(fShapeI);
+
+      // Atomic kernel: the grid covers every update element.
+      // Segmented kernel: the grid covers every (output row, feature) pair.
+      std::string gridSize = totalElements;
+      std::string numFeatures;
+      std::string numRows;
+      if (fUseSegmentedReduction) {
+         numFeatures = fShapeI.back().GetVal();
+         numRows     = std::string("(") + totalElements + " / " + numFeatures + ")";
+         gridSize    = fShapeY[fAxis].GetVal() + " * " + numFeatures;
+      }
+
+      std::stringstream out;
+      out << "\n//------ SCATTERELEMENTS_GPU_ALPAKA\n";
+
+      // Copy input → output (seeds the accumulation buffer, then the scatter updates it).
+      // No wait is emitted here.
+      // NOTE(review): this relies on `queue` executing tasks in order - confirm.
+      out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n";
+
+      out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(static_cast<Idx>(" << gridSize << "));\n";
+      out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      out << SP << "auto task_" << opName << " = alpaka::createTaskKernel<Acc>(workDiv_" << opName
+          << ", scatterElementsKernel_" << opName
+          << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")";
+      if (fUseSegmentedReduction) {
+         // ---- segmented-add path: atomic-free, uses pre-sorted index buffers ----
+         out << ", alpaka::getPtrNative(deviceBuf_" << fNI << "_sortedI)"
+             << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")"
+             << ", alpaka::getPtrNative(deviceBuf_" << fNI << "_sortPerm)"
+             << ", static_cast<Idx>(" << numRows << ")"
+             << ", static_cast<Idx>(" << numFeatures << "));\n";
+      } else {
+         // ---- atomic kernel ----
+         out << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")"
+             << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")"
+             << ", static_cast<Idx>(" << totalElements << "));\n";
+      }
+      out << SP << "alpaka::enqueue(queue, task_" << opName << ");\n";
+      return out.str();
+   }
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROperator_ScatterElements
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx b/core/inc/SOFIE/ROperator_Selu.hxx
similarity index 83%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx
rename to core/inc/SOFIE/ROperator_Selu.hxx
index 96f4445..5bec42c 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Selu.hxx
+++ b/core/inc/SOFIE/ROperator_Selu.hxx
@@ -17,7 +17,7 @@ private:
 
    std::string fNX;
    std::string fNY;
-   std::vector<size_t> fShape;
+   std::vector<Dim> fShape;
 
 public:
    ROperator_Selu(){}
@@ -38,9 +38,9 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Selu Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE Selu Op Input Tensor is not found in model");
       }
-      fShape = model.GetTensorShape(fNX);
+      fShape = model.GetDimTensorShape(fNX);
       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
    }
 
@@ -48,13 +48,10 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShape.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator Selu called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Selu called to Generate without being initialized first");
       }
       std::stringstream out;
-      int length = 1;
-      for(auto& i: fShape){
-         length *= i;
-      }
+      std::string length = ConvertDimShapeToLength(fShape);
       out << "\t" << "for (int id = 0; id < " << length << " ; id++){\n";
       out << "\t\t" << "tensor_" << fNY << "[id] = 1.0507009873554804934193349852946 * (std::max(float(0.0), tensor_"  << fNX << "[id]) + std::min(0.0, 1.6732632423543772848170429916717 * (std::exp(" << "tensor_" << fNX << "[id]" <<")-1)));\n";
       out << "\t}\n";
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/core/inc/SOFIE/ROperator_Shape.hxx
similarity index 64%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx
rename to core/inc/SOFIE/ROperator_Shape.hxx
index 52bdeae..c466271 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx
+++ b/core/inc/SOFIE/ROperator_Shape.hxx
@@ -47,10 +47,16 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Shape Op Input Tensor " + fNX + " is not found in model");
+         throw std::runtime_error("SOFIE Shape Op Input Tensor " + fNX + " is not found in model");
       }
-      fShape = model.GetTensorShape(fNX);
-      size_t length = fShape.size();  // this the size of shape not length of tensor
+      // Use Dim-aware shape query to handle dynamic (symbolic) tensors
+      auto dimShape = model.GetDimTensorShape(fNX);
+      size_t length = dimShape.size();  // rank of the input tensor
+      // Build fShape from dimShape (0 for symbolic/dynamic dims, concrete value otherwise)
+      fShape.resize(length);
+      for (size_t i = 0; i < length; i++)
+         fShape[i] = dimShape[i].isParam ? 0 : dimShape[i].dim;
+
       fStart = std::max(fStart,(int) -length);
       fStart = std::min(fStart,(int) length);
       if (fStart < 0) fStart += length;
@@ -74,6 +80,14 @@ public:
             std::cout << std::endl;
          }
          fIsOutputConstant = true;
+      } else if (model.IsDynamicTensor(fNX) && !fOutput_shape.empty()) {
+         // For dynamic tensors, register the output as a shape tensor with symbolic dimension values
+         std::vector<Dim> dimVals(dimShape.begin() + fStart, dimShape.begin() + fEnd);
+         model.AddShapeTensor(fNY, dimVals, false);
+         fIsOutputConstant = true;  // no runtime code needed
+         if (model.Verbose()) {
+            std::cout << "Output of Shape (dynamic input) is shape tensor: " << ConvertDimShapeToString(dimVals) << std::endl;
+         }
       }
       else
          model.AddIntermediateTensor(fNY, ETensorType::INT64, fOutput_shape);
@@ -87,7 +101,7 @@ public:
 
       OpName = "op_" + OpName;
       if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first");
       }
       std::stringstream out;
 
@@ -101,6 +115,26 @@ public:
       return out.str();
    }
 
+   // GPU code: one assignment per output element, value taken from fShape.
+   // NOTE(review): the emitted statements index deviceBuf_<Y> with [] from
+   // host code - confirm that buffer is host-accessible.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      // a constant output needs no generated code
+      if (fIsOutputConstant) return "";
+
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Shape op called to Generate without being initialized first");
+      }
+
+      std::stringstream code;
+      code << "\n//------ Shape\n";
+      // dummy statement: avoids an unused-variable warning for the input
+      code << SP << "(void) deviceBuf_" << fNX << ";\n";
+      const size_t nValues = ConvertShapeToLength(fOutput_shape);
+      for (size_t k = 0; k != nValues; ++k) {
+         code << SP << "deviceBuf_" << fNY << "[" << k << "] = " << fShape[fStart+k] << ";\n";
+      }
+      return code.str();
+   }
+
 };
 
 }//SOFIE
diff --git a/core/inc/SOFIE/ROperator_Sigmoid.hxx b/core/inc/SOFIE/ROperator_Sigmoid.hxx
new file mode 100644
index 0000000..6540b8c
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Sigmoid.hxx
@@ -0,0 +1,124 @@
+#ifndef SOFIE_ROPERATOR_Sigmoid
+#define SOFIE_ROPERATOR_Sigmoid
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+namespace SOFIE{
+
+/// Code generator for the element-wise Sigmoid operator,
+/// y = 1 / (1 + exp(-x)).  The output has the shape and type of the input.
+template <typename T>
+class ROperator_Sigmoid final : public ROperator
+{
+
+private:
+
+   std::string fNX;           // input tensor name
+   std::string fNY;           // output tensor name
+   std::vector<Dim> fShape;   // shape of the input, set by Initialize()
+
+public:
+   ROperator_Sigmoid(){}
+   ROperator_Sigmoid(std::string nameX, std::string nameY):
+      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
+         fKind = OperatorKind::SIGMOID;
+         fInputTensorNames = { fNX };
+         fOutputTensorNames = { fNY };
+      }
+
+   // output type is the input type
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // output shape is the input shape
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; //suggest copy to compiler
+      return ret;
+   }
+
+   // Read the input shape and register the output tensor.
+   // Throws std::runtime_error when the input tensor is unknown to the model.
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("SOFIE Sigmoid Op Input Tensor " + fNX + " is not found in model");
+      }
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+   }
+
+
+   // CPU code: a single loop over all elements.
+   // Throws std::runtime_error when called before Initialize().
+   std::string Generate(std::string opName) override {
+      if (fShape.empty()){
+         throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ Sigmoid -- " << opName << "\n";
+      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP  << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_"  << fNX << "[id]));\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   // GPU kernel functor: each thread handles the element at its global index,
+   // if that index is inside the tensor.  The functor does not depend on the
+   // operator instance.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ SIGMOID_KERNEL_ALPAKA\n";
+      op += "struct SigmoidKernel {\n";
+      op += SP + "template<typename TAcc, typename T>\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n";
+      op += SP + SP + "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + "if(idx < numElements) {\n";
+      op += SP + SP + SP + "out[idx] = static_cast<T>(1) / (static_cast<T>(1) + exp(-data[idx]));\n";
+      op += SP + SP + "}\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+
+
+   // Declare the kernel functor instance referenced by Generate_GPU_ALPAKA.
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return SP + "SigmoidKernel sigmoidKernel;\n";
+   }
+
+   // GPU launch code.  The generated locals are named after the operator,
+   // not after the input tensor: two operators reading the same tensor (or
+   // operators whose tensors were renamed by UpdateFusableTensorName) would
+   // otherwise define the same variable twice in one scope.
+   // Throws std::runtime_error when called before Initialize().
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Sigmoid called to Generate without being initialized first");
+      }
+
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ SIGMOID_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << OpName << " = Vec::all(static_cast<Idx>(1));\n";
+      // static_cast instead of Idx{...}: brace initialisation rejects a narrowing length expression
+      out << SP << "auto const elementsPerGrid_" << OpName << " = Vec::all(static_cast<Idx>(" << length << "));\n";
+      out << SP << "auto const workDiv_" << OpName << " = sofie_workdiv(elementsPerGrid_" << OpName << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << OpName
+         << ", sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX
+         << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+   // name of the output tensor, used when fusing operators
+   std::string GetFusableOutputTensorName() override {
+      return fNY;
+   }
+
+   bool IsElementwise() const override { return true; }
+   // expression applying the sigmoid to the value expression v
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "static_cast<T>(1) / (static_cast<T>(1) + exp(-(" + v + ")))";
+   }
+
+   // Make the operator work in place on fusable_tensor_name; removal_func is
+   // called with the previous input and output tensor names.
+   void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function<void(const std::string&)>& removal_func){
+      removal_func(fNX);
+      removal_func(fNY);
+      fNX = fusable_tensor_name;
+      fNY = fusable_tensor_name;
+      fInputTensorNames[0] =  fNX;
+      fOutputTensorNames[0] = fNY;
+   }
+
+   // std::exp in the generated CPU code needs <cmath>
+   std::vector<std::string> GetStdLibs() override { return { std::string("cmath") };}
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_Sigmoid
diff --git a/core/inc/SOFIE/ROperator_Slice.hxx b/core/inc/SOFIE/ROperator_Slice.hxx
new file mode 100644
index 0000000..fb738cf
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Slice.hxx
@@ -0,0 +1,592 @@
+#ifndef SOFIE_ROPERATOR_SLICE
+#define SOFIE_ROPERATOR_SLICE
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <cassert>
+#include <sstream>
+#include <numeric>
+
+
+namespace SOFIE{
+
+// slice operator
+
+template <typename IType>
+class ROperator_Slice final : public ROperator
+{
+
+private:
+
+   // flags set when start/end/step values are not known at code-generation time
+   // (they are read from intermediate tensors when the generated code runs)
+   bool fIsStartUndef = false;
+   bool fIsEndUndef = false;
+   bool fIsStepUndef = false;
+   bool fIdentitySlice = false;  // set in Initialize when the slice keeps the whole input (start 0, full end, step 1)
+   std::string fNData;        // input data tensor name
+   std::string fNOutput;      // output data name
+   std::vector<std::string> fNames;       // names of the starts, ends, axes and steps input tensors (in this order)
+   std::vector<Dim> fShapeInput;     // input shape
+   std::vector<Dim> fShapeOutput;   // output shape
+   std::vector<Dim> fOutputShapeData;   // output shape data in case output is a shape param tensor
+
+   // the saved Start/End/Steps are corrected from the initial ONNX values (negative/default values)
+   // and hold one entry for each axis of the input
+   std::vector<Dim> fStart;         // starting values of slices for all axes
+   std::vector<Dim> fEnd;           // End values of slices for all axes
+   std::vector<Dim> fSteps;         // step values of slices for all axes
+   std::vector<Dim> fStartDims;         // input starting values of slices
+   std::vector<Dim> fEndDims;           // input End values of slices
+   std::vector<Dim> fStepDims;         // input step values of slices
+   std::vector<IType> fAxes;           // axes for input start/end/step values
+
+   std::vector<std::vector<IType>> fAttributes; // starts, ends, axes attributes for the version < 10 case
+
+
+
+public:
+
+   /// Default constructor: no tensor names and no slice attributes are set.
+   ROperator_Slice(){}
+
+   /// Constructor for ONNX versions >= 10, where starts/ends/axes/steps arrive as input tensors.
+   ///
+   /// \param nameData   name of the tensor to slice
+   /// \param names      names of the starts, ends, axes and steps tensors, in this order;
+   ///                   trailing entries may be missing and an empty name marks an absent input
+   /// \param nameOutput name of the output tensor
+   /// \throws std::runtime_error if more than four names are given
+   ROperator_Slice(std::string nameData, std::vector<std::string> names, std::string nameOutput)
+      : fNData(UTILITY::Clean_name(nameData)),
+      fNOutput(UTILITY::Clean_name(nameOutput))
+   {
+    // fNames always has exactly four slots; writing past them would corrupt memory
+    if (names.size() > 4)
+       throw std::runtime_error("TMVA Slice Op : too many input tensors, expected at most starts, ends, axes and steps");
+    fNames.resize(4);
+    // axes and steps can be optional
+    for (size_t i = 0; i < names.size(); ++i) {
+        fNames[i] = UTILITY::Clean_name(names[i]);
+    }
+
+    // NOTE(review): only the data tensor is listed as an input, although Generate() may also read
+    // the start/end/step tensors at run time - confirm this is intended.
+    fInputTensorNames = { fNData };
+    fOutputTensorNames = { fNOutput };
+   }
+   /// Constructor for ONNX versions < 10, where starts/ends/axes are operator attributes.
+   ///
+   /// \param nameData   name of the tensor to slice
+   /// \param starts     start index per listed axis
+   /// \param ends       end index per listed axis
+   /// \param axes       axes the starts/ends refer to (may be empty)
+   /// \param nameOutput name of the output tensor
+   ROperator_Slice(std::string nameData, std::vector<IType> starts, std::vector<IType> ends, std::vector<IType> axes, std::string nameOutput)
+      : fNData(UTILITY::Clean_name(nameData)),
+      fNOutput(UTILITY::Clean_name(nameOutput))
+   {
+     // stored in the order Initialize() expects: starts, ends, axes
+     fAttributes.push_back(starts);
+     fAttributes.push_back(ends);
+     fAttributes.push_back(axes);
+
+     // register the tensors exactly as the input-tensor based constructor does
+     fInputTensorNames = { fNData };
+     fOutputTensorNames = { fNOutput };
+    }
+
+
+
+   /// Resolve start/end/step for every axis, derive the output shape and register the output in the model.
+   ///
+   /// Starts, ends, axes and steps are taken from initialized tensors, shape tensors or intermediate
+   /// (run-time) tensors when tensor names were given, otherwise from the stored attributes.
+   /// The output is registered as a constant tensor (constant INT64 input, or a shape-tensor slice
+   /// with known values), as a shape tensor, or as an intermediate tensor.
+   ///
+   /// \param model model that owns the tensors; the output tensor is added to it
+   /// \throws std::runtime_error if the input tensor is unknown, the start or end input is missing,
+   ///         start/end/step have fewer entries than axes, an axis is out of range, a step is zero,
+   ///         or a step is parametric on an axis of known length
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNData) == false){   //input must be a graph input, or already initialized intermediate tensor
+         throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model");
+      }
+
+      fShapeInput = model.GetDimTensorShape(fNData);
+
+      // values of starts, ends, axes, steps (in this order) when they are known constants
+      std::vector<std::vector<IType>> itensors(4);
+
+      if (fNames.size() > 0) {  // size has to be equal to 4
+         // loop on the extra 2 or 3 or 4 inputs
+         for (size_t i = 0; i < 4; ++i) {
+            if (!fNames[i].empty()) {
+               if (model.IsInitializedTensor(fNames[i])) {
+                  auto dptr = model.GetInitializedTensorData(fNames[i]);
+                  auto tensor = static_cast<IType *>(dptr.get());
+                  auto vec = model.GetTensorShape(fNames[i]);
+                  assert(vec.size() == 1);
+                  itensors[i] = std::vector<IType>(tensor, tensor + vec[0]);
+
+               } else if (model.IsShapeTensor(fNames[i])) {
+                  // case is a shape tensor
+                  if (i == 0) {
+                     fStartDims = model.GetShapeTensorValues(fNames[i]);
+                  } else if (i == 1) {
+                     fEndDims = model.GetShapeTensorValues(fNames[i]);
+                  } else if (i == 3) {
+                     fStepDims = model.GetShapeTensorValues(fNames[i]);
+                  }
+               } else {
+                  // case is an intermediate tensor: values are only known at run time,
+                  // so a named parameter is created for each entry
+                  auto shape = model.GetTensorShape(fNames[i]);
+                  size_t s = shape[0];
+                  for (size_t k = 0; k < s; k++) {
+                     if (i == 0) {
+                        fStartDims.push_back( Dim{std::string("start_") + fNOutput + "_" + std::to_string(k)});
+                        fIsStartUndef = true;
+                     } else if (i == 1) {
+                        fEndDims.push_back(Dim{std::string("end_") + fNOutput + "_" + std::to_string(k)});
+                        fIsEndUndef = true;
+                     } else if (i == 3) {
+                        fStepDims.push_back(Dim{std::string("step_") + fNOutput + "_" + std::to_string(k)});
+                        fIsStepUndef = true;
+                     }
+                  }
+               }
+            }
+         }
+      } else {
+         // old slice versions
+         assert(fAttributes.size() > 1);
+         for (size_t i = 0; i < fAttributes.size(); i++) {
+            itensors[i] = fAttributes[i];
+         }
+      }
+      size_t dim = fShapeInput.size();
+
+      // default values
+      fSteps = std::vector<Dim>(dim, Dim{1});
+      fStart = std::vector<Dim>(dim, Dim{0});
+      fEnd = fShapeInput;
+
+      // default axes
+      if (itensors[2].empty()) {
+         fAxes.resize(dim);
+         std::iota(fAxes.begin(), fAxes.end(), 0);
+      } else {
+         fAxes = itensors[2];
+         for (size_t i = 0; i < fAxes.size(); i++) {
+            // negative axes - they count from the back
+            if (fAxes[i] < 0) fAxes[i] = dim + fAxes[i];
+            if (fAxes[i] < 0 || fAxes[i] >= static_cast<IType>(dim))
+               throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(fAxes[i]) +
+                  " for  " + std::to_string(i));
+         }
+      }
+      // Loop on axis to get start/end/step values
+      for (size_t i = 0; i < fAxes.size(); i++) {
+         // constant inputs are appended one entry per axis; the index test keeps an input that is
+         // shorter than the axes list from being read past its end (it is rejected below instead)
+         if (i < itensors[0].size())
+            fStartDims.push_back(Dim{ static_cast<size_t>(itensors[0][i])});
+         if (fStartDims.empty())
+            throw std::runtime_error("TMVA Slice Op : Missing start input tensor");
+
+         if (i < itensors[1].size())
+            fEndDims.push_back(Dim{ static_cast<size_t>(itensors[1][i])});
+         else if (fEndDims.empty())
+            throw std::runtime_error("TMVA Slice Op : Missing end input tensor");
+
+         if (!itensors[3].empty()) {
+            if (i < itensors[3].size())
+               fStepDims.push_back(Dim{ static_cast<size_t>(itensors[3][i])});
+         }
+         else if (fStepDims.size() < fAxes.size())  // this can happen since it is optional
+            fStepDims.push_back(Dim{size_t(1)});
+
+         if (i >= fStartDims.size() || i >= fEndDims.size() || i >= fStepDims.size())
+            throw std::runtime_error("TMVA Slice Op : start/end/step inputs have fewer entries than axes");
+
+         if (!fShapeInput[fAxes[i]].isParam) {
+            size_t iAxisDim = fShapeInput[fAxes[i]].dim;
+            //correct values if too large or too small
+            IType istart = 0;
+            if (!fStartDims[i].isParam) {
+               istart = static_cast<IType>(fStartDims[i].dim);
+               if (istart < 0) istart = iAxisDim + istart;
+            }
+            IType iend = static_cast<IType>(iAxisDim);
+            if (!fEndDims[i].isParam) {
+               iend = static_cast<IType>(fEndDims[i].dim);
+               if (iend < 0) iend = iAxisDim + iend;
+            }
+            //steps
+            IType istep = 1;
+            if (!fStepDims[i].isParam) {
+               istep = static_cast<IType>(fStepDims[i].dim);
+            } else {
+               throw std::runtime_error("TMVA Slice Op : parametric step inputs are not supported");
+            }
+            // clamp start end values depending on steps
+            // start must be [0,N] for positive steps or [0,N-1] for negative
+            // end   must be [0,N] for positive steps or [-1, N-1] for negative
+            if (istart < 0) istart = 0;
+            if (istep > 0) {
+               if (istart > static_cast<IType>(iAxisDim)) istart = static_cast<IType>(iAxisDim);
+               if (iend < 0) iend = 0;
+               if (iend > static_cast<IType>(iAxisDim)) iend = static_cast<IType>(iAxisDim);
+            } else if (istep < 0) {
+               if (istart > static_cast<IType>(iAxisDim)-1) istart = static_cast<IType>(iAxisDim) -1;
+               if (iend < -1) iend = -1;
+               if (iend > static_cast<IType>(iAxisDim)-1) iend = static_cast<IType>(iAxisDim) -1;
+            } else {
+               throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(istep) +
+                  " for  " + std::to_string(i));
+            }
+            // for parametric values the clamping is done at run time
+            if (fStartDims[i].isParam)
+               fStart[fAxes[i]] = fStartDims[i];
+            else
+               fStart[fAxes[i]] = Dim{size_t(istart)};
+            // the end value is selected by the end flag (not by the start flag)
+            if (fEndDims[i].isParam)
+               fEnd[fAxes[i]] = fEndDims[i];
+            else
+               fEnd[fAxes[i]] = Dim{size_t(iend)};
+
+            fSteps[fAxes[i]] = Dim{size_t(istep)};
+         } else {
+            //std::cout << i << " Param dim for " << fAxes[i] << "  " <<  fShapeInput[fAxes[i]] << std::endl;
+            // correct only negative values
+            if (!fStartDims[i].isParam) {
+               IType istart = static_cast<IType>(fStartDims[i].dim);
+               if (istart < 0) {
+                  std::string sstart = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-istart) +")";
+                  fStart[fAxes[i]] = Dim{sstart,size_t(-1)};
+               } else {
+                 fStart[fAxes[i]] = Dim{size_t(istart)};
+               }
+            } else {
+               fStart[fAxes[i]] = fStartDims[i];
+            }
+            if (!fEndDims[i].isParam) {
+               IType iend = static_cast<IType>(fEndDims[i].dim);
+               if (iend < 0) {
+                  std::string send = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-iend) +")";
+                  fEnd[fAxes[i]] = Dim{send,size_t(-1)};
+               } else if (iend == std::numeric_limits<IType>::max()){
+                  fEnd[fAxes[i]] = fShapeInput[fAxes[i]];
+               } else {
+                 fEnd[fAxes[i]] = Dim{size_t(iend)};
+               }
+            } else {
+               fEnd[fAxes[i]] = fEndDims[i];
+            }
+
+            fSteps[fAxes[i]] = fStepDims[i];
+         }
+
+      }
+      //  find output shape
+      fShapeOutput.resize(dim);
+      for (size_t i = 0; i < dim; i++) {
+         if (!fEnd[i].isParam && !fStart[i].isParam && !fSteps[i].isParam) {
+            int64_t istart = static_cast<int64_t>(fStart[i].dim);
+            int64_t iend = static_cast<int64_t>(fEnd[i].dim);
+            int64_t istep= static_cast<int64_t>(fSteps[i].dim);
+            if (istep == 0)
+               throw std::runtime_error("TMVA Slice Op : invalid step value 0 for  " + std::to_string(i));
+            // the number of selected indices is ceil((end-start)/step), and zero for an empty range;
+            // a truncating division would undersize the output when the range is not a multiple of the step
+            int64_t span = iend - istart;
+            int64_t s = 0;
+            if (istep > 0 && span > 0)
+               s = (span + istep - 1) / istep;
+            else if (istep < 0 && span < 0)
+               s = (span + istep + 1) / istep;
+            fShapeOutput[i] = Dim{static_cast<size_t>(s)};
+         } else {
+            std::string s;
+            if (fStart[i].GetVal() != "0")
+               s = "(" + fEnd[i].GetVal() + "-" + fStart[i].GetVal() + ")";
+            else
+               s = fEnd[i].GetVal();
+            if (fSteps[i].GetVal() != "1") {
+               // a known positive step rounds up, as in the constant case above
+               // NOTE(review): for parametric or negative steps the expression is still a plain quotient
+               if (!fSteps[i].isParam && static_cast<int64_t>(fSteps[i].dim) > 0)
+                  s += "+" + std::to_string(static_cast<int64_t>(fSteps[i].dim) - 1);
+               // keep the parentheses balanced: ((range)/step)
+               s = "((" + s + ")/" + fSteps[i].GetVal() + ")";
+            }
+            fShapeOutput[i] = Dim{s,size_t(-1)};
+            // add also the shape parameters to RModel to declare them when
+            // allocating output tensor
+            if (fEnd[i].isParam && fEnd[i].dim != size_t(-1))
+               model.AddShapeParam(fEnd[i].param,fEnd[i].dim );
+            if (fStart[i].isParam && fStart[i].dim != size_t(-1))
+               model.AddShapeParam(fStart[i].param,fStart[i].dim );
+            if (fSteps[i].isParam && fSteps[i].dim != size_t(-1))
+               model.AddShapeParam(fSteps[i].param,fSteps[i].dim );
+
+         }
+      }
+      // case input is a constant tensor and of int64 type
+      if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) {
+         fIsOutputConstant = true;
+         auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNData).get());
+         size_t outputSize = ConvertShapeToLength(ConvertShapeToInt(fShapeOutput));
+         std::vector<int64_t> outputData(outputSize);
+         std::vector<size_t> inputStride = UTILITY::ComputeStrideFromShape(ConvertShapeToInt(fShapeInput));
+         if (model.Verbose()) {
+            std::cout << "Do slice for initialized input ..(start, end, step)\n";
+            for (size_t ii = 0; ii< fStart.size(); ii++)
+               std::cout << fStart [ii] << "  " << fEnd[ii] << "  " << fSteps[ii] << std::endl;
+         }
+          // perform slice using a recursive function- need to use two lambda functions for this
+         auto sliceRecursive = [&](size_t iaxis, size_t & outIdx, size_t & inOffset) {
+            auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) {
+               if (fStart[iax].isParam || fEnd[iax].isParam || fSteps[iax].isParam)
+                  throw std::runtime_error("TMVA Slice Op : cannot have parametric values when input is constant");
+               // compute indices
+               std::vector<IType> indices;
+               for (IType i = (IType) fStart[iax].dim; (IType(fSteps[iax].dim) > 0) ? i < IType(fEnd[iax].dim) : i > IType(fEnd[iax].dim); i += IType(fSteps[iax].dim) )
+                  indices.push_back(i);
+               if (iax == dim-1) { // last axis
+                  for (size_t i = 0; i < indices.size(); i++) {
+                     outputData[outputIdx] = inputData[inputOffset + indices[i]];
+                     outputIdx++;
+                  }
+                  return;
+               } else {
+                  for (size_t i = 0; i < indices.size(); i++) {
+                     size_t offset = inputOffset + inputStride[iax]*indices[i];
+                     sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl);
+                  }
+               }
+            };
+            slice_impl(iaxis, outIdx, inOffset,slice_impl);
+         };
+         size_t idx = 0;
+         size_t offset = 0;
+         sliceRecursive(0, idx, offset);
+
+         model.AddConstantTensor<int64_t>(fNOutput, ConvertShapeToInt(fShapeOutput), outputData.data());
+         if (model.Verbose()) {
+            std::cout << "Slice: output is a constant tensor " << ConvertDimShapeToString(fShapeOutput) << " : "
+                     << ConvertValuesToString(outputData) << std::endl;
+         }
+      }
+      else if (model.IsShapeTensor(fNData) && !fStart[0].isParam && !fEnd[0].isParam) {
+         // case of input is a shape tensor. In this case rank=1 always, axis =0 and Slice is trivial
+         auto inputData = model.GetShapeTensorValues(fNData);
+         // keep the copied range inside the shape values and never reversed
+         // NOTE(review): the step is ignored in this path - confirm only unit steps reach it
+         size_t first = fStart[0].dim;
+         size_t last = fEnd[0].dim;
+         if (last > inputData.size()) last = inputData.size();
+         if (first > last) first = last;
+         fOutputShapeData = std::vector<Dim>(inputData.begin() + first, inputData.begin() + last);
+         // try to convert to integer values if possible
+         auto outputData = ConvertShapeToInt(fOutputShapeData);
+         fShapeOutput = { Dim{fOutputShapeData.size()}};
+         if (outputData.empty()) {
+            // is a param shape tensor
+            model.AddShapeTensor(fNOutput, fOutputShapeData);
+            fIsOutputParamShape = true;
+            if (model.Verbose()) {
+               std::cout << "Slice: output is a shape tensor -> " << fNOutput << "  " << ConvertDimShapeToString(fShapeOutput) << " with values "
+                        << ConvertDimShapeToString(fOutputShapeData) << " (shape)" << std::endl;
+            }
+         } else {
+            fIsOutputConstant = true;
+            std::vector<int64_t> data(outputData.size());
+            std::copy(outputData.begin(), outputData.end(), data.begin());
+            model.AddConstantTensor<int64_t>(fNOutput, {data.size()}, data.data());
+            if (model.Verbose()) {
+               std::cout << "Slice: output is a constant tensor -> " << fNOutput << "  " << ConvertDimShapeToString(fShapeOutput) << " with values "
+                        << ConvertDimShapeToString(fOutputShapeData) << " constant " << std::endl;
+            }
+         }
+      }
+      else {
+         // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1
+         size_t ndim = fShapeInput.size();
+         fIdentitySlice = fShapeOutput.size() == ndim;
+         // check also if input data is not input to the model. In that case we copy the data since we cannot just copy from the input pointer
+         fIdentitySlice &= (!model.IsReadyInputTensor(fNData) && !model.IsDimInputTensor(fNData));
+         for (size_t idim = 0; idim < ndim; idim++) {
+            if (!fIdentitySlice) break;
+            fIdentitySlice &= (fStart[idim].GetVal() == "0");
+            fIdentitySlice &= (fSteps[idim].GetVal() == "1");
+            fIdentitySlice &= (fEnd[idim].GetVal() == fShapeInput[idim].GetVal());
+         }
+
+         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput);
+         //if (fIdentitySlice)  model.AddAliasTensor(fNOutput, fNData);
+
+         if (model.Verbose()) {
+            std::cout << "Slice " << fNData << "  " << ConvertDimShapeToString(fShapeInput)
+                      << "---> " << fNOutput << " " <<  ConvertDimShapeToString(fShapeOutput);
+            if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) ";
+            std::cout << std::endl;
+
+         }
+      }
+   }
+
+   /// Generate the CPU inference code for this Slice node.
+   ///
+   /// Only a header comment is emitted for constant outputs; shape-tensor outputs are assigned
+   /// element by element; identity slices become one std::copy; every other case becomes a nest
+   /// of loops, one per input dimension, that gathers the selected elements into the output.
+   ///
+   /// Start/end values that are only known at run time are emitted as signed variables, so that
+   /// the emitted negative-index correction and clamping can actually take effect.
+   ///
+   /// \param opName operator identifier, used only in the emitted header comment
+   /// \return the generated C++ code
+   /// \throws std::runtime_error if the operator has not been initialized
+   std::string Generate(std::string opName) override {
+
+      if (fShapeInput.empty() || fShapeOutput.empty()){
+         throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first");
+      }
+
+      std::stringstream out;
+
+      out << "///------- Slice operator " << opName << "---> " << fNOutput << " "
+          << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl;
+      if (fIsOutputConstant) return out.str();  //no op for constant tensors
+      if (fIsOutputParamShape) {
+         // print the values held by the shape tensor (not its shape)
+         out << "/// Slice output is a shape tensor with values : " << ConvertDimShapeToString(fOutputShapeData) << "\n";
+         // need to generate code assigning values to shape tensors
+         for (int i = 0; i < static_cast<int>(fShapeOutput[0].dim); i++) {
+                  out << SP << "tensor_" << fNOutput << "[" << i << "] = " << fOutputShapeData[i] << ";\n";
+         }
+         return out.str();
+      }
+
+      size_t ndim = fShapeInput.size();
+
+      if (fIdentitySlice) {
+         out << "/// Slice is just an identity (copy) \n";
+         //out << SP << "tensor_" << fNOutput << " = const_cast<" << ConvertTypeToString(fOutputType) << " *>(tensor_" << fNData << ");\n";
+         out << SP << "std::copy(tensor_" << fNData << ", tensor_" << fNData << " + " << ConvertDimShapeToLength(fShapeInput) << ", tensor_" << fNOutput << ");\n";
+         return out.str();
+      }
+
+      // loop on the dimensions depending on the orders
+      auto strides = UTILITY::ComputeStrideFromShape(fShapeInput);
+
+      // length of the axis addressed by entry i, emitted as a signed expression so that the
+      // comparisons with the signed start/end variables are not done in unsigned arithmetic
+      auto axisLength = [this](size_t i) {
+         std::stringstream ss;
+         ss << "static_cast<int64_t>(" << fShapeInput[fAxes[i]] << ")";
+         return ss.str();
+      };
+
+      out << SP << "{\n"; // define operator scope
+      for (size_t i = 0; i < fStepDims.size(); i++) {
+         if (fStepDims[i].isParam) {
+            if (fIsStepUndef)
+               out << SP << "size_t " << fStepDims[i] << " = tensor_" << fNames[3] << "[" << i << "];\n";
+         }
+      }
+      // special case for parametric  values for start/end. Need to do clipping
+      for (size_t i = 0; i < fStartDims.size(); i++) {
+         if (fStartDims[i].isParam && fStartDims[i].param != fShapeInput[fAxes[i]].param) {
+            std::string s_start = "start_" + std::to_string(i);
+            // signed type: an unsigned variable could never satisfy the "< 0" tests emitted below
+            if (fIsStartUndef) {
+               s_start = fStartDims[i].param;
+               out << SP << "int64_t " << s_start << " = tensor_" << fNames[0] << "[" << i << "];\n";
+            } else {
+               out << SP << "int64_t " << s_start << " = " <<  fStartDims[i] << ";\n";
+               fStart[fAxes[i]] = s_start; // need to use this value later when slicing
+            }
+            const std::string len = axisLength(i);
+            out << SP << "if (" << s_start << " < 0) " << s_start << " += " << len <<";\n";
+            out << SP << "if (" << s_start << " < 0) " << s_start << " = 0;\n";
+            if (!fStepDims[i].isParam) {
+               if (static_cast<IType>(fStepDims[i].dim) > 0 )
+                  out << SP << "if (" << s_start << " > " << len << " ) " << s_start << " = " << len <<";\n";
+               else
+                  out << SP << "if (" << s_start << " > " << len << " - 1" << " ) " << s_start << " = " << len << " - 1;\n";
+            }
+         }
+         // special case if the start is the axis size itself and the step is negative
+         else if (fStartDims[i].isParam && fStartDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast<IType>(fStepDims[i].dim) < 0 ) {
+            fStart[fAxes[i]] = Dim{ fStartDims[i].param + "-1" };
+         }
+      }
+      // now do the same for end
+      for (size_t i = 0; i < fEndDims.size(); i++) {
+         if (fEndDims[i].isParam && fEndDims[i].param != fShapeInput[fAxes[i]].param) {
+            std::string s_end = "end_" + std::to_string(i);
+            // signed type for the same reason as for start
+            if (fIsEndUndef) {
+               s_end = fEndDims[i].param;
+               out << SP << "int64_t " << s_end << " = tensor_" << fNames[1] << "[" << i << "];\n";
+            } else {
+               out << SP << "int64_t " << s_end << " = " <<  fEndDims[i] << ";\n";
+               fEnd[fAxes[i]] = s_end; // need to use this value later when slicing
+            }
+            const std::string len = axisLength(i);
+            out << SP << "if (" << s_end << " < 0) " << s_end << " += " << len <<";\n";
+            if (!fStepDims[i].isParam) {
+               if (static_cast<IType>(fStepDims[i].dim) > 0 ) {
+                  out << SP << "if (" << s_end << " < 0) " << s_end << " = 0;\n";
+                  out << SP << "if (" << s_end << " > " << len << " ) " << s_end << " = " << len <<";\n";
+               } else {
+                  out << SP << "if (" << s_end << " < -1) " << s_end << " = -1;\n";
+                  out << SP << "if (" << s_end << " > " << len << " - 1" << " ) " << s_end << " = " << len << " - 1;\n";
+               }
+            }
+         }
+         // special case if the end is the axis size itself and the step is negative
+         else if (fEndDims[i].isParam && fEndDims[i].param == fShapeInput[fAxes[i]].param && !fStepDims[i].isParam && static_cast<IType>(fStepDims[i].dim) < 0 ) {
+            fEnd[fAxes[i]] = Dim{ fEndDims[i].param + "-1" };
+         }
+      }
+
+      // NOTE(review): the loops below always use size_t counters and a "<" condition, also for
+      // negative steps - confirm whether negative steps are expected to work on this path.
+      out << SP << "size_t iOut = 0;\n";
+      std::string MSP = SP;
+      for (size_t idim = 0; idim < ndim; idim++) {
+        out << MSP << "for (size_t i" << idim << " = " << fStart[idim] <<  "; i" << idim << " < " << fEnd[idim]
+            << "; i" << idim << "+= " << fSteps[idim] << ") {\n";
+        MSP += SP;
+        if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n";
+      }
+      // flat input index: sum of the per-dimension offsets plus the innermost index
+      out << MSP << "size_t iInput = ";
+      for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + ";
+      // here should be step size ?
+      out << "i" << ndim-1 << ";\n";
+      out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" <<fNData << "[iInput];\n";
+      // close the loops, removing one indentation level each time
+      for (size_t idim = 0; idim < ndim; idim++) {
+          MSP = MSP.replace(0,SP.length(),"");
+          out << MSP << "}\n";
+      }
+      out << SP << "}\n"; // end operator scope
+
+      return out.str();
+   }
+
+   /// Generate the alpaka kernel functor ("SliceKernel_op_<opName>") for this Slice node.
+   ///
+   /// Each output element index is decomposed into per-dimension output coordinates, mapped to
+   /// the input coordinate start + coordinate * step, and the matching input element is copied.
+   /// Shapes, strides, starts and steps are written into the kernel text as literals.
+   /// NOTE(review): a "u" suffix is appended to shape/stride values, which presumably only
+   /// yields valid code for non-parametric shapes - confirm.
+   ///
+   /// \param opName operator identifier; it is prefixed with "op_" before being used
+   /// \return the kernel source, or an empty string when the output is a constant tensor
+   /// \throws std::runtime_error if the operator has not been initialized
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) return "";
+      opName = "op_" + opName;
+      if (fShapeInput.empty() || fShapeOutput.empty())
+         throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first");
+
+      const std::size_t D = fShapeInput.size();
+
+      auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeInput);
+      auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeOutput);
+
+      // the element count is a kernel argument, so it is not needed while generating the kernel
+      std::string kname = "SliceKernel_" + opName;
+
+      std::string op;
+      op  = "\n//------ SLICE_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ input,\n";
+      op += SP + SP + SP + "T* __restrict__ output,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // output coordinate along each dimension
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+               + " = (elem_idx / " + outputStrides[d].GetVal() + "u) % "
+               + fShapeOutput[d].GetVal() + "u;\n";
+      }
+      op += "\n";
+
+      // Map each output coord back to input coord:
+      //   input_coord[d] = fStart[d] + out_d * fSteps[d]
+      // The start and step values are used as stored by Initialize().
+      op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         // input coordinate for this dim: start + out_d * step
+         std::string input_coord = "(" + fStart[d].GetVal()
+               + " + out_" + std::to_string(d)
+               + " * " + fSteps[d].GetVal() + ")";
+         op += SP + SP + SP + SP + SP
+               + "static_cast<std::size_t>(" + input_coord + ")"
+               + " * " + inputStrides[d].GetVal() + "u";
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+
+      op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+      return op;
+   }
+
+   /// Generate the declaration of the kernel object ("sliceKernel_op_<opName>") of type
+   /// "SliceKernel_op_<opName>".
+   /// \param opName operator identifier; it is prefixed with "op_" before being used
+   /// \return one line of code, or an empty string when the output is a constant tensor
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) {
+         return "";
+      }
+      const std::string suffix = "op_" + opName;
+      // "<indent><KernelType> <kernelObject>;"
+      std::string declaration = SP;
+      declaration += "SliceKernel_" + suffix;
+      declaration += " sliceKernel_" + suffix;
+      declaration += ";\n";
+      return declaration;
+   }
+
+   /// Generate the host-side code that launches the slice kernel on the alpaka queue.
+   ///
+   /// The element count is rendered with ConvertDimShapeToLength, the helper this code base
+   /// already uses for Dim shapes in the CPU path, instead of an integer-only conversion.
+   ///
+   /// \param opName operator identifier; it is prefixed with "op_" before being used
+   /// \return the generated C++ code, or an empty string when the output is a constant tensor
+   /// \throws std::runtime_error if the operator has not been initialized
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) return "";
+      opName = "op_" + opName;
+      if (fShapeInput.empty() || fShapeOutput.empty())
+         throw std::runtime_error("SOFIE Slice Op called to Generate without being initialized first");
+
+      // number of output elements, rendered as text for the generated code
+      const std::string totalElements = ConvertDimShapeToLength(fShapeOutput);
+      std::string kname = "sliceKernel_" + opName;
+
+      std::stringstream out;
+      out << "\n//------ SLICE_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"   << opName << " = Vec::all(Idx{" << totalElements << "});\n";
+      out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      // the kernel reads deviceBuf_<fNData> and writes deviceBuf_<fNOutput>
+      out << SP << "alpaka::exec<Acc>(queue, workDiv_" << opName
+         << ", " << kname
+         << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")"
+         << ", alpaka::getPtrNative(deviceBuf_" << fNOutput << ")"
+         << ", static_cast<Idx>(" << totalElements << "));\n";
+
+      return out.str();
+   }
+
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROPERATOR_SLICE
diff --git a/core/inc/SOFIE/ROperator_Softmax.hxx b/core/inc/SOFIE/ROperator_Softmax.hxx
new file mode 100644
index 0000000..5626c0f
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Softmax.hxx
@@ -0,0 +1,192 @@
+#ifndef SOFIE_ROPERATOR_Softmax
+#define SOFIE_ROPERATOR_Softmax
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+namespace SOFIE {
+
+class ROperator_Softmax final : public ROperator {
+   // Generates C++ code that writes the softmax (optionally its logarithm) of tensor fNX along one axis into fNY.
+private:
+   bool fLogSoftmax = false; // emit log(softmax) instead of softmax
+   bool fUseVDT = false;     // set from model.UseVDT(): use vdt::fast_expf / vdt::fast_logf in generated code
+   int64_t fAttrAxis = -1;   // softmax axis; a negative value is offset by the tensor rank in Generate()
+
+   std::string fNX;          // input tensor name (after UTILITY::Clean_name)
+   std::string fNY;          // output tensor name (after UTILITY::Clean_name)
+   std::vector<Dim> fShape;  // shape of input and output, set in Initialize()
+
+   std::string fType;        // element type as text, used to declare temporaries in generated code
+
+public:
+   ROperator_Softmax() {}
+   ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY, bool logSoftmax = false)
+      : fLogSoftmax(logSoftmax),
+      fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
+
+   {
+      fInputTensorNames = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; // suggest copy to compiler
+      return ret;
+   }
+
+   void Initialize(RModel& model) override {
+      // The input must already be known to the model (graph input or earlier intermediate tensor).
+      if (!model.CheckIfTensorAlreadyExist(fNX)) {
+         throw std::runtime_error("SOFIE Softmax Op Input Tensor is not found in model");
+      }
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+      fType = ConvertTypeToString(model.GetTensorType(fNX));
+      if (model.Verbose()) {
+         std::cout << "Softmax -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl;
+      }
+      fUseVDT = model.UseVDT();
+      if (fUseVDT) {
+         model.AddNeededCustomHeader("vdt/exp.h");
+         if (fLogSoftmax)
+            model.AddNeededCustomHeader("vdt/log.h");
+      }
+   }
+
+   std::string Generate(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Operator Softmax called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      out << "///------- Softmax " << opName << " ---> "  // << fNY << " "
+          << ConvertDimShapeToString(fShape) << "\n" << std::endl;
+      size_t size = fShape.size();
+      auto length_str = ConvertDimShapeToLength(fShape);
+      size_t axis = fAttrAxis < 0 ? size + fAttrAxis : fAttrAxis;
+      if (axis >= size) throw std::runtime_error("SOFIE Softmax Op: axis " + std::to_string(fAttrAxis) + " is out of range for a tensor of rank " + std::to_string(size));
+      std::string expFunction = (fUseVDT) ? "vdt::fast_expf" : "std::exp";
+      std::string logFunction = (fUseVDT) ? "vdt::fast_logf" : "std::log";
+
+      // Check if this is the special case where memory is contiguous.
+      if (axis == size - 1) {
+         std::string axis_size = fShape[axis].GetVal();
+         std::string num_rows;
+         if (IsInteger(length_str) && IsInteger(axis_size)) {
+            num_rows = std::to_string(std::stoul(length_str) / std::stoul(axis_size));
+         } else {
+            num_rows = "(" + length_str + ") / (" + axis_size + ")";
+         }
+
+         out << SP << "//-----  softmax axis is last one - " << axis << "\n";
+         out << SP << "for (size_t i = 0; i < " << num_rows << "; ++i) {\n"; // size_t: offset below must not overflow int
+         out << SP << SP << "size_t offset = i * " << axis_size << ";\n";
+         out << SP << SP << fType << " const * x_ptr = &tensor_" << fNX << "[offset];\n";
+         out << SP << SP << fType << " * y_ptr = &tensor_" << fNY << "[offset];\n";
+
+         out << SP << SP << fType << " vmax = x_ptr[0];\n";
+         out << SP << SP << "for (int j = 1; j < " << axis_size << "; ++j) {\n";
+         out << SP << SP << SP << "if (x_ptr[j] > vmax) vmax = x_ptr[j];\n";
+         out << SP << SP << "}\n";
+
+         out << SP << SP << fType << " sum = 0.0;\n";
+         out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n";
+         out << SP << SP << SP << "y_ptr[j] = " << expFunction << "(x_ptr[j] - vmax);\n";
+         out << SP << SP << SP << "sum += y_ptr[j];\n";
+         out << SP << SP << "}\n";
+
+         out << SP << SP << fType << " inv_sum = 1.0f / sum;\n";
+         out << SP << SP << "for (int j = 0; j < " << axis_size << "; ++j) {\n";
+         out << SP << SP << SP << "y_ptr[j] *= inv_sum;\n";
+         if (fLogSoftmax)
+            out << SP << SP << SP << "y_ptr[j] = " << logFunction << "(y_ptr[j]);\n";
+         out << SP << SP << "}\n";
+         out << SP << "}\n";
+
+      } else {
+         // generic case for any axis: one generated loop per non-softmax dimension, named i<dim>
+         auto stride = UTILITY::ComputeStrideFromShape(fShape);
+         size_t k = 0;
+         std::vector<std::string> l(size);
+         for (size_t i = 0; i < size; i++) {
+            if (i != axis) {
+               for (size_t j = 0; j < k; j++) out << SP;
+               l[i] = std::string("i") + std::to_string(i);
+               out << SP << "for (int " << l[i] << " = 0; " << l[i] << " < " << fShape[i] << "; " << l[i] << "++) {\n";
+               k++;
+            }
+         }
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << fType << " sum = 0.;\n";
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "size_t index = ";
+         bool first = true;
+         for (size_t i = 0; i < size; i++) {
+            if (i == axis) continue;
+            if (!first) out << " + ";
+            if (stride[i].GetVal() != "1")
+               out << stride[i] << "*";
+            out << l[i];
+            first = false;
+         }
+         out << ";\n";
+         // find maximum looping along reduced axis
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << fType << " vmax = tensor_" << fNX << "[index];\n";
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "for (int i = 1; i < " << fShape[axis] << "; i++) {\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << fType << " x = tensor_" << fNX << "[index + i";
+         if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")";
+         out << "];\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "if (x > vmax) vmax = x;\n";
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "}\n";
+         // compute softmax
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "size_t id = index + i";
+         if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")";
+         out << ";\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "tensor_" << fNY << "[id] = " << expFunction << "(tensor_" << fNX << "[id] - vmax);\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "sum += tensor_" << fNY << "[id];\n";
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "}\n";
+         // normalize
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "for (int i = 0; i < " << fShape[axis] << "; i++) {\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "size_t id = index + i";
+         if (stride[axis].GetVal() != "1") out << "*(" << stride[axis] << ")";
+         out << ";\n";
+         for (size_t j = 0; j < size; j++) out << SP;
+         out << "tensor_" << fNY << "[id] /= sum;\n";
+         if (fLogSoftmax) {
+            for (size_t j = 0; j < size; j++) out << SP;
+            out << "tensor_" << fNY << "[id] = " << logFunction << "(tensor_" << fNY << "[id]);\n";
+         }
+         for (size_t j = 0; j < size-1; j++) out << SP;
+         out << "}\n";
+         // end loops: closing brace m gets m+1 indents, matching the "for" line that opened it
+         for (int i = static_cast<int>(k) - 1; i >= 0; i--) {
+            for (int j = 0; j <= i; j++) out << SP;
+            out << "}\n";
+         }
+      }
+      return out.str();
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_Softmax
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/core/inc/SOFIE/ROperator_Split.hxx
similarity index 51%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx
rename to core/inc/SOFIE/ROperator_Split.hxx
index 63fbcb3..9604ca8 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx
+++ b/core/inc/SOFIE/ROperator_Split.hxx
@@ -51,14 +51,14 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Split Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE Split Op Input Tensor is not found in model");
       }
       fInputShape = model.GetTensorShape(fNX);
 
       // correct for negative axis
       if (fAxis < 0) fAxis += fInputShape.size();
       if (fAxis < 0 || fAxis >= static_cast<int>(fInputShape.size()) )
-         throw std::runtime_error("TMVA SOFIE Split - invalid axis " + std::to_string(fAxis));
+         throw std::runtime_error("SOFIE Split - invalid axis " + std::to_string(fAxis));
 
       // compute output shapes
       size_t nsplit = fNYs.size();
@@ -77,10 +77,10 @@ public:
       } else {
          // get split tensor values
          if (!model.IsInitializedTensor(fNSplit))
-            throw std::runtime_error("TMVA SOFIE Split - non-initialized split tensors are not supported");
+            throw std::runtime_error("SOFIE Split - non-initialized split tensors are not supported");
          auto splitShape =  model.GetTensorShape(fNSplit);
          if (splitShape.size() != 1 || splitShape[0] != nsplit)
-            throw std::runtime_error("TMVA SOFIE Split - split input tensor has invalid shape");
+            throw std::runtime_error("SOFIE Split - split input tensor has invalid shape");
          auto split_data = static_cast<int64_t *>(model.GetInitializedTensorData(fNSplit).get());
          fSplit = std::vector<int64_t>(split_data, split_data + nsplit);
       }
@@ -94,7 +94,7 @@ public:
          fOutputShapes.push_back(outputShape);
       }
       if (tot_split != fInputShape[fAxis])
-         throw std::runtime_error("TMVA SOFIE Split - Sum of split sizes must match the input dimension along the axis");
+         throw std::runtime_error("SOFIE Split - Sum of split sizes must match the input dimension along the axis");
 
 
       if (model.Verbose()) {
@@ -109,7 +109,7 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fOutputShapes.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first");
       }
 
       auto input_strides =  UTILITY::ComputeStrideFromShape(fInputShape);
@@ -153,6 +153,105 @@ public:
       return out.str();
    }
 
+std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+    opName = "op_" + opName;
+    if (fOutputShapes.empty())
+        throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first");
+
+    const std::size_t D        = fInputShape.size();
+    const std::size_t nOutputs = fNYs.size();
+
+    auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape);
+
+    // One kernel struct is generated per output tensor. axis_offset is the position along
+    // fAxis where output i begins inside the input; it advances by fSplit[i] per iteration.
+    std::string op = "\n//------ SPLIT_KERNEL_ALPAKA\n";
+    std::size_t axis_offset = 0;
+    for (std::size_t i = 0; i < nOutputs; ++i) {
+        auto outputStrides = UTILITY::ComputeStrideFromShape(fOutputShapes[i]);
+        std::string kname = "SplitKernel_" + opName + "_" + std::to_string(i);
+
+        // Generated call operator takes (acc, input, output, totalElements).
+        op += SP + "struct " + kname + " {\n";
+        op += SP + SP + "template<typename TAcc, typename T>\n";
+        op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+        op += SP + SP + SP + "TAcc const& acc,\n";
+        op += SP + SP + SP + "T const* input,\n";
+        op += SP + SP + SP + "T* output,\n";
+        op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+        op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+        op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+        op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+        op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+        // Decompose the linear output index into per-dimension coordinates out_<d>.
+        for (std::size_t d = 0; d < D; ++d) {
+            op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+                + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % "
+                + std::to_string(fOutputShapes[i][d]) + "u;\n";
+        }
+        op += "\n";
+
+        // Rebuild the input linear index; only the split axis coordinate is shifted by axis_offset.
+        op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+        for (std::size_t d = 0; d < D; ++d) {
+            std::string coord = (d == static_cast<std::size_t>(fAxis))
+                ? ("(out_" + std::to_string(d) + " + " + std::to_string(axis_offset) + "u)")
+                : ("out_" + std::to_string(d));
+            op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u";
+            op += (d + 1 < D) ? " +\n" : ";\n\n";
+        }
+
+        op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n";
+        op += SP + SP + SP + "}\n";
+        op += SP + SP + "}\n";
+        op += SP + "};\n\n";
+
+        axis_offset += fSplit[i];
+    }
+
+    return op;
+}
+
+std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+    const std::string prefix = "op_" + opName + "_";
+    std::string decls; // one "SplitKernel_<id> splitKernel_<id>;" line per entry of fNYs
+    for (std::size_t i = 0; i < fNYs.size(); ++i) {
+        const std::string id = prefix + std::to_string(i);
+        decls += SP + "SplitKernel_" + id + " splitKernel_" + id + ";\n";
+    }
+    return decls;
+}
+
+std::string Generate_GPU_ALPAKA(std::string opName) override {
+    if (fOutputShapes.empty())
+        throw std::runtime_error("SOFIE Operator Split called to Generate without being initialized first");
+    const std::string tag = "op_" + opName;
+
+    // For every output: a braced scope holding the work division and a kernel task that reads
+    // deviceBuf_<fNX> and writes deviceBuf_<output>, followed by the enqueue of that task on `queue`.
+    std::stringstream code;
+    code << "\n//------ SPLIT_GPU_ALPAKA\n";
+    for (std::size_t i = 0; i < fNYs.size(); ++i) {
+        const std::size_t nElements = ConvertShapeToLength(fOutputShapes[i]);
+        const std::string task = "task_" + tag + "_" + std::to_string(i);
+        code << SP << "{\n"
+             << SP << SP << "auto const elementsPerThread_" << i << " = Vec::all(static_cast<Idx>(1));\n"
+             << SP << SP << "auto const elementsPerGrid_" << i << " = Vec::all(Idx{" << nElements << "});\n"
+             << SP << SP << "auto const workDiv_" << i << " = sofie_workdiv(elementsPerGrid_" << i << ");\n"
+             << SP << SP << "auto " << task << " = alpaka::createTaskKernel<Acc>(workDiv_" << i
+             << ", splitKernel_" << tag << "_" << i
+             << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+             << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")"
+             << ", static_cast<Idx>(" << nElements << "));\n"
+             << SP << "alpaka::enqueue(queue, " << task << ");\n"
+             << SP << "}\n";
+    }
+    return code.str();
+}
+
 };
 
 }//SOFIE
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx b/core/inc/SOFIE/ROperator_SubGraph.hxx
similarity index 92%
rename from src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx
rename to core/inc/SOFIE/ROperator_SubGraph.hxx
index cb17671..e273bde 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_SubGraph.hxx
+++ b/core/inc/SOFIE/ROperator_SubGraph.hxx
@@ -34,8 +34,7 @@ public:
             n = UTILITY::Clean_name(n);
 
          fInputTensorNames = { fNX };
-         std::transform(fNYs.begin(), fNYs.end(), fOutputTensorNames.begin(),
-                   [](const std::string& s) -> std::string_view { return s; });
+         fOutputTensorNames.assign(fNYs.begin(), fNYs.end());
       }
 
    std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
@@ -50,7 +49,7 @@ public:
    void Initialize(RModel& model) override {
        //input must be a graph input, or already initialized intermediate tensor
       if (model.CheckIfTensorAlreadyExist(fNX) == false){
-        throw std::runtime_error("TMVA SOFIE If Op Input Tensor is not found in model");
+        throw std::runtime_error("SOFIE If Op Input Tensor is not found in model");
       }
       //add the subgraph model to parent RModel and initialize them
       model.InitializeSubGraph(fModel_then);
@@ -71,7 +70,7 @@ public:
             fType = type;
          else {
             if (type != fType)
-               throw std::runtime_error("TMVA SOFIE If Op supports only all outputs of the same type");
+               throw std::runtime_error("SOFIE If Op supports only all outputs of the same type");
          }
          model.AddIntermediateTensor(fNYs[i], fType, shape );
       }
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx b/core/inc/SOFIE/ROperator_Swish.hxx
similarity index 82%
rename from src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx
rename to core/inc/SOFIE/ROperator_Swish.hxx
index a2552f1..cecdd3c 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Swish.hxx
+++ b/core/inc/SOFIE/ROperator_Swish.hxx
@@ -17,7 +17,7 @@ private:
 
    std::string fNX;
    std::string fNY;
-   std::vector<size_t> fShape;
+   std::vector<Dim> fShape;
 
 public:
    ROperator_Swish(){}
@@ -38,9 +38,9 @@ public:
 
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Swish Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE Swish Op Input Tensor is not found in model");
       }
-      fShape = model.GetTensorShape(fNX);
+      fShape = model.GetDimTensorShape(fNX);
       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
    }
 
@@ -48,13 +48,10 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShape.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator Swish called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator Swish called to Generate without being initialized first");
       }
       std::stringstream out;
-      int length = 1;
-      for(auto& i: fShape){
-         length *= i;
-      }
+      std::string length = ConvertDimShapeToLength(fShape);
       out << "\t" << "for (int id = 0; id < " << length << " ; id++){\n";
       out << "\t\t" << "tensor_" << fNY << "[id] = tensor_" << fNX <<"[id] / (1 + std::exp( - tensor_"  << fNX << "[id]));\n";
       out << "\t}\n";
diff --git a/core/inc/SOFIE/ROperator_Tanh.hxx b/core/inc/SOFIE/ROperator_Tanh.hxx
new file mode 100644
index 0000000..f71b89f
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Tanh.hxx
@@ -0,0 +1,112 @@
+#ifndef SOFIE_ROPERATOR_Tanh
+#define SOFIE_ROPERATOR_Tanh
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+template <typename T>
+class ROperator_Tanh final : public ROperator
+{
+   // Element-wise tanh: generates a CPU loop and an alpaka kernel launch computing fNY[i] = tanh(fNX[i]).
+private:
+
+   std::string fNX;         // input tensor name (after UTILITY::Clean_name)
+   std::string fNY;         // output tensor name (after UTILITY::Clean_name)
+   std::vector<Dim> fShape; // shape shared by input and output, set in Initialize()
+
+public:
+   ROperator_Tanh(){}
+   ROperator_Tanh(std::string nameX, std::string nameY):
+      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
+         fKind = OperatorKind::TANH;
+         fInputTensorNames = { fNX };
+         fOutputTensorNames = { fNY };
+      }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      auto ret = input; //suggest copy to compiler
+      return ret;
+   }
+
+   void Initialize(RModel& model) override {
+       //input must be a graph input, or already initialized intermediate tensor
+      if (model.CheckIfTensorAlreadyExist(fNX) == false){
+        throw std::runtime_error("SOFIE Tanh Op Input Tensor is not found in model");
+      }
+      fShape = model.GetDimTensorShape(fNX);
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+
+   }
+
+
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty()) {
+         throw std::runtime_error("SOFIE Tanh operator called to Generate without being initialized first");
+      }
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ TANH\n";
+      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   std::vector<std::string> GetStdLibs() override { return { std::string("cmath") };}
+
+   bool IsElementwise() const override { return true; }
+   std::string GetElementwiseExpr(const std::string& v) const override {
+      return "tanh(" + v + ")";
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override {
+      std::string op;
+      op = "\n//------ TANH_KERNEL_ALPAKA\n";
+      op += "struct TanhKernel {\n";
+      op += SP + "template<typename TAcc, typename T>\n";
+      op += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n";
+      op += SP + SP + "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + "if (idx < numElements) { out[idx] = tanh(data[idx]); }\n";
+      op += SP + "}\n";
+      op += "};\n";
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override {
+      return SP + "TanhKernel tanhKernel;\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty())
+         throw std::runtime_error("SOFIE Tanh called to Generate_GPU_ALPAKA without being initialized");
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(fShape);
+      out << "\n//------ TANH_GPU_ALPAKA\n";
+      // Locals are suffixed with the operator name, not the input tensor name, so two ops reading one tensor cannot clash.
+      out << SP << "auto const elementsPerThread_" << OpName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_" << OpName << " = Vec::all(Idx{"<< length << "});\n";
+      out << SP << "auto const workDiv_" << OpName << " = sofie_workdiv(elementsPerGrid_" << OpName << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << OpName
+         << ", tanhKernel, alpaka::getPtrNative(deviceBuf_" << fNX
+         << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast<Idx>(" << length << "));\n";
+      out << SP << "alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROPERATOR_Tanh
diff --git a/core/inc/SOFIE/ROperator_Tile.hxx b/core/inc/SOFIE/ROperator_Tile.hxx
new file mode 100644
index 0000000..5a3921e
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Tile.hxx
@@ -0,0 +1,249 @@
+#ifndef SOFIE_ROPERATOR_Tile
+#define SOFIE_ROPERATOR_Tile
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+
+namespace SOFIE{
+
+template <typename T>
+class ROperator_Tile final : public ROperator
+{
+   // Tile operator: output dimension d has size fShapeInput[d] * fRepeats[d]; generates CPU code and an alpaka kernel.
+private:
+
+   std::string fNRepeats;           // name of the repeats tensor (must be an initialized tensor)
+   std::string fNInput;             // name of the input tensor
+   std::string fNY;                 // name of the output tensor
+   std::vector<size_t> fShapeInput; // input shape, set in Initialize()
+   std::vector<size_t> fShapeY;     // output shape, set in Initialize()
+   std::vector<size_t> fRepeats;    // repeat count per input dimension, copied from the repeats tensor
+
+public:
+   ROperator_Tile(){}
+   ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY):
+      fNRepeats(UTILITY::Clean_name(nameRepeat)),
+      fNInput(UTILITY::Clean_name(nameInput)),
+      fNY(UTILITY::Clean_name(nameY)) {
+         fInputTensorNames  = { fNRepeats, fNInput };
+         fOutputTensorNames = { fNY };
+      }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      std::vector<size_t> ret = input[0];
+      for (size_t i = 0; i < input[1].size(); i++)
+         ret[i] = ret[i] * input[1][i];
+      return {ret};
+   }
+
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNInput) == false)
+         throw std::runtime_error("SOFIE Tile Op Input Tensor is not found in model");
+      if (model.CheckIfTensorAlreadyExist(fNRepeats) == false)
+         throw std::runtime_error("SOFIE Tile Op Repeats Tensor is not found in model");
+
+      fShapeInput = model.GetTensorShape(fNInput);
+
+      if (!model.IsInitializedTensor(fNRepeats))
+         throw std::runtime_error("SOFIE Tile Op: non-initialized repeats input is not supported");
+
+      auto repptr       = model.GetInitializedTensorData(fNRepeats);
+      auto repeats_data = static_cast<int64_t*>(repptr.get());
+      if (repeats_data == nullptr)
+         throw std::runtime_error("SOFIE Tile Op: failed to retrieve repeats tensor data");
+
+      auto repeats_shape = model.GetTensorShape(fNRepeats);
+      if (repeats_shape.size() != 1)
+         throw std::runtime_error("SOFIE Tile Op: repeats tensor must be 1D");
+
+      size_t num_elements = repeats_shape[0];
+
+      // ShapeInference and the generated code index repeats by input dimension,
+      // so a repeats tensor whose length differs from the input rank is rejected here.
+      if (num_elements != fShapeInput.size())
+         throw std::runtime_error("SOFIE Tile Op: repeats length must equal the input tensor rank");
+      fRepeats.resize(num_elements);
+      std::copy(repeats_data, repeats_data + num_elements, fRepeats.begin());
+      if (fRepeats.size()){
+         model.RemoveInitializedTensor(fNRepeats);
+      }
+      fShapeY = ShapeInference({fShapeInput, fRepeats})[0];
+
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY);
+
+      if (model.Verbose())
+         std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput)
+                   << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY)
+                   << " given repeats " << ConvertShapeToString(fRepeats) << std::endl;
+   }
+
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShapeInput.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Tile Op called to Generate without being initialized first");
+
+      std::stringstream out;
+      std::string input   = "tensor_" + fNInput;
+      std::string output  = "tensor_" + fNY;
+
+      out << "///-------- Tile operator\n";
+      out << "{\n";
+      out << SP << "const int input_shape[" << fShapeInput.size() << "] = {";
+      for (size_t i = 0; i < fShapeInput.size(); ++i) {
+         if (i > 0) out << ", ";
+         out << fShapeInput[i];
+      }
+      out << "};\n";
+
+      out << SP << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n";
+      out << SP << "int s = 1;\n";
+
+      // Repeats are baked as literals: Initialize() calls RemoveInitializedTensor on the repeats tensor, so it is not read here.
+      out << SP << "const int repeats[" << fRepeats.size() << "] = {";
+      for (size_t i = 0; i < fRepeats.size(); ++i) out << (i > 0 ? ", " : "") << fRepeats[i];
+      out << "};\n";
+      out << SP << "for (int i = " << fShapeInput.size() - 1 << "; i >= 0; i--) {\n";
+      out << SP << SP << "int r = repeats[i];\n";
+      out << SP << SP << "int i_offset = 0, o_offset = 0;\n";
+      out << SP << SP << "s = s * input_shape[i];\n";
+      out << SP << SP << "if (i == " << fShapeInput.size() - 1 << ") {\n";
+      out << SP << SP << SP << "for (int j = 0; j < inputLength / s; j++) {\n";
+      out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n";
+      out << SP << SP << SP << SP << SP << "std::copy(" << input << " + i_offset, "
+                                        << input << " + i_offset + s, "
+                                        << output << " + o_offset);\n";
+      out << SP << SP << SP << SP << SP << "o_offset += s;\n";
+      out << SP << SP << SP << SP << "}\n";
+      out << SP << SP << SP << SP << "i_offset += s;\n";
+      out << SP << SP << SP << "}\n";
+      out << SP << SP << "} else {\n";
+      out << SP << SP << SP << "for (int j = inputLength / s - 1; j >= 0; j--) {\n";
+      out << SP << SP << SP << SP << "o_offset = j * s * r;\n";
+      out << SP << SP << SP << SP << "i_offset = j * s;\n";
+      out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n";
+      out << SP << SP << SP << SP << SP << "std::copy(" << output << " + i_offset, "
+                                        << output << " + i_offset + s, "
+                                        << output << " + o_offset);\n";
+      out << SP << SP << SP << SP << SP << "o_offset += s;\n";
+      out << SP << SP << SP << SP << "}\n";
+      out << SP << SP << SP << "}\n";
+      out << SP << SP << "}\n";
+      out << SP << SP << "s *= r;\n";
+      out << SP << SP << "inputLength *= r;\n";
+      out << SP << "}\n";
+      out << "}\n";
+      return out.str();
+   }
+
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeInput.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first");
+
+      const std::size_t D = fShapeInput.size();
+
+      auto inputStrides  = UTILITY::ComputeStrideFromShape(fShapeInput);
+      auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeY);
+
+      // fRepeats is filled by Initialize(), which currently rejects non-initialized repeats,
+      // so shapes and strides are emitted as literals. The extra `repeats` kernel argument
+      // is only emitted if fRepeats is empty (placeholder for runtime repeats).
+      bool repeatsKnown = !fRepeats.empty();
+
+      std::string kname = "TileKernel_" + opName;
+
+      std::string op;
+      op  = "\n//------ TILE_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ input,\n";
+      op += SP + SP + SP + "T* __restrict__ output,\n";
+      if (!repeatsKnown)
+         op += SP + SP + SP + "int64_t const* __restrict__ repeats,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      // Each thread starts at its global index and advances by the grid's total thread count.
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // Decompose output linear index — output strides always compile-time
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+             + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % "
+             + std::to_string(fShapeY[d]) + "u;\n";
+      }
+      op += "\n";
+
+      // Input index: each output coordinate is wrapped back into the input
+      // extent with a modulo by fShapeInput[d], then multiplied by the input
+      // stride of that dimension. Both fShapeInput[d] and the strides are
+      // known when this code is generated, so they are written as unsigned
+      // literals and the kernel never needs the repeats values themselves.
+      op += SP + SP + SP + SP + "std::size_t const input_idx =\n";
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + SP
+             + "(out_" + std::to_string(d) + " % " + std::to_string(fShapeInput[d]) + "u)"
+             + " * " + std::to_string(inputStrides[d]) + "u";
+         op += (d + 1 < D) ? " +\n" : ";\n\n";
+      }
+
+      op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      std::string kname = "TileKernel_" + opName;
+      return SP + kname + " tileKernel_" + opName + ";\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      opName = "op_" + opName;
+      if (fShapeInput.empty() || fShapeY.empty())
+         throw std::runtime_error("SOFIE Operator Tile called to Generate without being initialized first");
+
+      bool repeatsKnown = !fRepeats.empty();
+      std::size_t totalElements = ConvertShapeToLength(fShapeY);
+      std::string kname = "tileKernel_" + opName;
+
+      // Kernel arguments after the functor: input pointer, output pointer, [repeats pointer], element count
+      std::string args =
+          "alpaka::getPtrNative(deviceBuf_" + fNInput + "), "
+          + "alpaka::getPtrNative(deviceBuf_" + fNY + ")";
+      if (!repeatsKnown)
+         args += ", alpaka::getPtrNative(deviceBuf_" + fNRepeats + ")";
+      args += ", static_cast<Idx>(" + std::to_string(totalElements) + ")";
+
+      std::stringstream out;
+      out << "\n//------ TILE_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"   << opName << " = Vec::all(Idx{" << totalElements << "});\n";
+      out << SP << "auto const workDiv_" << opName << " = sofie_workdiv(elementsPerGrid_" << opName << ");\n";
+      out << SP << "auto task_" << opName << " = alpaka::createTaskKernel<Acc>(workDiv_" << opName
+          << ", " << kname << ", " << args << ");\n";
+      out << SP <<"alpaka::enqueue(queue, task_" << opName << ");\n";
+      return out.str();
+   }
+
+};
+
+}//SOFIE
+
+#endif //SOFIE_ROPERATOR_Tile
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx b/core/inc/SOFIE/ROperator_TopK.hxx
similarity index 94%
rename from src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx
rename to core/inc/SOFIE/ROperator_TopK.hxx
index 06d8179..7db1768 100644
--- a/src/SOFIE_core/inc/SOFIE/ROperator_TopK.hxx
+++ b/core/inc/SOFIE/ROperator_TopK.hxx
@@ -48,7 +48,7 @@ public:
 
    std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
       if (input.size() != 2) {
-         throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors");
+         throw std::runtime_error("SOFIE TopK Op Shape Inference needs exactly 2 input tensors");
       }
 
       auto shape = input[0]; // Shape format: [ m x n x o x p ... ]
@@ -62,11 +62,11 @@ public:
    void Initialize(RModel& model) override {
       if (model.CheckIfTensorAlreadyExist(fNX) == false) {
          // input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor is not found in model");
+         throw std::runtime_error("SOFIE TopK Op Input Tensor is not found in model");
       }
       if (model.CheckIfTensorAlreadyExist(fNK) == false) {
          // input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model");
+         throw std::runtime_error("SOFIE TopK Op Input Tensor i.e. K is not found in model");
       }
 
       fShapeX = model.GetTensorShape(fNX);
@@ -77,7 +77,7 @@ public:
       fAttrAxis = fAttrAxis < 0 ? fShapeX.size() + fAttrAxis : fAttrAxis;
       if(static_cast<size_t>(fAttrAxis) >=  fShapeX.size()){
          throw
-            std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" .");
+            std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+std::to_string(fShapeX.size())+" .");
       }
       // fK cannot be larger that axis dimension
       fK = std::min(fK, fShapeX[fAttrAxis]);
@@ -111,7 +111,7 @@ public:
    std::string Generate(std::string OpName) override {
       OpName = "op_" + OpName;
       if (fShapeX.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator TopK called to Generate without being initialized first");
+         throw std::runtime_error("SOFIE Operator TopK called to Generate without being initialized first");
       }
       std::stringstream out;
       size_t size = fShapeX.size();
diff --git a/core/inc/SOFIE/ROperator_Transpose.hxx b/core/inc/SOFIE/ROperator_Transpose.hxx
new file mode 100644
index 0000000..03dad41
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Transpose.hxx
@@ -0,0 +1,239 @@
+#ifndef SOFIE_ROPERATOR_TRANSPOSE
+#define SOFIE_ROPERATOR_TRANSPOSE
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <cassert>
+
+
+namespace SOFIE{
+
+
+
+
+template <typename T>
+class ROperator_Transpose final : public ROperator
+{
+
+private:
+   std::vector<int_t> fAttrPerm;
+
+   std::string fNData;
+   std::string fNOutput;
+   std::vector<size_t> fShapeData;    // used for initialized (constant) tensor case
+   std::vector<size_t> fShapeOutput;  // used for initialized (constant) tensor case
+   std::vector<Dim> fDimShapeData;    // used for dynamic/runtime tensor case
+   std::vector<Dim> fDimShapeOutput;  // used for dynamic/runtime tensor case
+
+public:
+
+   ROperator_Transpose(){}
+   ROperator_Transpose(std::vector<int_t> attr_perm, std::string nameData, std::string nameOutput):
+      fAttrPerm(attr_perm), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) {
+            fInputTensorNames = { fNData };
+            fOutputTensorNames = { fNOutput };
+   }
+
+   ROperator_Transpose(std::string nameData, std::string nameOutput):
+      fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) {
+         fInputTensorNames = { fNData };
+         fOutputTensorNames = { fNOutput };
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // Permuted shape, output_shape[i] = input[0][fAttrPerm[i]]; throws unless there is exactly one input and perm fits it.
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      if (input.size() != 1) throw std::runtime_error("SOFIE Transpose Op Shape Inference needs exactly 1 input tensor");
+      auto& data = input[0];
+      if (fAttrPerm.size() != data.size()) throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+      std::vector<size_t> output_shape(fAttrPerm.size());
+      for (size_t i = 0; i < fAttrPerm.size(); i++){
+         const auto axis = fAttrPerm[i];   // has to name an existing input axis; negative values are rejected
+         if (axis < 0 || static_cast<size_t>(axis) >= data.size())
+            throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+         output_shape[i] = data[axis];
+      }
+      return { output_shape };
+   }
+
+
+   // Registers the output tensor. An initialized (constant) input is transposed right here and stored as a constant
+   // tensor; for any other input only the permuted Dim shape is registered as an intermediate tensor.
+   void Initialize(RModel& model) override {
+      if (model.CheckIfTensorAlreadyExist(fNData) == false){   //input must be a graph input, or already initialized intermediate tensor
+         std::cout<<"Input tensor for transpose: "<<fNData<<'\n';
+         throw std::runtime_error("SOFIE Transpose Op Input Tensor is not found in model");
+      }
+      const bool isConstantInput = model.IsInitializedTensor(fNData);
+      if (isConstantInput) fShapeData = model.GetTensorShape(fNData);
+      else                 fDimShapeData = model.GetDimTensorShape(fNData);
+      const size_t rank = isConstantInput ? fShapeData.size() : fDimShapeData.size();
+      // an empty perm attribute means: reverse the order of the axes
+      if (fAttrPerm.empty()){
+         fAttrPerm.reserve(rank);
+         for (size_t i = rank; i > 0; i--) fAttrPerm.push_back(i - 1);
+      }
+      // Validate perm for both branches: each axis in [0, rank) exactly once, so later lookups stay in bounds.
+      if (fAttrPerm.size() != rank)
+         throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+      std::vector<bool> axisSeen(rank, false);
+      for (size_t k = 0; k < rank; k++) {
+         if (fAttrPerm[k] < 0 || static_cast<size_t>(fAttrPerm[k]) >= rank || axisSeen[fAttrPerm[k]])
+            throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+         axisSeen[fAttrPerm[k]] = true;
+      }
+      if (isConstantInput) {
+         // Constant/initialized tensor: use concrete shapes and perform transpose at init time
+         std::vector<std::vector<size_t>> inputs = { fShapeData };
+         fShapeOutput = ShapeInference(inputs).front();
+         fIsOutputConstant = true;
+         auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData);
+         auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput);
+         size_t length = ConvertShapeToLength(fShapeOutput);
+         auto inputData = static_cast<T*>(model.GetInitializedTensorData(fNData).get());
+         std::vector<T> outputData(length);
+         for (size_t i = 0; i < length; i++) {
+            // Peel the coordinate of each output axis off the flat index i; output axis k walks along input axis
+            // fAttrPerm[k]. With rank 0 the inner loop is skipped, so no stride is read and inputIndex stays 0.
+            size_t inputIndex = 0;
+            size_t remaining = i;
+            for (size_t k = 0; k < rank; k++) {
+               inputIndex += (remaining / outStrides[k]) * inStrides[fAttrPerm[k]];
+               remaining %= outStrides[k];
+            }
+            outputData[i] = inputData[inputIndex];
+         }
+         model.AddConstantTensor<T>(fNOutput, fShapeOutput, outputData.data());
+         if (model.Verbose()) {
+            std::cout << "Transpose: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : "
+               << ConvertValuesToString(outputData) << std::endl;
+         }
+      } else {
+         // Non-initialized (runtime/dynamic) tensor: use Dim-aware shapes
+         fDimShapeOutput.resize(rank);
+         for (size_t i = 0; i < rank; i++){
+            fDimShapeOutput[i] = fDimShapeData[fAttrPerm[i]];
+         }
+         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fDimShapeOutput);
+         if (model.Verbose()) {
+            std::cout << "Transpose ---> " << fNOutput << " " << ConvertDimShapeToString(fDimShapeOutput) << std::endl;
+         }
+      }
+   }
+
+   // Emits the CPU loop that fills the output element by element; returns "" when the output is constant.
+   std::string Generate(std::string OpName) override {
+      if (fIsOutputConstant) return "";  //no op for constant tensors
+      OpName = "op_" + OpName;
+      // Use Dim shapes when available (dynamic case), else convert from concrete shapes
+      auto dimShapeData   = fDimShapeData.empty()   ? ConvertShapeToDim(fShapeData)   : fDimShapeData;
+      auto dimShapeOutput = fDimShapeOutput.empty() ? ConvertShapeToDim(fShapeOutput) : fDimShapeOutput;
+      if (dimShapeData.empty() || dimShapeOutput.empty()){
+         throw std::runtime_error("SOFIE Transpose Op called to Generate without being initialized first");
+      }
+      int dim = dimShapeData.size();
+      auto inStrides  = UTILITY::ComputeStrideFromShape(dimShapeData);
+      auto outStrides = UTILITY::ComputeStrideFromShape(dimShapeOutput);
+      std::string length = ConvertDimShapeToLength(dimShapeOutput);
+
+      std::stringstream out;
+      // Implement transpose operator using consecutive write outputs.
+      // tensorOut[id] = tensorInput[ inStrides[0]*i0 + inStrides[1]*i1 + ...]
+      // where j_k = i_fAttrPerm[k] and (j0,j1,...) are the output indices for id
+      out << SP << "///------- Transpose operator\n" << std::endl;
+      out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
+      out << SP << SP << "tensor_" << fNOutput << "[id] = tensor_" << fNData << "[ ";
+      // compute output j indices from id; strides are pasted as text, so each one is parenthesised to stay a
+      // single operand even if it is a compound expression (possible for parametric shapes - TODO confirm)
+      std::vector<std::string> i_out(dim);
+      for (int k = 0; k < dim; k++){
+         if (k == 0)
+            i_out[k] = "id";
+         else
+            i_out[k] = "(id % (" + outStrides[k-1].GetVal() + "))";
+         if (k < dim-1)
+            i_out[k] += " / (" + outStrides[k].GetVal() + ")";
+      }
+      // use output indices to compute input index, inverting the permutation
+      for (int k = 0; k < dim; k++){
+         int l = std::find(fAttrPerm.begin(), fAttrPerm.end(), k) - fAttrPerm.begin();
+         // not an assert: the check must also hold in builds that define NDEBUG
+         if (l >= dim) throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+         out << "( " << i_out[l] << " )";
+         if (k < dim-1) out << " * (" << inStrides[k].GetVal() << ") + ";
+      }
+      out << "];\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   // Generates the Alpaka functor TransposeKernel_op_<OpName>; each thread maps one flat output index to its input index.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      OpName = "op_" + OpName;
+      auto dimShapeData   = fDimShapeData.empty()   ? ConvertShapeToDim(fShapeData)   : fDimShapeData;
+      auto dimShapeOutput = fDimShapeOutput.empty() ? ConvertShapeToDim(fShapeOutput) : fDimShapeOutput;
+      auto inputStrides  = UTILITY::ComputeStrideFromShape(dimShapeData);
+      auto outputStrides = UTILITY::ComputeStrideFromShape(dimShapeOutput);
+      // both stride vectors are indexed per axis below, so perm and the two shapes must have the same rank
+      if (fAttrPerm.size() != dimShapeData.size() || dimShapeOutput.size() != dimShapeData.size())
+         throw std::runtime_error("SOFIE Transpose Op - perm does not match the tensor rank (was Initialize called?)");
+      std::string op = "\n//------ TRANSPOSE_KERNEL_ALPAKA\n";
+      op += SP + "struct TransposeKernel_" + OpName + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output, const std::size_t totalElements) const {\n";
+      op += SP + SP + SP + SP + "auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + SP + "if(idx >= totalElements) return;\n";
+      op += SP + SP + SP + SP + "std::size_t input_idx = 0;\n";
+      op += SP + SP + SP + SP + "std::size_t remaining = idx;\n";
+      op += SP + SP + SP + SP + "std::size_t coord;\n";
+      for (size_t k = 0; k < dimShapeData.size(); k++) {
+         if (fAttrPerm[k] < 0 || static_cast<size_t>(fAttrPerm[k]) >= inputStrides.size())
+            throw std::runtime_error("SOFIE Transpose Op - Invalid axes attributes");
+         // strides are pasted as text: a cast keeps them unsigned; a glued-on 'u' suffix only suits plain literals
+         const std::string outStride = "static_cast<std::size_t>(" + outputStrides[k].GetVal() + ")";
+         const std::string inStride  = "static_cast<std::size_t>(" + inputStrides[fAttrPerm[k]].GetVal() + ")";
+         op += SP + SP + SP + SP + "coord = remaining / " + outStride + ";\n";
+         op += SP + SP + SP + SP + "remaining = remaining - coord * " + outStride + ";\n";
+         op += SP + SP + SP + SP + "input_idx += coord * " + inStride + ";\n";
+      }
+      op += SP + SP + SP + SP + "output[idx] = input[input_idx];\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      return SP + "TransposeKernel_op_" + OpName + " transposeKernel_" + OpName + ";\n";
+   }
+
+   // Emits the host-side launch of transposeKernel_<OpName>: one work item per output element, enqueued on `queue`.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      if (fIsOutputConstant) return "";  // same rule as Generate(): the constant output was computed in Initialize
+      auto dimShapeOutput = fDimShapeOutput.empty() ? ConvertShapeToDim(fShapeOutput) : fDimShapeOutput;
+      if (dimShapeOutput.empty())
+         throw std::runtime_error("SOFIE Operator Transpose called to Generate without being initialized first");
+      std::stringstream out;
+      std::string length = ConvertDimShapeToLength(dimShapeOutput);
+      out << "\n//------ TRANSPOSE_GPU_ALPAKA\n";
+      out << SP << "auto const elementsPerThread_"<<fNOutput<<" = Vec::all(static_cast<Idx>(1));\n";
+      out << SP << "auto const elementsPerGrid_"<<fNOutput<<" = Vec::all(Idx{"<< length << "});\n";
+      out << SP << "auto const workDiv_" << fNOutput << " = sofie_workdiv(elementsPerGrid_" << fNOutput << ");\n";
+      out << SP << "auto task_" << OpName << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNOutput
+         << ", transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData
+         << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast<Idx>(" << length << "));\n";
+      out << SP <<"alpaka::enqueue(queue, task_" << OpName << ");\n";
+      return out.str();
+   }
+
+};
+
+}//SOFIE
+
+
+#endif //SOFIE_ROPERATOR_TRANSPOSE
diff --git a/core/inc/SOFIE/ROperator_Trilu.hxx b/core/inc/SOFIE/ROperator_Trilu.hxx
new file mode 100644
index 0000000..04e18d5
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Trilu.hxx
@@ -0,0 +1,232 @@
+#ifndef SOFIE_ROPERATOR_TRILU
+#define SOFIE_ROPERATOR_TRILU
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <cstddef>
+#include <cstdint>
+
+namespace SOFIE {
+
+template <typename T>
+class ROperator_Trilu final : public ROperator {
+private:
+   int      fUpper = 1;
+
+   int64_t  fK        = 0;
+   bool     fKIsStatic= true;
+
+   std::string fNX;
+   std::string fNK;
+   std::string fNY;
+
+   std::vector<size_t> fShape;
+   size_t fM     = 0;
+   size_t fN     = 0;
+   size_t fBatch = 1;
+   size_t fTotal = 0;
+
+public:
+   ROperator_Trilu() {}
+
+   ROperator_Trilu(int upper, std::string nameX, std::string nameY)
+      : fUpper(upper),
+        fNX(UTILITY::Clean_name(nameX)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNX };
+      fOutputTensorNames = { fNY };
+   }
+
+   ROperator_Trilu(int upper, std::string nameX, std::string nameK, std::string nameY)
+      : fUpper(upper),
+        fNX(UTILITY::Clean_name(nameX)),
+        fNK(UTILITY::Clean_name(nameK)),
+        fNY(UTILITY::Clean_name(nameY))
+   {
+      fInputTensorNames  = { fNX, fNK };
+      fOutputTensorNames = { fNY };
+   }
+
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return { input[0] };
+   }
+
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      if (input.empty())
+         throw std::runtime_error("SOFIE Trilu ShapeInference: no input shapes");
+      return { input[0] };   // output has the same shape as input
+   }
+
+   // Caches shape data (fM rows, fN columns, fBatch matrices), resolves the optional k input, registers the output.
+   void Initialize(RModel& model) override {
+      if (!model.CheckIfTensorAlreadyExist(fNX))
+         throw std::runtime_error("SOFIE Trilu: input tensor '" + fNX + "' not found in model");
+
+      fShape = model.GetTensorShape(fNX);
+      if (fShape.size() < 2)
+         throw std::runtime_error("SOFIE Trilu: input tensor '" + fNX +
+                                  "' must have at least 2 dimensions, got " +
+                                  std::to_string(fShape.size()));
+
+      fN = fShape.back();
+      fM = fShape[fShape.size() - 2];
+      fBatch = 1;
+      for (size_t d = 0; d + 2 < fShape.size(); ++d)
+         fBatch *= fShape[d];
+      fTotal = fBatch * fM * fN;
+
+      if (!fNK.empty()) {
+         // fail here rather than emit code that reads a tensor the model never declared
+         if (!model.CheckIfTensorAlreadyExist(fNK))
+            throw std::runtime_error("SOFIE Trilu: k tensor '" + fNK + "' not found in model");
+         if (model.IsInitializedTensor(fNK) || model.IsConstantTensor(fNK)) {
+            // Bake the constant value into generated code (the data is read as int64 - TODO confirm tensor type).
+            auto data_ptr = static_cast<int64_t*>(model.GetInitializedTensorData(fNK).get());
+            fK        = data_ptr[0];
+            fKIsStatic = true;
+         } else {
+            fKIsStatic = false;
+         }
+      }
+      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
+      if (model.Verbose()) {
+         std::cout << "Trilu: " << fNX
+                   << " upper=" << fUpper << " k=";
+         if (fKIsStatic) std::cout << fK;
+         else            std::cout << "dyn(" << fNK << ")";
+         std::cout << " -> " << fNY
+                   << " " << ConvertShapeToString(fShape) << std::endl;
+      }
+   }
+
+   // Emits the CPU loop: an element is copied if col >= row + k (upper) or col <= row + k (lower), otherwise zeroed.
+   std::string Generate(std::string OpName) override {
+      OpName = "op_" + OpName;
+      if (fShape.empty())
+         throw std::runtime_error(
+            "SOFIE Trilu: Generate called before Initialize");
+      std::stringstream out;
+      out << "\n//------ TRILU\n";
+
+      if (fKIsStatic) {
+         out << SP << "const int64_t k_" << OpName << " = " << fK << "LL;\n";
+      } else {
+         out << SP << "const int64_t k_" << OpName
+             << " = static_cast<int64_t>(tensor_" << fNK << "[0]);\n";
+      }
+
+      out << SP << "for (std::size_t id = 0; id < " << fTotal << "u; ++id) {\n";
+      out << SP << SP << "const std::size_t mat_id = id % "
+                      << (fM * fN) << "u;\n";
+      out << SP << SP << "const std::ptrdiff_t row = "
+                      << "static_cast<std::ptrdiff_t>(mat_id / " << fN << "u);\n";
+      out << SP << SP << "const std::ptrdiff_t col = "
+                      << "static_cast<std::ptrdiff_t>(mat_id % " << fN << "u);\n";
+      if (fUpper) {
+         out << SP << SP << "const bool keep = (col >= row + k_" << OpName << ");\n";
+      } else {
+         out << SP << SP << "const bool keep = (col <= row + k_" << OpName << ");\n";
+      }
+      // plain 0 literal: "static_cast<T>(0)" would name this generator's template parameter in the emitted text
+      out << SP << SP << "tensor_" << fNY << "[id] = keep ? tensor_" << fNX
+                      << "[id] : 0;\n";
+      out << SP << "}\n";
+      return out.str();
+   }
+
+   // Generates the Alpaka functor TriluKernel_op_<OpName>: one thread per element; an element is kept when it
+   // passes the diagonal comparison against the kernel argument k and is replaced by T(0) otherwise.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) override {
+      if (fShape.empty())
+         throw std::runtime_error(
+            "SOFIE Trilu: Generate_GPU_Kernel_ALPAKA called before Initialize");
+
+      const std::string structName = "TriluKernel_op_" + OpName;
+      const std::string in1 = SP;        // indentation of the functor members
+      const std::string in2 = SP + SP;   // indentation of the operator() body
+      // upper keeps col >= row + k, lower keeps col <= row + k
+      const char* keepCmp = fUpper ? ">=" : "<=";
+
+      std::stringstream op;
+      op << "\n//------ TRILU_KERNEL_ALPAKA\n";
+      op << "struct " << structName << " {\n";
+      op << in1 << "template<typename TAcc, typename T>\n";
+      op << in1 << "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ input, "
+                << "T* __restrict__ output, const std::size_t total, const std::ptrdiff_t k) const {\n";
+
+      // body: bounds check first, then row/column of the element inside its fM x fN matrix
+      op << in2 << "auto const idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op << in2 << "if (idx >= total) return;\n";
+      op << in2 << "constexpr std::size_t N  = " << fN << "u;\n";
+      op << in2 << "constexpr std::size_t MN = " << (fM * fN) << "u;\n";
+      op << in2 << "const std::size_t    mat_id = idx % MN;\n";
+      op << in2 << "const std::ptrdiff_t row    = static_cast<std::ptrdiff_t>(mat_id / N);\n";
+      op << in2 << "const std::ptrdiff_t col    = static_cast<std::ptrdiff_t>(mat_id % N);\n";
+      op << in2 << "const bool keep = (col " << keepCmp << " row + k);\n";
+      op << in2 << "output[idx] = keep ? input[idx] : T(0);\n";
+
+      // close operator() and the struct
+      op << in1 << "}\n";
+      op << "};\n";
+      return op.str();
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override {
+      std::string cleaned = "op_" + OpName;
+      return SP + "TriluKernel_" + cleaned + " triluKernel_" + cleaned + ";\n";
+   }
+
+   // Emits the host-side launch code: first the diagonal offset k (a literal, or a value copied back from the
+   // device buffer of the k input), then work division, kernel task and enqueue on `queue`.
+   std::string Generate_GPU_ALPAKA(std::string OpName) override {
+      if (fShape.empty())
+         throw std::runtime_error(
+            "SOFIE Trilu: Generate_GPU_ALPAKA called before Initialize");
+
+      const std::string tag  = "op_" + OpName;
+      const std::string kVar = "k_" + tag;      // host variable holding the offset
+      const std::string in2  = SP + SP;
+
+      std::stringstream code;
+      code << "\n//------ TRILU_GPU_ALPAKA\n";
+
+      if (!fKIsStatic) {
+         // k is only known at run time: copy deviceBuf_<k> into a one-element int64 buffer on `host` and wait
+         code << SP << "std::ptrdiff_t " << kVar << ";\n"
+              << SP << "{\n"
+              << in2 << "auto hostK = alpaka::allocBuf<int64_t, Idx>(host, Ext1D::all(Idx{1}));\n"
+              << in2 << "alpaka::memcpy(queue, hostK, deviceBuf_" << fNK << ");\n"
+              << in2 << "alpaka::wait(queue);\n"
+              << in2 << kVar << " = static_cast<std::ptrdiff_t>("
+              << "*reinterpret_cast<const int64_t*>(alpaka::getPtrNative(hostK)));\n"
+              << SP << "}\n";
+      } else {
+         code << SP << "const std::ptrdiff_t " << kVar
+              << " = static_cast<std::ptrdiff_t>(" << fK << "LL);\n";
+      }
+
+      // identifiers of the work division are keyed on the output tensor name, the task on the operator tag
+      code << SP << "auto const elementsPerThread_" << fNY << " = Vec::all(static_cast<Idx>(1));\n";
+      code << SP << "auto const elementsPerGrid_" << fNY << " = Vec::all(Idx{" << fTotal << "});\n";
+      code << SP << "auto const workDiv_" << fNY << " = sofie_workdiv(elementsPerGrid_" << fNY << ");\n";
+      // kernel arguments: input pointer, output pointer, element count, offset k
+      code << SP << "auto task_" << tag << " = alpaka::createTaskKernel<Acc>(workDiv_" << fNY
+           << ", triluKernel_" << tag
+           << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")"
+           << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")"
+           << ", static_cast<Idx>(" << fTotal << "), " << kVar << ");\n";
+      code << SP << "alpaka::enqueue(queue, task_" << tag << ");\n";
+      return code.str();
+   }
+};
+
+} // namespace SOFIE
+
+#endif // SOFIE_ROPERATOR_TRILU
diff --git a/core/inc/SOFIE/ROperator_Where.hxx b/core/inc/SOFIE/ROperator_Where.hxx
new file mode 100644
index 0000000..b9956e9
--- /dev/null
+++ b/core/inc/SOFIE/ROperator_Where.hxx
@@ -0,0 +1,613 @@
+#ifndef SOFIE_ROperator_Where
+#define SOFIE_ROperator_Where
+
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator.hxx"
+#include "SOFIE/RModel.hxx"
+
+#include <sstream>
+
+namespace SOFIE{
+
+template<typename T>
+class ROperator_Where final : public ROperator{
+private:
+
+   bool fIsInputBoolTensor = false;
+
+
+   std::string fNX;
+   std::string fNY;
+   std::string fNC;
+   std::string fNBroadcastedX;
+   std::string fNBroadcastedY;
+   std::string fNBroadcastedC;
+   std::string fNZ;
+
+
+
+   // static shapes (used when tensors are not dynamic) )
+   std::vector<size_t> fShapeX;
+   std::vector<size_t> fShapeY;
+   std::vector<size_t> fShapeC;
+   std::vector<size_t> fShapeZ;
+
+   // Dynamic generic shapes
+   std::vector<Dim> fDimShapeC;
+   std::vector<Dim> fDimShapeX;
+   std::vector<Dim> fDimShapeY;
+   std::vector<Dim> fDimShapeZ;
+
+   // Broadcast flag: mirrors convention of BasicBinary
+   //   bit 0: broadcast Y->X (Y needs expanding)
+   //   bit 1: broadcast X->Y (X needs expanding)
+   //   bit 2: broadcast C->Z (C needs expanding)
+   //   bit 4: shapes may differ at runtime (dynamic)
+   int fBroadcastFlag = 0;
+
+public:
+   ROperator_Where(){}
+   ROperator_Where(const std::string & nameC, const std::string & nameX, const std::string & nameY, const std::string & nameZ):
+      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)), fNC(UTILITY::Clean_name(nameC)), fNZ(UTILITY::Clean_name(nameZ)){
+         fInputTensorNames = { fNX, fNY, fNC };
+         fOutputTensorNames = { fNZ };
+      }
+
+   // type of output given input
+   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
+      return input;
+   }
+
+   // shape of output tensors given input tensors
+   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
+      // assume now inputs have same shape (no broadcasting)
+      auto ret = std::vector<std::vector<size_t>>(1, input[0]); // return vector size 1 with first input
+      return ret;
+   }
+
+   void Initialize(RModel& model) override {
+      // input must be a graph input, or already initialized intermediate tensor
+      if (!model.CheckIfTensorAlreadyExist(fNX)){
+         throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNX + "is not found in model");
+      }
+      if (!model.CheckIfTensorAlreadyExist(fNY)) {
+         throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNY + "is not found in model");
+      }
+      if (!model.CheckIfTensorAlreadyExist(fNC)) {
+         throw std::runtime_error(std::string("SOFIE Where Op Input Tensor ") + fNC + "is not found in model");
+      }
+      // check if fNC input tensor is boolean
+      if (model.IsReadyInputTensor(fNC))
+         fIsInputBoolTensor = true;
+
+      // ---------------------------------------------------------------- //
+      //  Collect shapes – dynamic or static
+      // ---------------------------------------------------------------- //
+      int dynamicInputs = 0;   // bitmask: bit0=C, bit1=X, bit2=Y
+
+      if (model.IsDynamicTensor(fNC)) {
+         fDimShapeC = model.GetDynamicTensorShape(fNC);
+         dynamicInputs |= 1;
+      } else {
+         fShapeC    = model.GetTensorShape(fNC);
+         fDimShapeC = ConvertShapeToDim(fShapeC);
+      }
+      if (model.IsDynamicTensor(fNX)) {
+         fDimShapeX = model.GetDynamicTensorShape(fNX);
+         dynamicInputs |= 2;
+      } else {
+         fShapeX    = model.GetTensorShape(fNX);
+         fDimShapeX = ConvertShapeToDim(fShapeX);
+      }
+      if (model.IsDynamicTensor(fNY)) {
+         fDimShapeY = model.GetDynamicTensorShape(fNY);
+         dynamicInputs |= 4;
+      } else {
+         fShapeY    = model.GetTensorShape(fNY);
+         fDimShapeY = ConvertShapeToDim(fShapeY);
+      }
+
+
+      if (model.Verbose()) {
+         if (dynamicInputs & 1)
+            std::cout << "Where : condition " << fNC << " is dynamic " << ConvertDimShapeToString(fDimShapeC) << "\n";
+         if (dynamicInputs & 2)
+            std::cout << "Where :  " << fNX << " is dynamic " << ConvertDimShapeToString(fDimShapeX) << "\n";
+         if (dynamicInputs & 4)
+            std::cout << "Where : Y " << fNZ << " is dynamic " << ConvertDimShapeToString(fDimShapeZ) << "\n";
+      }
+
+      // ---------------------------------------------------------------- //
+      //  Static path: all shapes known at code-gen time
+      // ---------------------------------------------------------------- //
+      if (dynamicInputs == 0) {
+
+         bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY) || !UTILITY::AreSameShape(fShapeX, fShapeC);
+         if (broadcast) {
+            // find shape to broadcast between X,Y,C looking for max length
+            size_t lengthX = ConvertShapeToLength(fShapeX);
+            size_t lengthY = ConvertShapeToLength(fShapeY);
+            size_t lengthC = ConvertShapeToLength(fShapeC);
+            bool broadcastX = false, broadcastY = false, broadcastC = false;
+            if (lengthX >= lengthY && lengthX >= lengthC) {
+               fShapeZ = fShapeX;
+               // broadcast Y and C if different than X
+               broadcastY = (lengthY != lengthX);
+               broadcastC = (lengthC != lengthX);
+            } else if (lengthY >= lengthX && lengthY >= lengthC) {
+               fShapeZ = fShapeY;
+               // broadcast X and C if different than Y
+               broadcastX = (lengthX != lengthY);
+               broadcastC = (lengthC != lengthY);
+            } else if (lengthC >= lengthX && lengthC >= lengthY) {
+               fShapeZ = fShapeC;
+               // broadcast X and Y if different than C
+               broadcastX = (lengthX != lengthC);
+               broadcastY = (lengthY != lengthC);
+            }
+
+            // Broadcast X to Z
+            if (broadcastX) {
+               fNBroadcastedX = "BC_" + fNX + "_to_" + fNZ;
+               if (model.IsInitializedTensor(fNX)) {
+                  auto data = model.GetInitializedTensorData(fNX);
+                  std::shared_ptr<void> broadcastedData(
+                     UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeX, fShapeZ),
+                     std::default_delete<T[]>());
+                  // Update the data and the shape of X
+                  model.AddConstantTensor(fNBroadcastedX, model.GetTensorType(fNX), fShapeZ, broadcastedData);
+                  fShapeX = fShapeZ;
+               } else {
+                  // I need to prepend to shape of X the extra dimensions added for broadcasting to Z
+                  if (fShapeX.size() < fShapeZ.size()) {
+                     size_t nPrepend = fShapeZ.size() - fShapeX.size();
+                     fShapeX.insert(fShapeX.begin(), nPrepend, 1);
+                  }
+               }
+            }
+            // Broadcast Y to Z
+            if (broadcastY) {
+               fNBroadcastedY = "BC_" + fNY + "_to_" + fNZ;
+               if (model.IsInitializedTensor(fNY)) {
+                  auto data = model.GetInitializedTensorData(fNY);
+                  std::shared_ptr<void> broadcastedData(
+                     UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeY, fShapeZ),
+                     std::default_delete<T[]>());
+                  // do not update tensor B but add broadcasted one (since it can be input to some other operators)
+                  model.AddConstantTensor(fNBroadcastedY, model.GetTensorType(fNY), fShapeZ, broadcastedData);
+                  fShapeY = fShapeZ;
+               } else {
+                  // I need to prepend to shape of Y the extra dimensions added for broadcasting to Z
+                  if (fShapeY.size() < fShapeZ.size()) {
+                     size_t nPrepend = fShapeZ.size() - fShapeY.size();
+                     fShapeY.insert(fShapeY.begin(), nPrepend, 1);
+                  }
+
+               }
+            }
+            // Broadcast C to Z
+            if (broadcastC) {
+               fNBroadcastedC = "BC_" + fNC + "_to_" + fNZ;
+               if (model.IsInitializedTensor(fNC)) {
+                  auto data = model.GetInitializedTensorData(fNC);
+                  std::shared_ptr<void> broadcastedData(
+                     UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeC, fShapeZ),
+                     std::default_delete<T[]>());
+                  // do not update tensor C but add broadcasted one (since it can be input to some other operators)
+                  model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeZ, broadcastedData);
+                  fShapeC = fShapeZ;
+               } else {
+                  // I need to prepend to shape of C the extra dimensions added for broadcasting to Z
+                  if (fShapeC.size() < fShapeZ.size()) {
+                     size_t nPrepend = fShapeZ.size() - fShapeC.size();
+                     fShapeC.insert(fShapeC.begin(), nPrepend, 1);
+                  }
+               }
+            }
+         } else {
+            fShapeZ = fShapeX;
+         }
+         // check case of constant  output (if all inputs are defined)
+         if (model.IsInitializedTensor(fNC)) {
+            std::string nameC = fNBroadcastedC.empty() ? fNC : fNBroadcastedC;
+            auto dataC = static_cast<bool *>(model.GetInitializedTensorData(nameC).get());
+            model.SetNotWritableInitializedTensor(nameC);
+            T *dataX = nullptr;
+            T *dataY = nullptr;
+            std::vector<Dim> shapeDataX;
+            std::vector<Dim> shapeDataY;
+            if (model.IsInitializedTensor(fNX)) {
+               std::string nameX = fNBroadcastedX.empty() ? fNX : fNBroadcastedX;
+               dataX = static_cast<T *>(model.GetInitializedTensorData(nameX).get());
+               // flag tensors to not be written in a file
+               model.SetNotWritableInitializedTensor(nameX);
+            } else if (model.IsShapeTensor(fNX)) {
+               shapeDataX = model.GetShapeTensorValues(fNX);
+            }
+            if (model.IsInitializedTensor(fNY)) {
+               std::string nameY = fNBroadcastedY.empty() ? fNY : fNBroadcastedY;
+               dataY = static_cast<T *>(model.GetInitializedTensorData(nameY).get());
+               model.SetNotWritableInitializedTensor(nameY);
+            } else if (model.IsShapeTensor(fNY)) {
+               shapeDataY = model.GetShapeTensorValues(fNY);
+            }
+            std::vector<T> dataZ;        // used in case output is constant tensor
+            std::vector<Dim> shapeDataZ; // used in case output is a shape tensor (can be also constant if all
+                                         // dimensions are not parametric)
+            // if fNC (condition) is initialized we know the output is a shape or a constant tensor,
+            // so we can compute it at initialization and add it as a constant tensor to the model
+            // (and not add the operator output as intermediate tensor to the model)
+            bool isOutputConstantTensor = true;
+            if (dataX && dataY) {
+               dataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < dataZ.size(); i++)
+                  dataZ[i] = (dataC[i]) ? dataX[i] : dataY[i];
+               if (model.Verbose())
+                  std::cout << "data A and B : dataZ constant: " << ConvertValuesToString(dataZ) << std::endl;
+            } else if (dataX && shapeDataY.size() > 0) {
+               shapeDataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < shapeDataZ.size(); i++) {
+                  shapeDataZ[i] = (dataC[i]) ? Dim{size_t(dataX[i])} : shapeDataY[i];
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << "data A but shapeB " << ConvertDimShapeToString(shapeDataY) << "  "
+                         << isOutputConstantTensor << std::endl;
+            } else if (dataY && shapeDataX.size() > 0) {
+               shapeDataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < shapeDataZ.size(); i++) {
+                  shapeDataZ[i] = (dataC[i]) ? shapeDataY[i] : Dim{size_t(dataY[i])};
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << "data B but shapeA " << ConvertDimShapeToString(shapeDataX) << "  "
+                         << isOutputConstantTensor << std::endl;
+            } else if (shapeDataY.size() > 0 && shapeDataX.size() > 0) {
+               shapeDataZ.resize(ConvertShapeToLength(fShapeZ));
+               for (size_t i = 0; i < shapeDataZ.size(); i++) {
+                  shapeDataZ[i] = (dataC[i]) ? shapeDataX[i] : shapeDataY[i];
+                  isOutputConstantTensor &= !shapeDataZ[i].isParam;
+               }
+               if (model.Verbose())
+                  std::cout << " shapeA and B " << ConvertDimShapeToString(shapeDataX) << " shapeB "
+                         << ConvertDimShapeToString(shapeDataY) << "  " << isOutputConstantTensor << std::endl;
+            }
+            fIsOutputConstant = true;
+            // add as constant or shape tensor depending on the case
+            if (dataZ.size() > 0)
+               model.AddConstantTensor<T>(fNZ, fShapeZ, dataZ.data());
+            else if (shapeDataZ.size() > 0)
+               model.AddShapeTensor(fNZ, shapeDataZ, fShapeZ.size() == 0);
+            else {
+               fIsOutputConstant = false;
+            }
+            if (fIsOutputConstant && model.Verbose())
+               std::cout << "Where op ---> " << fNZ << "  " << ConvertShapeToString(fShapeZ) << " : "
+                         << ((dataZ.size() > 0) ? ConvertValuesToString(dataZ) : ConvertDimShapeToString(shapeDataZ))
+                         << ((dataZ.size() > 0) ? " (constant)" : " (shape)") << std::endl;
+
+            // output is a constant tensor
+            if (fIsOutputConstant)
+               fOutputTensorNames.pop_back();
+         }
+         if (!fIsOutputConstant) {
+
+            fDimShapeZ = ConvertShapeToDim(fShapeZ);
+            model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fShapeZ);
+            if (model.Verbose())
+               std::cout << "Where : condition : " << fNC << "  " << ConvertShapeToString(fShapeC) << " X "
+                         << fNX << "  " << ConvertShapeToString(fShapeX) << " Y " << fNY << "  "
+                         << ConvertShapeToString(fShapeY) << " ---> " << fNZ << "  " << ConvertShapeToString(fShapeZ)
+                         << std::endl;
+         }
+      } else {
+         // ---------------------------------------------------------------- //
+         //  Dynamic path: at least one input has a parametric shape
+         //  Need to use BroadcastShape to find output shape
+         // ---------------------------------------------------------------- //
+         auto retXY = UTILITY::MultidirectionalBroadcastShape(fDimShapeX, fDimShapeY);
+         fBroadcastFlag = retXY.first;
+         fDimShapeZ     = retXY.second;
+         auto retCZ = UTILITY::MultidirectionalBroadcastShape(fDimShapeC, fDimShapeZ);
+         fBroadcastFlag |= retCZ.first;
+         fDimShapeZ      = retCZ.second;
+
+         // Resolve std::max params to actual input dim params (same logic as BasicBinary)
+         if (fBroadcastFlag & 4) {
+            auto IsInputDimParam = [&](const std::string &p) {
+               for (auto &input : model.GetInputTensorNames())
+                  for (auto &s : model.GetDimTensorShape(input))
+                     if (s.isParam && s.param == p) return true;
+               return false;
+            };
+            for (size_t i = 0; i < fDimShapeZ.size(); i++) {
+               auto &s = fDimShapeZ[i];
+               if (s.isParam && s.param.find("std::max") != std::string::npos) {
+                  // prefer A dim over B dim
+                  if (i < fDimShapeX.size() && IsInputDimParam(fDimShapeX[i].param)) {
+                     s = (fDimShapeX[i].dim != 1) ? fDimShapeX[i] : fDimShapeY[i];
+                  } else if (i < fDimShapeY.size() && IsInputDimParam(fDimShapeY[i].param)) {
+                     s = (fDimShapeY[i].dim != 1) ? fDimShapeY[i] : fDimShapeX[i];
+                  }
+               }
+            }
+         }
+         // I need to prepend to shape of X,Y,C the extra dimensions added for broadcasting to Z
+         if (fDimShapeX.size() < fDimShapeZ.size()) {
+            size_t nPrepend = fDimShapeZ.size() - fDimShapeX.size();
+            fDimShapeX.insert(fDimShapeX.begin(), nPrepend, Dim{1});
+         }
+         if (fDimShapeY.size() < fDimShapeZ.size()) {
+            size_t nPrepend = fDimShapeZ.size() - fDimShapeY.size();
+            fDimShapeY.insert(fDimShapeY.begin(), nPrepend, Dim{1});
+         }
+         if (fDimShapeC.size() < fDimShapeZ.size()) {
+            size_t nPrepend = fDimShapeZ.size() - fDimShapeC.size();
+            fDimShapeC.insert(fDimShapeC.begin(), nPrepend, Dim{1});
+         }
+
+         model.AddIntermediateTensor(fNZ, model.GetTensorType(fNX), fDimShapeZ);
+
+         if (model.Verbose())
+            std::cout << "Where (dynamic) : C=" << ConvertDimShapeToString(fDimShapeC)
+                      << "  A=" << ConvertDimShapeToString(fDimShapeX)
+                      << "  B=" << ConvertDimShapeToString(fDimShapeY)
+                      << " --> Y=" << ConvertDimShapeToString(fDimShapeZ) << "\n";
+      }
+   }
+
+   std::string GenerateInitCode() override {
+      // Where emits no session-initialisation code, so the result is always empty
+      return std::string();
+   }
+
+   std::string Generate(std::string opName) override {
+      // Emits the inference code: tensor_<Z>[..] = tensor_<C>[..] ? tensor_<X>[..] : tensor_<Y>[..] inside nested loops
+      opName = "op_" + opName;
+      std::stringstream out;
+      out << SP << "\n//------ WHERE " << opName << " --> " << ConvertDimShapeToString(fDimShapeZ) << "\n";
+      if (fIsOutputConstant) return out.str();
+      // (for a constant output only the header comment emitted above is returned)
+
+      // ---------------------------------------------------------------- //
+      //  Runtime broadcast validation (dynamic shapes, fBroadcastFlag & 4): a parametric input dim must be 1 or match Z
+      // ---------------------------------------------------------------- //
+      if (fBroadcastFlag & 4) {
+         auto lengthX = ConvertDimShapeToLength(fDimShapeX);
+         auto lengthY = ConvertDimShapeToLength(fDimShapeY);
+         auto lengthC = ConvertDimShapeToLength(fDimShapeC);
+         out << SP << "if (" << lengthX << " != " << lengthY << " || "
+             << lengthX << " != " << lengthC << ") {\n";
+         for (size_t i = 0; i < fDimShapeZ.size(); i++) {
+            // validate X vs Z
+            if (i < fDimShapeX.size() && fDimShapeX[i].isParam) {
+               out << SP << SP << "if (" << fDimShapeX[i] << " != 1 && "
+                   << fDimShapeX[i] << " != " << fDimShapeZ[i] << ")\n";
+               out << SP << SP << SP
+                   << "throw std::runtime_error(\"SOFIE Where: cannot broadcast A dim " << i << " in " << opName << "\");\n";
+            }
+            // validate Y vs Z
+            if (i < fDimShapeY.size() && fDimShapeY[i].isParam) {
+               out << SP << SP << "if (" << fDimShapeY[i] << " != 1 && "
+                   << fDimShapeY[i] << " != " << fDimShapeZ[i] << ")\n";
+               out << SP << SP << SP
+                   << "throw std::runtime_error(\"SOFIE Where: cannot broadcast B dim " << i << " in " << opName << "\");\n";
+            }
+            // validate C vs Z
+            if (i < fDimShapeC.size() && fDimShapeC[i].isParam) {
+               out << SP << SP << "if (" << fDimShapeC[i] << " != 1 && "
+                   << fDimShapeC[i] << " != " << fDimShapeZ[i] << ")\n";
+               out << SP << SP << SP
+                   << "throw std::runtime_error(\"SOFIE Where: cannot broadcast C dim " << i << " in " << opName << "\");\n";
+            }
+         }
+         out << SP << "}\n";
+      }
+      // now implement Where using the strides and looping over the different dimensions
+      // ---------------------------------------------------------------- //
+      //  Generate loop(s) with per-dimension stride-based index arithmetic
+      // ---------------------------------------------------------------- //
+      auto stridesX = UTILITY::ComputeStrideFromShape(fDimShapeX);
+      auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY);
+      auto stridesC = UTILITY::ComputeStrideFromShape(fDimShapeC);
+      auto stridesZ = UTILITY::ComputeStrideFromShape(fDimShapeZ);
+      // Flat-index expression of one input: dims equal to 1 are skipped (broadcast); an empty or all-1 shape gives "0"
+      auto buildIdxExpr = [&](const std::vector<Dim> &dimShape,
+                               const std::vector<Dim> &strides,
+                               size_t rankZ) -> std::string {
+         if (dimShape.empty() ||
+             std::all_of(dimShape.begin(), dimShape.end(),
+                         [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; }))
+            return "0";
+         std::string expr;
+         size_t offset = rankZ - dimShape.size();
+         for (size_t i = 0; i < dimShape.size(); ++i) {
+            if (dimShape[i].dim == 1 || dimShape[i].GetVal() == "1") continue;
+            expr += "idx_" + std::to_string(i + offset);
+            if (strides[i].GetVal() != "1")
+               expr += " * " + strides[i].GetVal();
+            expr += " + ";
+         }
+         if (expr.size() >= 3)
+            for (int j = 0; j < 3; j++) expr.pop_back();  // remove trailing " + "
+         return expr.empty() ? "0" : expr;
+      };
+
+      std::string idxX = buildIdxExpr(fDimShapeX, stridesX, fDimShapeZ.size());
+      std::string idxY = buildIdxExpr(fDimShapeY, stridesY, fDimShapeZ.size());
+      std::string idxC = buildIdxExpr(fDimShapeC, stridesC, fDimShapeZ.size());
+
+      // Emit one nested for-loop per output dimension that is not equal to 1
+      int nloop = 0;
+      std::string idxZ;
+      // case Z is a scalar (all dimensions are 1) or Z has no dimension
+      if (fDimShapeZ.empty() ||
+          std::all_of(fDimShapeZ.begin(), fDimShapeZ.end(),
+                      [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) {
+         idxZ = "0";
+      } else {
+         for (size_t i = 0; i < fDimShapeZ.size(); ++i) {
+            if (fDimShapeZ[i].dim != 1 && fDimShapeZ[i].GetVal() != "1") {
+               nloop++;
+               for (int j = 0; j < nloop; j++) out << SP;
+               out << "for (size_t idx_" << i << " = 0; idx_" << i
+                   << " < " << fDimShapeZ[i] << "; ++idx_" << i << ") {\n";
+               idxZ += "idx_" + std::to_string(i);
+               if (stridesZ[i].GetVal() != "1")
+                  idxZ += " * " + stridesZ[i].GetVal();
+               idxZ += " + ";
+            }
+         }
+         if (idxZ.size() >= 3)
+            for (int j = 0; j < 3; j++) idxZ.pop_back();
+      }
+
+      // Inner assignment, indented one level deeper than the innermost loop
+      for (int j = 0; j < nloop + 1; j++) out << SP;
+      out << "tensor_" << fNZ << "[" << idxZ << "] = "
+          << "tensor_" << fNC << "[" << idxC << "] ? "
+          << "tensor_" << fNX << "[" << idxX << "] : "
+          << "tensor_" << fNY << "[" << idxY << "];\n";
+
+      // Close the loops, innermost first
+      for (int i = nloop; i > 0; i--) {
+         for (int j = 0; j < i; j++) out << SP;
+         out << "}\n";
+      }
+
+      return out.str();
+   }
+
+
+   // Generates the source of the ALPAKA functor WhereKernel_op_<opName> for this node.
+   // All shapes and strides are baked into the kernel text as literals: each thread walks the
+   // flat output range in steps of the grid thread extent, splits the index into per-dimension
+   // coordinates and maps them onto the (possibly broadcast) inputs.
+   // Returns "" when the output is a constant tensor; throws std::runtime_error if fShapeZ is empty.
+   std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override {
+      if (fIsOutputConstant) return "";
+      opName = "op_" + opName;
+      // NOTE(review): the dynamic-shape branch of Initialize only fills fDimShapeZ, so models with
+      // parametric shapes presumably end up in this error as well - confirm
+      if (fShapeZ.empty())
+         throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first");
+
+      // D: rank of the output tensor
+      const std::size_t D = fShapeZ.size();
+
+      // right-align each input shape to the output rank; missing leading dimensions become 1
+      std::vector<size_t> shapeA_padded(D, 1);
+      std::vector<size_t> shapeB_padded(D, 1);
+      std::vector<size_t> shapeC_padded(D, 1);
+      {
+         size_t offA = D - fShapeX.size();
+         for (size_t i = 0; i < fShapeX.size(); ++i) shapeA_padded[offA + i] = fShapeX[i];
+         size_t offB = D - fShapeY.size();
+         for (size_t i = 0; i < fShapeY.size(); ++i) shapeB_padded[offB + i] = fShapeY[i];
+         size_t offC = D - fShapeC.size();
+         for (size_t i = 0; i < fShapeC.size(); ++i) shapeC_padded[offC + i] = fShapeC[i];
+      }
+
+      // NOTE(review): Initialize can set fShapeC = fShapeZ after adding a broadcast copy of an initialized
+      // condition under a different name, while the launch code passes the buffer named fNC - confirm
+      // that this buffer really has the extent assumed by the strides below
+      auto stridesA = UTILITY::ComputeStrideFromShape(shapeA_padded);
+      auto stridesB = UTILITY::ComputeStrideFromShape(shapeB_padded);
+      auto stridesC = UTILITY::ComputeStrideFromShape(shapeC_padded);
+      auto stridesZ = UTILITY::ComputeStrideFromShape(fShapeZ);
+
+      std::string kname    = "WhereKernel_" + opName;
+
+      // kernel functor signature: (acc, x, y, cond, output, totalElements)
+      std::string op;
+      op  = "\n//------ WHERE_KERNEL_ALPAKA\n";
+      op += SP + "struct " + kname + " {\n";
+      op += SP + SP + "template<typename TAcc, typename T>\n";
+      op += SP + SP + "ALPAKA_FN_ACC void operator()(\n";
+      op += SP + SP + SP + "TAcc const& acc,\n";
+      op += SP + SP + SP + "T const* __restrict__ x,\n";
+      op += SP + SP + SP + "T const* __restrict__ y,\n";
+      // NOTE(review): the condition is read as uint8_t - assumes one byte per element in the device buffer; confirm
+      op += SP + SP + SP + "uint8_t const* __restrict__ cond,\n";
+      op += SP + SP + SP + "T* __restrict__ output,\n";
+      op += SP + SP + SP + "std::size_t const totalElements) const {\n\n";
+
+      // threads with an index past the element count return at once; the others loop over the elements
+      op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+      op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n";
+      op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc)[0];\n\n";
+
+      op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n";
+
+      // out_<d>: coordinate of the current element along output dimension d
+      for (std::size_t d = 0; d < D; ++d) {
+         op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d)
+               + " = (elem_idx / " + std::to_string(stridesZ[d]) + "u) % "
+               + std::to_string(fShapeZ[d]) + "u;\n";
+      }
+      op += "\n";
+
+      // Emits `std::size_t const <var> = ...;`: the flat offset into one input as a sum over the
+      // dimensions, where a dimension of extent 1 contributes 0u (broadcast)
+      auto emitInputIndex = [&](const std::string &var, const auto &paddedShape, const auto &strides) {
+         op += SP + SP + SP + SP + "std::size_t const " + var + " =\n";
+         for (std::size_t d = 0; d < D; ++d) {
+            if (paddedShape[d] == 1)
+               op += SP + SP + SP + SP + SP + "0u";
+            else
+               op += SP + SP + SP + SP + SP
+                  + "out_" + std::to_string(d)
+                  + " * " + std::to_string(strides[d]) + "u";
+            op += (d + 1 < D) ? " +\n" : ";\n\n";
+         }
+      };
+      emitInputIndex("c_idx", shapeC_padded, stridesC);
+      emitInputIndex("x_idx", shapeA_padded, stridesA);
+      emitInputIndex("y_idx", shapeB_padded, stridesB);
+
+      // the element-wise selection itself
+      op += SP + SP + SP + SP + "output[elem_idx] = cond[c_idx] ? x[x_idx] : y[y_idx];\n";
+      op += SP + SP + SP + "}\n";
+      op += SP + SP + "}\n";
+      op += SP + "};\n";
+
+      return op;
+   }
+
+   std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override {
+      // declares the functor instance `WhereKernel_op_<name> whereKernel_op_<name>;` (nothing for constant outputs)
+      if (fIsOutputConstant) return std::string();
+      const std::string suffix = "op_" + opName;
+      return SP + "WhereKernel_" + suffix + " whereKernel_" + suffix + ";\n";
+   }
+
+   std::string Generate_GPU_ALPAKA(std::string opName) override {
+      // Emits the host-side launch of the whereKernel_op_<name> functor over all output elements.
+      if (fIsOutputConstant) return std::string();
+      if (fShapeZ.empty())
+         throw std::runtime_error("SOFIE Where Op called to Generate without being initialized first");
+      const std::string tag = "op_" + opName;
+      const std::size_t nElements = ConvertShapeToLength(fShapeZ);
+      // argument text for the native device pointer of one tensor buffer
+      auto devicePtrArg = [](const std::string &tensor) {
+         return ", alpaka::getPtrNative(deviceBuf_" + tensor + ")";
+      };
+
+      std::stringstream code;
+      code << "\n//------ WHERE_GPU_ALPAKA\n";
+      code << SP << "auto const elementsPerThread_" << tag << " = Vec::all(static_cast<Idx>(1));\n";
+      code << SP << "auto const elementsPerGrid_" << tag << " = Vec::all(Idx{" << nElements << "});\n";
+      code << SP << "auto const workDiv_" << tag << " = sofie_workdiv(elementsPerGrid_" << tag << ");\n";
+      // argument order follows the kernel signature: x, y, cond, output, element count
+      code << SP << "alpaka::exec<Acc>(queue, workDiv_" << tag << ", whereKernel_" << tag
+           << devicePtrArg(fNX) << devicePtrArg(fNY) << devicePtrArg(fNC) << devicePtrArg(fNZ)
+           << ", static_cast<Idx>(" << nElements << "));\n";
+
+      return code.str();
+   }
+
+};
+
+}//SOFIE
+
+#endif //TMVA_SOFIE_ROperator_Where
diff --git a/src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx b/core/inc/SOFIE/SOFIEHelpers.hxx
similarity index 100%
rename from src/SOFIE_core/inc/SOFIE/SOFIEHelpers.hxx
rename to core/inc/SOFIE/SOFIEHelpers.hxx
diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/core/inc/SOFIE/SOFIE_common.hxx
similarity index 68%
rename from src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx
rename to core/inc/SOFIE/SOFIE_common.hxx
index d183052..e36df0a 100644
--- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx
+++ b/core/inc/SOFIE/SOFIE_common.hxx
@@ -1,9 +1,9 @@
 #ifndef SOFIE_SOFIE_COMMON
 #define SOFIE_SOFIE_COMMON
 
-#include "TMVA/RTensor.hxx"
+#include "SOFIE/RTensor.hxx"
 
-#include "ROOT/RSpan.hxx"
+#include <span>
 
 #include <stdexcept>
 #include <type_traits>
@@ -21,13 +21,10 @@
 #include <cassert>
 #include <limits>
 
-
-namespace SOFIE{
-
-//typedef RTensor tensor_t;
+namespace SOFIE {
 
 enum class ETensorType{
-   UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive
+   UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive
     FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16
 };
 
@@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) {
     switch (type) {
         case ETensorType::FLOAT:     return sizeof(float);
         case ETensorType::DOUBLE:    return sizeof(double);
-        case ETensorType::UNINT8:     return sizeof(uint8_t);
+        case ETensorType::UINT8:     return sizeof(uint8_t);
         case ETensorType::INT8:      return sizeof(int8_t);
         case ETensorType::UINT16:    return sizeof(uint16_t);
         case ETensorType::INT16:     return sizeof(int16_t);
@@ -58,6 +55,9 @@ typedef std::int64_t int_t;
 std::string ConvertTypeToString(ETensorType type);
 ETensorType ConvertStringToType(std::string type);
 
+// find if a string represents a number
+bool IsInteger(const std::string & s);
+
 struct Dim{
    bool isParam = false;
    size_t dim = 0;
@@ -67,16 +67,42 @@ struct Dim{
    Dim() {}
 
    // constructor for a parametric dimension with the option to pass a default dim value
-   Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {}
+   // We use -1 for dim to indicate that the param dimension is an expression (e.g. "d1+d2").
+   // A string that represents a number gives a non-parametric Dim holding that value.
+   Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p)
+   {
+      if (IsInteger(p)) {
+         isParam = false;
+         dim = static_cast<size_t>(std::stoll(p)); // 64-bit parse: std::stoi throws std::out_of_range above INT_MAX
+      }
+   }
 
    // constructor for a non-parametric dimension
    Dim(size_t d) : dim(d) {}
 
    std::string GetVal() const {
-      return (isParam) ? param : std::to_string(dim);
+      // cast to int64_t for negative shape values
+      return (isParam) ? param : std::to_string(static_cast<int64_t>(dim));
+   }
+
+   std::ostream& operator<< (std::ostream& os) const {
+      // member form, invoked as `dim << stream`: writes the text returned by GetVal()
+      return os << GetVal();
+   }
+
+   // two parametric dims are equal when their expressions match; in every other case the values are compared
+   bool operator==(const Dim& rhs) const {
+      const bool bothParametric = isParam && rhs.isParam;
+      return bothParametric ? (param == rhs.param) : (dim == rhs.dim);
+   }
+   bool operator!=(const Dim& rhs) const { return !operator==(rhs); }
 };
 
+//bool operator==(const Dim& lhs, const Dim& rhs);
+inline std::ostream & operator<< (std::ostream &os, const Dim &d) {
+   // stream a Dim as the text returned by Dim::GetVal()
+   return os << d.GetVal();
+}
 
 struct InputTensorInfo{
    ETensorType type;
@@ -93,6 +119,18 @@ struct DynamicTensorInfo{
    std::vector<Dim> shape;
 };
 
+// Shape-element trait: tells generic shape code whether the element type of a
+// shape vector is Dim or plain size_t. The primary template is left empty, so
+// only the two specializations below provide IsDim().
+template <typename T>
+struct TensorShape {};
+// shape elements of type Dim
+template<>
+struct TensorShape<Dim> { static bool IsDim() { return true; } };
+// shape elements of type size_t
+template<>
+struct TensorShape<size_t> { static bool IsDim() { return false; } };
+
 // template traits for Tensor type
 template <typename T>
 struct TensorType {};
@@ -120,6 +158,18 @@ template<>
 struct TensorType<uint64_t> {
    static const std::string Name() { return "uint64_t"; }
 };
+// Additional TensorType specializations: Name() returns the C++ spelling of the
+// element type as a string.
+
+// boolean elements
+template<>
+struct TensorType<bool> { static const std::string Name() { return "bool"; } };
+// signed 8-bit integer elements
+template<>
+struct TensorType<int8_t> { static const std::string Name() { return "int8_t"; } };
+// unsigned 8-bit integer elements
+template<>
+struct TensorType<uint8_t> { static const std::string Name() { return "uint8_t"; } };
 
 struct TensorMemoryInfo {
    std::string_view tensor_name;
@@ -148,47 +198,85 @@ struct MemoryPoolInfo {
    std::map<size_t, size_t> available_stack;
 };
 
-std::vector<Dim> ConvertShapeToDim(std::vector<size_t> shape);
+std::vector<Dim> ConvertShapeToDim(const std::vector<size_t> & shape);
 
-std::vector<size_t> ConvertShapeToInt(std::vector<Dim> shape);
+std::vector<size_t> ConvertShapeToInt(const std::vector<Dim> & shape);
 
-std::size_t ConvertShapeToLength(std::vector<size_t> shape);
+std::size_t ConvertShapeToLength(const std::vector<size_t> & shape);
+std::size_t ConvertShapeToLength(const std::vector<Dim> & shape);
 
-std::string ConvertShapeToString(std::vector<size_t> shape);
-std::string ConvertDynamicShapeToString(std::vector<Dim> shape);
-// std::string ConvertShapeToString(std::vector<Dim> shape) {
-//    return ConvertDynamicShapeToString(shape);
-// }
+std::string ConvertShapeToString(const std::vector<size_t> & shape);
+std::string ConvertDimShapeToString(const std::vector<Dim> & shape);
+
+std::string ConvertDimShapeToLength(const std::vector<Dim> & shape);
 
-std::string ConvertDynamicShapeToLength(std::vector<Dim> shape);
 
 template<class T>
 std::string ConvertValToString(T value) {
    std::stringstream ret;
-   if (std::is_floating_point_v<T>)
-      ret << std::setprecision(std::numeric_limits<T>::max_digits10);
-   ret << value;
+   ret << std::to_string(value);
+   return ret.str();
+}
+// float specialization: infinity and NaN are written as std::numeric_limits<float>
+// expressions, every other value is streamed with precision max_digits10
+template<>
+inline std::string ConvertValToString<float>(float value) {
+   if (std::isnan(value))
+      return "std::numeric_limits<float>::quiet_NaN()";
+   if (std::isinf(value)) {
+      if (value > 0)
+         return "std::numeric_limits<float>::infinity()";
+      return "-std::numeric_limits<float>::infinity()";
+   }
+   // finite value
+   std::stringstream text;
+   text << std::setprecision(std::numeric_limits<float>::max_digits10) << value;
+   return text.str();
+}
+// double specialization: infinity and NaN are written as std::numeric_limits<double>
+// expressions, every other value is streamed with precision max_digits10
+template<>
+inline std::string ConvertValToString<double>(double value) {
+   if (std::isnan(value))
+      return "std::numeric_limits<double>::quiet_NaN()";
+   if (std::isinf(value)) {
+      if (value > 0)
+         return "std::numeric_limits<double>::infinity()";
+      return "-std::numeric_limits<double>::infinity()";
+   }
+   // finite value
+   std::stringstream text;
+   text << std::setprecision(std::numeric_limits<double>::max_digits10) << value;
+   return text.str();
+}
+// int64_t specialization for INT64_MIN
+template<>
+inline std::string ConvertValToString<int64_t>(int64_t value) {
+   std::stringstream ret;
+   if (value == INT64_MIN)
+      ret << "INT64_MIN";
+   else
+      ret << std::to_string(value);
    return ret.str();
 }
 
 
 // convert list of values in a string taking into account the precision
 template<class T>
-std::string ConvertValuesToString(size_t n, const T * data) {
+std::string ConvertValuesToString(size_t n, const T * data, size_t maxprint = -1) {
    std::stringstream ret;
    ret << "{ ";
-   for (size_t i = 0; i < n; i++) {
-      if (std::is_floating_point_v<T>)
-         ret << std::setprecision(std::numeric_limits<T>::max_digits10);
-      ret << data[i];
+   for (size_t i = 0; i < std::min(n,maxprint); i++) {
+      ret << ConvertValToString(data[i]);
       if (i < n-1) ret << ", ";
+      if (i < n-1 && i == maxprint-1) ret << "..... ";
    }
    ret << "}";
    return ret.str();
 }
 template<class T>
-std::string ConvertValuesToString(const std::vector<T> & data) {
-  return ConvertValuesToString(data.size(), data.data());
+std::string ConvertValuesToString(const std::vector<T> & data, size_t maxprint = 5) {
+  return ConvertValuesToString(data.size(), data.data(), maxprint);
 }
 
 class InitializedTensor {
@@ -204,10 +292,18 @@ public:
    std::shared_ptr<void> const &sharedptr() const { return fData; }
    // query if tensor comes from a Constant operator
    bool IsConstantTensor() const { return fConstant;}
-   // query if tensor needs to be written in a weight file. Constant tensors are not written in a file
+   // query if tensor needs to be written in a weight file. Constant tensors are not written in a separate file
    bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;}
+   // check if a tensor is flagged as not writable. A writable tensor must be written in the file or in the
+   // generated code (e.g. as a constant tensor); an initialized tensor that is only used by a constant
+   // operator at compile time does not need to be written and can be omitted from the generated code
+   bool IsNotWritable() const { return fIsNotWritable; }
    // set not writable initialized tensors - i.e. tensor that must not be written in a file
    void SetNotWritable() { fIsNotWritable = true;}
+   // set writable initialized tensors - i.e. tensor that must be written in a file
+   void SetWritable() { fIsNotWritable = false;}
+   // set as constant (needed for non-float initialized tensors)
+   void SetConstant() { fConstant = true;}
 
    template <class T = void>
    T const *data() const
@@ -223,16 +319,8 @@ public:
       for (std::size_t item : fShape) {
          fSize *= static_cast<int>(item);
       }
-      switch (fType) {
-      case ETensorType::FLOAT: fSize *= sizeof(float); break;
-      case ETensorType::DOUBLE: fSize *= sizeof(double); break;
-      case ETensorType::INT32: fSize *= sizeof(int32_t); break;
-      case ETensorType::INT64: fSize *= sizeof(int64_t); break;
-      case ETensorType::BOOL: fSize *= sizeof(bool); break;
-      default:
-         throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " +
-                                  ConvertTypeToString(fType));
-      }
+      // get size in bytes
+      fSize *= GetTypeSize(fType);
       fPersistentData = static_cast<char *>(fData.get());
    }
    void CastPersistentToShared()
@@ -271,7 +359,7 @@ private:
 template <typename T>
 ETensorType GetTemplatedType(T /*obj*/ ){
    if (std::is_same<T, float>::value) return ETensorType::FLOAT;
-   if (std::is_same<T, uint8_t>::value) return ETensorType::UNINT8;
+   if (std::is_same<T, uint8_t>::value) return ETensorType::UINT8;
    if (std::is_same<T, int8_t>::value) return ETensorType::INT8;
    if (std::is_same<T, uint16_t>::value) return ETensorType::UINT16;
    if (std::is_same<T, int16_t>::value) return ETensorType::INT16;
@@ -287,6 +375,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){
 }
 
 namespace UTILITY{
+
+
+
+// clean operator and tensor names
+std::string Clean_name(std::string input_tensor_name);
+
 // Check if two shapes are equal
 bool AreSameShape(const std::vector<size_t>&, const std::vector<size_t>&);
 bool AreSameShape(const std::vector<size_t>&, const std::vector<Dim>&);
@@ -296,10 +390,14 @@ bool AreSameShape(const std::vector<Dim>&, const std::vector<Dim>&);
 // Multidirectional broadcast a list of tensors to the same shape
 std::vector<size_t> MultidirectionalBroadcastShape(std::vector<std::vector<size_t>>);
 
-// Unidirectional broadcast two shapes to the same shape
-std::vector<size_t> UnidirectionalBroadcastShape(std::vector<size_t>, std::vector<size_t>);
+// Multidirectional broadcast two shapes to the same shape
+
+std::pair<int, std::vector<size_t>> MultidirectionalBroadcastShape(std::vector<size_t> &, std::vector<size_t> &);
+std::vector<size_t> UnidirectionalBroadcastShape(std::vector<size_t> &, std::vector<size_t> &);
+
+std::pair<int, std::vector<Dim>> MultidirectionalBroadcastShape(std::vector<Dim> &, std::vector<Dim> &);
+
 
-std::string Clean_name(std::string input_tensor_name);
 
 template<typename T>
 T* BroadcastConvBias(const T* data, const size_t channel, const std::vector<size_t>& targetShape) {
@@ -343,16 +441,14 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector<size
 // Broadcast a tensor from shape to targetShape according to numpy broadcasting rules
 // See more at https://numpy.org/doc/stable/user/basics.broadcasting.html
 // and https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md .
-template<typename T, class ConstContT = std::span<const T>, class ContT = std::span<T> >
-void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, ContT broadcastedData) {
+template<typename T, class ConstContT = std::span<const T>>
+void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, T *broadcastedData) {
    // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones )
    size_t size = shape.size();
    // Current length of the broadcasted tensor
    size_t curLength = data.size();
-   size_t targetLength = broadcastedData.size();
-   assert(ConvertShapeToLength(targetShape) == targetLength);
    // special case when broadcasting last dimensions (initial shapes must be the same)
-   if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) {
+   if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) {
       size_t bsize = targetShape.back();
       // compute the size of the data to broadcast
       for (int k = int(size)-2; k >=0; k--) {
@@ -360,16 +456,16 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
          bsize *= targetShape[k];
       }
       for (size_t i = 0; i < curLength; i++) {
-         std::fill(broadcastedData.begin() + i*bsize, broadcastedData.begin() + (i+1)*bsize , data[i]);
+         std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]);
       }
       return;
    }
 
-   std::copy(data.begin(), data.end(), broadcastedData.begin());
+   std::copy(data.begin(), data.end(), broadcastedData);
    // Product of the previous dimensions of targetShape
    size_t arrayNum = 1;
    // New broadcasted data: is this needed?
-   std::vector<T> newData(targetLength);
+   std::vector<T> newData(ConvertShapeToLength(targetShape));
 
    for (size_t idx = 0; idx < size; idx++) {
       size_t dim = shape[idx];
@@ -385,8 +481,8 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
             for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) {
                for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) {
                   size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength;
-                  std::copy(broadcastedData.begin() + arrayIdx * arrayLength,
-                     broadcastedData.begin() + (arrayIdx + 1) * arrayLength,
+                  std::copy(broadcastedData + arrayIdx * arrayLength,
+                     broadcastedData + (arrayIdx + 1) * arrayLength,
                      newData.begin() + offset);
                }
             }
@@ -400,12 +496,11 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
          // Update current length
          curLength = newLength;
          // Update broadcasted data
-         std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin());
+         std::copy(newData.begin(), newData.begin() + newLength, broadcastedData);
       }
       // Update the number of arrays
       arrayNum *= targetDim;
    }
-   //return broadcastedData;
 }
 
 // interface where we allocate a new array for broadcasted data
@@ -413,10 +508,8 @@ template<typename T>
 T* CreateBroadcastTensor(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, size_t targetLength) {
    // newShape is an array of size equal to dimension along which we are broadcasting the tensor
    T* broadcastedData = new T[targetLength];
-   std::span<T> bData(broadcastedData, broadcastedData+targetLength);
    size_t curLength = ConvertShapeToLength(shape);
-   std::span<const T> inData(data, curLength);
-   BroadcastTensor<T, std::span<const T>, std::span<T>>(inData, shape, targetShape, bData);
+   BroadcastTensor<T>({data, curLength}, shape, targetShape, broadcastedData);
    return broadcastedData;
 }
 // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not
@@ -429,14 +522,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, cons
       std::vector<size_t> newShape(targetSize, 1);
       size_t offset = targetSize - shape.size();
       std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
-      return CreateBroadcastTensor<T>(data, newShape, targetShape, ConvertShapeToLength(targetShape));
+      return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape));
    }
-   return CreateBroadcastTensor<T>(data, shape, targetShape, ConvertShapeToLength(targetShape));
+   return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape));
 }
 
 // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations
 template<typename T>
-void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, std::span<T> broadcastedData) {
+void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, T *broadcastedData) {
    size_t curLength = ConvertShapeToLength(shape);
    std::span<T> inData(const_cast<T*>(data), curLength);
    // Prepend shape with ones
@@ -445,12 +538,10 @@ void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, co
       std::vector<size_t> newShape(targetSize, 1);
       size_t offset = targetSize - shape.size();
       std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
-      BroadcastTensor<T>(inData, newShape, targetShape, broadcastedData);
+      BroadcastTensor(inData, newShape, targetShape, broadcastedData);
    }
-   BroadcastTensor<T, std::span<T>>(inData, shape, targetShape, broadcastedData);
+   BroadcastTensor(inData, shape, targetShape, broadcastedData);
 }
-// specialization for vector of boolean
-void UnidirectionalBroadcast(const std::vector<bool> & data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, std::vector<bool> & broadcastedData);
 
 /// compute stride of a tensor given its shape (assume layout is row-major)
 std::vector<size_t> ComputeStrideFromShape(const std::vector<size_t> & shape);
@@ -619,8 +710,6 @@ void col2im(const Dtype* data_col, const int channels,
   //std::cout << "finishing col2imp" << std::endl;
 }
 
-
-
 }  // end namespace UTILITY
 
 namespace BLAS{
@@ -631,37 +720,37 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m,
 
 
 struct GNN_Data {
-      TMVA::Experimental::RTensor<float> node_data;      // the node feature data, tensor with shape (num_nodes, num_node_features)
-      TMVA::Experimental::RTensor<float> edge_data;      // the edge feature data, tensor with shape (num_edges, num_edge_features)
-      TMVA::Experimental::RTensor<float> global_data;    // the global features, tensor with shape (1, num_global_features)
-      TMVA::Experimental::RTensor<int> edge_index;       // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges)
+      RTensor<float> node_data;      // the node feature data, tensor with shape (num_nodes, num_node_features)
+      RTensor<float> edge_data;      // the edge feature data, tensor with shape (num_edges, num_edge_features)
+      RTensor<float> global_data;    // the global features, tensor with shape (1, num_global_features)
+      RTensor<int> edge_index;       // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges)
                                      // edge_index[0,:] are the receivers and edge_index[1,:] are the senders
 
 
       // need to have default constructor since RTensor has not one
-      GNN_Data(): node_data(TMVA::Experimental::RTensor<float>({})), edge_data(TMVA::Experimental::RTensor<float>({})), global_data(TMVA::Experimental::RTensor<float>({})), edge_index(TMVA::Experimental::RTensor<int>({})) {}
+      GNN_Data(): node_data(RTensor<float>({})), edge_data(RTensor<float>({})), global_data(RTensor<float>({})), edge_index(RTensor<int>({})) {}
 
 };
 
 template<typename T>
-TMVA::Experimental::RTensor<T> Concatenate( TMVA::Experimental::RTensor<T> & t1,  TMVA::Experimental::RTensor<T> & t2, int axis = 0)
+RTensor<T> Concatenate( RTensor<T> & t1,  RTensor<T> & t2, int axis = 0)
 {
    // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis
    if (t1.GetMemoryLayout() != t2.GetMemoryLayout())
-      throw std::runtime_error("TMVA RTensor Concatenate - tensors have different memory layout");
+      throw std::runtime_error("RTensor Concatenate - tensors have different memory layout");
    auto & shape1 = t1.GetShape();
    auto & shape2 = t2.GetShape();
    if (t1.GetSize()/shape1[axis] != t2.GetSize()/shape2[axis]) {
       std::cout << "axis " << axis << " sizes " << t1.GetSize() << " " << t2.GetSize() << "  ";
       std::cout << "shape 1 : " << ConvertShapeToString(t1.GetShape());
       std::cout << " shape 2 : " << ConvertShapeToString(t2.GetShape()) << std::endl;
-      throw std::runtime_error("TMVA RTensor Concatenate - tensors have incompatible shapes");
+      throw std::runtime_error("RTensor Concatenate - tensors have incompatible shapes");
    }
    std::vector<size_t> outShape = shape1;
    outShape[axis] = shape1[axis] + shape2[axis];
-   TMVA::Experimental::RTensor<T> tout(outShape, t1.GetMemoryLayout());
-   if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) {
-      throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors");
+   RTensor<T> tout(outShape, t1.GetMemoryLayout());
+   if (t1.GetMemoryLayout() == MemoryLayout::ColumnMajor) {
+      throw std::runtime_error("RTensor Concatenate is not yet supported for column major tensors");
    }
 
    auto & stride1 = t1.GetStrides();
@@ -693,10 +782,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) {
 
 inline GNN_Data Copy(const GNN_Data & data) {
    GNN_Data out;
-   out.node_data = TMVA::Experimental::RTensor<float>(data.node_data.GetShape());
-   out.edge_data = TMVA::Experimental::RTensor<float>(data.edge_data.GetShape());
-   out.global_data = TMVA::Experimental::RTensor<float>(data.global_data.GetShape());
-   out.edge_index = TMVA::Experimental::RTensor<int>(data.edge_index.GetShape());
+   out.node_data = RTensor<float>(data.node_data.GetShape());
+   out.edge_data = RTensor<float>(data.edge_data.GetShape());
+   out.global_data = RTensor<float>(data.global_data.GetShape());
+   out.edge_index = RTensor<int>(data.edge_index.GetShape());
    std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData());
    std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData());
    std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData());
@@ -704,6 +793,136 @@ inline GNN_Data Copy(const GNN_Data & data) {
    return out;
 }
 
-}//SOFIE
+// Thin wrapper around BLAS sgemm: output = alpha * op(A) * op(B) + beta * output, where output is
+// first seeded with the m*n values of C when C is given. op(X) is X or its transpose per transa/transb.
+inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A,
+                      const float *B, float beta, const float *C)
+{
+   const char opA = transa ? 't' : 'n';
+   const char opB = transb ? 't' : 'n';
+   // leading dimensions of A and B depend on whether they are passed transposed
+   const int lda = transa ? k : m;
+   const int ldb = transb ? n : k;
+   if (C != nullptr)
+      std::copy(C, C + m * n, output);
+   BLAS::sgemm_(&opA, &opB, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, output, &m);
+}
+
+inline void Fill(float *output, float value, int size) // set the first `size` floats of output to value
+{
+   std::fill_n(output, size, value);
+}
+
+template <class T>
+inline void Copy(T *output, T const *input, int size) // copy the first `size` elements of input to output
+{
+   std::copy_n(input, size, output);
+}
+
+// Element-wise rectifier: output[i] is input[i] when it is strictly positive and 0 otherwise.
+inline void Relu(float *output, float const *input, int size)
+{
+   for (int idx = 0; idx != size && idx < size; ++idx)
+      output[idx] = !(input[idx] > 0.0f) ? 0.0f : input[idx];
+}
+// Convert a text token to float: "inf", "-inf" and "nan" are mapped explicitly, the rest goes to std::stof.
+inline float ParseFloatToken (const std::string & s)  {
+   using Limits = std::numeric_limits<float>;
+   if (s == "nan") return Limits::quiet_NaN();
+   if (s == "inf" || s == "-inf") return (s.front() == '-') ? -Limits::infinity() : Limits::infinity();
+   return std::stof(s);
+}
+
+// Read one tensor record "<name> <length> <v0> ... <vN-1>" from the stream into target[0..length-1].
+// Throws std::runtime_error when the name or the length differ from the expected ones, or when the
+// stream fails before all the values are read.
+template <class T>
+void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength)
+{
+   std::string name;
+   std::size_t length = 0;
+   is >> name >> length;
+   if (name != expectedName)
+      throw std::runtime_error("sofie failed to read the correct tensor name; expected name is " + expectedName +
+                               " , read " + name);
+   if (length != expectedLength)
+      throw std::runtime_error("sofie failed to read the correct tensor size; expected size is " +
+                               std::to_string(expectedLength) + " , read " + std::to_string(length));
+   std::string token;
+   for (size_t i = 0; i < length; ++i) {
+      // stop at the first failed extraction: parsing an empty token would throw std::invalid_argument
+      if (!(is >> token))
+         break;
+      target[i] = ParseFloatToken(token);
+   }
+   if (is.fail())
+      throw std::runtime_error("sofie failed to read the values for tensor " + expectedName);
+}
+
+//Utility functions to generate code
+void EmitNestedLoops(std::stringstream &out, size_t loopRank, const std::vector<Dim> shape);
+void CloseNestedLoops(std::stringstream &out, size_t loopRank);
+
+
+// code for the greedy memory allocation
+struct TensorLifeInfo {
+   int begin;   // start of the tensor lifetime, expressed as an operator index
+   int end;     // end of the tensor lifetime
+   size_t size; // size of the tensor in bytes
+};
+
+struct MemoryResult {
+  std::size_t total_bytes = 0;  // total number of bytes of memory needed
+  std::vector<size_t> offsets; // resulting offset of each tensor
+};
+
+/// Greedy best-fit planner with coalescing free list.
+MemoryResult OrganizeMemory(const std::vector<TensorLifeInfo> & tensorsInfo );
+
+// Simple Dimension classes and helpers to add constexpr meta info on input
+// tensors to the emitted code.
+struct SingleDim { // one tensor dimension: either a fixed size or a named symbolic one
+   enum class Kind {
+      Static, // dimension given as a number
+      Symbolic // dimension given as a name
+   };
+
+   Kind kind; // set by the constructor that built this dimension
+   std::size_t dim; // the size for Static, 0 for Symbolic
+   std::string_view name; // view of the symbol for Symbolic (the characters are not copied), empty for Static
+
+   constexpr SingleDim(std::size_t v) : kind(Kind::Static), dim(v), name() {} // implicit conversion from a size
+   constexpr SingleDim(const char *v) : kind(Kind::Symbolic), dim(0), name(v) {} // implicit conversion from a symbol name
+};
+
+struct TensorDims { // non-owning view over an array of SingleDim
+   const SingleDim *data; // pointer to the first dimension
+   std::size_t size; // number of dimensions
+
+   constexpr std::size_t total_size() const // product of the dim members of all entries (1 when size is 0)
+   {
+      std::size_t result = 1;
+      for (std::size_t i = 0; i < size; ++i) {
+         result *= data[i].dim; // a Symbolic entry is constructed with dim == 0, which makes the product 0
+      }
+      return result;
+   }
+};
+
+template<class Arr>
+constexpr TensorDims makeDims(Arr const &arr) // wrap any container exposing data() and size()
+{
+   return {arr.data(), arr.size()};
+}
+
+// Type name to use for model outputs. BOOL is reported as UINT8 because std::vector<bool> is a
+// special type that does not wrap contiguous memory, so it is not wanted as a return type.
+inline std::string ConvertOutputTypeToString(ETensorType t) {
+   const ETensorType outType = (t == ETensorType::BOOL) ? ETensorType::UINT8 : t;
+   return ConvertTypeToString(outType);
+}
+
+
+} // namespace SOFIE
 
-#endif //TMVA_SOFIE_RMODEL
+#endif //TMVA_SOFIE_COMMON
diff --git a/src/SOFIE_core/src/Prototype.cxx b/core/src/Prototype.cxx
similarity index 100%
rename from src/SOFIE_core/src/Prototype.cxx
rename to core/src/Prototype.cxx
diff --git a/src/SOFIE_core/src/RFunction.cxx b/core/src/RFunction.cxx
similarity index 100%
rename from src/SOFIE_core/src/RFunction.cxx
rename to core/src/RFunction.cxx
diff --git a/src/SOFIE_core/src/RFunction_MLP.cxx b/core/src/RFunction_MLP.cxx
similarity index 91%
rename from src/SOFIE_core/src/RFunction_MLP.cxx
rename to core/src/RFunction_MLP.cxx
index eff76f6..5666f3e 100644
--- a/src/SOFIE_core/src/RFunction_MLP.cxx
+++ b/core/src/RFunction_MLP.cxx
@@ -10,13 +10,13 @@
 
 namespace SOFIE {
 
-RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation activation_function, bool activate_final, GraphType gType):
+RFunction_MLP::RFunction_MLP(FunctionTarget target, int_t numLayers, Activation activation_function, bool activate_final, GraphType gType):
     RFunction_Update(target, gType), fNumLayers(numLayers), fActivationFunction(activation_function), fActivateFinal(activate_final)
 {
    // assuming all the linear layers has a kernel and a bias initialized tensors
    if (fActivateFinal) {
       if (fActivationFunction == Activation::Invalid) {
-         throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " +
+         throw std::runtime_error("SOFIE GNN doesn't currently supports the provided activation function for " +
                                   fFuncName + " update.");
       }
       function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)});
@@ -43,12 +43,12 @@ void RFunction_MLP::Initialize() {
         double beta = (fBiasTensors[i].empty()) ? 0. : 1.;
         op_gemm.reset(new ROperator_Gemm<float>(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i)));
         function_block->AddOperator(std::move(op_gemm));
-        fGemmInput = fFuncName+"Gemm"+i;
+        fGemmInput = fFuncName+"Gemm"+std::to_string(i);
         if (fActivationFunction == Activation::RELU) {
             std::unique_ptr<ROperator> op_relu;
             op_relu.reset(new ROperator_Relu<float>(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i)));
             function_block->AddOperator(std::move(op_relu));
-            fGemmInput = fFuncName+"Relu"+i;
+            fGemmInput = fFuncName+"Relu"+std::to_string(i);
 
         }
     }
diff --git a/src/SOFIE_core/src/RFunction_Mean.cxx b/core/src/RFunction_Mean.cxx
similarity index 100%
rename from src/SOFIE_core/src/RFunction_Mean.cxx
rename to core/src/RFunction_Mean.cxx
diff --git a/src/SOFIE_core/src/RFunction_Sum.cxx b/core/src/RFunction_Sum.cxx
similarity index 100%
rename from src/SOFIE_core/src/RFunction_Sum.cxx
rename to core/src/RFunction_Sum.cxx
diff --git a/core/src/RModel.cxx b/core/src/RModel.cxx
new file mode 100644
index 0000000..377171c
--- /dev/null
+++ b/core/src/RModel.cxx
@@ -0,0 +1,2012 @@
+#include <limits>
+#include <algorithm>
+#include <cctype>
+#include <memory>
+#include <string>
+
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+#include "TFile.h"
+#endif
+
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/RModelProfiler.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+
+namespace SOFIE {
+
+namespace {
+const std::string SP = "   ";
+
+// Replace every occurrence of `from` in `str` with `to`, in place and left to right; an empty `from` is a no-op.
+void ReplaceAll(std::string &str, const std::string &from, const std::string &to)
+{
+   if (from.empty()) return; // find("") matches at every position, so the loop below would never terminate
+   for (size_t pos = str.find(from); pos != std::string::npos; pos = str.find(from, pos + to.length())) {
+      str.replace(pos, from.length(), to); // the search resumes after the inserted text, `to` is never re-scanned
+   }
+}
+
+bool IsIdentifierChar(char c) // true when c is an underscore or alphanumeric (per std::isalnum)
+{
+   return c == '_' || std::isalnum(static_cast<unsigned char>(c)) != 0;
+}
+
+// Tell whether s has the lexical form of a C++ identifier: non-empty, not starting
+// with a digit, and made only of characters accepted by IsIdentifierChar (keywords
+// are not rejected). Dim::param holds either a plain name (e.g. "W") or a computed
+// expression (e.g. "((W+-3)/2+1)"); only the former can name a C++ variable.
+bool IsIdentifier(const std::string &s)
+{
+   const bool badStart = s.empty() || std::isdigit(static_cast<unsigned char>(s.front()));
+   if (badStart)
+      return false;
+   // every character must be usable inside an identifier
+   return std::all_of(s.begin(), s.end(), [](char c) { return IsIdentifierChar(c); });
+}
+
+// Build the data member name used for the tensor called `name` by prefixing it with "tensor_".
+std::string TensorMember(std::string const &name)
+{
+   return std::string("tensor_").append(name);
+}
+
+} // namespace
+
+// Return the integer shape of the tensor called `name`.
+// Lookup order: ready inputs, initialized tensors, intermediate tensors, shape tensors
+// and, for a sub-graph, the parent graph. Throws std::runtime_error for input tensors
+// registered with a Dim shape, for dynamic tensors and for unknown names.
+std::vector<size_t> RModel::GetTensorShape(const std::string & name) const {
+   // input tensors registered with an integer shape
+   if (auto it = fReadyInputTensorInfos.find(name); it != fReadyInputTensorInfos.end())
+      return it->second.shape;
+   // initialized tensors
+   if (auto it = fInitializedTensors.find(name); it != fInitializedTensors.end())
+      return it->second.shape();
+   // an input known only through its Dim shape has no integer shape to return
+   if (fInputTensorInfos.find(name) != fInputTensorInfos.end())
+      throw std::runtime_error("SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter");
+   if (auto it = fIntermediateTensorInfos.find(name); it != fIntermediateTensorInfos.end())
+      return it->second.shape;
+
+   // case of shape tensors
+   if (auto it = fShapeTensors.find(name); it != fShapeTensors.end()) {
+      // a scalar (flag stored in .second) has an empty shape, otherwise the shape is
+      // a vector of size 1 holding the number of stored shape values
+      const bool isScalar = it->second.second;
+      return isScalar ? std::vector<size_t>{} : std::vector<size_t>{it->second.first.size()};
+   }
+
+   // dynamic tensors must be queried with GetDynamicTensorShape
+   if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end())
+      throw std::runtime_error("SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape");
+
+   // a sub-graph falls back to the tensors of its parent graph
+   if (fIsSubGraph && fParentGraph)
+      return fParentGraph->GetTensorShape(name);
+
+   throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found");
+}
+
+// Return the shape of tensor `name` as Dim values. Dynamic tensors and Dim inputs return their stored
+// shape; otherwise the integer shape is converted, which is why the result is returned by value.
+std::vector<Dim> RModel::GetDimTensorShape(const std::string & name) const {
+   const auto dynIt = fDynamicTensorInfos.find(name);
+   if (dynIt != fDynamicTensorInfos.end())
+      return dynIt->second.shape;
+   const auto inIt = fInputTensorInfos.find(name);
+   if (inIt != fInputTensorInfos.end())
+      return inIt->second.shape;
+   return ConvertShapeToDim(GetTensorShape(name));
+}
+// Return the Dim shape of a dynamic tensor or of a Dim input tensor; throws std::runtime_error otherwise.
+std::vector<Dim> RModel::GetDynamicTensorShape(const std::string & name) const {
+   const auto dynIt = fDynamicTensorInfos.find(name);
+   if (dynIt != fDynamicTensorInfos.end())
+      return dynIt->second.shape;
+   const auto inIt = fInputTensorInfos.find(name);
+   if (inIt != fInputTensorInfos.end())
+      return inIt->second.shape;
+   // nothing to return: the message depends on whether IsDynamicTensor knows the tensor
+   if (IsDynamicTensor(name))
+      throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not found");
+   throw std::runtime_error("SOFIE tensor [" + name + "] for which the shape is requested is not dynamic");
+}
+
+// Return the element type of the tensor called `name`.
+// Lookup order: ready inputs, initialized tensors, Dim inputs, intermediate tensors,
+// dynamic tensors, shape tensors and, for a sub-graph, the parent graph.
+// Throws std::runtime_error when the name is not found anywhere.
+ETensorType RModel::GetTensorType(std::string name) const {
+   // input tensors registered with an integer shape
+   if (auto it = fReadyInputTensorInfos.find(name); it != fReadyInputTensorInfos.end())
+      return it->second.type;
+   // initialized tensors
+   if (auto it = fInitializedTensors.find(name); it != fInitializedTensors.end())
+      return it->second.type();
+   // input tensors registered with a Dim shape
+   if (auto it = fInputTensorInfos.find(name); it != fInputTensorInfos.end())
+      return it->second.type;
+   // intermediate tensors
+   if (auto it = fIntermediateTensorInfos.find(name); it != fIntermediateTensorInfos.end())
+      return it->second.type;
+   // dynamic tensors
+   if (auto it = fDynamicTensorInfos.find(name); it != fDynamicTensorInfos.end())
+      return it->second.type;
+
+   // case of shape tensor type is INT64
+   if (fShapeTensors.find(name) != fShapeTensors.end())
+      return ETensorType::INT64;
+
+   // a sub-graph falls back to the tensors of its parent graph
+   if (fIsSubGraph && fParentGraph)
+      return fParentGraph->GetTensorType(name);
+
+   throw std::runtime_error("SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName);
+}
+
+// Tell whether `tensor_name` is registered as a ready input, Dim input, initialized, intermediate,
+// dynamic or shape tensor of this model or, for a sub-graph, of its parent graph.
+bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) {
+   const bool foundHere = fReadyInputTensorInfos.count(tensor_name) != 0 || fInputTensorInfos.count(tensor_name) != 0 ||
+                          fInitializedTensors.count(tensor_name) != 0 || fIntermediateTensorInfos.count(tensor_name) != 0 ||
+                          fDynamicTensorInfos.count(tensor_name) != 0 || fShapeTensors.count(tensor_name) != 0;
+   if (foundHere)
+      return true;
+   return fIsSubGraph && fParentGraph && fParentGraph->CheckIfTensorAlreadyExist(tensor_name);
+}
+
+// Register an input tensor described by a Dim shape under its cleaned name.
+// Throws std::runtime_error if a tensor with that name is already known to the model.
+void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<Dim> shape) {
+   const std::string cleanName = UTILITY::Clean_name(input_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: input tensor with name " + cleanName + " already exists \n");
+   InputTensorInfo info{type, shape};
+   fInputTensorInfos[cleanName] = info;
+}
+
+// Register an input tensor with an integer shape under its cleaned name; throws if the name is in use.
+void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<size_t> shape) {
+   const std::string cleanName = UTILITY::Clean_name(input_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: input tensor with name " + cleanName + " already exists \n");
+   TensorInfo info{type, shape};
+   fReadyInputTensorInfos[cleanName] = info;
+}
+
+void RModel::AddInputTensorName(std::string input_name) { // append the cleaned name to the input name list
+   fInputTensorNames.push_back(UTILITY::Clean_name(input_name));
+}
+
+// Append an operator or, when order_execution >= 0, insert it at that position of the execution order.
+// Throws std::runtime_error if order_execution is larger than the current number of operators.
+void RModel::AddOperator(std::unique_ptr<ROperator> op, int order_execution)
+{
+   AddBlasRoutines(op->GetBlasRoutines());
+   auto libs = op->GetStdLibs();
+   auto op_input_tensors = op->GetOpInputTensors();
+   for (auto &stdlib : libs)
+      AddNeededStdLib(stdlib);
+   if (order_execution >= 0) {
+      // vector::insert needs an iterator inside [begin, end]; a larger index would be undefined behaviour
+      if (static_cast<size_t>(order_execution) > fOperators.size())
+         throw std::runtime_error("sofie: operator execution order " + std::to_string(order_execution) + " is out of range");
+      fOperators.insert(fOperators.begin() + order_execution, std::move(op));
+   } else {
+      fOperators.push_back(std::move(op));
+      order_execution = fOperators.size() - 1;
+   }
+   // Record this operator as the last user of each of its input tensors, skipping model inputs and
+   // initialized (weight) tensors. This runs during parsing, before the operators are initialized.
+   for (const auto &tensor : op_input_tensors) {
+      const std::string cleanName = UTILITY::Clean_name(std::string(tensor));
+      if (IsInitializedTensor(cleanName) ||
+          std::find(fInputTensorNames.begin(), fInputTensorNames.end(), cleanName) != fInputTensorNames.end())
+         continue;
+      fIntermediateTensorFrequencyLookup[tensor] = order_execution;
+      if (Verbose())
+         std::cout << "adding order execution for " << tensor << " order " << order_execution << std::endl;
+   }
+}
+
+// Register an initialized tensor (type, shape and shared data pointer) under its cleaned name.
+// Throws std::runtime_error if a tensor with that name is already known to the model.
+void RModel::AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
+   const std::string cleanName = UTILITY::Clean_name(tensor_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: initialized tensor with name " + cleanName + " already exists \n");
+   InitializedTensor tensor{type, shape, data}; // NB: own data
+   fInitializedTensors[cleanName] = tensor;
+}
+
+// Register a constant tensor: stored like an initialized tensor but with the constant flag set.
+// Throws std::runtime_error if a tensor with the cleaned name is already known to the model.
+void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
+   const std::string cleanName = UTILITY::Clean_name(tensor_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: constant tensor with name " + cleanName + " already exists \n");
+   InitializedTensor tensor{type, shape, data, true}; // NB: own data; the last argument is the constant flag
+   fInitializedTensors[cleanName] = tensor;
+}
+
+// Register a shape tensor (its Dim values plus a scalar flag); throws if the cleaned name is already a shape tensor.
+void RModel::AddShapeTensor(const std::string & name, const std::vector<Dim> & shape_values, bool scalar){
+   const std::string cleanName = UTILITY::Clean_name(name);
+   if (fShapeTensors.find(cleanName) != fShapeTensors.end())
+      throw std::runtime_error("sofie: shape tensor with name " + cleanName + " already exists \n");
+   fShapeTensors[cleanName] = std::make_pair(shape_values, scalar);
+}
+
+// Register `name` as an alias of the tensor `origin`; both names are cleaned first.
+// Throws std::runtime_error if an alias with the same cleaned name already exists.
+void RModel::AddAliasTensor(const std::string & name, const std::string & origin){
+   const std::string aliasName = UTILITY::Clean_name(name);
+   const std::string targetName = UTILITY::Clean_name(origin);
+   if (fAliasTensors.find(aliasName) != fAliasTensors.end())
+      throw std::runtime_error("sofie: alias tensor with name " + aliasName + " already exists \n");
+   fAliasTensors[aliasName] = targetName;
+}
+
+bool RModel::IsShapeTensor(const std::string & tensor_name) const { // the name is looked up as given, not cleaned
+   return fShapeTensors.find(tensor_name) != fShapeTensors.end();
+}
+
+bool RModel::IsAliasTensor(const std::string & tensor_name) const { // the name is looked up as given, not cleaned
+   return fAliasTensors.find(tensor_name) != fAliasTensors.end();
+}
+
+const std::vector<Dim> & RModel::GetShapeTensorValues(const std::string & tensor_name) const { // `at` throws for an unknown name
+   const auto &entry = fShapeTensors.at(tensor_name);
+   return entry.first;
+}
+
+// True when the cleaned tensor name is registered as an initialized tensor.
+bool RModel::IsInitializedTensor(const std::string& tensorName) const {
+   return fInitializedTensors.count(UTILITY::Clean_name(tensorName)) != 0;
+}
+// A constant tensor is an initialized tensor that has its constant flag set.
+bool RModel::IsConstantTensor(const std::string& tensorName) const {
+   const auto it = fInitializedTensors.find(UTILITY::Clean_name(tensorName));
+   // names that are not initialized tensors are reported as not constant
+   const bool known = it != fInitializedTensors.end();
+   return known && it->second.IsConstantTensor();
+}
+
+// True for tensors registered as dynamic and also for input tensors registered with a Dim shape.
+bool RModel::IsDynamicTensor(const std::string& tensorName) const {
+   if (fDynamicTensorInfos.count(UTILITY::Clean_name(tensorName)) != 0)
+      return true;
+   return IsDimInputTensor(tensorName);
+}
+// True when the cleaned name belongs to an input tensor registered with a Dim shape.
+bool RModel::IsDimInputTensor(const std::string& tensorName) const {
+   return fInputTensorInfos.count(UTILITY::Clean_name(tensorName)) != 0;
+}
+// True when the cleaned name belongs to an input tensor registered with an integer shape.
+bool RModel::IsReadyInputTensor(const std::string& tensorName) const {
+   return fReadyInputTensorInfos.count(UTILITY::Clean_name(tensorName)) != 0;
+}
+
+// Generic addition of a tensor from a Dim shape: stored as an intermediate tensor when ConvertShapeToInt
+// yields a non-empty shape, as a dynamic tensor otherwise. NOTE(review): an empty dim_shape presumably also goes dynamic - confirm.
+void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<Dim> dim_shape) {
+   const auto int_shape = ConvertShapeToInt(dim_shape);
+   if (int_shape.empty())
+      return AddDynamicTensor(tensor_name, type, dim_shape);
+   AddIntermediateTensor(tensor_name, type, int_shape);
+}
+
+// Register an intermediate tensor with an integer shape under its cleaned name; throws if the name is in use.
+void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape) {
+   const std::string cleanName = UTILITY::Clean_name(tensor_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: intermediate tensor with name " + cleanName + " already exists \n");
+   TensorInfo info{type, shape};
+   fIntermediateTensorInfos[cleanName] = info;
+}
+
+// Register a tensor with a Dim shape as a dynamic tensor under its cleaned name; throws
+// std::runtime_error if the name is already in use. Every parametric dimension that carries
+// a value (dim != size_t(-1)) is also recorded as a shape parameter with that value as default.
+void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector<Dim> shape){
+   const std::string cleanName = UTILITY::Clean_name(tensor_name);
+   if (CheckIfTensorAlreadyExist(cleanName))
+      throw std::runtime_error("sofie: intermediate tensor with name " + cleanName + " already exists \n");
+   DynamicTensorInfo info{type, shape};
+   fDynamicTensorInfos[cleanName] = info;
+   // store shape parameter if not existing
+   constexpr size_t kNoValue = size_t(-1);
+   for (const auto &d : shape) {
+      if (d.isParam && d.dim != kNoValue)
+         AddShapeParam(d.param, d.dim);
+   }
+}
+
+// Record a new shape parameter with its default value stored as text; a known parameter is left untouched.
+void RModel::AddShapeParam(const std::string & param, size_t default_value) {
+   if (fShapeParams.find(param) != fShapeParams.end())
+      return;
+   fShapeParams[param] = std::to_string(default_value);
+   fDimShapeNames.push_back(param); // add also in the vector list (used to keep the order)
+}
+
+/// Replace the list of output tensor names with the cleaned version of the given names.
+void RModel::AddOutputTensorNameList(std::vector<std::string> outputtensornames) {
+   fOutputTensorNames.clear();
+   for (std::size_t idx = 0; idx < outputtensornames.size(); ++idx) {
+      fOutputTensorNames.emplace_back(UTILITY::Clean_name(outputtensornames[idx]));
+   }
+}
+
+/// Remove every name of curr_output_tensors from the output tensor list and then
+/// append all the names of new_output_tensors (names are used as given, without cleaning).
+void RModel::UpdateOutputTensorList(std::vector<std::string> curr_output_tensors, std::vector<std::string> new_output_tensors) {
+   for (const auto &obsoleteName : curr_output_tensors) {
+      auto newEnd = std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), obsoleteName);
+      fOutputTensorNames.erase(newEnd, fOutputTensorNames.end());
+   }
+   for (const auto &addedName : new_output_tensors) {
+      fOutputTensorNames.push_back(addedName);
+   }
+}
+
+/// Overwrite the entry of fInitializedTensors stored under the cleaned tensor name.
+/// Throws std::runtime_error when CheckIfTensorAlreadyExist does not know the name.
+void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
+   tensor_name = UTILITY::Clean_name(tensor_name);
+   const bool isKnown = CheckIfTensorAlreadyExist(tensor_name);
+   if (!isKnown)
+      throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to update it");
+   fInitializedTensors[tensor_name] = InitializedTensor{type, shape, data};
+}
+
+/// Return the shared pointer to the data of an initialized tensor.
+/// The name is looked up as given; std::runtime_error is thrown when it is not found.
+std::shared_ptr<void> RModel::GetInitializedTensorData(std::string tensor_name) {
+   auto entry = fInitializedTensors.find(tensor_name);
+   if (entry != fInitializedTensors.end())
+      return entry->second.sharedptr();
+   throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to get its data");
+}
+
+/// Erase an initialized tensor from fInitializedTensors.
+/// The name is looked up as given; std::runtime_error is thrown when it is not found.
+void RModel::RemoveInitializedTensor(std::string tensor_name) {
+   auto entry = fInitializedTensors.find(tensor_name);
+   if (entry == fInitializedTensors.end())
+      throw std::runtime_error("sofie: tensor " + tensor_name + " not found when trying to remove it");
+   fInitializedTensors.erase(entry);
+}
+
+/// Flag an initialized tensor as not writable by calling SetNotWritable() on it.
+/// The name is looked up as given in fInitializedTensors.
+/// Throws std::runtime_error when the tensor is not found.
+void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) {
+   auto entry = fInitializedTensors.find(tensor_name);
+   if (entry == fInitializedTensors.end()) {
+      // report the operation that actually failed (the message used to mention "get its info")
+      throw std::runtime_error("sofie: initialized tensor " + tensor_name + " not found when trying to set it as not writable");
+   }
+   entry->second.SetNotWritable();
+}
+
+/// Generate the code that places the output tensors of one operator inside the
+/// intermediate memory pool and update the bookkeeping kept in fIntermediateMemoryInfo.
+///
+/// \param op_output_tensors names of the output tensors of the operator
+/// \return C++ code declaring, for each placed tensor, a typed pointer into
+///         fIntermediateMemoryPool at the chosen offset
+///
+/// BOOL tensors, initialized tensors, dynamic tensors and alias tensors are skipped.
+/// The remaining tensors are processed from the largest to the smallest one: each tensor
+/// re-uses the first available chunk that is large enough; if none fits and the last available
+/// chunk is also the last chunk of the pool, that chunk is replaced by the (bigger) tensor;
+/// otherwise a new chunk is appended at the end of the pool.
+std::string RModel::AllocateIntermediateMemory(std::span<const std::string> op_output_tensors)
+{
+   std::stringstream code;
+
+   if (fVerbose) {
+      std::cout << "Total chunks allocated\n";
+      for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
+         std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl;
+      }
+   }
+
+   // helper writing in the generated code the pointer declaration for a tensor placed at "location" in the pool
+   auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) {
+      std::string typeName = ConvertTypeToString(GetTensorType(name));
+      code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes";
+      code << "\n"
+           << typeName << "* " << TensorMember(name) << " = reinterpret_cast<" << typeName
+           << "*>(fIntermediateMemoryPool.data() + " << location << ");\n";
+   };
+
+   if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n";
+   // order output tensors by size
+   std::vector<TensorMemoryInfo> ordered_output_tensors;
+
+   for (auto &it : op_output_tensors) {
+      auto name = std::string(it);
+      // tensors of these kinds are not placed in the intermediate memory pool by this function
+      if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() ||
+          fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end())
+         continue;
+
+      // case of alias tensor
+      if (IsAliasTensor(name)) {
+         continue;
+      }
+
+      // size of the tensor: size of its type times its number of elements
+      auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name));
+      // important fill the pair in the ordered output tensors with the string view and not the string
+      TensorMemoryInfo tmi = {it, tensor_size};
+      ordered_output_tensors.push_back(tmi);
+   }
+   // largest tensors first
+   std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(),
+             [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; });
+
+   for (auto &it : ordered_output_tensors) {
+      bool allocated = false;
+      std::string name = std::string{it.tensor_name};
+      size_t tensor_size = it.tensor_size;
+      if (fVerbose)
+         std::cout << "output tensor " << name << " size " << tensor_size << std::endl;
+
+      // the iterator is incremented at the end of the loop body; both successful branches leave the loop with break
+      for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
+           chunk != fIntermediateMemoryInfo.available_stack.end();) {
+
+         if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second;
+         // check if available memory chunks can accommodate the tensor
+         if (chunk->second >= tensor_size) {
+            // need to use here string_view (i.e it.tensor_name)
+            // split returns the new chunk with size of new tensor. The free chunk is before the used one
+            auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size);
+            // the tensor occupies the end of the free chunk
+            auto new_chunk_location = chunk->first + chunk->second - tensor_size;
+            fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;
+
+            declareIntermediateTensor(name, tensor_size, new_chunk_location);
+            chunk->second -= tensor_size;
+
+            allocated = true;
+
+            if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location;
+
+            // nothing left of the free chunk: remove it from the available stack
+            if (chunk->second == 0) {
+               if (fVerbose) std::cout << " and deleted since size matches";
+               fIntermediateMemoryInfo.available_stack.erase(chunk);
+            }
+            if (fVerbose) std::cout << std::endl;
+            break;
+         } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first &&
+                    fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) {
+            // case last available chunk is the last in the memory, we can increase that one
+            fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size};
+            declareIntermediateTensor(name, tensor_size, chunk->first);
+            fIntermediateMemoryInfo.available_stack.erase(chunk);
+            allocated = true;
+            if (fVerbose) std::cout << " is extended  with a bigger one of size " << tensor_size << std::endl;
+            break;
+         }
+         ++chunk;
+         if (fVerbose) std::cout << std::endl;
+      }
+
+      if (!allocated) {
+         // new chunk starting right after the last chunk of the pool (or at 0 for an empty pool)
+         size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
+                               ? 0
+                               : fIntermediateMemoryInfo.total_stack.rbegin()->first +
+                                    fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;
+
+         fIntermediateMemoryInfo.total_stack[chunk_idx] = it;
+
+         declareIntermediateTensor(name, tensor_size, chunk_idx);
+
+         if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx
+                   << std::endl;
+      }
+   }
+   return code.str();
+}
+
+/// Release the memory-pool chunks of the input tensors whose index stored in
+/// fIntermediateTensorFrequencyLookup is equal to op_idx, making them available for re-use.
+///
+/// \param op_input_tensors names of the input tensors of the operator
+/// \param op_idx index of the operator being processed
+///
+/// An alias tensor is replaced by the name stored for it in fAliasTensors. A released chunk is
+/// merged with the available chunks adjacent to it in memory (the previous and/or the following one);
+/// when there is none it is inserted as a new entry in the available stack.
+void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string> op_input_tensors, const size_t& op_idx){
+   if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n";
+   //print available chunks
+   if (fVerbose) std::cout << "available chunks before freeing them : \n";
+   for (auto chunk = fIntermediateMemoryInfo.available_stack.begin();
+        chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) {
+      if (fVerbose) std::cout << "-- free chunk " << chunk->first <<  " size = " << chunk->second << std::endl;
+   }
+   for (auto &iv : op_input_tensors) {
+      // last occurrence of the tensor is reached => flush it from memory
+      if (fVerbose) std::cout << ".. input tensors : " << iv;
+
+      // for alias tensors replace name with its alias
+      std::string it{iv};  // convert view to string
+      if (IsAliasTensor(it))
+         it = fAliasTensors[it];
+      // NOTE(review): operator[] presumably inserts a default entry for names missing from the lookup - confirm this is intended
+      if (fIntermediateTensorFrequencyLookup[it] == op_idx) {
+         if (fVerbose) std::cout << "  flash condition is met - looping on chunks to find matching one \n";
+         // linear search of the chunk holding this tensor
+         for (auto chunk = fIntermediateMemoryInfo.total_stack.begin();
+              chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) {
+            if (fVerbose) std::cout << "---  chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size;
+            if (chunk->second.tensor_name == it) {
+               if (fVerbose) std::cout << " --  Found chunk corresponding to input tensor:  " << chunk->first;
+               // check if nearby chunks in available memory can coalesce
+               auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(
+                  chunk->first); // smallest element greater than the flushed chunk idx
+               // end() is used here as the "no previous available chunk" marker
+               auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin())
+                                      ? fIntermediateMemoryInfo.available_stack.end()
+                                      : std::prev(first_greater); // largest element smaller than the flushed chunk idx
+
+               // check if the next stack entry is actually adjacent in memory
+
+               if (last_smaller != fIntermediateMemoryInfo.available_stack.end() &&
+                   last_smaller->first + last_smaller->second == chunk->first) {
+                  // merge chunk with previous one
+                  last_smaller->second += chunk->second.tensor_size;
+                  fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
+                  if (fVerbose) std::cout << " is adjacent in memory with previous one - merge ";
+                  if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
+                      last_smaller->first + last_smaller->second == first_greater->first) {
+                     // merge also with following one
+                     last_smaller->second += first_greater->second;
+                     fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(
+                        fIntermediateMemoryInfo.total_stack[first_greater->first]);
+                     // delete merged one in available stack and in total stack
+                     fIntermediateMemoryInfo.total_stack.erase(first_greater->first);
+                     fIntermediateMemoryInfo.available_stack.erase(first_greater);
+                     if (fVerbose) std::cout << " merge also with following that is free ";
+                  }
+                  // the flushed chunk is now part of the previous one: remove its own entry and stop,
+                  // since the chunk iterator cannot be used after the erase
+                  fIntermediateMemoryInfo.total_stack.erase(chunk->first);
+                  if (fVerbose) std::cout << std::endl;
+                  break;
+               } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() &&
+                          chunk->first + chunk->second.tensor_size == first_greater->first) {
+                  // merge with first greater
+                  if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n";
+                  // cannot modify idx of first_greater. Insert a new one and delete previous one
+                  size_t new_size = chunk->second.tensor_size + first_greater->second;
+                  size_t first_greater_idx = first_greater->first;
+                  fIntermediateMemoryInfo.available_stack.erase(first_greater);
+                  // cannot use anymore first_greater
+                  fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size});
+                  fIntermediateMemoryInfo.total_stack[chunk->first].merge(
+                     fIntermediateMemoryInfo.total_stack[first_greater_idx]);
+                  fIntermediateMemoryInfo.total_stack.erase(first_greater_idx);
+               } else {
+                  // no adjacent available chunk: the flushed chunk becomes a new available one
+                  fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size});
+                  if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl;
+               }
+               // the chunk stays in the total stack but it is marked as not used
+               chunk->second.tensor_name = "free";
+               break;
+            }
+         }
+      } else {
+         if (fVerbose) std::cout << std::endl;
+      }
+   }
+}
+
+/// Initialize the model using a batch size.
+/// A positive batchSize is stored under the parameter names "input_size", "batch_size" and "bs"
+/// and forwarded to the map-based Initialize; afterwards fIntermediateMemoryInfo is reset
+/// to a default-constructed MemoryPoolInfo.
+void RModel::Initialize(int batchSize, bool verbose) {
+   std::map<std::string, size_t> inputParams;
+   if (batchSize > 0) {
+      const auto nBatch = static_cast<size_t>(batchSize);
+      inputParams.emplace("input_size", nBatch);
+      inputParams.emplace("batch_size", nBatch);
+      inputParams.emplace("bs", nBatch);
+   }
+   Initialize(inputParams, verbose);
+   fIntermediateMemoryInfo = MemoryPoolInfo();
+}
+/// Initialize the model: fix the parametric input shapes, initialize all the operators
+/// and collect the information needed later by the code generation.
+///
+/// \param inputParams values for shape parameters, keyed by parameter name; a shape parameter
+///        named "<tensor name>_size" is looked up under the key "input_size"
+/// \param verbose when true, progress information is printed on std::cout
+///
+/// Nothing is done (apart from updating fVerbose) if the model is already initialized.
+void RModel::Initialize(const std::map<std::string, size_t> & inputParams, bool verbose) {
+
+   fVerbose = int(verbose);
+
+   if (fIsInitialized) {
+      if (verbose)
+         std::cout << "Model is already initialized  - skip initialization " << std::endl;
+      return;
+   }
+   fIntermediateTensorInfos.clear();
+   fDynamicTensorInfos.clear();
+
+
+   // loop on inputs and see if shape can be  full specified
+   // if the batch size is provided it can be used to specify the full shape
+   // Add the full specified tensors in fReadyInputTensors collection
+   auto originalInputTensorInfos = fInputTensorInfos; // need to copy because we may delete elements
+   for (auto &input : originalInputTensorInfos) {
+      if (verbose) std::cout << "looking at the tensor " << input.first << std::endl;
+      // if a parameter (e.g. batch_size) is specified use for converting parametric shape in defined one
+      if (!inputParams.empty()) {
+         for (auto &d : input.second.shape) {
+            if (d.isParam) {
+               std::string pname = d.param;
+               if (pname == input.first + "_size") pname = "input_size";
+               auto itr = inputParams.find(pname);
+               if (itr != inputParams.end() ) {
+                  d = Dim{ itr->second };
+                  if (verbose)
+                     std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl;
+               }
+            }
+         }
+      }
+      // see if shape now is fully defined
+      auto shape = ConvertShapeToInt(input.second.shape);
+      if (verbose)
+         std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from "
+            << ConvertDimShapeToString(input.second.shape) << std::endl;
+      if (!shape.empty()) {
+         // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and
+         // we remove the tensor from the fInputTensorInfo where the old parametric shape was stored
+         fInputTensorInfos.erase(input.first);
+         // add to the ready input tensor information the new fixed shape
+         AddInputTensorInfo(input.first, input.second.type, shape);
+         // check consistency
+         assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size());
+      }
+      // store the parameters of the input tensors
+      else {
+         // store the found parametric shape parameters (AddShapeParam skips the ones already stored)
+         for (auto &d : input.second.shape) {
+            if (d.isParam)
+               AddShapeParam(d.param, d.dim);
+         }
+      }
+   }
+
+   if (verbose) {
+      PrintRequiredInputTensors();
+      PrintDynamicTensors();
+   }
+
+   // Go through model and initialize each operator
+
+   // Build set of initialized tensors consumed by at least one runtime operator (need for later)
+   std::unordered_set<std::string> runtimeInitializedInputs;
+   for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){
+      if (verbose) {
+         auto& r = *fOperators[op_idx].get();
+         std::cout << "Initializing operator " << op_idx << "  " << typeid(r).name() << std::endl;
+      }
+      fOperators[op_idx]->Initialize(*this);
+      for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){
+         std::string name = std::string{it};
+         // check if tensor is not an initialized or output tensor and it is not already in the list
+         if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() &&
+             std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() &&
+             fInitializedTensors.find(name) == fInitializedTensors.end())
+         {
+            fIntermediateTensorFrequencyLookup[it] = op_idx;
+         }
+      }
+      // loop for non-constant operators and flag the inputs which are initialized tensors to make sure they are writable
+      if (!fOperators[op_idx]->IsOutputConstant()) {
+         for (auto &it : fOperators[op_idx]->GetOpInputTensors()) {
+            std::string name = std::string{it};
+            if (fInitializedTensors.find(name) != fInitializedTensors.end()) {
+               runtimeInitializedInputs.insert(name);
+            }
+         }
+      }
+   }
+
+   // loop on initialized tensors and make the integers as constant to be
+   // not written in a weight file and check if the tensors flagged as not writable are really not writable,
+   // i.e. are not used by non constant operators
+   for (auto &it : fInitializedTensors) {
+      // check if not-writable tensors are really not writable, i.e. are not used by non constant operators
+      if (it.second.IsNotWritable() && runtimeInitializedInputs.find(it.first) != runtimeInitializedInputs.end()) {
+         it.second.SetWritable();
+         if (verbose) {
+            std::cout << "Initialized tensor " << it.first << " is flagged as not writable but is used by non constant operators, set it as writable \n";
+         }
+      }
+      // if the tensor is an integer we can flag it as constant since it will not be written in a weight file and it is considered equivalent as being created from a Constant operator
+      // only FLOAT tensors are written in a weight file
+      if (it.second.type() !=  ETensorType::FLOAT) {
+         it.second.SetConstant();
+      }
+   }
+
+   // check if there are initialized tensors to write in a weight file
+   if (fUseWeightFile) {
+      bool modelHasWeights = false;
+      for (auto &it : fInitializedTensors) {
+         if (it.second.IsWeightTensor()) {
+            modelHasWeights = true;
+            break;
+         }
+      }
+      if (!modelHasWeights)
+         fUseWeightFile = false;
+   }
+
+   // update fIntermediateTensorFrequencyLookup for alias tensors
+   for (auto & it : fAliasTensors) {
+      if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue;
+      if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() )
+         fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first];
+      else {
+         // take the largest one
+         fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] );
+      }
+   }
+
+   fIsInitialized = true;
+}
+
+/// Attach a sub-graph to this model and initialize it.
+/// The sub-graph is linked to its parent before being initialized with the parent batch size
+/// and verbosity; afterwards it receives the weight-file and session options of the parent,
+/// its BLAS routines and standard libraries are added to the parent, the parent input tensor
+/// names are appended to its own ones and its name is cleaned.
+void RModel::InitializeSubGraph(std::shared_ptr<RModel>  graph) {
+   fSubGraphs.push_back(graph);
+
+   RModel &subGraph = *graph;
+   // the link to the parent must be in place before initializing
+   subGraph.fParentGraph = this;
+   subGraph.fIsSubGraph = true;
+   subGraph.Initialize(fBatchSize, fVerbose);
+
+   // same options as the parent model
+   subGraph.fWeightFile = fWeightFile;
+   subGraph.fUseWeightFile = fUseWeightFile;
+   subGraph.fUseSession = fUseSession;
+
+   // the parent needs the BLAS routines and the std libraries used by the sub-graph
+   std::vector<std::string> subGraphBlasRoutines(subGraph.fNeededBlasRoutines.begin(),
+                                                 subGraph.fNeededBlasRoutines.end());
+   AddBlasRoutines(subGraphBlasRoutines);
+   for (const auto &libName : subGraph.fNeededStdLib) {
+      AddNeededStdLib(libName);
+   }
+
+   // the inputs of the parent are also inputs of the sub-graph
+   for (const auto &parentInput : fInputTensorNames) {
+      subGraph.fInputTensorNames.emplace_back(parentInput);
+   }
+
+   subGraph.fName = UTILITY::Clean_name(subGraph.fName);
+}
+
+// Generate the code declaring and initializing a constant tensor, i.e. a tensor whose values
+// are written directly in the generated code instead of being read from a weight file.
+// Tensors with at most 100 elements which are not weight tensors are emitted as a C array,
+// all the others as a std::vector (to reduce compilation time and, for weights, to allow
+// broadcasting). For the std::vector case with more than one element, when all values are
+// equal the vector is built with the (count, value) constructor.
+template <typename T>
+std::string GenerateConstantTensorCode(const std::pair<std::string, InitializedTensor> &t)
+{
+   const std::string &tensorName = t.first;
+   const std::string cppType = ConvertTypeToString(t.second.type());
+   const size_t nElements = ConvertShapeToLength(t.second.shape());
+   const bool useStdVector = (nElements > 100 || t.second.IsWeightTensor());
+   const T *values = t.second.data<T>();
+
+   std::stringstream out;
+   if (!useStdVector) {
+      // small non-weight tensor: plain array plus a pointer to it
+      out << cppType << " fTensor_" << tensorName << "[" << nElements << "] = " << ConvertValuesToString(nElements, values) << ";\n";
+      out << cppType << " * " << TensorMember(tensorName) << " = fTensor_" + tensorName + ";\n";
+      return out.str();
+   }
+
+   // look for a tensor filled with a single repeated value
+   bool allValuesEqual = false;
+   if (nElements > 1) {
+      allValuesEqual = true;
+      for (size_t idx = 1; allValuesEqual && idx < nElements; ++idx)
+         allValuesEqual = (values[idx] == values[idx - 1]);
+   }
+
+   out << "std::vector<" << cppType << "> fTensor_" << tensorName << " = ";
+   if (allValuesEqual)
+      out << "std::vector<" << cppType << ">(" << nElements << ", " << ConvertValToString(values[0]) << ");\n";
+   else
+      out << ConvertValuesToString(nElements, values) << ";\n";
+   out << cppType << " * " << TensorMember(tensorName) << " = fTensor_" + tensorName + ".data();\n";
+   return out.str();
+}
+
+/// Append to the generated code the declarations of the initialized tensors.
+/// Tensors flagged as not writable are skipped. FLOAT weight tensors which are not constant
+/// are, when a weight file is used, only declared as a std::vector of the right size;
+/// all the other tensors of type FLOAT, INT64, INT32, BOOL or UINT8 are emitted together
+/// with their values. The sizes are accumulated in fWeightsTensorSize and fConstantTensorSize.
+void RModel::GenerateInitializedTensorInfo()
+{
+   if (!fInitializedTensors.empty())
+      fGC += "// initialized (weights and constant) tensors\n";
+
+   for (auto &entry : fInitializedTensors) {
+      auto &tensor = entry.second;
+      if (tensor.IsNotWritable())
+         continue;
+
+      const size_t nElements = ConvertShapeToLength(tensor.shape());
+
+      // tensors whose values are read from the weight file: only storage is declared
+      const bool readFromFile = fUseWeightFile && !tensor.IsConstantTensor() && tensor.IsWeightTensor() &&
+                                tensor.type() == ETensorType::FLOAT;
+      if (readFromFile) {
+         fGC += "std::vector<float> fTensor_" + entry.first + " = std::vector<float>(" + std::to_string(nElements) + ");\n";
+         fGC += "float * " + TensorMember(entry.first) + " = fTensor_" + entry.first + ".data();\n";
+         fWeightsTensorSize += nElements * sizeof(float);
+         continue;
+      }
+
+      // constant tensors or initialized ones which are not weights (e.g. int64_t tensors)
+      switch (tensor.type()) {
+      case ETensorType::FLOAT: {
+         // NaN or Inf values require <limits> in the generated code
+         const float *values = tensor.data<float>();
+         const bool hasInfOrNaN = std::any_of(values, values + nElements,
+                                              [](float v) { return std::isinf(v) || std::isnan(v); });
+         if (hasInfOrNaN)
+            AddNeededStdLib("limits");
+         fGC += GenerateConstantTensorCode<float>(entry);
+         fConstantTensorSize += nElements * sizeof(float);
+         break;
+      }
+      case ETensorType::INT64:
+         fGC += GenerateConstantTensorCode<int64_t>(entry);
+         fConstantTensorSize += nElements * sizeof(int64_t);
+         break;
+      case ETensorType::INT32:
+         fGC += GenerateConstantTensorCode<int32_t>(entry);
+         fConstantTensorSize += nElements * sizeof(int32_t);
+         break;
+      case ETensorType::BOOL:
+      case ETensorType::UINT8:
+         fGC += GenerateConstantTensorCode<uint8_t>(entry);
+         fConstantTensorSize += nElements * sizeof(uint8_t);
+         break;
+      default:
+         // other types produce no code here
+         break;
+      }
+   }
+}
+
+/// Append to the generated code the declaration of fIntermediateMemoryPool.
+/// The pool is a std::vector<char> (char takes 1 byte, which makes it easy to place tensors
+/// of other data types) whose size is the offset of the last chunk of the total stack plus
+/// its size. Nothing is generated when the total stack is empty.
+void RModel::GenerateIntermediateMemoryPool() {
+   const auto &chunks = fIntermediateMemoryInfo.total_stack;
+   if (chunks.empty())
+      return;
+
+   const auto lastChunk = chunks.rbegin();
+   const size_t poolSize = lastChunk->first + lastChunk->second.tensor_size;
+
+   fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n";
+   fGC += "std::vector<char> fIntermediateMemoryPool = std::vector<char>(" + std::to_string(poolSize) + ");\n\n";
+}
+
+/// Append to the generated code the declarations of the intermediate and dynamic tensors.
+/// For the intermediate tensors:
+///  - alias tensors are declared as null pointers;
+///  - BOOL tensors get their own std::vector<std::uint8_t>;
+///  - FLOAT, DOUBLE and INT64 tensors which are not model outputs get their own std::vector,
+///    unless they are in fIntermediateTensorFrequencyLookup and the optimization level is kExtended.
+/// Dynamic tensors are only declared (as null pointers) together with fDynamicMemoryPool.
+void RModel::GenerateIntermediateTensorInfo() {
+   if (!fIntermediateTensorInfos.empty()) {
+      std::string declarations;
+      for (auto &entry : fIntermediateTensorInfos) {
+         const std::string &name = entry.first;
+         const ETensorType type = entry.second.type;
+
+         if (IsAliasTensor(name)) {
+            declarations += ConvertTypeToString(type) + " * " + TensorMember(name) + " = nullptr;\n";
+            continue;
+         }
+         if (type == ETensorType::BOOL) {
+            declarations += "std::vector<std::uint8_t> fTensor_" + name + " = std::vector<std::uint8_t>(" + std::to_string(ConvertShapeToLength(entry.second.shape)) + ");\n";
+            declarations += "std::uint8_t * " + TensorMember(name) + " = fTensor_" + name + ".data();\n";
+            continue;
+         }
+
+         // model outputs are not declared here
+         const bool isModelOutput =
+            (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) != fOutputTensorNames.end());
+         if (isModelOutput)
+            continue;
+         // tensors in the frequency lookup are not declared here for the extended optimization level
+         const bool inFrequencyMap =
+            (fIntermediateTensorFrequencyLookup.find(name) != fIntermediateTensorFrequencyLookup.end());
+         if (inFrequencyMap && fOptimizationLevel == OptimizationLevel::kExtended)
+            continue;
+
+         const size_t length = ConvertShapeToLength(entry.second.shape);
+         // declare a std::vector of the given C++ type with a pointer to its data
+         auto declareVector = [&](const std::string &cppType, size_t elementSize) {
+            declarations += "std::vector<" + cppType + "> fTensor_" + name + " = std::vector<" + cppType + ">(" + std::to_string(length) + ");\n";
+            declarations += cppType + " * " + TensorMember(name) + " = fTensor_" + name + ".data();\n";
+            fOtherTensorSize += elementSize * length;
+         };
+         if (type == ETensorType::FLOAT)
+            declareVector("float", 4);
+         else if (type == ETensorType::DOUBLE)
+            declareVector("double", 8);
+         else if (type == ETensorType::INT64)
+            declareVector("int64_t", 8);
+      }
+
+      if (!declarations.empty()) {
+         fGC += "\n//--- declare and allocate the intermediate tensors\n" + declarations;
+      }
+   }
+
+   // dynamic tensors: only the declarations, the allocation is done later
+   if (fDynamicTensorInfos.empty())
+      return;
+   fGC += "//--- declare the dynamic tensors\n";
+   for (auto &entry : fDynamicTensorInfos) {
+      fGC += ConvertTypeToString(entry.second.type) + " * " + TensorMember(entry.first) + " = nullptr;\n";
+   }
+   fGC += "//--- dynamic tensors pool\n";
+   fGC += "std::vector<char> fDynamicMemoryPool;\n";
+}
+
+// Generate the code for the operator-specific declarations to be defined in the Session class.
+// The declaration code of all the operators is concatenated; nothing is added to the
+// generated code when it is empty.
+void RModel::GenerateOperatorDeclarations() {
+   std::string declarations;
+   for (std::size_t idx = 0; idx < fOperators.size(); ++idx) {
+      declarations += fOperators[idx]->GenerateDeclCode();
+   }
+   if (!declarations.empty()) {
+      fGC += "\n//---- operator declarations \n" + declarations + "\n";
+   }
+}
+
+/// Generate the code managing the memory of the dynamic tensors.
+/// For every dynamic tensor which is the output of an operator (alias tensors and model output
+/// tensors excluded) the generated code stores a TensorLifeInfo {begin, end, size}; it then calls
+/// OrganizeMemory on the collected list, sizes fDynamicMemoryPool with the returned total_bytes
+/// and makes each tensor pointer point to its offset inside the pool.
+/// Throws std::runtime_error when a tensor has its end before its begin or when a dynamic tensor
+/// (not alias and not model output) is not found among the operator outputs.
+/// Nothing is generated when there are no dynamic tensors.
+void RModel::GenerateDynamicTensorInfo()
+{
+   // generate code for allocating dynamic tensors using the greedy memory allocations
+   if (fDynamicTensorInfos.empty())
+      return;
+
+   if (fVerbose) {
+      std::cout << "generating code for dynamic tensor management" << std::endl;
+      PrintDynamicTensors();
+   }
+
+   std::stringstream out;
+   out << "//  dynamic tensor memory management\n";
+   out << SP << "std::vector<SOFIE::TensorLifeInfo> dynamicTensorInfos;\n";
+   out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n";
+
+   // loop on all the operators to find begin/end life of the tensors
+   int op_index = 0;
+   // tensors added to the generated list, in the same order as the generated push_back calls
+   std::vector<std::pair<std::string, ETensorType>> tensors;
+   tensors.reserve(fDynamicTensorInfos.size());
+   for (auto & op : fOperators) {
+      // loop on output tensors -
+      for (auto &it : op->GetOpOutputTensors()) {
+         if (fVerbose) {
+            auto op_ptr = op.get();
+            std::cout << "Looping on operator " << op_index << "   " << typeid(*op_ptr).name() << std::endl;
+         }
+         // check if is a dynamic tensor and not an alias tensor or output tensor
+         std::string name = std::string(it);
+         if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)
+              && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end()) {
+            auto tensor_size =  ConvertDimShapeToLength(GetDimTensorShape(name));
+            auto type = GetTensorType(name);
+            size_t type_size = GetTypeSize(type);
+            // life starts at the operator producing the tensor and, by default, lasts until the last operator
+            int begin = op_index;
+            int end = fOperators.size();
+            // look for end
+            auto it_lookup = fIntermediateTensorFrequencyLookup.find(name);
+            if (it_lookup != fIntermediateTensorFrequencyLookup.end())
+               end = it_lookup->second + 1;  // end is last time used + 1
+            // // some tensors (like xcol in convolutions) are just used within the operators
+            // if (end == 0 && begin > 0) end = begin+1;
+
+            if (begin> end) {
+               std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << "  "  << " end " << end << std::endl;
+               throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin");
+            }
+
+            // write in code
+            out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });"
+                << " // tensor_" << name << std::endl;
+            tensors.push_back({name,type});
+         }
+      }
+      op_index++; // increment operator index
+   }
+   out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n";
+   out << "//  allocating now the memory\n";
+   out << SP << "fDynamicMemoryPool = std::vector<char>(memory_result.total_bytes);\n";
+   out << SP << "int idx = 0;\n";
+   // NOTE(review): the pointer name is written here as "tensor_" + name while the declarations
+   // use TensorMember(name) - confirm that both give the same identifier
+   for (auto & it : tensors) {
+      out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " *>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n";
+   }
+   // check that all dynamic tensors are covered
+   bool missingTensor = false;
+   for (auto &i : fDynamicTensorInfos) {
+      if (IsAliasTensor(i.first)) continue;
+      if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) != fOutputTensorNames.end()) continue;
+      if (std::find(tensors.begin(), tensors.end(), std::pair<std::string,ETensorType>{i.first, i.second.type}) == tensors.end()) {
+         std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl;
+         missingTensor = true;
+      }
+   }
+   if (missingTensor)
+      throw std::runtime_error("sofie: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list");
+
+   fGC += out.str();
+}
+
+/// Tell whether `paramName` is the `param` of at least one dimension of an
+/// input tensor that has a parametric (dynamic) shape.
+/// Input tensors for which IsDimInputTensor() is false are not inspected.
+bool RModel::IsInputTensorShapeParam(std::string const &paramName) const
+{
+   for (auto const &inputName : fInputTensorNames) {
+      // only inputs with a parametric shape can contribute a match
+      if (!IsDimInputTensor(inputName))
+         continue;
+      auto const dims = GetDynamicTensorShape(inputName);
+      bool const found = std::any_of(dims.begin(), dims.end(),
+                                     [&paramName](auto const &dim) { return dim.param == paramName; });
+      if (found)
+         return true;
+   }
+   return false;
+}
+
+/// Scan `input` for identifiers that begin with "tensor_" and are not glued to a
+/// preceding identifier character (so the tail of "my_tensor_x" is not picked up).
+/// Identifiers equal to "tensor_" + <input tensor name> or "tensor_" + <output tensor name>
+/// are left out. The returned vector is sorted and contains no duplicates.
+std::vector<std::string> RModel::CollectTensorMemberNames(const std::string &input)
+{
+   const std::string prefix = "tensor_";
+
+   // true when `candidate` equals prefix + one of the names in `names`
+   auto isPrefixedNameOf = [&prefix](std::string const &candidate, auto const &names) {
+      return std::any_of(names.begin(), names.end(),
+                         [&](std::string const &n) { return candidate == prefix + n; });
+   };
+
+   std::vector<std::string> collected;
+
+   std::size_t pos = 0;
+   while (pos < input.size()) {
+      // a match needs the prefix at `pos` and no identifier character right before it
+      bool const startsHere = pos + prefix.size() <= input.size() &&
+                              input.compare(pos, prefix.size(), prefix) == 0 &&
+                              (pos == 0 || !IsIdentifierChar(input[pos - 1]));
+      if (!startsHere) {
+         ++pos;
+         continue;
+      }
+
+      // walk to the end of the identifier
+      std::size_t last = pos + prefix.size();
+      while (last < input.size() && IsIdentifierChar(input[last]))
+         ++last;
+
+      std::string identifier = input.substr(pos, last - pos);
+
+      // input and output tensors are not collected
+      if (!isPrefixedNameOf(identifier, fInputTensorNames) && !isPrefixedNameOf(identifier, fOutputTensorNames))
+         collected.push_back(identifier);
+
+      pos = last; // resume scanning right after the identifier
+   }
+
+   // sort, then drop adjacent duplicates (original order of appearance is lost)
+   std::sort(collected.begin(), collected.end());
+   collected.erase(std::unique(collected.begin(), collected.end()), collected.end());
+
+   return collected;
+}
+
+std::string RModel::GenerateInferSignature(bool isdecl) {
+   // Build the comma-separated argument list of the inference function from the input tensors.
+   //  - isdecl = true : declaration form, e.g. "size_t N,float const* tensor_x,float const* tensor_y"
+   //  - isdecl = false: call form,        e.g. "N,tensor_x,tensor_y"
+   // A shape parameter shared by several input tensors is emitted only once,
+   // in front of the first tensor that uses it.
+   std::string signature;
+   std::unordered_set<std::string> emittedParams;
+   for (auto const &inputName : fInputTensorNames) {
+      // dynamic input tensors first pass their shape parameters
+      if (IsDimInputTensor(inputName)) {
+         for (auto const &dim : GetDynamicTensorShape(inputName)) {
+            // skip fixed dimensions and parameters already emitted for an earlier tensor
+            if (!dim.isParam || !emittedParams.insert(dim.param).second)
+               continue;
+            if (isdecl)
+               signature += "size_t ";
+            signature += dim.param + ",";
+         }
+      }
+      if (isdecl) {
+         std::string const typeName = ConvertTypeToString(GetTensorType(inputName));
+         if (typeName == "other")
+            throw std::runtime_error("sofie: input tensor " + inputName +
+                                     " is of a data type which is not yet supported.");
+         signature += typeName + " const* ";
+      }
+      signature += "tensor_" + inputName + ",";
+   }
+
+   // every input appended a trailing ","; drop the last one
+   if (!fInputTensorNames.empty())
+      signature.pop_back();
+   return signature;
+}
+
+namespace {
+
+std::string typeForOutput(ETensorType t) {
+   // std::vector<bool> is a special type that does not wrap contiguous memory,
+   // so it is not used as a return type: BOOL is reported with the UINT8 type name.
+   return ConvertTypeToString(t == ETensorType::BOOL ? ETensorType::UINT8 : t);
+}
+
+std::string memberNameForDimShape(std::string name)
+{
+   // Member-style name for a shape parameter: "f" followed by `name`
+   // with its first character upper-cased (an empty name gives "f").
+   std::string member = "f";
+   if (!name.empty()) {
+      member += static_cast<char>(std::toupper(static_cast<unsigned char>(name.front())));
+      member.append(name, 1, std::string::npos);
+   }
+   return member;
+}
+
+}
+
+/// Generate the public `infer(...)` wrapper and append it to fGC.
+///
+/// The emitted function creates one std::vector per output tensor, checks each
+/// dynamic input shape parameter against the corresponding "f"-prefixed name,
+/// forwards all arguments to `doInfer` and returns the output vector(s).
+/// Its return type is:
+///  - one output:                   std::vector<T>
+///  - several outputs, same type:   std::vector<std::vector<T>>
+///  - several outputs, mixed types: std::tuple<std::vector<T1>, ...>
+///
+/// Throws std::runtime_error when the model has no output tensors.
+void RModel::GenerateOutput()
+{
+   // The first output tensor is read unconditionally below, so an empty output
+   // list has to be rejected before touching it.
+   if (fOutputTensorNames.empty())
+      throw std::runtime_error("sofie: output size=0 are not supported");
+
+   size_t outputSize = fOutputTensorNames.size();
+
+   bool sameOutputTypes = true;
+   std::string inferReturnType; // return type of infer when it has to be a std::tuple
+   ETensorType eFirstOutputType = GetTensorType(fOutputTensorNames.front());
+   fGC += "\n\n";
+   if (outputSize == 1) {
+      fGC += "std::vector<" + typeForOutput(eFirstOutputType) + ">";
+   } else {
+      // if all output types are the same we return an std::vector - otherwise a tuple
+      for (std::string const &name : fOutputTensorNames) {
+         if (GetTensorType(name) != eFirstOutputType)
+            sameOutputTypes = false;
+      }
+      if (sameOutputTypes)
+         fGC += "std::vector<std::vector<" + typeForOutput(eFirstOutputType) + ">>";
+      else {
+         inferReturnType = "std::tuple<";
+         for (size_t i = 0; i < outputSize; i++) {
+            inferReturnType += "std::vector<" + typeForOutput(GetTensorType(fOutputTensorNames[i])) + ">";
+            if (i < outputSize - 1)
+               inferReturnType += ",";
+         }
+         inferReturnType += ">";
+         fGC += inferReturnType;
+      }
+   }
+
+   fGC += " infer(" + GenerateInferSignature() + "){\n";
+
+   // arguments forwarded to doInfer: the inputs first, then for every output its
+   // data pointer followed by its runtime-computed shape parameters
+   std::string doInferArgs = GenerateInferSignature(false);
+   if (!doInferArgs.empty())
+      doInferArgs += ",";
+   for (std::string const &name : fOutputTensorNames) {
+      bool isDynamic = fDynamicTensorInfos.count(name) > 0;
+      std::string n; // expression giving the number of elements of the output vector
+      if(!isDynamic) {
+         n = std::to_string(ConvertShapeToLength(GetTensorShape(name)));
+      } else {
+         std::string dimLen = ConvertDimShapeToLength(GetDynamicTensorShape(name));
+         // Use the session member (fXxx) when any dim is a runtime-computed identifier
+         // (e.g. NonZero count). For expression-type dims derived from input shapes
+         // (e.g. "((W+-3)/2+1)"), use the expression directly.
+         // Input shape parameters do not need the session member: they are passed as
+         // arguments to the infer function and are not runtime-computed values.
+         bool hasRuntimeParam = false;
+         for (auto const &dim : GetDynamicTensorShape(name)) {
+            if (dim.isParam && IsIdentifier(dim.param) && !IsInputTensorShapeParam(dim.param))
+               hasRuntimeParam = true;
+         }
+         n = hasRuntimeParam ? memberNameForDimShape(dimLen) : dimLen;
+      }
+      std::string outputName = "output_tensor_" + name;
+      fGC += SP + "std::vector<" + typeForOutput(GetTensorType(name)) + " > " + outputName + "(" + n + ");\n";
+      doInferArgs += " " + outputName + ".data(),";
+      if(isDynamic) {
+         // declare a local for every runtime-computed dimension and pass it to doInfer
+         for (auto const &dim : GetDynamicTensorShape(name)) {
+            if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param)) {
+               fGC += SP + "size_t " + dim.param + " = 0;\n";
+               doInferArgs += " " + dim.param + ",";
+            }
+         }
+      }
+   }
+   // replace the trailing "," with a blank
+   if (!doInferArgs.empty())
+      doInferArgs.back() = ' ';
+
+   // verifying if the dynamic parameters are within allowed range
+   // (each parameter is checked once, even if shared by several input tensors)
+   std::unordered_set<std::string> input_params_checked;
+   for (auto &name : fInputTensorNames) {
+      if (IsDimInputTensor(name)) {
+         auto shape = GetDynamicTensorShape(name);
+         for (auto &d : shape) {
+            std::string pName = d.param;
+            if (d.isParam && input_params_checked.count(pName) == 0) {
+               std::string memberName = memberNameForDimShape(d.param);
+               input_params_checked.insert(pName);
+               fGC += SP + "if (" + d.param + " > " + memberName + ") {\n";
+               fGC += SP + SP + "throw std::runtime_error(\"sofie: dynamic input tensor shape parameter " +
+                      d.param + " exceeds the initialized maximum allowed shape.\");\n";
+               fGC += SP + "}\n";
+            }
+         }
+      }
+   }
+
+   if (fUseSession) {
+      fGC += SP + "doInfer(*this, " + doInferArgs + ");\n";
+   } else {
+      fGC += SP + "doInfer(" + doInferArgs + ");\n";
+   }
+
+   // If the output tensors have dynamic sizes, now is the time to set them
+   for (std::string const &name : fOutputTensorNames) {
+      bool isDynamic = fDynamicTensorInfos.count(name) > 0;
+      if (isDynamic) {
+         std::string outputName = "output_tensor_" + name;
+         auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name));
+         fGC += SP + outputName + ".resize(" + tensor_size + ");\n";
+      }
+   }
+
+   // brace-initialize the return value from the output vectors
+   fGC += SP + "return {";
+   for (size_t i = 0; i < fOutputTensorNames.size(); i++) {
+      fGC += "output_tensor_" + fOutputTensorNames[i];
+      if (i < fOutputTensorNames.size() - 1)
+         fGC += ",";
+   }
+   fGC += "};\n";
+   fGC += "}\n"; // end of infer function scope
+}
+
+/// Generate the code of this model (or sub-graph) and append it to fGC:
+/// the forward declarations, the Session struct with its data members and
+/// constructor, the `infer` wrapper (through GenerateOutput) and finally the
+/// definition of `doInfer` holding the code of all operators.
+///
+/// Throws std::runtime_error when the model has no output tensors.
+void RModel::GenerateSessionCode()
+{
+   // Reject models without outputs before emitting anything: the signature built
+   // below is finished with a back() access and GenerateOutput() reads the first
+   // output tensor, so both need at least one output.
+   if (fOutputTensorNames.size() == 0)
+      throw std::runtime_error("sofie: output size=0 are not supported");
+
+   std::string sessionName = !fIsSubGraph ? "Session" : "Session_" + fName;
+
+   if (fUseSession && !fIsGNNComponent) {
+      //  forward declare session struct
+      fGC += "struct " + sessionName + ";\n";
+   }
+
+   // Determine the signature of the actual inference function:
+   // inputs, then for each output its pointer and its runtime-computed shape parameters
+   std::string doInferSignature = GenerateInferSignature();
+   if (!doInferSignature.empty())
+      doInferSignature += ", ";
+   for (auto const &name : fOutputTensorNames) {
+      bool isDynamic = fDynamicTensorInfos.count(name) > 0;
+      doInferSignature += typeForOutput(GetTensorType(name)) + " *tensor_" + name + ",";
+      if(isDynamic) {
+         for (auto const &dim : GetDynamicTensorShape(name)) {
+            if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param))
+               doInferSignature += " size_t &" + dim.param + "_output,";
+         }
+      }
+   }
+   // replace the trailing "," with a blank
+   doInferSignature.back() = ' ';
+
+   if (fUseSession) {
+      doInferSignature = sessionName + " const &session, " + doInferSignature;
+   }
+
+   doInferSignature = "inline void doInfer(" + doInferSignature + ")";
+
+   if (!fIsGNNComponent) {
+      // forward declare inference implementation
+      fGC += doInferSignature + ";\n";
+   }
+
+   // define the Session struct (for GNN this is generated in RModel_GNN)
+   if (fUseSession && !fIsGNNComponent) {
+      fGC += "struct " + sessionName + " {\n";
+   }
+
+   // generate code for declaring the initialized tensors
+   GenerateInitializedTensorInfo();
+
+   if (fOptimizationLevel == OptimizationLevel::kExtended) {
+      // evaluate total intermediate memory and position intermediate tensor addresses
+      std::string intermediate_memory_alloc_string = "";
+      intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --";
+      for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
+         if (fVerbose) {
+            auto op = fOperators[op_idx].get();
+            std::cout << "\n******************\n analyzing input/output operator " << op_idx << "  "
+                      << typeid(*op).name() << std::endl;
+         }
+         intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors());
+         CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx);
+      }
+
+      // to check remaining unused fragments after memory allocation (lesser the better)
+      // for (const auto &it: fIntermediateMemoryInfo.available_stack){
+      //    std::cout<<"chunk_idx: "<<it.first<<", chunk_size: "<<it.second<<"\n";
+      // }
+
+      // generate the memory pool to be used by intermediate tensors
+      GenerateIntermediateMemoryPool();
+
+      // position intermediate tensors
+      fGC += intermediate_memory_alloc_string;
+   }
+
+   // generate the code declaring the intermediate tensors
+   GenerateIntermediateTensorInfo();
+   // generate code for declarations of some specific operators
+   GenerateOperatorDeclarations();
+   // generate profiling session data member if profiling is enabled
+   if (fProfile) {
+      fGC += RModelProfiler::GenerateSessionMembers();
+   }
+
+   // storing the parameters for future checking to avoid mismatches
+   // (members are declared in alphabetical order, using a sorted copy)
+   if (!fDimShapeNames.empty()) {
+      fGC += "\n//   dynamic shape parameters\n";
+      auto dimShapeNames = fDimShapeNames;
+      std::sort(dimShapeNames.begin(), dimShapeNames.end());
+      for (const auto &p : dimShapeNames) {
+         fGC += "size_t " + memberNameForDimShape(p) + ";\n";
+      }
+   }
+
+   // add subgraph session
+   if (!fSubGraphs.empty()) fGC += "//   subgraph sessions\n";
+   for (auto & graph : fSubGraphs) {
+      fGC += "Session_" + graph->fName + "  fSession_" + graph->fName + ";\n";
+   }
+
+   // Generate code for Session constructor
+   if (fUseSession) {
+      // add here specific operator code that needs to define session data members
+      fGC += "\n";
+      for (size_t id = 0; id < fOperators.size(); id++) {
+         std::string opName = std::to_string(id);
+         fGC += fOperators[id]->GenerateSessionMembersCode(opName);
+      }
+      fGC += "\n";
+      // here add initialization and reading of weight tensors
+      if (fUseWeightFile) {
+         std::string fileName = fName;
+         if (fWeightFile == WeightFileType::Text) {
+            fileName += ".dat";
+         }
+         if (fWeightFile == WeightFileType::RootBinary) {
+            fileName += ".root";
+         }
+         fGC += sessionName + "(std::string filename =\"" + fileName + "\"";
+      } else {
+         // no need to pass weight file since it is not used
+         // keep passing a string for compatibility
+         fGC += sessionName + "(std::string = \"\"";
+      }
+      // add initialization of shape parameters
+      // assume all parameters are of type size_t
+      if (!fDimShapeNames.empty()) {
+         // need to use same order as in infer function not alphabetical one
+         for (auto &p : fDimShapeNames) {
+            fGC += ",\n";
+            fGC += "        size_t " + p + " = " + fShapeParams[p];
+         }
+      }
+      fGC += ") {\n";
+
+      // initializing dynamic parameters
+      if (!fDimShapeNames.empty()) {
+         fGC += "\n\n";
+         // Sort a copy: fDimShapeNames itself must keep its original order, which is
+         // the one used just above for the constructor parameters.
+         auto sortedDimShapeNames = fDimShapeNames;
+         std::sort(sortedDimShapeNames.begin(), sortedDimShapeNames.end());
+         for (const auto &p : sortedDimShapeNames) {
+            fGC += "   " + memberNameForDimShape(p) + " = " + p + ";\n";
+         }
+      }
+      // add some extra code needed for initialization of dynamic parameters
+      fGC += fExtraCodeForDimShapes;
+
+      if (fUseWeightFile) {
+         fGC += "\n//--- reading weights from file\n";
+         ReadInitializedTensorsFromFile(fReadPos);
+         fGC += "\n";
+      }
+
+      // now we have passed the parameters we can allocate the dynamic tensors
+      GenerateDynamicTensorInfo();
+
+      // add here initialization code  for operator
+      for (size_t id = 0; id < fOperators.size(); id++) {
+         fGC += fOperators[id]->GenerateInitCode();
+      }
+
+      fGC += "}\n\n";
+   }
+
+   // generate the infer function that allocates and returns the output vectors
+   GenerateOutput();
+
+   // generate profiling utility functions inside the Session struct
+   if (fProfile) {
+      fGC += RModelProfiler::GenerateUtilityFunctions();
+   }
+
+   // end of session
+   if (fUseSession && !fIsGNNComponent) {
+      fGC += "};   // end of Session\n\n";
+
+      GenerateRequiredInputTensorInfo();
+   }
+
+   // definition of doInfer
+   fGC += doInferSignature + " {\n";
+   fGC += "\n";
+
+   // generate the inference code
+   if (fVerbose)
+      std::cout << "Generating main inference code for " << fName << std::endl;
+
+   if (fProfile) {
+      fGC += RModelProfiler::GenerateBeginInferCode();
+   }
+
+   std::string allOperatorCode;
+
+   for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
+      if (fVerbose)
+         std::cout << "Generating code for operator .... " << op_idx << std::endl;
+      if (fProfile) {
+         allOperatorCode += RModelProfiler::GenerateOperatorCode(*fOperators[op_idx], op_idx);
+      } else {
+         allOperatorCode += fOperators[op_idx]->Generate(std::to_string(op_idx));
+      }
+   }
+
+   // If the generated code uses members of the session struct, use the
+   // local variable name that we're using for the session:
+   ReplaceAll(allOperatorCode, "this->", "session.");
+
+   if (fUseSession && !fIsGNNComponent) {
+      // Collect all "tensor_*" data members that are not input or output tensors
+      // and bind each of them to a local reference with the same name
+      std::vector<std::string> tensorMemberNames = CollectTensorMemberNames(allOperatorCode);
+      for (auto const& name: tensorMemberNames) {
+         fGC += "    auto &" + name + " = session." + name + ";\n";
+      }
+      fGC += "\n";
+   }
+
+   fGC += allOperatorCode;
+
+   if (fProfile) {
+      fGC += RModelProfiler::GenerateEndInferCode();
+   }
+
+   for (auto const& name: fOutputTensorNames) {
+      bool isDynamic = fDynamicTensorInfos.count(name) > 0;
+      if(isDynamic) {
+         // hand the runtime-computed dimensions back through the "_output" reference arguments
+         for (auto const &dim : GetDynamicTensorShape(name)) {
+            if (dim.isParam && !IsInputTensorShapeParam(dim.param) && IsIdentifier(dim.param))
+               fGC += "   " + dim.param + "_output = " + dim.param + ";\n";
+         }
+      }
+      if(IsConstantTensor(name)) {
+         // an output that is a constant tensor is copied from the session into the output buffer
+         std::string t = "session.tensor_" + name;
+         size_t length = ConvertShapeToLength(fInitializedTensors[name].shape());
+         fGC += "    std::copy(" + t + ", " + t + " + " + std::to_string(length) + ", tensor_" + name + ");\n";
+      }
+   }
+   fGC += "\n";
+
+   fGC += "}\n";
+}
+
+/// Top-level code generation: decode `options`, initialize the model, then fill fGC
+/// with the header, the code of every sub-graph and the session code of this model.
+///  - options:   bit mask built from the Options enumerators
+///  - batchSize: stored in fBatchSize and forwarded to Initialize()
+///  - pos:       stored in fReadPos
+///  - verbose:   stored in fVerbose and forwarded to Initialize()
+/// Throws std::runtime_error when a weight file is requested without a Session class.
+void RModel::Generate(std::underlying_type_t<Options> options, int batchSize, long pos, bool verbose)
+{
+   using OptionBits = std::underlying_type_t<Options>;
+   // true when the bit of `flag` is present in `options`
+   auto const isEnabled = [options](Options flag) { return (static_cast<OptionBits>(flag) & options) != 0; };
+
+   fProfile = isEnabled(Options::kProfile);
+   fVerbose = verbose;
+   fBatchSize = batchSize;
+   fReadPos = pos;
+
+   // session flag is used in operator initialize
+   if (isEnabled(Options::kNoSession)) {
+      fUseSession = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (isEnabled(Options::kNoWeightFile)) {
+      fUseWeightFile = false;
+      fWeightFile = WeightFileType::None;
+   }
+   // evaluated last, so it takes precedence over the two flags above
+   if (isEnabled(Options::kRootBinaryWeightFile)) {
+      fUseWeightFile = true;
+      fWeightFile = WeightFileType::RootBinary;
+   }
+   if (fUseWeightFile && !fUseSession) {
+      throw std::runtime_error(
+         "sofie: RModel::Generate: cannot use a separate weight file without generating a Session class");
+   }
+
+   if (isEnabled(Options::kGNN))
+      fIsGNN = true;
+   if (isEnabled(Options::kGNNComponent))
+      fIsGNNComponent = true;
+
+   if (fProfile)
+      RModelProfiler::AddNeededStdLibs(*this);
+
+   // initialize the model including all operators and sub-graphs
+   Initialize(batchSize, verbose);
+
+   // a model with dynamic tensors always needs a Session
+   if (!fDynamicTensorInfos.empty()) {
+      fUseSession = true;
+      if (verbose)
+         std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl;
+   }
+
+   // the header is written only for a top-level model (neither GNN component nor sub-graph)
+   std::string hgname;
+   if (!(fIsGNNComponent || fIsSubGraph)) {
+      fGC.clear();
+      GenerateHeaderInfo(hgname);
+   }
+
+   // the code of the sub-graphs comes first
+   for (auto &graph : fSubGraphs) {
+      if (fVerbose)
+         std::cout << "generate session code for subgraph " << graph->fName << std::endl;
+      graph->GenerateSessionCode();
+      fGC += graph->fGC;
+   }
+
+   if (fVerbose)
+      std::cout << "generate Main session code - model  " << fName << std::endl;
+
+   // then the main session code
+   GenerateSessionCode();
+
+   // a top-level model also gets the closing brace and the matching #endif
+   if (!(fIsGNNComponent || fIsSubGraph)) {
+      fGC += ("} //TMVA_SOFIE_" + fName + "\n");
+      fGC += "\n#endif  // " + hgname + "\n";
+   }
+}
+
+/// Append to fGC the code that reads the weight tensors from the weight file.
+///  - WeightFileType::Text: the generated code opens `filename` as a text stream and
+///    calls ReadTensorFromStream for every weight tensor (only FLOAT is supported).
+///    For a GNN component the stream is first positioned at `pos`.
+///  - WeightFileType::RootBinary: the generated code opens `filename` with TFile and
+///    reads each weight tensor (FLOAT, DOUBLE or INT64) from the "<model name>_weights" directory.
+/// Throws std::runtime_error for a tensor type that cannot be read, or when ROOT
+/// binary files are requested but SOFIE_SUPPORT_ROOT_BINARY is not defined.
+void RModel::ReadInitializedTensorsFromFile(long pos) {
+    // generate the code to read initialized tensors from a text data file
+    if (fWeightFile == WeightFileType::Text) {
+        // nothing to generate when no weight file is used
+        if (!fUseWeightFile) return;
+
+        fGC += "   std::ifstream f;\n";
+        fGC += "   f.open(filename);\n";
+        fGC += "   if (!f.is_open()) {\n";
+        fGC += "      throw std::runtime_error(\"sofie failed to open file \" + filename + \" for input weights\");\n";
+        fGC += "   }\n";
+
+        if(fIsGNNComponent) {
+            fGC += "   f.seekg(" + std::to_string(pos) + ");\n";
+        }
+
+        fGC += "   using SOFIE::ReadTensorFromStream;\n";
+
+        // loop on tensors and parse the file
+        for (auto& i: fInitializedTensors) {
+            // skip Constant and shape tensors (not written in a file)
+            if (!i.second.IsWeightTensor()) continue;
+            std::string tensor_name = "tensor_" + i.first;
+            if (i.second.type() == ETensorType::FLOAT) {
+               std::string length = std::to_string(ConvertShapeToLength(i.second.shape()));
+               fGC += "   ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n";
+            } else {
+               throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file");
+            }
+        }
+        fGC += "   f.close();\n";
+    }
+
+    // generate the code to read initialized tensors from a ROOT data file
+    if(fWeightFile == WeightFileType::RootBinary) {
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+        fGC += "  {\n";
+        fGC += "   std::unique_ptr<TFile> rootFile(TFile::Open(filename.c_str(), \"READ\"));\n";
+        // TFile::Open gives back a null pointer when the file cannot be opened, so the
+        // generated code has to test the pointer before calling IsOpen() on it
+        fGC += "   if (!rootFile || !rootFile->IsOpen()) {\n";
+        fGC += "      throw std::runtime_error(\"sofie failed to open ROOT file for input weights\");\n";
+        fGC += "   }\n";
+
+        std::string dirName = fName + "_weights";
+        fGC += "   if (!rootFile->GetKey(\"" + dirName + "\")) {\n";
+        fGC += "      throw std::runtime_error(\"sofie failed to open ROOT directory for input weights\");\n";
+        fGC += "   }\n";
+
+        for (auto &i : fInitializedTensors) {
+            // skip Constant and shape tensors
+            if (!i.second.IsWeightTensor()) continue;
+            fGC += "  {\n";
+            std::string tensor_name = "tensor_" + i.first;
+            // C++ type of the object stored in the file for this tensor
+            std::string vectorType;
+            if (i.second.type() == ETensorType::FLOAT) {
+               vectorType = "std::vector<float>";
+            } else if (i.second.type() == ETensorType::DOUBLE) {
+               vectorType = "std::vector<double>";
+            } else if (i.second.type() == ETensorType::INT64) {
+               vectorType = "std::vector<int64_t>";
+            } else {
+               throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file");
+            }
+            fGC += "      fTensor_" + i.first + " = *reinterpret_cast<" + vectorType + "*>(rootFile->Get(\"";
+            fGC += dirName + "/" + tensor_name + "\"));\n";
+            fGC += "  }\n";
+        }
+        fGC += "  }\n";
+#else
+        throw std::runtime_error("SOFIE was not built with ROOT file support.");
+#endif // SOFIE_SUPPORT_ROOT_BINARY
+    }
+}
+
+/// Write the weight tensors of the model to `filename`.
+/// An empty `filename` is replaced by fFileName plus the extension matching the
+/// weight file type (".root" for RootBinary, ".dat" otherwise).
+///  - RootBinary: FLOAT, DOUBLE and INT64 tensors are stored as std::vector objects
+///    in the "<model name>_weights" directory of the ROOT file; returns -1.
+///  - Text: FLOAT tensors are written as "<name> <length>" followed by the values;
+///    returns the stream position after the last write. A GNN component appends to the file.
+///  - any other weight file type: nothing is written and -1 is returned.
+/// Throws std::runtime_error when the file cannot be opened or written, for an
+/// unsupported tensor type, for GNN models with RootBinary, or when RootBinary is
+/// requested but SOFIE_SUPPORT_ROOT_BINARY is not defined.
+long RModel::WriteInitializedTensorsToFile(std::string filename) {
+    // Determine the file extension based on the weight file type
+    std::string fileExtension;
+    switch (fWeightFile) {
+    case WeightFileType::None:
+        fileExtension = ".dat";
+        break;
+    case WeightFileType::RootBinary:
+        fileExtension = ".root";
+        break;
+    case WeightFileType::Text:
+        fileExtension = ".dat";
+        break;
+    }
+
+    // If filename is empty, build it from fFileName and the extension
+    if (filename.empty()) {
+        filename = fFileName + fileExtension;
+    }
+
+    // Write the initialized tensors to the file
+    if (fWeightFile == WeightFileType::RootBinary) {
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+        if(fIsGNNComponent || fIsGNN) {
+            throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file.");
+        }
+        std::unique_ptr<TFile> outputFile(TFile::Open(filename.c_str(), "UPDATE"));
+        // TFile::Open gives back a null pointer when the file cannot be opened
+        if (!outputFile || !outputFile->IsOpen())
+            throw std::runtime_error("sofie failed to open ROOT file " + filename + " for tensor weight data");
+
+        std::string dirName = fName + "_weights";
+        // check if directory exists, in case delete to replace with new one
+        if (outputFile->GetKey(dirName.c_str()))
+            outputFile->rmdir(dirName.c_str());
+
+        auto outputDir = outputFile->mkdir(dirName.c_str());
+        if (!outputDir)
+            throw std::runtime_error("sofie failed to create directory " + dirName + " in ROOT file " + filename);
+
+        for (const auto& item : fInitializedTensors) {
+            // skip Constant tensors and tensors which are not writable (e.g. shape tensors)
+            if (!item.second.IsWeightTensor()) continue;
+            std::string tensorName = "tensor_" + item.first;
+            size_t length = ConvertShapeToLength(item.second.shape());
+            // copy the data into a std::vector of the matching type and store that object
+            if(item.second.type() == ETensorType::FLOAT) {
+               const float* data = item.second.data<float>();
+               std::vector<float> tensorDataVector(data, data + length);
+               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<float>", tensorName.c_str());
+            }
+            else if(item.second.type() == ETensorType::DOUBLE) {
+               const double* data = item.second.data<double>();
+               std::vector<double> tensorDataVector(data, data + length);
+               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<double>", tensorName.c_str());
+            }
+            else if(item.second.type() == ETensorType::INT64) {
+               const int64_t* data = item.second.data<int64_t>();
+               std::vector<int64_t> tensorDataVector(data, data + length);
+               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<int64_t>", tensorName.c_str());
+            }
+            else {
+               throw std::runtime_error("sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) +
+                                  " cannot be written to a ROOT file");
+            }
+        }
+        outputFile->Write(filename.c_str());
+
+        // this needs to be changed, similar to the text file
+        return -1;
+
+#else
+        throw std::runtime_error("SOFIE was not built with ROOT file support.");
+#endif // SOFIE_SUPPORT_ROOT_BINARY
+    } else if (fWeightFile == WeightFileType::Text) {
+        std::ofstream f;
+        if(fIsGNNComponent) {
+            // appending all GNN components into the same file
+            f.open(filename, std::ios::app);
+        } else {
+            f.open(filename);
+        }
+        if (!f.is_open())
+            throw
+            std::runtime_error("sofie failed to open file " + filename + " for tensor weight data");
+        for (auto& i: fInitializedTensors) {
+            // skip Constant tensors and not writable tensors (e.g. shape tensors)
+            if (!i.second.IsWeightTensor()) {
+               continue;
+            }
+            size_t length = ConvertShapeToLength(i.second.shape());
+            std::string tensor_name = "tensor_" + i.first;
+            // header line: tensor name and number of values
+            f << tensor_name << " " << length << "\n";
+            if (i.second.type() == ETensorType::FLOAT) {
+               const float * data = i.second.data<float>();
+               for (size_t idx = 0; idx < length; idx++) {
+                  // round to zero sub-normal values
+                  float value = data[idx];
+                  if (value != 0. && std::abs(value) < std::numeric_limits<float>::min() ) value = 0;
+                  // handle non-finite values explicitly
+                  if (std::isinf(value))
+                     f << (value > 0 ? "inf" : "-inf");
+                  else if (std::isnan(value))
+                     f << "nan";
+                  else
+                     f << std::setprecision(std::numeric_limits<float>::max_digits10) << value;
+                  // values are separated by blanks, the last one ends the line
+                  f <<  ( (idx < length-1) ? " " : "\n" );
+               }
+            }
+            else {
+               throw std::runtime_error("sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file");
+            }
+            if (f.fail())
+               throw std::runtime_error("sofie failed to write tensor data to file for  " + tensor_name);
+        }
+        long curr_pos = f.tellp();
+        f.close();
+        return curr_pos;
+    } else {
+        return -1;
+    }
+}
+
+/// Print to std::cout one line per operator: its index, its name (the part of the
+/// typeid name that follows "ROperator_"), its input tensors and its output tensors.
+void RModel::PrintSummary() const {
+   std::cout << "Summary of model " << GetName() << std::endl;
+   const std::string marker = "ROperator_";
+   for(size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx){
+      auto& r = *fOperators[op_idx].get();
+      std::string raw_name =  typeid(r).name();
+      // Keep what follows "ROperator_". When the marker is absent find() returns npos,
+      // and adding an offset to npos would wrap around, so print the raw name instead.
+      const auto markerPos = raw_name.find(marker);
+      std::string name = (markerPos == std::string::npos) ? raw_name : raw_name.substr(markerPos + marker.size());
+      std::cout <<  op_idx << "  " << name << "  :  ";
+      for (auto & t_in : r.GetOpInputTensors()) std::cout << t_in << "  ";
+      std::cout << " ----> ";
+      for (auto & t_out : r.GetOpOutputTensors()) std::cout << t_out << "  ";
+      std::cout << std::endl;
+   }
+}
+
+/// Append to the generated code (fGC) constexpr descriptions of the shapes of
+/// the model's input and output tensors, so that a session can validate the
+/// tensors it is given. For each tensor a `dim_<name>` array of SingleDim is
+/// emitted, followed by the `inputTensorDims` / `outputTensorDims` tables and
+/// the `hasDynamicInputTensors` / `hasDynamicOutputTensors` flags.
+void RModel::GenerateRequiredInputTensorInfo()
+{
+   // Emits `constexpr std::array<SingleDim, N> dim_<name>{...};` for one tensor.
+   // Parametric dimensions are written as quoted strings, fixed ones verbatim.
+   auto appendDimArray = [this](auto const &tensorName) {
+      std::vector<Dim> dims = this->GetDimTensorShape(tensorName);
+      this->fGC += "constexpr std::array<SingleDim, " + std::to_string(dims.size()) + "> dim_" + tensorName + "{";
+      std::size_t remaining = dims.size();
+      for (auto const &d : dims) {
+         if (d.isParam)
+            this->fGC += "SingleDim{\"" + d.GetVal() + "\"}";
+         else
+            this->fGC += "SingleDim{" + d.GetVal() + "}";
+         // separator between entries, none after the last one
+         if (--remaining != 0)
+            this->fGC += ", ";
+      }
+      this->fGC += "};\n";
+   };
+
+   // Emits the table `constexpr std::array<TensorDims, N> <tableName>{ makeDims(dim_x), ... };`.
+   auto appendDimsTable = [this](std::string const &tableName, auto const &tensorNames) {
+      const std::size_t nTensors = tensorNames.size();
+      this->fGC += "\nconstexpr std::array<TensorDims, " + std::to_string(nTensors) + "> " + tableName + "{\n";
+      for (std::size_t k = 0; k < nTensors; ++k) {
+         this->fGC += SP + "makeDims(dim_" + tensorNames[k] + ")";
+         this->fGC += (k + 1 == nTensors) ? "\n" : ",\n";
+      }
+      this->fGC += "};\n";
+   };
+
+   // Emits `constexpr bool <flagName>{true|false};`.
+   auto appendFlag = [this](std::string const &flagName, bool value) {
+      this->fGC += "\nconstexpr bool " + flagName + "{" + std::string{value ? "true" : "false"} + "};\n\n";
+   };
+
+   fGC += "\n// Input tensor dimensions\n"
+          "using SOFIE::SingleDim;\n"
+          "using SOFIE::TensorDims;\n"
+          "using SOFIE::makeDims;\n\n";
+
+   bool anyDynamicInput = false;
+   for (auto const &inputName : fInputTensorNames) {
+      if (IsDimInputTensor(inputName))
+         anyDynamicInput = true;
+      appendDimArray(inputName);
+   }
+   appendDimsTable("inputTensorDims", fInputTensorNames);
+   appendFlag("hasDynamicInputTensors", anyDynamicInput);
+
+   fGC += "\n// Output tensor dimensions\n";
+   bool anyDynamicOutput = false;
+   for (auto const &outputName : fOutputTensorNames) {
+      if (IsDynamicTensor(outputName))
+         anyDynamicOutput = true;
+      appendDimArray(outputName);
+   }
+   appendDimsTable("outputTensorDims", fOutputTensorNames);
+   appendFlag("hasDynamicOutputTensors", anyDynamicOutput);
+}
+
+/// Print name, type and shape of every model input to std::cout: first the
+/// entries of fInputTensorInfos (whose dimensions may be parameters), then the
+/// entries of fReadyInputTensorInfos.
+void RModel::PrintRequiredInputTensors() const {
+    std::cout << "Model requires following inputs:\n";
+
+    for (const auto &entry : fInputTensorInfos) {
+        const auto &dims = entry.second.shape;
+        std::cout << "Parametrised Tensor name: " << entry.first << "\t";
+        std::cout << "type: " << ConvertTypeToString(entry.second.type) << "\t";
+        std::cout << "shape: [";
+        for (size_t k = 0; k < dims.size(); ++k) {
+            // comma goes between entries, so it is written before all but the first
+            if (k != 0) std::cout << ",";
+            if (dims[k].isParam)
+                std::cout << dims[k].param;
+            else
+                std::cout << dims[k].dim ;
+        }
+        std::cout << "]" << std::endl;
+    }
+
+    for (const auto &entry : fReadyInputTensorInfos) {
+        const auto &dims = entry.second.shape;
+        std::cout << "Fully Specified Tensor name: " << entry.first << "\t";
+        std::cout << "type: " << ConvertTypeToString(entry.second.type) << "\t";
+        std::cout << "shape: [";
+        for (size_t k = 0; k < dims.size(); ++k) {
+            if (k != 0) std::cout << ",";
+            std::cout << dims[k];
+        }
+        std::cout << "]" << std::endl;
+    }
+    std::cout << "\n";
+}
+
+/// Print name, type and shape of every initialized tensor to std::cout,
+/// flagging those reported as constant and/or not writable.
+void RModel::PrintInitializedTensors() const {
+    std::cout << "Model initialized the following tensors:\n";
+    for (const auto &entry : fInitializedTensors) {
+        const auto &tensor = entry.second;
+        std::cout << "Tensor name: \"" << entry.first << "\"\t";
+        std::cout << "type: " << ConvertTypeToString(tensor.type()) << "\t";
+        std::cout << "shape: [";
+        const size_t rank = tensor.shape().size();
+        for (size_t k = 0; k < rank; ++k) {
+            if (k != 0) std::cout << ",";
+            std::cout << tensor.shape()[k];
+        }
+        std::cout << "]";
+        if (tensor.IsConstantTensor()) std::cout << " (Constant)";
+        if (tensor.IsNotWritable()) std::cout << " (Not Writable)";
+        std::cout << std::endl;
+    }
+    std::cout << "\n";
+}
+
+/// Print name, type and shape of every entry of fIntermediateTensorInfos to std::cout.
+void RModel::PrintIntermediateTensors() const {
+    std::cout << "Model specify the following intermediate tensors:\n";
+    for (const auto &entry : fIntermediateTensorInfos) {
+        const auto &dims = entry.second.shape;
+        std::cout << "Tensor name: \"" << entry.first << "\"\t";
+        std::cout << "type: " << ConvertTypeToString(entry.second.type) << "\t";
+        std::cout << "shape: [";
+        for (size_t k = 0; k < dims.size(); ++k) {
+            if (k != 0) std::cout << ",";
+            std::cout << dims[k];
+        }
+        std::cout << "]" << std::endl;
+    }
+    std::cout << "\n";
+}
+
+/// Print name, type and shape of every entry of fDynamicTensorInfos to
+/// std::cout; each dimension is printed through its GetVal() value.
+void RModel::PrintDynamicTensors() const {
+    std::cout << "Model specify the following dynamic tensors:\n";
+    for (const auto &entry : fDynamicTensorInfos) {
+        const auto &dims = entry.second.shape;
+        std::cout << "Tensor name: \"" << entry.first << "\"\t";
+        std::cout << "type: " << ConvertTypeToString(entry.second.type) << "\t";
+        std::cout << "shape: [";
+        for (size_t k = 0; k < dims.size(); ++k) {
+            if (k != 0) std::cout << ",";
+            std::cout << dims[k].GetVal();
+        }
+        std::cout << "]" << std::endl;
+    }
+    std::cout << "\n";
+}
+
+/// Print the name of every model output tensor to std::cout together with its
+/// shape; if looking up or formatting the shape throws, a placeholder message
+/// is printed instead.
+void RModel::PrintOutputTensors() const {
+    std::cout << "Model specify the following output tensors:\n";
+    for (const auto &outputName : fOutputTensorNames) {
+        std::cout << "Tensor name: \"" << outputName << "\"\t";
+        try {
+            auto dims = GetDimTensorShape(outputName);
+            std::cout << "with shape: " << ConvertDimShapeToString(dims) << std::endl;
+        }
+        catch (...) {
+            // any failure is reported as "shape not available" rather than propagated
+            std::cout << "with shape not yet defined" << std::endl;
+        }
+    }
+    std::cout << "\n";
+}
+
+/// Print name, type and shape of the initialized tensor `name`, followed by
+/// its first `n_print` values.
+///
+/// \param name     key of the tensor in fInitializedTensors; a message is
+///                 printed and nothing else happens when it is not found.
+/// \param n_print  number of leading values to show; values <= 0 show none and
+///                 values beyond the tensor length are clamped to it.
+///
+/// Values are only printed for FLOAT tensors; for other types the data block
+/// stays empty. ", ..." is appended only when values were actually left out.
+void RModel::HeadInitializedTensors(std::string name, int n_print) {
+    auto it = fInitializedTensors.find(name);
+    if (it == fInitializedTensors.end()) {
+        std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl;
+        return;
+    }
+
+    std::cout << "Tensor name: " << it->first << "\t";
+    std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t";
+    // size_t accumulator: the previous int could overflow for large tensors
+    size_t length = 1;
+    std::cout << "shape: [";
+    for (size_t i = 0; i < it->second.shape().size(); i++) {
+        std::cout << it->second.shape()[i];
+        length *= it->second.shape()[i];
+        if (i < it->second.shape().size() - 1) std::cout << ",";
+    }
+    std::cout << "]" << std::endl;
+
+    // Clamp the request to [0, length]. The ellipsis is dropped as soon as the
+    // whole tensor is shown (previously n_print == length still printed it).
+    size_t n_shown = (n_print > 0) ? static_cast<size_t>(n_print) : 0;
+    bool ellipsis = true;
+    if (n_shown >= length) {
+        n_shown = length;
+        ellipsis = false;
+    }
+
+    std::cout << "data: [" << std::endl;
+    if (it->second.type() == ETensorType::FLOAT) {
+        auto converted_data = it->second.data<float>();
+        for (size_t i = 0; i < n_shown; i++) {
+            std::cout << converted_data[i];
+            if (i + 1 < n_shown) std::cout << " ,";
+        }
+    }
+    if (ellipsis) std::cout << ", ...";
+    std::cout << "]" << std::endl;
+
+}
+
+/// Forward to RModel_Base::OutputGenerated and, when a weight file is used,
+/// write the initialized tensors to a companion file.
+///
+/// The weight file name is derived from `filename` by swapping its ".hxx" for
+/// ".dat" (text weights) or ".root" (ROOT binary weights). If `filename` does
+/// not contain ".hxx" the extension is appended. If `filename` is empty the
+/// model name fName is used as the stem.
+///
+/// \param filename  name passed to the base class; may be empty.
+/// \param append    forwarded unchanged to the base class.
+void RModel::OutputGenerated(std::string filename, bool append) {
+
+    RModel_Base::OutputGenerated(filename, append);
+
+    // write weights in a companion file
+    if (fUseWeightFile) {
+        if (!filename.empty()) {
+            size_t pos = filename.find(".hxx");
+            // No ".hxx" in the name: point one past the end so that replace/erase
+            // below append the new extension. Calling them with npos would throw
+            // std::out_of_range.
+            if (pos == std::string::npos)
+                pos = filename.size();
+            if (fWeightFile == WeightFileType::Text)
+                filename.replace(pos, 4, ".dat");
+            if (fWeightFile == WeightFileType::RootBinary)  {
+                filename = filename.erase(pos, 4);
+                filename += ".root";
+            }
+        } else {
+            filename = fName;
+            filename += fWeightFile == WeightFileType::Text ? ".dat" : ".root";
+        }
+        WriteInitializedTensorsToFile(filename);
+    }
+}
+
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+/// Custom ROOT streamer. When writing, every initialized tensor is converted
+/// with CastSharedToPersistent() before the object is written to the buffer;
+/// when reading, the object is read first and every initialized tensor is then
+/// converted back with CastPersistentToShared().
+void RModel::Streamer(TBuffer &R__b) {
+    if (!R__b.IsReading()) {
+        // writing: convert first, then stream the object out
+        for (auto &entry : fInitializedTensors)
+            entry.second.CastSharedToPersistent();
+        RModel::Class()->WriteBuffer(R__b, this);
+        return;
+    }
+
+    // reading: stream the object in, then convert
+    RModel::Class()->ReadBuffer(R__b, this);
+    for (auto &entry : fInitializedTensors)
+        entry.second.CastPersistentToShared();
+}
+#endif
+
+}//SOFIE
diff --git a/core/src/RModelProfiler.cxx b/core/src/RModelProfiler.cxx
new file mode 100644
index 0000000..25efbd2
--- /dev/null
+++ b/core/src/RModelProfiler.cxx
@@ -0,0 +1,121 @@
+#include "SOFIE/RModelProfiler.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+
+namespace SOFIE {
+
+/// Register with `model` the standard headers used by the profiling code
+/// that this class generates.
+void RModelProfiler::AddNeededStdLibs(RModel &model)
+{
+   // registered in this exact order
+   static const char *const kHeaders[] = {"chrono",  "vector",    "string", "map",  "iostream",
+                                          "iomanip", "algorithm", "cmath",  "tuple"};
+   for (const char *header : kHeaders)
+      model.AddNeededStdLib(header);
+}
+
+/// Return the generated declaration of the session data member that stores,
+/// per operator name, the list of measured execution times.
+std::string RModelProfiler::GenerateSessionMembers()
+{
+   return std::string("// Maps an operator name to a vector of its execution times (in microseconds).\n") +
+          "mutable std::map<std::string, std::vector<double>> fProfilingResults;\n\n";
+}
+
+/// Return the source of the profiling helper methods added to the generated
+/// session class:
+///  - PrintProfilingResults(order): prints, per operator, the mean time and the
+///    standard error of the mean, optionally sorted by decreasing mean;
+///  - ResetProfilingResults(): discards all collected samples;
+///  - GetOpAvgTime(): mean time per operator;
+///  - GetOpVariance(): population variance (divides by N) per operator.
+///
+/// The variance terms are computed from sums of squares, which can come out
+/// slightly negative through floating-point cancellation when the samples are
+/// nearly identical. They are therefore clamped at 0 so the report never shows
+/// NaN or a negative variance. The generated code relies on <algorithm> and
+/// <cmath>, which AddNeededStdLibs registers.
+std::string RModelProfiler::GenerateUtilityFunctions()
+{
+   std::string gc;
+
+   gc += "   // Print profiling results sorted by average time (highest first).\n";
+   gc += "   void PrintProfilingResults(bool order = true) const {\n";
+   gc += "      if (fProfilingResults.empty()) {\n";
+   gc += "         std::cout << \"No profiling results to display.\" << std::endl;\n";
+   gc += "         return;\n";
+   gc += "      }\n";
+   gc += "      std::vector<std::tuple<std::string, double, double, int>> averageResults;\n";
+   gc += "      std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n";
+   gc += "      std::cout << \"            CPU PROFILING RESULTS\" << std::endl;\n";
+   gc += "      std::cout << std::string(60, '=') << std::endl;\n";
+   gc += "      for (const auto& op : fProfilingResults) {\n";
+   gc += "         double sum = 0.0, sum2 = 0.0;\n";
+   gc += "         for (double time : op.second) { sum += time; sum2 += time*time; }\n";
+   gc += "         double average = sum / op.second.size();\n";
+   // sample standard deviation (N-1); clamp guards against a negative rounding residue
+   gc += "         double stddev = (op.second.size() > 1) ? std::sqrt(std::max(0.0, sum2 - sum*average) / (op.second.size()-1)) : 0.0;\n";
+   gc += "         averageResults.push_back({op.first, average, stddev, (int)op.second.size()});\n";
+   gc += "      }\n";
+   gc += "      if (order) {\n";
+   gc += "         std::sort(averageResults.begin(), averageResults.end(),\n";
+   gc += "            [](const auto& a, const auto& b){ return std::get<1>(a) > std::get<1>(b); });\n";
+   gc += "      }\n";
+   gc += "      for (const auto& r : averageResults) {\n";
+   gc += "         std::cout << \"  \" << std::left << std::setw(30) << std::get<0>(r)\n";
+   gc += "                   << \": \" << std::fixed << std::setprecision(3) << std::get<1>(r)\n";
+   // the printed uncertainty is stddev / sqrt(N), i.e. the standard error of the mean
+   gc += "                   << \" +/- \" << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n";
+   gc += "                   << \"  (\" << std::get<3>(r) << \" runs)\" << std::endl;\n";
+   gc += "      }\n";
+   gc += "      std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n";
+   gc += "   }\n\n";
+
+   gc += "   void ResetProfilingResults() {\n";
+   gc += "      fProfilingResults.clear();\n";
+   gc += "   }\n\n";
+
+   gc += "   std::map<std::string, double> GetOpAvgTime() const {\n";
+   gc += "      std::map<std::string, double> avg;\n";
+   gc += "      for (const auto& op : fProfilingResults) {\n";
+   gc += "         double sum = 0.0;\n";
+   gc += "         for (double t : op.second) sum += t;\n";
+   gc += "         avg[op.first] = sum / op.second.size();\n";
+   gc += "      }\n";
+   gc += "      return avg;\n";
+   gc += "   }\n\n";
+
+   gc += "   std::map<std::string, double> GetOpVariance() const {\n";
+   gc += "      std::map<std::string, double> variance;\n";
+   gc += "      for (const auto& op : fProfilingResults) {\n";
+   gc += "         double mean = 0.0, mean2 = 0.0;\n";
+   gc += "         for (double t : op.second) { mean += t; mean2 += t*t; }\n";
+   gc += "         mean /= op.second.size(); mean2 /= op.second.size();\n";
+   // E[t^2] - E[t]^2, clamped so rounding can never yield a negative variance
+   gc += "         variance[op.first] = std::max(0.0, mean2 - mean*mean);\n";
+   gc += "      }\n";
+   gc += "      return variance;\n";
+   gc += "   }\n\n";
+
+   return gc;
+}
+
+/// Return the code placed at the start of the generated inference function:
+/// it declares the two time points, starts the overall timer and binds a local
+/// reference to the session's fProfilingResults.
+std::string RModelProfiler::GenerateBeginInferCode()
+{
+   static const char *const kPrologue =
+      "   // Profiling timers\n"
+      "   std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n"
+      "   tp_overall_start = std::chrono::steady_clock::now();\n"
+      "   auto & fProfilingResults = session.fProfilingResults;\n\n";
+   return kPrologue;
+}
+
+/// Return the code of operator `op` (as produced by op.Generate for index
+/// `op_idx`) wrapped in timing statements that append the elapsed time, in
+/// microseconds, to fProfilingResults under the operator's name.
+std::string RModelProfiler::GenerateOperatorCode(ROperator &op, size_t op_idx)
+{
+   std::string code = "   // -- Profiling operator: " + op.Name() + " --\n";
+   code.append("   tp_start = std::chrono::steady_clock::now();\n");
+   code += op.Generate(std::to_string(op_idx));
+   code.append("\n   fProfilingResults[\"" + op.Name() + "\"].push_back(\n");
+   code.append("      std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(\n"
+               "         std::chrono::steady_clock::now() - tp_start).count());\n\n");
+   return code;
+}
+
+/// Return the code placed at the end of the generated inference function: it
+/// appends the total elapsed time, in microseconds, under the key "Overall_Time".
+std::string RModelProfiler::GenerateEndInferCode()
+{
+   static const char *const kEpilogue =
+      "   // -- Record overall inference time --\n"
+      "   fProfilingResults[\"Overall_Time\"].push_back(\n"
+      "      std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(\n"
+      "         std::chrono::steady_clock::now() - tp_overall_start).count());\n";
+   return kEpilogue;
+}
+
+} // namespace SOFIE
diff --git a/core/src/RModelProfilerGPU.cxx b/core/src/RModelProfilerGPU.cxx
new file mode 100644
index 0000000..bf946b5
--- /dev/null
+++ b/core/src/RModelProfilerGPU.cxx
@@ -0,0 +1,184 @@
+#include "SOFIE/RModelProfilerGPU.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+
+namespace SOFIE {
+
+/// Register with `model` the standard headers used by the GPU profiling code
+/// that this class generates.
+void RModelProfilerGPU::AddNeededStdLibs(RModel &model)
+{
+   // registered in this exact order
+   static const char *const kHeaders[] = {"chrono",  "vector",    "string", "map",  "iostream",
+                                          "iomanip", "algorithm", "cmath",  "tuple"};
+   for (const char *header : kHeaders)
+      model.AddNeededStdLib(header);
+}
+
+/// Return the generated declaration of the session data member that stores,
+/// per operator name, the list of measured GPU execution times.
+std::string RModelProfilerGPU::GenerateSessionMembers()
+{
+   return std::string("// Maps operator name to GPU execution times (microseconds, wall-clock with sync).\n") +
+          "mutable std::map<std::string, std::vector<double>> fProfilingResults;\n\n";
+}
+
+/// Return the source of the profiling helper methods added to the generated
+/// GPU session class:
+///  - PrintProfilingResults(order): prints, per operator, the mean time and the
+///    standard error of the mean, optionally sorted by decreasing mean;
+///  - ResetProfilingResults(): discards all collected samples;
+///  - GetOpAvgTime(): mean time per operator.
+///
+/// The variance term is computed from sums of squares, which can come out
+/// slightly negative through floating-point cancellation when the samples are
+/// nearly identical. It is therefore clamped at 0 so the report never shows
+/// NaN. The generated code relies on <algorithm> and <cmath>, which
+/// AddNeededStdLibs registers.
+std::string RModelProfilerGPU::GenerateUtilityFunctions()
+{
+   std::string gc;
+
+   gc += "   // Print GPU profiling results sorted by average time (highest first).\n";
+   gc += "   void PrintProfilingResults(bool order = true) const {\n";
+   gc += "      if (fProfilingResults.empty()) {\n";
+   gc += "         std::cout << \"No GPU profiling results to display.\" << std::endl;\n";
+   gc += "         return;\n";
+   gc += "      }\n";
+   gc += "      std::vector<std::tuple<std::string, double, double, int>> averageResults;\n";
+   gc += "      for (const auto& op : fProfilingResults) {\n";
+   gc += "         double sum = 0.0, sum2 = 0.0;\n";
+   gc += "         for (double t : op.second) { sum += t; sum2 += t*t; }\n";
+   gc += "         double average = sum / op.second.size();\n";
+   // sample standard deviation (N-1); clamp guards against a negative rounding residue
+   gc += "         double stddev = (op.second.size() > 1) ? std::sqrt(std::max(0.0, sum2 - sum*average) / (op.second.size()-1)) : 0.0;\n";
+   gc += "         averageResults.push_back({op.first, average, stddev, (int)op.second.size()});\n";
+   gc += "      }\n";
+   gc += "      if (order) {\n";
+   gc += "         std::sort(averageResults.begin(), averageResults.end(),\n";
+   gc += "            [](const auto& a, const auto& b){ return std::get<1>(a) > std::get<1>(b); });\n";
+   gc += "      }\n";
+   gc += "      std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n";
+   gc += "      std::cout << \"           GPU PROFILING RESULTS\" << std::endl;\n";
+   gc += "      std::cout << \"   (wall-clock with alpaka::wait synchronization)\" << std::endl;\n";
+   gc += "      std::cout << std::string(60, '=') << std::endl;\n";
+   gc += "      for (const auto& r : averageResults) {\n";
+   gc += "         std::cout << \"  \" << std::left << std::setw(30) << std::get<0>(r)\n";
+   gc += "                   << \": \" << std::fixed << std::setprecision(3) << std::get<1>(r)\n";
+   // the printed uncertainty is stddev / sqrt(N), i.e. the standard error of the mean
+   gc += "                   << \" +/- \" << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n";
+   gc += "                   << \"  (\" << std::get<3>(r) << \" runs)\" << std::endl;\n";
+   gc += "      }\n";
+   gc += "      std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n";
+   gc += "   }\n\n";
+
+   gc += "   void ResetProfilingResults() {\n";
+   gc += "      fProfilingResults.clear();\n";
+   gc += "   }\n\n";
+
+   gc += "   std::map<std::string, double> GetOpAvgTime() const {\n";
+   gc += "      std::map<std::string, double> avg;\n";
+   gc += "      for (const auto& op : fProfilingResults) {\n";
+   gc += "         double sum = 0.0;\n";
+   gc += "         for (double t : op.second) sum += t;\n";
+   gc += "         avg[op.first] = sum / op.second.size();\n";
+   gc += "      }\n";
+   gc += "      return avg;\n";
+   gc += "   }\n\n";
+
+   return gc;
+}
+
+/// Compute the byte counts of the model's tensors from their shapes and types.
+///
+/// Initialized tensors flagged as not writable are ignored. Every other
+/// initialized tensor is counted in weightDeviceBytes and, additionally, in
+/// weightTensorBytes when it is read from the weight file or in
+/// constantTensorBytes otherwise. intermediateCPUBytes is copied from
+/// model.fOtherTensorSize. intermediateGPUBytes sums all intermediate tensors
+/// that are not listed in model.fFusionIntermediateTensors.
+RModelProfilerGPU::MemoryInfo RModelProfilerGPU::ComputeMemoryInfo(const RModel &model)
+{
+   MemoryInfo info;
+
+   for (const auto &entry : model.fInitializedTensors) {
+      const auto &tensor = entry.second;
+      if (tensor.IsNotWritable())
+         continue;
+
+      const size_t nBytes = ConvertShapeToLength(tensor.shape()) * GetTypeSize(tensor.type());
+      const bool readFromWeightFile = model.fUseWeightFile && !tensor.IsConstantTensor();
+      if (readFromWeightFile) {
+         // loaded from .dat into temp CPU vector then H2D
+         info.weightTensorBytes += nBytes;
+      } else {
+         // embedded as C++ array in generated code
+         info.constantTensorBytes += nBytes;
+      }
+      // Every initialized tensor (constant or weight file) gets its own GPU device buffer.
+      info.weightDeviceBytes += nBytes;
+   }
+
+   // CPU intermediate memory pool (0 in the GPU path — intermediates live on device)
+   info.intermediateCPUBytes = model.fOtherTensorSize;
+
+   // GPU intermediate device buffers. Tensors internal to a fused kernel are
+   // skipped: they share the fused kernel's input/output buffers and are never
+   // separately allocated on the device.
+   for (const auto &entry : model.fIntermediateTensorInfos) {
+      if (model.fFusionIntermediateTensors.count(entry.first) != 0)
+         continue;
+      const size_t nElements = ConvertShapeToLength(entry.second.shape);
+      info.intermediateGPUBytes += nElements * GetTypeSize(entry.second.type);
+   }
+
+   return info;
+}
+
+/// Return the source of a `PrintMemoryInfo()` method for the generated
+/// session class. The byte counts in `info` are baked into the generated code
+/// as literals, so the report reflects code-generation time, not run time.
+///
+/// \param info  byte counts to report, e.g. as returned by ComputeMemoryInfo.
+std::string RModelProfilerGPU::GenerateMemoryReport(const MemoryInfo &info)
+{
+   // Format a byte count as megabytes. std::to_string(double) always yields
+   // six decimals; the text is cut to six characters as before, except that the
+   // integer part is never cut: values with five or more integer digits are
+   // shown without decimals. The former plain substr(0, 6) reported e.g.
+   // 1234567 MB as "123456" and left a trailing dot on five-digit values.
+   auto toMBString = [](size_t bytes) -> std::string {
+      const std::string text = std::to_string(bytes / (1024.0 * 1024.0));
+      const size_t dot = text.find('.');
+      const size_t keep = (dot != std::string::npos && dot >= 5) ? dot : 6;
+      return text.substr(0, keep);
+   };
+
+   // One generated report row: `std::cout << "<label><N> bytes  (<MB> MB)" << std::endl;`
+   auto row = [&toMBString](const std::string &label, size_t bytes) -> std::string {
+      return "      std::cout << \"" + label + std::to_string(bytes) + " bytes  (" + toMBString(bytes) +
+             " MB)\" << std::endl;\n";
+   };
+
+   size_t totalCPU = info.constantTensorBytes + info.weightTensorBytes + info.intermediateCPUBytes;
+   size_t totalGPU = info.weightDeviceBytes + info.intermediateGPUBytes;
+
+   std::string gc;
+   gc += "   // Print memory usage breakdown computed at code-generation time.\n";
+   gc += "   void PrintMemoryInfo() const {\n";
+   gc += "      std::cout << \"\\n\" << std::string(60, '=') << std::endl;\n";
+   gc += "      std::cout << \"              MEMORY USAGE BREAKDOWN\" << std::endl;\n";
+   gc += "      std::cout << std::string(60, '=') << std::endl;\n";
+   gc += "      std::cout << \"  CPU Memory (during session init):\" << std::endl;\n";
+   gc += row("    Constant/embedded tensors : ", info.constantTensorBytes);
+   gc += row("    Weight tensors (.dat file): ", info.weightTensorBytes);
+   gc += row("    Intermediate memory pool  : ", info.intermediateCPUBytes);
+   gc += row("    Total CPU                 : ", totalCPU);
+   gc += "      std::cout << \"  GPU Memory (device buffers):\" << std::endl;\n";
+   gc += row("    Initialized bufs (const+weights): ", info.weightDeviceBytes);
+   gc += row("    Intermediate device bufs  : ", info.intermediateGPUBytes);
+   gc += row("    Total GPU                 : ", totalGPU);
+   gc += "      std::cout << std::string(60, '=') << \"\\n\" << std::endl;\n";
+   gc += "   }\n\n";
+   return gc;
+}
+
+/// Return the code placed at the start of the generated GPU inference
+/// function: it declares the two time points and starts the overall timer.
+std::string RModelProfilerGPU::GenerateBeginInferCode()
+{
+   static const char *const kPrologue =
+      "   // GPU profiling timers\n"
+      "   std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n"
+      "   tp_overall_start = std::chrono::steady_clock::now();\n\n";
+   return kPrologue;
+}
+
+/// Return the GPU code of operator `op` (as produced by
+/// op.Generate_GPU_ALPAKA for index `op_idx`) wrapped in timing statements. An
+/// `alpaka::wait(queue)` is emitted after the operator so that the wall-clock
+/// interval covers the queued work; the elapsed time, in microseconds, is
+/// appended to fProfilingResults under the operator's name.
+std::string RModelProfilerGPU::GenerateOperatorCode(ROperator &op, size_t op_idx)
+{
+   std::string code = "   // -- GPU Profiling operator: " + op.Name() + " --\n";
+   code.append("   tp_start = std::chrono::steady_clock::now();\n");
+   code += op.Generate_GPU_ALPAKA(std::to_string(op_idx));
+   // Force synchronisation so chrono measures actual GPU execution time
+   code.append("   alpaka::wait(queue);\n");
+   code.append("   fProfilingResults[\"" + op.Name() + "\"].push_back(\n");
+   code.append("      std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(\n"
+               "         std::chrono::steady_clock::now() - tp_start).count());\n\n");
+   return code;
+}
+
+/// Return the code placed at the end of the generated GPU inference function:
+/// it appends the total elapsed time, in microseconds, under the key "Overall_Time".
+std::string RModelProfilerGPU::GenerateEndInferCode()
+{
+   static const char *const kEpilogue =
+      "   // -- Record overall GPU inference time --\n"
+      "   fProfilingResults[\"Overall_Time\"].push_back(\n"
+      "      std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(\n"
+      "         std::chrono::steady_clock::now() - tp_overall_start).count());\n";
+   return kEpilogue;
+}
+
+} // namespace SOFIE
diff --git a/core/src/RModel_ALPAKA.cxx b/core/src/RModel_ALPAKA.cxx
new file mode 100644
index 0000000..9e0e84c
--- /dev/null
+++ b/core/src/RModel_ALPAKA.cxx
@@ -0,0 +1,861 @@
+#include <algorithm>
+#include <cctype>
+#include <climits>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+#include "TFile.h"
+#endif
+
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/RModelProfilerGPU.hxx"
+#include "SOFIE/SOFIE_common.hxx"
+#include "SOFIE/ROperator_Gemm.hxx"
+#include "SOFIE/ROperator_LeakyRelu.hxx"
+#include "SOFIE/ROperator_Relu.hxx"
+
+namespace SOFIE {
+
+/// Partition the operators into consecutive groups, chaining together runs of
+/// elementwise operators, and record the result in fEltwiseFusionGroups,
+/// fOpToFusionGroupIdx and fFusionIntermediateTensors (all three are rebuilt
+/// from scratch on every call).
+///
+/// Every operator ends up in exactly one group. A group is extended from op k
+/// to op k+1 only when op k is elementwise with a single output, that output is
+/// consumed by op k+1 alone and is not a model output, and op k+1 is an
+/// unassigned elementwise operator with a single input.
+void RModel::ComputeEltwiseFusionGroups() {
+   fEltwiseFusionGroups.clear();
+   fOpToFusionGroupIdx.clear();
+   fFusionIntermediateTensors.clear();
+
+   // Build tensor -> consumer op indices map
+   std::unordered_map<std::string, std::vector<size_t>> tensorConsumers;
+   for (size_t i = 0; i < fOperators.size(); i++) {
+      for (const auto& name : fOperators[i]->GetOpInputTensors())
+         tensorConsumers[std::string(name)].push_back(i);
+   }
+
+   // Returns true if tensorName is safe to treat as a fusion intermediate:
+   // consumed by exactly one op AND not a model output.
+   auto isFuseSafe = [&](const std::string& tensorName) -> bool {
+      for (const auto& outName : fOutputTensorNames)
+         if (outName == tensorName) return false;
+      auto it = tensorConsumers.find(tensorName);
+      return it != tensorConsumers.end() && it->second.size() == 1;
+   };
+
+   // opAssigned[k] is set once op k has been placed in a group
+   std::vector<bool> opAssigned(fOperators.size(), false);
+
+   for (size_t i = 0; i < fOperators.size(); i++) {
+      if (opAssigned[i]) continue;
+      opAssigned[i] = true;
+
+      // Op i starts a new group; its first input (or "" if it has none) is the group input
+      EltwiseFusionGroup group;
+      group.opIndices.push_back(i);
+
+      auto firstInputs = fOperators[i]->GetOpInputTensors();
+      group.inputTensor = firstInputs.empty() ? "" : std::string(firstInputs[0]);
+
+      // Extend chain: only if CURRENT op is elementwise and its single output can be fused
+      size_t current = i;
+      while (fOperators[current]->IsElementwise()) {
+         auto curOutputs = fOperators[current]->GetOpOutputTensors();
+         if (curOutputs.size() != 1) break;
+         std::string curOut = std::string(curOutputs[0]);
+         if (!isFuseSafe(curOut)) break;
+
+         // isFuseSafe guarantees the entry exists and holds exactly one consumer
+         size_t nextIdx = tensorConsumers.find(curOut)->second[0];
+         // Must be strictly the next op in sequence and itself elementwise with single input
+         if (nextIdx != current + 1) break;
+         if (opAssigned[nextIdx]) break;
+         if (!fOperators[nextIdx]->IsElementwise()) break;
+         auto nextInputs = fOperators[nextIdx]->GetOpInputTensors();
+         if (nextInputs.size() != 1) break;
+
+         opAssigned[nextIdx] = true;
+         group.opIndices.push_back(nextIdx);
+         current = nextIdx;
+      }
+
+      // Output tensor is the last op's output
+      auto lastOutputs = fOperators[current]->GetOpOutputTensors();
+      group.outputTensor = lastOutputs.empty() ? "" : std::string(lastOutputs[0]);
+
+      // Element count from intermediate tensor info (all op outputs are intermediates)
+      // NOTE(review): numElements keeps its default value when the output tensor is
+      // not found in fIntermediateTensorInfos — confirm that default is acceptable.
+      if (!group.outputTensor.empty()) {
+         auto it = fIntermediateTensorInfos.find(group.outputTensor);
+         if (it != fIntermediateTensorInfos.end())
+            group.numElements = ConvertShapeToLength(it->second.shape);
+      }
+
+      // Record, for every op of the group, the index the group will get in fEltwiseFusionGroups
+      size_t gIdx = fEltwiseFusionGroups.size();
+      for (auto opIdx : group.opIndices)
+         fOpToFusionGroupIdx[opIdx] = gIdx;
+
+      // Mark all-but-last outputs as fusion intermediates (skip allocation)
+      if (group.isFused()) {
+         for (size_t k = 0; k + 1 < group.opIndices.size(); k++) {
+            auto midOuts = fOperators[group.opIndices[k]]->GetOpOutputTensors();
+            if (!midOuts.empty())
+               fFusionIntermediateTensors.insert(std::string(midOuts[0]));
+         }
+      }
+
+      fEltwiseFusionGroups.push_back(std::move(group));
+   }
+}
+
+
+/// Fold a Relu or LeakyRelu that directly follows a float Gemm into the Gemm
+/// itself: the activation is set on the Gemm, the Gemm is redirected to write
+/// the activation's output tensor, the tensor name(s) it replaces are recorded
+/// in fFusionIntermediateTensors and the activation operator is added to
+/// fSkipOperators.
+///
+/// A pair (op i, op i+1) is fused only when
+///  - op i is not already skipped and is a Gemm without an activation,
+///  - op i+1 is a Relu or LeakyRelu whose first input is the Gemm's first output,
+///  - that Gemm output has no other consumer and is not a model output
+///    (fusing it away would leave the model output without a producer; this
+///    mirrors the model-output check in ComputeEltwiseFusionGroups).
+void RModel::FuseGemmActivations_GPU() {
+   // number of operators reading each tensor
+   std::unordered_map<std::string, size_t> consumerCount;
+   for (const auto& op : fOperators)
+      for (const auto& inp : op->GetOpInputTensors())
+         ++consumerCount[std::string(inp)];
+
+   const size_t N = fOperators.size();
+   for (size_t i = 0; i + 1 < N; ++i) {
+      if (fSkipOperators.count(i)) continue;
+
+      auto* gemm = dynamic_cast<ROperator_Gemm<float>*>(fOperators[i].get());
+      if (!gemm) continue;
+      if (gemm->GetActivationType() != EActivationType::UNDEFINED) continue;
+
+      auto* lrelu = dynamic_cast<ROperator_LeakyRelu<float>*>(fOperators[i + 1].get());
+      auto* relu  = dynamic_cast<ROperator_Relu<float>*>(fOperators[i + 1].get());
+      if (!lrelu && !relu) continue;
+
+      // Guard the [0] accesses below against operators reporting no tensors
+      auto gemmOutputs = fOperators[i]->GetOpOutputTensors();
+      auto actInputs   = fOperators[i + 1]->GetOpInputTensors();
+      auto actOutputs  = fOperators[i + 1]->GetOpOutputTensors();
+      if (gemmOutputs.empty() || actInputs.empty() || actOutputs.empty()) continue;
+
+      std::string gemmOut = std::string(gemmOutputs[0]);
+      std::string actIn   = std::string(actInputs[0]);
+      if (gemmOut != actIn) continue;
+
+      // The Gemm output disappears after fusion, so nothing else may need it:
+      // neither another operator nor the model's output list.
+      if (consumerCount[gemmOut] != 1) continue;
+      if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), gemmOut) != fOutputTensorNames.end())
+         continue;
+
+      std::string actOut = std::string(actOutputs[0]);
+
+      if (lrelu) {
+         gemm->SetActivation(EActivationType::LEAKYRELU, lrelu->GetAlpha());
+      } else {
+         gemm->SetActivation(EActivationType::RELU, 0.f);
+      }
+
+      // Redirect the Gemm to the activation's output; each replaced name is
+      // recorded as a fusion intermediate.
+      gemm->UpdateFusableTensorName(actOut, [&](const std::string& old) {
+         fFusionIntermediateTensors.insert(old);
+      });
+
+      fSkipOperators.insert(i + 1);
+   }
+}
+
+/// Append to the generated code (fGC), for every initialized tensor:
+///  - its constant-tensor code, when the tensor is not read from a weight file
+///    (no weight file in use, or the tensor is flagged constant);
+///  - the declaration and allocation of its `deviceBuf_<name>` alpaka buffer.
+/// Supported element types are FLOAT, INT32, INT64 and BOOL/UINT8 (both stored
+/// as uint8_t); nothing is emitted for other types.
+void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() {
+   if (!fInitializedTensors.empty())
+      fGC += "\n// initialized tensors for weights\n";
+
+   for (auto &entry : fInitializedTensors) {
+      auto &tensor = entry.second;
+
+      const bool readFromWeightFile = fUseWeightFile && !tensor.IsConstantTensor();
+      if (!readFromWeightFile) {
+         switch (tensor.type()) {
+         case ETensorType::FLOAT: fGC += GenerateConstantTensorCode<float>(entry); break;
+         case ETensorType::INT64: fGC += GenerateConstantTensorCode<int64_t>(entry); break;
+         case ETensorType::INT32: fGC += GenerateConstantTensorCode<int32_t>(entry); break;
+         case ETensorType::BOOL:
+         case ETensorType::UINT8: fGC += GenerateConstantTensorCode<uint8_t>(entry); break;
+         default: break;
+         }
+      }
+
+      // Common pieces of the allocation statement; only the buffer alias and
+      // the element type differ between tensor types.
+      const size_t nElements = ConvertShapeToLength(tensor.shape());
+      const std::string declMiddle = " deviceBuf_" + entry.first + " = alpaka::allocBuf<";
+      const std::string declTail = ", Idx>(devAcc, Ext1D::all(Idx{" + std::to_string(nElements) + "}));\n";
+
+      switch (tensor.type()) {
+      case ETensorType::FLOAT: fGC += "BufF1D" + declMiddle + "float" + declTail; break;
+      case ETensorType::INT32: fGC += "BufI321D" + declMiddle + "int32_t" + declTail; break;
+      case ETensorType::INT64: fGC += "BufI641D" + declMiddle + "int64_t" + declTail; break;
+      case ETensorType::BOOL:
+      case ETensorType::UINT8: fGC += "BufUI81D" + declMiddle + "uint8_t" + declTail; break;
+      default: break;
+      }
+   }
+}
+
+/// Append to the generated code (fGC) one `std::vector<T> tensor_<name>(N);`
+/// declaration for every initialized tensor that is read from the weight file,
+/// i.e. when a weight file is in use and the tensor is not flagged constant.
+/// Supported element types are FLOAT, INT32, INT64 and BOOL/UINT8 (both stored
+/// as uint8_t); nothing is emitted for other types.
+void RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA()
+{
+   if (!fInitializedTensors.empty())
+      fGC += "// temporary initialized tensors for loading weights\n";
+
+   for (auto &entry : fInitializedTensors) {
+      auto &tensor = entry.second;
+      // only tensors which are read from a file need a temporary container
+      if (!fUseWeightFile || tensor.IsConstantTensor())
+         continue;
+
+      const size_t nElements = ConvertShapeToLength(tensor.shape());
+
+      const char *elementType = nullptr;
+      switch (tensor.type()) {
+      case ETensorType::FLOAT: elementType = "float"; break;
+      case ETensorType::INT32: elementType = "int32_t"; break;
+      case ETensorType::INT64: elementType = "int64_t"; break;
+      case ETensorType::BOOL:
+      case ETensorType::UINT8: elementType = "uint8_t"; break;
+      default: break;
+      }
+      if (elementType == nullptr)
+         continue;
+
+      fGC += std::string("std::vector<") + elementType + "> tensor_" + entry.first + "(" +
+             std::to_string(nElements) + ");\n";
+   }
+}
+
+// Emits the device-buffer members of the generated Session:
+//  - one allocated `deviceBuf_<name>` per intermediate tensor (tensors that live
+//    only inside a fused kernel chain are skipped),
+//  - one `bufDev_<name>` declaration per dynamic tensor; the allocation itself is
+//    emitted later by GenerateDynamicTensorInfo_GPU_ALPAKA.
+// Tensors whose type is not handled below produce no output.
+void RModel::GenerateGPU_ALPAKA_Buffers() {
+   if (!fIntermediateTensorInfos.empty()) {
+      std::string tensor_declaration_block = "";
+
+      // Builds "<alias> deviceBuf_<name> = alpaka::allocBuf<<elem>, size_t>(...);"
+      auto allocLine = [](const std::string &alias, const std::string &elem,
+                          const std::string &name, size_t length) {
+         return alias + " deviceBuf_" + name +
+                " = alpaka::allocBuf<" + elem + ", size_t>(devAcc, Ext1D::all(Idx{" +
+                std::to_string(length) + "}));\n";
+      };
+
+      for (auto &i : fIntermediateTensorInfos) {
+         // Skip tensors that are purely intermediate within a fused kernel chain
+         if (fFusionIntermediateTensors.count(i.first)) continue;
+
+         size_t length = ConvertShapeToLength(i.second.shape);
+
+         if (i.second.type == ETensorType::FLOAT) {
+            tensor_declaration_block += allocLine("BufF1D", "float", i.first, length);
+         } else if (i.second.type == ETensorType::DOUBLE) {
+            tensor_declaration_block += allocLine("BufD1D", "double", i.first, length);
+         } else if (i.second.type == ETensorType::INT32) {
+            tensor_declaration_block += allocLine("BufI321D", "int32_t", i.first, length);
+         } else if (i.second.type == ETensorType::INT64) {
+            tensor_declaration_block += allocLine("BufI641D", "int64_t", i.first, length);
+         } else if (i.second.type == ETensorType::BOOL ||
+                    i.second.type == ETensorType::UINT8) {
+            // UINT8 shares the uint8_t buffer with BOOL, as for initialized tensors
+            tensor_declaration_block += allocLine("BufUI81D", "std::uint8_t", i.first, length);
+         }
+      }
+
+      if (tensor_declaration_block.length()) {
+         fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block;
+      }
+   }
+
+   // add also the dynamic tensors (only declarations, allocation will be done later)
+   if (!fDynamicTensorInfos.empty()) {
+      fGC += "//--- declare the dynamic tensors\n";
+      // The first template argument must be a type: use the Session's `DevAcc`
+      // alias, not the `devAcc` device object.
+      fGC += "using bufDev_float = alpaka::Buf<DevAcc, float, alpaka::DimInt<1u>, size_t>;\n";
+      fGC += "using bufDev_double = alpaka::Buf<DevAcc, double, alpaka::DimInt<1u>, size_t>;\n";
+      fGC += "using bufDev_int64  = alpaka::Buf<DevAcc, int64_t, alpaka::DimInt<1u>, size_t>;\n";
+
+      // NOTE(review): these members are declared without an initializer; confirm
+      // that the alpaka buffer type in use is default-constructible.
+      for (auto &i : fDynamicTensorInfos) {
+         if (i.second.type == ETensorType::FLOAT) {
+            fGC += "bufDev_float bufDev_" + i.first + ";\n";
+         } else if (i.second.type == ETensorType::DOUBLE) {
+            fGC += "bufDev_double bufDev_" + i.first + ";\n";
+         } else if (i.second.type == ETensorType::INT64) {
+            fGC += "bufDev_int64 bufDev_" + i.first + ";\n";
+         }
+      }
+   }
+}
+
+// Emits, into the generated Session constructor, the allocation of every dynamic
+// tensor buffer. The result is assigned to the `bufDev_<name>` member declared by
+// GenerateGPU_ALPAKA_Buffers; a block-local `auto bufDev_<name>` would shadow the
+// member and be released at the closing brace. The element type follows the
+// tensor type. Types for which no member is declared are skipped.
+void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() {
+   fGC += "//---- allocate the intermediate dynamic tensors\n";
+   std::stringstream out;
+
+   for (auto &i : fDynamicTensorInfos) {
+      // must match the set of types declared in GenerateGPU_ALPAKA_Buffers
+      std::string elemType;
+      if (i.second.type == ETensorType::FLOAT) {
+         elemType = "float";
+      } else if (i.second.type == ETensorType::DOUBLE) {
+         elemType = "double";
+      } else if (i.second.type == ETensorType::INT64) {
+         elemType = "int64_t";
+      } else {
+         continue;
+      }
+
+      // `length` is emitted verbatim into the generated code
+      auto length = ConvertDimShapeToLength(i.second.shape);
+      out << SP << "if (" << length << " > 0) {\n";
+      out << SP << SP << "bufDev_" << i.first << " = alpaka::allocBuf<" << elemType
+          << ", size_t>(devAcc, Ext1D::all(Idx{" << length << "}));\n";
+      out << SP << "}\n";
+   }
+   fGC += out.str();
+}
+
+// Builds the comma-separated parameter list of the typed `infer(...)` function.
+// With isdecl == true each entry carries its type (`size_t` for shape parameters,
+// `Buf... const` for tensors); otherwise only the names are listed.
+// Throws std::runtime_error (declaration mode only) for an unsupported input type.
+std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) {
+
+   // Owning-buffer alias of the generated Session matching the tensor's type.
+   auto bufferAliasFor = [this](const std::string& tensorName) -> std::string {
+      switch (GetTensorType(tensorName)) {
+         case ETensorType::FLOAT:  return "BufF1D";
+         case ETensorType::DOUBLE: return "BufD1D";
+         case ETensorType::INT32:  return "BufI321D";
+         case ETensorType::INT64:  return "BufI641D";
+         case ETensorType::BOOL:   return "BufUI81D";
+         default: break;
+      }
+      throw std::runtime_error("sofie: input tensor " + tensorName +
+                               " is of a data type which is not yet supported.");
+   };
+
+   std::string signature;
+   // shape parameter name -> index of the first input tensor that uses it
+   std::unordered_map<std::string, int> seenParams;
+   int inputIndex = 0;
+   for (auto &tensorName : fInputTensorNames) {
+      // a dynamic input contributes its (not yet listed) shape parameters first
+      if (IsDimInputTensor(tensorName)) {
+         auto dims = GetDynamicTensorShape(tensorName);
+         for (auto &dim : dims) {
+            if (!dim.isParam)
+               continue;
+            if (!seenParams.emplace(dim.param, inputIndex).second)
+               continue; // already emitted for an earlier input
+            if (isdecl)
+               signature += "size_t ";
+            signature += dim.param + ",";
+         }
+      }
+      if (isdecl)
+         signature += bufferAliasFor(tensorName) + " const ";
+      signature += "deviceBuf_" + tensorName + ",";
+      ++inputIndex;
+   }
+
+   // drop the trailing ","
+   if (!fInputTensorNames.empty())
+      signature.pop_back();
+   return signature;
+}
+
+// Builds the comma-separated parameter list of `_infer_impl(...)`.
+// With isdecl == true each entry carries its type (`size_t` for shape parameters,
+// `ViewConst... const&` for tensors); otherwise only the names are listed.
+// Throws std::runtime_error (declaration mode only) for an unsupported input type.
+std::string RModel::GenerateImplSignature_GPU_ALPAKA(bool isdecl) {
+
+   // Read-only view alias of the generated Session matching the tensor's type.
+   auto constViewAliasFor = [this](const std::string& tensorName) -> std::string {
+      switch (GetTensorType(tensorName)) {
+         case ETensorType::FLOAT:  return "ViewConstF1D";
+         case ETensorType::DOUBLE: return "ViewConstD1D";
+         case ETensorType::INT32:  return "ViewConstI321D";
+         case ETensorType::INT64:  return "ViewConstI641D";
+         case ETensorType::BOOL:   return "ViewConstUI81D";
+         default: break;
+      }
+      throw std::runtime_error("sofie: input tensor " + tensorName +
+                               " is of a data type which is not yet supported.");
+   };
+
+   std::string signature;
+   // shape parameter name -> index of the first input tensor that uses it
+   std::unordered_map<std::string, int> seenParams;
+   int inputIndex = 0;
+   for (auto &tensorName : fInputTensorNames) {
+      // a dynamic input contributes its (not yet listed) shape parameters first
+      if (IsDimInputTensor(tensorName)) {
+         auto dims = GetDynamicTensorShape(tensorName);
+         for (auto &dim : dims) {
+            if (!dim.isParam)
+               continue;
+            if (!seenParams.emplace(dim.param, inputIndex).second)
+               continue; // already emitted for an earlier input
+            if (isdecl)
+               signature += "size_t ";
+            signature += dim.param + ",";
+         }
+      }
+      if (isdecl)
+         signature += constViewAliasFor(tensorName) + " const& ";
+      signature += "deviceBuf_" + tensorName + ",";
+      ++inputIndex;
+   }
+
+   // drop the trailing ","
+   if (!fInputTensorNames.empty())
+      signature.pop_back();
+   return signature;
+}
+
+// Appends the inference entry points of the generated code to fGC, in this order:
+//   1. `void _infer_impl(...)`  - enqueues every operator (or fused element-wise
+//      kernel) on `queue` and waits for completion;
+//   2. `void infer(std::span<...> inputs, std::span<...> outputs, <dyn params>)`
+//      - calls `_infer_impl` and copies the output buffers into `outputs`;
+//   3. `<Buf | std::array | std::tuple> infer(<typed buffers>)` - wraps each input
+//      buffer in a const view, calls `_infer_impl` and returns the output buffer(s).
+// Throws std::runtime_error when the model has no output tensor, or when an input
+// tensor has a type without a matching view alias.
+void RModel::GenerateOutput_GPU_ALPAKA() {
+   if (fVerbose)
+      std::cout << "Generating main inference code for " << fName << std::endl;
+
+   size_t outputSize = fOutputTensorNames.size();
+   if (outputSize == 0)
+      throw std::runtime_error("sofie: output size=0 are not supported");
+
+   // The return type of the typed infer() depends on whether all outputs share one type
+   ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin());
+   bool sameOutputTypes = true;
+   for (std::string const &name : fOutputTensorNames) {
+      if (GetTensorType(name) != eFirstOutputType)
+         sameOutputTypes = false;
+   }
+
+   // Read-only view alias matching a tensor's type (used by the typed infer() wrapper)
+   auto GetViewConstType = [this](const std::string &name) -> std::string {
+      ETensorType type = GetTensorType(name);
+      if (type == ETensorType::FLOAT)  return "ViewConstF1D";
+      if (type == ETensorType::DOUBLE) return "ViewConstD1D";
+      if (type == ETensorType::INT32)  return "ViewConstI321D";
+      if (type == ETensorType::INT64)  return "ViewConstI641D";
+      if (type == ETensorType::BOOL)   return "ViewConstUI81D";
+      throw std::runtime_error("sofie: input tensor " + name + " is of an unsupported data type.");
+   };
+
+   // Collect deduplicated dynamic dimension parameter names in declaration order
+   std::vector<std::string> dynParamNames;
+   {
+      std::unordered_map<std::string, int> seen;
+      for (auto &name : fInputTensorNames) {
+         if (IsDimInputTensor(name)) {
+            auto shape = GetDynamicTensorShape(name);
+            for (auto &d : shape) {
+               if (d.isParam && seen.count(d.param) == 0) {
+                  dynParamNames.push_back(d.param);
+                  seen[d.param] = 1;
+               }
+            }
+         }
+      }
+   }
+
+   fGC += "\n\n";
+
+   // ---- 1. _infer_impl: the actual operator sequence ----
+   fGC += "void _infer_impl(";
+   fGC += GenerateImplSignature_GPU_ALPAKA();
+   fGC += "){\n";
+
+   // GPU profiling: _infer_impl is a member of Session, so fProfilingResults
+   // is directly accessible without any alias.
+   if (fProfile) {
+      fGC += RModelProfilerGPU::GenerateBeginInferCode();
+   }
+
+   // indices of the fusion groups whose kernel launch has already been emitted
+   std::set<size_t> fusedGroupsLaunched;
+   for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
+      if (fVerbose)
+         std::cout << "Generating code for operator .... " << op_idx << std::endl;
+
+      if (fSkipOperators.count(op_idx)) continue;
+
+      // SIZE_MAX marks an operator that belongs to no fusion group
+      auto gIt = fOpToFusionGroupIdx.find(op_idx);
+      size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX;
+      bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused();
+
+      if (inFusedGroup) {
+         // Only emit the fused kernel launch once, at the chain leader
+         if (fEltwiseFusionGroups[gIdx].opIndices[0] == op_idx && !fusedGroupsLaunched.count(gIdx)) {
+            const auto& grp = fEltwiseFusionGroups[gIdx];
+            std::string sfx = grp.suffix();
+            // name of the kernel member object declared by GenerateSessionCode_GPU_ALPAKA
+            std::string kname = "fusedEltwiseKernel" + sfx;
+            std::string fusedCode;
+            fusedCode += "\n//------ FUSED_ELTWISE_GPU_ALPAKA" + sfx + "\n";
+            fusedCode += SP + "{\n";
+            fusedCode += SP + SP + "auto const elementsPerThread_fused" + sfx + " = Vec::all(static_cast<Idx>(1));\n";
+            fusedCode += SP + SP + "auto const elementsPerGrid_fused" + sfx + " = Vec::all(Idx{" + std::to_string(grp.numElements) + "});\n";
+            fusedCode += SP + SP + "auto const workDiv_fused" + sfx + " = sofie_workdiv(elementsPerGrid_fused" + sfx + ");\n";
+            fusedCode += SP + SP + "auto task_fused" + sfx + " = alpaka::createTaskKernel<Acc>(workDiv_fused" + sfx + ", " + kname +
+                   ", alpaka::getPtrNative(deviceBuf_" + grp.inputTensor + "), alpaka::getPtrNative(deviceBuf_" + grp.outputTensor +
+                   "), static_cast<Idx>(" + std::to_string(grp.numElements) + "));\n";
+            fusedCode += SP + SP + "alpaka::enqueue(queue, task_fused" + sfx + ");\n";
+            fusedCode += SP + "}\n";
+            if (fProfile) {
+               // wrap fused group with profiling
+               // NOTE(review): `tp_start` is not declared here; presumably it comes from
+               // RModelProfilerGPU::GenerateBeginInferCode() - confirm.
+               std::string fusedName = "FusedKernel" + sfx;
+               fGC += "   // -- GPU Profiling fused group: " + fusedName + " --\n";
+               fGC += "   tp_start = std::chrono::steady_clock::now();\n";
+               fGC += fusedCode;
+               fGC += "   alpaka::wait(queue);\n";
+               fGC += "   fProfilingResults[\"" + fusedName + "\"].push_back(\n";
+               fGC += "      std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(\n";
+               fGC += "         std::chrono::steady_clock::now() - tp_start).count());\n\n";
+            } else {
+               fGC += fusedCode;
+            }
+            fusedGroupsLaunched.insert(gIdx);
+         }
+         // Chain followers: skip — their logic is inside the fused kernel
+      } else {
+         if (fProfile) {
+            fGC += RModelProfilerGPU::GenerateOperatorCode(*fOperators[op_idx], op_idx);
+         } else {
+            fGC += fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx));
+         }
+      }
+   }
+   // Final wait (no-op when profiling since each op already syncs)
+   fGC += "\n\n   alpaka::wait(queue);\n";
+
+   if (fProfile) {
+      fGC += RModelProfilerGPU::GenerateEndInferCode();
+   }
+
+   fGC += "}\n\n";
+
+
+   // ---- 2. span-based infer(): caller provides input and output views ----
+   std::string spanDynDecl;
+   for (auto &p : dynParamNames)
+      spanDynDecl += ", size_t " + p;
+
+   // NOTE(review): the spans are hard-wired to float views (ViewConstF1D / ViewF1D)
+   // while _infer_impl's parameters follow each input's tensor type; presumably this
+   // overload is only usable for all-float models - confirm.
+   fGC += "void infer(std::span<ViewConstF1D const> inputs, std::span<ViewF1D> outputs" + spanDynDecl + "){\n";
+
+   {
+      // forward the dynamic parameters first, then inputs[0..n-1], matching the
+      // parameter order produced by GenerateImplSignature_GPU_ALPAKA
+      fGC += SP + "_infer_impl(";
+      bool first = true;
+      for (auto &p : dynParamNames) {
+         if (!first) fGC += ", ";
+         fGC += p;
+         first = false;
+      }
+      for (size_t i = 0; i < fInputTensorNames.size(); i++) {
+         if (!first) fGC += ", ";
+         fGC += "inputs[" + std::to_string(i) + "]";
+         first = false;
+      }
+      fGC += ");\n";
+   }
+
+   // Copy member output buffers into caller-provided output views
+   for (size_t i = 0; i < outputSize; i++) {
+      std::string tensorName = *(fOutputTensorNames.begin() + i);
+      fGC += SP + "alpaka::memcpy(queue, outputs[" + std::to_string(i) + "], deviceBuf_" + tensorName + ");\n";
+   }
+   fGC += SP + "alpaka::wait(queue);\n";
+   fGC += "}\n\n";
+
+
+   // ---- 3. typed infer(): returns the output buffer(s) ----
+   // one output -> a single Buf; several outputs of one type -> std::array;
+   // outputs of mixed types -> std::tuple
+   std::string returnType;
+   if (outputSize == 1) {
+      returnType = "alpaka::Buf<Acc, " + ConvertOutputTypeToString(eFirstOutputType) + ", Dim, Idx>";
+   } else if (sameOutputTypes) {
+      returnType = "std::array<alpaka::Buf<Acc, " + ConvertOutputTypeToString(eFirstOutputType) +
+                   ", Dim, Idx>, " + std::to_string(outputSize) + ">";
+   } else {
+      returnType = "std::tuple<";
+      for (size_t i = 0; i < outputSize; i++) {
+         std::string tname = *(fOutputTensorNames.begin() + i);
+         returnType += "alpaka::Buf<Acc, " + ConvertOutputTypeToString(GetTensorType(tname)) + ", Dim, Idx>";
+         if (i < outputSize - 1) returnType += ",";
+      }
+      returnType += ">";
+   }
+
+   fGC += returnType + " infer(";
+   fGC += GenerateInferSignature_GPU_ALPAKA();
+   fGC += "){\n";
+
+   // Wrap each typed input buffer in a ViewConstXX, then call _infer_impl
+   std::vector<std::string> typedImplArgs;
+   for (auto &p : dynParamNames)
+      typedImplArgs.push_back(p);
+   for (auto &name : fInputTensorNames) {
+      std::string viewType = GetViewConstType(name);
+      fGC += SP + viewType + " const view_" + name +
+             "{alpaka::getPtrNative(deviceBuf_" + name + "), devAcc, alpaka::getExtents(deviceBuf_" + name + ")};\n";
+      typedImplArgs.push_back("view_" + name);
+   }
+
+   fGC += SP + "_infer_impl(";
+   for (size_t i = 0; i < typedImplArgs.size(); i++) {
+      if (i > 0) fGC += ", ";
+      fGC += typedImplArgs[i];
+   }
+   fGC += ");\n";
+
+   // Return the member output buffer(s)
+   // (several outputs are returned as a braced list initializing the array/tuple)
+   fGC += SP + "return ";
+   if (outputSize > 1) fGC += "{";
+   for (size_t i = 0; i < outputSize; i++) {
+      std::string tensorName = *(fOutputTensorNames.begin() + i);
+      fGC += "deviceBuf_" + tensorName;
+      if (i < outputSize - 1) fGC += ",";
+   }
+   if (outputSize > 1) fGC += "}";
+   fGC += ";\n";
+   fGC += "}\n";
+}
+
+// Appends the body of the generated header to fGC, in this order:
+//   1. the ALPAKA kernel structs (one fused struct per fused element-wise group,
+//      one struct per unfused operator, deduplicated by kind for the operator
+//      kinds listed in `single_initialized_operators`);
+//   2. the free `sofie_workdiv` helper;
+//   3. the `Session` struct header, its type aliases, device/queue members,
+//      tensor buffers and operator declarations;
+//   4. the Session constructor (weight loading, host-to-device copies, dynamic
+//      tensor allocation, per-operator init code);
+//   5. the kernel member objects, the inference functions and, when profiling,
+//      the profiling utilities;
+//   6. the closing brace of the Session struct.
+void RModel::GenerateSessionCode_GPU_ALPAKA() {
+
+   std::set<SOFIE::OperatorKind> registered_operators;
+   std::set<size_t> fusedGroupsEmitted; // tracks which fusion groups have had their struct/decl emitted
+
+   // operator kinds whose kernel struct / member is emitted only once per model,
+   // however many operators of that kind the model contains
+   std::set<SOFIE::OperatorKind> single_initialized_operators = {
+      SOFIE::OperatorKind::RELU,
+      SOFIE::OperatorKind::SIGMOID,
+      SOFIE::OperatorKind::TANH,
+      SOFIE::OperatorKind::SOFTMAX,
+      SOFIE::OperatorKind::LEAKYRELU,
+      SOFIE::OperatorKind::EINSUM,
+      SOFIE::OperatorKind::ELU,
+      SOFIE::OperatorKind::UNARY_RECIPROCAL,
+      SOFIE::OperatorKind::UNARY_SQRT,
+      SOFIE::OperatorKind::UNARY_NEG,
+      SOFIE::OperatorKind::UNARY_EXP,
+      SOFIE::OperatorKind::UNARY_LOG,
+      SOFIE::OperatorKind::UNARY_SIN,
+      SOFIE::OperatorKind::UNARY_COS,
+      SOFIE::OperatorKind::UNARY_ABS,
+      SOFIE::OperatorKind::NOT
+   };
+
+   // set when at least one GEMM or CONV operator is present; controls the
+   // emission of the `blas` member below
+   bool OpNeedsBlas = false;
+
+   fGC += "\n//--- ALPAKA Kernels\n";
+   // NOTE(review): unlike the constructor loop below, this loop does not consult
+   // fSkipOperators - confirm that emitting kernels for skipped operators is intended.
+   for (size_t id = 0; id < fOperators.size(); id++) {
+      if(fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) {
+         OpNeedsBlas = true;
+      }
+
+      // SIZE_MAX marks an operator that belongs to no fusion group
+      auto gIt = fOpToFusionGroupIdx.find(id);
+      size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX;
+      bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused();
+
+      if (inFusedGroup) {
+         // Only emit the fused kernel struct once, at the chain leader
+         if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) {
+            const auto& grp = fEltwiseFusionGroups[gIdx];
+            std::string sfx = grp.suffix();
+            fGC += "\n//------ FUSED_ELTWISE_KERNEL" + sfx + "\n";
+            fGC += "struct FusedEltwiseKernel" + sfx + " {\n";
+            fGC += SP + "template<typename TAcc, typename T>\n";
+            fGC += SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* __restrict__ data, T* __restrict__ out, std::size_t n) const {\n";
+            fGC += SP + SP + "const auto idx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];\n";
+            fGC += SP + SP + "if (idx < n) {\n";
+            fGC += SP + SP + SP + "T v = data[idx];\n";
+            // chain the element-wise expression of every operator in the group on `v`
+            for (size_t opIdx : grp.opIndices)
+               fGC += SP + SP + SP + "v = " + fOperators[opIdx]->GetElementwiseExpr("v") + ";\n";
+            fGC += SP + SP + SP + "out[idx] = v;\n";
+            fGC += SP + SP + "}\n";
+            fGC += SP + "}\n";
+            fGC += "};\n";
+            fusedGroupsEmitted.insert(gIdx);
+         }
+         // Chain followers: skip (their logic is inside the fused kernel)
+      } else {
+         // Unfused op: generate individual kernel struct (with dedup for single_initialized_operators)
+         if (single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) {
+            if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) {
+               if (fVerbose)
+                  std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl;
+               fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id));
+               registered_operators.insert(fOperators[id]->GetKind());
+            }
+         } else {
+            if (fVerbose)
+               std::cout << "Generating ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl;
+            fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id));
+         }
+      }
+   }
+
+
+   // helper emitted at namespace scope: builds a work division of
+   // ceil(numElems[0] / blockSz) blocks, blockSz threads per block, 1 element per thread
+   fGC += "\ntemplate<typename TDim, typename TIdx>\n";
+   fGC += "inline alpaka::WorkDivMembers<TDim, TIdx> sofie_workdiv(\n";
+   fGC += "    alpaka::Vec<TDim, TIdx> const& numElems, TIdx blockSz = TIdx{256})\n{\n";
+   fGC += "    auto const numBlocks = alpaka::Vec<TDim, TIdx>::all(\n";
+   fGC += "        (numElems[0] + blockSz - TIdx{1}) / blockSz);\n";
+   fGC += "    return alpaka::WorkDivMembers<TDim, TIdx>(\n";
+   fGC += "        numBlocks,\n";
+   fGC += "        alpaka::Vec<TDim, TIdx>::all(blockSz),\n";
+   fGC += "        alpaka::Vec<TDim, TIdx>::all(TIdx{1}));\n";
+   fGC += "}\n\n";
+
+   // define the Session struct (for GNN this is generated in RModel_GNN)
+   // NOTE(review): the template header is emitted unconditionally, the struct
+   // header only when fUseSession is set - confirm the no-session output is valid.
+  fGC += "\n\ntemplate <typename tagAcc>\n";
+   if (fUseSession) {
+      if (!fIsSubGraph)
+         fGC += "struct Session {\n\n";
+      else
+         fGC += "struct Session_" + fName + " {\n\n";
+   }
+
+   // define host and device accelerators
+    fGC += "using Idx = std::size_t;\n";
+    fGC += "using Dim = alpaka::DimInt<1>;\n";
+    fGC += "using Acc = alpaka::TagToAcc<tagAcc, Dim, Idx>;\n";
+    fGC += "using DevAcc = alpaka::Dev<Acc>;\n\n";
+    fGC += "using QueueProperty = alpaka::NonBlocking;\n";
+    fGC += "using QueueAcc = alpaka::Queue<Acc, QueueProperty>;\n\n";
+    fGC += "using BufF1D = alpaka::Buf<Acc, float, Dim, Idx>;\n";
+    fGC += "using BufD1D = alpaka::Buf<Acc, double, Dim, Idx>;\n";
+    fGC += "using BufI321D = alpaka::Buf<Acc, int32_t, Dim, Idx>;\n";
+    fGC += "using BufI641D = alpaka::Buf<Acc, int64_t, Dim, Idx>;\n";
+    fGC += "using BufUI81D = alpaka::Buf<Acc, uint8_t, Dim, Idx>;\n\n";
+    fGC += "// Non-owning device view types (ViewPlainPtr) for the span-based infer interface\n";
+    fGC += "using ViewF1D = alpaka::ViewPlainPtr<DevAcc, float, Dim, Idx>;\n";
+    fGC += "using ViewConstF1D = alpaka::ViewPlainPtr<DevAcc, const float, Dim, Idx>;\n";
+    fGC += "using ViewD1D = alpaka::ViewPlainPtr<DevAcc, double, Dim, Idx>;\n";
+    fGC += "using ViewConstD1D = alpaka::ViewPlainPtr<DevAcc, const double, Dim, Idx>;\n";
+    fGC += "using ViewI321D = alpaka::ViewPlainPtr<DevAcc, int32_t, Dim, Idx>;\n";
+    fGC += "using ViewConstI321D = alpaka::ViewPlainPtr<DevAcc, const int32_t, Dim, Idx>;\n";
+    fGC += "using ViewI641D = alpaka::ViewPlainPtr<DevAcc, int64_t, Dim, Idx>;\n";
+    fGC += "using ViewConstI641D = alpaka::ViewPlainPtr<DevAcc, const int64_t, Dim, Idx>;\n";
+    fGC += "using ViewUI81D = alpaka::ViewPlainPtr<DevAcc, uint8_t, Dim, Idx>;\n";
+    fGC += "using ViewConstUI81D = alpaka::ViewPlainPtr<DevAcc, const uint8_t, Dim, Idx>;\n\n";
+
+    // device / host handles and the queue; these members are referenced by name
+    // (`devAcc`, `hostAcc`, `queue`) in the code emitted by the other generators
+    fGC += "\nalpaka::Platform<Acc> const platform{};\n";
+    fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n";
+    fGC += "alpaka::PlatformCpu platformHost{};\n";
+    fGC += "alpaka::DevCpu hostAcc = alpaka::getDevByIdx(platformHost, 0);\n";
+    fGC += "QueueAcc queue{devAcc};\n";
+    fGC += "Idx threadsPerBlock = 256;\n";
+    fGC += "\nusing Ext1D = alpaka::Vec<Dim, Idx>;\n";
+    fGC += "using Vec = alpaka::Vec<Dim, Idx>;\n";
+    if (OpNeedsBlas) {
+         fGC += "\n\n// BLAS declarations\n";
+         fGC += "sofieBLAS<tagAcc> blas{queue};\n";
+    }
+
+   // buffer members: initialized tensors, intermediate/dynamic tensors, operator state
+   GenerateInitializedTensorInfo_GPU_ALPAKA();
+   GenerateGPU_ALPAKA_Buffers();
+   GenerateOperatorDeclarations();
+   // inject profiling session data member
+   if (fProfile) {
+      fGC += RModelProfilerGPU::GenerateSessionMembers();
+   }
+
+   // Session constructor
+   if (fUseSession) {
+      std::string sessionName = "\n\nSession";
+      if (fIsSubGraph)
+         sessionName += "_" + fName;
+
+      if (fUseWeightFile) {
+         // default weight-file name: model name plus the extension of the chosen format
+         std::string fileName = fName;
+         if (fWeightFile == WeightFileType::Text)
+            fileName += ".dat";
+         if (fWeightFile == WeightFileType::RootBinary)
+            fileName += ".root";
+
+         fGC += sessionName + "(std::string filename =\"" + fileName + "\"";
+      } else {
+         // keep the same constructor signature (unnamed, unused string parameter)
+         fGC += sessionName + "(std::string = \"\"";
+      }
+
+      // one defaulted size_t constructor parameter per shape parameter
+      if (!fShapeParams.empty()) {
+         for (auto &p : fShapeParams) {
+            fGC += ",\n";
+            fGC += "        size_t " + p.first + " = " + p.second;
+         }
+      }
+      fGC += ") {\n";
+      
+      GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA();
+      if (fUseWeightFile) {
+         fGC += "\n//--- reading weights from file\n";
+         ReadInitializedTensorsFromFile(0);
+         fGC += "\n";
+      }
+      
+      MoveInitializedTensorsToBuffers_ALPAKA();
+      GenerateDynamicTensorInfo_GPU_ALPAKA();
+
+      for (size_t id = 0; id < fOperators.size(); id++) {
+         if (fSkipOperators.count(id)) continue;
+         fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA();
+         if (fOperators[id]->GetKind() == OperatorKind::GEMM || fOperators[id]->GetKind() == OperatorKind::CONV) {
+            // GetBlasConfig() returns "" for ops that use gemmStridedBatched
+            // (legacy cuBLAS path, no cuBLASLt layout registration needed).
+            auto blasCfg = fOperators[id]->GetBlasConfig();
+            if (!blasCfg.empty())
+               fGC += "\nblas.addLayoutConfig("+blasCfg+");\n";
+         }
+      }
+
+      // the constructor returns only after the enqueued copies/initialisation have finished
+      fGC += "\nalpaka::wait(queue);\n";
+      fGC += "}\n\n";
+   }
+
+   // second pass over the operators: the dedup state is reset so the kernel
+   // member objects are emitted with the same once-per-kind / once-per-group rule
+   registered_operators.clear();
+   fusedGroupsEmitted.clear();
+
+   for (size_t id = 0; id < fOperators.size(); id++) {
+      // Same as the kernel-struct loop above: fused activation ops must still
+      // declare their member variable (e.g. `leakyReluKernel`) even though
+      // their Generate_GPU_ALPAKA call is skipped in the infer-body loop.
+
+      auto gIt = fOpToFusionGroupIdx.find(id);
+      size_t gIdx = (gIt != fOpToFusionGroupIdx.end()) ? gIt->second : SIZE_MAX;
+      bool inFusedGroup = (gIdx != SIZE_MAX) && fEltwiseFusionGroups[gIdx].isFused();
+
+      if (inFusedGroup) {
+         // one `FusedEltwiseKernel<sfx> fusedEltwiseKernel<sfx>;` member per fused group
+         if (fEltwiseFusionGroups[gIdx].opIndices[0] == id && !fusedGroupsEmitted.count(gIdx)) {
+            std::string sfx = fEltwiseFusionGroups[gIdx].suffix();
+            fGC += SP + "FusedEltwiseKernel" + sfx + " fusedEltwiseKernel" + sfx + ";\n";
+            fusedGroupsEmitted.insert(gIdx);
+         }
+      } else {
+         if (single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) {
+            if (registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) {
+               if (fVerbose)
+                  std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl;
+               fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id));
+               registered_operators.insert(fOperators[id]->GetKind());
+            }
+         } else {
+            if (fVerbose)
+               std::cout << "Declaring ALPAKA kernel for operator " << toString(fOperators[id]->GetKind()) << std::endl;
+            fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id));
+         }
+      }
+   }
+
+   // _infer_impl and the two infer() overloads
+   GenerateOutput_GPU_ALPAKA();
+
+   // inject GPU profiling utility functions and memory report inside Session struct
+   if (fProfile && fUseSession) {
+      fGC += RModelProfilerGPU::GenerateUtilityFunctions();
+      auto memInfo = RModelProfilerGPU::ComputeMemoryInfo(*this);
+      fGC += RModelProfilerGPU::GenerateMemoryReport(memInfo);
+   }
+
+   // NOTE(review): the struct is opened under `fUseSession` alone but closed under
+   // `fUseSession && !fIsGNNComponent` - presumably GNN components close it elsewhere; confirm.
+   if (fUseSession && !fIsGNNComponent) {
+      fGC += "};   // end of Session\n";
+   }
+}
+
+// Entry point of the ALPAKA GPU code generation: decodes `options`, initializes
+// the model, runs the fusion passes and fills fGC with the generated header.
+//
+// options   - bitwise OR of `Options` flags (profiling, session / weight-file handling)
+// batchSize - forwarded to Initialize()
+// verbose   - enables progress printout during generation
+//
+// Throws std::runtime_error when a weight file is requested without a Session
+// class, or when one of the GNN options is set (not supported on GPU).
+void RModel::GenerateGPU_ALPAKA(std::underlying_type_t<Options> options, int batchSize, bool verbose) {
+   using OptionBits = std::underlying_type_t<Options>;
+   auto hasOption = [options](Options flag) {
+      return (options & static_cast<OptionBits>(flag)) != 0;
+   };
+
+   fProfile = hasOption(Options::kProfile);
+   // honour the caller's choice instead of unconditionally forcing verbose output
+   fVerbose = verbose;
+   fBatchSize = batchSize;
+
+   if (fProfile)
+      RModelProfilerGPU::AddNeededStdLibs(*this);
+
+   // The order matters: a later flag overrides the weight-file choice of an earlier one
+   if (hasOption(Options::kNoSession)) {
+      fUseSession = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (hasOption(Options::kNoWeightFile)) {
+      fUseWeightFile = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (hasOption(Options::kRootBinaryWeightFile)) {
+      fUseWeightFile = true;
+      fWeightFile = WeightFileType::RootBinary;
+   }
+   if (fUseWeightFile && !fUseSession) {
+      throw std::runtime_error(
+          "sofie: RModel::Generate: cannot use a separate weight file without generating a Session class");
+   }
+
+   if (hasOption(Options::kGNN) || hasOption(Options::kGNNComponent))
+      throw std::runtime_error("SOFIE GPU does not yet supports GNN Inference.");
+
+   Initialize(batchSize, verbose);
+   FuseGemmActivations_GPU();   // must run before elementwise fusion (redirects tensors)
+   ComputeEltwiseFusionGroups();
+
+   // header guard name, filled by GenerateHeaderInfo_GPU_ALPAKA and reused for the #endif
+   std::string hgname;
+   if (!fIsSubGraph) {
+      fGC.clear();
+      GenerateHeaderInfo_GPU_ALPAKA(hgname);
+   }
+
+   if (fVerbose)
+      std::cout << "generate Main session code - model  " << fName << std::endl;
+
+   GenerateSessionCode_GPU_ALPAKA();
+
+   // close the model namespace and the header guard opened by the header generator
+   if (!fIsSubGraph) {
+      fGC += ("} //SOFIE_" + fName + "\n");
+      fGC += "\n#endif  // " + hgname + "\n";
+   }
+}
+
+// Emits, for every initialized tensor that is writable, a host view over
+// `tensor_<name>` and an enqueued copy of it into `deviceBuf_<name>`.
+void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){
+   for (auto &entry : fInitializedTensors) {
+      if (entry.second.IsNotWritable())
+         continue;
+
+      const std::string &name = entry.first;
+      const std::string numElements = std::to_string(ConvertShapeToLength(entry.second.shape()));
+
+      // Use the 3-argument createView(dev, container, extent) which calls std::data()
+      // internally — works for both std::vector and raw C arrays.
+      fGC += "     auto hostBuf_" + name + " = alpaka::createView(hostAcc, tensor_" + name + ", " + numElements + ");\n";
+      fGC += "     alpaka::memcpy(queue, deviceBuf_" + name + ", hostBuf_" + name + ");\n";
+   }
+}
+
+} // namespace SOFIE
diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/core/src/RModel_Base.cxx
similarity index 60%
rename from src/SOFIE_core/src/RModel_Base.cxx
rename to core/src/RModel_Base.cxx
index d4d1f1c..9c49e37 100644
--- a/src/SOFIE_core/src/RModel_Base.cxx
+++ b/core/src/RModel_Base.cxx
@@ -32,9 +32,16 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) {
     fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n";
     if (fUseWeightFile)
         fGC += "#include <fstream>\n";
-    // Include TFile when saving the weights in a binary ROOT file
-    if (fWeightFile == WeightFileType::RootBinary)
-        fGC += "#include \"TFile.h\"\n";
+
+    if (fWeightFile == WeightFileType::RootBinary){
+    #ifdef SOFIE_SUPPORT_ROOT_BINARY
+        // Include TFile when saving the weights in a binary ROOT file
+            fGC += "#include \"TFile.h\"\n";
+    #else
+        throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support");
+    #endif
+    
+    }
 
     fGC += "\nnamespace SOFIE_" + fName + "{\n";
     if (!fNeededBlasRoutines.empty()) {
@@ -58,6 +65,45 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) {
     }
 }
 
+// Writes the preamble of the generated ALPAKA header into fGC: banner comment,
+// header guard, required includes and the opening of the model namespace.
+// `hgname` receives the header-guard macro name ("SOFIE_" + upper-cased model name).
+// Throws std::runtime_error when a ROOT binary weight file is requested in a
+// build without SOFIE_SUPPORT_ROOT_BINARY.
+void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) {
+    // banner; the last character of the parse time stamp is dropped
+    fGC += "//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [";
+    fGC += fFileName;
+    fGC += "] at [";
+    fGC += fParseTime.substr(0, fParseTime.length()-1);
+    fGC += "] \n";
+
+    // header guard: upper-case the model name in place, then add the prefix
+    hgname = fName;
+    for (char& ch : hgname) {
+        ch = static_cast<char>(std::toupper(static_cast<unsigned char>(ch)));
+    }
+    hgname.insert(0, "SOFIE_");
+    fGC += "\n#ifndef " + hgname + "\n";
+    fGC += "#define " + hgname + "\n\n";
+
+    // standard-library headers first, then the custom operator headers
+    for (const auto& stdHeader : fNeededStdLib) {
+        fGC += "#include <" + stdHeader + ">\n";
+    }
+    for (const auto& customHeader : fCustomOpHeaders) {
+        fGC += "#include \"" + customHeader + "\"\n";
+    }
+    fGC += "#include <alpaka/alpaka.hpp>\n";
+    fGC += "#include <sofieBLAS/sofieBLAS.hpp>\n";
+    fGC += "#include <span>\n";
+
+    // the session needs the SOFIE_common functions
+    // (required by the convolution operator; a dedicated flag is still to be added)
+    fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n";
+    if (fUseWeightFile) {
+        fGC += "#include <fstream>\n";
+    }
+
+    if (fWeightFile == WeightFileType::RootBinary) {
+#ifdef SOFIE_SUPPORT_ROOT_BINARY
+        // TFile is needed when the weights are stored in a binary ROOT file
+        fGC += "#include \"TFile.h\"\n";
+#else
+        throw std::runtime_error("sofie: ROOT binary weight file option is enabled but the code is not compiled with ROOT support");
+#endif
+    }
+
+    fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n";
+    fGC += "\nnamespace SOFIE_" + fName + "{\n";
+}
+
 void RModel_Base::OutputGenerated(std::string filename, bool append) {
     // the model can be appended only if a file name is provided
     if (filename.empty()) {
@@ -71,7 +117,7 @@ void RModel_Base::OutputGenerated(std::string filename, bool append) {
     else
         f.open(filename);
     if (!f.is_open()) {
-        throw std::runtime_error("tmva-sofie failed to open file for output generated inference code");
+        throw std::runtime_error("sofie failed to open file for output generated inference code");
     }
     f << fGC;
     f.close();
diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/core/src/RModel_GNN.cxx
similarity index 98%
rename from src/SOFIE_core/src/RModel_GNN.cxx
rename to core/src/RModel_GNN.cxx
index a1dfe06..3dae254 100644
--- a/src/SOFIE_core/src/RModel_GNN.cxx
+++ b/core/src/RModel_GNN.cxx
@@ -94,7 +94,7 @@ void RModel_GNN::Generate() {
 
     // the number of output edges features can be smaller, so we need to correct here
     auto num_edge_features_input = num_edge_features;
-    auto edges_update_output_shape =  edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
+    auto edges_update_output_shape =  edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
       if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) {
           num_edge_features = edges_update_output_shape[1].dim;
     }
@@ -117,7 +117,7 @@ void RModel_GNN::Generate() {
 
     // we need to correct the output number of node features
     auto num_node_features_input = num_node_features;
-    auto nodes_update_output_shape =  nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
+    auto nodes_update_output_shape =  nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
       if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) {
           num_node_features = nodes_update_output_shape[1].dim;
     }
diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/core/src/RModel_GraphIndependent.cxx
similarity index 97%
rename from src/SOFIE_core/src/RModel_GraphIndependent.cxx
rename to core/src/RModel_GraphIndependent.cxx
index bab06b3..cd62d0c 100644
--- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx
+++ b/core/src/RModel_GraphIndependent.cxx
@@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() {
 
        // the number of output edges features can be smaller, so we need to correct here
        // assume num_edge_features is not a parametric shape
-       auto edges_update_output_shape =  edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
+       auto edges_update_output_shape =  edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
        if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) {
           num_edge_features = edges_update_output_shape[1].dim;
        }
@@ -100,7 +100,7 @@ void RModel_GraphIndependent::Generate() {
       fGC+="};\n}\n";
 
       // we need to correct the output number of node features
-      auto nodes_update_output_shape =  nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
+      auto nodes_update_output_shape =  nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
        if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) {
           num_node_features = nodes_update_output_shape[1].dim;
       }
@@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() {
       // we need to correct the output number of global features
       // global features are in shape[1]
 #if 0
-      auto globals_update_output_shape =  globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
+      auto globals_update_output_shape =  globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]);
        if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) {
           num_global_features = globals_update_output_shape[1].dim;
        }
diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/core/src/SOFIE_common.cxx
similarity index 50%
rename from src/SOFIE_core/src/SOFIE_common.cxx
rename to core/src/SOFIE_common.cxx
index ad74313..a2bafde 100644
--- a/src/SOFIE_core/src/SOFIE_common.cxx
+++ b/core/src/SOFIE_common.cxx
@@ -1,15 +1,18 @@
 #include "SOFIE/SOFIE_common.hxx"
-#include<cctype>
+
+#include <cctype>
 #include <sstream>
 #include <stdexcept>
+#include <charconv>
+#include <unordered_map>
+#include <set>
 
-
-namespace SOFIE{
+namespace SOFIE {
 
 /// @brief  Convert shape from integer format to dynamic one (based on Dim)
 /// @param shape
 /// @return shape based on Dim
-std::vector<Dim> ConvertShapeToDim(std::vector<size_t> shape){
+std::vector<Dim> ConvertShapeToDim(const std::vector<size_t> & shape){
    std::vector<Dim> ret_shape(shape.size());
    for (size_t i =0; i < shape.size(); i++){
       ret_shape[i].dim = shape[i];
@@ -20,7 +23,7 @@ std::vector<Dim> ConvertShapeToDim(std::vector<size_t> shape){
 /// @brief Convert shape based on Dim to integer format
 /// @param shape
 /// @return shape based on integer. Return an empty shape in case shape is dynamic (has a parameter)
-std::vector<size_t> ConvertShapeToInt(std::vector<Dim> shape){
+std::vector<size_t> ConvertShapeToInt(const std::vector<Dim> & shape){
    std::vector<size_t> ret_shape(shape.size());
    for (size_t i =0; i < shape.size(); i++){
       if (shape[i].isParam) {
@@ -46,18 +49,35 @@ std::vector<size_t> ConvertShapeToInt(std::vector<Dim> shape){
 }
 
 
-std::size_t ConvertShapeToLength(std::vector<size_t> shape){
+std::size_t ConvertShapeToLength(const std::vector<size_t> & shape){
    // Empty shape represent scalar values, so we return a length=1
    std::size_t fLength = 1;
    for (auto& dim: shape) fLength *= dim;
    return fLength;
 }
 
+std::size_t ConvertShapeToLength(const std::vector<Dim> & shape){
+   // product of the dimensions of a shape made only of fixed integer values
+   // (1 for an empty shape); as soon as a parametric dimension is met the
+   // length is unknown and size_t(-1) is returned
+   std::size_t product = 1;
+   for (const auto & d : shape) {
+      if (d.isParam) {
+         return static_cast<std::size_t>(-1);
+      }
+      product *= d.dim;
+   }
+   return product;
+}
+
 std::string ConvertTypeToString(ETensorType type){
    switch(type){
       case ETensorType::FLOAT : {
          return "float";
       }
+      case ETensorType::INT8 : {
+         return "int8_t";
+      }
       case ETensorType::INT16 : {
          return "int16_t";
       }
@@ -67,6 +87,9 @@ std::string ConvertTypeToString(ETensorType type){
       case ETensorType::INT64 : {
          return "int64_t";
       }
+      case ETensorType::UINT8 : {
+         return "uint8_t";
+      }
       case ETensorType::UINT16 : {
          return "uint16_t";
       }
@@ -80,7 +103,7 @@ std::string ConvertTypeToString(ETensorType type){
          return "double";
       }
       case ETensorType::BOOL : {
-         return "bool";
+         return "uint8_t";
       }
       default:{
          return "other_" + std::to_string( (int) type);
@@ -106,7 +129,7 @@ ETensorType ConvertStringToType(std::string type){
    }
 }
 
-std::string ConvertShapeToString(std::vector<size_t> shape) {
+std::string ConvertShapeToString(const std::vector<size_t> & shape) {
    std::stringstream out;
    out << "{ ";
    for (size_t i = 0; i < shape.size(); i++) {
@@ -117,41 +140,49 @@ std::string ConvertShapeToString(std::vector<size_t> shape) {
    return out.str();
 }
 
-std::string ConvertDynamicShapeToString(std::vector<Dim> shape) {
+std::string ConvertDimShapeToString(const std::vector<Dim> & shape) {
    std::stringstream out;
    out << "{ ";
    for (size_t i = 0; i < shape.size(); i++) {
-      out << shape[i].GetVal();
+      out << shape[i];
       if (i < shape.size()-1) out << " , ";
    }
    out << " }";
    return out.str();
 }
 
-std::string ConvertDynamicShapeToLength(std::vector<Dim> shape) {
+std::string ConvertDimShapeToLength(const std::vector<Dim> & shape) {
    // convert generic shape to a string
    // multiply all the integer specified dimensions of the shape
    std::string length;
-   size_t int_length = 0;
+   // an empty shape is a scalar, so its length is 1
+   if (shape.empty()) return "1";
+   int64_t int_length = -1; // product of the fixed dimensions, -1 while none has been seen
    for (size_t i = 0; i < shape.size(); i++) {
       if (shape[i].isParam) {
          if (!length.empty()) length += " * ";
          length += shape[i].param;
       } else {
-         if (int_length == 0)
+         if (int_length == -1)
             int_length = shape[i].dim;
          else
             int_length *= shape[i].dim;
       }
    }
    // multiply the integer components to the parametric one
-   if (int_length > 0) {
-      if (!length.empty()) length += " * ";
-      length += std::to_string(int_length);
+   // (a factor 1 is dropped; a factor 0 must be kept since it makes the whole length 0)
+   if (int_length >= 0) {
+      if (!length.empty() && int_length != 1) {
+         length += " * ";
+         length += std::to_string(int_length);
+      } else if (length.empty()) { // case is full known shape
+         length = std::to_string(int_length);
+      }
    }
    return length;
 }
 
+
 namespace{
 template<typename T>
 static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* input, T* target){  //only visible within this translation unit
@@ -169,6 +200,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* inp
 }
 }
 
+bool IsInteger(const std::string & s) { // true only if all of s parses as a base-10 int
+   int parsed = 0;
+   const auto res = std::from_chars(s.data(), s.data() + s.size(), parsed);
+   return res.ptr == s.data() + s.size() && res.ec == std::errc();
+}
+
 bool UTILITY::AreSameShape(const std::vector<size_t>& shapeA, const std::vector<size_t>& shapeB) {
    if (shapeA.size() != shapeB.size()) {
       return false;
@@ -330,17 +367,24 @@ std::vector<size_t>  UTILITY::MultidirectionalBroadcastShape(std::vector<std::ve
    }
 }
 
-std::vector<size_t>  UTILITY::UnidirectionalBroadcastShape(std::vector<size_t> shapeA, std::vector<size_t> shapeB)
+// check multi-directional broadcasting of two shapes (inputs are passed by non-const ref. since the shorter one may get 1's prepended)
+// return a pair of integer flag and new broadcasted shape
+// if flag = 0: shapes are identical
+//    flag = 1: returned shape is equal to A, we broadcast B
+//    flag = 2: returned shape is equal to B, we broadcast A
+//    flag = 3: returned shape is the common shape of the two, we broadcast both A and B to it
+std::pair<int, std::vector<size_t>>  UTILITY::MultidirectionalBroadcastShape(std::vector<size_t> & shapeA, std::vector<size_t> & shapeB)
 {
    size_t sizeA = shapeA.size();
    size_t sizeB = shapeB.size();
    // Check if A and B have the same shape
    if (UTILITY::AreSameShape(shapeA, shapeB)){
-      return shapeA;
+      return std::make_pair(0, shapeA);
    }
    // Find the common shape of A and B
    size_t size = std::max(sizeA, sizeB);
    if (sizeA < size) {
+      // prepend 1's in A to make of same shape as B
       std::vector<size_t> newShapeA(size, 1);
       size_t offset = size - sizeA;
       std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset);
@@ -359,36 +403,117 @@ std::vector<size_t>  UTILITY::UnidirectionalBroadcastShape(std::vector<size_t> s
          break;
       }
    }
+   int broadcastFlag = 0;
    if (broadcastable) {
       // The output shape is max(outShape, targetShape)
       std::vector<size_t> targetShape(size, 1);
       for (size_t i = 0; i < size; i++) {
          targetShape[i] = std::max(shapeA[i], shapeB[i]);
+         if (shapeB[i] < targetShape[i]) broadcastFlag |= 1;
+         if (shapeA[i] < targetShape[i]) broadcastFlag |= 2;
       }
-      return targetShape;
+      return std::make_pair(broadcastFlag, targetShape);
    } else {
       throw
-         std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape "
+         std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape "
             + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB)
             + " to a common shape.");
    }
 }
+// Unidirectional broadcast of shape A to the target shape B: B itself must not need broadcasting.
+std::vector<size_t>  UTILITY::UnidirectionalBroadcastShape(std::vector<size_t> & shapeA, std::vector<size_t> & shapeB)
+{
+   // B is passed first, so flag 1 means that only A gets broadcast; flags 2 and 3 would alter B
+   auto [flag, outShape] = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA);
+   if (flag > 1) {
+      throw std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape "
+            + ConvertShapeToString(shapeA) + " to  " + ConvertShapeToString(shapeB)
+            + " in a common shape.");
+   }
+   return outShape;
+}
+
+// Multidirectional broadcast of two Dim shapes; the shorter input is first left-padded with 1's, in place.
+// Returns {flag, common shape}: flag is 0 when no dimension needs broadcasting, otherwise a bit mask
+//    (flag & 1) != 0 : B has to be broadcast to the returned shape
+//    (flag & 2) != 0 : A has to be broadcast to the returned shape
+//    (flag & 4) != 0 : a run time check of the parametric values is needed; throws if two fixed dims are incompatible
+std::pair<int, std::vector<Dim>> UTILITY::MultidirectionalBroadcastShape(std::vector<Dim> & shapeA, std::vector<Dim> & shapeB) {
+   size_t sizeA = shapeA.size();
+   size_t sizeB = shapeB.size();
+   // Check if A and B have the same shape
+   if (UTILITY::AreSameShape(shapeA, shapeB)){
+      return std::make_pair(0, shapeA);
+   }
+   // Find the common shape of A and B
+   size_t size = std::max(sizeA, sizeB);
+   if (sizeA < size) {
+      // prepend 1's in A to make it of the same rank as B
+      std::vector<Dim> newShapeA(size, Dim{1});
+      size_t offset = size - sizeA;
+      std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset);
+      shapeA = std::move(newShapeA);
+   }
+   if (sizeB < size) {
+      std::vector<Dim> newShapeB(size, Dim{1});
+      size_t offset = size - sizeB;
+      std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset);
+      shapeB = std::move(newShapeB);
+   }
+
+   int broadcastFlag = 0;
+   // The output shape is targetShape
+   std::vector<Dim> targetShape(size);
+   for (size_t i = 0; i < size; i++) {
+      // equal dimensions (fixed or parametric) need no broadcasting
+      if (shapeA[i] == shapeB[i]) {
+         targetShape[i] = shapeA[i];
+      } else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) {
+         // broadcast B to A (A is parametric, B is 1)
+         targetShape[i] = shapeA[i];
+         broadcastFlag |= 1;
+      } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) {
+         // broadcast A to B
+         targetShape[i] = shapeB[i];
+         broadcastFlag |= 2;
+      } else if (!shapeA[i].isParam && !shapeB[i].isParam) {
+         if (shapeB[i].dim == 1) {
+            targetShape[i] = shapeA[i];
+            broadcastFlag |= 1;
+         } else if (shapeA[i].dim == 1) {
+            targetShape[i] = shapeB[i];
+            broadcastFlag |= 2;
+         } else {
+            // not broadcastable: two different fixed dims, none equal to 1 (-1 has all bits set, so later |= keep it at -1)
+            broadcastFlag = -1;
+         }
+      } else if (shapeA[i].isParam && shapeB[i].isParam) {
+         // fully dynamic case - it will be decided at run time
+         std::stringstream s;
+         s <<  "std::max(" << shapeA[i] << "," << shapeB[i] << ")";
+         // use -1 for dim to indicate that it is an expression
+         targetShape[i] = Dim { s.str() , static_cast<size_t>(-1)};
+         broadcastFlag |= 4;
+      } else if (shapeA[i].isParam && !shapeB[i].isParam) {
+         // A is parametric, B is fixed: take B, broadcast A and check at run time if consistent
+         targetShape[i] = shapeB[i];
+         broadcastFlag |= 6;
+      } else if (!shapeA[i].isParam && shapeB[i].isParam) {
+         // A is fixed, B is parametric: take A, broadcast B and check at run time if consistent
+         targetShape[i] = shapeA[i];
+         broadcastFlag |= 5;
+      } else {
+         // all cases should be covered
+         throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape");
+      }
+   }
+   if (broadcastFlag == -1) {
+      throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " +
+                                 ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) +
+                                 " to a common shape.");
+   }
 
-// UNidirectional boradcast specializaiton for vector<bool>
-
-// specialization for vector of boolean
-void UTILITY::UnidirectionalBroadcast(const std::vector<bool> & data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, std::vector<bool> & broadcastedData)
- {
-   // Prepend shape with ones
-   auto ncdata = const_cast<std::vector<bool> &>(data);
-   if (shape.size() < targetShape.size()) {
-      size_t targetSize = targetShape.size();
-      std::vector<size_t> newShape(targetSize, 1);
-      size_t offset = targetSize - shape.size();
-      std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
-      UTILITY::BroadcastTensor<bool, const std::vector<bool> &, std::vector<bool> &>(ncdata, newShape, targetShape, broadcastedData);
-   }
-   UTILITY::BroadcastTensor<bool, const std::vector<bool> &, std::vector<bool> &>(ncdata, shape, targetShape, broadcastedData);
+   return std::make_pair(broadcastFlag, targetShape);
 }
 
 std::string UTILITY::Clean_name(std::string input_tensor_name){
@@ -413,15 +538,146 @@ std::vector<Dim> UTILITY::ComputeStrideFromShape(const std::vector<Dim> & shape)
    // assume row major layout
    const auto size = shape.size();
    std::vector<Dim> strides(size);
-   strides[size-1] = Dim{1};
-   for (std::size_t i = 1; i < size; i++) {
-      if (!shape[size-i].isParam && !strides[size-i].isParam)
-         strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim};
-      else
-         strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())};
+   if (size > 0) {
+      strides[size-1] = Dim{1};
+      for (std::size_t i = 1; i < size; i++) {
+         if (!shape[size-i].isParam && !strides[size-i].isParam)
+            strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim};
+         else {
+            if (strides[size-i].GetVal() == "1")
+               strides[size - 1 - i] = shape[size-i];
+            else if (shape[size-i].GetVal() == "1")
+               strides[size - 1 - i] = strides[size-i];
+            else
+              strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())};
+         }
+      }
    }
    return strides;
 }
 
+// Unused range [offset, offset + size) of the planned buffer.
+// Blocks are compared by offset only, so two blocks with the same offset are equivalent.
+struct FreeBlock {
+  std::size_t offset;
+  std::size_t size;
+  // ordering by offset keeps adjacent ranges next to each other in an ordered container
+  friend bool operator<(const FreeBlock& lhs, const FreeBlock& rhs) { return lhs.offset < rhs.offset; }
+};
+
+// One boundary (start or end) of a tensor lifetime; sorting the events gives the sweep order.
+struct MemoryEvent {
+  int t;      // time stamp (i.e. operator index)
+  int type;   // 0 = END, 1 = START
+  int idx;    // index of the tensor the event belongs to
+  bool operator<(const MemoryEvent& rhs) const {
+    return t < rhs.t || (t == rhs.t && type < rhs.type); // same time: END (0) before START (1)
+  }
+};
+
+/// Assign every tensor an offset inside one shared buffer, re-using the space of tensors
+/// whose lifetime is over (greedy best-fit allocation with a coalescing free list).
+/// @param tensorsInfo begin, end and size of each tensor; every entry must have end > begin
+/// @return MemoryResult built from the total buffer size and the offset given to each tensor
+/// @throws std::runtime_error if a tensor does not have end > begin
+MemoryResult OrganizeMemory(const std::vector<TensorLifeInfo> & tensorsInfo )
+{
+   // Basic validation
+   for (const auto &t : tensorsInfo) {
+      if (!(t.end > t.begin)) {
+         throw std::runtime_error("Each tensor must have end > begin.");
+      }
+   }
+
+   // Build events: free before allocate at equal times.
+   std::vector<MemoryEvent> events;
+   events.reserve(tensorsInfo.size() * 2);
+   for (int i = 0; i < (int)tensorsInfo.size(); ++i) {
+      events.push_back({tensorsInfo[i].end, 0, i});   // END
+      events.push_back({tensorsInfo[i].begin, 1, i}); // START
+   }
+   std::sort(events.begin(), events.end());
+
+   // offset chosen for each tensor, indexed like tensorsInfo
+   std::vector<size_t> tensorsOffset(tensorsInfo.size());
+
+   // Free list ordered by offset (for O(log n) coalescing)
+   // and faster insert/erase with respect to a vector
+   std::set<FreeBlock> free_list;
+
+   std::size_t total_bytes = 0;
+
+   auto allocate_best_fit = [&](std::size_t need) -> std::size_t {
+      // Find the *smallest* block whose size >= need (best-fit).
+      // Since free_list is ordered by offset, we scan to find best by size.
+      // (For very large sets you could maintain a multimap by size as well.)
+      auto best = free_list.end();
+      for (auto it = free_list.begin(); it != free_list.end(); ++it) {
+         if (it->size >= need) {
+            if (best == free_list.end() || it->size < best->size)
+               best = it;
+         }
+      }
+      if (best != free_list.end()) {
+         std::size_t off = best->offset;
+         if (best->size == need) {
+            free_list.erase(best);
+         } else {
+            // set elements are immutable: replace the block by its remaining tail
+            FreeBlock updated{best->offset + need, best->size - need};
+            free_list.erase(best);
+            free_list.insert(updated);
+         }
+         return off;
+      }
+      // No free block large enough; grow the heap.
+      std::size_t off = total_bytes;
+      total_bytes += need;
+      return off;
+   };
+
+   auto try_coalesce = [&](std::set<FreeBlock>::iterator it) {
+      // Coalesce with previous
+      if (it != free_list.begin()) {
+         auto prev = std::prev(it);
+         if (prev->offset + prev->size == it->offset) {
+            FreeBlock merged{prev->offset, prev->size + it->size};
+            free_list.erase(prev);
+            it = free_list.erase(it);
+            it = free_list.insert(merged).first;
+         }
+      }
+      // Coalesce with next
+      auto next = std::next(it);
+      if (next != free_list.end() && it->offset + it->size == next->offset) {
+         FreeBlock merged{it->offset, it->size + next->size};
+         free_list.erase(next);
+         it = free_list.erase(it);
+         free_list.insert(merged);
+      }
+   };
+
+   // Sweep through time.
+   for (const auto &e : events) {
+      if (e.type == 0) { // END: free
+         // The START of this tensor was already processed (end > begin), so its offset is known.
+         // A zero-sized tensor owns no space: an empty block in the offset-keyed set could
+         // shadow a real block released later at the same offset, so it is not inserted.
+         const std::size_t sz = tensorsInfo[e.idx].size;
+         if (sz > 0) {
+            FreeBlock fb{tensorsOffset[e.idx], sz};
+            // Insert and coalesce with neighbors
+            auto it = free_list.insert(fb).first;
+            try_coalesce(it);
+         }
+      } else { // START: allocate
+         auto &t = tensorsInfo[e.idx];
+         std::size_t off = allocate_best_fit(t.size);
+         tensorsOffset[e.idx] = off;
+      }
+   }
+
+   return MemoryResult{total_bytes, std::move(tensorsOffset)};
+}
 
-}//SOFIE
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/CMakeLists.txt b/parsers/CMakeLists.txt
similarity index 78%
rename from src/SOFIE_parsers/CMakeLists.txt
rename to parsers/CMakeLists.txt
index 379b7d7..7174e90 100644
--- a/src/SOFIE_parsers/CMakeLists.txt
+++ b/parsers/CMakeLists.txt
@@ -5,7 +5,7 @@
 # For the list of contributors see $ROOTSYS/README/CREDITS.
 
 ############################################################################
-# CMakeLists.txt file for building TMVA SOFIE package
+# CMakeLists.txt file for building SOFIE package
 ############################################################################
 #Author: Sitong An, Lorenzo Moneta 10/03/2021
 
@@ -26,13 +26,15 @@ set(source_headers
 )
 list(TRANSFORM source_headers PREPEND "inc/")
 target_include_directories(SOFIE_parsers
-  PRIVATE
-    ${CMAKE_CURRENT_SOURCE_DIR}/inc
+  PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/inc>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
 set(sources_cxx
     src/RModelParser_ONNX.cxx
     src/ParseBasicUnary.cxx
     src/ParseBasicBinary.cxx
+    src/ParseBasicIs.cxx
     src/ParseBatchNormalization.cxx
     src/ParseCast.cxx
     src/ParseConcat.cxx
@@ -61,6 +63,7 @@ set(sources_cxx
     src/ParseLayerNormalization.cxx
     src/ParseExpand.cxx
     src/ParseGather.cxx
+    src/ParseGatherND.cxx
     src/ParseElu.cxx
     src/ParseFuseConvAdd.cxx
     src/ParseFuseConvTransposeAdd.cxx
@@ -79,7 +82,11 @@ set(sources_cxx
     src/ParseWhere.cxx
     src/ParseEinsum.cxx
     src/ParseRandom.cxx
+    src/ParseNot.cxx
+    src/ParseClip.cxx
     src/ParseScatterElements.cxx
+    src/ParseTrilu.cxx
+    src/ParseLogic.cxx
     ${PROTO_SRCS}
   ${DEPENDENCIES}
     ${SOFIE_core}
@@ -102,7 +109,21 @@ target_include_directories(SOFIE_parsers PUBLIC
   set_target_properties(SOFIE_parsers PROPERTIES
   POSITION_INDEPENDENT_CODE TRUE)
 
+if(SOFIE_WITH_ROOT AND ROOT_FOUND)  # ROOT dictionary for the parser headers listed in source_headers
+  ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${source_headers}
+    LINKDEF inc/LinkDef.h
+    MODULE SOFIE_parsers
+    OPTIONS --deep
+  )
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm
+                ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap
+          DESTINATION lib)
+endif()
+
 install(TARGETS SOFIE_parsers
-        LIBRARY DESTINATION lib
+  EXPORT SOFIETargets
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/"
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
 )
-install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/inc/" DESTINATION "include")
diff --git a/src/SOFIE_parsers/inc/LinkDef.h b/parsers/inc/LinkDef.h
similarity index 100%
rename from src/SOFIE_parsers/inc/LinkDef.h
rename to parsers/inc/LinkDef.h
diff --git a/src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx b/parsers/inc/SOFIE/RModelParser_ONNX.hxx
similarity index 100%
rename from src/SOFIE_parsers/inc/SOFIE/RModelParser_ONNX.hxx
rename to parsers/inc/SOFIE/RModelParser_ONNX.hxx
diff --git a/src/SOFIE_parsers/onnx_proto3 b/parsers/onnx_proto3
similarity index 100%
rename from src/SOFIE_parsers/onnx_proto3
rename to parsers/onnx_proto3
diff --git a/src/SOFIE_parsers/src/ParseBasicBinary.cxx b/parsers/src/ParseBasicBinary.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseBasicBinary.cxx
rename to parsers/src/ParseBasicBinary.cxx
diff --git a/parsers/src/ParseBasicIs.cxx b/parsers/src/ParseBasicIs.cxx
new file mode 100644
index 0000000..a1abad4
--- /dev/null
+++ b/parsers/src/ParseBasicIs.cxx
@@ -0,0 +1,66 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_Basic_Is.hxx"
+#include "onnx_proto3.pb.h"
+
+namespace SOFIE {
+
+/// Parse an ONNX IsNaN / IsInf node into a ROperator_Basic_Is operator.
+/// For IsInf the optional detect_negative / detect_positive attributes (default 1) select
+/// the kIsInfPos or kIsInfNeg variant; any non-zero attribute value counts as true.
+/// @throws std::runtime_error if the input type is not registered or both attributes are 0
+template <EBasicIsOperator Op>
+std::unique_ptr<ROperator> ParseBasicIs(RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto)
+{
+   std::string input_name = nodeproto.input(0);
+   if (!parser.IsRegisteredTensorType(input_name)) {
+      throw
+         std::runtime_error("SOFIE ONNX Parser " + IsOpTraits<Op>::Name() + " op has input tensor " + input_name +
+                                  " but its type is not yet registered");
+   }
+
+   // the attributes belong to IsInf only: they are not looked at when parsing IsNaN
+   bool detect_negative = true;
+   bool detect_positive = true;
+   if (Op == EBasicIsOperator::kIsInf) {
+      for (int_t i = 0; i < nodeproto.attribute_size(); i++) {
+         const auto &attribute = nodeproto.attribute(i);
+         if (attribute.name() == "detect_negative")
+            detect_negative = (attribute.i() != 0);
+         else if (attribute.name() == "detect_positive")
+            detect_positive = (attribute.i() != 0);
+      }
+   }
+   if (!detect_positive && !detect_negative)
+      throw std::runtime_error("SOFIE ONNX Parser IsInf op has invalide attributes");
+
+   std::unique_ptr<ROperator> op;
+   std::string output_name = nodeproto.output(0);
+
+   // every remaining combination maps to an operator, so op is never returned empty
+   if (detect_negative && detect_positive)
+      op.reset(new ROperator_Basic_Is<Op>(input_name, output_name));
+   else if (detect_positive)
+      op.reset(new ROperator_Basic_Is<EBasicIsOperator::kIsInfPos>(input_name, output_name));
+   else
+      op.reset(new ROperator_Basic_Is<EBasicIsOperator::kIsInfNeg>(input_name, output_name));
+
+   // Register the output type (is always BOOL)
+   if (!parser.IsRegisteredTensorType(output_name)) {
+      parser.RegisterTensorType(output_name, ETensorType::BOOL);
+   }
+
+   return op;
+}
+
+// IsNaN: handled by the generic ParseBasicIs, instantiated for kIsNaN
+ParserFuncSignature ParseIsNaN = [](RModelParser_ONNX &onnxParser, const onnx::NodeProto &node) {
+   return ParseBasicIs<EBasicIsOperator::kIsNaN>(onnxParser, node);
+};
+
+// IsInf: handled by the generic ParseBasicIs, instantiated for kIsInf
+ParserFuncSignature ParseIsInf = [](RModelParser_ONNX &onnxParser, const onnx::NodeProto &node) {
+   return ParseBasicIs<EBasicIsOperator::kIsInf>(onnxParser, node);
+};
+
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParseBasicNary.cxx b/parsers/src/ParseBasicNary.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseBasicNary.cxx
rename to parsers/src/ParseBasicNary.cxx
diff --git a/src/SOFIE_parsers/src/ParseBasicUnary.cxx b/parsers/src/ParseBasicUnary.cxx
similarity index 82%
rename from src/SOFIE_parsers/src/ParseBasicUnary.cxx
rename to parsers/src/ParseBasicUnary.cxx
index 1470f26..40d0225 100644
--- a/src/SOFIE_parsers/src/ParseBasicUnary.cxx
+++ b/parsers/src/ParseBasicUnary.cxx
@@ -79,5 +79,20 @@ ParserFuncSignature ParseAbs = [](RModelParser_ONNX &parser, const onnx::NodePro
    return ParseBasicUnary<EBasicUnaryOperator::kAbs>(parser, nodeproto);
 };
 
+// Softplus: handled by the generic ParseBasicUnary, instantiated for kSoftplus
+ParserFuncSignature ParseSoftplus = [](RModelParser_ONNX &onnxParser, const onnx::NodeProto &node) {
+   return ParseBasicUnary<EBasicUnaryOperator::kSoftplus>(onnxParser, node);
+};
+
+// Atan: handled by the generic ParseBasicUnary, instantiated for kAtan
+ParserFuncSignature ParseAtan = [](RModelParser_ONNX &onnxParser, const onnx::NodeProto &node) {
+   return ParseBasicUnary<EBasicUnaryOperator::kAtan>(onnxParser, node);
+};
+
+// Floor: handled by the generic ParseBasicUnary, instantiated for kFloor
+ParserFuncSignature ParseFloor = [](RModelParser_ONNX &onnxParser, const onnx::NodeProto &node) {
+   return ParseBasicUnary<EBasicUnaryOperator::kFloor>(onnxParser, node);
+};
+
 } // namespace SOFIE
 
diff --git a/src/SOFIE_parsers/src/ParseBatchNormalization.cxx b/parsers/src/ParseBatchNormalization.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseBatchNormalization.cxx
rename to parsers/src/ParseBatchNormalization.cxx
diff --git a/src/SOFIE_parsers/src/ParseCast.cxx b/parsers/src/ParseCast.cxx
similarity index 78%
rename from src/SOFIE_parsers/src/ParseCast.cxx
rename to parsers/src/ParseCast.cxx
index 7685421..a0993d4 100644
--- a/src/SOFIE_parsers/src/ParseCast.cxx
+++ b/parsers/src/ParseCast.cxx
@@ -13,20 +13,19 @@ ParserFuncSignature ParseCast = [](RModelParser_ONNX &parser, const onnx::NodePr
    }
 
    std::unique_ptr<ROperator> op;
-   std::string attr_type;
+   ETensorType attr_type;
 
    for (int_t i = 0; i < nodeproto.attribute_size(); i++) {
       std::string attribute_name = nodeproto.attribute(i).name();
       if (attribute_name == "to")
-         attr_type = ConvertTypeToString(static_cast<ETensorType>(nodeproto.attribute(i).i()));
+         attr_type = static_cast<ETensorType>(nodeproto.attribute(i).i());
    }
 
    std::string output_name = nodeproto.output(0);
    op.reset(new ROperator_Cast(attr_type, nodeproto.input(0), output_name));
 
    if (!parser.IsRegisteredTensorType(output_name)) {
-      ETensorType output_type = ConvertStringToType(attr_type);
-      parser.RegisterTensorType(output_name, output_type);
+      parser.RegisterTensorType(output_name, attr_type);
    }
 
    return op;
diff --git a/parsers/src/ParseClip.cxx b/parsers/src/ParseClip.cxx
new file mode 100644
index 0000000..4424c76
--- /dev/null
+++ b/parsers/src/ParseClip.cxx
@@ -0,0 +1,46 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_Clip.hxx"
+#include "onnx_proto3.pb.h"
+
+namespace SOFIE {
+
+ParserFuncSignature ParseClip = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto)
+{
+   // Translates an ONNX Clip node into an ROperator_Clip<T>; T follows the registered type of the
+   // data tensor (input 0). Throws std::runtime_error for an unregistered or unsupported type.
+   std::string data_name = nodeproto.input(0);
+   if (!parser.IsRegisteredTensorType(data_name)) {
+      throw std::runtime_error("SOFIE ONNX Parser Clip op has input tensor " + data_name +
+                               " but its type is not yet registered");
+   }
+   const ETensorType data_type = parser.GetTensorType(data_name);
+
+   std::string result_name = nodeproto.output(0);
+
+   // ONNX opset 11+: min (input 1) and max (input 2) are optional tensor inputs. A slot that is
+   // missing or holds an empty name is passed on as an empty string.
+   auto optional_input = [&nodeproto](int slot) -> std::string {
+      return (nodeproto.input_size() > slot && !nodeproto.input(slot).empty()) ? nodeproto.input(slot) : "";
+   };
+   std::string lower_name = optional_input(1);
+   std::string upper_name = optional_input(2);
+
+   std::unique_ptr<ROperator> op;
+   if (data_type == ETensorType::FLOAT) {
+      op.reset(new ROperator_Clip<float>(data_name, result_name, lower_name, upper_name));
+   } else if (data_type == ETensorType::DOUBLE) {
+      op.reset(new ROperator_Clip<double>(data_name, result_name, lower_name, upper_name));
+   } else {
+      throw std::runtime_error("SOFIE ONNX Parser Clip op does not yet support input type " +
+                               std::to_string(static_cast<int>(data_type)));
+   }
+
+   // The output is registered with the data tensor's type when it has no type yet.
+   if (!parser.IsRegisteredTensorType(result_name)) {
+      parser.RegisterTensorType(result_name, data_type);
+   }
+
+   return op;
+};
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParseComparision.cxx b/parsers/src/ParseComparision.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseComparision.cxx
rename to parsers/src/ParseComparision.cxx
diff --git a/src/SOFIE_parsers/src/ParseConcat.cxx b/parsers/src/ParseConcat.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseConcat.cxx
rename to parsers/src/ParseConcat.cxx
diff --git a/src/SOFIE_parsers/src/ParseConstant.cxx b/parsers/src/ParseConstant.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseConstant.cxx
rename to parsers/src/ParseConstant.cxx
diff --git a/src/SOFIE_parsers/src/ParseConv.cxx b/parsers/src/ParseConv.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseConv.cxx
rename to parsers/src/ParseConv.cxx
diff --git a/src/SOFIE_parsers/src/ParseConvTranspose.cxx b/parsers/src/ParseConvTranspose.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseConvTranspose.cxx
rename to parsers/src/ParseConvTranspose.cxx
diff --git a/src/SOFIE_parsers/src/ParseEinsum.cxx b/parsers/src/ParseEinsum.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseEinsum.cxx
rename to parsers/src/ParseEinsum.cxx
diff --git a/src/SOFIE_parsers/src/ParseElu.cxx b/parsers/src/ParseElu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseElu.cxx
rename to parsers/src/ParseElu.cxx
diff --git a/src/SOFIE_parsers/src/ParseErf.cxx b/parsers/src/ParseErf.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseErf.cxx
rename to parsers/src/ParseErf.cxx
diff --git a/src/SOFIE_parsers/src/ParseExpand.cxx b/parsers/src/ParseExpand.cxx
similarity index 81%
rename from src/SOFIE_parsers/src/ParseExpand.cxx
rename to parsers/src/ParseExpand.cxx
index c4ed54f..0793880 100644
--- a/src/SOFIE_parsers/src/ParseExpand.cxx
+++ b/parsers/src/ParseExpand.cxx
@@ -35,9 +35,19 @@ ParserFuncSignature ParseExpand = [](RModelParser_ONNX &parser, const onnx::Node
       case ETensorType::FLOAT:
          op.reset(new ROperator_Expand<float>(input_name, shape_name, output_name));
          break;
+      case ETensorType::DOUBLE:
+         op.reset(new ROperator_Expand<double>(input_name, shape_name, output_name));
+         break;
+      case ETensorType::INT32:
+         op.reset(new ROperator_Expand<int32_t>(input_name, shape_name, output_name));
+         break;
       case ETensorType::INT64:
          op.reset(new ROperator_Expand<int64_t>(input_name, shape_name, output_name));
          break;
+      case ETensorType::BOOL:
+      case ETensorType::UINT8:
+         op.reset(new ROperator_Expand<uint8_t>(input_name, shape_name, output_name));
+         break;
       default:
          throw std::runtime_error("TMVA::SOFIE - Unsupported - Expand Operator does "
                              "not support input type " +
diff --git a/src/SOFIE_parsers/src/ParseEyeLike.cxx b/parsers/src/ParseEyeLike.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseEyeLike.cxx
rename to parsers/src/ParseEyeLike.cxx
diff --git a/src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx b/parsers/src/ParseFuseBatchnormRelu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseFuseBatchnormRelu.cxx
rename to parsers/src/ParseFuseBatchnormRelu.cxx
diff --git a/src/SOFIE_parsers/src/ParseFuseConvAdd.cxx b/parsers/src/ParseFuseConvAdd.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseFuseConvAdd.cxx
rename to parsers/src/ParseFuseConvAdd.cxx
diff --git a/src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx b/parsers/src/ParseFuseConvTransposeAdd.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseFuseConvTransposeAdd.cxx
rename to parsers/src/ParseFuseConvTransposeAdd.cxx
diff --git a/src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx b/parsers/src/ParseFuseGemmRelu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseFuseGemmRelu.cxx
rename to parsers/src/ParseFuseGemmRelu.cxx
diff --git a/src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx b/parsers/src/ParseFuseMatMulAdd.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseFuseMatMulAdd.cxx
rename to parsers/src/ParseFuseMatMulAdd.cxx
diff --git a/src/SOFIE_parsers/src/ParseGRU.cxx b/parsers/src/ParseGRU.cxx
similarity index 97%
rename from src/SOFIE_parsers/src/ParseGRU.cxx
rename to parsers/src/ParseGRU.cxx
index ec2cddf..58ce983 100644
--- a/src/SOFIE_parsers/src/ParseGRU.cxx
+++ b/parsers/src/ParseGRU.cxx
@@ -46,7 +46,7 @@ ParserFuncSignature ParseGRU = [](RModelParser_ONNX &parser, const onnx::NodePro
       } else if (attribute_name == "linear_before_reset") {
          attr_linear_before_reset = nodeproto.attribute(i).i();
       } else {
-         std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
+         std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
                    << nodeproto.name() << " is not defined in ONNX IR and not applied!\n";
       }
    }
diff --git a/src/SOFIE_parsers/src/ParseGather.cxx b/parsers/src/ParseGather.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseGather.cxx
rename to parsers/src/ParseGather.cxx
diff --git a/parsers/src/ParseGatherND.cxx b/parsers/src/ParseGatherND.cxx
new file mode 100644
index 0000000..57beb01
--- /dev/null
+++ b/parsers/src/ParseGatherND.cxx
@@ -0,0 +1,49 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_GatherND.hxx"
+#include "onnx_proto3.pb.h"
+#include <stdexcept>
+
+
+namespace SOFIE {
+
+ParserFuncSignature ParseGatherND = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   // Parses an ONNX GatherND node (input 0: data, input 1: indices, attribute batch_dims, default 0)
+   // into an ROperator_GatherND. Throws std::runtime_error on an unregistered data type or on
+   // indices that are registered with a type other than INT64.
+   auto input_name = nodeproto.input(0);
+   if (!parser.IsRegisteredTensorType(input_name)) {
+      throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op has input tensor " + input_name +
+                               " but its type is not yet registered");
+   }
+   const ETensorType input_type = parser.GetTensorType(input_name);
+
+   auto indices_name = nodeproto.input(1);
+   if (parser.IsRegisteredTensorType(indices_name)) {
+      ETensorType indices_type = parser.GetTensorType(indices_name);
+      if (indices_type != ETensorType::INT64) {
+         // Report the offending type; the tensor name alone does not say what was found.
+         throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op indices tensor " + indices_name +
+                                  " must be INT64, got " + ConvertTypeToString(indices_type));
+      }
+   }
+
+   int64_t batch_dims = 0;
+   for (int i = 0; i < nodeproto.attribute_size(); ++i) {
+      const auto& attr = nodeproto.attribute(i);
+      if (attr.name() == "batch_dims") {
+         batch_dims = attr.i();
+         break;
+      }
+   }
+
+   std::string output_name = nodeproto.output(0);
+   std::unique_ptr<ROperator> op(new ROperator_GatherND(batch_dims, input_name, indices_name, output_name));
+
+   if (!parser.IsRegisteredTensorType(output_name)) {
+      parser.RegisterTensorType(output_name, input_type);
+   }
+
+   return op;
+};
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParseGemm.cxx b/parsers/src/ParseGemm.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseGemm.cxx
rename to parsers/src/ParseGemm.cxx
diff --git a/src/SOFIE_parsers/src/ParseIdentity.cxx b/parsers/src/ParseIdentity.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseIdentity.cxx
rename to parsers/src/ParseIdentity.cxx
diff --git a/src/SOFIE_parsers/src/ParseIf.cxx b/parsers/src/ParseIf.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseIf.cxx
rename to parsers/src/ParseIf.cxx
diff --git a/src/SOFIE_parsers/src/ParseLSTM.cxx b/parsers/src/ParseLSTM.cxx
similarity index 97%
rename from src/SOFIE_parsers/src/ParseLSTM.cxx
rename to parsers/src/ParseLSTM.cxx
index b9dc165..a95ee01 100644
--- a/src/SOFIE_parsers/src/ParseLSTM.cxx
+++ b/parsers/src/ParseLSTM.cxx
@@ -46,7 +46,7 @@ ParserFuncSignature ParseLSTM = [](RModelParser_ONNX &parser, const onnx::NodePr
       } else if (attribute_name == "layout") {
          attr_layout = nodeproto.attribute(i).i();
       } else {
-         std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
+         std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
                    << nodeproto.name() << " is not defined in ONNX IR and not applied!\n";
       }
    }
diff --git a/src/SOFIE_parsers/src/ParseLayerNormalization.cxx b/parsers/src/ParseLayerNormalization.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseLayerNormalization.cxx
rename to parsers/src/ParseLayerNormalization.cxx
diff --git a/src/SOFIE_parsers/src/ParseLeakyRelu.cxx b/parsers/src/ParseLeakyRelu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseLeakyRelu.cxx
rename to parsers/src/ParseLeakyRelu.cxx
diff --git a/parsers/src/ParseLogic.cxx b/parsers/src/ParseLogic.cxx
new file mode 100644
index 0000000..1609678
--- /dev/null
+++ b/parsers/src/ParseLogic.cxx
@@ -0,0 +1,181 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_Logic.hxx"
+#include "onnx_proto3.pb.h"
+
+namespace SOFIE {
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Helper: parse a binary logical op (And / Or / Xor)
+//
+// ONNX spec: both inputs are bool; output is bool.
+// In SOFIE, BOOL tensors are stored as uint8_t.
+// ─────────────────────────────────────────────────────────────────────────────
+
+template <ELogicBinaryOp Op>
+static std::unique_ptr<ROperator> ParseLogicalBinary(RModelParser_ONNX &parser,
+                                                      const onnx::NodeProto  &nodeproto)
+{
+   const std::string input_a   = nodeproto.input(0);
+   const std::string input_b   = nodeproto.input(1);
+   const std::string output    = nodeproto.output(0);
+   // Each operand must already have a registered type, and that type must be BOOL or UINT8.
+   for (const auto &name : { input_a, input_b }) {
+      if (!parser.IsRegisteredTensorType(name))
+         throw std::runtime_error(
+            "TMVA::SOFIE ONNX Parser " +
+            LogicBinaryTrait<uint8_t, Op>::Name() +
+            ": input tensor '" + name + "' type not yet registered");
+      ETensorType t = parser.GetTensorType(name);
+      if (t != ETensorType::BOOL && t != ETensorType::UINT8)
+         throw std::runtime_error(
+            "TMVA::SOFIE ONNX Parser " +
+            LogicBinaryTrait<uint8_t, Op>::Name() +
+            ": input '" + name + "' must be bool, got " +
+            ConvertTypeToString(t));
+   }
+   // The operator is always instantiated on uint8_t, whichever of the two accepted types was seen.
+   std::unique_ptr<ROperator> op(
+      new ROperator_LogicBinary<uint8_t, Op>(input_a, input_b, output));
+   // The result is registered as BOOL unless a type for it is already known.
+   if (!parser.IsRegisteredTensorType(output))
+      parser.RegisterTensorType(output, ETensorType::BOOL);
+
+   return op;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Helper: parse a binary bitwise op (BitwiseAnd / BitwiseOr / BitwiseXor)
+//
+// ONNX spec: inputs can be any integer type; output has same type.
+// ─────────────────────────────────────────────────────────────────────────────
+
+template <ELogicBinaryOp Op>
+static std::unique_ptr<ROperator> ParseBitwiseBinary(RModelParser_ONNX &parser,
+                                                      const onnx::NodeProto  &nodeproto)
+{
+   // Throws std::runtime_error if an operand is unregistered, the operand types differ or the type is unsupported.
+   const std::string input_a   = nodeproto.input(0);
+   const std::string input_b   = nodeproto.input(1);
+   const std::string output    = nodeproto.output(0);
+   const std::string prefix    = "TMVA::SOFIE ONNX Parser " + LogicBinaryTrait<int32_t, Op>::Name();
+
+   // Both operands are validated; ONNX requires them to share one integer element type.
+   for (const auto &name : { input_a, input_b }) {
+      if (!parser.IsRegisteredTensorType(name))
+         throw std::runtime_error(prefix + ": input tensor '" + name + "' type not yet registered");
+   }
+   const ETensorType input_type = parser.GetTensorType(input_a);
+   if (parser.GetTensorType(input_b) != input_type)
+      throw std::runtime_error(prefix + ": inputs '" + input_a + "' and '" + input_b + "' differ in type");
+
+   std::unique_ptr<ROperator> op;
+   switch (input_type) {
+      case ETensorType::INT8:
+         op.reset(new ROperator_LogicBinary<int8_t,   Op>(input_a, input_b, output)); break;
+      case ETensorType::UINT8:
+         op.reset(new ROperator_LogicBinary<uint8_t,  Op>(input_a, input_b, output)); break;
+      case ETensorType::INT16:
+         op.reset(new ROperator_LogicBinary<int16_t,  Op>(input_a, input_b, output)); break;
+      case ETensorType::UINT16:
+         op.reset(new ROperator_LogicBinary<uint16_t, Op>(input_a, input_b, output)); break;
+      case ETensorType::INT32:
+         op.reset(new ROperator_LogicBinary<int32_t,  Op>(input_a, input_b, output)); break;
+      case ETensorType::UINT32:
+         op.reset(new ROperator_LogicBinary<uint32_t, Op>(input_a, input_b, output)); break;
+      case ETensorType::INT64:
+         op.reset(new ROperator_LogicBinary<int64_t,  Op>(input_a, input_b, output)); break;
+      case ETensorType::UINT64:
+         op.reset(new ROperator_LogicBinary<uint64_t, Op>(input_a, input_b, output)); break;
+      default:
+         throw std::runtime_error(prefix + ": unsupported input type " + ConvertTypeToString(input_type));
+   }
+
+   if (!parser.IsRegisteredTensorType(output))
+      parser.RegisterTensorType(output, input_type);
+
+   return op;
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Logical binary parsers
+// ─────────────────────────────────────────────────────────────────────────────
+
+ParserFuncSignature ParseAnd = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseLogicalBinary<ELogicBinaryOp::And>(parser, nodeproto); // BOOL/UINT8 in, BOOL out
+};
+
+ParserFuncSignature ParseOr = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseLogicalBinary<ELogicBinaryOp::Or>(parser, nodeproto); // BOOL/UINT8 in, BOOL out
+};
+
+ParserFuncSignature ParseXor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseLogicalBinary<ELogicBinaryOp::Xor>(parser, nodeproto); // BOOL/UINT8 in, BOOL out
+};
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Bitwise binary parsers
+// ─────────────────────────────────────────────────────────────────────────────
+
+ParserFuncSignature ParseBitwiseAnd = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseBitwiseBinary<ELogicBinaryOp::BitwiseAnd>(parser, nodeproto); // output type follows input 0
+};
+
+ParserFuncSignature ParseBitwiseOr = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseBitwiseBinary<ELogicBinaryOp::BitwiseOr>(parser, nodeproto); // output type follows input 0
+};
+
+ParserFuncSignature ParseBitwiseXor = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseBitwiseBinary<ELogicBinaryOp::BitwiseXor>(parser, nodeproto); // output type follows input 0
+};
+
+// ─────────────────────────────────────────────────────────────────────────────
+// BitwiseNot parser
+//
+// ONNX spec: any integer type; output same type as input.
+// ─────────────────────────────────────────────────────────────────────────────
+
+ParserFuncSignature ParseBitwiseNot = [](RModelParser_ONNX &parser,
+                                          const onnx::NodeProto  &nodeproto)
+   -> std::unique_ptr<ROperator>
+{
+   const std::string input_name  = nodeproto.input(0);
+   const std::string output_name = nodeproto.output(0);
+   // The input type selects the template instantiation below, so it must already be registered.
+   if (!parser.IsRegisteredTensorType(input_name))
+      throw std::runtime_error(
+         "TMVA::SOFIE ONNX Parser BitwiseNot: input tensor '" +
+         input_name + "' type not yet registered");
+
+   const ETensorType input_type = parser.GetTensorType(input_name);
+   // One instantiation per supported integer element type; any other type throws std::runtime_error.
+   std::unique_ptr<ROperator> op;
+   switch (input_type) {
+      case ETensorType::INT8:
+         op.reset(new ROperator_BitwiseNot<int8_t>  (input_name, output_name)); break;
+      case ETensorType::UINT8:
+         op.reset(new ROperator_BitwiseNot<uint8_t> (input_name, output_name)); break;
+      case ETensorType::INT16:
+         op.reset(new ROperator_BitwiseNot<int16_t> (input_name, output_name)); break;
+      case ETensorType::UINT16:
+         op.reset(new ROperator_BitwiseNot<uint16_t>(input_name, output_name)); break;
+      case ETensorType::INT32:
+         op.reset(new ROperator_BitwiseNot<int32_t> (input_name, output_name)); break;
+      case ETensorType::UINT32:
+         op.reset(new ROperator_BitwiseNot<uint32_t>(input_name, output_name)); break;
+      case ETensorType::INT64:
+         op.reset(new ROperator_BitwiseNot<int64_t> (input_name, output_name)); break;
+      case ETensorType::UINT64:
+         op.reset(new ROperator_BitwiseNot<uint64_t>(input_name, output_name)); break;
+      default:
+         throw std::runtime_error(
+            "TMVA::SOFIE ONNX Parser BitwiseNot: unsupported input type " +
+            ConvertTypeToString(input_type));
+   }
+   // The output is registered with the input's element type unless it already has a type.
+   if (!parser.IsRegisteredTensorType(output_name))
+      parser.RegisterTensorType(output_name, input_type);
+
+   return op;
+};
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParseMatMul.cxx b/parsers/src/ParseMatMul.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseMatMul.cxx
rename to parsers/src/ParseMatMul.cxx
diff --git a/parsers/src/ParseNot.cxx b/parsers/src/ParseNot.cxx
new file mode 100644
index 0000000..ca315eb
--- /dev/null
+++ b/parsers/src/ParseNot.cxx
@@ -0,0 +1,38 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_Not.hxx"
+#include "onnx_proto3.pb.h"
+
+namespace SOFIE {
+
+ParserFuncSignature ParseNot = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto)
+{
+   // Builds an ROperator_Not from an ONNX "Not" node: exactly one input and one output are required,
+   // and the input must be registered as BOOL or UINT8; any violation throws std::runtime_error.
+   if (nodeproto.input_size() != 1 || nodeproto.output_size() != 1)
+      throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input or output size ");
+
+   std::string input_name = nodeproto.input(0);
+
+   ETensorType input_type = ETensorType::UNDEFINED;
+   if (parser.IsRegisteredTensorType(input_name)) {
+      input_type = parser.GetTensorType(input_name);
+      if (input_type != ETensorType::BOOL && input_type != ETensorType::UINT8)
+         throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has invalid input type " + ConvertTypeToString(input_type));
+   } else {
+      throw std::runtime_error("TMVA::SOFIE ONNX Parser Not op has input tensor " + input_name +
+                               " but its type is not yet registered");
+   }
+
+   std::string output_name = nodeproto.output(0);
+   std::unique_ptr<ROperator> op(new ROperator_Not(input_name, output_name));
+
+   // The output is registered with the validated input type when it has no type yet
+   if (!parser.IsRegisteredTensorType(output_name)) {
+      parser.RegisterTensorType(output_name, input_type);
+   }
+
+   return op;
+};
+
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParsePad.cxx b/parsers/src/ParsePad.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParsePad.cxx
rename to parsers/src/ParsePad.cxx
diff --git a/src/SOFIE_parsers/src/ParsePool.cxx b/parsers/src/ParsePool.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParsePool.cxx
rename to parsers/src/ParsePool.cxx
diff --git a/src/SOFIE_parsers/src/ParseRNN.cxx b/parsers/src/ParseRNN.cxx
similarity index 96%
rename from src/SOFIE_parsers/src/ParseRNN.cxx
rename to parsers/src/ParseRNN.cxx
index d75b577..2d20e15 100644
--- a/src/SOFIE_parsers/src/ParseRNN.cxx
+++ b/parsers/src/ParseRNN.cxx
@@ -43,7 +43,7 @@ ParserFuncSignature ParseRNN = [](RModelParser_ONNX &parser, const onnx::NodePro
       } else if (attribute_name == "layout") {
          attr_layout = nodeproto.attribute(i).i();
       } else {
-         std::cout << "TMVA SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
+         std::cout << "SOFIE Warning - Model Loading - Attribute " << attribute_name << " in OperatorNode "
                    << nodeproto.name() << " is not defined in ONNX IR and not applied!\n";
       }
    }
diff --git a/src/SOFIE_parsers/src/ParseRandom.cxx b/parsers/src/ParseRandom.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseRandom.cxx
rename to parsers/src/ParseRandom.cxx
diff --git a/src/SOFIE_parsers/src/ParseRange.cxx b/parsers/src/ParseRange.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseRange.cxx
rename to parsers/src/ParseRange.cxx
diff --git a/src/SOFIE_parsers/src/ParseReduce.cxx b/parsers/src/ParseReduce.cxx
similarity index 86%
rename from src/SOFIE_parsers/src/ParseReduce.cxx
rename to parsers/src/ParseReduce.cxx
index 45696a5..63ff834 100644
--- a/src/SOFIE_parsers/src/ParseReduce.cxx
+++ b/parsers/src/ParseReduce.cxx
@@ -21,6 +21,10 @@ std::unique_ptr<ROperator> ParseReduce(RModelParser_ONNX &parser, const onnx::No
       op_mode = ReduceProd;
    else if (nodeproto.op_type() == "ReduceSum")
       op_mode = ReduceSum;
+   else if (nodeproto.op_type() == "ReduceL2")
+      op_mode = ReduceL2;
+   else if (nodeproto.op_type() == "ReduceMax")
+      op_mode = ReduceMax;
 
    if (op_mode == InvalidReduceOp) {
       throw std::runtime_error("TMVA::SOFIE - Reduce op mode not supported.");
@@ -91,5 +95,15 @@ ParserFuncSignature ParseReduceSum = [](RModelParser_ONNX &parser, const onnx::N
    return ParseReduce<EReduceOpMode::ReduceSum>(parser, nodeproto);
 };
 
+// Parse ReduceL2: forwards to the shared ParseReduce helper instantiated for EReduceOpMode::ReduceL2
+ParserFuncSignature ParseReduceL2 = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseReduce<EReduceOpMode::ReduceL2>(parser, nodeproto);
+};
+
+// Parse ReduceMax: forwards to the shared ParseReduce helper instantiated for EReduceOpMode::ReduceMax
+ParserFuncSignature ParseReduceMax = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) {
+   return ParseReduce<EReduceOpMode::ReduceMax>(parser, nodeproto);
+};
+
 } // namespace SOFIE
 
diff --git a/src/SOFIE_parsers/src/ParseRelu.cxx b/parsers/src/ParseRelu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseRelu.cxx
rename to parsers/src/ParseRelu.cxx
diff --git a/src/SOFIE_parsers/src/ParseReshape.cxx b/parsers/src/ParseReshape.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseReshape.cxx
rename to parsers/src/ParseReshape.cxx
diff --git a/src/SOFIE_parsers/src/ParseScatterElements.cxx b/parsers/src/ParseScatterElements.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseScatterElements.cxx
rename to parsers/src/ParseScatterElements.cxx
diff --git a/src/SOFIE_parsers/src/ParseSelu.cxx b/parsers/src/ParseSelu.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseSelu.cxx
rename to parsers/src/ParseSelu.cxx
diff --git a/src/SOFIE_parsers/src/ParseShape.cxx b/parsers/src/ParseShape.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseShape.cxx
rename to parsers/src/ParseShape.cxx
diff --git a/src/SOFIE_parsers/src/ParseSigmoid.cxx b/parsers/src/ParseSigmoid.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseSigmoid.cxx
rename to parsers/src/ParseSigmoid.cxx
diff --git a/src/SOFIE_parsers/src/ParseSlice.cxx b/parsers/src/ParseSlice.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseSlice.cxx
rename to parsers/src/ParseSlice.cxx
diff --git a/src/SOFIE_parsers/src/ParseSoftmax.cxx b/parsers/src/ParseSoftmax.cxx
similarity index 91%
rename from src/SOFIE_parsers/src/ParseSoftmax.cxx
rename to parsers/src/ParseSoftmax.cxx
index aea042e..19bd57a 100644
--- a/src/SOFIE_parsers/src/ParseSoftmax.cxx
+++ b/parsers/src/ParseSoftmax.cxx
@@ -24,7 +24,7 @@ ParserFuncSignature ParseSoftmax = [](RModelParser_ONNX &parser, const onnx::Nod
       attr_axis = nodeproto.attribute(0).i();
 
    switch (input_type) {
-   case ETensorType::FLOAT: op.reset(new ROperator_Softmax<float>(attr_axis, input_name, output_name)); break;
+   case ETensorType::FLOAT: op.reset(new ROperator_Softmax(attr_axis, input_name, output_name)); break;
    default:
       throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Softmax does not yet support input type " +
                                std::to_string(static_cast<int>(input_type)));
diff --git a/src/SOFIE_parsers/src/ParseSplit.cxx b/parsers/src/ParseSplit.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseSplit.cxx
rename to parsers/src/ParseSplit.cxx
diff --git a/src/SOFIE_parsers/src/ParseTanh.cxx b/parsers/src/ParseTanh.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseTanh.cxx
rename to parsers/src/ParseTanh.cxx
diff --git a/src/SOFIE_parsers/src/ParseTile.cxx b/parsers/src/ParseTile.cxx
similarity index 93%
rename from src/SOFIE_parsers/src/ParseTile.cxx
rename to parsers/src/ParseTile.cxx
index 20dbfb6..8b8c47f 100644
--- a/src/SOFIE_parsers/src/ParseTile.cxx
+++ b/parsers/src/ParseTile.cxx
@@ -29,6 +29,7 @@ ParserFuncSignature ParseTile = [](RModelParser_ONNX &parser, const onnx::NodePr
 
    switch (input_type) {
    case ETensorType::FLOAT: op.reset(new ROperator_Tile<float>(repeat_name, input_name, output_name)); break;
+   case ETensorType::INT64: op.reset(new ROperator_Tile<int64_t>(repeat_name, input_name, output_name)); break;
    default:
       throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Tile does not yet support input type " +
                                std::to_string(static_cast<int>(input_type)));
diff --git a/src/SOFIE_parsers/src/ParseTopK.cxx b/parsers/src/ParseTopK.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseTopK.cxx
rename to parsers/src/ParseTopK.cxx
diff --git a/src/SOFIE_parsers/src/ParseTranspose.cxx b/parsers/src/ParseTranspose.cxx
similarity index 100%
rename from src/SOFIE_parsers/src/ParseTranspose.cxx
rename to parsers/src/ParseTranspose.cxx
diff --git a/parsers/src/ParseTrilu.cxx b/parsers/src/ParseTrilu.cxx
new file mode 100644
index 0000000..c196caf
--- /dev/null
+++ b/parsers/src/ParseTrilu.cxx
@@ -0,0 +1,67 @@
+#include "SOFIE/RModelParser_ONNX.hxx"
+#include "SOFIE/ROperator_Trilu.hxx"
+#include "onnx_proto3.pb.h"
+
+namespace SOFIE {
+
+ParserFuncSignature ParseTrilu = [](RModelParser_ONNX &parser,
+                                    const onnx::NodeProto  &nodeproto)
+   -> std::unique_ptr<ROperator>
+{
+   // ── Validate primary input (throws std::runtime_error if its type is unknown) ──
+   const std::string input_name = nodeproto.input(0);
+   if (!parser.IsRegisteredTensorType(input_name))
+      throw std::runtime_error(
+         "TMVA::SOFIE ONNX Parser Trilu: input tensor '" + input_name +
+         "' type not yet registered");
+
+   const ETensorType input_type = parser.GetTensorType(input_name);
+   const std::string output_name = nodeproto.output(0);
+
+   // ── Parse 'upper' attribute (default 1) ───────────────────────────────
+   int attr_upper = 1;
+   for (int i = 0; i < nodeproto.attribute_size(); ++i) {
+      if (nodeproto.attribute(i).name() == "upper")
+         attr_upper = static_cast<int>(nodeproto.attribute(i).i());
+   }
+
+   // ── Optional k input (second input, scalar int64) ─────────────────────
+   std::string k_name;
+   if (nodeproto.input_size() > 1 && !nodeproto.input(1).empty()) {
+      k_name = nodeproto.input(1);
+      // Register k tensor type if not yet seen (it is always int64).
+      if (!parser.IsRegisteredTensorType(k_name))
+         parser.RegisterTensorType(k_name, ETensorType::INT64);
+   }
+
+   // ── Create operator (templated on the primary input type) ──────────────
+   std::unique_ptr<ROperator> op;
+   // Templated lambda (C++20): picks the constructor with or without the k tensor for element type T.
+   auto make_op = [&]<typename T>() {
+      if (k_name.empty())
+         op.reset(new ROperator_Trilu<T>(attr_upper, input_name, output_name));
+      else
+         op.reset(new ROperator_Trilu<T>(attr_upper, input_name, k_name, output_name));
+   };
+   // BOOL and UINT8 share the uint8_t instantiation; types not listed below are rejected.
+   switch (input_type) {
+      case ETensorType::FLOAT:   make_op.template operator()<float>();    break;
+      case ETensorType::DOUBLE:  make_op.template operator()<double>();   break;
+      case ETensorType::INT32:   make_op.template operator()<int32_t>();  break;
+      case ETensorType::INT64:   make_op.template operator()<int64_t>();  break;
+      case ETensorType::UINT8:   make_op.template operator()<uint8_t>();  break;
+      case ETensorType::BOOL:    make_op.template operator()<uint8_t>();  break;
+      default:
+         throw std::runtime_error(
+            "TMVA::SOFIE ONNX Parser Trilu: unsupported input type " +
+            std::to_string(static_cast<int>(input_type)));
+   }
+
+   // ── Register output type ───────────────────────────────────────────────
+   if (!parser.IsRegisteredTensorType(output_name))
+      parser.RegisterTensorType(output_name, input_type);
+
+   return op;
+};
+
+} // namespace SOFIE
diff --git a/src/SOFIE_parsers/src/ParseWhere.cxx b/parsers/src/ParseWhere.cxx
similarity index 80%
rename from src/SOFIE_parsers/src/ParseWhere.cxx
rename to parsers/src/ParseWhere.cxx
index ea73cff..636c7e2 100644
--- a/src/SOFIE_parsers/src/ParseWhere.cxx
+++ b/parsers/src/ParseWhere.cxx
@@ -11,6 +11,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP
       throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has invalid input size");
    }
    // condition boolean vector is input 0
+   if (!parser.IsRegisteredTensorType(nodeproto.input(0))){
+      throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " + nodeproto.input(0)
+                                + " but its type is not yet registered");
+   }
    if (!parser.IsRegisteredTensorType(nodeproto.input(1))){
       throw std::runtime_error("TMVA::SOFIE ONNX Parser Where op has input tensor " +  nodeproto.input(1)
                                 + " but its type is not yet registered");
@@ -31,10 +35,10 @@ ParserFuncSignature ParseWhere = [](RModelParser_ONNX &parser, const onnx::NodeP
 
    switch (input_type) {
    case ETensorType::FLOAT:
-      op.reset(new ROperator_Where<float>(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name));
+      op.reset(new ROperator_Where<float>(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name));
       break;
    case ETensorType::INT64:
-      op.reset(new ROperator_Where<int64_t>(nodeproto.input(1), nodeproto.input(2), nodeproto.input(0), output_name));
+      op.reset(new ROperator_Where<int64_t>(nodeproto.input(0), nodeproto.input(1), nodeproto.input(2), output_name));
       break;
    default:
       throw std::runtime_error("TMVA::SOFIE - Unsupported - Where Operator does not yet support input type " +
diff --git a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx b/parsers/src/RModelParser_ONNX.cxx
similarity index 84%
rename from src/SOFIE_parsers/src/RModelParser_ONNX.cxx
rename to parsers/src/RModelParser_ONNX.cxx
index 68662ae..afb8b93 100644
--- a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx
+++ b/parsers/src/RModelParser_ONNX.cxx
@@ -1,4 +1,3 @@
-#include "Byteswap.h"
 #include "SOFIE/RModelParser_ONNX.hxx"
 #include "onnx_proto3.pb.h"
 
@@ -8,7 +7,12 @@
 #include <cassert>
 #include <iostream>
 #include <unordered_map>
+#include <unordered_set>
 #include <functional>
+#include <algorithm>
+#include <array>
+#include <bit>
+#include <cstring>
 #include "SOFIE/SOFIE_common.hxx"
 
 
@@ -24,6 +28,10 @@ extern ParserFuncSignature ParseLog;
 extern ParserFuncSignature ParseSin;
 extern ParserFuncSignature ParseCos;
 extern ParserFuncSignature ParseAbs;
+extern ParserFuncSignature ParseSoftplus;
+extern ParserFuncSignature ParseAtan;
+extern ParserFuncSignature ParseFloor;
+
 // Binary operators
 extern ParserFuncSignature ParseAdd;
 extern ParserFuncSignature ParseSub;
@@ -41,11 +49,18 @@ extern ParserFuncSignature ParseLess;
 extern ParserFuncSignature ParseLessEq;
 extern ParserFuncSignature ParseGreater;
 extern ParserFuncSignature ParseGreaterEq;
+// Is / Not operators and Clip
+extern ParserFuncSignature ParseIsInf;
+extern ParserFuncSignature ParseIsNaN;
+extern ParserFuncSignature ParseNot;
+extern ParserFuncSignature ParseClip;
 // Reduce operators
 extern ParserFuncSignature ParseReduceMean;
 extern ParserFuncSignature ParseReduceSum;
 extern ParserFuncSignature ParseReduceSumSquare;
 extern ParserFuncSignature ParseReduceProd;
+extern ParserFuncSignature ParseReduceL2;
+extern ParserFuncSignature ParseReduceMax;
 // Others
 extern ParserFuncSignature ParseBatchNormalization;
 extern ParserFuncSignature ParseConstant;
@@ -73,6 +88,7 @@ extern ParserFuncSignature ParseShape;
 extern ParserFuncSignature ParseMatMul;
 extern ParserFuncSignature ParseLayerNormalization;
 extern ParserFuncSignature ParseGather;
+extern ParserFuncSignature ParseGatherND;
 extern ParserFuncSignature ParseErf;
 extern ParserFuncSignature ParseElu;
 extern ParserFuncSignature ParseEyeLike;
@@ -86,6 +102,14 @@ extern ParserFuncSignature ParseWhere;
 extern ParserFuncSignature ParseEinsum;
 extern ParserFuncSignature ParseRandom;
 extern ParserFuncSignature ParseScatterElements;
+extern ParserFuncSignature ParseTrilu;
+extern ParserFuncSignature ParseAnd;
+extern ParserFuncSignature ParseOr;
+extern ParserFuncSignature ParseXor;
+extern ParserFuncSignature ParseBitwiseAnd;
+extern ParserFuncSignature ParseBitwiseOr;
+extern ParserFuncSignature ParseBitwiseXor;
+extern ParserFuncSignature ParseBitwiseNot;
 // Declaration of fused operators
 extern ParserFuseFuncSignature ParseFuseConvAdd;
 extern ParserFuseFuncSignature ParseFuseGemmRelu;
@@ -132,18 +156,31 @@ struct ExtractDataFromTP<int64_t> {
                                                             static_cast<int64_t *>(data));
    }
 };
+// Byte-order reversal for any trivially-copyable value. ONNX keeps raw_data in
+// little-endian order, so hosts with a different native order apply this per element.
+template <typename T>
+static T bswap_value(T value) noexcept {
+   static_assert(std::is_trivially_copyable_v<T>);
+   std::array<char, sizeof(T)> forward;
+   std::array<char, sizeof(T)> backward;
+   std::memcpy(forward.data(), &value, sizeof(T));
+   std::reverse_copy(forward.begin(), forward.end(), backward.begin());
+   std::memcpy(&value, backward.data(), sizeof(T));
+   return value;
+}
+
 template<typename T>
 std::shared_ptr<void> GetInitializedTensorData(onnx::TensorProto * tensorproto, size_t length) {
+   // length counts elements of type T; the malloc'ed buffer below is released with free() by the shared_ptr
    std::shared_ptr<void> data(malloc(length * sizeof(T)), free);
 
    if (!tensorproto->raw_data().empty()) {
-#ifdef R__BYTESWAP
       std::memcpy(data.get(), tensorproto->raw_data().c_str(), length * sizeof(T));
-#else
-      for (std::size_t k = 0; k < length; ++k)
-         (reinterpret_cast<typename RByteSwap<sizeof(T)>::value_type *>(data.get()))[k] =
-            RByteSwap<sizeof(T)>::bswap((reinterpret_cast<const typename RByteSwap<sizeof(T)>::value_type *>(tensorproto->raw_data().c_str()))[k]);
-#endif
+      if constexpr (std::endian::native != std::endian::little) {
+         T *ptr = static_cast<T *>(data.get());
+         for (std::size_t k = 0; k < length; ++k)
+            ptr[k] = bswap_value(ptr[k]);
+      }
    } else {
       ExtractDataFromTP<T>::Copy(tensorproto, data.get());
    }
@@ -162,6 +199,10 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un
    RegisterOperator("Sin", ParseSin);
    RegisterOperator("Cos", ParseCos);
    RegisterOperator("Abs", ParseAbs);
+   RegisterOperator("Softplus", ParseSoftplus);
+   RegisterOperator("Atan", ParseAtan);
+   RegisterOperator("Floor", ParseFloor);
+   
    // Binary operators
    RegisterOperator("Add", ParseAdd);
    RegisterOperator("Sub", ParseSub);
@@ -179,11 +220,18 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un
    RegisterOperator("LessOrEqual", ParseLessEq);
    RegisterOperator("Greater", ParseGreater);
    RegisterOperator("GreaterOrEqual", ParseGreaterEq);
+   // Is / Not operators and Clip
+   RegisterOperator("IsInf", ParseIsInf);
+   RegisterOperator("IsNaN", ParseIsNaN);
+   RegisterOperator("Not", ParseNot);
+   RegisterOperator("Clip", ParseClip);
    // Reduce operators
    RegisterOperator("ReduceMean", ParseReduceMean);
    RegisterOperator("ReduceSum", ParseReduceSum);
    RegisterOperator("ReduceSumSquare", ParseReduceSumSquare);
    RegisterOperator("ReduceProd", ParseReduceProd);
+   RegisterOperator("ReduceL2", ParseReduceL2);
+   RegisterOperator("ReduceMax", ParseReduceMax);
    // Others
    RegisterOperator("BatchNormalization", ParseBatchNormalization);
    RegisterOperator("Constant", ParseConstant);
@@ -217,6 +265,7 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un
    RegisterOperator("LayerNormalization", ParseLayerNormalization);
    RegisterOperator("Expand", ParseExpand);
    RegisterOperator("Gather", ParseGather);
+   RegisterOperator("GatherND", ParseGatherND);
    RegisterOperator("Erf", ParseErf);
    RegisterOperator("Elu", ParseElu);
    RegisterOperator("EyeLike", ParseEyeLike);
@@ -233,6 +282,16 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un
    RegisterOperator("RandomUniform", ParseRandom);
    RegisterOperator("RandomUniformLike", ParseRandom);
    RegisterOperator("ScatterElements", ParseScatterElements);
+   RegisterOperator("Trilu", ParseTrilu);
+   // Logical operators
+   RegisterOperator("And", ParseAnd);
+   RegisterOperator("Or", ParseOr);
+   RegisterOperator("Xor", ParseXor);
+   // Bitwise operators
+   RegisterOperator("BitwiseAnd", ParseBitwiseAnd);
+   RegisterOperator("BitwiseOr", ParseBitwiseOr);
+   RegisterOperator("BitwiseXor", ParseBitwiseXor);
+   RegisterOperator("BitwiseNot", ParseBitwiseNot);
 }
 
 // Destructor of the parser
@@ -584,6 +643,13 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
          if (verbose) std::cout << "add INT64 initialized tensor " << input_name << " shape " << ConvertShapeToString(shape) << std::endl;
          rmodel.AddInitializedTensor(input_name, ETensorType::INT64, shape, data);
          allInitializedTensors[input_name] = i;
+         if (verbose) {
+            // Dump the INT64 values only on request, like the message above; k avoids shadowing the outer index i.
+            const int64_t *values = static_cast<const int64_t *>(data.get());
+            std::cout << "Printing initialized values for tensor: " << input_name << " :";
+            for (size_t k = 0; k < fLength; ++k) std::cout << " " << values[k];
+            std::cout << std::endl;
+         }
          break;
       }
       default:
@@ -613,6 +679,18 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
    nodesOrder.reserve(graph.node_size());
    std::vector<bool> foundNodes(graph.node_size());
 
+   // Pre-compute the set of all tensor names that belong to THIS graph:
+   // graph inputs, initializers, and node outputs.  A tensor is an "outer-scope
+   // reference" (from an enclosing graph) only if it is NOT in this set.
+   std::unordered_set<std::string> graphLocalTensors;
+   for (int i = 0; i < graph.input_size(); i++)
+      graphLocalTensors.insert(graph.input(i).name());
+   for (int i = 0; i < graph.initializer_size(); i++)
+      graphLocalTensors.insert(graph.initializer(i).name());
+   for (int i = 0; i < graph.node_size(); i++)
+      for (int j = 0; j < graph.node(i).output_size(); j++)
+         graphLocalTensors.insert(graph.node(i).output(j));
+
    // loop at graph inputs
    std::map<std::string, int> allInputs;
    for (int i = 0; i < graph.input_size(); i++) {
@@ -633,13 +711,22 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
             std::string name = graph.node(i).input(j);
             // skip empty names
             if (!name.empty()) {
-               existInputs &= (allInputs.find(name) != allInputs.end() ||
-                               allInitializedTensors.find(name) != allInitializedTensors.end());
+               // A tensor is available if it is: a graph input/previously computed node output
+               // (allInputs), an initializer (allInitializedTensors), or an outer-scope tensor
+               // referenced from a subgraph.  Outer-scope means: registered in the parser's type
+               // map AND not produced by any node/input/initializer of the current graph.  The
+               // second condition prevents cross-model contamination from prior parsing passes.
+               bool isOuterScope = !graphLocalTensors.count(name) && IsRegisteredTensorType(name);
+               bool available = (allInputs.find(name) != allInputs.end() ||
+                                 allInitializedTensors.find(name) != allInitializedTensors.end() ||
+                                 isOuterScope);
+               existInputs &= available;
                if (fVerbose) {
                   std::cout << "\t\t input " << name << " "
                      << bool(allInputs.find(name) != allInputs.end()) << "  " <<
                      bool(allInitializedTensors.find(name) != allInitializedTensors.end()) << "  " <<
-                     existInputs << std::endl;
+                     bool(isOuterScope) << "  "
+                     << existInputs << std::endl;
                }
             }
          }
@@ -720,7 +807,11 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
    }
 
    // we have to record order of node execution separately to
-   // account for fused operators
+   // account for fused operators.
+   // Save and restore fFusedOperators around the parsing loop so that
+   // recursive ParseONNXGraph calls (for If/Loop subgraphs) do not
+   // corrupt the parent graph's fused-operator bookkeeping.
+   auto savedFusedOperators = std::move(fFusedOperators);
    size_t node_order_exec = 0;
    fFusedOperators = std::vector<bool>(graph.node_size(), false);
    for (int i = 0; i < graph.node_size(); i++) {
@@ -730,7 +821,7 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
          std::cout << "\t" << i << "  " << nodesOrder[i] << " parsing operator " << op_type << std::endl;
       }
 
-      std::unique_ptr<ROperator> op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]);
+      std::unique_ptr<ROperator> op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]);
       if (!op) {
          if (verbose) {
             std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl;
@@ -738,9 +829,19 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto &
          // for skipping the fused nodes like Add after MatMul
          continue;
       }
+      // assign operator name for profiling
+      const auto &nodeproto = graph.node(nodesOrder[i]);
+      op->fName = nodeproto.name();
+      if (op->fName.empty()) {
+         op->fName = nodeproto.op_type() + "_" + std::to_string(i);
+      }
       rmodel.AddOperator(std::move(op), node_order_exec++);
    }
 
+   // Restore the parent graph's fFusedOperators (may have been saved as empty
+   // for the top-level call, which is fine — we're done with the loop).
+   fFusedOperators = std::move(savedFusedOperators);
+
    std::vector<std::string> outputnames;
    if (verbose)
       std::cout << "\nParsing Graph output list\n";
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
deleted file mode 100644
index c48e8d1..0000000
--- a/src/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (C) 1995-2019, Rene Brun and Fons Rademakers.
-# All rights reserved.
-#
-# For the licensing terms see $ROOTSYS/LICENSE.
-# For the list of contributors see $ROOTSYS/README/CREDITS.
-
-set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE)
-
-add_subdirectory(SOFIE_core)
-add_subdirectory(SOFIE_parsers)
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx
deleted file mode 100644
index edbec58..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef SOFIE_ROPERATOR
-#define SOFIE_ROPERATOR
-
-#include <vector>
-#include <memory>
-
-#include "SOFIE/SOFIE_common.hxx"
-//#include "RModel.hxx"
-
-
-
-
-namespace SOFIE{
-
-class RModel;
-
-class ROperator{
-
-
-public:
-   virtual std::vector<std::string> GetBlasRoutines() { return {}; }
-   virtual std::vector<std::string> GetStdLibs() { return {}; }
-   virtual std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>>) = 0;
-   virtual std::vector<ETensorType> TypeInference(std::vector<ETensorType>) = 0;
-   virtual void Initialize(RModel&) = 0;
-   virtual std::string Generate(std::string OpName) = 0;  //expect unique opName for each operator within the same RModel
-   // generate initialization code for session constructor
-   virtual std::string GenerateInitCode() { return "";}
-   // generate some specific declaration code for Session
-   virtual std::string GenerateDeclCode() { return "";}
-   // generate session data members specific to operator
-   virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; }
-   virtual std::string Header() { return "";}
-
-   //virtual void Forward_reference() = 0;
-   //virtual void Forward_blas() = 0;
-   virtual ~ROperator(){}
-
-protected:
-
-   const std::string SP = "   ";    ///< space used to correctly indent the generated C++ code
-   bool fUseSession = false;        ///< flag to identify if using the session class
-   bool fIsOutputConstant = false;  ///< flag to identify if operator has a constant output (no need to generate code)
-   
-   mutable std::vector<std::string_view> fInputTensorNames;
-   mutable std::vector<std::string_view> fOutputTensorNames;
-
-public:
-   std::span<const std::string_view> GetOpInputTensors() const {
-      return fInputTensorNames;
-   }
-
-   std::span<const std::string_view> GetOpOutputTensors() const {
-      return fOutputTensorNames;
-   }
-   
-};
-
-
-
-}//SOFIE
-
-
-#endif //SOFIE_OPERATOR
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx
deleted file mode 100644
index 127eaff..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx
+++ /dev/null
@@ -1,216 +0,0 @@
-#ifndef SOFIE_ROperator_BasicBinary
-#define SOFIE_ROperator_BasicBinary
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-namespace SOFIE{
-
-enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow };
-
-template <typename T, EBasicBinaryOperator Op1>
-struct BinaryOperatorTrait {};
-
-template <typename T>
-struct BinaryOperatorTrait<T, Add> {
-   static const std::string Name() { return "Add"; }
-   static std::string Op(const std::string & t1, const std::string t2) { return t1 + " + " + t2; }
-   static T Func(T t1, T t2) {return  t1 + t2;}
-};
-
-template <typename T>
-struct BinaryOperatorTrait<T, Sub> {
-   static const std::string Name() { return "Sub"; }
-   static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; }
-   static T Func (T t1, T t2) { return t1 - t2;}
-};
-
-template <typename T>
-struct BinaryOperatorTrait<T, Mul> {
-   static const std::string Name() { return "Mul"; }
-   static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; }
-   static T Func (T t1, T t2) { return  t1 * t2;}
-};
-
-template <typename T>
-struct BinaryOperatorTrait<T, Div> {
-   static const std::string Name() { return "Div"; }
-   static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; }
-   static T Func (T t1, T t2) { return t1/t2;}
-};
-
-template <typename T>
-struct BinaryOperatorTrait<T, Pow> {
-   static const std::string Name() { return "Pow"; }
-   static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; }
-   static T Func (T t1, T t2) { return std::pow(t1,t2);}
-};
-
-template<typename T, EBasicBinaryOperator Op>
-class ROperator_BasicBinary final : public ROperator{
-private:
-
-   std::string fNA;
-   std::string fNB;
-   std::string fNBroadcastedA;
-   std::string fNBroadcastedB;
-   std::string fNY;
-
-   std::vector<size_t> fShapeA;
-   std::vector<size_t> fShapeB;
-   std::vector<size_t> fShapeY;
-
-public:
-   ROperator_BasicBinary(){}
-   ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY):
-      fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNA, fNB };
-         fOutputTensorNames = { fNY };
-      }
-
-   // type of output given input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   // shape of output tensors given input tensors
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      // assume now inputs have same shape (no broadcasting)
-      auto ret = std::vector<std::vector<size_t>>(1, input[0]); // return vector size 1 with first input
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      // input must be a graph input, or already initialized intermediate tensor
-      if (!model.CheckIfTensorAlreadyExist(fNA)){
-         throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model");
-      }
-      if (!model.CheckIfTensorAlreadyExist(fNB)) {
-         throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model");
-      }
-      fShapeA = model.GetTensorShape(fNA);
-      fShapeB = model.GetTensorShape(fNB);
-      bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB);
-      if (broadcast) {
-         // Y is the common shape of A and B
-         fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB);
-         bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY);
-         bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY);
-         // Broadcast A to Y
-         if (broadcastA) {
-            fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY;
-            if (model.IsInitializedTensor(fNA)) {
-               auto data = model.GetInitializedTensorData(fNA);
-               std::shared_ptr<void> broadcastedData(
-                  UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeA, fShapeY),
-                  std::default_delete<T[]>());
-               // Update the data and the shape of A
-               model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData);
-               fShapeA = fShapeY;
-            } else {
-               // Add an intermediate tensor for broadcasting A
-               model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY);
-            }
-         }
-         // Broadcast B to Y
-         if (broadcastB) {
-            fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY;
-            if (model.IsInitializedTensor(fNB)) {
-               auto data = model.GetInitializedTensorData(fNB);
-               std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " <<
-                  ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast<T*>(data.get())) << std::endl;
-               std::shared_ptr<void> broadcastedData(
-                  UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeB, fShapeY),
-                  std::default_delete<T[]>());
-               // do not update tensor B but add broadcasted one (since it can be input to some other operators)
-               std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " <<
-                  ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast<T*>(broadcastedData.get())) << std::endl;
-               model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData);
-               fShapeB = fShapeY;
-            } else {
-               // Add an intermediate tensor for broadcasting B
-               model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY);
-            }
-         }
-      } else {
-         fShapeY = fShapeA;
-      }
-      // check case of constant  output (if all inputs are defined)
-      if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) {
-         const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA;
-         const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB;
-         auto dataA = static_cast<T *>(model.GetInitializedTensorData(nameA).get());
-         auto dataB = static_cast<T *>(model.GetInitializedTensorData(nameB).get());
-         std::vector<T> dataY(ConvertShapeToLength(fShapeY));
-         for (size_t i = 0; i < dataY.size(); i++) {
-            dataY[i] = BinaryOperatorTrait<T,Op>::Func(dataA[i], dataB[i]);
-         }
-         model.AddConstantTensor<T>(fNY, fShapeY, dataY.data());
-         // flag tensors to not be written in a fil
-         model.SetNotWritableInitializedTensor(nameA);
-         model.SetNotWritableInitializedTensor(nameB);
-         fIsOutputConstant = true;
-         if (model.Verbose())
-            std::cout << "Binary op ---> " << fNY << "  " << ConvertShapeToString(fShapeY) << " : "
-               << ConvertValuesToString(dataY) << std::endl;
-      }
-      else {
-        model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY);
-      }
-   }
-
-   std::string GenerateInitCode() override {
-      std::stringstream out;
-      return out.str();
-   }
-
-   std::string Generate(std::string OpName) override {
-
-      if (fIsOutputConstant) return "";
-
-      OpName = "op_" + OpName;
-
-      if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      out << SP << "\n//------ " << BinaryOperatorTrait<T,Op>::Name() << "\n";
-      size_t length = ConvertShapeToLength(fShapeY);
-      std::string typeName = TensorType<T>::Name();
-      // Broadcast A if it's uninitialized
-      // use broadcasting function where we pass an already allocated tensor to minimize memory allocations
-      if (fShapeA != fShapeY) {
-         out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n";
-         out << SP  << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY)
-                         << ", fTensor_" << fNBroadcastedA << ");\n";
-      }
-      // Broadcast B if it's uninitialized
-      if (fShapeB != fShapeY) {
-         out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n";
-         out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY)
-                   << ", fTensor_" << fNBroadcastedB << ");\n";
-      }
-      const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA;
-      const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB;
-      out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNY << "[id] = "  << BinaryOperatorTrait<T,Op>::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") <<  " ;\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-   std::vector<std::string> GetStdLibs() override {
-      if (Op == EBasicBinaryOperator::Pow) {
-         return { std::string("cmath") };
-      } else {
-         return {};
-      }
-   }
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROperator_BasicBinary
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx
deleted file mode 100644
index c18c17e..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef SOFIE_ROPERATOR_BASIC_UNARY
-#define SOFIE_ROPERATOR_BASIC_UNARY
-
-#include <SOFIE/ROperator.hxx>
-#include <SOFIE/RModel.hxx>
-#include <SOFIE/SOFIE_common.hxx>
-
-
-namespace SOFIE {
-
-enum class EBasicUnaryOperator { kReciprocal, kSqrt , kNeg, kExp, kLog, kSin, kCos, kAbs };
-
-template <typename T, EBasicUnaryOperator Op>
-struct UnaryOpTraits {
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kReciprocal> {
-   static std::string Name() { return "Reciprocal"; }
-   static std::string Op(const std::string &X) { return "1/" + X; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kSqrt> {
-   static std::string Name() { return "Sqrt"; }
-   static std::string Op(const std::string &X) { return "std::sqrt(" + X + ")"; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kNeg> {
-   static std::string Name() { return "Neg"; }
-   static std::string Op(const std::string &X) { return "-" + X; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kExp> {
-   static std::string Name() { return "Exp"; }
-   static std::string Op(const std::string &X) { return "std::exp(" + X + ")"; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kLog> {
-   static std::string Name() { return "Log"; }
-   static std::string Op(const std::string &X) { return "std::log(" + X + ")"; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kSin> {
-   static std::string Name() { return "Sin"; }
-   static std::string Op(const std::string &X) { return "std::sin(" + X + ")"; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kCos> {
-   static std::string Name() { return "Cos"; }
-   static std::string Op(const std::string &X) { return "std::cos(" + X + ")"; }
-};
-
-template <typename T>
-struct UnaryOpTraits<T, EBasicUnaryOperator::kAbs> {
-   static std::string Name() { return "Abs"; }
-   static std::string Op(const std::string &X) { return "std::abs(" + X + ")"; }
-};
-
-template <typename T, EBasicUnaryOperator Op>
-class ROperator_BasicUnary final : public ROperator {
-private:
-   std::string fNX;
-   std::string fNY;
-
-   std::vector<size_t> fShapeX;
-   std::vector<size_t> fShapeY;
-
-public:
-   ROperator_BasicUnary() {}
-
-   ROperator_BasicUnary(std::string nameX, std::string nameY)
-      : fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
-   {
-         fInputTensorNames =  { fNX };
-         fOutputTensorNames = { fNY };
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override { return input; }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
-
-   void Initialize(RModel& model) override {
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-         throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found.");
-      }
-      fShapeX = model.GetTensorShape(fNX);
-      fShapeY = ShapeInference({fShapeX})[0];
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-   }
-
-   std::string Generate(std::string OpName) override
-   {
-      OpName = "op_" + OpName;
-      std::stringstream out;
-
-      out << SP << "\n//---- Operator" << UnaryOpTraits<T, Op>::Name() << " " << OpName << "\n";
-      size_t length = ConvertShapeToLength(fShapeX);
-      out << SP << "for (size_t i = 0; i < " << length << "; i++) {\n";
-      out << SP << SP << "tensor_" << fNY << "[i] = " << UnaryOpTraits<T, Op>::Op("tensor_" + fNX + "[i]") << ";\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-   std::vector<std::string> GetStdLibs() override {
-      if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) {
-         return { std::string("cmath") };
-      } else {
-         return {};
-      }
-   }
-};
-
-} // namespace SOFIE
-
-#endif
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx
deleted file mode 100644
index 47c3d66..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx
+++ /dev/null
@@ -1,97 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Cast
-#define SOFIE_ROPERATOR_Cast
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-
-class ROperator_Cast final : public ROperator
-{
-
-private:
-
-   std::string fNX;
-   std::string fNY;
-   std::vector<size_t> fShape;
-   std::string fAttrType = "float";
-
-public:
-   ROperator_Cast(){}
-   ROperator_Cast(std::string attr_type,std::string nameX, std::string nameY):
-   fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)),
-   fAttrType(attr_type) {
-      fInputTensorNames = { fNX };
-      fOutputTensorNames = { fNY };
-   }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-       //input must be a graph input, or already initialized intermediate tensor
-      if (model.CheckIfTensorAlreadyExist(fNX) == false){
-        throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model");
-      }
-      fShape = model.GetTensorShape(fNX);
-      // shoud we add a check if the same type
-      auto inputType = model.GetTensorType(fNX);
-      if (model.IsInitializedTensor(fNX)) {
-         fIsOutputConstant = true;
-         auto inputData = model.GetInitializedTensorData(fNX);
-         if (ConvertStringToType(fAttrType) == ETensorType::INT64) {
-            model.AddConstantTensor<int64_t>(fNY, fShape, static_cast<int64_t*>(inputData.get()));
-            model.SetNotWritableInitializedTensor(fNX);
-         }
-         else
-            fIsOutputConstant = false;
-      }
-      if (!fIsOutputConstant)
-         model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape);
-      if (model.Verbose()) {
-         std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY;
-         if (fIsOutputConstant) std::cout << " (constant) ";
-         std::cout << std::endl;
-      }
-   }
-
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) return "";
-
-      OpName = "op_" + OpName;
-      if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      size_t length = ConvertShapeToLength(fShape);
-
-      // out << SP << ETensorType << " " << OpName << "_attr = "  << fattr << ";\n";
-      out << "\n//------ CAST\n";
-       // no generated code for constant outputs
-      if (fIsOutputConstant) return out.str();
-
-      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
-
-      out << SP << SP << "tensor_" << fNY << "[id] = static_cast<"<< fAttrType << ">(tensor_" << fNX << "[id]);\n";
-
-      out << SP << "}\n";
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_Cast
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx
deleted file mode 100644
index 0d5e574..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx
+++ /dev/null
@@ -1,263 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Concat
- #define SOFIE_ROPERATOR_Concat
-
-
- #include "SOFIE/SOFIE_common.hxx"
- #include "SOFIE/ROperator.hxx"
- #include "SOFIE/RModel.hxx"
-
- #include <sstream>
- #include <algorithm>
- #include <iterator>
- #include <iomanip>
- #include <limits>
-
- namespace SOFIE{
-
-     class ROperator_Concat final : public ROperator
-     {
-     private:
-         int fAxis=0;
-         int fnewAxis=0;
-         std::vector<std::string> fInputs;
-         std::string fOutput;
-         std::vector<Dim>fOutputShape;
-         std::vector<std::vector<Dim>> fInputShapes;
-
-     public:
-         ROperator_Concat(){}
-         // Build a Concat operator from raw tensor names.
-         // All names are passed through UTILITY::Clean_name; the cleaned input and output
-         // names are then registered in fInputTensorNames / fOutputTensorNames.
-         ROperator_Concat(std::vector<std::string> inputs, int axis, int newAxis, std::string output)
-            : fAxis(axis),
-              fnewAxis(newAxis),
-              fOutput(UTILITY::Clean_name(output))
-         {
-            // cleaned copies of the input names, in the order given
-            fInputs.reserve(inputs.size());
-            for (std::size_t iName = 0; iName < inputs.size(); ++iName) {
-               fInputs.push_back(UTILITY::Clean_name(inputs[iName]));
-            }
-
-            // expose the stored names (as views onto fInputs) together with the single output
-            fInputTensorNames.resize(fInputs.size());
-            for (std::size_t iName = 0; iName < fInputs.size(); ++iName) {
-               fInputTensorNames[iName] = std::string_view(fInputs[iName]);
-            }
-            fOutputTensorNames = { fOutput };
-         }
-
-         std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { // type inference for Concat: the list of input types is returned unchanged
-             return input;
-         }
-
-         // Get the shape of the output given fully specified (size_t) input shapes.
-         // It is going to be called after the operator is initialized.
-         //  - fnewAxis == 0 : plain concatenation; every input must have the same rank and the
-         //    same extent on all axes but fAxis, where the extents are summed.
-         //  - fnewAxis == 1 : the same consistency checks are applied and the returned shape
-         //    is made of the fAxis extent of each input, in input order.
-         // A negative fAxis is normalised in place (the member is modified).
-         // Returns a vector holding the single output shape.
-         // Throws std::runtime_error for an empty input list, an out-of-range axis or
-         // incompatible input shapes.
-         std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> inputs) override {
-            std::vector<std::vector<size_t>> ret(1);
-            // inputs[0] is needed to resolve the axis: reject an empty list instead of reading out of bounds
-            if (inputs.empty())
-               throw std::runtime_error("TMVA SOFIE Concat Op - no input tensors provided");
-            // treat negative axis case
-            if (fAxis<0) {
-               fAxis = inputs[0].size()+fAxis;
-            }
-            if (fAxis < 0 || fAxis >= (int) inputs[0].size())
-               throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value ");
-
-            if(fnewAxis == 0){
-               // accumulate in size_t: the extents are size_t and must not be narrowed to int
-               size_t concat_dim = 0;
-               for (size_t i = 0; i < inputs.size(); i++) {
-                  if (i > 0 && inputs[i].size() != inputs[i - 1].size())
-                     throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " +
-                                              ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i - 1]));
-                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
-                     if ((int)iaxis == fAxis)
-                        concat_dim += inputs[i][iaxis];
-                     // other dimensions must be the same
-                     else if (i > 0 && inputs[i][iaxis] != inputs[i - 1][iaxis])
-                        throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " +
-                                                 ConvertShapeToString(inputs[i]) + " and " +
-                                                 ConvertShapeToString(inputs[i - 1]));
-                  }
-               }
-
-               // output shape
-               ret[0] = inputs[0];
-               ret[0][fAxis] = concat_dim;
-            }
-            if(fnewAxis == 1){
-               // extents found on fAxis, one per input
-               std::vector<size_t> stack;
-               stack.reserve(inputs.size());
-               for(size_t i = 0; i < inputs.size(); i++) {
-                  if (i > 0 && inputs[i].size() != inputs[i-1].size() )
-                     throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " +
-                        ConvertShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertShapeToString(inputs[i-1]));
-                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
-                     if ((int) iaxis == fAxis)
-                        stack.push_back(inputs[i][iaxis]);
-                     else if (i> 0 && inputs[i][iaxis] != inputs[i-1][iaxis])
-                        throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " +
-                           ConvertShapeToString(inputs[i]) + " and " + ConvertShapeToString(inputs[i-1]));
-                  }
-               }
-               ret[0].insert(ret[0].end(), stack.begin(), stack.end());
-            }
-
-            return ret;
-         }
-
-         // Get the shape of the output given possibly parametric (Dim) input shapes.
-         // It is going to be called after the operator is initialized.
-         // Only plain concatenation (fnewAxis == 0) is supported: every input must have the
-         // same rank, the same value (compared through Dim::GetVal) on all axes but fAxis,
-         // and a non-parametric extent on fAxis, where the extents are summed.
-         // A negative fAxis is normalised in place (the member is modified).
-         // Returns a vector holding the single output shape.
-         // Throws std::runtime_error for an empty input list, an out-of-range axis,
-         // incompatible shapes, a parametric concatenation axis or fnewAxis == 1 (stacking).
-         std::vector<std::vector<Dim>> ShapeInference(const std::vector<std::vector<Dim>> & inputs) {
-            std::vector<std::vector<Dim>> ret(1);
-            // inputs[0] is needed to resolve the axis: reject an empty list instead of reading out of bounds
-            if (inputs.empty())
-               throw std::runtime_error("TMVA SOFIE Concat Op - no input tensors provided");
-            // treat negative axis case
-            if (fAxis<0) {
-               fAxis = inputs[0].size()+fAxis;
-            }
-            if (fAxis < 0 || fAxis >= (int) inputs[0].size())
-               throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value ");
-
-            if(fnewAxis == 0){
-               // accumulate in size_t so that large extents are not narrowed to int
-               size_t concat_dim = 0;
-               for (size_t i = 0; i < inputs.size(); i++) {
-                  if (i > 0 && inputs[i].size() != inputs[i - 1].size())
-                     throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " +
-                                              ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1]));
-                  for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) {
-                     if ((int)iaxis == fAxis) {
-                        // support only non-params shape for the concatenation axis
-                        if (inputs[i][iaxis].isParam)
-                           throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. Input shape is " +
-                                                     ConvertDynamicShapeToString(inputs[i]));
-                        concat_dim += inputs[i][iaxis].dim;
-                     }
-                     // other dimensions must be the same
-                     else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal())
-                        throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " +
-                                                 ConvertDynamicShapeToString(inputs[i]) + " and " +
-                                                 ConvertDynamicShapeToString(inputs[i - 1]));
-                  }
-               }
-
-               // output shape: same as the first input except on the concatenation axis
-               ret[0] = inputs[0];
-               ret[0][fAxis].dim = concat_dim;
-            }
-            // case of stacking (not supported yet)
-            // here we need to check that input shapes are the same
-            // for example for fAxis == 0
-            // output shapes: [inputs.size(), inputs[0][0], inputs[0][1],....]
-            if(fnewAxis == 1){
-               throw std::runtime_error("TMVA SOFIE Concat Op - stacking (i.e. COncatFromSequence with new_axis=1) is not supported ");
-            }
-            return ret;
-         }
-
-      // Resolve the input shapes from the model, infer the output shape and register the
-      // output tensor.
-      // When the first input is INT64, the axis is 0 and every input is an initialized
-      // tensor, the concatenation is evaluated here and the output is added to the model
-      // as a constant tensor (fIsOutputConstant is set); otherwise the output is added as
-      // an intermediate tensor with the type of the first input.
-      // Throws std::runtime_error when there is no input or an input is missing in the model.
-      void Initialize(RModel& model) override {
-            // fInputs[0] is read below: fail clearly instead of indexing an empty vector
-            if (fInputs.empty())
-               throw std::runtime_error("TMVA SOFIE Concat Op - no input tensors provided");
-            // drop shapes cached by a previous call so they do not accumulate
-            // NOTE(review): assumes Initialize may run more than once on the same operator - confirm
-            fInputShapes.clear();
-            fInputShapes.reserve(fInputs.size());
-            for (auto &it : fInputs) {
-               if (model.CheckIfTensorAlreadyExist(it) == false) {
-                  throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model");
-               }
-               fInputShapes.push_back(model.GetDynamicTensorShape(it));
-            }
-            fOutputShape = ShapeInference(fInputShapes)[0];
-            if (model.Verbose())
-               std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl;
-
-            // check if concat has constant inputs , axis 0(concat contigous memory and type is integer)
-            if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) {
-               fIsOutputConstant = true;
-               for ( auto & input : fInputs) {
-                  if (!model.IsInitializedTensor(input)) {
-                     fIsOutputConstant = false;
-                     break;
-                  }
-               }
-               if (fIsOutputConstant) {
-                  auto outputShape = ConvertShapeToInt(fOutputShape);  // conversion must be possible
-                  std::vector<int64_t> outputData(ConvertShapeToLength(outputShape));
-                  // inputs are appended one after the other in the flat output buffer
-                  size_t offset = 0;
-                  for ( auto & input : fInputs) {
-                     auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(input).get());
-                     auto inputShape = model.GetTensorShape(input); // shape is not dynamic if it is constant
-                     size_t inputLength = ConvertShapeToLength(inputShape);
-                     std::copy(inputData, inputData + inputLength, outputData.begin() + offset );
-                     offset += inputLength;
-                     // data do not need to be written as a weight
-                     model.SetNotWritableInitializedTensor(input);
-                  }
-                  model.AddConstantTensor<int64_t>(fOutput, outputShape, outputData.data());
-                  if (model.Verbose()) {
-                     std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : "
-                     << ConvertValuesToString(outputData) << std::endl;
-                  }
-               }
-            }
-            if (!fIsOutputConstant) {
-               model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape);
-               if (model.Verbose()) {
-                  std::cout << "Concat ---> " << fOutput << " " <<  ConvertDynamicShapeToString(fOutputShape) << std::endl;
-               }
-            }
-         }
-
-         // Emit the inference code for Concat.
-         // Returns an empty string when the output is a constant tensor. When the axis is 0,
-         // or every dimension of the first input before the axis is 1, each input is one
-         // contiguous chunk of the output and a std::copy per input is emitted. Otherwise
-         // nested loops over the dimensions preceding the axis are emitted, and for every
-         // outer index one contiguous block per input is copied.
-         // Throws std::runtime_error when fOutputShape is empty (Generate called before Initialize).
-         std::string Generate(std::string OpName) override {
-            if (fIsOutputConstant) return "";
-            OpName = "op_"+OpName;
-            if(fOutputShape.empty()){
-                  throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first");
-            }
-            std::stringstream out;
-            out<<"\n//--------- Concat\n";
-            // special case when memory is contiguous
-            bool hasShapeOnes = true;
-            for(int i = 0; i<fAxis; ++i){
-               if(fInputShapes[0][i].dim !=1){
-                  hasShapeOnes = false;
-                  break;
-               }
-            }
-            if (fAxis == 0 || hasShapeOnes) {
-               // running offset (as an expression string) into the output tensor
-               std::string offset;
-               for(size_t i=0; i<fInputs.size(); ++i) {
-                  std::string length = ConvertDynamicShapeToLength(fInputShapes[i]);
-                  out << SP << "std::copy(tensor_" <<fInputs[i] << ", tensor_" <<fInputs[i] << "+" << length <<", tensor_"<<fOutput;
-                  if (i > 0)  out << offset;
-                  offset += " + " + length;
-                  out << ");\n";
-               }
-            }
-            else {
-               // here fAxis >= 1, so the stride at index fAxis-1 always exists
-               std::vector<Dim> outStride = UTILITY::ComputeStrideFromShape(fOutputShape);
-               std::vector<std::vector<Dim>> inStrides(fInputs.size());
-               int idx = 0;
-               for ( auto &s : inStrides) {
-                  s = UTILITY::ComputeStrideFromShape(fInputShapes[idx]);
-                  idx++;
-               }
-               for (int i = 0; i < fAxis; ++i) {
-                  // loop on dimensions
-                  out << SP << "for (size_t i" << i << " = 0; i" << i << " < " << fOutputShape[i].GetVal() << "; ++i" << i <<") {\n";
-               }
-
-               out << SP << SP << SP << "int idxOut = ";
-               for (int k = 0; k < fAxis; k++) {
-                  if (k > 0) out << " + ";
-                  out << outStride[k].GetVal() << "*i" << k;
-               }
-               out << ";\n";
-
-               for (size_t j = 0; j < fInputs.size(); j++) {
-                  // For fixed outer indices an input contributes one contiguous block made of the
-                  // axis dimension and of everything after it. Its length is the stride of the
-                  // dimension just before the axis (strides are used as row-major offsets above),
-                  // not only the extent of the axis: using the extent alone is correct only when
-                  // the concatenation axis is the last one.
-                  if (j>0)
-                     out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n";
-                  out << SP << SP << SP << "int idxIn" << j <<" = ";
-                  for (int k = 0; k < fAxis; k++) {
-                     if (k > 0) out << " + ";
-                     out << inStrides[j][k].GetVal() << "*i" << k;
-                  }
-                  out << ";\n";
-                  out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n";
-                  out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n";
-                  out << SP << SP << SP << "}\n";
-               }
-               // close the loops on the outer dimensions
-               for (int i = 0; i < fAxis; ++i) {
-                  out << SP << "}\n";
-               }
-            }
-
-            return out.str();
-         }
-     };
- }//SOFIE
-
- #endif //SOFIE_ROPERATOR_CONCAT
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx
deleted file mode 100644
index 15ca91e..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Conv.hxx
+++ /dev/null
@@ -1,531 +0,0 @@
-#ifndef SOFIE_SOFIE_ROPERATOR_CONV
-#define SOFIE_SOFIE_ROPERATOR_CONV
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <memory>
-#include <sstream>
-#include <algorithm>
-#include <stdexcept>
-#include <vector>
-#include <cassert>
-
-
-namespace SOFIE {
-
-template<typename T>
-class ROperator_Conv final : public ROperator
-{
-private:
-   std::string fAttrAutopad;
-   std::vector<size_t> fAttrDilations;
-   size_t fAttrGroup;
-   std::vector<size_t> fAttrKernelShape;
-   std::vector<size_t> fAttrPads;
-   std::vector<size_t> fAttrStrides;
-
-   std::string fNX;
-   std::string fNW;
-   std::string fNB;
-   std::string fNB2; // bias tensor name after broadcasting
-   std::string fNY;
-
-   std::string convK;
-   std::string imcol;
-
-   std::vector<size_t> fShapeX;
-   std::vector<size_t> fShapeW;
-   std::vector<size_t> fShapeB;
-   std::vector<size_t> fShapeY;
-
-   std::string fType;
-
-   size_t fDim;   // dimension of the convolution
-
-
-public:
-
-   ROperator_Conv() {}
-
-   // Constructor for a convolution with a bias tensor (nameB).
-   // Stores the attributes and the cleaned tensor names; only T = float is accepted,
-   // any other type throws std::runtime_error.
-   ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
-      size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
-      std::vector<size_t> strides, std::string nameX, std::string nameW,
-      std::string nameB, std::string nameY)
-      : fAttrAutopad(autopad),
-        fAttrDilations(dilations),
-        fAttrGroup(group),
-        fAttrKernelShape(kernelShape),
-        fAttrPads(pads),
-        fAttrStrides(strides),
-        fNX(UTILITY::Clean_name(nameX)),
-        fNW(UTILITY::Clean_name(nameW)),
-        fNB(UTILITY::Clean_name(nameB)),
-        fNY(UTILITY::Clean_name(nameY))
-   {
-      // reject every element type other than float
-      if (!std::is_same<T, float>::value)
-         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
-      fType = "float";
-
-      fInputTensorNames = { fNX, fNB };
-      fOutputTensorNames = { fNY };
-   }
-
-   // Constructor for a convolution without bias tensor (fNB is left empty).
-   // Stores the attributes and the cleaned tensor names; only T = float is accepted,
-   // any other type throws std::runtime_error.
-   ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
-      size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
-      std::vector<size_t> strides, std::string nameX, std::string nameW,
-      std::string nameY)
-      : fAttrAutopad(autopad),
-        fAttrDilations(dilations),
-        fAttrGroup(group),
-        fAttrKernelShape(kernelShape),
-        fAttrPads(pads),
-        fAttrStrides(strides),
-        fNX(UTILITY::Clean_name(nameX)),
-        fNW(UTILITY::Clean_name(nameW)),
-        fNY(UTILITY::Clean_name(nameY))
-   {
-      // reject every element type other than float
-      if (!std::is_same<T, float>::value)
-         throw std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
-      fType = "float";
-
-      fInputTensorNames = { fNX };
-      fOutputTensorNames = { fNY };
-   }
-
-   // Type inference for Conv: the single output has the type of the first input.
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return { input[0] };
-   }
-
-   // function returning output shape given input
-   // input[0] is the data shape, input[1] the weight shape; a third entry is accepted but
-   // not used. The function also normalises the attributes stored in the operator:
-   //  - fAttrGroup is deduced from the channel counts when it is 0
-   //  - fAttrDilations and fAttrStrides are padded with 1 up to 3 entries
-   //  - fAttrKernelShape is replaced by the dilated kernel extents (3 entries)
-   //  - fAttrPads is filled according to fAttrAutopad and padded with 0 up to 6 entries
-   // NOTE(review): since fAttrKernelShape is overwritten with dilated extents, a second
-   // call on the same operator would apply the dilation again - confirm it is called once.
-   // Returns a vector holding the single output shape (N, M, spatial dimensions...).
-   // Throws std::runtime_error on invalid inputs, unknown auto-pad mode, zero stride or a
-   // kernel larger than the padded input.
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      // shape of convolution input has to be (according to ONNX): N x C x H x W
-      // Where N : batch size, C : input  channels, H : input height, W : input width
-
-      // input[0] and input[1] are both dereferenced below, so at least two shapes are needed
-      if (input.size() < 2 || input.size() > 3 ) {
-         throw
-            std::runtime_error("TMVA SOFIE Conv Op Shape inference need 2 or 3 input tensors");
-      }
-      // only data and weights carry the spatial dimensions; an optional third input is not
-      // read here and is not required to have rank fDim + 2
-      for(size_t i = 0; i < 2; i++) {
-         if (input[i].size() -2 != fDim) {
-            throw
-               std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid inputs ");
-         }
-      }
-
-      if (fAttrGroup == 0) {
-         fAttrGroup = input[0][1] / input[1][1];
-      }
-
-      // kernel shape
-      size_t k1 = ((fAttrKernelShape.empty())? input[1][2] : fAttrKernelShape[0]);
-      size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? input[1][3] : fAttrKernelShape[1]) : 1;
-      size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? input[1][4] : fAttrKernelShape[2]) : 1;
-
-
-      // position in fAttrPads of the end padding for the 1st, 2nd and 3rd spatial axis
-      size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1;
-      size_t i2 = (fDim > 2) ? 4 : 3;
-      size_t i3 = 5;
-
-      if (fAttrDilations.empty()) {
-         fAttrDilations = {1, 1, 1};
-      }
-      // missing dilations must default to 1 (a plain resize(3) would fill them with 0)
-      fAttrDilations.resize(3, 1);
-      // Shape of the kernel
-      fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1),
-                          k2 + (fAttrDilations[1] - 1) * (k2 - 1),
-                          k3 + (fAttrDilations[2] - 1) * (k3 - 1)};
-
-      if (fAttrAutopad == "NOTSET") {
-         // NOTE(review): the ONNX Conv specification defaults missing pads to 0, here 1 is used - confirm
-         if (fAttrPads.empty()) {
-            fAttrPads = {1, 1, 1, 1, 1, 1};
-         }
-      } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") {
-         if (fDim == 1)
-            fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[0] / 2};
-         else if (fDim == 2)
-            fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2};
-         else if (fDim == 3)
-            fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2,
-                         fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2};
-         // add extra padding at beginning or end (depending if SAME_UPPER or SAME_LOWER)
-         // need to check this!
-         if (fAttrKernelShape[0] % 2 == 1) {
-            (fAttrAutopad == "SAME_UPPER") ? fAttrPads[0]++ : fAttrPads[i1]++;
-         }
-         if (fDim > 1 && fAttrKernelShape[1] % 2 == 1) {
-            (fAttrAutopad == "SAME_UPPER") ? fAttrPads[1]++ : fAttrPads[i2]++;
-         }
-         if (fDim > 2 && fAttrKernelShape[2] % 2 == 1) {
-            (fAttrAutopad == "SAME_UPPER") ? fAttrPads[2]++ : fAttrPads[i3]++;
-         }
-      } else if (fAttrAutopad != "VALID") {
-         throw
-            std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad");
-      }
-      // to be sure pad is vector of size 6, whatever the dimension: with VALID and no
-      // explicit pads the vector is empty and would otherwise be read out of bounds
-      fAttrPads.resize(6, 0);
-
-      if (fAttrStrides.empty()) {
-         fAttrStrides = {1, 1, 1};
-      }
-      // missing strides default to 1
-      fAttrStrides.resize(3, 1);
-
-      // output extent along one spatial axis; guards the unsigned subtraction and the division
-      auto outputExtent = [this](size_t inputSize, size_t pad, size_t axis) -> size_t {
-         if (fAttrStrides[axis] == 0 || inputSize + pad < fAttrKernelShape[axis])
-            throw
-               std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid stride or kernel larger than padded input");
-         return (inputSize + pad - fAttrKernelShape[axis]) / fAttrStrides[axis] + 1;
-      };
-
-      size_t input1 = input[0][2];
-      size_t input2 = (fDim > 1) ? input[0][3] : 1;
-      size_t input3 = (fDim > 2) ? input[0][4] : 1;
-
-      size_t pad1 = fAttrPads[0] + fAttrPads[i1];
-      size_t output1 = outputExtent(input1, pad1, 0);
-
-      size_t batch_size = input[0][0];        // first element in input tensor
-      size_t output_channels = input[1][0];   // first element in weight tensor
-
-      std::vector<std::vector<size_t>> ret({{ batch_size, output_channels, output1 }});
-
-      if (fDim == 1)
-         return ret;
-
-      size_t pad2 = fAttrPads[1] + fAttrPads[i2];
-      size_t output2 = outputExtent(input2, pad2, 1);
-      // output is N x M x OH x OW
-      ret[0].push_back(output2);
-      if (fDim == 2)
-         return ret;
-
-      size_t pad3 = fAttrPads[2] + fAttrPads[i3];
-      size_t output3 = outputExtent(input3, pad3, 2);
-
-      // output is N x M x OH x OW x OD
-      ret[0].push_back(output3);
-      return ret;
-   }
-
-   void Initialize(RModel& model) override { // validate X, W and optional B in the model, infer the output shape and register the output, broadcasted-bias and scratch tensors
-      fUseSession = model.UseSession();
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-         throw
-            std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model");
-      }
-      fShapeX = model.GetTensorShape(fNX);
-      if (fShapeX.size() < 3 || fShapeX.size()  > 5) {
-         std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
-         throw
-            std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
-      }
-      fDim = fShapeX.size() - 2; // number of spatial dimensions (1, 2 or 3): the rank of X minus its first two axes
-      if (!model.CheckIfTensorAlreadyExist(fNW)) {
-         throw
-            std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
-      }
-      fShapeW = model.GetTensorShape(fNW);
-      if (fShapeW.size() < 3 || fShapeW.size()  > 5) {
-         std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
-         throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions");
-      }
-      fShapeY = ShapeInference({fShapeX, fShapeW})[0]; // also normalises the dilation, kernel, pad and stride attributes
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-      if (fNB != "") { // an empty name means that the operator has no bias
-         if (!model.CheckIfTensorAlreadyExist(fNB)) {
-            throw
-               std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model");
-         }
-         fShapeB = model.GetTensorShape(fNB);
-         std::vector<size_t> targetShape(fShapeY.begin() + 1, fShapeY.end()); // output shape without its first (batch) axis
-         bool broadcast_needed = !UTILITY::AreSameShape(fShapeB, targetShape);
-         if (broadcast_needed) {
-            auto original_data = model.GetInitializedTensorData(fNB);
-            // make bias shape equal to Y shape by adding 1
-            if (fShapeB.size() < 1)
-               throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape");
-            // we assume bias tensor dimension is equal to number of filters that is the second dimension in
-            // the output tensor
-            if (fShapeB[0] != fShapeY[1])
-               throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " +
-                                           ConvertShapeToString(fShapeB));
-            if (fType != "float")
-               throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported");
-            // here is the actual broadcasting
-            if (!fUseSession) { // without session: broadcast now and overwrite the initialized bias tensor in the model
-               std::vector<size_t> shape(fDim + 1, 1);
-               shape[0] = fShapeB[0];
-               std::shared_ptr<void> new_data_ptr(
-                  UTILITY::UnidirectionalBroadcast<float>(static_cast<float *>(original_data.get()), shape, targetShape),
-                  std::default_delete<float[]>());
-               model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), targetShape, new_data_ptr);
-               fShapeB = model.GetTensorShape(fNB);
-               fNB2 = fNB;   // use same name
-            }
-            else {
-               // In case of session add broadcasting code in Session constructor and in GenerateInitCode
-               // we need to add a new intermediate tensor for broadcasted bias tensor
-               fNB2 = fNB + "bcast";
-               model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape);
-            }
-         }
-      }
-
-      size_t outputChannelSize = fShapeY[2];  // size/channel = D * H * W
-      size_t kernelSize = fAttrKernelShape[0]; // product of the kernel extents stored by ShapeInference (dilated values)
-      for (size_t i = 1; i < fDim; i++) {
-         outputChannelSize *= fShapeY[2 + i];
-         kernelSize *= fAttrKernelShape[i];
-      }
-
-      std::vector<size_t> shape1 = {fShapeW[0], fShapeW[1], kernelSize}; // shape of the scratch tensor <fNX>_f
-      std::vector<size_t> shape2 = {fShapeW[1], kernelSize, outputChannelSize}; // shape of the scratch tensor <fNX>_xcol
-      model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 );
-      model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 );
-      convK = fNX +"_f";
-      imcol = fNX +"_xcol";
-      fOutputTensorNames.emplace_back(convK); // the two scratch tensors are listed together with the operator outputs
-      fOutputTensorNames.emplace_back(imcol);
-   }
-
-   // Emit the initialization code that broadcasts the bias tensor.
-   // When a broadcasted bias name (fNB2) is set, the emitted block broadcasts tensor_<fNB>
-   // from shape (channels, 1, ...) to the output shape without its batch axis, copies the
-   // result into tensor_<fNB2> and frees the temporary buffer. Returns an empty string
-   // when fNB2 is empty.
-   std::string GenerateInitCode() override {
-      std::stringstream out;
-      // Generate initialization code for broadcasting of bias tensor
-      if (!fNB2.empty()) {
-         // bias seen as (channels, 1, ..., 1) with one entry per spatial dimension
-         std::vector<size_t> shape(fDim + 1, 1);
-         shape[0] = fShapeB[0];
-         // Broadcast target: the output shape without the batch axis. This is the shape used
-         // for the same broadcast in Initialize and the number of elements copied below;
-         // targeting the full fShapeY would allocate batch-size times more than is copied.
-         std::vector<size_t> targetShape(fShapeY.begin() + 1, fShapeY.end());
-         // include a separate scope to avoid defining unique operator temp variables
-         out << SP << "{\n";
-         out << SP << SP << "float * data = SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_"
-             << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(targetShape) << ");\n";
-         out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n";
-         out << SP << SP << "delete[] data;\n";
-         out << SP << "}\n";
-      }
-      return out.str();
-   }
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-
-      if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
-         throw
-            std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
-      }
-
-      std::stringstream out;
-      size_t bsize = fShapeX[0];
-      size_t kDepth = (fDim > 2) ?  fShapeW[2] : 1;  // kernel depth
-      size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1;  // kernel height
-      size_t kWidth = fShapeW[fDim+1]; // kernel width
-      size_t iDepth = (fDim > 2) ?  fShapeX[2] : 1;  // input depth
-      size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height
-      size_t iWidth = fShapeX[fDim+1]; // input width
-      size_t oDepth = (fDim > 2) ? fShapeY[2] : 1; // output depth
-      size_t oHeight = (fDim > 1) ? fShapeY[fDim] : 1;  // ouput height
-      size_t oWidth = fShapeY[fDim+1]; // output width
-
-      out << "\n//----  operator Conv " << OpName << "\n";
-
-      // vectorize the (dilated)convolution kernels into a matrix
-      // no need to transpose the matrix
-      // to fix for 1d and 3d
-
-      size_t id = (fDim > 2) ? fDim-3 : 2;
-      size_t ih = (fDim > 1) ? fDim-2 : 1;
-      size_t iw = fDim-1;
-
-      size_t wstrideDil = fAttrDilations[iw];
-      size_t hstride = kWidth;
-      size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw];  // stride dilated in the height
-      size_t dstride = kHeight * kWidth;
-      size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
-      size_t icstride = kHeight * kWidth * kDepth;
-      size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
-      size_t ocstride = fShapeW[1] * icstride;
-      size_t ocstrideDil = fShapeW[1] * icstrideDil;
-
-      out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n";
-      out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n";
-      if (fDim > 2)
-         out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
-      if (fDim > 1)
-         out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
-      out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";
-
-      out << SP << SP << SP << SP << SP << "tensor_" <<fNX <<  "_f[oc * "
-          << ocstrideDil << " + ic * " << icstrideDil;
-      if (fDim > 2) out << " + kd * " << dstrideDil;
-      if (fDim > 1) out << " + kh * " << hstrideDil;
-      out << " + kw * " << wstrideDil  << "  ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride;
-      if (fDim > 2) out << " + kd * " << dstride;
-      if (fDim > 1) out << " + kh * " << hstride;
-      out  << " + kw ];\n";
-
-      out << SP << SP << SP << SP << "}\n";
-      if (fDim > 1) out << SP << SP << SP << "}\n";
-      if (fDim > 2) out << SP << SP << SP << "}\n";
-      out << SP << SP << "}\n";
-      out << SP << "}\n";
-
-      //out << SP << "char " << OpName << "_transA = 'T';\n";
-      out << SP << "char " << OpName << "_transA = 'N';\n";
-      out << SP << "char " << OpName << "_transB = 'N';\n";
-      out << SP << "int " << OpName << "_m = " << oHeight * oWidth * oDepth << ";\n"; // output h*w
-      assert(fShapeY[1] == fShapeW[0]);
-      assert(fShapeW[1] == fShapeX[1] / fAttrGroup);
-      out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels
-      out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n";
-      out << SP << "float " << OpName << "_alpha = 1.0;\n";
-      out << SP << "float " << OpName << "_beta = 0.0;\n";
-
-
-      // Loop on batch size
-      out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";
-
-      // IM2COL: Unroll the input tensor
-      // order input data as  (e.g. kernel 2x2)  and (xa,ya) is channel 1 and (xb,yb) is channel 2
-      //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
-      //   (xa2,...xak+1,ya1,...yak)(......)
-      // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows.
-      // By doing this one has consecutive memory reads and writes
-      // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w)
-      if (fDim ==1) {
-         if (fAttrPads[0] != fAttrPads[1] ) {
-            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
-                      << std::endl;
-            fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
-         }
-         fAttrPads[1] = 0;
-         fAttrStrides[1] = 1;
-      }
-      if (fDim == 2) {
-         if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
-            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
-            fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
-            fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
-         }
-      }
-      if (fDim == 3) {
-         if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
-            std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
-            fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
-            fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
-            fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
-         }
-      }
-      out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
-
-      if (fAttrGroup == 1) {
-         out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << ";\n";
-         // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y,  output_h *
-         // output_w)
-         if (fDim < 3) {
-            out << SP << SP << "SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
-                << " + x_offset,"
-                //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
-                //  dilation_w,
-                //
-                << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
-            if (fDim == 1)
-               out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
-                   << fAttrDilations[0];
-            else // dim ==2
-               out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
-                   << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
-                   << fAttrDilations[1];
-            out << "," << "tensor_" <<fNX << "_xcol);\n\n ";
-         } else {
-            // 3d im2col
-            out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
-                << " + x_offset,"
-                //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
-                //  dilation_d, dilation_h, dilation_w,
-                //
-                << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << ","
-                << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << ","
-                << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << ","
-                << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
-                << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ","
-                << "tensor_" << fNX << "_xcol);\n\n ";
-         }
-         // BLAS
-         out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
-             << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName
-             << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
-         out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
-             << " + out_offset, &" << OpName << "_m);\n";
-      } else {
-         // case of group convolution
-         // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each
-         // group)
-         // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
-         out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
-         out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << " + g * "
-             << fShapeW[1] * iDepth * iHeight * iWidth << ";\n ";
-         out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << " + g * "
-             << fShapeW[0] * oDepth * oHeight * oWidth / fAttrGroup << ";\n ";
-
-         if (fDim < 3) {
-            out << SP << SP << "SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
-                << " + x_offset,"
-                //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
-                //  dilation_w,
-                //
-                << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
-            if (fDim == 1)
-               out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
-                   << fAttrDilations[0];
-            else // dim ==2
-               out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
-                   << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
-                   << fAttrDilations[1];
-            out << ", tensor_" << fNX << "_xcol);\n\n ";
-         } else {
-            // 3d im2col
-            out << SP << SP << "SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
-                << " + x_offset,"
-                //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
-                //  dilation_d, dilation_h, dilation_w,
-                //
-                << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << ","
-                << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1]
-                << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2]
-                << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX
-                << "_xcol);\n\n ";
-         }
-
-         // BLAS
-         // n must be divided by the number of groups
-         out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n";
-         // offset g must be  g * k * n
-         out << SP << SP << SP << "size_t offset_f = g * "
-             << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup
-             << ";\n";
-         out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
-             << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName
-             << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
-         out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
-             << " + out_offset"
-             << ", &" << OpName << "_m);\n";
-
-         out << SP << SP << "}\n"; // end of group loop
-      }
-
-      if (fNB2 != "") {
-         out << SP << "int " << OpName << "_size = " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
-         out << SP << "float " << OpName << "_gamma = 1.0;\n";
-         out << SP << "int " << OpName << "_incx = 1;\n";
-         out << SP << "int " << OpName << "_incy = 1;\n";
-
-         out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &"
-             << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n";
-
-      }
-      out << SP << "}\n"; // end of batch size loop
-
-      return out.str();
-      }
-
-   /*! \brief Returns the blas routines needed to compile the generated code
-    */
-   std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
-};
-
-} // namespace SOFIE
-
-#endif
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx
deleted file mode 100644
index c834a06..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifndef SOFIE_ROperator_Expand
-#define SOFIE_ROperator_Expand
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-template<typename T>
-class ROperator_Expand final : public ROperator{
-private:
-
-   std::vector<size_t> fShapeX;
-   std::vector<size_t> fShape;
-   std::vector<size_t> fShapeY;
-
-   std::string fNX;
-   std::string fNShape;
-   std::string fNY;
-   std::string fType;
-
-   bool fInitialized = false;
-
-public:
-   ROperator_Expand(){}
-   ROperator_Expand(std::string nameX, std::string nameShape, std::string nameY):
-      fNX(UTILITY::Clean_name(nameX)), fNShape(UTILITY::Clean_name(nameShape)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNX };
-         fOutputTensorNames = { fNY };
-      }
-
-   // type of output given input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      return input;
-   }
-
-   void Initialize(RModel& model) override {
-      // input must be a graph input, or already initialized intermediate tensor
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-        throw std::runtime_error("TMVA SOFIE Expand Op Input Tensor " + fNX + " is not found in model");
-      }
-      fShapeX = model.GetTensorShape(fNX);
-      if (!model.IsInitializedTensor(fNShape)) {
-         throw std::runtime_error("TMVA::SOFIE - Tensor " + fNShape + " is not initialized.");
-      }
-      int64_t *shapeData =
-           static_cast<int64_t *>(model.GetInitializedTensorData(fNShape).get());
-      fShape = model.GetTensorShape(fNShape);
-      if (fShape.size() != 1) {
-         throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor.");
-      }
-      size_t N = fShape[0];
-      std::vector<size_t> shape(shapeData, shapeData + N);
-      // Y is the common shape of fShapeX and shape
-      fShapeY = SOFIE::UTILITY::UnidirectionalBroadcastShape(
-        fShapeX, shape);
-      fInitialized = model.IsInitializedTensor(fNX);
-      // Broadcast X to the common shape fShapeY
-      bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY);
-      if (model.IsInitializedTensor(fNX)) {
-         // If X is an initialized tensor (constant)
-         auto data = model.GetInitializedTensorData(fNX);
-         if (broadcast) {
-            std::shared_ptr<void> broadcastedData(
-               UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeX, fShapeY),
-               std::default_delete<T[]>());
-            // Update the data and the shape of X
-            model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), fShapeY, broadcastedData);
-            fShapeX = fShapeY;
-            // need to set as a not writable tensor
-            model.SetNotWritableInitializedTensor(fNX);
-            data = broadcastedData;
-         }
-         if (broadcast || model.IsConstantTensor(fNX)) {
-            fIsOutputConstant = true; // constant output in this case
-            model.AddConstantTensor(fNY, model.GetTensorType(fNX), fShapeY, data);
-            fOutputTensorNames.pop_back();
-         } else {
-            model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-         }
-      } else {
-         // case input is not initialized
-         model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-      }
-      fType = ConvertTypeToString(model.GetTensorType(fNX));
-      if (model.Verbose())
-         std::cout << "Expand - output is with shape " << ConvertShapeToString(fShapeY) << std::endl;      
-   }
-
-   std::string GenerateInitCode() override {
-      std::stringstream out;
-      if (!fIsOutputConstant && (fInitialized || fShapeX == fShapeY  ) ) {
-         size_t length = ConvertShapeToLength(fShapeY);
-         out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n";
-         out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n";
-      }
-      return out.str();
-   }
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) return "";
-      OpName = "op_" + OpName;
-      if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Expand Op called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      out << SP << "\n//------ Expand Op" << "\n";
-      // No need to broadcast A if it's an initialized tensor or shapes are the same
-      if (!fInitialized && fShapeX != fShapeY) {
-         out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n";
-         out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << ConvertShapeToString(fShapeY)
-                   << ", std::span<"<<fType<<">(tensor_"<<fNY<<", "<<ConvertShapeToLength(fShapeY)<<"));\n";                   
-      }
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROperator_Expand
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx
deleted file mode 100644
index 4d34846..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx
+++ /dev/null
@@ -1,219 +0,0 @@
-#ifndef SOFIE_ROPERATOR_GATHER
-#define SOFIE_ROPERATOR_GATHER
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-#include <stdexcept>
-#include <string>
-
-
-namespace SOFIE{
-
-class ROperator_Gather final : public ROperator
-{
-private:
-
-   int64_t fAttrAxis = 0;
-
-   std::string fNX;
-   std::string fNIndices;
-   std::string fNY;
-
-   std::vector<size_t> fShapeX;
-   std::vector<size_t> fShapeIndices;
-   std::vector<size_t> fShapeY;
-
-   std::vector<int64_t> fIndices;  // indices vector in case they are known at initialization
-
-   std::string fType;
-
-public:
-   ROperator_Gather(){}
-   ROperator_Gather(int64_t attrAxis, std::string nameX, std::string nameIndices, std::string nameY):
-      fAttrAxis(attrAxis), fNX(UTILITY::Clean_name(nameX)), fNIndices(UTILITY::Clean_name(nameIndices)), fNY(UTILITY::Clean_name(nameY)) {
-         fInputTensorNames = { fNX, fNIndices };
-         fOutputTensorNames = { fNY };
-   }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input;
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-         throw std::runtime_error("TMVA SOFIE Gather Op Input Tensor " + fNX + " is not found in model");
-      }
-      fShapeX = model.GetTensorShape(fNX);
-      fShapeIndices = model.GetTensorShape(fNIndices);
-      size_t q = fShapeIndices.size();
-      // Axis in range [0, r) where r=rank(X)
-      size_t r = fShapeX.size();
-       // Set the axis
-      if (fAttrAxis < 0) {
-         fAttrAxis = fAttrAxis + int64_t(r);
-      }
-      // empty fShapeIndices is a scalar value for the indices
-      size_t indicesLength = ConvertShapeToLength(fShapeIndices);
-
-      // case indices tensor is initialized
-      if (model.IsInitializedTensor(fNIndices)) {
-         int64_t* indicesData = static_cast<int64_t*>(model.GetInitializedTensorData(fNIndices).get());
-         //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code)
-         model.SetNotWritableInitializedTensor(fNIndices);
-         // update indices data in case of negative dim values
-         for (size_t i = 0; i < indicesLength; i++) {
-            if (indicesData[i] < 0) {
-               indicesData[i] += fShapeX[fAttrAxis];
-            }
-         }
-         // Save in a vector gather Indices of size q
-         fIndices = std::vector<int64_t>(indicesData, indicesData + indicesLength);
-      }
-      // Output shape
-      if (model.Verbose())
-         std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertShapeToString(fShapeIndices) << std::endl;
-
-      if (fShapeY.empty()) {
-         fShapeY.resize(q + r - 1);
-         if (fAttrAxis > 0) {
-            // Copy shape of X[0, ..., axis) to Shape of Y[0, ..., axis)
-            std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin());
-         }
-         // Set shape of Y[axis, ..., axis + q)
-         for (size_t i = 0; i < q; i++) {
-            fShapeY[fAttrAxis + i] = fShapeIndices[i];
-         }
-         // Copy shape of X[axis + 1, ..., axis + r) to shape of Y[axis + q, ... q + r - 1)
-         std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q);
-      }
-      // case input is known (type is an integer) and input indices is a scalar (or vector of size 1)
-      if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) {
-         if (model.GetTensorType(fNX) == ETensorType::INT64) {
-            auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNX).get());
-            // if q <=1 and r = 1 output length = 1 (it is a scalar)
-            std::vector<int64_t> outputData(ConvertShapeToLength(fShapeY));
-            outputData[0] = inputData[fIndices[0]];
-            model.AddConstantTensor(fNY, fShapeY, outputData.data());
-            if (model.Verbose())
-               std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY)
-                   << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl;
-            fIsOutputConstant = true;
-         }
-      }
-      if (!fIsOutputConstant) {
-         // Add output tensor
-         model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-         fType = ConvertTypeToString(model.GetTensorType(fNX));
-         if (model.Verbose())
-               std::cout <<  "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY)
-                  << std::endl;
-      }
-   }
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) {
-         // no code to generate here for constant output. Tensor output is defined in Session constructor
-         return "//---------------------------------------\n";
-      }
-      OpName = "op_" + OpName;
-      std::stringstream out;
-      out << "//--------- Gather operator \n";
-      // The shape of the output is q + r - 1
-      size_t r = fShapeX.size();
-      // Indices of shape q
-      size_t q = fShapeIndices.size();
-      // Strides
-      std::vector<size_t> stridesX = UTILITY::ComputeStrideFromShape(fShapeX);
-      std::vector<size_t> stridesY = UTILITY::ComputeStrideFromShape(fShapeY);
-      std::vector<size_t> stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices);
-
-      // case fIndices is not known we need to correct for negative axis indices at run-time
-      if (fIndices.empty()) {
-         size_t indicesLength = ConvertShapeToLength(fShapeIndices);
-         out << SP << "// correct in case of negative gather indices\n";
-         out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n";
-         out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n";
-         out << SP << SP << SP <<  "tensor_" << fNIndices << "[i] += " << fShapeX[fAttrAxis] << ";\n";
-         out << SP << "}\n";
-      }
-
-
-      // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}]
-      // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1)
-      // iterate in [0 ... axis) [0 ... q) [axis ... r - 1)
-      // for j_0, j_1, ..., j_{axis-1}
-      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
-         std::string index = "j_" + std::to_string(j);
-         out << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n";
-      }
-      // for i_0, i_1, ..., i_{q - 1}
-      if (q == 0)
-         out << SP << SP << "{\n";  // add a scope for local variables
-      for (size_t i = 0; i < q; i++) {
-         std::string index = "i_" + std::to_string(i);
-         out << SP << SP << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n";
-      }
-      // for j_axis, j_{axis + 1}, ..., j_{r - 1}
-      for (size_t j = fAttrAxis; j + 1 < r; j++) {
-         std::string index = "j_" + std::to_string(j);
-         out << SP << SP << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n";
-      }
-
-      out << SP << SP << SP << "size_t y_index = 0;\n";
-      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
-         out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[j] << ";\n";
-      }
-      for (size_t i = 0; i < q; i++) {
-         out << SP << SP << SP << "y_index += i_" + std::to_string(i) + " * " << stridesY[fAttrAxis + i] << ";\n";
-      }
-      for (size_t j = fAttrAxis; j + 1 < r; j++) {
-         out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[q + j] << ";\n";
-      }
-      // Indices
-      out << SP << SP << SP << "size_t i_index = 0;\n";
-      for (size_t i = 0; i < q; i++) {
-         out << SP << SP << SP << "i_index += i_" + std::to_string(i) + " * " << stridesIndices[i] << ";\n";
-      }
-      // K
-      out << SP << SP << SP << "size_t k = static_cast<size_t>(" << "tensor_" << fNIndices << "[i_index]" << ");\n";
-      // Input
-      out << SP << SP << SP << "size_t x_index = k * " << stridesX[fAttrAxis] << ";\n";
-      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
-         out << SP << SP << SP << "x_index += j_" + std::to_string(j) + " * " << stridesX[j] << ";\n";
-      }
-      for (size_t j = fAttrAxis + 1; j < r; j++) {
-         out << SP << SP << SP << "x_index += j_" + std::to_string(j - 1) + " * " << stridesX[j] << ";\n";
-      }
-      out << SP << SP << SP << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n";
-
-      // end loops j_k, j_{k + 1}, ..., j_{r - 2}
-      for (size_t j = fAttrAxis; j + 1 < r; j++) {
-         out << SP << SP << SP << "}\n";
-      }
-      // end loops i_0, i_1, ..., i_{q - 1}
-      if (q == 0)
-         out << SP << SP << "}\n";  // end of scope for q = 0
-      for (size_t i = 0; i < q; i++) {
-         out << SP << SP << "}\n";
-      }
-      // end loops j_0, j_1, ..., j_{axis - 1}
-      for (size_t j = 0; j < size_t(fAttrAxis); j++) {
-         out << SP << "}\n";
-      }
-
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_RELU
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx
deleted file mode 100644
index 046bf56..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx
+++ /dev/null
@@ -1,399 +0,0 @@
-#ifndef SOFIE_ROPERATOR_GEMM
-#define SOFIE_ROPERATOR_GEMM
-
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-#include <algorithm>
-#include <iterator>
-#include <iomanip>
-#include <limits>
-#include <cassert>
-
-
-namespace SOFIE{
-
-
-   template <typename T>
-   class ROperator_Gemm final : public ROperator
-   {
-
-   private:
-      bool fIsDynamic = false;
-
-      float fAttrAlpha = 1.0;
-      float fAttrBeta = 1.0;
-      int_t fAttrTransA = 0;
-      int_t fAttrTransB = 0;
-
-      std::string fNA;
-      std::string fNB;
-      std::string fNC = "";
-      std::string fNC2; // bias tensor name after broadcasting
-      std::string fNY;
-      std::string fType;
-      EActivationType fActivation;
-      std::vector<Dim> fShapeA;
-      std::vector<Dim> fShapeB;
-      std::vector<size_t> fShapeC;
-      std::vector<Dim> fShapeY;
-
-   public:
-
-      ROperator_Gemm(){}
-      ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameY, EActivationType activation=EActivationType::UNDEFINED):
-         fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)),
-         fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY))
-      {
-         fActivation = activation;
-         fType = "float";
-         static_assert(std::is_same_v<T, float>,
-                  "TMVA::SOFIE - Unsupported type parsing a Gemm operator");
-         fInputTensorNames = { fNA, fNB };
-         fOutputTensorNames = { fNY };
-      }
-
-      ROperator_Gemm(float alpha, float beta, int_t transA, int_t transB, std::string nameA, std::string nameB, std::string nameC, std::string nameY, EActivationType activation=EActivationType::UNDEFINED):
-         fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)),
-         fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation)
-      {
-         fActivation = activation;
-         fType = "float";
-
-         fOutputTensorNames = { fNY };
-      }
-
-      std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-         ETensorType out = input[0];
-         return {out};
-      }
-
-      template <typename U>
-      std::vector<std::vector<U>> DoShapeInference(const std::vector<std::vector<U>> & input){
-         if (input.size() > 3) throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor");
-         // accept tensor with input dimensions > 2
-         // example: A = (d1,d2,...,N1,N2)  B = (d1,d2,...,N2,N3)    --> Y = (d1,d2,..,N1,N3)
-         for (auto& i: input){
-            if (i.size() < 2){
-               throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only accept input tensor with >=2 dimensions");
-            }
-         }
-
-         std::vector<std::vector<U>> ret;
-         // when there are 3 inputs shape of Y is the one of C
-         if (input.size() == 3){
-            ret.push_back(input[2]);   //shape of C is shape of Y
-            return ret;
-         }
-         // ioffset cannot be less than 2
-         int ioffset = input[0].size()-2;  // in case of tensors with dim > 2
-
-         std::vector<U> s_a(input[0].begin() + ioffset, input[0].begin() + ioffset + 2);
-         std::vector<U> s_b(input[1].begin() + ioffset, input[1].begin() + ioffset + 2);
-         // reverse in case of transpose
-         if (fAttrTransA){
-            std::reverse(s_a.begin(), s_a.end());
-         }
-         if (fAttrTransB){
-            std::reverse(s_b.begin(), s_b.end());
-         }
-         std::vector<U> s_y;
-         s_y.reserve(input[0].size());
-         if (input[0].size() > 2 && input[1].size() == input[0].size()) {
-            // in case of dim > 2 first dimensions are equal to the input ones not
-            // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4))
-            for (size_t i = 0; i < input[0].size()-2; i++) {
-               Dim valueA = input[0][i];
-               Dim valueB = input[1][i];
-               if (valueA.GetVal() != valueB.GetVal()) {
-                  if (valueB.GetVal() == "1")
-                     s_y.push_back(input[0][i]);
-                  else if (valueA.GetVal() == "1")
-                     s_y.push_back(input[1][i]);
-                  else
-                     throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and "
-                        + valueB.GetVal());
-               }
-               s_y.push_back(input[0][i]);
-            }
-         }
-
-         s_y.push_back(s_a[0]);
-         s_y.push_back(s_b[1]);
-         ret.push_back(s_y);
-         return ret;
-      }
-
-      std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-         return DoShapeInference<size_t>(input);
-      }
-      std::vector<std::vector<Dim>> DynamicShapeInference(const std::vector<std::vector<Dim>> & input){
-         return DoShapeInference<Dim>(input);
-      }
-
-
-
-      void Initialize(RModel& model) override {
-         //TODO: propagate A or B as specified by ONNX standard
-
-         if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){   //input must be a graph input, or already initialized intermediate tensor
-            throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model");
-         }
-         if (fNC != ""){
-            if (model.CheckIfTensorAlreadyExist(fNC) == false){   //input must be a graph input, or already initialized intermediate tensor
-               throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is not found in model");
-            }
-         }
-         if (model.IsDynamicTensor(fNA) || model.IsDimInputTensor(fNA) ) {
-            fShapeA = model.GetDynamicTensorShape(fNA);
-            fIsDynamic = true;
-         } else {
-            auto shapeA_int = model.GetTensorShape(fNA);
-            fShapeA = ConvertShapeToDim(shapeA_int);
-         }
-         // case A is of dim1 we prepend a 1 but we need to remove later
-         bool prependOne = false;
-         if (fShapeA.size() == 1) {
-            fShapeA.insert(fShapeA.begin(), Dim(1));
-            prependOne = true;
-         }
-
-         if (model.IsDynamicTensor(fNB) || model.IsDimInputTensor(fNB)) {
-            fShapeB = model.GetDynamicTensorShape(fNB);
-            fIsDynamic = true;
-         }
-         else {
-            auto shapeB_int = model.GetTensorShape(fNB);
-            fShapeB = ConvertShapeToDim(shapeB_int);
-         }
-         // case B is dim1 we append a 1 but we need to remove later
-         bool appendOne = false;
-         if (fShapeB.size() == 1) {
-            fShapeB.insert(fShapeB.end(), Dim(1));
-            appendOne = true;
-         }
-         // assume if not shape is 2 that extra values are 1.
-         // implement also MatMul case where we stack matrices (see numpy.matmul)
-         if (fShapeA.size() != fShapeB.size()) {
-            // if different dimensions we prepend 1 values
-            if (fShapeA.size() < fShapeB.size()) {
-               fShapeA.insert(fShapeA.begin(), fShapeB.size()-fShapeA.size(), Dim(1));
-            } else if (fShapeB.size() < fShapeA.size()) {
-               fShapeB.insert(fShapeB.begin(), fShapeA.size()-fShapeB.size(), Dim(1));
-            }
-         }
-
-         fShapeY = DynamicShapeInference({fShapeA, fShapeB})[0];
-         std::vector<size_t> shapeY;
-         if (!fIsDynamic) {
-            shapeY = ConvertShapeToInt(fShapeY);
-            if (shapeY.empty()) {
-               throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDynamicShapeToString(fShapeY));
-            }
-         }
-
-         // bias is normally not dynamic (not support it for time being)
-         if (fNC != ""){
-            // normally bias is fixed and not dynamic
-            if (model.IsDynamicTensor(fNC)) {
-               throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported");
-            }
-            fShapeC = model.GetTensorShape(fNC);
-            fNC2 = fNC;
-            size_t lengthC = ConvertShapeToLength(fShapeC);
-            size_t lengthY = ConvertShapeToLength(shapeY);
-            // for dynamic outputs broadcasting is always done
-            bool broadcast_needed = lengthC != lengthY;
-
-
-            if (broadcast_needed) {
-               if (!model.UseSession()) {
-                  // without session dynamic tensors not supported in Gemm
-                  if (fIsDynamic) {
-                      throw std::runtime_error("TMVA SOFIE Gemm Op:  dynamic tensors not supported without a session");
-                  }
-                  auto original_data = model.GetInitializedTensorData(fNC);
-                  auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY);
-                  if (fType == "float") {
-                     std::shared_ptr<void> new_data_ptr(UTILITY::UnidirectionalBroadcast<float>(
-                        static_cast<float *>(original_data.get()), fShapeC, targetShape),
-                        std::default_delete<float[]>());
-
-                     model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr);
-                     fShapeC = shapeY;
-                  }
-               } else {
-                  // In case of session add broadcasting code in Session constructor and in GenerateInitCode
-                  // we need to add a new intermediate tensor for broadcasted bias tensor
-                  fNC2 = fNC + "bcast";
-                  if (!fIsDynamic) {
-                     model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY);
-                  }
-                  else
-                     model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY);
-               }
-            }
-         }
-
-         // remove appended or prepended value of 1
-         if (prependOne) {
-            if (fIsDynamic)
-               fShapeY.erase(fShapeY.begin());
-            else
-               shapeY.erase(shapeY.begin());
-         }
-         if (appendOne) {
-            if (fIsDynamic)
-               fShapeY.erase(fShapeY.end()-1);
-            else
-               shapeY.erase(shapeY.end()-1);
-         }
-
-         if (!fIsDynamic)
-            model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), shapeY);
-         else
-            model.AddDynamicTensor(fNY, model.GetTensorType(fNA), fShapeY);
-
-         if (model.Verbose()){
-            std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape ";
-            if (fIsDynamic)
-               std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl;
-            else
-               std::cout << ConvertShapeToString(shapeY) << std::endl;
-         }
-
-         model.AddNeededStdLib("algorithm");
-      }
-
-      std::string GenerateInitCode() override {
-         std::stringstream out;
-         // generate initialization code for broadcasting of bias tensor
-         if (fShapeC.size() != fShapeY.size() && fNC != fNC2) {
-            // we broadcast here always C in Y output, so target shape is the one of Y
-            // no need to call UTILITY::UnidirectionalBroadcastShape.
-            // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code.
-            auto targetShape = fShapeY;
-            // include a separate scope to avoid defining unique operator temp variables
-            out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n";
-            out << SP << "{\n";
-            out << "      float * data = SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_"
-               << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n";
-            auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size
-            out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n";
-            out << SP << SP << "delete [] data;\n";
-            out << SP << "}\n";
-         }
-         return out.str();
-      }
-
-      std::string Generate(std::string opName) override {
-         opName = "op_" + opName;
-
-         if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) {
-            throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first");
-         }
-         std::stringstream out;
-         out << "\n//--------- Gemm\n";
-         out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n";
-         out << SP << "char " << opName << "_transB = " << (fAttrTransB ? "\'t\'" : "\'n\'") << ";\n";
-         // need to consider case A and B have dim > 2 (for MatMul)
-         int64_t dimA = fShapeA.size();
-         int64_t dimB = fShapeB.size();
-         int64_t dimY = fShapeY.size();
-         if (dimA != dimB || dimA != dimY) {
-             throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output");
-         }
-         auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal());
-         auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal());
-         auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal());
-         std::vector<Dim> sY = {fShapeY[dimY-2], fShapeY[dimY-1]};
-         // extra dimensions in case of stacked MatMul
-         std::vector<Dim> sA;
-         for (int64_t i = 0; i < dimY-2; i++) {
-            sA.push_back(fShapeY[i]);
-         }
-         auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation
-         auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul)
-
-         out << SP << "int " << opName << "_m = " << m << ";\n";
-         out << SP << "int " << opName << "_n = " << n << ";\n";
-         out << SP << "int " << opName << "_k = " << k << ";\n";
-         out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits<float>::max_digits10) << fAttrAlpha << ";\n";
-         out << SP << "float " << opName << "_beta = " << std::setprecision(std::numeric_limits<float>::max_digits10) << fAttrBeta << ";\n";
-         out << SP << "int " << opName << "_lda = " << (fAttrTransA ? m : k) << ";\n";
-         out << SP << "int " << opName << "_ldb = " << (fAttrTransB ? k : n) << ";\n";
-
-         // case bias is present
-         if (!fNC.empty()){
-            if (fNC2 == fNC) {
-               // add a check in case broadcasting was not needed or done outside of session
-               // C should have smaller dimension of Y
-               if (!fIsDynamic) {
-                  if (std::stoi(lengthGemm) != static_cast<int>(ConvertShapeToLength(fShapeC)))
-                     throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size "
-                            + ConvertShapeToString(fShapeC) + " output length " + lengthGemm);
-               } else {
-                  // add a dynamic check (C should not be a dynamic tensor)
-                  out << SP << "assert(" << lengthGemm << " != " <<  ConvertShapeToLength(fShapeC) << ");\n";
-               }
-            }
-         } else {
-            //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use
-            // the previous result
-            if (fAttrBeta != 0) {
-               throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero");
-            }
-         }
-
-         // include MatMul case where we stack the Gemm operations
-         // exclude case where we have only 1's in the additional dims
-         bool doStackMul = dimY > 2 && ( fIsDynamic  || std::stoi(lengthExtra) > 1);
-         if (doStackMul) {
-            out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations
-            out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n";
-            out << SP;
-         }
-         // in the case of bias
-         if (!fNC.empty()){
-            out << SP << "std::copy(" << "tensor_" << fNC2 << ", " << "tensor_" << fNC2 << " + " << lengthGemm << ", "
-               << "tensor_" << fNY;
-            if (doStackMul) out << " + " << opName << "_yoffset";
-            out << ");\n";
-         }
-
-
-         if (fType == "float"){
-
-            out << SP << "BLAS::sgemm_(&" << opName << "_transB, &" << opName << "_transA, &" << opName
-             << "_n, &" << opName << "_m, &" << opName << "_k, &" << opName << "_alpha, " << "tensor_" << fNB
-             << ", &" << opName << "_ldb, " << "tensor_" << fNA << ", &" << opName << "_lda, &" << opName << "_beta, "
-             << "tensor_" << fNY;
-             if (doStackMul) out << " + " << opName << "_yoffset";
-             out << ", &" << opName << "_n);\n";
-
-            if(fActivation == EActivationType::RELU){
-               out << SP << "for (int id = 0; id < " << SOFIE::ConvertDynamicShapeToLength(fShapeY) << " ; id++){\n";
-               out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n";
-               out << SP << "}\n";
-            }
-         }
-
-         if (doStackMul) {
-            out << SP << SP <<  opName << "_yoffset += " << lengthGemm << ";\n";
-            out << "}\n"; // end of loop on the stacked multiplications
-         }
-
-         return out.str();
-      }
-
-      std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; }
-
-   };
-
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_GEMM
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx
deleted file mode 100644
index 17b77b3..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx
+++ /dev/null
@@ -1,343 +0,0 @@
-#ifndef SOFIE_ROPERATOR_LAYERNORMALIZATION
-#define SOFIE_ROPERATOR_LAYERNORMALIZATION
-
-#include "SOFIE/RModel.hxx"
-#include "SOFIE/SOFIE_common.hxx"
-
-#include <sstream>
-#include <string>
-
-
-namespace SOFIE {
-
-template <typename T>
-class ROperator_LayerNormalization : public ROperator {
-private:
-   int fAttrAxis;
-   float fAttrEpsilon;
-   size_t fAttrStashType;
-
-   std::string fNX;
-   std::string fNScale;
-   std::string fNB;
-   std::string fNY;
-   std::string fNMean;
-   std::string fNInvStdDev;
-
-   std::string fNCastedX;
-   std::string fNNormalizedX;
-   std::string fNBroadcastedB;
-
-   std::vector<Dim> fShapeX;
-   std::vector<Dim> fShapeScale;
-   std::vector<size_t> fShapeB;  // shape of input Bias (B) is assumed to be fully defined
-   std::vector<Dim> fShapeY;
-   std::vector<Dim> fShapeMean;
-   std::vector<Dim> fShapeInvStdDev;
-
-   size_t fAxis; // axis in [0, size)
-   size_t fSize; // Size of the input
-   // size_t fAxisDim;
-
-   std::vector<Dim> fNormalizedShape;
-   std::vector<Dim> fAxesShape;
-   // lengths in string format
-   std::string fLength; // Length of the input
-   std::string fNormalizedLength;
-   std::string fAxesLength;
-
-   std::string fType;
-
-public:
-   ROperator_LayerNormalization() {}
-
-   ROperator_LayerNormalization(int axis, float epsilon, size_t stashType, const std::string &nameX,
-                                const std::string &nameScale, const std::string &nameB, const std::string &nameY,
-                                const std::string &nameMean, const std::string &nameInvStdDev)
-      : fAttrAxis(axis), fAttrEpsilon(epsilon), fAttrStashType(stashType), fNX(UTILITY::Clean_name(nameX)),
-        fNScale(UTILITY::Clean_name(nameScale)), fNB(UTILITY::Clean_name(nameB)),
-        fNY(UTILITY::Clean_name(nameY)), fNMean(UTILITY::Clean_name(nameMean)), fNInvStdDev(UTILITY::Clean_name(nameInvStdDev))
-   {
-         fInputTensorNames = { fNX, fNScale };
-         if (!fNB.empty()){
-            fInputTensorNames.emplace_back(fNB);
-         }
-
-         fOutputTensorNames = { fNY };
-         if (!fNMean.empty()){
-            fOutputTensorNames.emplace_back(fNMean);
-         }
-         if (!fNInvStdDev.empty()){
-            fOutputTensorNames.emplace_back(fNInvStdDev);
-         }
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override { return input; }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
-
-   void Initialize(RModel& model) override {
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-         throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found.");
-      }
-      bool isDynamic = model.IsDynamicTensor(fNX);
-      fShapeX = model.GetDynamicTensorShape(fNX);
-      fShapeY = fShapeX;
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-      // Type of the output
-      fType = ConvertTypeToString(model.GetTensorType(fNX));
-      // Size of the input
-      fSize = fShapeX.size();
-      // Axis in [0, size)
-      fAxis = (fAttrAxis < 0) ? fSize + fAttrAxis : fAttrAxis;
-      // Shape of fShapeX[0, ..., fAxis)
-      fAxesShape = std::vector<Dim>(fShapeX.begin(), fShapeX.begin() + fAxis);
-      // Length of the axes
-      fAxesLength = ConvertDynamicShapeToLength(fAxesShape);
-      // Shape of fShapeX[fAxis, ..., fSize)
-      fNormalizedShape = std::vector<Dim>(fShapeX.begin() + fAxis, fShapeX.end());
-      // Length of the normalized axis
-      fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape);
-      // length of the input
-      fLength = ConvertDynamicShapeToLength(fShapeX);
-      // Type of mean and std
-      ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX);
-      // Mean
-      if (fNMean.empty()) {
-         fNMean = "Mean" + fNX;
-         // cannot use initializer list with one element since it is ambiguous
-         if (isDynamic)
-            // add size_t(-1) to indicate that shape is an expression
-            model.AddIntermediateTensor(fNMean, type, std::vector<Dim>(1,Dim{fAxesLength,std::size_t(-1)}));
-         else
-            model.AddIntermediateTensor(fNMean, type, std::vector<size_t>(1,std::stoi(fAxesLength)));
-      }
-      // Inverse Standard Deviation
-      if (fNInvStdDev.empty()) {
-         fNInvStdDev = "InvStdDev" + fNX;
-         if (isDynamic)
-            model.AddIntermediateTensor(fNInvStdDev, type, std::vector<Dim>(1,Dim{fAxesLength,std::size_t(-1)}));
-         else
-            model.AddIntermediateTensor(fNInvStdDev, type, std::vector<size_t>(1,std::stoi(fAxesLength)));
-      }
-      // Cast X to float
-      if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) {
-         fNCastedX = "Casted" + fNX;
-         model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX);
-         fNNormalizedX = "Normalized" + fNX;
-         model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX);
-      }
-      // Broadcast the bias
-      if (!fNB.empty()) {
-         fShapeB = model.GetTensorShape(fNB);
-         size_t lengthB = ConvertShapeToLength(fShapeB);
-         if (isDynamic || lengthB < static_cast<size_t>(std::stoi(fLength))) {
-            fNBroadcastedB = "Broadcasted" + fNB;
-            model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX);
-         }
-      }
-      model.AddNeededStdLib("cmath");
-   }
-
-   std::string GenerateInitCode() override
-   {
-      std::stringstream out;
-      if (!fNBroadcastedB.empty()) {
-         out << SP << "// Broadcasting the bias of LayerNormalization op\n";
-         out << SP << "{\n";
-         out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_";
-         out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n";
-         out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n";
-         out << SP << "delete[] data;\n";
-         out << SP << "}\n";
-      }
-      return out.str();
-   }
-
-   std::string Generate(std::string opName) override
-   {
-      opName = "op_" + opName;
-      if (fShapeX.empty()) {
-         throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName +
-                                  " called to generate without being initialized first.");
-      }
-      if (fShapeX.size() > 5) {
-         throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not "
-                                  "implemented for input tensor of size > 5.");
-      }
-
-      std::stringstream out;
-
-      out << "//---- Layer Normalization  operator " << opName << "\n";
-
-      // Loop over all the normalized axes i.e. [axis, ..., size)
-      std::vector<std::string> inputShape(fSize);
-
-      for (size_t i = 0; i < fSize; i++) {
-         inputShape[i] = fShapeX[i].GetVal();
-      }
-
-      auto strides = UTILITY::ComputeStrideFromShape(fShapeX);
-      std::string InputIndex = "axis_0 * " + strides[0].GetVal();
-      for (size_t i = 1; i < fSize; i++) {
-         InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal();
-      }
-
-      auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape);
-      std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal();
-      for (size_t i = 1; i < fAxis; i++) {
-         axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal();
-      }
-
-      auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape);
-      std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal();
-      for (size_t i = fAxis + 1; i < fSize; i++) {
-         normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal();
-      }
-
-      if (!fNCastedX.empty()) {
-         // Cast X to float
-         out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n";
-         out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast<float>(tensor_" << fNX;
-         out << "[i]);\n";
-         out << SP << "}\n";
-      }
-
-      out << SP << "// Compute the mean\n";
-      // Loop over the normalized dimensions
-      for (size_t i = 0; i < fAxis; i++) {
-         std::string iIdx = "axis_" + std::to_string(i);
-         out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
-                      << "; " << iIdx << "++) {\n";
-      }
-      out << SP << SP << fType << " sum = 0.;\n";
-      // loop over all the dims in [0, fAxis)
-      for (size_t j = fAxis; j < fSize; j++) {
-         std::string jIdx = "axis_" + std::to_string(j);
-         out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
-                         << "; " << jIdx << "++) {\n";
-      }
-      out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n";
-      for (size_t j = fAxis; j < fSize; j++) {
-         out << SP << SP << "}\n";
-      }
-      out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "(";
-      out << fNormalizedLength << ");\n";
-      for (size_t i = fAxis; i < fSize; i++) {
-         out << SP << "}\n";
-      }
-
-      out << SP << "// Compute the inverse Standard Deviation\n";
-      // Loop over the normalized dimensions
-      for (size_t i = 0; i < fAxis; i++) {
-         std::string iIdx = "axis_" + std::to_string(i);
-         out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
-                   << "; " << iIdx << "++){\n";
-      }
-      // Set sum = 0
-      out << SP << SP << fType << " sum = 0.;\n";
-      // loop over all the dims in [0, fAxis)
-      for (size_t j = fAxis; j < fSize; j++) {
-         std::string jIdx = "axis_" + std::to_string(j);
-         out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
-                          << "; " << jIdx << "++){\n";
-      }
-      out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_"
-                            << fNMean << "[" << axesIndex << "];\n";
-      out << SP << SP << SP << "sum += tmp*tmp;\n";
-      for (size_t j = fAxis; j < fSize; j++) {
-         out << SP << SP << "}\n";
-      }
-      out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt(";
-      out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n";
-      for (size_t i = 0; i < fAxis; i++) {
-         out << SP << "}\n";
-      }
-
-      if (!fNCastedX.empty()) {
-         out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n";
-         for (size_t i = 0; i < fAxis; i++) {
-            std::string iIdx = "axis_" + std::to_string(i);
-            out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
-                          << "; " << iIdx << "++){\n";
-         }
-         for (size_t j = fAxis; j < fSize; j++) {
-            std::string jIdx = "axis_" + std::to_string(j);
-            out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
-                             << "; " << jIdx << "++){\n";
-         }
-         out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_";
-         out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex;
-         out << "] - tensor_" << fNMean << "[" << axesIndex << "])\n";
-         for (size_t j = fAxis; j < fSize; j++) {
-            out << SP << SP << "}\n";
-         }
-         for (size_t i = fAxis; i < fSize; i++) {
-            out << SP << "}\n";
-         }
-         out << "// Y = Scale o NormalizedX";
-         for (size_t i = 0; i < fAxis; i++) {
-            std::string iIdx = "axis_" + std::to_string(i);
-            out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
-                      << "; " << iIdx << "++){\n";
-         }
-         for (size_t j = fAxis; j < fSize; j++) {
-            std::string jIdx = "axis_" + std::to_string(j);
-            out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
-                            << "; " << jIdx << "++){\n";
-         }
-         out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale;
-         out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex;
-         out << "]);\n";
-         for (size_t j = fAxis; j < fSize; j++) {
-            out << SP << SP << "}\n";
-         }
-         for (size_t i = fAxis; i < fSize; i++) {
-            out << SP << "}\n";
-         }
-      } else {
-         out << SP << "// Y = Scale o InvStdDev (X - Mean)\n";
-         for (size_t i = 0; i < fAxis; i++) {
-            std::string iIdx = "axis_" + std::to_string(i);
-            out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i]
-                         << "; " << iIdx << "++){\n";
-         }
-         for (size_t j = fAxis; j < fSize; j++) {
-            std::string jIdx = "axis_" + std::to_string(j);
-            out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j]
-                           << "; " << jIdx << "++){\n";
-         }
-         out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale;
-         out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex;
-         out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "[";
-         out << axesIndex << "]);\n";
-         for (size_t j = fAxis; j < fSize; j++) {
-            out << SP << SP << "}\n";
-         }
-         for (size_t i = fAxis; i < fSize; i++) {
-            out << SP << "}\n";
-         }
-      }
-
-      if (!fNB.empty()) {
-         std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB);
-         out << SP << "// Add the bias to Y\n";
-         out << SP << "int " << opName << "_n = " << fLength << ";\n";
-         out << SP << "float " << opName << "_alpha = 1.;\n";
-         out << SP << "int " << opName << "_inc = 1;\n";
-         out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &";
-         out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n";
-      }
-
-      return out.str();
-   }
-
-   std::vector<std::string> GetBlasRoutines() override { return { std::string("Axpy") }; }
-
-   std::vector<std::string> GetStdLibs() override { return { std::string("cmath") }; }
-};
-
-} // namespace SOFIE
-
-
-#endif
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx
deleted file mode 100644
index 8fefa6d..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx
+++ /dev/null
@@ -1,82 +0,0 @@
-#ifndef SOFIE_ROPERATOR_LeakyRelu
-#define SOFIE_ROPERATOR_LeakyRelu
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-template <typename T>
-class ROperator_LeakyRelu final : public ROperator
-{
-
-private:
-
-   /* Attributes*/
-   float falpha=0.01; //default value
-   std::string fNX;
-   std::string fNY;
-   std::vector<size_t> fShape;
-   std::string fType;
-
-public:
-   ROperator_LeakyRelu(){}
-   ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY):
-   falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
-   {
-      if(std::is_same<T, float>::value){
-         fType = "float";
-      }
-		else{
-			throw
-				std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Leaky Relu operator");
-		}
-
-      fInputTensorNames = { fNX };
-      fOutputTensorNames = { fNY };
-   }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Leaky Relu Op Input Tensor is not found in model");
-      }
-      fShape = model.GetTensorShape(fNX);
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
-   }
-
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-      if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator Leaky Relu called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      size_t length = ConvertShapeToLength(fShape);
-
-      out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits<float>::max_digits10) << falpha << ";\n";
-
-      out << "\n//------ LEAKY RELU\n";
-      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] >= 0 )? tensor_" << fNX << "[id] : "<< OpName << "_alpha * tensor_"<< fNX<<"[id]);\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_LeakyRelu
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx
deleted file mode 100644
index 886aef1..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Reduce.hxx
+++ /dev/null
@@ -1,270 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Reduce
-#define SOFIE_ROPERATOR_Reduce
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <memory>
-#include <sstream>
-#include <algorithm>
-#include <stdexcept>
-#include <vector>
-#include <cassert>
-
-
-namespace SOFIE{
-
-enum EReduceOpMode { ReduceMean, ReduceSum, ReduceSumSquare, ReduceProd, InvalidReduceOp };
-
-template <typename T, EReduceOpMode Op>
-class ROperator_Reduce final : public ROperator
-{
-private:
-    /* Attributes*/
-    int fkeepdims = 1; //default value
-    std::vector<int64_t> fAttrAxes;
-    EReduceOpMode fReduceOpMode;
-    std::string fNX;
-    std::string fNAxes;
-    std::string fNY;
-    std::vector<size_t> fShapeX;
-    std::vector<size_t> fShapeY;
-    std::vector<size_t> fShapeYNotPruned; // needed for fKeepdims=0
-
-
-public:
-
-   std::string Name() {
-      if (fReduceOpMode == ReduceMean)  return "ReduceMean";
-      else if (fReduceOpMode == ReduceSumSquare )  return "ReduceSumSquare";
-      else if (fReduceOpMode == ReduceProd ) return "ReduceProd";
-      else if (fReduceOpMode == ReduceSum) return "ReduceSum";
-      return "Invalid";
-   }
-
-   ROperator_Reduce(){}
-   ROperator_Reduce(int keepdims, std::vector<int64_t> attrAxes, std::string nameX, std::string nameAxes, std::string nameY):
-   fkeepdims(keepdims), fAttrAxes(attrAxes), fNX(UTILITY::Clean_name(nameX)), fNAxes(UTILITY::Clean_name(nameAxes)), fNY(UTILITY::Clean_name(nameY)) {
-      fReduceOpMode = Op;
-      
-      fInputTensorNames = { fNX };
-      if(!fNAxes.empty()){
-         fInputTensorNames.emplace_back(fNAxes);
-      }
-
-      fOutputTensorNames = { fNY };
-   }
-
-   // type of output given input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   // shape of output tensors given input tensors
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      auto & outputShape = ret[0];
-      for (size_t j = 0; j < fAttrAxes.size(); j++) {
-         if (fAttrAxes[j] < 0) fAttrAxes[j] += outputShape.size();
-         if (fAttrAxes[j] < 0 || (size_t) fAttrAxes[j] >= outputShape.size() )
-            throw std::runtime_error("TMVA SOFIE Reduce Op - invalid axes values " + std::to_string(fAttrAxes[j]));
-         // set to 1 the reduced dims
-         outputShape[fAttrAxes[j]] = 1;
-      }
-      fShapeYNotPruned = outputShape;
-      // in case of pruning dimension we need to sort axes attributes
-      if (fkeepdims == 0) {
-         auto ax = fAttrAxes;
-         std::sort(ax.begin(), ax.end());
-         for (size_t j = 0; j < ax.size(); j++) {
-            // erase reduced dimensions, but keep last one
-            if (outputShape.size() > 1) {
-               outputShape.erase(outputShape.begin() + ax[j]);
-               for (size_t k = j+1; k < ax.size(); k++)
-                  ax[k] -= 1;  // decrease by one since we have removed a value
-            }
-         }
-      }
-      return ret;
-   }
-   void Initialize(RModel& model) override {
-
-      fUseSession = model.UseSession();
-
-      if (!model.CheckIfTensorAlreadyExist(fNX)) {
-         // input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Reduce Op Input Tensor " + fNX + " is not found in model");
-      }
-      fShapeX = model.GetTensorShape(fNX);
-      // check if tensor with axes is provided
-      if (!fNAxes.empty()) {
-         auto ax_shptr = model.GetInitializedTensorData(fNAxes);
-         auto ax_ptr = static_cast<int64_t *>(ax_shptr.get());
-         auto ax_shape = model.GetTensorShape(fNAxes);
-         size_t ax_length = ConvertShapeToLength(ax_shape);
-         fAttrAxes = std::vector<int64_t>(ax_ptr, ax_ptr+ax_length);
-      } else if (fAttrAxes.empty()) {
-         // in case no axes is passed assume full reduction
-         fAttrAxes.resize(fShapeX.size());
-         for (size_t i = 0; i < fAttrAxes.size(); i++)
-            fAttrAxes[i] = i;
-      }
-      // find shape of Y and add it in the list of intermediate tensors
-      fShapeY = ShapeInference({fShapeX})[0];
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-      if (model.Verbose()){
-         std::cout << Name() << " : " << fNX << " -> " << fNY << " shape " << ConvertShapeToString(fShapeY) << std::endl;
-      }
-      model.AddNeededStdLib("algorithm");
-   }
-
-   std::string Generate(std::string opName) override {
-      opName = "op_" + opName;
-      if (fShapeX.empty() || fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Reduce Op called to Generate without being initialized first");
-      }
-
-      size_t inputLength = SOFIE::ConvertShapeToLength(fShapeX);
-      size_t outputLength = SOFIE::ConvertShapeToLength(fShapeY);
-
-      auto inputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeX);
-      // output stride (or not pruned vector)
-      auto outputStrides = SOFIE::UTILITY::ComputeStrideFromShape(fShapeYNotPruned);
-
-      // write here according to size of shape
-      // in generation code can be done automatically
-      // i0 =  i / stride0  % shape0; i1 = i / stride1 % shape1 and so on
-      // and we have for the inverse
-      // i = i0 * s0 + i1 * s1 + i2 * s2 + i3 * s3 ....
-
-      // don't need to divide by last stride s[n-1] since it is 1 by definition
-
-      std::stringstream out;
-      out << "\n//----  operator " << Name() << "  " << opName << "\n";
-      // check where is reduced axes are first or last one. In these case we can do a faster implementation
-      enum EReduceDim {kFirst, kLast, kMiddle};
-      EReduceDim reduceDims = kLast;
-      int kmin = fShapeX.size()-fAttrAxes.size();
-      for (int k = fShapeX.size()-1; k >= kmin; k--) {
-         // if k is not a reduced axis is not last ones
-         if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
-            reduceDims = kMiddle;
-            break;
-         }
-      }
-      if (reduceDims == kMiddle) {
-         reduceDims = kFirst;
-         // check if at the beginning
-         for (size_t k = 0; k < fAttrAxes.size(); k++) {
-            // if k is not a reduced axis is not first ones
-            if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
-               reduceDims = kMiddle;
-               break;
-            }
-         }
-      }
-      size_t reducedLength = inputLength / outputLength;
-      if (reduceDims == kLast) {
-         //std::cout << "reduction for operator " << opName << " is last" << std::endl;
-         // new faster implementation using a single loop
-         // faster to loop first on reduced dimension and then output
-         // reset output tensors
-
-         // loop on output dimensions
-         out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n";
-         // loop on reduce dimensions
-         std::string startingValue = (fReduceOpMode == ReduceProd) ? "1" : "0";
-         out << SP << SP << "tensor_" << fNY << "[i] = " << startingValue << ";\n";
-         out << SP << SP << "for (size_t j = 0; j < " << reducedLength << "; j++) {\n";
-
-         if (fReduceOpMode == ReduceProd)
-            out << SP << SP << SP <<  "tensor_" << fNY << "[i] *= tensor_" << fNX << "[i * " << reducedLength << " + j];\n";
-         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
-            out << SP << SP << SP <<  "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j];\n";
-         else if(fReduceOpMode == ReduceSumSquare)
-            out << SP << SP << SP <<  "tensor_" << fNY << "[i] += tensor_" << fNX << "[i * " << reducedLength << " + j] * tensor_"
-                                    << fNX << "[i * " << reducedLength << " + j];\n";
-         out << SP << SP << "}\n"; // end j loop
-         if(fReduceOpMode == ReduceMean)
-            out << SP << SP << "tensor_" << fNY << "[i] /= static_cast<float>(" << reducedLength << ");\n";
-
-         out << SP << "}\n"; // end i loop
-      } else if (reduceDims == kFirst) {
-         //std::cout << "reduction for operator " << opName << " is first" << std::endl;
-         // case reduction is at beginning
-         // reset output tensors
-         if (fReduceOpMode == ReduceProd)
-            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n";
-         else
-            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 0);\n";
-
-         out << SP << "for (size_t i = 0; i < " << reducedLength << "; i++) {\n";
-         out << SP << SP << "for (size_t j = 0; j < " << outputLength << "; j++) {\n";
-
-         if (fReduceOpMode == ReduceProd)
-            out << SP << SP << SP << "tensor_" << fNY << "[j] *= tensor_" << fNX << "[i * " << outputLength << " + j];\n";
-         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
-            out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j];\n";
-         else if(fReduceOpMode == ReduceSumSquare)
-            out << SP << SP << SP << "tensor_" << fNY << "[j] += tensor_" << fNX << "[i * " << outputLength << " + j] * tensor_"
-                                    << fNX << "[i * " << outputLength << " + j];\n";
-         out << SP << SP << "}\n"; // end j loop
-         out << SP  << "}\n"; // end i loop
-         if(fReduceOpMode == ReduceMean) {
-            out << SP  << "for (size_t j = 0; i < " << outputLength << "; j++) {\n";
-            out << SP << SP << "tensor_" << fNY << "[j] /= static_cast<float>(" << reducedLength << ");\n";
-            out << SP << "}\n"; // end j loop
-         }
-      }
-      else
-      { // standard case
-         //std::cout << "reduction for operator " << opName << " is middle" << std::endl;
-         // reset output tensors
-         if (fReduceOpMode == ReduceProd)
-            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ", 1);\n";
-         else
-            out << SP << "std::fill(tensor_" << fNY <<", tensor_"<< fNY <<" + "<< outputLength << ",0);\n";
-
-         out << SP << "for (size_t i = 0; i < " << inputLength << "; i++) {\n";
-
-         size_t dim = fShapeX.size(); // this is the input dimension (e.g. 2, 3 or 4 or more)
-
-         // here we find output index
-         out << SP << SP << "size_t outputIndex = 0;\n";
-         for (size_t k = 0; k < dim; k++) {
-            if (std::find(fAttrAxes.begin(), fAttrAxes.end(), k) == fAttrAxes.end()) {
-               // do for not reducing axes
-               out << SP << SP << "size_t i_" << k << " = i / " << inputStrides[k] << " % " << fShapeX[k] << ";\n";
-               out << SP << SP << "outputIndex += i_" << k << " * " << outputStrides[k] << ";\n";
-            }
-         }
-         // now compute reduction
-         out << SP << SP << "// compute reduction....\n";
-         if (fReduceOpMode == ReduceProd)
-            out << SP << SP << "tensor_" << fNY << "[outputIndex] *= tensor_" << fNX << "[i];\n";
-         else if (fReduceOpMode == ReduceSum || fReduceOpMode == ReduceMean)
-            out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i];\n";
-         else if (fReduceOpMode == ReduceSumSquare) {
-            out << SP << SP << "tensor_" << fNY << "[outputIndex] += tensor_" << fNX << "[i] * tensor_" << fNX
-                << "[i];\n";
-         }
-         out << SP << "}\n"; // end loop on input elements
-         // normalize for reduced mean
-         if (fReduceOpMode == ReduceMean) {
-            out << SP << "for (size_t i = 0; i < " << outputLength << "; i++) {\n";
-            out << SP << SP << "tensor_" << fNY << "[i] /= static_cast<float>(" << reducedLength << ");\n";
-            out << SP << "}\n";
-         }
-      }
-
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_Reduce
-
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx
deleted file mode 100644
index 8062dca..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef SOFIE_ROPERATOR_RELU
-#define SOFIE_ROPERATOR_RELU
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-template <typename T>
-class ROperator_Relu final : public ROperator
-{
-
-private:
-
-   std::string fNX;
-   std::string fNY;
-   std::vector<Dim> fShape;
-
-public:
-   ROperator_Relu(){}
-   ROperator_Relu(std::string nameX, std::string nameY):
-      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNX };
-         fOutputTensorNames = { fNY };
-      }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model");
-      }
-
-      fShape = model.GetDynamicTensorShape(fNX);
-
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
-      if (model.Verbose()) {
-         std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl;
-      }
-   }
-
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-      if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      auto length = ConvertDynamicShapeToLength(fShape);
-      out << "\n//------ RELU\n";
-      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? tensor_" << fNX << "[id] : 0);\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_RELU
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx
deleted file mode 100644
index 66a7e09..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx
+++ /dev/null
@@ -1,252 +0,0 @@
-#ifndef SOFIE_ROPERATOR_RESHAPE
-#define SOFIE_ROPERATOR_RESHAPE
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <cassert>
-#include <sstream>
-
-namespace SOFIE{
-
-enum ReshapeOpMode { Reshape, Flatten, Squeeze, Unsqueeze };
-
-
-class ROperator_Reshape final : public ROperator
-{
-
-private:
-
-   bool fVerbose = false;
-   ReshapeOpMode fOpMode = Reshape;   // type of Reshape operator
-
-   int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape
-   int fAxis = 1;      // (for Flatten)
-
-   std::string fNData;        // input data tensor name
-   std::string fNShape;       // reshape tensor name
-   std::string fNOutput;               // output tensor name
-   std::vector<size_t> fShapeInput;     // input shape data
-   std::vector<size_t> fShapeOutput;   // output shape data
-   std::vector<int64_t> fAttrAxes;         // axes attributes (provided for all version of Squeeze/Unsqueeze)
-
-public:
-
-   std::string Name() const {
-      if (fOpMode == Reshape) return "Reshape";
-      if (fOpMode == Flatten) return "Flatten";
-      if (fOpMode == Squeeze) return "Squeeze";
-      if (fOpMode == Unsqueeze) return "Unsqueeze";
-      return "";
-   }
-
-   ROperator_Reshape(){}
-   ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput)
-      : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)),
-      fNOutput(UTILITY::Clean_name(nameOutput))
-   {
-      if (opMode == Reshape) fAllowZero = attr_value;
-      if (opMode == Flatten) fAxis = attr_value;
-
-      fInputTensorNames = { fNData };
-      if(!fNShape.empty()){
-         fInputTensorNames.emplace_back(fNShape);
-      }
-      fOutputTensorNames = { fNOutput };
-   }
-
-   // for squeeze/unsqueezed operators following old ONNX version (< 10)
-   // In this cases axes are passed as attribute values
-   ROperator_Reshape(ReshapeOpMode opMode, std::vector<int64_t> attrAxes, std::string nameData, std::string nameOutput)
-      : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)),
-        fAttrAxes(attrAxes)
-   {
-      assert(fOpMode == Squeeze || fOpMode == Unsqueeze);
-   }
-
-   // output type is same as input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      auto ret = std::vector<ETensorType>(1, input[0]);
-      return ret;
-   }
-
-   // output shape
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      std::vector<std::vector<size_t>> ret;
-      auto & input_shape = input[0];
-
-      if (fOpMode == Reshape) {
-         if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors");
-         auto output_shape = input[1]; // the provided shape
-         size_t input_length = ConvertShapeToLength(input_shape);
-         size_t output_length = ConvertShapeToLength(output_shape);
-         // (input_length == output_length) is the easy case : (2,3,4) -> (2,12)
-         if (input_length != output_length) {
-            if ((output_length == 0 && fAllowZero == 0) || static_cast<long>(output_length)  < 0) {
-               // in this case value 0 or -1 in shape are automatically corrected
-               bool replacementDone = false;
-               for (size_t i = 0; i < output_shape.size(); i++) {
-                  if (output_shape[i] == 0 || output_shape[i] == static_cast<size_t>(-1)) {
-                     if (replacementDone) {
-                        throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values");
-                     }
-                     auto tmp = output_shape;
-                     tmp.erase(tmp.begin() + i);
-                     auto tmp_length = ConvertShapeToLength(tmp);
-                     output_shape[i] = input_length / tmp_length;
-                     replacementDone = true;
-                  }
-               }
-               if (fVerbose)
-                  std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1])
-                        << " to " << ConvertShapeToString(output_shape) << std::endl;
-            }
-            if (ConvertShapeToLength(output_shape) != input_length) {
-               throw std::runtime_error("TMVA Reshape Op : Invalid  shapes : " + ConvertShapeToString(input_shape) +
-                                        ConvertShapeToString(output_shape));
-            }
-         }
-         ret.push_back(output_shape);
-
-      } else if (fOpMode == Flatten) {
-         // flattenig case
-         size_t inputSize = ConvertShapeToLength(input_shape);
-         size_t b = input[0][0];
-         std::vector<size_t> newShape = {b, inputSize / b};
-         ret.push_back(newShape);
-
-      } else if (fOpMode == Squeeze) {
-         // squeeze
-         // assume no axis is provided - remove all axes with value equal to 1
-         auto output_shape = input[0];
-         if (input.size() == 1) {
-            size_t i = 0;
-            while (i < output_shape.size()) {
-               if (output_shape[i] == 1 ) {
-                  output_shape.erase(output_shape.begin() + i);
-               }
-               else {
-                  i++;
-               }
-            }
-         } else if (input.size() == 2) {
-            auto & axes = input[1];
-            for (size_t i = 0; i < axes.size(); i++){
-               if (output_shape[axes[i]] != 1)
-                  throw std::runtime_error("TMVA Squeeze Op : Invalid  axes : " + ConvertShapeToString(axes) +
-                                           ConvertShapeToString(output_shape));
-               output_shape.erase(output_shape.begin() + axes[i]);
-            }
-         }
-         ret.push_back(output_shape);
-      }
-
-      else if (fOpMode == Unsqueeze) {
-         // unsqueeze
-         assert(input.size() == 2);
-         auto output_shape = input[0];
-         auto &axes = input[1];
-         // output rank
-         int64_t r = input[0].size() + axes.size();
-         for (auto & a : axes) {
-            int64_t i = static_cast<int64_t>(a);
-            if ( i < -r  || i > r - 1 )
-               throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range");
-            if (i >= 0)
-               output_shape.insert(output_shape.begin() + i, 1);
-            else
-               //negative axes
-               output_shape.insert(output_shape.end() + i + 1, 1);
-         }
-         ret.push_back(output_shape);
-      }
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-
-      fVerbose = model.Verbose();
-      if (model.CheckIfTensorAlreadyExist(fNData) == false) {
-          // input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + "  is not found in model");
-      }
-      fShapeInput = model.GetTensorShape(fNData);
-      // check if optional shape tensor exist
-      if (!fNShape.empty()) {
-         if (model.CheckIfTensorAlreadyExist(fNShape)) {
-            auto dptr = model.GetInitializedTensorData(fNShape);
-            auto input_shape = static_cast<int64_t *>(dptr.get());
-            auto vec = model.GetTensorShape(fNShape);
-            assert(vec.size() == 1);
-            size_t n = vec[0]; // size of shape input tensor
-
-            std::vector<size_t> descShape(n);
-            std::copy(input_shape, input_shape + n, descShape.begin());
-            fShapeOutput = ShapeInference({fShapeInput, descShape})[0];
-            // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed
-            model.SetNotWritableInitializedTensor(fNShape);
-         } else {
-            throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model");
-         }
-      } else if (!fAttrAxes.empty()) {
-         // case fNShape is empty and axes are provided as attributes
-         std::vector<size_t> descShape(fAttrAxes.size());
-         std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin());
-         fShapeOutput = ShapeInference({fShapeInput, descShape})[0];
-      } else if (fOpMode == Flatten || fOpMode == Squeeze) {
-         fShapeOutput = ShapeInference({fShapeInput})[0];
-      } else {
-         throw std::runtime_error("TMVA Reshape Op : Invalid Input/Attribute data");
-      }
-      // check if output is constant or not
-      if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) {
-         fIsOutputConstant = true;
-         auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNData).get());
-         if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput))
-            throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths");
-         model.AddConstantTensor<int64_t>(fNOutput, fShapeOutput, inputData);
-         if (model.Verbose()) {
-            std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " -->  " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput)  << " : " <<
-            ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl;
-         }
-      } else {
-         // non-constant case
-         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput);
-         if (model.Verbose())
-            std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " -->  "<< fNOutput << "  " << ConvertShapeToString(fShapeOutput)  << std::endl;
-      }
-   }
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) return "";  //no op for constant tensors
-
-      OpName = "op_" + OpName;
-
-      // output of reshape is same as input
-      size_t length = ConvertShapeToLength(fShapeOutput);
-      if (length != ConvertShapeToLength(fShapeInput)) {
-         throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " +
-                                  ConvertShapeToString(fShapeOutput) + " and input is " +
-                                  ConvertShapeToString(fShapeInput));
-      }
-      std::stringstream out;
-      std::string opName = "Reshape";
-      if (fOpMode == Flatten)
-         opName = "Flatten";
-      else if (fOpMode == Squeeze)
-         opName = "Squeeze";
-      else if (fOpMode == Unsqueeze)
-         opName = "Unsquueze";
-
-      out << SP << "///--------" << opName << " operator\n" << std::endl;
-      out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput
-          << ");\n";
-      return out.str();
-   }
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_RESHAPE
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx
deleted file mode 100644
index 6951017..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx
+++ /dev/null
@@ -1,176 +0,0 @@
-#ifndef SOFIE_ROperator_ScatterElements
-#define SOFIE_ROperator_ScatterElements
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-
-class ROperator_ScatterElements final : public ROperator{
-private:
-
-   int64_t fAxis;
-
-   std::string fNX;
-   std::string fNI;
-   std::string fNU;
-   std::string fNY;
-   std::string fReduction;
-
-   std::vector<size_t> fShapeX;
-   std::vector<size_t> fShapeI;
-   std::vector<size_t> fShapeY;
-
-   // define reduction function. Possibilities are:
-   // none (default), add, mul, max, min
-   std::string ReductionFunction(const std::string & t1, const std::string & t2 ) {
-      std::string name = fReduction;
-      if (name.empty() || name == "none")
-         return t2;
-      else if (name == "add")
-         return t1 + " + " + t2;
-      else if (name == "mul")
-         return t1 + " * " + t2;
-      else if (name == "max")
-         return "std::max(" + t1 + "," + t2 + ")";
-      else if (name == "min")
-         return "std::min(" + t1 + "," + t2 + ")";
-      else
-         throw std::runtime_error("TMVA SOFIE ScatterElements : invalid reduction attribute");
-
-      return std::string();
-   }
-
-public:
-   ROperator_ScatterElements(){}
-   ROperator_ScatterElements(const std::string & nameX, const std::string & nameI, const std::string & nameU, const std::string & nameY,
-                           int axis, std::string reduction):
-      fAxis(axis),
-      fNX(UTILITY::Clean_name(nameX)), fNI(UTILITY::Clean_name(nameI)), fNU(UTILITY::Clean_name(nameU)),
-      fNY(UTILITY::Clean_name(nameY)),
-      fReduction(reduction)
-      {
-         fInputTensorNames = { fNX, fNI, fNU };
-         fOutputTensorNames = { fNY };
-      }
-
-   // type of output given input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   // shape of output tensors given input tensors
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = std::vector<std::vector<size_t>>(1, input[0]); // return vector size 1 with first input
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      // input must be a graph input, or already initialized intermediate tensor
-      if (!model.CheckIfTensorAlreadyExist(fNX)){
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNX + "is not found in model");
-      }
-      if (!model.CheckIfTensorAlreadyExist(fNI)) {
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNI + "is not found in model");
-      }
-      if (!model.CheckIfTensorAlreadyExist(fNU)) {
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements Op Input Tensor ") + fNU + "is not found in model");
-      }
-      //tbd check for constant tensors
-
-      fShapeX = model.GetTensorShape(fNX);
-      fShapeI = model.GetTensorShape(fNI);
-      if (model.GetTensorShape(fNU) != fShapeI)
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - update tensor has invalid shape ")) ;
-      if (fShapeX.size() == 0)
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - input tensor has zero rank  ")) ;
-      if (fShapeX.size() != fShapeI.size())
-         throw std::runtime_error(std::string("TMVA SOFIE ScatterElements - index tensor has invalid rank  ")) ;
-
-      if (fAxis < 0) fAxis += fShapeX.size();
-
-      // assume output shape is identical to input shape
-      fShapeY = fShapeX;
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
-   }
-
-   std::string GenerateInitCode() override {
-      std::stringstream out;
-      return out.str();
-   }
-
-   std::string Generate(std::string opName) override {
-
-      if (fIsOutputConstant) return "";
-
-      if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      out << SP << "\n//-------- ScatterElements  --- " << opName << "\n";
-
-      auto strideY = UTILITY::ComputeStrideFromShape(fShapeY);
-      auto strideI = UTILITY::ComputeStrideFromShape(fShapeI);
-
-      size_t length = ConvertShapeToLength(fShapeY);
-
-      // function to write compute expression for global index from axes indices
-      auto tensorIndex = [](const std::vector<size_t> & stride, const std::vector<std::string> & idx) {
-         std::stringstream strst;
-         int dims = idx.size();
-         assert (dims == (int) stride.size());
-         for (int i = 0; i < dims; i++) {
-            if (stride[i] != 1)
-               strst << stride[i] << "*" << idx[i];
-            else
-               strst << idx[i];
-            if (i < dims-1)
-               strst << " + ";
-         }
-         return strst.str();
-      };
-
-
-      // copy first input in output (maybe can be avoided??)
-      out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n";
-
-      // loop on tensor rank
-      int dims = fShapeY.size();
-      std::vector<std::string> idx(dims);
-      for (int i = 0; i < dims; i++) {
-         idx[i] = std::string("i") + std::to_string(i);
-         for (int j = 0; j <= i; j++) out << SP;
-         out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n";
-      }
-      // correct index for specific axis
-      for (int j = 0; j <= dims; j++) out << SP;
-      out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n";
-      for (int j = 0; j <= dims; j++) out << SP;
-      out << "int iAxis = tensor_" << fNI << "[updateIndex];\n";
-      for (int j = 0; j <= dims; j++) out << SP;
-      out << "if (iAxis < 0) iAxis += " << fShapeY[fAxis] << ";\n";
-      idx[fAxis] = "iAxis";
-      for (int j = 0; j <= dims; j++) out << SP;
-      out << "int  outIndex = " << tensorIndex(strideY, idx) << ";\n";
-      for (int j = 0; j <= dims; j++) out << SP;
-      out << "tensor_" << fNY << "[outIndex] = "
-         << ReductionFunction(std::string("tensor_") + fNY + "[outIndex]", std::string("tensor_") + fNU + "[updateIndex]") << ";\n";
-
-      for (int i = dims; i > 0; i--) {
-         for (int j = 0; j < i; j++) out << SP;
-         out << "}\n";
-      }
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROperator_ScatterElements
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx
deleted file mode 100644
index 68edd01..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Sigmoid
-#define SOFIE_ROPERATOR_Sigmoid
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-namespace SOFIE{
-
-template <typename T>
-class ROperator_Sigmoid final : public ROperator
-{
-
-private:
-
-   std::string fNX;
-   std::string fNY;
-   std::vector<size_t> fShape;
-
-public:
-   ROperator_Sigmoid(){}
-   ROperator_Sigmoid(std::string nameX, std::string nameY):
-      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNX };
-         fOutputTensorNames = { fNY };
-      }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNX) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Sigmoid Op Input Tensor is not found in model");
-      }
-      fShape = model.GetTensorShape(fNX);
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
-   }
-
-
-   std::string Generate(std::string opName) override {
-      if (fShape.empty()){
-         throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      int length = 1;
-      for(auto& i: fShape){
-         length *= i;
-      }
-      out << "\n//------ Sigmoid -- " << opName << "\n";
-      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP  << "tensor_" << fNY << "[id] = 1 / (1 + std::exp( - tensor_"  << fNX << "[id]));\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-   std::vector<std::string> GetStdLibs() override { return { std::string("cmath") };}
-};
-
-}//SOFIE
-
-#endif //SOFIE_ROPERATOR_Sigmoid
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx
deleted file mode 100644
index 6d40003..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Slice.hxx
+++ /dev/null
@@ -1,263 +0,0 @@
-#ifndef SOFIE_ROPERATOR_SLICE
-#define SOFIE_ROPERATOR_SLICE
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <cassert>
-#include <sstream>
-#include <numeric>
-
-
-namespace SOFIE{
-
-// slice operator
-
-template <typename IType>
-class ROperator_Slice final : public ROperator
-{
-
-private:
-
-   std::string fNData;        // input data tensor name
-   std::string fNOutput;      // output data name
-   std::vector<std::string> fNames;       // tensor names for meta(axis) information
-   std::vector<size_t> fShapeInput;     // input shape data
-   std::vector<size_t> fShapeOutput;   // output shape data
-   // saved Start/End.Steps are corrected from initial ONNX for negative/default values
-   // and are available for each axis
-   std::vector<IType> fStart;         // starting values of slices
-   std::vector<IType> fEnd;           // End values of slices
-   std::vector<IType> fSteps;         // step values of slices
-
-   std::vector<std::vector<IType>> fAttributes; // attributes for the version <=10 case
-
-
-public:
-
-   ROperator_Slice(){}
-
-   // ctor for versions >= 10
-   ROperator_Slice(std::string nameData, std::vector<std::string> names, std::string nameOutput)
-      : fNData(UTILITY::Clean_name(nameData)),
-      fNOutput(UTILITY::Clean_name(nameOutput))
-   {
-    fNames.resize(4);
-    // axes and steps can be optional
-    for (size_t i = 0; i < names.size(); ++i) {
-        fNames[i] = UTILITY::Clean_name(names[i]);
-    }
-
-    fInputTensorNames = { fNData };
-    fOutputTensorNames = { fNOutput };
-   }
-   // ctor for versions < 10
-   ROperator_Slice(std::string nameData, std::vector<IType> starts, std::vector<IType> ends, std::vector<IType> axes, std::string nameOutput)
-      : fNData(UTILITY::Clean_name(nameData)),
-      fNOutput(UTILITY::Clean_name(nameOutput))
-   {
-     fAttributes.push_back(starts);
-     fAttributes.push_back(ends);
-     fAttributes.push_back(axes);
-    }
-
-   // output type is same as input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      auto ret = std::vector<ETensorType>(1, input[0]);
-      return ret;
-   }
-
-   // output shape
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto & input_shape = input[0];
-       // assume dimension of output shape is SAME AS INPUT !
-      std::vector<std::vector<size_t>> ret(1, input_shape);
-      auto & output_shape = ret[0];
-      for (size_t i = 0; i < input_shape.size(); i++) {
-          output_shape[i] = (fEnd[i]-fStart[i])/ fSteps[i];
-      }
-      return ret;
-   }
-
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNData) == false){   //input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA Slice Op Input Tensor is not found in model");
-      }
-
-      std::vector<std::vector<size_t>> shapes;
-      fShapeInput = model.GetTensorShape(fNData);
-      shapes.push_back(fShapeInput);
-
-      std::vector<std::vector<IType>> itensors(4);
-      if (fNames.size() > 0) {
-         // loop on the extra 2 or 3 or 4 inputs
-         for (size_t i = 0; i < fNames.size(); ++i) {
-            if (!fNames[i].empty()) {
-               // std::cout << " i " << i << " getting data for tensor " << fNames[i] << std::endl;
-               auto dptr = model.GetInitializedTensorData(fNames[i]);
-               auto tensor = static_cast<IType *>(dptr.get());
-               auto vec = model.GetTensorShape(fNames[i]);
-               assert(vec.size() == 1);
-               itensors[i] = std::vector<IType>(tensor, tensor + vec[0]);
-            } else {
-               switch (i) {
-               case 2: // missing axes
-                  itensors[2] = std::vector<IType>(fShapeInput.size());
-                  std::iota(itensors[2].begin(), itensors[2].end(), 0);
-                  break;
-               case 3: // missing steps
-                  itensors[3] = std::vector<IType>(itensors[0].size(), 1);
-               default: break;
-               }
-            }
-         }
-      } else {
-         assert(fAttributes.size() > 1);
-         for (size_t i = 0; i < fAttributes.size(); i++) {
-            itensors[i] = fAttributes[i];
-         }
-      }
-      size_t dim = fShapeInput.size();
-
-      fSteps = std::vector<IType>(dim, 1);
-      fStart = std::vector<IType>(dim, 0);
-      fEnd = std::vector<IType>(dim, 0);
-      std::copy(fShapeInput.begin(), fShapeInput.end(), fEnd.begin());
-
-      auto istart = itensors[0];
-      auto iend = itensors[1];
-      auto iaxes = itensors[2];
-      auto isteps  = itensors[3];
-
-      // make tensor axis
-      // if iaxes.size is =0 tensor axis is missing and use defaults
-      if (iaxes.size() > 0) {
-         for (size_t i = 0; i < iaxes.size(); i++) {
-            // negative axes - they count from the back
-            if (iaxes[i] < 0) iaxes[i] = dim + iaxes[i];
-            if (iaxes[i] < 0 || iaxes[i] >= static_cast<IType>(dim))
-               throw std::runtime_error("TMVA Slice Op : invalid axis value " + std::to_string(iaxes[i]) +
-                  " for  " + std::to_string(i));
-
-            size_t iAxisDim = fShapeInput[iaxes[i]];
-            // find start/end/step for given axis
-            // check step size for clamping starting/end value
-            if (istart[i] < 0) istart[i] = iAxisDim + istart[i];
-            if (iend[i] < 0) iend[i] = iAxisDim + iend[i];
-            if (istart[i] < 0) istart[i] = 0;
-            if (isteps[i] > 0) {
-               if (istart[i] > static_cast<IType>(iAxisDim)) istart[i] = static_cast<IType>(iAxisDim);
-               if (iend[i] < 0) iend[i] = 0;
-               if (iend[i] > static_cast<IType>(iAxisDim)) iend[i] = static_cast<IType>(iAxisDim);
-            } else if (isteps[i] < 0) {
-               if (istart[i] > static_cast<IType>(iAxisDim)-1) istart[i] = static_cast<IType>(iAxisDim) -1;
-               if (iend[i] < -1) iend[i] = -1;
-               if (iend[i] > static_cast<IType>(iAxisDim)-1) iend[i] = static_cast<IType>(iAxisDim) -1;
-            } else {
-               throw std::runtime_error("TMVA Slice Op : invalid step value " + std::to_string(isteps[i]) +
-                  " for  " + std::to_string(i));
-            }
-            fStart[iaxes[i]] = istart[i];
-            fEnd[iaxes[i]] = iend[i];
-            fSteps[iaxes[i]] = isteps[i];
-         }
-      }
-
-      fShapeOutput = ShapeInference({fShapeInput})[0];
-      // case input is a constant tensor and of int64 type
-      if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) {
-         fIsOutputConstant = true;
-         auto inputData = static_cast<int64_t*>(model.GetInitializedTensorData(fNData).get());
-         size_t outputSize = ConvertShapeToLength(fShapeOutput);
-         std::vector<int64_t> outputData(outputSize);
-         std::vector<size_t> inputStride = UTILITY::ComputeStrideFromShape(fShapeInput);
-         // perform slice using a recursive function- need to use two lambda functions for this
-         auto sliceRecursive = [&](size_t iaxis, size_t & outIdx, size_t & inOffset) {
-            auto slice_impl = [&](size_t iax, size_t & outputIdx, size_t & inputOffset, auto & sliceRecImpl) {
-               // compute indices
-               std::vector<IType> indices;
-               for (IType i = fStart[iax]; (fSteps[iax] > 0) ? i < fEnd[iax] : i > fEnd[iax]; i += fSteps[iax] )
-                  indices.push_back(i);
-               if (iax == dim-1) { // last axis
-                  for (size_t i = 0; i < indices.size(); i++) {
-                     outputData[outputIdx] = inputData[inputOffset + indices[i]];
-                     outputIdx++;
-                  }
-                  return;
-               } else {
-                  for (size_t i = 0; i < indices.size(); i++) {
-                     size_t offset = inputOffset + inputStride[iax]*indices[i];
-                     sliceRecImpl(iax+1, outputIdx, offset,sliceRecImpl);
-                  }
-               }
-            };
-            slice_impl(iaxis, outIdx, inOffset,slice_impl);
-         };
-         size_t idx = 0;
-         size_t offset = 0;
-         sliceRecursive(0, idx, offset);
-
-         model.AddConstantTensor<int64_t>(fNOutput, fShapeOutput, outputData.data());
-         if (model.Verbose()) {
-            std::cout << "Slice: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : "
-                     << ConvertValuesToString(outputData) << std::endl;
-         }
-      }
-      else {
-         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput);
-         if (model.Verbose()) {
-            std::cout << "Slice ---> " << fNOutput << " " <<  ConvertShapeToString(fShapeOutput) << std::endl;
-         }
-      }
-   }
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) return "";  //no op for constant tensors
-
-      OpName = "op_" + OpName;
-      if (fShapeInput.empty() || fShapeOutput.empty()){
-         throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first");
-      }
-
-      std::stringstream out;
-      //std::string opName = "Slice";
-
-      out << SP << "///------- Slice operator\n" << std::endl;
-      // loop on the dimensions depending no the orders
-      size_t ndim = fShapeInput.size();
-      std::vector<size_t> strides(ndim,1);
-      for (int i = int(ndim-2); i >=0 ; i--) {
-          strides[i] = strides[i+1]*fShapeInput[i+1];
-      }
-
-      out << SP << "{\n"; // define operator scope
-      out << SP << "size_t iOut = 0;\n";
-      std::string MSP = SP;
-      for (size_t idim = 0; idim < ndim; idim++) {
-        out << MSP << "for (size_t i" << idim << " = " << fStart[idim] <<  "; i" << idim << " < " << fEnd[idim]
-            << "; i" << idim << "+= " << fSteps[idim] << ") {\n";
-        MSP += SP;
-        if (idim < ndim-1) out << MSP << "size_t stride" << idim << " = " << strides[idim] << "*i" << idim << ";\n";
-      }
-      out << MSP << "size_t iInput = ";
-      for (size_t idim = 0; idim < ndim-1; idim++) out << " stride" << idim << " + ";
-      // here should be step size ?
-      out << "i" << ndim-1 << ";\n";
-      out << MSP << "tensor_" << fNOutput << "[iOut++] = tensor_" <<fNData << "[iInput];\n";
-      for (size_t idim = 0; idim < ndim; idim++) {
-          MSP = MSP.replace(0,SP.length(),"");
-          out << MSP << "}\n";
-      }
-      out << SP << "}\n"; // end operator scope
-
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_SLICE
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Softmax.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Softmax.hxx
deleted file mode 100644
index 8a78d84..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Softmax.hxx
+++ /dev/null
@@ -1,189 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Softmax
-#define SOFIE_ROPERATOR_Softmax
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-namespace SOFIE {
-
-template <typename T>
-class ROperator_Softmax final : public ROperator {
-
-private:
-   int64_t fAttrAxis;
-
-   std::string fNX;
-   std::string fNY;
-   std::vector<size_t> fShape;
-
-   std::string fType;
-
-public:
-   ROperator_Softmax() {}
-   ROperator_Softmax(int64_t attr_axis, std::string nameX, std::string nameY)
-      : fAttrAxis(attr_axis), fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY))
-   {
-         fInputTensorNames = { fNX };
-         fOutputTensorNames = { fNY };
-   }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override { return input; }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; // suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNX) ==
-          false) { // input must be a graph input, or already initialized intermediate tensor
-         throw std::runtime_error("TMVA SOFIE Softmax Op Input Tensor is not found in model");
-      }
-      fShape = model.GetTensorShape(fNX);
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
-      fType = ConvertTypeToString(model.GetTensorType(fNX));
-      if (model.Verbose()) {
-         std::cout << "Softmax -> " << fNY << " " << ConvertShapeToString(fShape) << std::endl;
-      }
-   }
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-      if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Operator Softmax called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      size_t size = fShape.size();
-      size_t length = ConvertShapeToLength(fShape);
-      size_t axis = fAttrAxis < 0 ? size + fAttrAxis : fAttrAxis;
-      out << "\n" << SP << "//------ SOFTMAX - " << size << "  " << length << "  " << axis << "\n";
-      // use safe numerically implementation by subtracting max of tensor
-      if (size == 1) {
-         out << SP << fType << " vmax = tensor_" << fNX << "[0];\n";
-         out << SP << "for (size_t i = 1; i < " << length << " ; i++){\n";
-         out << SP << SP << "if (tensor_" << fNX << "[i] > vmax) vmax = tensor_" << fNX << "[i];\n";
-         out << SP << "}\n";
-         out << SP << fType << " sum = 0.0;\n";
-         out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n";
-         out << SP << SP << "tensor_" << fNY << "[i] = std::exp(tensor_" << fNX << "[i] - vmax);\n";
-         out << SP << SP << "sum += tensor_" << fNY << "[i];\n";
-         out << SP << "}\n";
-         out << SP << "for (size_t i = 0; i < " << length << " ; i++){\n";
-         out << SP << SP << "tensor_" << fNY << "[i] /= sum;\n";
-         out << SP << "}\n";
-      } else {
-         size_t batch = fShape[0];
-         size_t channel = fShape[1];
-         size_t width = (size > 2) ? fShape[size - 1] : 1;
-         size_t height = (size > 3) ? fShape[size - 2] : 1;
-         size_t depth = (size > 4) ? fShape[size - 3] : 1;
-         size_t hStride = width;
-         size_t dStride = height * width;
-         size_t cStride = depth * dStride;
-         size_t bStride = channel * cStride;
-
-         size_t N = 0; // Size of the axis
-         size_t iStride = 0;
-         if (axis == 0) {
-            N = batch;
-            iStride = bStride;
-         } else if (axis == 1) {
-            N = channel;
-            iStride = cStride;
-         } else if (axis == size - 1) {
-            N = width;
-            iStride = 1;
-         } else if (size > 3 && axis == size - 2) {
-            N = height;
-            iStride = hStride;
-         } else if (size == 5 && axis == size - 3) {
-            N = depth;
-            iStride = dStride;
-         } else {
-            throw
-               std::runtime_error("TMVA::SOFIE - Softmax operator along the axis "
-                  + std::to_string(fAttrAxis) + " with " + std::to_string(size)
-                  + "d input tensor not supported.");
-         }
-
-         bool notBatch = axis != 0;
-         bool notChannel = axis != 1;
-         bool notDepth = (size == 5 && axis != 2);
-         bool notHeight = (size == 5 && axis != 3) || (size == 4 && axis != 2);
-         bool notWidth = (size == 5 && axis != 4) || (size == 4 && axis != 3) || (size == 3 && axis != 2);
-
-         if (notBatch) {
-            out << SP << "for (size_t n = 0; n < " << batch << " ; n++){\n";
-         }
-         if (notChannel) {
-            out << SP << SP << "for (size_t c = 0; c < " << channel << " ; c++){\n";
-         }
-         if (notDepth) {
-            out << SP << SP << "for (size_t d = 0; d < " << depth << " ; d++){\n";
-         }
-         if (notHeight) {
-            out << SP << SP << "for (size_t h = 0; h < " << height << " ; h++){\n";
-         }
-         if (notWidth) {
-            out << SP << SP << "for (size_t w = 0; w < " << width << " ; w++){\n";
-         }
-         out << SP << SP << SP << fType << " sum = 0.;\n";
-         out << SP << SP << SP << "size_t index = 0";
-         if (notBatch) {
-            out << " + n * " << bStride;
-         }
-         if (notChannel) {
-            out << "+ c * " << cStride;
-         }
-         if (notDepth) {
-            out << " + d * " << dStride;
-         }
-         if (notHeight) {
-            out << " + h * " << hStride;
-         }
-         if (notWidth) {
-            out << " + w";
-         }
-         out << ";\n";
-         // apply softmax along the axis - find first maximum value for numerical stability
-         if (N == 0)
-            throw std::runtime_error("TMVA::SOFIE - Softmax operator is along axis with zero elements");
-         out << SP << SP << SP << fType << " vmax = tensor_" << fNX << "[index];\n";
-         out << SP << SP << SP << "for (size_t i = 1; i < " << N << "; i++) {\n";
-         out << SP << SP << SP << SP << "if (tensor_" << fNX << "[index + i*" << iStride << "] > vmax)\n";
-         out << SP << SP << SP << SP << SP << "vmax = tensor_" << fNX << "[index + i*" << iStride << "];\n";
-         out << SP << SP << SP << "}\n";
-         out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n";
-         out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] = std::exp(tensor_" << fNX
-             << "[index + i*" << iStride << "] - vmax);\n";
-         out << SP << SP << SP << SP << "sum += tensor_" << fNY << "[index + i*" << iStride << "];\n";
-         out << SP << SP << SP << "}\n";
-         out << SP << SP << SP << "for (size_t i = 0; i < " << N << "; i++) {\n";
-         out << SP << SP << SP << SP << "tensor_" << fNY << "[index + i*" << iStride << "] /= sum;\n";
-         out << SP << SP << SP << "}\n";
-         if (notWidth) {
-            out << SP << SP << "}\n"; // end w
-         }
-         if (notHeight) {
-            out << SP << SP << "}\n"; // end h
-         }
-         if (notDepth) {
-            out << SP << SP << "}\n"; // end d
-         }
-         if (notChannel) {
-            out << SP << SP << "}\n"; // end c
-         }
-         if (notBatch) {
-            out << SP << "}\n"; // end n
-         }
-      }
-      return out.str();
-   }
-};
-
-} // namespace SOFIE
-
-#endif // SOFIE_ROPERATOR_Softmax
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx
deleted file mode 100644
index 37c92ee..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Tanh.hxx
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Tanh
-#define SOFIE_ROPERATOR_Tanh
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-template <typename T>
-class ROperator_Tanh final : public ROperator
-{
-
-private:
-
-   std::string fNX;
-   std::string fNY;
-   std::vector<size_t> fShape;
-
-public:
-   ROperator_Tanh(){}
-   ROperator_Tanh(std::string nameX, std::string nameY):
-      fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNX };
-         fOutputTensorNames = { fNY };
-      }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      auto ret = input; //suggest copy to compiler
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-       //input must be a graph input, or already initialized intermediate tensor
-      if (model.CheckIfTensorAlreadyExist(fNX) == false){
-        throw std::runtime_error("TMVA SOFIE Tanh Op Input Tensor is not found in model");
-      }
-      fShape = model.GetTensorShape(fNX);
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape);
-
-   }
-
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-      if (fShape.empty()) {
-         throw std::runtime_error("TMVA SOFIE Tanh operator called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      size_t length = ConvertShapeToLength(fShape);
-      out << "\n//------ TANH\n";
-      out << SP << "for (int id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNY << "[id] = std::tanh(tensor_" << fNX << "[id]);\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-   std::vector<std::string> GetStdLibs() override { return { std::string("cmath") };}
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_Tanh
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx
deleted file mode 100644
index 354fbe3..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef SOFIE_ROPERATOR_Tile
-#define SOFIE_ROPERATOR_Tile
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-template <typename T>
-class ROperator_Tile final : public ROperator
-{
-
-private:
-
-   std::string fNRepeats;
-   std::string fNInput;
-   std::string fNY;
-   std::vector<size_t>fShapeInput;
-   std::vector<size_t> fShapeY;
-
-public:
-   ROperator_Tile(){}
-   ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY):
-      fNRepeats(UTILITY::Clean_name(nameRepeat)),fNInput(UTILITY::Clean_name(nameInput)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNRepeats, fNInput };
-         fOutputTensorNames = { fNY };
-      }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      std::vector<size_t> ret = input[0];
-
-      for(size_t i=0; i < input[1].size(); i++) {
-            ret[i]=ret[i]*input[1][i];
-      }
-      return {ret};
-   }
-
-   void Initialize(RModel& model) override {
-       //input must be a graph input, or already initialized intermediate tensor
-      if (model.CheckIfTensorAlreadyExist(fNInput) == false){
-        throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model");
-      }
-      if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){
-        throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model");
-      }
-      fShapeInput=model.GetTensorShape(fNInput);
-
-      // if repeats vector is not initialized we cannot deduce shape of output
-      // not support for time being this case
-      if (!model.IsInitializedTensor(fNRepeats)) {
-         throw std::runtime_error("TMVA SOFIE Tile Op: non-initialized repeats input is not supported");
-      }
-
-      // Retrieve the data pointer for the repeats tensor
-      auto repptr = model.GetInitializedTensorData(fNRepeats);
-      // Cast the raw pointer to the appropriate type (size_t*)
-      auto repeats_data = static_cast<int64_t*>(repptr.get());
-      if (repeats_data == nullptr) {
-        throw std::runtime_error("Failed to retrieve the data for the repeats tensor.");
-      }
-      // Get the shape of the repeats tensor to determine the number of elements
-      auto repeats_shape = model.GetTensorShape(fNRepeats);
-      // Ensure the repeats tensor is 1D and get the number of elements
-      if (repeats_shape.size() != 1) {
-         throw std::runtime_error("Repeats tensor is not 1D.");
-      }
-      size_t num_elements = repeats_shape[0];
-      // Convert the data to a vector of size_t
-      std::vector<size_t> repeats_vector(num_elements);
-      std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin());
-
-
-      fShapeY = ShapeInference({fShapeInput,repeats_vector})[0];
-
-      model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY);
-
-      if (model.Verbose())
-         std::cout <<  "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY)
-            << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl;
-   }
-
-   std::string Generate(std::string OpName) override {
-      OpName = "op_" + OpName;
-      if (fShapeInput.empty() || fShapeY.empty()) {
-            throw std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first");
-      }
-
-      //size_t input_length = ConvertShapeToLength(fShapeInput);
-      //size_t output_length = ConvertShapeToLength(fShapeY);
-
-
-      std::stringstream out;
-      std::string input = "tensor_" + fNInput;
-      std::string output = "tensor_" + fNY;
-      out << "///-------- Tile operator\n";
-      out << "{\n"; // add scope to re-use same names
-      out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n";
-
-      out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n";
-      out << "int s = 1;\n";
-      // loop from inverse dim order
-      out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n";
-      out << SP << "int r = tensor_" << fNRepeats << "[i];\n";
-      // we cannot exclude case where repeats=1 since we need offset
-      //out << SP << "if (r == 1 && i < " << fShapeInput.size()-1 <<  ") continue;\n";
-      out << SP << "int i_offset = 0, o_offset = 0;\n";
-      out << SP << "s = s * input_shape[i];\n";
-      // case we have first copy
-      out << SP << "if (i == " << fShapeInput.size()-1 <<  ") {\n";
-      out << SP << SP <<  "for (int j = 0; j < inputLength/s ; j++) {\n";
-      out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n";
-      out << SP << SP << SP << SP << "std::copy(" << input << "+ i_offset, "
-                                    << input << "+ i_offset + s, " << output << "+ o_offset);\n";
-      out << SP << SP << SP << SP << "o_offset += s;\n";
-      out << SP << SP << SP << "}\n"; // end k loop
-      out << SP << SP << SP << "i_offset += s;\n";
-      out << SP << SP << "}\n"; // end j loop
-      out << SP << "} else {\n";  // second copy we do from output to output
-      // and we need to loop on j from reverse order to avoir re-writing in output tensor
-      out << SP << SP << "for (int j = inputLength/s - 1 ; j>=0; j--) {\n";
-      out << SP << SP << SP << "o_offset = j*s*r;\n";
-      out << SP << SP << SP << "i_offset = j*s;\n";
-      out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n";
-      out << SP << SP << SP << SP << "std::copy(" << output << "+ i_offset, "
-                                    << output << "+ i_offset + s, " << output << "+ o_offset);\n";
-      out << SP << SP << SP << SP << "o_offset += s;\n";
-      out << SP << SP << SP << "}\n"; // end k loop
-      out << SP << SP << "}\n"; // end j loop
-      out << SP << "}\n"; // end if
-      out << SP << "s *= r;\n";
-      out << SP << "inputLength *= r;\n";
-      out << "}\n"; // end i loop
-      out << "}\n";  // end of scope
-      return out.str();
-   }
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_Tile
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx
deleted file mode 100644
index 11c40bb..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef SOFIE_ROPERATOR_TRANSPOSE
-#define SOFIE_ROPERATOR_TRANSPOSE
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-#include <cassert>
-
-
-namespace SOFIE{
-
-
-
-
-template <typename T>
-class ROperator_Transpose final : public ROperator
-{
-
-private:
-   std::vector<int_t> fAttrPerm;
-
-   std::string fNData;
-   std::string fNOutput;
-   std::vector<size_t> fShapeData;
-   std::vector<size_t> fShapeOutput;
-
-public:
-
-   ROperator_Transpose(){}
-   ROperator_Transpose(std::vector<int_t> attr_perm, std::string nameData, std::string nameOutput):
-      fAttrPerm(attr_perm), fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) {
-            fInputTensorNames = { fNData };
-            fOutputTensorNames = { fNOutput };
-   }
-
-   ROperator_Transpose(std::string nameData, std::string nameOutput):
-      fNData(UTILITY::Clean_name(nameData)), fNOutput(UTILITY::Clean_name(nameOutput)) {
-         fInputTensorNames = { fNData };
-         fOutputTensorNames = { fNOutput };
-   }
-
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      if (input.size() > 1) throw std::runtime_error("TMVA SOFIE Tranpose Op Shape Inference only need 1 input tensor");
-      auto& data = input[0];
-      if (fAttrPerm.size() != data.size() )
-         throw std::runtime_error("TMVA SOFIE Tranpose Op - Invalid axes attributes");
-
-      std::vector<size_t> output_shape(fAttrPerm.size());
-      for (size_t i = 0; i < fAttrPerm.size(); i++){
-         output_shape[i] = data[fAttrPerm[i]];
-      }
-      std::vector<std::vector<size_t>> ret;
-      ret.push_back(output_shape);
-      return ret;
-   }
-
-
-   void Initialize(RModel& model) override {
-      if (model.CheckIfTensorAlreadyExist(fNData) == false){   //input must be a graph input, or already initialized intermediate tensor
-         std::cout<<"Input tensor for transpose: "<<fNData<<'\n';
-         throw std::runtime_error("TMVA SOFIE Tranpose Op Input Tensor is not found in model");
-      }
-      fShapeData = model.GetTensorShape(fNData);
-      if (fAttrPerm.empty()){
-         fAttrPerm.reserve(fShapeData.size());
-         for (int i = fShapeData.size() - 1; i >= 0; i--){
-            fAttrPerm.push_back(i);
-         }
-      }
-      std::vector<std::vector<size_t>> inputs = { fShapeData };
-      fShapeOutput = ShapeInference(inputs).front();
-      if (model.IsInitializedTensor(fNData)) {
-         fIsOutputConstant = true;
-         // case input is a constant or initialized tensor we perform here the transpose
-         auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData);
-         auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput);
-         size_t length = ConvertShapeToLength(fShapeOutput);
-         auto inputData = static_cast<T*>(model.GetInitializedTensorData(fNData).get());
-         size_t dim = fShapeData.size();
-         std::vector<size_t> outputIdx(dim);
-         std::vector<T> outputData(length);
-         for (size_t i = 0; i < length; i++) {
-            outputIdx[0] = i / outStrides[0];
-            for (size_t j = 1; j < dim; j++) {
-               outputIdx[j] = (i % outStrides[j-1]) / outStrides[j];
-            }
-            // compute input index
-            size_t inputIndex = 0;
-            for (size_t j = 0; j < dim; j++) {
-               // find value in fAtrrPerm corresponding to j
-               int k = std::find(fAttrPerm.begin(), fAttrPerm.end(), j) - fAttrPerm.begin();
-               inputIndex += outputIdx[k] * inStrides[j];
-            }
-            outputData[i] = inputData[inputIndex];
-         }
-         model.AddConstantTensor<T>(fNOutput, fShapeOutput, outputData.data());
-         if (model.Verbose()) {
-            std::cout << "Transpose: output is a constant tensor " << ConvertShapeToString(fShapeOutput) << " : "
-               << ConvertValuesToString(outputData) << std::endl;
-         }
-      } else {
-         model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput);
-         if (model.Verbose()) {
-            std::cout << "Transpose ---> " << fNOutput << " " <<  ConvertShapeToString(fShapeOutput) << std::endl;
-         }
-      }
-   }
-
-   std::string Generate(std::string OpName) override {
-      if (fIsOutputConstant) return "";  //no op for constant tensors
-      OpName = "op_" + OpName;
-      if (fShapeData.empty() || fShapeOutput.empty()){
-         throw std::runtime_error("TMVA SOFIE Transpose Op called to Generate without being initialized first");
-      }
-      int dim = fShapeData.size();
-      auto inStrides = UTILITY::ComputeStrideFromShape(fShapeData);
-      auto outStrides = UTILITY::ComputeStrideFromShape(fShapeOutput);
-      size_t length = inStrides[0]*fShapeData[0];  // total tensor size
-      assert (length == outStrides[0]*fShapeOutput[0]);
-
-      std::stringstream out;
-      // Implement transpose operator using consecutive read inputs.
-      // But
-      // tensorOut[id] = tensorInput[ inStrides[0]*i0 + inStrides[1]*i1 + inStrides[2]*i2 + ...]
-      // now if (j0,j1,j2) are the output indices
-      // j0 =  id / outStrides[0]
-      // j1 =  (id % outStrides[0])/outStrides[1]
-      // j2 =  (id % outStrides[1])/outStrides[2]
-      //......
-      // and we have j_k = i_fAttrPerm[k]
-      // since we are using consecutive writes we should find the inverse of fAttrPerm
-      out << SP << "///------- Transpose operator\n" << std::endl;
-      out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNOutput << "[id] = tensor_" << fNData << "[ ";
-      // compute output j indices
-      std::vector<std::string> i_out(dim);
-      for (int k =0; k < dim; k++){
-         if (k == 0)
-            i_out[k] = "id";
-         else
-            i_out[k] = "(id % " + std::to_string(outStrides[k-1]) + ")";
-         if (k < dim-1)
-            i_out[k] += " / " + std::to_string(outStrides[k]);
-      }
-      // use now them for input tensors
-      // need to invert the fAttrPerm[k]
-      for (int k =0; k < dim; k++){
-         // find value in fAtrrPerm corresponding to k
-         int l = std::find(fAttrPerm.begin(), fAttrPerm.end(), k) - fAttrPerm.begin();
-         assert(l >= 0 && l < dim);
-         out << "( " << i_out[l] << " )";
-         if (k < dim-1) {
-            out << " * " << inStrides[k];
-            out << " + ";
-         }
-      }
-      out << "];\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROPERATOR_TRANSPOSE
diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx
deleted file mode 100644
index 28ac093..0000000
--- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx
+++ /dev/null
@@ -1,243 +0,0 @@
-#ifndef SOFIE_ROperator_Where
-#define SOFIE_ROperator_Where
-
-#include "SOFIE/SOFIE_common.hxx"
-#include "SOFIE/ROperator.hxx"
-#include "SOFIE/RModel.hxx"
-
-#include <sstream>
-
-
-namespace SOFIE{
-
-
-
-template<typename T>
-class ROperator_Where final : public ROperator{
-private:
-
-   bool fIsInputBoolTensor = false;
-
-
-   std::string fNA;
-   std::string fNB;
-   std::string fNC;
-   std::string fNBroadcastedA;
-   std::string fNBroadcastedB;
-   std::string fNBroadcastedC;
-   std::string fNY;
-
-
-   std::vector<size_t> fShapeA;
-   std::vector<size_t> fShapeB;
-   std::vector<size_t> fShapeC;
-   std::vector<size_t> fShapeY;
-
-
-public:
-   ROperator_Where(){}
-   ROperator_Where(const std::string & nameA, const std::string & nameB, const std::string & nameC, const std::string & nameY):
-      fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)){
-         fInputTensorNames = { fNA, fNB, fNC };
-         fOutputTensorNames = { fNY };
-      }
-
-   // type of output given input
-   std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
-      return input;
-   }
-
-   // shape of output tensors given input tensors
-   std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
-      // assume now inputs have same shape (no broadcasting)
-      auto ret = std::vector<std::vector<size_t>>(1, input[0]); // return vector size 1 with first input
-      return ret;
-   }
-
-   void Initialize(RModel& model) override {
-      // input must be a graph input, or already initialized intermediate tensor
-      if (!model.CheckIfTensorAlreadyExist(fNA)){
-         throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNA + "is not found in model");
-      }
-      if (!model.CheckIfTensorAlreadyExist(fNB)) {
-         throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNB + "is not found in model");
-      }
-      if (!model.CheckIfTensorAlreadyExist(fNC)) {
-         throw std::runtime_error(std::string("TMVA SOFIE Where Op Input Tensor ") + fNC + "is not found in model");
-      }
-      // check if fNC input tensor is boolean
-      if (model.IsReadyInputTensor(fNC))
-         fIsInputBoolTensor = true;
-      // check broadcast for A, B and C
-      fShapeA = model.GetTensorShape(fNA);
-      fShapeB = model.GetTensorShape(fNB);
-      fShapeC = model.GetTensorShape(fNC);
-      bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB) || !UTILITY::AreSameShape(fShapeA, fShapeC);
-      if (broadcast) {
-         // find shape to broadcast between A,B,C looking for max length
-         size_t lengthA = ConvertShapeToLength(fShapeA);
-         size_t lengthB = ConvertShapeToLength(fShapeB);
-         size_t lengthC = ConvertShapeToLength(fShapeC);
-         bool broadcastA = false, broadcastB = false, broadcastC = false;
-         if (lengthA >= lengthB && lengthA >= lengthC) {
-            fShapeY = fShapeA;
-            //broadcast B and C if different than A
-            broadcastB = (lengthB != lengthA);
-            broadcastC = (lengthC != lengthA);
-         }
-         else if (lengthB >= lengthA && lengthB >= lengthC) {
-            fShapeY = fShapeB;
-            //broadcast A and C if different than B
-            broadcastA = (lengthA != lengthB);
-            broadcastC = (lengthC != lengthB);
-         }
-         else if (lengthC >= lengthA && lengthC >= lengthB) {
-            fShapeY = fShapeC;
-            //broadcast A and B if different than C
-            broadcastA = (lengthA != lengthC);
-            broadcastB = (lengthB != lengthC);
-         }
-
-         // Broadcast A to Y
-         if (broadcastA) {
-            fNBroadcastedA = "BC_" + fNA + "_to_" + fNY;
-            if (model.IsInitializedTensor(fNA)) {
-               auto data = model.GetInitializedTensorData(fNA);
-               std::shared_ptr<void> broadcastedData(
-                  UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeA, fShapeY),
-                  std::default_delete<T[]>());
-               // Update the data and the shape of A
-               model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData);
-               fShapeA = fShapeY;
-            } else {
-               // Add an intermediate tensor for broadcasting A
-               model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY);
-            }
-         }
-         // Broadcast B to Y
-         if (broadcastB) {
-            fNBroadcastedB = "BC_" + fNB + "_to_" + fNY;
-            if (model.IsInitializedTensor(fNB)) {
-               auto data = model.GetInitializedTensorData(fNB);
-               std::shared_ptr<void> broadcastedData(
-                  UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeB, fShapeY),
-                  std::default_delete<T[]>());
-               // do not update tensor B but add broadcasted one (since it can be input to some other operators)
-               model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData);
-               fShapeB = fShapeY;
-            } else {
-               // Add an intermediate tensor for broadcasting B
-               model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY);
-            }
-         }
-         // Broadcast C to Y
-         if (broadcastC) {
-            fNBroadcastedC = "BC_" + fNC + "_to_" + fNY;
-            if (model.IsInitializedTensor(fNC)) {
-               auto data = model.GetInitializedTensorData(fNC);
-               std::shared_ptr<void> broadcastedData(
-                  UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeC, fShapeY),
-                  std::default_delete<T[]>());
-               // do not update tensor C but add broadcasted one (since it can be input to some other operators)
-               model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData);
-               fShapeC = fShapeY;
-            } else {
-               // Add an intermediate tensor for broadcasting B
-               model.AddIntermediateTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY);
-            }
-         }
-      } else {
-         fShapeY = fShapeA;
-      }
-      // check case of constant  output (if all inputs are defined)
-      if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) {
-         std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA;
-         std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB;
-         std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC;
-         auto dataA = static_cast<T *>(model.GetInitializedTensorData(nameA).get());
-         auto dataB = static_cast<T *>(model.GetInitializedTensorData(nameB).get());
-         auto dataC = static_cast<bool *>(model.GetInitializedTensorData(nameC).get());
-         std::vector<T> dataY(ConvertShapeToLength(fShapeY));
-         for (size_t i = 0; i < dataY.size(); i++)
-             dataY[i] = (dataC[i]) ? dataA[i] : dataB[i];
-         model.AddConstantTensor<T>(fNY, fShapeY, dataY.data());
-         // flag tensors to not be written in a file
-         model.SetNotWritableInitializedTensor(nameA);
-         model.SetNotWritableInitializedTensor(nameB);
-         model.SetNotWritableInitializedTensor(nameC);
-
-         fIsOutputConstant = true;
-         if (model.Verbose())
-            std::cout << "Where op ---> " << fNY << "  " << ConvertShapeToString(fShapeY) << " : "
-               << ConvertValuesToString(dataY) << std::endl;
-         
-         // output is a constant tensor
-         fOutputTensorNames.pop_back();
-      }
-      else {
-        model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY);
-      }
-   }
-
-   std::string GenerateInitCode() override {
-      std::stringstream out;
-      return out.str();
-   }
-
-   std::string Generate(std::string OpName) override {
-
-      if (fIsOutputConstant) return "";
-
-      OpName = "op_" + OpName;
-
-      if (fShapeY.empty()) {
-         throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first");
-      }
-      std::stringstream out;
-      out << SP << "\n//-------- Where   \n";
-      size_t length = ConvertShapeToLength(fShapeY);
-      std::string typeName = TensorType<T>::Name();
-      // Broadcast A if it's uninitialized
-      if (fShapeA != fShapeY) {
-         out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n";
-         //out << SP << "{\n";
-         out << SP  << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY)
-                         << ", fTensor_" << fNBroadcastedA << ");\n";
-      }
-      // Broadcast B if it's uninitialized
-      if (fShapeB != fShapeY) {
-         out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n";
-         //out << SP << "{\n";
-         out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY)
-                   << ", fTensor_" << fNBroadcastedB << ");\n";
-      }
-       // Broadcast C if it's uninitialized
-      if (fShapeC != fShapeY) {
-         // special case if C is an input tensor
-         if (fIsInputBoolTensor) {
-            size_t inputLength = ConvertShapeToLength(fShapeC);
-            out << SP << "std::vector<bool> fTensor_" << fNC << "(tensor_" << fNC <<  ", tensor_" << fNC << " + " << inputLength << ");\n";
-         }
-         out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n";
-         //out << SP << "{\n";
-         // for boolean we need to pass vector<bool> and use the non-template version of the function
-         out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY)
-                   << ", fTensor_" << fNBroadcastedC << ");\n";
-      }
-      std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA;
-      std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB;
-      std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC;
-      out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
-      // get output tensor applying condition (note we need to use directly the vector<bool> since v.data(),  i.e the data pointer, does not exist)
-      out << SP << SP << "tensor_" << fNY << "[id] = "  << "(fTensor_" << nameC << "[id]) ? tensor_"
-                               << nameA << "[id] : tensor_" + nameB + "[id];\n";
-      out << SP << "}\n";
-      return out.str();
-   }
-
-};
-
-}//SOFIE
-
-
-#endif //SOFIE_ROperator_Where
diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx
deleted file mode 100644
index e5495ed..0000000
--- a/src/SOFIE_core/src/RModel.cxx
+++ /dev/null
@@ -1,1327 +0,0 @@
-#include <limits>
-#include <algorithm>
-#include <cctype>
-#include <memory>
-#include <string>
-
-#include "TFile.h"
-
-#include "SOFIE/RModel.hxx"
-#include "SOFIE/SOFIE_common.hxx"
-
-
-namespace SOFIE {
-
-std::underlying_type_t<Options> operator|(Options opA, Options opB) {
-    return static_cast<std::underlying_type_t<Options>>(opA) | static_cast<std::underlying_type_t<Options>>(opB);
-}
-std::underlying_type_t<Options> operator|(std::underlying_type_t<Options> opA, Options opB) {
-    return opA | static_cast<std::underlying_type_t<Options>>(opB);
-}
-
-RModel::RModel(RModel&& other) {
-    fInputTensorInfos = std::move(other.fInputTensorInfos);
-    fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos);
-    fOutputTensorNames = other.fOutputTensorNames;
-    fInputTensorNames = other.fInputTensorNames;
-    fOperators = std::move(other.fOperators);
-    fInitializedTensors = std::move(other.fInitializedTensors);
-    fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos);
-    fName = other.fName;
-    fFileName = other.fFileName;
-    fParseTime = other.fParseTime;
-    fGC = other.fGC;
-    fNeededBlasRoutines = other.fNeededBlasRoutines;
-    fNeededStdLib = other.fNeededStdLib;
-}
-
-RModel& RModel::operator=(RModel&& other) {
-    fInputTensorInfos = std::move(other.fInputTensorInfos);
-    fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos);
-    fOutputTensorNames = other.fOutputTensorNames;
-    fInputTensorNames = other.fInputTensorNames;
-    fOperators = std::move(other.fOperators);
-    fInitializedTensors = std::move(other.fInitializedTensors);
-    fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos);
-    fName = other.fName;
-    fFileName = other.fFileName;
-    fParseTime = other.fParseTime;
-    fGC = other.fGC;
-    fNeededBlasRoutines = other.fNeededBlasRoutines;
-    fNeededStdLib = other.fNeededStdLib;
-    return *this;
-}
-
-const std::vector<size_t>& RModel::GetTensorShape(std::string name) const {
-    auto f = fReadyInputTensorInfos.find(name);
-    if (f != fReadyInputTensorInfos.end()) {
-        return f->second.shape;
-    }
-    auto f2 = fInitializedTensors.find(name);
-    if (f2 != fInitializedTensors.end()) {
-        return f2->second.shape();
-    }
-    auto f3 = fInputTensorInfos.find(name);
-    if (f3 != fInputTensorInfos.end()) {
-        throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is an input tensor with unspecified dimension parameter");
-    }
-    auto f4 = fIntermediateTensorInfos.find(name);
-    if (f4 != fIntermediateTensorInfos.end()) {
-        return f4->second.shape;
-    }
-    if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end())
-      throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a dynamic tensor. Use GetDynamicTensorShape instead of GetTensorShape");
-
-   if (fIsSubGraph && fParentGraph)
-      return fParentGraph->GetTensorShape(name);
-
-    throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found");
-}
-
-std::vector<Dim> RModel::GetDynamicTensorShape(std::string name) const {
-   if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) {
-      return f->second.shape;
-   }
-   if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) {
-      return f->second.shape;
-   }
-   // in case is not a dynamic tensor convert normal shape to Dim one
-   // for this we need to return the vector by value
-   return ConvertShapeToDim(GetTensorShape(name));
-}
-
-const ETensorType& RModel::GetTensorType(std::string name) const {
-    auto f = fReadyInputTensorInfos.find(name);
-    if (f != fReadyInputTensorInfos.end()) {
-        return f->second.type;
-    }
-    auto f2 = fInitializedTensors.find(name);
-    if (f2 != fInitializedTensors.end()) {
-        return f2->second.type();
-    }
-    auto f3 = fInputTensorInfos.find(name);
-    if (f3 != fInputTensorInfos.end()) {
-        return f3->second.type;
-    }
-    auto f4 = fIntermediateTensorInfos.find(name);
-    if (f4 != fIntermediateTensorInfos.end()) {
-        return f4->second.type;
-    }
-    auto f5 = fDynamicTensorInfos.find(name);
-    if (f5 != fDynamicTensorInfos.end()){
-      return f5->second.type;
-    }
-
-    if (fIsSubGraph && fParentGraph)
-      return fParentGraph->GetTensorType(name);
-
-    throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the type is requested is not found, model name: " + fName);
-}
-
-bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) {
-    if (fReadyInputTensorInfos.find(tensor_name) != fReadyInputTensorInfos.end())  return true;
-    if (fInputTensorInfos.find(tensor_name) != fInputTensorInfos.end()) return true;
-    if (fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true;
-    if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true;
-    if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true;
-    if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name);
-    return false;
-}
-
-void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<Dim> shape) {
-    input_name = UTILITY::Clean_name(input_name);
-    if (CheckIfTensorAlreadyExist(input_name)) {
-        throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n");
-    }
-
-    InputTensorInfo inputInfo { type, shape };
-    fInputTensorInfos[input_name] = inputInfo;
-}
-
-void RModel::AddInputTensorInfo(std::string input_name, ETensorType type, std::vector<size_t> shape) {
-    input_name = UTILITY::Clean_name(input_name);
-    if (CheckIfTensorAlreadyExist(input_name)) {
-        throw std::runtime_error("TMVA-SOFIE: input tensor with name " + input_name + " already exists \n");
-    }
-    TensorInfo inputInfo { type, shape };
-    fReadyInputTensorInfos[input_name] = inputInfo;
-}
-
-void RModel::AddInputTensorName(std::string input_name) {
-    fInputTensorNames.emplace_back(UTILITY::Clean_name(input_name));
-}
-
-void RModel::AddOperator(std::unique_ptr<ROperator> op, int order_execution) {
-    AddBlasRoutines(op->GetBlasRoutines());
-    auto libs = op->GetStdLibs();
-    auto op_input_tensors = op->GetOpInputTensors();
-    for (auto& stdlib : libs) {
-        AddNeededStdLib(stdlib);
-    }
-    if (order_execution >= 0) {
-        fOperators.insert(fOperators.begin() + order_execution, std::move(op));
-    } else {
-        fOperators.push_back(std::move(op));
-    }
-
-    // storing the last usage of tensors which are input to
-    // operators (but are not inputs to the model, i.e. they are intermediate
-    // tensors). This information is needed to keep a check on when a
-    // particular intermediate tensor can be flushed to free up memory for reuse.
-   for(size_t index = 0; index<op_input_tensors.size() &&
-         fInitializedTensors.find(UTILITY::Clean_name(std::string(op_input_tensors[index]))) == fInitializedTensors.end() &&
-         std::find(fInputTensorNames.begin(), fInputTensorNames.end(),
-                   UTILITY::Clean_name(std::string(op_input_tensors[index]))) == fInputTensorNames.end() &&
-         fDynamicTensorInfos.find(UTILITY::Clean_name(std::string(op_input_tensors[index]))) == fDynamicTensorInfos.end();
-         ++index){
-            fIntermediateTensorFrequencyLookup[op_input_tensors[index]] = order_execution;
-   }
-}
-
-void RModel::AddInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
-    tensor_name = UTILITY::Clean_name(tensor_name);
-    //NB: own data
-    if (CheckIfTensorAlreadyExist(tensor_name)) {
-        throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n");
-    }
-    InitializedTensor new_tensor {type, shape, data};
-    fInitializedTensors[tensor_name] = new_tensor;
-}
-
-void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
-    tensor_name = UTILITY::Clean_name(tensor_name);
-    //NB: own data
-    if (CheckIfTensorAlreadyExist(tensor_name)) {
-        throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n");
-    }
-    InitializedTensor new_tensor {type, shape, data, true};   // add here flag to specify is a constant tensor
-    fInitializedTensors[tensor_name] = new_tensor;
-}
-
-bool RModel::IsInitializedTensor(const std::string& tensorName) const {
-    std::string name = UTILITY::Clean_name(tensorName);
-    return fInitializedTensors.find(name) != fInitializedTensors.end();
-}
-bool RModel::IsConstantTensor(const std::string& tensorName) const {
-    std::string name = UTILITY::Clean_name(tensorName);
-    auto itr = fInitializedTensors.find(name);
-    if (itr == fInitializedTensors.end()) return false;
-    return itr->second.IsConstantTensor();
-}
-
-bool RModel::IsDynamicTensor(const std::string& tensorName) const {
-   std::string name = UTILITY::Clean_name(tensorName);
-   return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end();
-}
-bool RModel::IsDimInputTensor(const std::string& tensorName) const {
-   std::string name = UTILITY::Clean_name(tensorName);
-   return fInputTensorInfos.find(name) != fInputTensorInfos.end();
-}
-bool RModel::IsReadyInputTensor(const std::string& tensorName) const {
-   std::string name = UTILITY::Clean_name(tensorName);
-   return fReadyInputTensorInfos.find(name) != fReadyInputTensorInfos.end();
-}
-
-// generic addition of a tensor
-void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<Dim> dim_shape) {
-   auto int_shape = ConvertShapeToInt(dim_shape);
-   if (!int_shape.empty())
-      AddIntermediateTensor(tensor_name, type, int_shape);
-   else
-      AddDynamicTensor(tensor_name, type, dim_shape);
-}
-
-void RModel::AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape) {
-    tensor_name = UTILITY::Clean_name(tensor_name);
-    if (CheckIfTensorAlreadyExist(tensor_name)) {
-        throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n");
-    }
-    TensorInfo new_tensor {type, shape};
-    fIntermediateTensorInfos[tensor_name] = new_tensor;
-}
-
-void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector<Dim> shape){
-   tensor_name = UTILITY::Clean_name(tensor_name);
-   if (CheckIfTensorAlreadyExist(tensor_name)){
-      throw std::runtime_error("TMVA-SOFIE: intermediate tensor with name " + tensor_name + " already exists \n");
-   }
-   DynamicTensorInfo new_tensor {type, shape};
-   fDynamicTensorInfos[tensor_name] = new_tensor;
-   // store shape parameter if not existing
-   for (auto &d : shape) {
-      if (d.isParam) {
-         if (fShapeParams.count(d.param) == 0) {
-            // case parameter is an expression of some other existing parameter, no need to
-            // register it
-            if (d.dim != size_t(-1)) {
-              fShapeParams[d.param] = std::to_string(d.dim);
-            }
-         }
-      }
-   }
-}
-
-void RModel::AddOutputTensorNameList(std::vector<std::string> outputtensornames) {
-    fOutputTensorNames.clear();
-    for(auto& it : outputtensornames) {
-        fOutputTensorNames.emplace_back(UTILITY::Clean_name(it));
-    }
-}
-
-void RModel::UpdateOutputTensorList(std::vector<std::string> curr_output_tensors, std::vector<std::string> new_output_tensors) {
-    for(auto& it:curr_output_tensors) {
-        fOutputTensorNames.erase(std::remove(fOutputTensorNames.begin(), fOutputTensorNames.end(), it), fOutputTensorNames.end());
-    }
-    fOutputTensorNames.insert(fOutputTensorNames.end(), new_output_tensors.begin(), new_output_tensors.end());
-}
-
-void RModel::UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector<std::size_t> shape, std::shared_ptr<void> data) {
-    tensor_name = UTILITY::Clean_name(tensor_name);
-    if (!CheckIfTensorAlreadyExist(tensor_name)) {
-        throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to update it");
-    }
-    InitializedTensor new_tensor {type, shape, data};
-    fInitializedTensors[tensor_name] = new_tensor;
-}
-
-std::shared_ptr<void> RModel::GetInitializedTensorData(std::string tensor_name) {
-    auto f = fInitializedTensors.find(tensor_name);
-    if (f == fInitializedTensors.end()) {
-        throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to get its data");
-    } else {
-        return f->second.sharedptr();
-    }
-}
-
-void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) {
-      auto t = fInitializedTensors.find(tensor_name);
-      if (t == fInitializedTensors.end()) {
-         throw std::runtime_error("TMVA-SOFIE: initialized tensor " + tensor_name + " not found when trying to get its info");
-      }
-      t->second.SetNotWritable();
-   }
-
-std::string RModel:: AllocateIntermediateMemory(std::span<const std::string_view> op_output_tensors) {
-
-   std::string memory_allocation_string = "";
-   bool allocated;
-
-      for (auto& it : op_output_tensors) {
-         allocated = false;
-         if (GetTensorType(std::string(it)) == ETensorType::BOOL ||
-            fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() ||
-            fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue;
-
-         auto tensor_size = GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it)));
-         memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes";
-
-            for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) {
-
-                  // check if available memory chunks can accommodate the tensor
-                  if (chunk->second >= tensor_size) {
-                     auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size);
-                     auto new_chunk_location = chunk->first+chunk->second-tensor_size;
-                     fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk;
-
-                     memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) +
-                                                "* tensor_" + std::string(it) +
-                                                " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n";
-                     chunk->second -= tensor_size;
-
-                     allocated = true;
-
-                     if (chunk->second == 0) {
-                        chunk = fIntermediateMemoryInfo.available_stack.erase(chunk);
-                     }
-
-                     break;
-                  }
-                  ++chunk;
-            }
-
-         if (!allocated) {
-               size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty()
-                                 ? 0
-                                 : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size;
-
-               fIntermediateMemoryInfo.total_stack[chunk_idx] =
-                   {
-                     it,
-                     tensor_size
-                   };
-
-               memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n";
-         }
-   }
-   return memory_allocation_string;
-}
-
-void RModel::CheckAndFlushIntermediateMemory(std::span<const std::string_view> op_input_tensors, const size_t& op_idx){
-   for (auto &it : op_input_tensors){
-      // last occurence of the tensor is reached => flush it from memory
-      if (fIntermediateTensorFrequencyLookup[it] == op_idx) {
-         for (auto chunk = fIntermediateMemoryInfo.total_stack.begin();
-               chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) {
-               if (chunk->second.tensor_name == it) {
-
-                     // check if nearby chunks in available memory can coalesce
-                     auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx
-                     auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx
-
-                     // check if the next stack entry is actually adjacent in memory
-                     if (last_smaller->first+last_smaller->second + 1 == chunk->first){
-                        last_smaller->second += chunk->second.tensor_size;
-                        fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second);
-
-                        if (last_smaller->first + last_smaller->second + 1 == first_greater->first){
-                              fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
-                              first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
-                        }
-                     } else{
-                        if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){
-                           fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]);
-                           first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater);
-                        }
-                        fIntermediateMemoryInfo.available_stack.insert({
-                           chunk->first,
-                           chunk->second.tensor_size
-        });
-                     }
-               }
-         }
-      }
-   }
-}
-
-
-
-void RModel::Initialize(int batchSize, bool verbose) {
-   std::map<std::string, size_t> inputParams;
-   if (batchSize > 0) {
-      inputParams["input_size"] = batchSize;
-      inputParams["batch_size"] = batchSize;
-      inputParams["bs"] = batchSize;
-   }
-   Initialize(inputParams, verbose);
-   fIntermediateMemoryInfo = MemoryPoolInfo();
-}
-// Initialize the model for the given values of the shape parameters.
-//
-// Parametric dimensions of the input tensors are replaced by the values found in
-// inputParams. Inputs whose shape becomes fully defined are re-registered with the
-// fixed shape; for the others the shape parameters are stored in fShapeParams.
-// Afterwards every operator is initialized in order and, for each output tensor that
-// is neither a model output, an initialized tensor nor a dynamic tensor, the index of
-// the first operator listing it as output is stored in fIntermediateTensorFrequencyLookup.
-// The function returns immediately if the model is already initialized.
-//
-// inputParams: map from shape-parameter name (e.g. "batch_size") to its value
-// verbose:     if true print diagnostic information on std::cout
-void RModel::Initialize(const std::map<std::string, size_t> & inputParams, bool verbose) {
-
-   fVerbose = int(verbose);
-
-   if (fIsInitialized) {
-      if (verbose)
-         std::cout << "Model is already initialized  - skip initialization " << std::endl;
-      return;
-   }
-   fIntermediateTensorInfos.clear();
-   fDynamicTensorInfos.clear();
-
-   // loop on the inputs and see if their shape can be fully specified
-   // if the batch size is provided it can be used to specify the full shape
-   // the fully specified tensors are added to the fReadyInputTensorInfos collection
-   auto originalInputTensorInfos = fInputTensorInfos; // need to copy because we may delete elements
-   for (auto &input : originalInputTensorInfos) {
-      if (verbose) std::cout << "looking at the tensor " << input.first << std::endl;
-      // if a parameter (e.g. batch_size) is specified use it for converting the parametric shape in a defined one
-      if (!inputParams.empty()) {
-         for (auto &d : input.second.shape) {
-            if (d.isParam) {
-               std::string pname = d.param;
-               // a parameter named "<tensor name>_size" is looked up as "input_size"
-               if (pname == input.first + "_size") pname = "input_size";
-               auto itr = inputParams.find(pname);
-               if (itr != inputParams.end() ) {
-                  d = Dim{ itr->second };
-                  if (verbose)
-                     std::cout << "Tensor: " << input.first << " - fix parametric shape " << itr->first << " to " << itr->second << std::endl;
-               }
-            }
-         }
-      }
-      // see if the shape is now fully defined (an empty result means it is still parametric)
-      auto shape = ConvertShapeToInt(input.second.shape);
-      if (verbose)
-         std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from "
-            << ConvertDynamicShapeToString(input.second.shape) << std::endl;
-      if (!shape.empty()) {
-         // case shape is defined (not parametric): we add the tensor in the fReadyInputTensorInfos map and
-         // we remove the tensor from fInputTensorInfos where the old parametric shape was stored
-         fInputTensorInfos.erase(input.first);
-         // add to the ready input tensor information the new fixed shape
-         AddInputTensorInfo(input.first, input.second.type, shape);
-         // check consistency
-         assert( fReadyInputTensorInfos.size() + fInputTensorInfos.size() == fInputTensorNames.size());
-      }
-      else {
-         // shape is still parametric: store the found shape parameters
-         for (auto &d : input.second.shape) {
-            if (d.isParam)
-               fShapeParams[d.param] = std::to_string(d.dim);
-         }
-      }
-   }
-
-   if (verbose) {
-      PrintRequiredInputTensors();
-      PrintDynamicTensors();
-   }
-
-   // check if there are initialized tensors to write in a weight file
-   // support for the time being only weights of FLOAT type
-   if (fUseWeightFile) {
-      bool modelHasWeights = false;
-      for (auto &i : fInitializedTensors) {
-         if (i.second.type() == ETensorType::FLOAT) {
-            modelHasWeights = true;
-            break;
-         }
-      }
-      if (!modelHasWeights)
-         fUseWeightFile = false;
-   }
-
-   // Go through the model and initialize each operator
-   for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) {
-      if (verbose) {
-         auto& r = *fOperators[op_idx].get();
-         std::cout << "Initializing operator " << op_idx << "  " << typeid(r).name() << std::endl;
-      }
-      fOperators[op_idx]->Initialize(*this);
-      // record the operator index for output tensors not yet registered which are
-      // neither model outputs, nor initialized tensors, nor dynamic tensors
-      for (auto &it : fOperators[op_idx]->GetOpOutputTensors()) {
-         if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() &&
-             std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() &&
-             fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() &&
-             fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()) {
-            fIntermediateTensorFrequencyLookup[it] = op_idx;
-         }
-      }
-   }
-
-   fIsInitialized = true;
-}
-
-// Attach a sub-graph model to this (parent) model and initialize it with the
-// batch size and verbosity of the parent. The BLAS routines and standard libraries
-// needed by the sub-graph are added to the parent, and the parent input tensor
-// names are appended to those of the sub-graph.
-void RModel::InitializeSubGraph(std::shared_ptr<RModel>  graph) {
-   // keep the subgraph in the list owned by the parent
-   fSubGraphs.push_back(graph);
-
-   RModel &sub = *graph;
-   // parent link and sub-graph flag must be set before initializing
-   sub.fParentGraph = this;
-   sub.fIsSubGraph = true;
-
-   sub.Initialize(fBatchSize, fVerbose);
-
-   // the sub-graph uses the same options as the parent model
-   sub.fWeightFile = fWeightFile;
-   sub.fUseWeightFile = fUseWeightFile;
-   sub.fUseSession = fUseSession;
-
-   // the parent needs the blas routines and the libs used by the sub-graph
-   std::vector<std::string> routines;
-   for (auto & routine : sub.fNeededBlasRoutines)
-      routines.push_back(routine);
-   AddBlasRoutines(routines);
-   for (auto lib : sub.fNeededStdLib)
-      AddNeededStdLib(lib);
-
-   // the parent input tensors are also inputs of the sub-graph
-   for (auto & inputName : fInputTensorNames)
-      sub.fInputTensorNames.emplace_back(inputName);
-
-   // clean the sub-graph name
-   sub.fName = UTILITY::Clean_name(sub.fName);
-
-}
-
-// Generate the code declaring and initializing one constant tensor, i.e. a tensor
-// which is not read from a weight file (e.g. one created by a Constant operator).
-// Tensors with at most 100 elements are emitted as plain arrays; larger ones as a
-// std::vector (to reduce compilation time) together with a pointer "tensor_<name>"
-// to its data.
-template <typename T>
-std::string GenerateConstantTensorCode(const std::pair<std::string, InitializedTensor> &t)
-{
-   const std::string &name = t.first;
-   const std::string typeName = ConvertTypeToString(t.second.type());
-   const size_t nElements = ConvertShapeToLength(t.second.shape());
-   const T *values = t.second.data<T>();
-
-   std::stringstream code;
-
-   // small tensor: plain array initialized with the list of values
-   if (nElements <= 100) {
-      code << typeName << " tensor_" << name << "[" << nElements << "] = " << ConvertValuesToString(nElements, values) << ";\n";
-      return code.str();
-   }
-
-   // large tensor: check if all the values are the same, stopping at the first difference
-   bool allEqual = true;
-   for (size_t k = 1; allEqual && k < nElements; ++k)
-      allEqual = (values[k] == values[k - 1]);
-
-   code << "std::vector<" << typeName << "> fTensor_" << name << " = ";
-   if (allEqual) {
-      // compact form: vector filled with the single repeated value
-      code << "std::vector<" << typeName << ">(" << nElements << ", " << ConvertValToString(values[0]) << ");\n";
-   } else {
-      code << ConvertValuesToString(nElements, values) << ";\n";
-   }
-   code << "const " << typeName << " * tensor_" << name << " = fTensor_" << name << ".data();\n";
-   return code.str();
-}
-
-// Append to the generated code the declarations of the initialized tensors.
-// Constant tensors (and all tensors when no weight file is used) are declared with
-// their values; the others are declared as std::vector of the right size, to be
-// filled when reading the weight file. Only FLOAT (and INT64 for constants) are handled.
-void RModel::GenerateInitializedTensorInfo()
-{
-   if (!fInitializedTensors.empty())
-      fGC += "// initialized tensors\n";
-
-   for (auto &tensor : fInitializedTensors) {
-      const bool readFromFile = fUseWeightFile && !tensor.second.IsConstantTensor();
-
-      if (readFromFile) {
-         // only the storage is declared here, values come from the weight file
-         const std::string length = std::to_string(ConvertShapeToLength(tensor.second.shape()));
-         if (tensor.second.type() != ETensorType::FLOAT)
-            continue;
-         fGC += "std::vector<float> fTensor_" + tensor.first + " = std::vector<float>(" + length + ");\n";
-         fGC += "float * tensor_" + tensor.first + " = fTensor_" + tensor.first + ".data();\n";
-         continue;
-      }
-
-      // tensor values are written directly in the generated code
-      if (tensor.second.type() == ETensorType::FLOAT)
-         fGC += GenerateConstantTensorCode<float>(tensor);
-      else if (tensor.second.type() == ETensorType::INT64)
-         fGC += GenerateConstantTensorCode<int64_t>(tensor);
-   }
-}
-
-// Append to the generated code the allocation of the memory pool used by the
-// intermediate tensors. Nothing is generated when total_stack is empty.
-void RModel::GenerateIntermediateMemoryPool() {
-   if (fIntermediateMemoryInfo.total_stack.size() == 0) return;
-
-   // pool size: position of the last chunk in total_stack plus the size of its tensor
-   const auto lastChunk = fIntermediateMemoryInfo.total_stack.rbegin();
-   const auto poolSize = lastChunk->first + lastChunk->second.tensor_size;
-
-   fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n";
-
-   // the pool is a block of char (1 byte each), so that tensors of any other
-   // data type can be placed in it
-   fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(poolSize) + "];\n\n";
-}
-
-// Append to the generated code the declarations of the intermediate tensors and of
-// the dynamic tensors. BOOL intermediate tensors are declared as std::vector<bool>;
-// FLOAT, DOUBLE and INT64 intermediate tensors are declared here only when they are
-// neither in fIntermediateTensorFrequencyLookup nor model outputs. Dynamic tensors
-// are only declared (empty vector and null pointer).
-void RModel::GenerateIntermediateTensorInfo() {
-   // C++ element type used to store a tensor; empty for the types not handled here
-   auto elementTypeName = [](ETensorType type) -> std::string {
-      switch (type) {
-         case ETensorType::FLOAT: return "float";
-         case ETensorType::DOUBLE: return "double";
-         case ETensorType::INT64: return "int64_t";
-         default: return "";
-      }
-   };
-
-   std::string declarations;
-   for (auto &info : fIntermediateTensorInfos) {
-      const std::string &name = info.first;
-
-      if (info.second.type == ETensorType::BOOL) {
-         // no pointer is needed for BOOL
-         declarations += "std::vector<bool> fTensor_" + name + " = std::vector<bool>(" + std::to_string(ConvertShapeToLength(info.second.shape)) + ");\n";
-      }
-
-      // skip the tensors registered in the frequency lookup and the model outputs
-      if (fIntermediateTensorFrequencyLookup.find(name) != fIntermediateTensorFrequencyLookup.end())
-         continue;
-      if (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) != fOutputTensorNames.end())
-         continue;
-
-      const std::string length = std::to_string(ConvertShapeToLength(info.second.shape));
-      const std::string ctype = elementTypeName(info.second.type);
-      if (ctype.empty())
-         continue;
-      declarations += "std::vector<" + ctype + "> fTensor_" + name + " = std::vector<" + ctype + ">(" + length + ");\n";
-      declarations += ctype + " * tensor_" + name + " = fTensor_" + name + ".data();\n";
-   }
-
-   if (!declarations.empty())
-      fGC += "\n//--- declare and allocate the intermediate tensors\n" + declarations;
-
-   // add also the dynamic tensors (only declarations, allocation will be done later)
-   if (fDynamicTensorInfos.empty())
-      return;
-   fGC += "//--- declare the dynamic tensors\n";
-   for (auto &info : fDynamicTensorInfos) {
-      const std::string ctype = elementTypeName(info.second.type);
-      if (ctype.empty())
-         continue;
-      fGC += "std::vector<" + ctype + "> fTensor_" + info.first + ";\n";
-      fGC += ctype + " * tensor_" + info.first + " = nullptr;\n";
-   }
-}
-
-// Collect the declaration code of all the operators and append it, if not empty,
-// to the generated code (these declarations are defined in the Session class)
-void RModel::GenerateOperatorDeclarations() {
-   std::string declarations;
-   for (size_t k = 0; k < fOperators.size(); ++k)
-      declarations += fOperators[k]->GenerateDeclCode();
-
-   if (!declarations.empty()) {
-      fGC += "\n//---- operator declarations \n";
-      fGC += declarations;
-      fGC += "\n";
-   }
-}
-
-// Append to the generated code the allocation of the dynamic tensors: each one is
-// resized to the length given by its shape expression, when that is greater than
-// zero, and its data pointer is updated.
-void RModel::GenerateDynamicTensorInfo() {
-    fGC += "//---- allocate the intermediate dynamic tensors\n";
-
-    std::stringstream code;
-    for (auto & tensor: fDynamicTensorInfos) {
-        const auto & name = tensor.first;
-        // expression for the number of elements of the tensor
-        auto nElements = ConvertDynamicShapeToLength(tensor.second.shape);
-
-        code << SP << "if (" << nElements << " > 0) {\n"
-             << SP << SP << "fTensor_" << name << ".resize(" << nElements << ");\n"
-             << SP << SP << "tensor_" << name << " = fTensor_" << name << ".data();\n"
-             << SP << "}\n";
-    }
-    fGC += code.str();
-}
-
-// Build the argument list of the infer function from the model inputs.
-// With isdecl = true the declaration form is produced, e.g.
-// "size_t bs,float* tensor_x,float* tensor_y"; otherwise only the calling form
-// "bs,tensor_x,tensor_y". Each shape parameter is listed once, before the first
-// input tensor using it. Throws std::runtime_error for an unsupported input type.
-std::string RModel::GenerateInferSignature(bool isdecl) {
-   std::string signature;
-   // shape parameters already listed, with the index of the first input using them
-   std::unordered_map<std::string, int> listedParams;
-   int inputIndex = 0;
-
-   for (auto &name : fInputTensorNames) {
-      // for an input with parametric shape pass first the shape parameters
-      if (IsDimInputTensor(name)) {
-         auto dims = GetDynamicTensorShape(name);
-         for (auto &d : dims) {
-            if (!d.isParam)
-               continue;
-            // emplace fails if the parameter was already listed for a previous input
-            if (!listedParams.emplace(d.param, inputIndex).second)
-               continue;
-            if (isdecl)
-               signature += "size_t ";
-            signature += d.param + ",";
-         }
-      }
-
-      if (isdecl) {
-         const std::string typeName = ConvertTypeToString(GetTensorType(name));
-         if (typeName == "other")
-            throw std::runtime_error("TMVA-SOFIE: input tensor " + name +
-                                     " is of a data type which is not yet supported.");
-         signature += typeName + "* ";
-      }
-      signature += "tensor_" + name + ",";
-      ++inputIndex;
-   }
-
-   // drop the trailing ","
-   if (!fInputTensorNames.empty())
-      signature.pop_back();
-   return signature;
-}
-
-namespace {
-
-// Return the expression used in the generated "return" statement for the output
-// tensor with the given name:
-//  - "{}" for an empty name
-//  - for an intermediate tensor: the fTensor_<name> vector itself if BOOL, otherwise
-//    a std::vector copied from the tensor_<name> data using the fixed tensor length
-//  - otherwise a std::vector copied using the length expression of the dynamic shape
-//    (from the fTensor_<name> iterators for a dynamic BOOL tensor)
-std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor)
-{
-   if (name.empty())
-      return "{}";
-
-   const ETensorType tensorType = rmodel.GetTensorType(name);
-   const std::string typeName = ConvertTypeToString(tensorType);
-   const bool isBool = (tensorType == ETensorType::BOOL);
-   const std::string dataPtr = "tensor_" + name;
-
-   if (isIntermediateTensor) {
-      if (isBool)
-         return "fTensor_" + name;
-      // copy only the tensor length (we don't want to return a vector with a larger size)
-      const std::string nElements = std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name)));
-      return "std::vector<" + typeName + ">(" + dataPtr + ", " + dataPtr + " + " + nElements + ")";
-   }
-
-   // dynamic tensors are included here, since their vectors can be allocated with a size
-   // larger than the output; the bool type, stored as vector<bool>, needs a special handling
-   const auto dynLength = ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name));
-   if (rmodel.IsDynamicTensor(name) && isBool) {
-      const std::string first = "fTensor_" + name + ".begin()";
-      return "std::vector<bool>(" + first + ", " + first + " + " + dynLength + ")";
-   }
-   return "std::vector<" + typeName + ">(" + dataPtr + ", " + dataPtr + " + " + dynLength + ")";
-}
-
-} // namespace
-
-// Append to the generated code the infer function: return type, signature, the code
-// of every operator and the return statement. The return type is a std::vector for a
-// single output, a vector of vectors for several outputs of the same type and a
-// std::tuple of vectors otherwise. Throws std::runtime_error if the model has no outputs.
-void RModel::GenerateOutput() {
-
-   if (fVerbose)
-      std::cout << "Generating main inference code for " << fName << std::endl;
-
-   const size_t nOutputs = fOutputTensorNames.size();
-   if (nOutputs == 0)
-      throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported");
-
-   const ETensorType firstType = GetTensorType(fOutputTensorNames.front());
-   const std::string firstTypeName = ConvertTypeToString(firstType);
-   fGC += "\n\n";
-
-   // return type of the infer function
-   if (nOutputs == 1) {
-      fGC += "std::vector<" + firstTypeName + ">";
-   } else {
-      // check if all the outputs have the type of the first one
-      bool allSameType = true;
-      for (size_t k = 1; k < nOutputs; ++k) {
-         if (GetTensorType(fOutputTensorNames[k]) != firstType)
-            allSameType = false;
-      }
-      if (allSameType) {
-         fGC += "std::vector<std::vector<" + firstTypeName + ">>";
-      } else {
-         // different types: return a tuple with one vector per output
-         std::string tupleType = "std::tuple<";
-         for (size_t k = 0; k < nOutputs; ++k) {
-            if (k > 0) tupleType += ",";
-            tupleType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[k])) + ">";
-         }
-         tupleType += ">";
-         fGC += tupleType;
-      }
-   }
-
-   fGC += " infer(";
-   fGC += GenerateInferSignature();
-   fGC += "){\n";
-
-   // body: code of all the operators, in order
-   size_t opIndex = 0;
-   for (auto &op : fOperators) {
-      if (fVerbose) std::cout << "Generating code for operator .... " << opIndex << std::endl;
-      fGC += op->Generate(std::to_string(opIndex));
-      ++opIndex;
-   }
-
-   // return statement with the list of the output tensors
-   fGC += SP + "return {";
-   for (size_t k = 0; k < nOutputs; ++k) {
-      if (k > 0)
-         fGC += ",";
-      const std::string outputName = fOutputTensorNames[k];
-      const bool isIntermediate = fIntermediateTensorInfos.find(outputName) != fIntermediateTensorInfos.end();
-      fGC += createOutputTensor(*this, outputName, isIntermediate);
-   }
-   fGC += "};\n";
-   fGC += "}\n";  // end of infer function scope
-}
-
-// Append to the generated code the Session of this model: tensor declarations,
-// intermediate memory pool, operator declarations, sub-graph sessions, the Session
-// constructor (when a session is used) and the infer function.
-void RModel::GenerateSessionCode()
-{
-   // name of the session struct and of its constructor
-   const std::string sessionName = fIsSubGraph ? "Session_" + fName : std::string("Session");
-
-   // open the Session struct (for GNN this is generated in RModel_GNN)
-   if (fUseSession && !fIsGNNComponent)
-      fGC += "struct " + sessionName + " {\n";
-
-   // declarations of the initialized tensors
-   GenerateInitializedTensorInfo();
-
-   // evaluate the total intermediate memory and position the intermediate tensor addresses
-   std::string memoryPositions = "\n// --- Positioning intermediate tensor memory --";
-   for (size_t opIndex = 0; opIndex < fOperators.size(); ++opIndex) {
-      auto &op = fOperators[opIndex];
-      memoryPositions += AllocateIntermediateMemory(op->GetOpOutputTensors());
-      CheckAndFlushIntermediateMemory(op->GetOpInputTensors(), opIndex);
-   }
-
-   // memory pool used by the intermediate tensors, followed by their positions in it
-   GenerateIntermediateMemoryPool();
-   fGC += memoryPositions;
-
-   // declarations of the intermediate tensors and of some specific operators
-   GenerateIntermediateTensorInfo();
-   GenerateOperatorDeclarations();
-
-   // one session data member for each sub-graph
-   if (!fSubGraphs.empty()) fGC += "//   subgraph sessions\n";
-   for (auto & subGraph : fSubGraphs)
-      fGC += "Session_" + subGraph->fName + "  fSession_" + subGraph->fName + ";\n";
-
-   // Session constructor
-   if (fUseSession) {
-      // specific operator code defining session data members
-      fGC += "\n";
-      for (size_t opIndex = 0; opIndex < fOperators.size(); ++opIndex)
-         fGC += fOperators[opIndex]->GenerateSessionMembersCode(std::to_string(opIndex));
-      fGC += "\n";
-
-      // first constructor argument: the weight file name
-      if (fUseWeightFile) {
-         std::string weightFileName = fName;
-         if (fWeightFile == WeightFileType::Text)
-            weightFileName += ".dat";
-         if (fWeightFile == WeightFileType::RootBinary)
-            weightFileName += ".root";
-         fGC += sessionName + "(std::string filename =\"" + weightFileName + "\"";
-      } else {
-         // the weight file is not used, but a string is still passed for compatibility
-         fGC += sessionName + "(std::string = \"\"";
-      }
-      // following arguments: the shape parameters, all assumed of type size_t
-      for (auto &param : fShapeParams) {
-         fGC += ",\n";
-         fGC += "        size_t " + param.first + " = " + param.second;
-      }
-      fGC += ") {\n";
-
-      if (fUseWeightFile) {
-         fGC += "\n//--- reading weights from file\n";
-         ReadInitializedTensorsFromFile(fReadPos);
-         fGC += "\n";
-      }
-
-      // the shape parameters are now available: the dynamic tensors can be allocated
-      GenerateDynamicTensorInfo();
-
-      // initialization code of the operators
-      for (auto &op : fOperators)
-         fGC += op->GenerateInitCode();
-
-      fGC += "}\n\n";
-   }
-
-   // the inference code
-   GenerateOutput();
-
-   // close the Session struct
-   if (fUseSession && !fIsGNNComponent)
-      fGC += "};   // end of Session\n";
-}
-
-// Generate the full code of the model in fGC.
-// options:   bit mask of Options values (session, weight file and GNN flags)
-// batchSize: batch size used to initialize the model
-// pos:       stored in fReadPos and passed when generating the weight-reading code
-// verbose:   print information while generating
-// Throws std::runtime_error if a weight file is requested without a Session class.
-void RModel::Generate(std::underlying_type_t<Options> options, int batchSize, long pos, bool verbose)
-{
-   using OptionBits = std::underlying_type_t<Options>;
-   // true if the given flag is set in the options bit mask
-   auto isSet = [options](Options flag) { return (static_cast<OptionBits>(flag) & options) != 0; };
-
-   fVerbose = verbose;
-   fBatchSize = batchSize;
-   fReadPos = pos;
-
-   // session flag is used in operator initialize
-   if (isSet(Options::kNoSession)) {
-      fUseSession = false;
-      fWeightFile = WeightFileType::None;
-   }
-   if (isSet(Options::kNoWeightFile)) {
-      fUseWeightFile = false;
-      fWeightFile = WeightFileType::None;
-   }
-   if (isSet(Options::kRootBinaryWeightFile)) {
-      fUseWeightFile = true;
-      fWeightFile = WeightFileType::RootBinary;
-   }
-   if (fUseWeightFile && !fUseSession) {
-      throw std::runtime_error(
-         "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class");
-   }
-
-   if (isSet(Options::kGNN))
-      fIsGNN = true;
-   if (isSet(Options::kGNNComponent))
-      fIsGNNComponent = true;
-
-   // initialize the model including all operators and sub-graphs
-   Initialize(batchSize, verbose);
-
-   // the header is generated only for a model which is neither a GNN component nor a sub-graph
-   std::string headerGuard;
-   if (!(fIsGNNComponent || fIsSubGraph)) {
-      fGC.clear();
-      GenerateHeaderInfo(headerGuard);
-   }
-
-   // the code of the sub-graphs comes first
-   for (auto &subGraph : fSubGraphs) {
-      if (fVerbose)
-         std::cout << "generate session code for subgraph " << subGraph->fName << std::endl;
-      subGraph->GenerateSessionCode();
-      fGC += subGraph->fGC;
-   }
-
-   if (fVerbose)
-      std::cout << "generate Main session code - model  " << fName << std::endl;
-
-   // then the main session code
-   GenerateSessionCode();
-
-   // close what was opened by the header
-   if (!(fIsGNNComponent || fIsSubGraph)) {
-      fGC += ("} //SOFIE_" + fName + "\n");
-      fGC += "\n#endif  // " + headerGuard + "\n";
-   }
-}
-
-// Append to the generated code the statements reading the weight tensors from the
-// weight file, in text or ROOT binary format depending on fWeightFile.
-// Constant and not writable tensors are skipped since they are not stored in the file.
-// pos: position to seek to in the text file (used only for GNN components)
-// Throws std::runtime_error if a weight tensor has a type which cannot be read from
-// the file (only FLOAT for text files; FLOAT, DOUBLE and INT64 for ROOT files).
-void RModel::ReadInitializedTensorsFromFile(long pos) {
-    // generate the code to read initialized tensors from a text data file
-    if (fWeightFile == WeightFileType::Text) {
-        if (fInitializedTensors.empty()) return;
-
-        fGC += "   std::ifstream f;\n";
-        fGC += "   f.open(filename);\n";
-        fGC += "   if (!f.is_open()) {\n";
-        fGC += "      throw std::runtime_error(\"tmva-sofie failed to open file \" + filename + \" for input weights\");\n";
-        fGC += "   }\n";
-
-        if(fIsGNNComponent) {
-            fGC += "   f.seekg(" + std::to_string(pos) + ");\n";
-        }
-
-        fGC += "   std::string tensor_name;\n";
-        fGC += "   size_t length;\n";
-
-        // loop on tensors and parse the file
-        for (auto& i: fInitializedTensors) {
-            // skip Constant and shape tensors (not written in a file)
-            if (!i.second.IsWeightTensor()) continue;
-            std::string tensor_name = "tensor_" + i.first;
-            // the text format supports only FLOAT tensors: fail instead of silently
-            // generating code which goes out of sync with the file content
-            if (i.second.type() != ETensorType::FLOAT) {
-               throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file");
-            }
-            std::string slength = std::to_string(ConvertShapeToLength(i.second.shape()));
-            // the generated code checks name and size found in the file before reading the values
-            fGC += "   f >> tensor_name >> length;\n";
-            fGC += "   if (tensor_name != \"" + tensor_name + "\" ) {\n";
-            fGC += "      std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " +
-                   tensor_name + " , read \" + tensor_name;\n";
-            fGC += "      throw std::runtime_error(err_msg);\n";
-            fGC += "    }\n";
-            fGC += "   if (length != " + slength + ") {\n";
-            fGC += "      std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " +
-                   slength + " , read \" + std::to_string(length) ;\n";
-            fGC += "      throw std::runtime_error(err_msg);\n";
-            fGC += "    }\n";
-            fGC += "   for (size_t i = 0; i < length; ++i)\n";
-            fGC += "      f >> " + tensor_name + "[i];\n";
-            fGC += "   if (f.fail()) {\n";
-            fGC += "      throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n";
-            fGC += "   }\n";
-        }
-        fGC += "   f.close();\n";
-    }
-
-    // generate the code to read initialized tensors from a ROOT data file
-    if(fWeightFile == WeightFileType::RootBinary) {
-        fGC += "  {\n";
-        fGC += "   std::unique_ptr<TFile> rootFile(TFile::Open(filename.c_str(), \"READ\"));\n";
-        // TFile::Open can return a null pointer: check it before calling IsOpen
-        fGC += "   if (!rootFile || !rootFile->IsOpen()) {\n";
-        fGC += "      throw std::runtime_error(\"tmva-sofie failed to open ROOT file for input weights\");\n";
-        fGC += "   }\n";
-
-        std::string dirName = fName + "_weights";
-        fGC += "   if (!rootFile->GetKey(\"" + dirName + "\")) {\n";
-        fGC += "      throw std::runtime_error(\"tmva-sofie failed to open ROOT directory for input weights\");\n";
-        fGC += "   }\n";
-
-        for (auto &i : fInitializedTensors) {
-            // skip Constant and shape tensors
-            if (!i.second.IsWeightTensor()) continue;
-            std::string tensor_name = "tensor_" + i.first;
-            // element type of the std::vector stored in the ROOT file
-            std::string elementType;
-            if (i.second.type() == ETensorType::FLOAT) {
-               elementType = "float";
-            } else if (i.second.type() == ETensorType::DOUBLE) {
-               elementType = "double";
-            } else if (i.second.type() == ETensorType::INT64) {
-               elementType = "int64_t";
-            } else {
-               throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file");
-            }
-            fGC += "  {\n";
-            fGC += "      fTensor_" + i.first + " = *reinterpret_cast<std::vector<" + elementType + ">*>(rootFile->Get(\"";
-            fGC += dirName + "/" + tensor_name + "\"));\n";
-            fGC += "  }\n";
-        }
-        fGC += "  }\n";
-    }
-}
-
-// Write the weight tensors of the model to a file, in the format given by fWeightFile.
-// Constant and not writable tensors (e.g. shape tensors) are not written.
-// filename: name of the output file; if empty fFileName plus the extension
-//           corresponding to the format (".root" or ".dat") is used
-// Returns the position in the file after writing for the text format, -1 otherwise.
-// Throws std::runtime_error if the file cannot be opened or written, if a tensor has
-// a type not supported by the format (only FLOAT for text files; FLOAT, DOUBLE and
-// INT64 for ROOT files) or when writing a GNN model to a ROOT file.
-long RModel::WriteInitializedTensorsToFile(std::string filename) {
-    // Determine the file extension based on the weight file type
-    std::string fileExtension;
-    switch (fWeightFile) {
-    case WeightFileType::None:
-        fileExtension = ".dat";
-        break;
-    case WeightFileType::RootBinary:
-        fileExtension = ".root";
-        break;
-    case WeightFileType::Text:
-        fileExtension = ".dat";
-        break;
-    }
-
-    // If filename is empty, use the model file name as the base filename
-    if (filename.empty()) {
-        filename = fFileName + fileExtension;
-    }
-
-    // Write the initialized tensors to the file
-    if (fWeightFile == WeightFileType::RootBinary) {
-        if(fIsGNNComponent || fIsGNN) {
-            throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file.");
-        }
-        std::unique_ptr<TFile> outputFile(TFile::Open(filename.c_str(), "UPDATE"));
-        // TFile::Open can return a null pointer when the file cannot be opened
-        if (!outputFile || !outputFile->IsOpen())
-            throw std::runtime_error("tmva-sofie failed to open ROOT file " + filename + " for tensor weight data");
-
-        std::string dirName = fName + "_weights";
-        // check if directory exists, in case delete to replace with new one
-        if (outputFile->GetKey(dirName.c_str()))
-            outputFile->rmdir(dirName.c_str());
-
-        auto outputDir = outputFile->mkdir(dirName.c_str());
-
-        for (const auto& item : fInitializedTensors) {
-            // skip Constant tensors and tensors which are not writable (e.g. shape tensors)
-            if (!item.second.IsWeightTensor()) continue;
-            std::string tensorName = "tensor_" + item.first;
-            size_t length = ConvertShapeToLength(item.second.shape());
-            // each tensor is stored as a std::vector of its element type
-            if(item.second.type() == ETensorType::FLOAT) {
-               const float* data = item.second.data<float>();
-               std::vector<float> tensorDataVector(data, data + length);
-               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<float>", tensorName.c_str());
-            }
-            else if(item.second.type() == ETensorType::DOUBLE) {
-               const double* data = item.second.data<double>();
-               std::vector<double> tensorDataVector(data, data + length);
-               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<double>", tensorName.c_str());
-            }
-            else if(item.second.type() == ETensorType::INT64) {
-               const int64_t* data = item.second.data<int64_t>();
-               std::vector<int64_t> tensorDataVector(data, data + length);
-               outputDir->WriteObjectAny(&tensorDataVector, "std::vector<int64_t>", tensorName.c_str());
-            }
-            else {
-               throw std::runtime_error("tmva-sofie tensor " + tensorName + " with type " + ConvertTypeToString(item.second.type()) +
-                                        " cannot be written to a ROOT file");
-            }
-        }
-        outputFile->Write(filename.c_str());
-
-        // this needs to be changed, similar to the text file
-        return -1;
-
-    } else if (fWeightFile == WeightFileType::Text) {
-        std::ofstream f;
-        if(fIsGNNComponent) {
-            // appending all GNN components into the same file
-            f.open(filename, std::ios::app);
-        } else {
-            f.open(filename);
-        }
-        if (!f.is_open())
-            throw
-            std::runtime_error("tmva-sofie failed to open file " + filename + " for tensor weight data");
-        for (auto& i: fInitializedTensors) {
-             // skip Constant tensors and not writable tensors (e.g. shape tensors)
-            if (!i.second.IsWeightTensor()) {
-               continue;
-            }
-            size_t length = ConvertShapeToLength(i.second.shape());
-            std::string tensor_name = "tensor_" + i.first;
-            // only FLOAT tensors are supported: check it before writing the tensor header,
-            // so that a header without values is never left in the file
-            if (i.second.type() != ETensorType::FLOAT) {
-               throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file");
-            }
-            f << tensor_name << " " << length << "\n";
-            const float * data = i.second.data<float>();
-            for (size_t idx = 0; idx < length; idx++) {
-               // round to zero sub-normal values
-               float value = data[idx];
-               if (value != 0. && std::abs(value) < std::numeric_limits<float>::min() ) value = 0;
-               // max_digits10 digits, enough to read back exactly the same float value
-               f << std::setprecision(std::numeric_limits<float>::max_digits10) << value;
-               f <<  ( (idx < length-1) ? " " : "\n" );
-            }
-            if (f.fail())
-               throw std::runtime_error("tmva-sofie failed to write tensor data to file for  " + tensor_name);
-        }
-        long curr_pos = f.tellp();
-        f.close();
-        return curr_pos;
-    } else {
-        return -1;
-    }
-}
-
-// Print on std::cout name, type and shape of the model inputs: first the ones with
-// a parametric shape, then the ones with a fully specified shape
-void RModel::PrintRequiredInputTensors() {
-    std::cout << "Model requires following inputs:\n";
-
-    for (auto& tensor: fInputTensorInfos) {
-        const auto & dims = tensor.second.shape;
-        std::cout << "Parametrised Tensor name: " << tensor.first << "\t";
-        std::cout << "type: " << ConvertTypeToString(tensor.second.type) << "\t";
-        std::cout << "shape: [";
-        for (size_t k = 0; k < dims.size(); k++) {
-            if (k > 0) std::cout << ",";
-            // a parametric dimension is printed with its name, a fixed one with its value
-            if (dims[k].isParam)
-                std::cout << dims[k].param;
-            else
-                std::cout << dims[k].dim ;
-        }
-        std::cout << "]" << std::endl;
-    }
-
-    for (auto& tensor: fReadyInputTensorInfos) {
-        const auto & dims = tensor.second.shape;
-        std::cout << "Fully Specified Tensor name: " << tensor.first << "\t";
-        std::cout << "type: " << ConvertTypeToString(tensor.second.type) << "\t";
-        std::cout << "shape: [";
-        for (size_t k = 0; k < dims.size(); k++) {
-            if (k > 0) std::cout << ",";
-            std::cout << dims[k];
-        }
-        std::cout << "]" << std::endl;
-    }
-    std::cout << "\n";
-}
-
-// Print on std::cout name, type and shape of the initialized tensors, flagging the
-// constant tensors and the ones which are not weight tensors
-void RModel::PrintInitializedTensors() {
-    std::cout << "Model initialized the following tensors:\n";
-    for (auto& tensor: fInitializedTensors) {
-        std::cout << "Tensor name: \"" << tensor.first << "\"\t";
-        std::cout << "type: " << ConvertTypeToString(tensor.second.type()) << "\t";
-        std::cout << "shape: [";
-        const size_t rank = tensor.second.shape().size();
-        for (size_t k = 0; k < rank; k++) {
-            if (k > 0) std::cout << ",";
-            std::cout << tensor.second.shape()[k];
-        }
-        std::cout << "]";
-        if (tensor.second.IsConstantTensor())
-            std::cout << " (Constant)";
-        else if (!tensor.second.IsWeightTensor())
-            std::cout << " (Not Writable)";
-        std::cout << std::endl;
-    }
-    std::cout << "\n";
-}
-
-void RModel::PrintIntermediateTensors() {
-    std::cout << "Model specify the following intermediate tensors:\n";
-    for (auto& it: fIntermediateTensorInfos) {
-        std::cout << "Tensor name: \"" << it.first << "\"\t";
-        std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t";
-        std::cout << "shape: [";
-        for (size_t i = 0; i < it.second.shape.size(); i++) {
-            std::cout << it.second.shape[i];
-            if (i < it.second.shape.size() - 1) std::cout << ",";
-        }
-        std::cout << "]" << std::endl;
-    }
-    std::cout << "\n";
-}
-
-void RModel::PrintDynamicTensors() {
-    std::cout << "Model specify the following dynamic tensors:\n";
-    for (auto& it: fDynamicTensorInfos) {
-        std::cout << "Tensor name: \"" << it.first << "\"\t";
-        std::cout << "type: " << ConvertTypeToString(it.second.type) << "\t";
-        std::cout << "shape: [";
-        for (size_t i = 0; i < it.second.shape.size(); i++) {
-            std::cout << it.second.shape[i].GetVal();
-            if (i < it.second.shape.size() - 1) std::cout << ",";
-        }
-        std::cout << "]" << std::endl;
-    }
-    std::cout << "\n";
-}
-
-void RModel::PrintOutputTensors() {
-    std::cout << "Model specify the following output tensors:\n";
-    for (auto& it: fOutputTensorNames) {
-        std::cout << "Tensor name: \"" << it << "\"\t";
-        if (!IsDynamicTensor(it))
-          std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl;
-       else
-          std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl;
-    }
-    std::cout << "\n";
-}
-
-void RModel::HeadInitializedTensors(std::string name, int n_print) {
-    auto it = fInitializedTensors.find(name);
-    if (it == fInitializedTensors.end()) {
-        std::cout << "Tensor " << name << " not found in model's initialized tensor list" << std::endl;
-        return;
-    }
-
-    std::cout << "Tensor name: " << it->first << "\t";
-    std::cout << "type: " << ConvertTypeToString(it->second.type()) << "\t";
-    int length =1;
-    std::cout << "shape: [";
-    for (size_t i = 0; i < it->second.shape().size(); i++) {
-        std::cout << it->second.shape()[i];
-        length *= it->second.shape()[i];
-        if (i < it->second.shape().size() - 1) std::cout << ",";
-    }
-    std::cout << "]" << std::endl;
-    bool ellipsis = true;
-    if (n_print > length) {
-        n_print = length;
-        ellipsis = false;
-    }
-
-    std::cout << "data: [" << std::endl;
-    if (it->second.type() == ETensorType::FLOAT) {
-        auto converted_data = it->second.data<float>();
-        for (int i =0; i < n_print; i++) {
-            std::cout << converted_data[i];
-            if (i < n_print - 1) std::cout << " ,";
-        }
-    }
-    if (ellipsis) std::cout << ", ...";
-    std::cout << "]" << std::endl;
-
-}
-
-void RModel::OutputGenerated(std::string filename, bool append) {
-
-    RModel_Base::OutputGenerated(filename, append);
-
-    // write weights in a text file
-    if (fUseWeightFile) {
-        if (!filename.empty()) {
-            size_t pos = filename.find(".hxx");
-            if (fWeightFile == WeightFileType::Text)
-                filename.replace(pos, 4, ".dat");
-            if (fWeightFile == WeightFileType::RootBinary)  {
-                filename = filename.erase(pos, 4);
-                filename += ".root";
-            }
-        } else {
-            filename = fName;
-            filename += fWeightFile == WeightFileType::Text ? ".dat" : ".root";
-        }
-        WriteInitializedTensorsToFile(filename);
-    }
-}
-
-void RModel::Streamer(TBuffer &R__b) {
-    if (R__b.IsReading()) {
-        RModel::Class()->ReadBuffer(R__b, this);
-        for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) {
-            i->second.CastPersistentToShared();
-        }
-    }
-    else {
-        for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) {
-            i->second.CastSharedToPersistent();
-        }
-        RModel::Class()->WriteBuffer(R__b, this);
-    }
-}
-
-}//SOFIE
diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt
deleted file mode 100644
index 34bb49f..0000000
--- a/src/SOFIE_core/test/CMakeLists.txt
+++ /dev/null
@@ -1,131 +0,0 @@
-# Copyright (C) 1995-2021, Rene Brun and Fons Rademakers.
-# All rights reserved.
-#
-# For the licensing terms see $ROOTSYS/LICENSE.
-# For the list of contributors see $ROOTSYS/README/CREDITS.
-
-############################################################################
-# CMakeLists.txt file for building TMVA SOFIE tests.
-# @author Federico Sossai, Sanjiban Sengupta
-############################################################################
-
-include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc)
-include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc)
-
-if (NOT ONNX_MODELS_DIR)
-  set(ONNX_MODELS_DIR input_models)
-endif()
-
-# Finding .onnx files to be parsed and creating the appropriate code to
-# parse all file. It is much faster to combine all parsing in a single executable
-# which will avoid initialization time (especially when using ROOT)
-set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");")
-set(ALL_CAPTURES "")
-# Finding .onnx files to be parsed and creating the appropriate command
-file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx")
-foreach(onnx_file ${ONNX_FILES})
-  get_filename_component(fname ${onnx_file} NAME_WE)
-  get_filename_component(fdir ${onnx_file} DIRECTORY)
-  string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR})
-  string(REPLACE "@2" ${fname} cap ${cap})
-  list(APPEND ALL_CAPTURES ${cap})
-endforeach()
-string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}")
-configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY)
-configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY)
-
-ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx
-    LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers
-                             FIXTURES_SETUP sofie-compile-models-onnx-build)
-
-# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17
-target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds)
-
-ROOTTEST_ADD_TEST(SofieCompileModels_ONNX
-  COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname}
-  FIXTURES_REQUIRED sofie-compile-models-onnx-build
-  FIXTURES_SETUP sofie-compile-models-onnx
-)
-
-# Creating a Google Test
-if (BLAS_FOUND)  # we need BLAS for compiling the models
-  ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx
-    LIBRARIES
-      MathCore
-      SOFIE_core
-      BLAS::BLAS
-      GTest::gtest
-      GTest::gtest_main
-    FIXTURES_REQUIRED
-      sofie-compile-models-onnx
-    FIXTURES_SETUP
-      sofie-test-models-onnx-build
-  )
-  target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-  ROOTTEST_ADD_TEST(TestCustomModelsFromONNX
-                    EXEC ./TestCustomModelsFromONNX
-                    FIXTURES_REQUIRED sofie-test-models-onnx-build)
-endif()
-
-# For testing serialisation of RModel object
-
-ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx
-    LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers
-    FIXTURES_SETUP sofie-compile-models-onnx-root
-)
-# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17
-target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds)
-
-# Automatic compilation of headers from root files
-ROOTTEST_ADD_TEST(SofieCompileModels_ROOT
-  COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT
-  FIXTURES_REQUIRED sofie-compile-models-onnx-root
-  FIXTURES_SETUP sofie-compile-models-root
-)
-
-if (BLAS_FOUND)
-  # Creating a Google Test for Serialisation of RModel
-  ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx
-    LIBRARIES
-      SOFIE_core
-      BLAS::BLAS
-      GTest::gtest
-      GTest::gtest_main
-    FIXTURES_REQUIRED
-      sofie-compile-models-root
-    FIXTURES_SETUP
-      sofie-test-models-root-build
-  )
-  target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
-  ROOTTEST_ADD_TEST(TestCustomModelsFromROOT
-                    EXEC ./TestCustomModelsFromROOT
-                    FIXTURES_REQUIRED sofie-test-models-root-build)
-endif()
-
-# Look for needed Python modules
-ROOT_FIND_PYTHON_MODULE(torch)
-if (ROOT_TORCH_FOUND)
-  configure_file(Conv1dModelGenerator.py  Conv1dModelGenerator.py COPYONLY)
-  configure_file(Conv2dModelGenerator.py  Conv2dModelGenerator.py COPYONLY)
-  configure_file(Conv3dModelGenerator.py  Conv3dModelGenerator.py COPYONLY)
-  configure_file(ConvTrans2dModelGenerator.py  ConvTrans2dModelGenerator.py COPYONLY)
-  configure_file(LinearModelGenerator.py  LinearModelGenerator.py COPYONLY)
-  configure_file(RecurrentModelGenerator.py  RecurrentModelGenerator.py COPYONLY)
-
-  if (BLAS_FOUND)
-    ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx
-      LIBRARIES
-        SOFIE_core
-        SOFIE_parsers
-        BLAS::BLAS
-      INCLUDE_DIRS
-        ${CMAKE_CURRENT_BINARY_DIR}
-    )
-  endif()
-endif()
-
-ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core)
-ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN)
-
-ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core)
-ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..12f19b1
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,191 @@
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
+
+############################################################################
+# Basic setup
+############################################################################
+include_directories(${CMAKE_SOURCE_DIR}/core/inc)
+include_directories(${CMAKE_SOURCE_DIR}/parsers/inc)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+if (NOT ONNX_MODELS_DIR)
+  set(ONNX_MODELS_DIR input_models)
+endif()
+
+option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF)
+
+set(ALPAKA_BACKEND "cuda"
+    CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)")
+set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl)
+
+############################################################################
+# Generate emitter sources
+############################################################################
+set(CAPTURE_STR
+"try {\n\
+    EmitModel(\"@1\", \"@2\");\n\
+} catch (const std::exception& e) {\n\
+    std::string msg = e.what();\n\
+    if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\
+        std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\
+    } else if (msg.find(\"is of a data type which is not yet supported\") != std::string::npos) {\n\
+        std::cerr << \"[SKIP] Operator with unsupported data type in @1: \" << msg << std::endl;\n\
+    } else {\n\
+        std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\
+        failures++;\n\
+    }\n\
+} catch (...) {\n\
+    std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\
+    failures++;\n\
+}\n\
+")
+
+file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx")
+
+set(ALL_CAPTURES "")
+foreach(onnx_file ${ONNX_FILES})
+  get_filename_component(fname ${onnx_file} NAME_WE)
+  string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}")
+  string(REPLACE "@2" "${fname}" cap "${cap}")
+  string(APPEND ALL_CAPTURES "${cap}")
+endforeach()
+
+set(EMIT_CAPTURES "${ALL_CAPTURES}")
+
+configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY)
+configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY)
+
+############################################################################
+# Alpaka tests
+############################################################################
+if (ENABLE_ALPAKA_TESTS)
+
+  string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend)  # backend names are compared case-insensitively
+  if (NOT _alpaka_backend MATCHES "^(cuda|cpu|hip|sycl)$")  # same names as the cache STRINGS list above
+    message(FATAL_ERROR "Unsupported ALPAKA_BACKEND=${ALPAKA_BACKEND}")
+  endif()
+
+  FetchContent_Declare(
+    sofieBLAS
+    GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS
+    GIT_TAG        dev
+  )
+  FetchContent_MakeAvailable(sofieBLAS)
+
+  FetchContent_Declare(
+    alpaka
+    GIT_REPOSITORY https://github.com/alpaka-group/alpaka
+    GIT_TAG        2fa91a34ed11b2076e474c5507d920e85cf9b79d
+  )
+  FetchContent_MakeAvailable(alpaka)
+
+  ##########################################################################
+  # Alpaka emitter
+  ##########################################################################
+  ROOTTEST_GENERATE_EXECUTABLE(
+    emitFromONNXAlpaka
+    EmitFromONNX_GPU_ALPAKA_all.cxx
+    LIBRARIES protobuf::libprotobuf SOFIE_core SOFIE_parsers
+    FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build
+  )
+
+  target_compile_options(emitFromONNXAlpaka PRIVATE
+    -Wno-unused-parameter
+    -Wno-array-bounds
+  )
+
+  ROOTTEST_ADD_TEST(
+    SofieCompileModels_ONNX_Alpaka
+    COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka
+    FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka-build
+    FIXTURES_SETUP sofie-compile-models-onnx-alpaka
+  )
+
+  ##########################################################################
+  # CUDA backend
+  ##########################################################################
+  if (_alpaka_backend STREQUAL "cuda")
+
+    message(STATUS "Enabling Alpaka CUDA tests")
+
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+
+    set_source_files_properties(
+      TestCustomModelsFromONNXForAlpakaCuda.cxx
+      PROPERTIES LANGUAGE CUDA
+    )
+
+    ROOTTEST_GENERATE_EXECUTABLE(
+      TestCustomModelsFromONNXForAlpakaCuda
+      TestCustomModelsFromONNXForAlpakaCuda.cxx
+      LIBRARIES SOFIE_core GTest::gtest GTest::gtest_main
+      FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka
+      FIXTURES_SETUP sofie-test-models-onnx-alpaka-build
+    )
+
+    target_include_directories(
+      TestCustomModelsFromONNXForAlpakaCuda PRIVATE
+      ${CMAKE_CURRENT_BINARY_DIR}
+      ${alpaka_SOURCE_DIR}/include
+      ${sofieblas_SOURCE_DIR}/include
+      ${CUDAToolkit_INCLUDE_DIRS}
+      ${CMAKE_CURRENT_SOURCE_DIR}
+    )
+
+    set_target_properties(
+      TestCustomModelsFromONNXForAlpakaCuda
+      PROPERTIES
+        CUDA_SEPARABLE_COMPILATION OFF
+        CUDA_ARCHITECTURES "70;80;86"  # must be ONE list value: PROPERTIES takes name/value pairs
+        CUDA_STANDARD 20
+        CUDA_STANDARD_REQUIRED ON
+    )
+
+    target_compile_definitions(
+      TestCustomModelsFromONNXForAlpakaCuda PRIVATE
+      ALPAKA_ACC_GPU_CUDA_ENABLED
+      ALPAKA_HAS_STD_ATOMIC_REF
+    )
+
+    target_compile_options(
+      TestCustomModelsFromONNXForAlpakaCuda PRIVATE
+      $<$<COMPILE_LANGUAGE:CUDA>:
+        --extended-lambda
+        --expt-relaxed-constexpr
+        --generate-line-info
+        --use_fast_math
+        -g
+        -G
+        # -fsanitize=address
+        -O1
+        -Wno-deprecated-gpu-targets
+      >
+      $<$<COMPILE_LANGUAGE:CXX>:
+        -O2
+        -g
+        -G
+        -fPIC
+        -pthread
+      >
+    )
+  # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address")
+
+    # ROOT-compatible: plain signature only
+    target_link_libraries(
+      TestCustomModelsFromONNXForAlpakaCuda
+      CUDA::cudart
+      CUDA::cublas
+      CUDA::cublasLt
+    )
+
+    ROOTTEST_ADD_TEST(
+      TestCustomModelsFromONNXForAlpakaCuda
+      EXEC ./TestCustomModelsFromONNXForAlpakaCuda
+      FIXTURES_REQUIRED sofie-test-models-onnx-alpaka-build  # the fixture set up by building the executable above
+    )
+
+  endif() # cuda backend
+endif()   # ENABLE_ALPAKA_TESTS
diff --git a/src/SOFIE_core/test/Conv1dModelGenerator.py b/test/Conv1dModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/Conv1dModelGenerator.py
rename to test/Conv1dModelGenerator.py
diff --git a/src/SOFIE_core/test/Conv2dModelGenerator.py b/test/Conv2dModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/Conv2dModelGenerator.py
rename to test/Conv2dModelGenerator.py
diff --git a/src/SOFIE_core/test/Conv3dModelGenerator.py b/test/Conv3dModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/Conv3dModelGenerator.py
rename to test/Conv3dModelGenerator.py
diff --git a/src/SOFIE_core/test/ConvTrans2dModelGenerator.py b/test/ConvTrans2dModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/ConvTrans2dModelGenerator.py
rename to test/ConvTrans2dModelGenerator.py
diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/test/EmitFromONNX.cxx.in
similarity index 77%
rename from src/SOFIE_core/test/EmitFromONNX.cxx.in
rename to test/EmitFromONNX.cxx.in
index f7a56e2..c464f4d 100644
--- a/src/SOFIE_core/test/EmitFromONNX.cxx.in
+++ b/test/EmitFromONNX.cxx.in
@@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) {
 
 int main(int argc, char *argv[]){
 
-@EMIT_CAPTURES@ ;
+
+    int failures = 0;
+
+    @EMIT_CAPTURES@
+
+    std::cout << "[SUMMARY for generation from ONNX] Completed with " << failures << " failures" << std::endl;
+    return failures == 0 ? 0 : 1;
 
 }
 
diff --git a/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/test/EmitFromONNX_GPU_ALPAKA.cxx.in
new file mode 100644
index 0000000..58198c1
--- /dev/null
+++ b/test/EmitFromONNX_GPU_ALPAKA.cxx.in
@@ -0,0 +1,35 @@
+// Author: Sanjiban Sengupta
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "SOFIE/RModel_Base.hxx"
+#include "SOFIE/RModel.hxx"
+#include "SOFIE/RModelParser_ONNX.hxx"
+
+using namespace SOFIE;
+
+// Parse the ONNX model `filename`, generate its Alpaka GPU inference code and
+// write it to "<outname>_FromONNX_GPU_ALPAKA.hxx". Always returns 0; failures are
+// expected to surface as exceptions, which the capture code in main() catches.
+int EmitModel(std::string filename, std::string outname) {
+
+   RModelParser_ONNX parser;
+   RModel model = parser.Parse(filename);
+   model.GenerateGPU_ALPAKA();
+   model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx");
+
+   return 0;
+}
+
+int main(int argc, char *argv[]) {
+
+    // Count of models that failed to emit; the code CMake substitutes below increments it.
+    int failures = 0;
+
+    @EMIT_CAPTURES@
+
+    std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl;
+    return failures == 0 ? 0 : 1;
+}
diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/test/EmitFromRoot.cxx.in
similarity index 83%
rename from src/SOFIE_core/test/EmitFromRoot.cxx.in
rename to test/EmitFromRoot.cxx.in
index 4a630c7..88c0789 100644
--- a/src/SOFIE_core/test/EmitFromRoot.cxx.in
+++ b/test/EmitFromRoot.cxx.in
@@ -43,6 +43,11 @@ int EmitModel(std::string inputfile, std::string outname){
 
 int main(int argc, char *argv[]){
 
-@EMIT_CAPTURES@ ;
+    // Count of models that failed to emit; the code CMake substitutes below increments it.
+    int failures = 0;
 
+    @EMIT_CAPTURES@
+
+    std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl;
+    return failures == 0 ? 0 : 1;
 }
diff --git a/src/SOFIE_core/test/GNN/EmitGNN.cxx b/test/GNN/EmitGNN.cxx
similarity index 100%
rename from src/SOFIE_core/test/GNN/EmitGNN.cxx
rename to test/GNN/EmitGNN.cxx
diff --git a/src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx b/test/GNN/EmitGraphIndependent.cxx
similarity index 100%
rename from src/SOFIE_core/test/GNN/EmitGraphIndependent.cxx
rename to test/GNN/EmitGraphIndependent.cxx
diff --git a/src/SOFIE_core/test/LinearModelGenerator.py b/test/LinearModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/LinearModelGenerator.py
rename to test/LinearModelGenerator.py
diff --git a/src/SOFIE_core/test/RecurrentModelGenerator.py b/test/RecurrentModelGenerator.py
similarity index 100%
rename from src/SOFIE_core/test/RecurrentModelGenerator.py
rename to test/RecurrentModelGenerator.py
diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/test/TestCustomModelsFromONNX.cxx
similarity index 99%
rename from src/SOFIE_core/test/TestCustomModelsFromONNX.cxx
rename to test/TestCustomModelsFromONNX.cxx
index d02dc5e..902cbcc 100644
--- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx
+++ b/test/TestCustomModelsFromONNX.cxx
@@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu)
 {
    constexpr float TOLERANCE = 1;
 
-   // Preparing the standard all-ones input
+   // Preparing input
    std::vector<float> input({
       0.4369, -0.6882,  1.0309, -1.0263, -0.1519,  1.2237, -0.7054, -0.1762,
       -0.6811, -2.2597,  1.0388, -0.7993,  0.1468,  1.3257, -0.4714, -0.0958,
@@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){
    });
 
    SOFIE_Equal::Session s("Equal_FromONNX.dat");
-   std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool));
 
@@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){
    });
 
    SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat");
-   std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool));
 
@@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){
    });
 
    SOFIE_GreaterOrEqual::Session s("GreaterOrEqual_FromONNX.dat");
-   std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool));
 
@@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){
    });
 
    SOFIE_Greater::Session s("Greater_FromONNX.dat");
-   std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool));
 
@@ -2615,7 +2615,7 @@ TEST(ONNX, Less){
    });
 
    SOFIE_Less::Session s("Less_FromONNX.dat");
-   std::vector<bool> output = s.infer(input1.data(),input2.data());
+   std::vector<std::uint8_t> output = s.infer(input1.data(),input2.data());
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool));
 
@@ -2849,6 +2849,7 @@ TEST(ONNX, Slice_Neg) {
    }
 
 }
+
 TEST(ONNX, RangeFloat) {
    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
 
diff --git a/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
new file mode 100644
index 0000000..fccacbe
--- /dev/null
+++ b/test/TestCustomModelsFromONNXForAlpakaCuda.cxx
@@ -0,0 +1,3163 @@
+#include <numeric>
+#include <cstddef>
+
+// ── Trilu ──────────────────────────────────────────────────────────────────
+#include "Trilu_upper_FromONNX_GPU_ALPAKA.hxx"
+#include "Trilu_lower_FromONNX_GPU_ALPAKA.hxx"
+#include "Trilu_k2_FromONNX_GPU_ALPAKA.hxx"
+#include "Trilu_kn1_FromONNX_GPU_ALPAKA.hxx"
+#include "Trilu_3D_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Trilu_upper.ref.hxx"
+#include "input_models/references/Trilu_upper_input.ref.hxx"
+#include "input_models/references/Trilu_lower.ref.hxx"
+#include "input_models/references/Trilu_lower_input.ref.hxx"
+#include "input_models/references/Trilu_k2.ref.hxx"
+#include "input_models/references/Trilu_k2_input.ref.hxx"
+#include "input_models/references/Trilu_kn1.ref.hxx"
+#include "input_models/references/Trilu_kn1_input.ref.hxx"
+#include "input_models/references/Trilu_3D.ref.hxx"
+#include "input_models/references/Trilu_3D_input.ref.hxx"
+// ── Logic ───────────────────────────────────────────────────────────────────
+#include "Logic_And_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_Or_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_Xor_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_BitwiseAnd_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_BitwiseOr_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_BitwiseXor_FromONNX_GPU_ALPAKA.hxx"
+#include "Logic_BitwiseNot_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Logic_And.ref.hxx"
+#include "input_models/references/Logic_And_input.ref.hxx"
+#include "input_models/references/Logic_Or.ref.hxx"
+#include "input_models/references/Logic_Or_input.ref.hxx"
+#include "input_models/references/Logic_Xor.ref.hxx"
+#include "input_models/references/Logic_Xor_input.ref.hxx"
+#include "input_models/references/Logic_BitwiseAnd.ref.hxx"
+#include "input_models/references/Logic_BitwiseAnd_input.ref.hxx"
+#include "input_models/references/Logic_BitwiseOr.ref.hxx"
+#include "input_models/references/Logic_BitwiseOr_input.ref.hxx"
+#include "input_models/references/Logic_BitwiseXor.ref.hxx"
+#include "input_models/references/Logic_BitwiseXor_input.ref.hxx"
+#include "input_models/references/Logic_BitwiseNot.ref.hxx"
+#include "input_models/references/Logic_BitwiseNot_input.ref.hxx"
+// ─────────────────────────────────────────────────────────────────────────
+
+#include "Linear_64_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Linear_64.ref.hxx"
+
+#include "AddBroadcast1_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/AddBroadcast1.ref.hxx"
+
+#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithLeakyRelu.ref.hxx"
+
+#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/LinearWithSigmoid.ref.hxx"
+
+#include "Transpose_FromONNX_GPU_ALPAKA.hxx"
+
+#include "Concat_0D_FromONNX_GPU_ALPAKA.hxx"
+#include "ScatterElements_FromONNX_GPU_ALPAKA.hxx"
+
+#include "Split_0_FromONNX_GPU_ALPAKA.hxx"
+#include "Split_1_FromONNX_GPU_ALPAKA.hxx"
+#include "Split_2_FromONNX_GPU_ALPAKA.hxx"
+
+#include "Tile5D_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Tile5D.ref.hxx"
+
+#include "GatherAxis0_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherAxis1_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherAxis2_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherAxis3_FromONNX_GPU_ALPAKA.hxx"
+#include "Gather2d_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherNegativeIndices_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/GatherAxis0.ref.hxx"
+#include "input_models/references/GatherAxis1.ref.hxx"
+#include "input_models/references/GatherAxis2.ref.hxx"
+#include "input_models/references/GatherAxis3.ref.hxx"
+#include "input_models/references/Gather2d.ref.hxx"
+#include "input_models/references/GatherNegativeIndices.ref.hxx"
+
+#include "ExpandSameSize_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ExpandSameSize.ref.hxx"
+
+#include "ExpandDiffSize_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ExpandDiffSize.ref.hxx"
+
+#include "GatherND_Ex1_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_Ex2_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_Ex3_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_Ex4_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_Ex5_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx"
+#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx"
+
+#include "Equal_FromONNX_GPU_ALPAKA.hxx"
+#include "LessOrEqual_FromONNX_GPU_ALPAKA.hxx"
+#include "GreaterOrEqual_FromONNX_GPU_ALPAKA.hxx"
+#include "Greater_FromONNX_GPU_ALPAKA.hxx"
+#include "Less_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Equal.ref.hxx"
+#include "input_models/references/LessOrEqual.ref.hxx"
+#include "input_models/references/GreaterOrEqual.ref.hxx"
+#include "input_models/references/Greater.ref.hxx"
+#include "input_models/references/Less.ref.hxx"
+
+#include "Slice_FromONNX_GPU_ALPAKA.hxx"
+#include "Slice_Default_Axis_FromONNX_GPU_ALPAKA.hxx"
+#include "Slice_Default_Steps_FromONNX_GPU_ALPAKA.hxx"
+#include "Slice_Neg_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Slice.ref.hxx"
+#include "input_models/references/Slice_Default_Axis.ref.hxx"
+#include "input_models/references/Slice_Default_Steps.ref.hxx"
+#include "input_models/references/Slice_Neg.ref.hxx"
+
+#include "Sin_FromONNX_GPU_ALPAKA.hxx"
+#include "Cos_FromONNX_GPU_ALPAKA.hxx"
+#include "Abs_FromONNX_GPU_ALPAKA.hxx"
+#include "Sqrt_FromONNX_GPU_ALPAKA.hxx"
+#include "Reciprocal_FromONNX_GPU_ALPAKA.hxx"
+#include "Exp_FromONNX_GPU_ALPAKA.hxx"
+#include "Log_FromONNX_GPU_ALPAKA.hxx"
+#include "Neg_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/Sqrt.ref.hxx"
+#include "input_models/references/Reciprocal.ref.hxx"
+#include "input_models/references/Exp.ref.hxx"
+#include "input_models/references/Log.ref.hxx"
+#include "input_models/references/Neg.ref.hxx"
+
+#include "Where_FromONNX_GPU_ALPAKA.hxx"
+
+#include "Softplus_FromONNX_GPU_ALPAKA.hxx"
+
+#include "ReduceMean_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceProd_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceSum_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceSumSquare_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceL2_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceL2Large_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceMax_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceMax_axis0_FromONNX_GPU_ALPAKA.hxx"
+#include "ReduceMax_mid_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ReduceMean.ref.hxx"
+#include "input_models/references/ReduceProd.ref.hxx"
+#include "input_models/references/ReduceL2.ref.hxx"
+#include "input_models/references/ReduceMax.ref.hxx"
+#include "input_models/references/ReduceMax_axis0.ref.hxx"
+#include "input_models/references/ReduceMax_mid.ref.hxx"
+
+#include "ConvWithPadding_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithPadding.ref.hxx"
+
+#include "ConvWithoutPadding_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithoutPadding.ref.hxx"
+
+#include "ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithAutopadSameLower.ref.hxx"
+
+#include "ConvWithStridesPadding_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithStridesPadding.ref.hxx"
+
+#include "ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithStridesNoPadding.ref.hxx"
+
+#include "ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.hxx"
+#include "input_models/references/ConvWithAsymmetricPadding.ref.hxx"
+
+#include "BatchNorm_FromONNX_GPU_ALPAKA.hxx"
+#include "BatchNormRelu_FromONNX_GPU_ALPAKA.hxx"
+
+#include "LayerNorm_FromONNX_GPU_ALPAKA.hxx"
+#include "LayerNormScaleBias_FromONNX_GPU_ALPAKA.hxx"
+#include "LayerNorm3D_FromONNX_GPU_ALPAKA.hxx"
+
+#include "IsInf_FromONNX_GPU_ALPAKA.hxx"
+#include "IsNaN_FromONNX_GPU_ALPAKA.hxx"
+#include "Clip_FromONNX_GPU_ALPAKA.hxx"
+#include "Not_FromONNX_GPU_ALPAKA.hxx"
+
+#include "GNN_model_FromONNX_GPU_ALPAKA.hxx"
+
+#include <alpaka/alpaka.hpp>
+#include <cuda_runtime.h>
+#include <nvml.h>
+#include "gtest/gtest.h"
+
+constexpr float DEFAULT_TOLERANCE = 1e-3f;
+
+using Idx = std::size_t;
+using Dim = alpaka::DimInt<1>;
+using Ext1D = alpaka::Vec<Dim, Idx>;
+
+class SofieAlpakaTest : public ::testing::Test {  // GoogleTest fixture used by the TEST_F cases in this file
+protected:
+    // Host and CUDA platform/device handles, plus the queue the test bodies enqueue copies on
+    alpaka::PlatformCpu hostPlatform;
+    alpaka::DevCpu host;
+    alpaka::PlatformCudaRt platform;
+    alpaka::DevCudaRt device;
+    alpaka::Queue<alpaka::DevCudaRt, alpaka::NonBlocking> queue;  // non-blocking; tests call alpaka::wait(queue)
+    // Binds `host` and `device` to index 0 of their platforms and creates `queue` on `device`.
+    SofieAlpakaTest() 
+        : hostPlatform{}
+        , host(alpaka::getDevByIdx(hostPlatform, 0u))
+        , platform{}
+        , device(alpaka::getDevByIdx(platform, 0u))
+        , queue(device)
+    {
+    }
+    // Before each test: device-wide synchronisation. NOTE(review): the returned cudaError_t is ignored.
+    void SetUp() override {
+        cudaDeviceSynchronize();
+    }
+    // After each test: wait for the fixture queue, then synchronise the whole device.
+    void TearDown() override {
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+    }
+    // Synchronises the device once more before the members above are destroyed.
+    ~SofieAlpakaTest() override {
+        cudaDeviceSynchronize();
+    }
+};
+
+
+TEST_F(SofieAlpakaTest, Linear64)
+{
+   // Feeds 6400 ones to the Linear_64 session and compares 640 outputs with Linear_64_ExpectedOutput::all_ones.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   constexpr Idx kNumInputs = 6400;
+   constexpr Idx kNumOutputs = 640;
+
+   // Stage the input on the host.
+   auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+   float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+   for (float* p = hostInputData; p != hostInputData + kNumInputs; ++p)
+      *p = 1.0;
+
+   // Copy it to the device and wait for the copy to finish.
+   auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+   alpaka::memcpy(queue, deviceInput, hostInput);
+   alpaka::wait(queue);
+
+   auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumOutputs));
+   {
+      // The session lives only in this scope; the result is copied back before the scope ends.
+      SOFIE_Linear_64::Session<alpaka::TagGpuCudaRt> session("Linear_64_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(deviceInput);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOutput, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+   float* correct = Linear_64_ExpectedOutput::all_ones;
+   for (size_t i = 0; i != kNumOutputs; ++i)
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, LinearWithLeakyRelu)
+{
+   // Feeds the fixed values below to the LinearWithLeakyRelu session and compares 24 outputs with the reference.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({
+      0.4369, -0.6882,  1.0309, -1.0263, -0.1519,  1.2237, -0.7054, -0.1762,
+      -0.6811, -2.2597,  1.0388, -0.7993,  0.1468,  1.3257, -0.4714, -0.0958,
+      0.7057, -0.3749, -0.3310,  0.0986, -0.1370,  0.0832, -1.6465, -0.2793
+   });
+   const Idx inputCount = input.size();
+   constexpr Idx kNumOutputs = 24;
+
+   // Host staging buffer, filled from the vector above.
+   auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+   float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+   for (Idx k = 0; k != inputCount; ++k)
+      hostInputData[k] = input[k];
+
+   // Device copy of the input.
+   auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+   alpaka::memcpy(queue, deviceInput, hostInput);
+   alpaka::wait(queue);
+
+   auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumOutputs));
+   {
+      // This session is default-constructed (no file name argument).
+      SOFIE_LinearWithLeakyRelu::Session<alpaka::TagGpuCudaRt> session;
+      auto result = session.infer(deviceInput);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOutput, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+   float* correct = LinearWithLeakyRelu_ExpectedOutput::outputs;
+   for (size_t i = 0; i != kNumOutputs; ++i)
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, LinearWithSigmoid)
+{
+   // Feeds 48 ones to the LinearWithSigmoid session and compares 24 outputs with the stored reference.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   constexpr Idx kNumInputs = 48;
+   constexpr Idx kNumOutputs = 24;
+
+   // Stage the input on the host.
+   auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+   float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+   for (float* p = hostInputData; p != hostInputData + kNumInputs; ++p)
+      *p = 1.0;
+
+   // Copy it to the device and wait for the copy to finish.
+   auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+   alpaka::memcpy(queue, deviceInput, hostInput);
+   alpaka::wait(queue);
+
+   auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumOutputs));
+   {
+      // The session lives only in this scope; the result is copied back before the scope ends.
+      SOFIE_LinearWithSigmoid::Session<alpaka::TagGpuCudaRt> session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(deviceInput);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOutput, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+   float* correct = LinearWithSigmoid_ExpectedOutput::all_ones;
+   for (size_t i = 0; i != kNumOutputs; ++i)
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, AddBroadcast1)
+{
+   // Feeds a 5-element and a 20-element input to the AddBroadcast1 session; 20 outputs are compared.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   constexpr Idx kSizeA = 5;
+   constexpr Idx kSizeB = 20;
+   constexpr Idx kNumOutputs = 20;
+
+   std::vector<float> A_vec({-0.78023305, -1.34029483, -3.01482951, 0.53641361,
+                 -1.22594789});
+   std::vector<float> B_vec({1.0626695,  0.43842875,  1.22476468,  0.79763274,  0.98688211,
+                 0.25267614, 0.44874883,  0.31516773,  -0.78771195, 0.64565664,
+                 0.50450593, -0.41265227, -0.22474539, -0.22362374, 0.00509674,
+                 0.16927211, 1.06756969,  -0.81634773, 0.88467744,  0.78902059});
+
+   // Host staging buffers, filled element by element from the vectors above.
+   auto hostA = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kSizeA));
+   auto hostB = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kSizeB));
+   float* const hostAData = reinterpret_cast<float*>(alpaka::getPtrNative(hostA));
+   float* const hostBData = reinterpret_cast<float*>(alpaka::getPtrNative(hostB));
+
+   for (Idx k = 0; k != A_vec.size(); ++k)
+      hostAData[k] = A_vec[k];
+   for (Idx k = 0; k != B_vec.size(); ++k)
+      hostBData[k] = B_vec[k];
+
+   // Upload A, then B; each copy is awaited before the next step.
+   auto deviceA = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kSizeA));
+   alpaka::memcpy(queue, deviceA, hostA);
+   alpaka::wait(queue);
+
+   auto deviceB = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kSizeB));
+   alpaka::memcpy(queue, deviceB, hostB);
+   alpaka::wait(queue);
+
+   auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumOutputs));
+   {
+      // The session lives only in this scope; the result is copied back before the scope ends.
+      SOFIE_AddBroadcast1::Session<alpaka::TagGpuCudaRt> session;
+      auto result = session.infer(deviceA, deviceB);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOutput, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+   float* correct = AddBroadcast1_ExpectedOutput::output;
+   for (size_t i = 0; i != kNumOutputs; ++i)
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Transpose)
+{
+    // Compares the Transpose session output with a host reference built by permuting axes with perm = {0, 2, 3, 1}.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    // Input shape: (2, 1, 3, 4) -> 24 elements
+    constexpr Idx inputSize = 24;
+    // Output shape: (2, 3, 4, 1) -> 24 elements
+    constexpr Idx outputSize = 24;
+
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{inputSize}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    std::vector<float> input_vec({
+        // shape (2, 1, 3, 4)
+        0.f,  1.f,  2.f,  3.f,
+        4.f,  5.f,  6.f,  7.f,
+        8.f,  9.f, 10.f, 11.f,
+
+       12.f, 13.f, 14.f, 15.f,
+       16.f, 17.f, 18.f, 19.f,
+       20.f, 21.f, 22.f, 23.f
+    });
+
+    for (Idx i = 0; i < inputSize; ++i)
+        input_ptr[i] = input_vec[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{inputSize}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_Transpose::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    // Row-major strides are derived from the shapes so the two cannot drift apart.
+    const std::vector<size_t> inputShape  = {2, 1, 3, 4};
+    const std::vector<size_t> perm        = {0, 2, 3, 1};
+    const std::vector<size_t> outputShape = {2, 3, 4, 1};
+    std::vector<size_t> inputStrides(4, 1);
+    std::vector<size_t> outputStrides(4, 1);
+    for (size_t d = 3; d-- > 0;) {
+        inputStrides[d]  = inputStrides[d + 1] * inputShape[d + 1];
+        outputStrides[d] = outputStrides[d + 1] * outputShape[d + 1];
+    }
+
+    // Output axis d is input axis perm[d]: split each flat output index into coordinates, then rebuild the input index.
+    std::vector<float> expected(outputSize);
+    for (size_t i = 0; i < outputSize; ++i) {
+        size_t remaining = i;
+        size_t inputIdx  = 0;
+        for (size_t d = 0; d < 4; ++d) {
+            size_t const coord = remaining / outputStrides[d];
+            remaining         -= coord * outputStrides[d];
+            inputIdx          += coord * inputStrides[perm[d]];
+        }
+        expected[i] = input_vec[inputIdx];
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Concat0D)
+{
+   // Feeds two floats to the Concat_0D session; the expected output is those two values twice.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   std::vector<float> input({1.40519865e+00, -2.87660856e-01});
+   std::vector<float> expected_output({
+      1.40519865e+00, -2.87660856e-01,
+      1.40519865e+00, -2.87660856e-01
+   });
+   const Idx inputCount = input.size();
+   const Idx outputCount = expected_output.size();
+
+   // Stage the input in host memory.
+   auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+   float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+   for (Idx k = 0; k != inputCount; ++k) {
+      hostInputData[k] = input[k];
+   }
+
+   // Mirror it on the device.
+   auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+   alpaka::memcpy(queue, deviceInput, hostInput);
+   alpaka::wait(queue);
+
+   // Host buffer receiving the session output.
+   auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(outputCount));
+   {
+      // The session is constructed with the name of the generated .dat file.
+      SOFIE_Concat_0D::Session<alpaka::TagGpuCudaRt> session("Concat_0D_FromONNX_GPU_ALPAKA.dat");
+      auto result = session.infer(deviceInput);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+
+      alpaka::memcpy(queue, hostOutput, result);
+      alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+   for (size_t i = 0; i != outputCount; ++i)
+      EXPECT_LE(std::abs(res_ptr[i] - expected_output[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, ScatterElements)
+{
+    // Feeds a zero-filled 9-element input, six indices and six updates to the ScatterElements session.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float>   input   (9, 0.f);
+    std::vector<int64_t> indices = { 1, 0, 2, 0, 2, 1 };
+    std::vector<float>   updates = { 1.f, 1.1f, 1.2f, 2.f, 2.1f, 2.2f };
+    std::vector<float>   correct = { 2.f, 1.1f, 0.f, 1.f, 0.f, 2.2f, 0.f, 2.1f, 1.2f };
+
+    // Host staging: one buffer per session input, each filled right after it is allocated.
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != input.size(); ++k)
+        hostInputData[k] = input[k];
+
+    auto hostIndices = alpaka::allocBuf<int64_t, Idx>(host, Ext1D::all(Idx{indices.size()}));
+    int64_t* const hostIndicesData = reinterpret_cast<int64_t*>(alpaka::getPtrNative(hostIndices));
+    for (Idx k = 0; k != indices.size(); ++k)
+        hostIndicesData[k] = indices[k];
+
+    auto hostUpdates = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{updates.size()}));
+    float* const hostUpdatesData = reinterpret_cast<float*>(alpaka::getPtrNative(hostUpdates));
+    for (Idx k = 0; k != updates.size(); ++k)
+        hostUpdatesData[k] = updates[k];
+
+    // Device mirrors; the three uploads are enqueued together and awaited once.
+    auto deviceInput   = alpaka::allocBuf<float,   Idx>(device, Ext1D::all(Idx{input.size()}));
+    auto deviceIndices = alpaka::allocBuf<int64_t, Idx>(device, Ext1D::all(Idx{indices.size()}));
+    auto deviceUpdates = alpaka::allocBuf<float,   Idx>(device, Ext1D::all(Idx{updates.size()}));
+
+    alpaka::memcpy(queue, deviceInput,   hostInput);
+    alpaka::memcpy(queue, deviceIndices, hostIndices);
+    alpaka::memcpy(queue, deviceUpdates, hostUpdates);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct.size()}));
+    {
+        SOFIE_ScatterElements::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(deviceInput, deviceIndices, deviceUpdates);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    EXPECT_EQ(correct.size(), 9u);
+    for (size_t i = 0; i != correct.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Split_0)
+{
+    // split in axis 0 in 2 tensors {2,2,3} -> {1,2,3} each
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.};
+    std::vector<std::vector<float>> correct_output = { {1.,2.,3.,4.,5.,6.}, {7.,8.,9.,10.,11.,12.} };
+    const Idx inputCount = input.size();
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != inputCount; ++k)
+        hostInputData[k] = input[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    // One host buffer per session output, sized from the expected vectors.
+    auto hostFirst  = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[0].size()}));
+    auto hostSecond = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[1].size()}));
+    {
+        SOFIE_Split_0::Session<alpaka::TagGpuCudaRt> session;
+        auto [result0, result1] = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostFirst, result0);
+        alpaka::memcpy(queue, hostSecond, result1);
+        alpaka::wait(queue);
+    }
+
+    float* res0_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostFirst));
+    float* res1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostSecond));
+    for (size_t j = 0; j != correct_output[0].size(); ++j)
+        EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE);
+    for (size_t j = 0; j != correct_output[1].size(); ++j)
+        EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Split_1)
+{
+    // split in axis 1 in 2 tensors {2,2,3} -> {2,1,3} each
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.};
+    std::vector<std::vector<float>> correct_output = { {1.,2.,3.,7.,8.,9.}, {4.,5.,6.,10.,11.,12.} };
+    const Idx inputCount = input.size();
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != inputCount; ++k)
+        hostInputData[k] = input[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    // One host buffer per session output, sized from the expected vectors.
+    auto hostFirst  = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[0].size()}));
+    auto hostSecond = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[1].size()}));
+    {
+        SOFIE_Split_1::Session<alpaka::TagGpuCudaRt> session;
+        auto [result0, result1] = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostFirst, result0);
+        alpaka::memcpy(queue, hostSecond, result1);
+        alpaka::wait(queue);
+    }
+
+    float* res0_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostFirst));
+    float* res1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostSecond));
+    for (size_t j = 0; j != correct_output[0].size(); ++j)
+        EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE);
+    for (size_t j = 0; j != correct_output[1].size(); ++j)
+        EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Split_2)
+{
+    // split in axis 2 in 2 tensors {2,2,3} -> {2,2,2} and {2,2,1}
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.};
+    std::vector<std::vector<float>> correct_output = { {1.,2.,4.,5.,7.,8.,10.,11.}, {3.,6.,9.,12.} };
+    const Idx inputCount = input.size();
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != inputCount; ++k)
+        hostInputData[k] = input[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    // outputs have different sizes: {2,2,2}=8 and {2,2,1}=4
+    auto hostFirst  = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[0].size()}));
+    auto hostSecond = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct_output[1].size()}));
+
+    {
+        SOFIE_Split_2::Session<alpaka::TagGpuCudaRt> session;
+        auto [result0, result1] = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostFirst, result0);
+        alpaka::memcpy(queue, hostSecond, result1);
+        alpaka::wait(queue);
+    }
+
+    float* res0_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostFirst));
+    float* res1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostSecond));
+    for (size_t j = 0; j != correct_output[0].size(); ++j)
+        EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE);
+    for (size_t j = 0; j != correct_output[1].size(); ++j)
+        EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Tile5D)
+{
+    // Runs the Tile5D session on the values below and compares with Tile5D_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input_data({
+        0.2386120855808258,   0.5549510717391968,   -1.8190287351608276,  0.5724563598632812,   -0.6596977710723877,
+        0.17560836672782898,  0.7608169317245483,    0.08603227883577347, -0.049375515431165695,  0.2705111503601074,
+        1.42119562625885,     0.032626643776893616, -1.212586522102356,   -0.5129594802856445,   -0.43296414613723755,
+       -0.1606937050819397,   1.1884371042251587,   -0.662174642086029,   -2.291109323501587,    -0.6852569580078125,
+        2.325223922729492,   -0.19389064610004425,  -0.5784135460853577,  -0.39328137040138245,   0.2831517457962036,
+        0.4496127665042877,  -0.2029038816690445,    0.35477763414382935,  0.4266718924045563,    0.24683749675750732,
+        1.90426504611969,    -0.4861580729484558,    0.9139055013656616,  -0.5031066536903381,    0.9583520293235779,
+       -0.23210509121418,     1.3183971643447876,    1.7042455673217773,  -0.3201166093349457,   -0.14444805681705475,
+       -0.8829464912414551,   1.725736141204834,     0.45657631754875183,  0.4920198321342468,   -1.088847041130066,
+        0.49437597393989563, -0.006085286382585764,  2.475630760192871,    0.12170185893774033,  -0.8953945636749268,
+        1.1430096626281738,   1.3278610706329346,    0.3076854348182678,   0.036237504333257675,  0.05180325731635094,
+        0.2802475392818451,   0.5289335250854492,    0.9356630444526672,   0.7863689064979553,    0.4239695370197296,
+        0.8723016977310181,  -0.2248474359512329,    0.3891502320766449,   0.5463842153549194,   -0.7782878875732422,
+       -0.8570080399513245,  -2.593783378601074,    -0.11392943561077118,  0.5637082457542419,    2.075004816055298,
+       -1.0598397254943848,   1.0823975801467896
+    });
+
+    const std::size_t inputSize  = input_data.size();
+    const std::size_t outputSize = sizeof(Tile5D_ExpectedOutput::output) / sizeof(float);
+
+    // Allocate and fill host input buffer
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{inputSize}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < inputSize; ++i)
+        input_ptr[i] = input_data[i];
+
+    // Copy to device
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{inputSize}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // Host result buffer
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_Tile5D::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(input_d);
+        // Queue wait followed by a device sync, matching the other tests in this file.
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr   = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    float* correct   = Tile5D_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(Tile5D_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherAxis0)
+{
+    // Input is the ramp 0, 1, ..., 119; the output is compared with GatherAxis0_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 120;
+    const std::size_t outputSize = sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_GatherAxis0::Session<alpaka::TagGpuCudaRt> session("GatherAxis0_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = GatherAxis0_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherAxis1)
+{
+    // Input is the ramp 0, 1, ..., 119; the output is compared with GatherAxis1_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 120;
+    const std::size_t outputSize = sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_GatherAxis1::Session<alpaka::TagGpuCudaRt> session("GatherAxis1_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = GatherAxis1_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherAxis2)
+{
+    // Input is the ramp 0, 1, ..., 119; the output is compared with GatherAxis2_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 120;
+    const std::size_t outputSize = sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_GatherAxis2::Session<alpaka::TagGpuCudaRt> session("GatherAxis2_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = GatherAxis2_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherAxis3)
+{
+    // Input is the ramp 0, 1, ..., 119; the output is compared with GatherAxis3_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 120;
+    const std::size_t outputSize = sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_GatherAxis3::Session<alpaka::TagGpuCudaRt> session("GatherAxis3_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = GatherAxis3_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, Gather2d)
+{
+    // Input is the ramp 0, 1, ..., 8; the output is compared with Gather2d_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 9;
+    const std::size_t outputSize = sizeof(Gather2d_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_Gather2d::Session<alpaka::TagGpuCudaRt> session("Gather2d_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = Gather2d_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(Gather2d_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherNegativeIndices)
+{
+    // Input is the ramp 0, 1, ..., 9; the output is compared with GatherNegativeIndices_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr Idx kNumInputs = 10;
+    const std::size_t outputSize = sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(kNumInputs));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    float next = 0.f;
+    for (Idx k = 0; k != kNumInputs; ++k, ++next)
+        hostInputData[k] = next;
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(kNumInputs));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_GatherNegativeIndices::Session<alpaka::TagGpuCudaRt> session("GatherNegativeIndices_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = GatherNegativeIndices_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, ExpandSameSize)
+{
+    // Feeds {0, 1, 2} to the ExpandSameSize session and compares with ExpandSameSize_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input({0.f, 1.f, 2.f});
+    const Idx inputCount = input.size();
+    const std::size_t outputSize = sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != inputCount; ++k)
+        hostInputData[k] = input[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_ExpandSameSize::Session<alpaka::TagGpuCudaRt> session("ExpandSameSize_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = ExpandSameSize_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, ExpandDiffSize)
+{
+    // Feeds {0, 1, 2} to the ExpandDiffSize session and compares with ExpandDiffSize_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> input({0.f, 1.f, 2.f});
+    const Idx inputCount = input.size();
+    const std::size_t outputSize = sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float);
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(inputCount));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != inputCount; ++k)
+        hostInputData[k] = input[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(inputCount));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_ExpandDiffSize::Session<alpaka::TagGpuCudaRt> session("ExpandDiffSize_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    float* correct = ExpandDiffSize_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i != outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE);
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Ex1)
+{
+    // Feeds {0, 1, 2, 3} to the GatherND_Ex1 session; the expected output is {0, 3}.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> data     = {0.f, 1.f, 2.f, 3.f};
+    std::vector<float> expected = {0.f, 3.f};
+
+    auto hostInput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* const hostInputData = reinterpret_cast<float*>(alpaka::getPtrNative(hostInput));
+    for (Idx k = 0; k != data.size(); ++k)
+        hostInputData[k] = data[k];
+
+    auto deviceInput = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, deviceInput, hostInput);
+    alpaka::wait(queue);
+    auto hostOutput = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+    {
+        SOFIE_GatherND_Ex1::Session<alpaka::TagGpuCudaRt> session("GatherND_Ex1_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(deviceInput);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOutput, result);
+        alpaka::wait(queue);
+    }
+
+    float* res = reinterpret_cast<float*>(alpaka::getPtrNative(hostOutput));
+    ASSERT_EQ(expected.size(), 2u);
+    for (size_t i = 0; i != expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Ex2)
+{
+    // Runs SOFIE_GatherND_Ex2 (instantiated with alpaka::TagGpuCudaRt) on a
+    // fixed input and compares the output with hard-coded reference values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> data     = {0.f, 1.f, 2.f, 3.f};
+    const std::vector<float> expected = {2.f, 3.f, 0.f, 1.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_Ex2::Session<alpaka::TagGpuCudaRt> session("GatherND_Ex2_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 4u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Ex3)
+{
+    // Runs SOFIE_GatherND_Ex3 (instantiated with alpaka::TagGpuCudaRt) on a
+    // fixed input and compares the output with hard-coded reference values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> data     = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f};
+    const std::vector<float> expected = {2.f, 3.f, 4.f, 5.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_Ex3::Session<alpaka::TagGpuCudaRt> session("GatherND_Ex3_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 4u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Ex4)
+{
+    // Runs SOFIE_GatherND_Ex4 (instantiated with alpaka::TagGpuCudaRt) on a
+    // fixed input and compares the output with hard-coded reference values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> data     = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f};
+    const std::vector<float> expected = {2.f, 3.f, 4.f, 5.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_Ex4::Session<alpaka::TagGpuCudaRt> session("GatherND_Ex4_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 4u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Ex5)
+{
+    // Runs SOFIE_GatherND_Ex5 (instantiated with alpaka::TagGpuCudaRt) on a
+    // fixed input and compares the output with hard-coded reference values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> data     = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f};
+    const std::vector<float> expected = {2.f, 3.f, 4.f, 5.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_Ex5::Session<alpaka::TagGpuCudaRt> session("GatherND_Ex5_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 4u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_NegativeIndices)
+{
+    // Runs SOFIE_GatherND_NegativeIndices (instantiated with
+    // alpaka::TagGpuCudaRt) on a fixed input and compares the output with
+    // hard-coded reference values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> data     = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f};
+    const std::vector<float> expected = {6.f, 2.f, 4.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_NegativeIndices::Session<alpaka::TagGpuCudaRt> session("GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 3u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GatherND_Batch)
+{
+    // Runs SOFIE_GatherND_Batch (instantiated with alpaka::TagGpuCudaRt) on
+    // the values 0..23 and compares the output with hard-coded reference
+    // values.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    std::vector<float> data(24);
+    std::iota(data.begin(), data.end(), 0.f);
+    const std::vector<float> expected = {4.f,5.f,6.f,7.f, 20.f,21.f,22.f,23.f};
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{data.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < data.size(); ++i)
+        input_ptr[i] = data[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{data.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{expected.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GatherND_Batch::Session<alpaka::TagGpuCudaRt> session("GatherND_Batch_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(expected.size(), 8u);
+    for (std::size_t i = 0; i < expected.size(); ++i)
+        EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Equal)
+{
+    // Runs SOFIE_Equal (instantiated with alpaka::TagGpuCudaRt) on two fixed
+    // inputs and compares every output element with
+    // Equal_ExpectedOutput::outputs.
+    const std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    const std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool);
+
+    // Stage both operands in host buffers.
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (std::size_t i = 0; i < input1.size(); ++i)
+        in1_ptr[i] = input1[i];
+    for (std::size_t i = 0; i < input2.size(); ++i)
+        in2_ptr[i] = input2[i];
+
+    // Copy both operands to the device.
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // The result is received on the host as one uint8_t per element and is
+    // compared with the bool reference array below.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Equal::Session<alpaka::TagGpuCudaRt> session("Equal_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    const bool* correct = Equal_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, LessOrEqual)
+{
+    // Runs SOFIE_LessOrEqual (instantiated with alpaka::TagGpuCudaRt) on two
+    // fixed inputs and compares every output element with
+    // LessOrEqual_ExpectedOutput::outputs.
+    const std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    const std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool);
+
+    // Stage both operands in host buffers.
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (std::size_t i = 0; i < input1.size(); ++i)
+        in1_ptr[i] = input1[i];
+    for (std::size_t i = 0; i < input2.size(); ++i)
+        in2_ptr[i] = input2[i];
+
+    // Copy both operands to the device.
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // The result is received on the host as one uint8_t per element and is
+    // compared with the bool reference array below.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_LessOrEqual::Session<alpaka::TagGpuCudaRt> session("LessOrEqual_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    const bool* correct = LessOrEqual_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, GreaterOrEqual)
+{
+    // Runs SOFIE_GreaterOrEqual (instantiated with alpaka::TagGpuCudaRt) on
+    // two fixed inputs and compares every output element with
+    // GreaterOrEqual_ExpectedOutput::outputs.
+    const std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    const std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool);
+
+    // Stage both operands in host buffers.
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (std::size_t i = 0; i < input1.size(); ++i)
+        in1_ptr[i] = input1[i];
+    for (std::size_t i = 0; i < input2.size(); ++i)
+        in2_ptr[i] = input2[i];
+
+    // Copy both operands to the device.
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // The result is received on the host as one uint8_t per element and is
+    // compared with the bool reference array below.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_GreaterOrEqual::Session<alpaka::TagGpuCudaRt> session("GreaterOrEqual_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    const bool* correct = GreaterOrEqual_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Greater)
+{
+    // Runs SOFIE_Greater (instantiated with alpaka::TagGpuCudaRt) on two
+    // fixed inputs and compares every output element with
+    // Greater_ExpectedOutput::outputs.
+    const std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    const std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool);
+
+    // Stage both operands in host buffers.
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (std::size_t i = 0; i < input1.size(); ++i)
+        in1_ptr[i] = input1[i];
+    for (std::size_t i = 0; i < input2.size(); ++i)
+        in2_ptr[i] = input2[i];
+
+    // Copy both operands to the device.
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // The result is received on the host as one uint8_t per element and is
+    // compared with the bool reference array below.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Greater::Session<alpaka::TagGpuCudaRt> session("Greater_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    const bool* correct = Greater_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Less)
+{
+    // Runs SOFIE_Less (instantiated with alpaka::TagGpuCudaRt) on two fixed
+    // inputs and compares every output element with
+    // Less_ExpectedOutput::outputs.
+    const std::vector<float> input1 = {1.0f, 2.0f, 3.0f};
+    const std::vector<float> input2 = {4.0f, 2.0f, 6.0f};
+    const std::size_t outputSize = sizeof(Less_ExpectedOutput::outputs) / sizeof(bool);
+
+    // Stage both operands in host buffers.
+    auto input1_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    auto input2_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* in1_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input1_h));
+    float* in2_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input2_h));
+    for (std::size_t i = 0; i < input1.size(); ++i)
+        in1_ptr[i] = input1[i];
+    for (std::size_t i = 0; i < input2.size(); ++i)
+        in2_ptr[i] = input2[i];
+
+    // Copy both operands to the device.
+    auto input1_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    auto input2_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, input1_d, input1_h);
+    alpaka::memcpy(queue, input2_d, input2_h);
+    alpaka::wait(queue);
+
+    // The result is received on the host as one uint8_t per element and is
+    // compared with the bool reference array below.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Less::Session<alpaka::TagGpuCudaRt> session("Less_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input1_d, input2_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    const bool* correct = Less_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(Less_ExpectedOutput::outputs) / sizeof(bool));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Slice)
+{
+    // Feeds Slice::input through SOFIE_Slice (instantiated with
+    // alpaka::TagGpuCudaRt) and compares the output with Slice::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input = Slice::input;
+    const std::size_t outputSize = sizeof(Slice::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Slice::Session<alpaka::TagGpuCudaRt> session("Slice_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Slice::output;
+    EXPECT_EQ(outputSize, sizeof(Slice::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Slice_Default_Axis)
+{
+    // Feeds Slice_Default_Axis::input through SOFIE_Slice_Default_Axis
+    // (instantiated with alpaka::TagGpuCudaRt) and compares the output with
+    // Slice_Default_Axis::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input = Slice_Default_Axis::input;
+    const std::size_t outputSize = sizeof(Slice_Default_Axis::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Slice_Default_Axis::Session<alpaka::TagGpuCudaRt> session("Slice_Default_Axis_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Slice_Default_Axis::output;
+    EXPECT_EQ(outputSize, sizeof(Slice_Default_Axis::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Slice_Default_Steps)
+{
+    // Feeds Slice_Default_Steps::input through SOFIE_Slice_Default_Steps
+    // (instantiated with alpaka::TagGpuCudaRt) and compares the output with
+    // Slice_Default_Steps::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input = Slice_Default_Steps::input;
+    const std::size_t outputSize = sizeof(Slice_Default_Steps::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Slice_Default_Steps::Session<alpaka::TagGpuCudaRt> session("Slice_Default_Steps_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Slice_Default_Steps::output;
+    EXPECT_EQ(outputSize, sizeof(Slice_Default_Steps::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Slice_Neg)
+{
+    // Feeds Slice_Neg::input through SOFIE_Slice_Neg (instantiated with
+    // alpaka::TagGpuCudaRt) and compares the output with Slice_Neg::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input = Slice_Neg::input;
+    const std::size_t outputSize = sizeof(Slice_Neg::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Slice_Neg::Session<alpaka::TagGpuCudaRt> session("Slice_Neg_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Slice_Neg::output;
+    EXPECT_EQ(outputSize, sizeof(Slice_Neg::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Sin)
+{
+    // Runs SOFIE_Sin (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares each output element with std::sin of the matching
+    // input element computed on the host.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({
+        -0.786738f, -0.197796f, -0.187787f,  0.142758f,
+         0.876096f, -0.653239f,  0.145444f, -1.107658f,
+         2.259171f, -0.947054f, -0.506689f,  1.801250f
+    });
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // The host result buffer is sized to match the input.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Sin::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    EXPECT_EQ(input.size(), 12u);
+    for (std::size_t i = 0; i < input.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - std::sin(input[i])), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Cos)
+{
+    // Runs SOFIE_Cos (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares each output element with std::cos of the matching
+    // input element computed on the host.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({
+         1.152504f, -1.459324f,  0.691594f,  0.347690f,
+        -1.307323f,  1.832516f, -1.261772f,  0.014224f,
+         1.311477f,  1.147405f, -0.567206f, -0.530606f
+    });
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // The host result buffer is sized to match the input.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Cos::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    EXPECT_EQ(input.size(), 12u);
+    for (std::size_t i = 0; i < input.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - std::cos(input[i])), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Abs)
+{
+    // Runs SOFIE_Abs (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares each output element with std::abs of the matching
+    // input element computed on the host.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({1.f, -2.f, -3.f, 4.f, -5.f, 6.f});
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // The host result buffer is sized to match the input.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Abs::Session<alpaka::TagGpuCudaRt> session;
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    EXPECT_EQ(input.size(), 6u);
+    for (std::size_t i = 0; i < input.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - std::abs(input[i])), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Sqrt)
+{
+    // Runs SOFIE_Sqrt (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares the output with Sqrt_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({0.8344f, 0.4716f, 0.6226f, 0.8448f, 0.2483f, 0.9467f});
+    const std::size_t outputSize = sizeof(Sqrt_ExpectedOutput::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Sqrt::Session<alpaka::TagGpuCudaRt> session("Sqrt_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Sqrt_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(Sqrt_ExpectedOutput::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Reciprocal)
+{
+    // Runs SOFIE_Reciprocal (instantiated with alpaka::TagGpuCudaRt) on a
+    // fixed input and compares the output with
+    // Reciprocal_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({1.2691f, -1.2160f, 0.6393f, -0.4438f, 0.8065f, 0.2011f});
+    const std::size_t outputSize = sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Reciprocal::Session<alpaka::TagGpuCudaRt> session("Reciprocal_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Reciprocal_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(Reciprocal_ExpectedOutput::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Exp)
+{
+    // Runs SOFIE_Exp (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares the output with Exp_ExpectedOutput::output.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({
+         1.46566453f,  0.63334515f,  2.4048165f,   0.54468453f,
+        -1.41271672f, -0.18609187f,  0.2754482f,   1.10615209f,
+         0.88474389f,  0.47531232f
+    });
+    const std::size_t outputSize = sizeof(Exp_ExpectedOutput::output) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Exp::Session<alpaka::TagGpuCudaRt> session("Exp_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Exp_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(Exp_ExpectedOutput::output) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Log)
+{
+    // Runs SOFIE_Log (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares the output with Log_ExpectedOutput::outputs.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({1.f, 2.f, 3.f, 4.f});
+    const std::size_t outputSize = sizeof(Log_ExpectedOutput::outputs) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Log::Session<alpaka::TagGpuCudaRt> session("Log_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Log_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(Log_ExpectedOutput::outputs) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Neg)
+{
+    // Runs SOFIE_Neg (instantiated with alpaka::TagGpuCudaRt) on a fixed
+    // input and compares the output with Neg_ExpectedOutput::outputs.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    const std::vector<float> input({
+        -1.9100f,  1.8811f, -1.7269f, -0.1094f,
+        -0.0145f,  0.2509f,  0.5893f, -2.2733f,
+        -0.7077f,  1.0645f, -0.8607f,  0.2085f
+    });
+    const std::size_t outputSize = sizeof(Neg_ExpectedOutput::outputs) / sizeof(float);
+
+    // Stage the input in a host buffer, then copy it to the device.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (std::size_t i = 0; i < input.size(); ++i)
+        input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        // The session only lives inside this scope; the output is copied to
+        // the host buffer before the session goes away.
+        SOFIE_Neg::Session<alpaka::TagGpuCudaRt> session("Neg_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+
+        // Abort on a reported device error instead of comparing whatever
+        // happens to be in the output buffer.
+        const cudaError_t syncStatus = cudaDeviceSynchronize();
+        ASSERT_EQ(syncStatus, cudaSuccess) << cudaGetErrorString(syncStatus);
+
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    const float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    const float* correct = Neg_ExpectedOutput::outputs;
+    EXPECT_EQ(outputSize, sizeof(Neg_ExpectedOutput::outputs) / sizeof(float));
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, Softplus)
+{
+    // Runs the generated Softplus model and compares it with log(exp(x) + 1) evaluated on the host.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input({0.1f, -0.2f, 0.3f, -0.4f, 0.5f, 1.f});
+
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    // The output is read back with the same element count as the input.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    {
+        SOFIE_Softplus::Session<alpaka::TagGpuCudaRt> session("Softplus_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    for (size_t i = 0; i < input.size(); ++i){
+        double exp_value = std::log(std::exp(input[i])+1);
+        EXPECT_LE(std::abs(res_ptr[i] - exp_value), TOLERANCE) << "i=" << i;  // report the failing element, like the sibling tests
+    }
+}
+
+TEST_F(SofieAlpakaTest, Where)
+{
+    std::vector<float> input1 = {1.f, 2.f};
+    std::vector<float> input2 = {3.f, 4.f, 5.f, 6.f};
+    std::vector<bool> cond_vec = {true, false, true};
+    std::vector<float> correct = {1.f, 2.f, 5.f, 6.f, 1.f, 2.f};
+    // First float operand: stage on the host, then queue the upload.
+    auto hostA = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input1.size()}));
+    float* hostAData = reinterpret_cast<float*>(alpaka::getPtrNative(hostA));
+    for (Idx k = 0; k < input1.size(); ++k) hostAData[k] = input1[k];
+
+    auto devA = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input1.size()}));
+    alpaka::memcpy(queue, devA, hostA);
+    // Second float operand.
+    auto hostB = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input2.size()}));
+    float* hostBData = reinterpret_cast<float*>(alpaka::getPtrNative(hostB));
+    for (Idx k = 0; k < input2.size(); ++k) hostBData[k] = input2[k];
+
+    auto devB = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input2.size()}));
+    alpaka::memcpy(queue, devB, hostB);
+    // Condition mask, stored as one uint8_t per element.
+    auto hostCond = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{cond_vec.size()}));
+    uint8_t* hostCondData = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(hostCond));
+    for (Idx k = 0; k < cond_vec.size(); ++k) hostCondData[k] = cond_vec[k];
+
+    auto devCond = alpaka::allocBuf<uint8_t, Idx>(device, Ext1D::all(Idx{cond_vec.size()}));
+    alpaka::memcpy(queue, devCond, hostCond);
+    // A single wait covers the three uploads queued above.
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct.size()}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_Where::Session<alpaka::TagGpuCudaRt> model("Where_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devA, devB, devCond);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Exact equality is used here rather than a tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    EXPECT_EQ(correct.size(), 6u);
+    for (size_t i = 0; i < correct.size(); ++i)
+        EXPECT_EQ(res_ptr[i], correct[i]) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, ReduceMean)
+{
+    // Runs the generated ReduceMean model on six values and checks it against ReduceMean_ExpectedOutput.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    const std::size_t outputSize = sizeof(ReduceMean_ExpectedOutput::output) / sizeof(ReduceMean_ExpectedOutput::output[0]);
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceMean::Session<alpaka::TagGpuCudaRt> model("ReduceMean_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference using an absolute tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceMean_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(ReduceMean_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, ReduceProd)
+{
+    // Runs the generated ReduceProd model on six values and checks it against ReduceProd_ExpectedOutput.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    const std::size_t outputSize = sizeof(ReduceProd_ExpectedOutput::output) / sizeof(ReduceProd_ExpectedOutput::output[0]);
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceProd::Session<alpaka::TagGpuCudaRt> model("ReduceProd_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference using an absolute tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceProd_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, sizeof(ReduceProd_ExpectedOutput::output) / sizeof(float));
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, ReduceSum)
+{
+    // The reference is the single value 24, i.e. the sum of all six inputs (5+2+3+5+5+4).
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    std::vector<float> correct = {24.f};
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct.size()}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceSum::Session<alpaka::TagGpuCudaRt> model("ReduceSum_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference using an absolute tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    EXPECT_EQ(correct.size(), 1u);
+    for (size_t i = 0; i < correct.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, ReduceSumSquare)
+{
+    // Reference {38, 66}: sums of squares of the first three (25+4+9) and last three (25+25+16) inputs.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    std::vector<float> correct = {38.f, 66.f};
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{correct.size()}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceSumSquare::Session<alpaka::TagGpuCudaRt> model("ReduceSumSquare_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference using an absolute tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    for (size_t i = 0; i < correct.size(); ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+// ReduceL2: input [1,2,3]={5,2,3,5,5,4}, reduce axis=1, keepdims=0 → [1,3]
+// Expected: {sqrt(50), sqrt(29), 5.0}
+TEST_F(SofieAlpakaTest, ReduceL2)
+{
+    // Runs the generated ReduceL2 model on six values; the reference array must hold three results.
+    constexpr float TOLERANCE = 1e-3f;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    const std::size_t outputSize = sizeof(ReduceL2_ExpectedOutput::output) / sizeof(ReduceL2_ExpectedOutput::output[0]);
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceL2::Session<alpaka::TagGpuCudaRt> model("ReduceL2_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference using an absolute tolerance.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceL2_ExpectedOutput::output;
+    EXPECT_EQ(outputSize, 3u);
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+}
+
+// ReduceL2Large: input [4,512], reduce axis=1, keepdims=0 → [4]
+// Row i is filled with (i+1), so L2 norm = (i+1)*sqrt(512).
+// This test exercises the 256-thread block reduction with reducedLength > BLOCK_SIZE.
+TEST_F(SofieAlpakaTest, ReduceL2Large)
+{
+    // TOLERANCE bounds the relative error here: the check below divides by the reference value.
+    constexpr float TOLERANCE = 1e-2f;  // slightly looser: large sum, float accumulation
+
+    constexpr std::size_t nrows = 4;
+    constexpr std::size_t ncols = 512;
+    const std::size_t inputSize = nrows * ncols;
+    const std::size_t outputSize = nrows;
+
+    // Row r (0-based) holds the constant r + 1 in every one of its ncols columns.
+    std::vector<float> input(inputSize);
+    for (std::size_t flat = 0; flat < inputSize; ++flat)
+        input[flat] = static_cast<float>(flat / ncols + 1);
+
+    // Reference L2 norm of row r: (r + 1) * sqrt(ncols).
+    const float rootCols = std::sqrt(static_cast<float>(ncols));
+    std::vector<float> correct(nrows);
+    for (std::size_t r = 0; r < nrows; ++r)
+        correct[r] = static_cast<float>(r + 1) * rootCols;
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{inputSize}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < inputSize; ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{inputSize}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_ReduceL2Large::Session<alpaka::TagGpuCudaRt> model;
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // One result per row, compared by relative error.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    EXPECT_EQ(outputSize, nrows);
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - correct[i]) / correct[i], TOLERANCE) << "row=" << i;
+}
+
+// ── ReduceMax: [1,2,3] axis=1 keepdims=0 (kLast path) ──────────────────────
+TEST_F(SofieAlpakaTest, ReduceMax)
+{
+    // Runs the generated ReduceMax model on six values and checks it against ReduceMax_ExpectedOutput.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {5.f, 2.f, 3.f, 5.f, 5.f, 4.f};
+    const std::size_t outputSize = sizeof(ReduceMax_ExpectedOutput::output) / sizeof(ReduceMax_ExpectedOutput::output[0]);
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (std::size_t k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_ReduceMax::Session<alpaka::TagGpuCudaRt> model;
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference within TOLERANCE.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceMax_ExpectedOutput::output;
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << "  i=" << i;
+}
+
+// ── ReduceMax_axis0: [3,4] axis=0 keepdims=0 (kFirst path) ─────────────────
+TEST_F(SofieAlpakaTest, ReduceMax_axis0)
+{
+    // Runs the generated ReduceMax_axis0 model on twelve values and checks it against the stored reference.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+    // Input values, per the author's note: numpy default_rng(42).standard_normal((3,4)), same seed/sequence as generator.
+    const std::size_t inputSize = 12;
+    const std::size_t outputSize = sizeof(ReduceMax_axis0_ExpectedOutput::output) / sizeof(ReduceMax_axis0_ExpectedOutput::output[0]);
+    float vals[] = { 0.30471709f, -1.03998411f,  0.75045121f,  0.94056469f,
+                    -1.95103514f, -1.30217946f,  0.12784040f, -0.31624261f,
+                    -0.01680116f, -0.85304391f,  0.87939799f,  0.77779192f};
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{inputSize}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (std::size_t k = 0; k < inputSize; ++k) hostInData[k] = vals[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{inputSize}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_ReduceMax_axis0::Session<alpaka::TagGpuCudaRt> model;
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference within TOLERANCE.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceMax_axis0_ExpectedOutput::output;
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << "  i=" << i;
+}
+
+// ── ReduceMax_mid: [2,3,4] axis=1 keepdims=0 (kMiddle path) ────────────────
+TEST_F(SofieAlpakaTest, ReduceMax_mid)
+{
+    // Runs the generated ReduceMax_mid model on 24 values and checks it against the stored reference.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    const std::size_t inputSize = 24;   // 2×3×4
+    const std::size_t outputSize = sizeof(ReduceMax_mid_ExpectedOutput::output) / sizeof(ReduceMax_mid_ExpectedOutput::output[0]);
+
+    // Input values, per the author's note: numpy default_rng(42).standard_normal((2,3,4)), same seed/sequence as generator.
+    float vals[] = { 0.06603070f,  1.12724125f,  0.46750933f, -0.85929245f,
+                     0.36875078f, -0.95888263f,  0.87845027f, -0.04992591f,
+                    -0.18486236f, -0.68092954f,  1.22254133f, -0.15452948f,
+                    -0.42832783f, -0.35213354f,  0.53230917f,  0.36544406f,
+                     0.41273260f,  0.43082100f,  2.14164758f, -0.40641502f,
+                    -0.51224273f, -0.81377274f,  0.61597943f,  1.12897229f};
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{inputSize}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (std::size_t k = 0; k < inputSize; ++k) hostInData[k] = vals[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{inputSize}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    {
+        SOFIE_ReduceMax_mid::Session<alpaka::TagGpuCudaRt> model;
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    // Check each element against the reference within TOLERANCE.
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    float* correct = ReduceMax_mid_ExpectedOutput::output;
+    for (std::size_t i = 0; i < outputSize; ++i)
+        EXPECT_NEAR(res_ptr[i], correct[i], TOLERANCE) << "  i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, ConvWithPadding)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   // Input is the ramp 0, 1, ..., 24 produced by std::iota (not all ones, despite the reference array's name).
+   std::vector<float> input(25);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+   // The reference array defines the output length for both the host buffer and the check loop.
+   constexpr size_t nOut_convPad = sizeof(ConvWithPadding_ExpectedOutput::all_ones) / sizeof(float);
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut_convPad}));
+   // The session only lives in this scope; the output is copied to the host before it is destroyed.
+   {
+        SOFIE_ConvWithPadding::Session<alpaka::TagGpuCudaRt> session("ConvWithPadding_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float *correct = ConvWithPadding_ExpectedOutput::all_ones;
+
+   for (size_t i = 0; i < nOut_convPad; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+
+TEST_F(SofieAlpakaTest, ConvWithoutPadding)
+{
+   // Runs the generated ConvWithoutPadding model and checks it against its stored reference array.
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   // Input is the ramp 0, 1, ..., 24 produced by std::iota.
+   std::vector<float> input(25);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+   for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+
+   // Copy the staged input to the device and wait for the transfer.
+   auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, devIn, hostIn);
+   alpaka::wait(queue);
+
+   // The reference array defines the output length.
+   constexpr size_t nOut = sizeof(ConvWithoutPadding_ExpectedOutput::all_ones) / sizeof(float);
+   auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut}));
+
+   // The session only lives in this scope; the output is copied to the host before it is destroyed.
+   {
+      SOFIE_ConvWithoutPadding::Session<alpaka::TagGpuCudaRt> model("ConvWithoutPadding_FromONNX_GPU_ALPAKA.dat");
+      auto devOut = model.infer(devIn);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOut, devOut);
+      alpaka::wait(queue);
+   }
+
+   // Check each element against the reference using an absolute tolerance.
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+   float *correct = ConvWithoutPadding_ExpectedOutput::all_ones;
+   for (size_t i = 0; i < nOut; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+
+TEST_F(SofieAlpakaTest, ConvWithAutopadSameLower)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+
+   // Input is the ramp 0, 1, ..., 24 produced by std::iota (not all ones, despite the reference array's name).
+   std::vector<float> input(25);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+   for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+
+   auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, input_d, input_h);
+   alpaka::wait(queue);
+
+   // The reference array defines the output length for both the host buffer and the check loop.
+   constexpr size_t nOut_sameLower = sizeof(ConvWithAutopadSameLower_ExpectedOutput::all_ones) / sizeof(float);
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut_sameLower}));
+
+   {
+        SOFIE_ConvWithAutopadSameLower::Session<alpaka::TagGpuCudaRt> session("ConvWithAutopadSameLower_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);
+   }
+
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float *correct = ConvWithAutopadSameLower_ExpectedOutput::all_ones;
+   for (size_t i = 0; i < nOut_sameLower; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+
+TEST_F(SofieAlpakaTest, ConvWithStridesPadding)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   // Input is the ramp 0, 1, ..., 34 produced by std::iota.
+   std::vector<float> input(35);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+   for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+
+   // Copy the staged input to the device and wait for the transfer.
+   auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, devIn, hostIn);
+   alpaka::wait(queue);
+
+   // The reference array defines the output length.
+   constexpr size_t nOut = sizeof(ConvWithStridesPadding_ExpectedOutput::all_ones) / sizeof(float);
+   auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut}));
+
+   // The session only lives in this scope; the output is copied to the host before it is destroyed.
+   {
+      SOFIE_ConvWithStridesPadding::Session<alpaka::TagGpuCudaRt> model("ConvWithStridesPadding_FromONNX_GPU_ALPAKA.dat");
+      auto devOut = model.infer(devIn);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOut, devOut);
+      alpaka::wait(queue);
+   }
+   // Check each element against the reference using an absolute tolerance.
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+   float *correct = ConvWithStridesPadding_ExpectedOutput::all_ones;
+   for (size_t i = 0; i < nOut; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+
+TEST_F(SofieAlpakaTest, ConvWithStridesNoPadding)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   // Input is the ramp 0, 1, ..., 34 produced by std::iota.
+   std::vector<float> input(35);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+   for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+
+   // Copy the staged input to the device and wait for the transfer.
+   auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, devIn, hostIn);
+   alpaka::wait(queue);
+
+   // The reference array defines the output length.
+   constexpr size_t nOut = sizeof(ConvWithStridesNoPadding_ExpectedOutput::all_ones) / sizeof(float);
+   auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut}));
+
+   // The session only lives in this scope; the output is copied to the host before it is destroyed.
+   {
+      SOFIE_ConvWithStridesNoPadding::Session<alpaka::TagGpuCudaRt> model("ConvWithStridesNoPadding_FromONNX_GPU_ALPAKA.dat");
+      auto devOut = model.infer(devIn);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOut, devOut);
+      alpaka::wait(queue);
+   }
+   // Check each element against the reference using an absolute tolerance.
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+   float *correct = ConvWithStridesNoPadding_ExpectedOutput::all_ones;
+   for (size_t i = 0; i < nOut; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+
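+// NOTE(review): this comment used to say the test is disabled (asymmetric padding not supported), but it is a plain TEST_F and will run; confirm intent.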
+TEST_F(SofieAlpakaTest, ConvWithAsymmetricPadding)
+{
+   constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+   // Input is the ramp 0, 1, ..., 34 produced by std::iota.
+   std::vector<float> input(35);
+   std::iota(input.begin(), input.end(), 0.0f);
+
+   auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+   float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+   for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+
+   // Copy the staged input to the device and wait for the transfer.
+   auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+   alpaka::memcpy(queue, devIn, hostIn);
+   alpaka::wait(queue);
+
+   // The reference array defines the output length.
+   constexpr size_t nOut = sizeof(ConvWithAsymmetricPadding_ExpectedOutput::all_ones) / sizeof(float);
+   auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{nOut}));
+
+   // The session only lives in this scope; the output is copied to the host before it is destroyed.
+   {
+      SOFIE_ConvWithAsymmetricPadding::Session<alpaka::TagGpuCudaRt> model("ConvWithAsymmetricPadding_FromONNX_GPU_ALPAKA.dat");
+      auto devOut = model.infer(devIn);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();
+      alpaka::memcpy(queue, hostOut, devOut);
+      alpaka::wait(queue);
+   }
+   // Check each element against the reference using an absolute tolerance.
+   float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+   float *correct = ConvWithAsymmetricPadding_ExpectedOutput::all_ones;
+   for (size_t i = 0; i < nOut; ++i) {
+      EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE) << "i=" << i;
+   }
+}
+
+TEST_F(SofieAlpakaTest, BatchNormalization)
+{
+    // Runs the generated BatchNorm model on two channels of four values and checks it against a host-side reference.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {
+        1.f, 2.f, 3.f, 4.f,   // channel 0
+        5.f, 6.f, 7.f, 8.f    // channel 1
+    };
+    const std::size_t outputSize = input.size();
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_BatchNorm::Session<alpaka::TagGpuCudaRt> model("BatchNorm_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    // Reference is x / sqrt(1 + 1e-5); presumably mean 0, variance 1, scale 1, bias 0 in the model (confirm).
+    float inv_std = 1.f / std::sqrt(1.f + 1e-5f);
+    ASSERT_EQ(outputSize, 8u);
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - input[i] * inv_std), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, BatchNormalizationRelu)
+{
+    // Runs the generated BatchNormRelu model on mixed-sign values and checks it against a host-side reference.
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    std::vector<float> input = {
+        -1.f,  2.f, -3.f,  4.f,
+         5.f, -6.f,  7.f, -8.f
+    };
+    const std::size_t outputSize = input.size();
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_BatchNormRelu::Session<alpaka::TagGpuCudaRt> model("BatchNormRelu_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+    // Reference is x / sqrt(1 + 1e-5) with negative results clamped to zero.
+    float inv_std = 1.f / std::sqrt(1.f + 1e-5f);
+    ASSERT_EQ(outputSize, 8u);
+    for (size_t i = 0; i < outputSize; ++i) {
+        const float scaled = input[i] * inv_std;
+        float expected = (scaled > 0.f) ? scaled : 0.f;
+        EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i;
+    }
+}
+
+TEST_F(SofieAlpakaTest, LayerNorm)
+{
+    constexpr float TOLERANCE = 1e-4f;
+    std::vector<float> input = {1.f, 2.f, 3.f, 4.f,
+                                 5.f, 6.f, 7.f, 8.f};
+    const std::size_t outputSize = input.size();
+
+    auto hostIn = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* hostInData = reinterpret_cast<float*>(alpaka::getPtrNative(hostIn));
+    for (Idx k = 0; k < input.size(); ++k) hostInData[k] = input[k];
+    // Copy the staged input to the device and wait for the transfer.
+    auto devIn = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, devIn, hostIn);
+    alpaka::wait(queue);
+
+    auto hostOut = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session only lives in this scope; the output is copied to the host before it is destroyed.
+    {
+        SOFIE_LayerNorm::Session<alpaka::TagGpuCudaRt> model("LayerNorm_FromONNX_GPU_ALPAKA.dat");
+        auto devOut = model.infer(devIn);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();
+        alpaka::memcpy(queue, hostOut, devOut);
+        alpaka::wait(queue);
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(hostOut));
+
+    // Reference computed on the host, treating the input as two rows of four values:
+    //   row 0 = {1,2,3,4}: mean 2.5, variance 1.25
+    //   row 1 = {5,6,7,8}: mean 6.5, variance 1.25
+    //   inv_std = 1 / sqrt(1.25 + 1e-5) ≈ 1 / 1.11804
+    //   expected = (x - row mean) * inv_std
+    //   e.g. (1 - 2.5) * inv_std ≈ -1.3416 and (2 - 2.5) * inv_std ≈ -0.4472
+    // No scale or bias term appears in this reference.
+    float inv_std = 1.f / std::sqrt(1.25f + 1e-5f);
+    std::vector<float> expected(outputSize);
+    for (std::size_t k = 0; k < expected.size(); ++k) {
+        const float rowMean = (k < 4) ? 2.5f : 6.5f;
+        expected[k] = (input[k] - rowMean) * inv_std;
+    }
+    ASSERT_EQ(outputSize, 8u);
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, LayerNormScaleBias)
+{   // Feeds 8 floats to SOFIE_LayerNormScaleBias and compares with 2 * normalised + 1 per element.
+    constexpr float TOLERANCE = 1e-4f;
+
+    std::vector<float> input = {1.f, 2.f, 3.f, 4.f,
+                                 5.f, 6.f, 7.f, 8.f};
+    const std::size_t outputSize = input.size();
+    // Stage the input values in a host buffer.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    // Host buffer that receives the output (same element count as the input).
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+    // The session and its result buffer live only inside this scope.
+    {
+        SOFIE_LayerNormScaleBias::Session<alpaka::TagGpuCudaRt> session("LayerNormScaleBias_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    // Reference: rows of four with means 2.5 / 6.5 and variance 1.25; NOTE(review): assumes model scale=2, bias=1 - confirm against the .onnx
+    float inv_std = 1.f / std::sqrt(1.25f + 1e-5f);
+    std::vector<float> expected = {
+        2.f * (1.f - 2.5f) * inv_std + 1.f, 2.f * (2.f - 2.5f) * inv_std + 1.f,
+        2.f * (3.f - 2.5f) * inv_std + 1.f, 2.f * (4.f - 2.5f) * inv_std + 1.f,
+        2.f * (5.f - 6.5f) * inv_std + 1.f, 2.f * (6.f - 6.5f) * inv_std + 1.f,
+        2.f * (7.f - 6.5f) * inv_std + 1.f, 2.f * (8.f - 6.5f) * inv_std + 1.f
+    };
+    ASSERT_EQ(outputSize, 8u);   // expected[] above is hard-coded for 8 elements
+    for (size_t i = 0; i < outputSize; ++i)
+        EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE) << "i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, LayerNorm3D)
+{   // Feeds the values 0..23 to SOFIE_LayerNorm3D and checks each group of 12 against a host reference.
+    constexpr float TOLERANCE = 1e-4f;
+
+    std::vector<float> input(24);
+    std::iota(input.begin(), input.end(), 0.f);   // 0..23
+    const std::size_t outputSize = input.size();
+    // Stage the input values in a host buffer.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{input.size()}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{input.size()}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{outputSize}));
+
+    {
+        SOFIE_LayerNorm3D::Session<alpaka::TagGpuCudaRt> session("LayerNorm3D_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    // Host reference: (v - mean) / sqrt(population variance + 1e-5) over one group; the row is taken by const-ref to avoid a copy.
+    auto compute_expected = [](const std::vector<float>& row) {
+        float mean = 0.f;
+        for (float v : row) mean += v;
+        mean /= row.size();
+        float var = 0.f;
+        for (float v : row) var += (v - mean) * (v - mean);
+        var /= row.size();
+        float inv_std = 1.f / std::sqrt(var + 1e-5f);
+        std::vector<float> out;
+        out.reserve(row.size());   // output length is known up front
+        for (float v : row) out.push_back((v - mean) * inv_std);
+        return out;
+    };
+
+    std::vector<float> row0(input.begin(),      input.begin() + 12);
+    std::vector<float> row1(input.begin() + 12, input.end());
+    auto exp0 = compute_expected(row0);
+    auto exp1 = compute_expected(row1);
+    ASSERT_EQ(outputSize, 24u);   // the two loops below index 12 + 12 elements
+    for (size_t i = 0; i < 12; ++i)
+        EXPECT_LE(std::abs(res_ptr[i]      - exp0[i]), TOLERANCE) << "row0 i=" << i;
+    for (size_t i = 0; i < 12; ++i)
+        EXPECT_LE(std::abs(res_ptr[12 + i] - exp1[i]), TOLERANCE) << "row1 i=" << i;
+}
+
+TEST_F(SofieAlpakaTest, IsInf)
+{   // Compares SOFIE_IsInf output, element by element, with std::isinf applied to the same input on the host.
+    // Input contains finite values, +inf, -inf; output is bool (uint8_t).
+    float pos_inf = std::numeric_limits<float>::infinity();
+    float neg_inf = -std::numeric_limits<float>::infinity();
+    std::vector<float> input = {1.0f, pos_inf, neg_inf, 0.0f, -1.0f, 2.0f, neg_inf, pos_inf};
+    const std::size_t N = input.size();
+    // Stage the input values in a host buffer.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{N}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    // Host buffer for the N one-byte results.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+
+    {
+        SOFIE_IsInf::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(N, 8u);
+    for (size_t i = 0; i < N; ++i)
+        EXPECT_EQ(static_cast<bool>(res_ptr[i]), std::isinf(input[i])) << "i=" << i;   // any non-zero byte counts as true
+}
+
+TEST_F(SofieAlpakaTest, IsNaN)
+{   // Compares SOFIE_IsNaN output, element by element, with std::isnan applied to the same input on the host.
+    // Input contains finite values, +inf, and NaN; output is bool (uint8_t).
+    float nan_val = std::numeric_limits<float>::quiet_NaN();
+    float pos_inf = std::numeric_limits<float>::infinity();
+    std::vector<float> input = {1.0f, nan_val, 0.0f, pos_inf, nan_val, 2.0f, -1.0f, nan_val};
+    const std::size_t N = input.size();
+    // Stage the input values in a host buffer.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{N}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    // Host buffer for the N one-byte results.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+
+    {
+        SOFIE_IsNaN::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(N, 8u);
+    for (size_t i = 0; i < N; ++i)
+        EXPECT_EQ(static_cast<bool>(res_ptr[i]), std::isnan(input[i])) << "i=" << i;   // any non-zero byte counts as true
+}
+
+TEST_F(SofieAlpakaTest, Clip)
+{   // Compares SOFIE_Clip output with a host-side clamp of the same 12 inputs to [clip_min, clip_max].
+    // Model clips to [-1.0, 1.0].
+    constexpr float TOLERANCE = DEFAULT_TOLERANCE;
+    constexpr float clip_min = -1.0f;
+    constexpr float clip_max =  1.0f;
+    // Inputs below, inside, above and exactly on the clip bounds.
+    std::vector<float> input = {
+        -2.0f, -1.5f, -1.0f, -0.5f,
+         0.0f,  0.5f,  1.0f,  1.5f,
+         2.0f, -0.3f,  0.7f,  1.2f
+    };
+    const std::size_t N = input.size();
+    // Stage the input values in a host buffer.
+    auto input_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+    float* input_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<float, Idx>(device, Ext1D::all(Idx{N}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    // Host buffer for the N output values.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+
+    {
+        SOFIE_Clip::Session<alpaka::TagGpuCudaRt> session("Clip_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(N, 12u);
+    for (size_t i = 0; i < N; ++i) {
+        float expected = std::max(clip_min, std::min(clip_max, input[i]));   // host-side clamp
+        EXPECT_LE(std::abs(res_ptr[i] - expected), TOLERANCE) << "i=" << i;
+    }
+}
+
+TEST_F(SofieAlpakaTest, Not)
+{   // Compares SOFIE_Not output with the logical negation of each input byte.
+    // Input and output are bool tensors (uint8_t on device).
+    std::vector<uint8_t> input = {1, 0, 1, 1, 0, 0, 1, 0};
+    const std::size_t N = input.size();
+    // Stage the input bytes in a host buffer.
+    auto input_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+    uint8_t* input_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(input_h));
+    for (Idx i = 0; i < N; ++i) input_ptr[i] = input[i];
+    // Upload to the device and wait until the copy has completed.
+    auto input_d = alpaka::allocBuf<uint8_t, Idx>(device, Ext1D::all(Idx{N}));
+    alpaka::memcpy(queue, input_d, input_h);
+    alpaka::wait(queue);
+    // Host buffer for the N one-byte results.
+    auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+
+    {
+        SOFIE_Not::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+        auto result = session.infer(input_d);
+        alpaka::wait(queue);
+        cudaDeviceSynchronize();   // device-wide sync; NOTE(review): presumably covers work infer() enqueues outside the fixture queue - confirm
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+    }
+
+    uint8_t* res_ptr = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(N, 8u);
+    for (size_t i = 0; i < N; ++i)
+        EXPECT_EQ(static_cast<bool>(res_ptr[i]), !static_cast<bool>(input[i])) << "i=" << i;   // compared as bool, so any non-zero byte is true
+}
+
+// GNN model: 3370 nodes (29 features each), 24126 edges (5 features each),
+// edge_index shape [2, 24126].  Output: sigmoid score per edge in [0, 1].
+TEST_F(SofieAlpakaTest, GNN_model)
+{   // Smoke test: runs SOFIE_GNN_model on constant dummy inputs and checks only that every output lies in [0, 1].
+    // ---- sizes -------------------------------------------------------
+    constexpr Idx N_x   = 97730;   // 3370 nodes  × 29 features
+    constexpr Idx N_ef  = 120630;  // 24126 edges ×  5 features
+    constexpr Idx N_ei  = 48252;   // 2 rows      × 24126 edges (int64)
+    constexpr Idx N_out = 24126;   // one sigmoid score per edge
+
+    // ---- host buffers -------------------------------------------------
+    auto x_h  = alpaka::allocBuf<float,   Idx>(host, Ext1D::all(Idx{N_x}));
+    auto ef_h = alpaka::allocBuf<float,   Idx>(host, Ext1D::all(Idx{N_ef}));
+    auto ei_h = alpaka::allocBuf<int64_t, Idx>(host, Ext1D::all(Idx{N_ei}));
+
+    float*   x_ptr  = reinterpret_cast<float*>  (alpaka::getPtrNative(x_h));
+    float*   ef_ptr = reinterpret_cast<float*>  (alpaka::getPtrNative(ef_h));
+    int64_t* ei_ptr = reinterpret_cast<int64_t*>(alpaka::getPtrNative(ei_h));
+    // Fill with constants: the test does not depend on realistic feature values.
+    for (Idx i = 0; i < N_x;  ++i) x_ptr[i]  = 0.5f;
+    for (Idx i = 0; i < N_ef; ++i) ef_ptr[i] = 0.5f;
+    for (Idx i = 0; i < N_ei; ++i) ei_ptr[i] = 0;   // all self-loops on node 0
+
+    // ---- device buffers -----------------------------------------------
+    auto x_d  = alpaka::allocBuf<float,   Idx>(device, Ext1D::all(Idx{N_x}));
+    auto ef_d = alpaka::allocBuf<float,   Idx>(device, Ext1D::all(Idx{N_ef}));
+    auto ei_d = alpaka::allocBuf<int64_t, Idx>(device, Ext1D::all(Idx{N_ei}));
+    // Enqueue all three uploads, then wait once for the whole batch.
+    alpaka::memcpy(queue, x_d,  x_h);
+    alpaka::memcpy(queue, ef_d, ef_h);
+    alpaka::memcpy(queue, ei_d, ei_h);
+    alpaka::wait(queue);
+    // Host buffer for the N_out output values.
+    auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N_out}));
+
+    {
+        SOFIE_GNN_model::Session<alpaka::TagGpuCudaRt> session("GNN_model_FromONNX_GPU_ALPAKA.dat");
+        auto result = session.infer(x_d, ef_d, ei_d);
+        alpaka::wait(session.queue);   // waits on the session's own queue member, not the fixture queue
+        cudaDeviceSynchronize();       // device-wide sync before the result is read back
+        alpaka::memcpy(queue, result_h, result);
+        alpaka::wait(queue);           // copy-back finishes before `result` goes out of scope
+    }
+
+    float* res_ptr = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+    ASSERT_EQ(N_out, 24126u);
+    for (Idx i = 0; i < N_out; ++i) {   // range check only; no reference values are compared
+        EXPECT_GE(res_ptr[i], 0.0f) << "output[" << i << "] < 0";
+        EXPECT_LE(res_ptr[i], 1.0f) << "output[" << i << "] > 1";
+    }
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// Trilu operator tests
+// ═══════════════════════════════════════════════════════════════════════════
+
+// Helper: copy a host C-array into an Alpaka host buffer then to device.
+template <typename T>
+static alpaka::Buf<alpaka::DevCudaRt, T, Dim, Idx>
+makeDeviceBuf(alpaka::DevCpu const& host,
+              alpaka::DevCudaRt const& device,
+              alpaka::Queue<alpaka::DevCudaRt, alpaka::NonBlocking>& queue,
+              const T* src, std::size_t n)
+{
+   const auto extent = Ext1D::all(Idx{n});   // one 1-D extent of n elements, used for both buffers
+   auto staging = alpaka::allocBuf<T, Idx>(host, extent);
+   T* const dst = reinterpret_cast<T*>(alpaka::getPtrNative(staging));
+   for (std::size_t k = 0; k != n; ++k) dst[k] = src[k];
+   auto uploaded = alpaka::allocBuf<T, Idx>(device, extent);
+   alpaka::memcpy(queue, uploaded, staging);
+   alpaka::wait(queue);   // the copy must finish before `staging` is destroyed on return
+   return uploaded;
+}
+
+// ── Trilu_upper: 4×4, upper=1, k=0 ─────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_upper)
+{   // Compares SOFIE_Trilu_upper output with Trilu_upper_ExpectedOutput::outputs, element by element.
+   constexpr std::size_t N = 16;   // 4×4
+   auto d_input = makeDeviceBuf<float>(host, device, queue,
+                                       Trilu_upper_Input::data, N);
+   // Host buffer for the N output values.
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Trilu_upper::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* ref = Trilu_upper_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << "  index=" << i;
+}
+
+// ── Trilu_lower: 4×4, upper=0, k=0 ─────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_lower)
+{   // Compares SOFIE_Trilu_lower output with Trilu_lower_ExpectedOutput::outputs, element by element.
+   constexpr std::size_t N = 16;
+   auto d_input = makeDeviceBuf<float>(host, device, queue,
+                                       Trilu_lower_Input::data, N);
+   // Host buffer for the N output values.
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Trilu_lower::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* ref = Trilu_lower_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << "  index=" << i;
+}
+
+// ── Trilu_k2: 3×5, upper=1, k=+2 ────────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_k2)
+{   // Compares SOFIE_Trilu_k2 output with Trilu_k2_ExpectedOutput::outputs, element by element.
+   constexpr std::size_t N = 15;   // 3×5
+   auto d_input = makeDeviceBuf<float>(host, device, queue,
+                                       Trilu_k2_Input::data, N);
+   // Host buffer for the N output values.
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Trilu_k2::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* ref = Trilu_k2_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << "  index=" << i;
+}
+
+// ── Trilu_kn1: 3×5, upper=0, k=-1 ────────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_kn1)
+{   // Compares SOFIE_Trilu_kn1 output with Trilu_kn1_ExpectedOutput::outputs, element by element.
+   constexpr std::size_t N = 15;
+   auto d_input = makeDeviceBuf<float>(host, device, queue,
+                                       Trilu_kn1_Input::data, N);
+   // Host buffer for the N output values.
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Trilu_kn1::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* ref = Trilu_kn1_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << "  index=" << i;
+}
+
+// ── Trilu_3D: 2×3×4, upper=1, k=0 (batched) ─────────────────────────────────
+TEST_F(SofieAlpakaTest, Trilu_3D)
+{   // Compares SOFIE_Trilu_3D output with Trilu_3D_ExpectedOutput::outputs, element by element.
+   constexpr std::size_t N = 24;   // 2×3×4
+   auto d_input = makeDeviceBuf<float>(host, device, queue,
+                                       Trilu_3D_Input::data, N);
+   // Host buffer for the N output values.
+   auto result_h = alpaka::allocBuf<float, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Trilu_3D::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   float* res = reinterpret_cast<float*>(alpaka::getPtrNative(result_h));
+   float* ref = Trilu_3D_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_NEAR(res[i], ref[i], DEFAULT_TOLERANCE) << "  index=" << i;
+}
+
+// ═══════════════════════════════════════════════════════════════════════════
+// Logic / Bitwise operator tests
+// ═══════════════════════════════════════════════════════════════════════════
+
+// ── Logic_And: 4×4 bool, And ────────────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_And)
+{   // Compares SOFIE_Logic_And output for two uint8_t inputs with Logic_And_ExpectedOutput::outputs (exact match).
+   constexpr std::size_t N = 16;   // 4×4
+   auto d_a = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_And_Input::data_a, N);
+   auto d_b = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_And_Input::data_b, N);
+   // Host buffer for the N one-byte results.
+   auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_And::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   uint8_t* res = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+   uint8_t* ref = Logic_And_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;   // raw byte comparison, not truthiness
+}
+
+// ── Logic_Or: 4×4 bool, Or ─────────────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_Or)
+{   // Compares SOFIE_Logic_Or output for two uint8_t inputs with Logic_Or_ExpectedOutput::outputs (exact match).
+   constexpr std::size_t N = 16;
+   auto d_a = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_Or_Input::data_a, N);
+   auto d_b = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_Or_Input::data_b, N);
+   // Host buffer for the N one-byte results.
+   auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_Or::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   uint8_t* res = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+   uint8_t* ref = Logic_Or_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;   // raw byte comparison, not truthiness
+}
+
+// ── Logic_Xor: 4×4 bool, Xor ───────────────────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_Xor)
+{   // Compares SOFIE_Logic_Xor output for two uint8_t inputs with Logic_Xor_ExpectedOutput::outputs (exact match).
+   constexpr std::size_t N = 16;
+   auto d_a = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_Xor_Input::data_a, N);
+   auto d_b = makeDeviceBuf<uint8_t>(host, device, queue,
+                                     Logic_Xor_Input::data_b, N);
+   // Host buffer for the N one-byte results.
+   auto result_h = alpaka::allocBuf<uint8_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_Xor::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   uint8_t* res = reinterpret_cast<uint8_t*>(alpaka::getPtrNative(result_h));
+   uint8_t* ref = Logic_Xor_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;   // raw byte comparison, not truthiness
+}
+
+// ── Logic_BitwiseAnd: 3×5 int32, BitwiseAnd ────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_BitwiseAnd)
+{   // Compares SOFIE_Logic_BitwiseAnd output for two int32_t inputs with Logic_BitwiseAnd_ExpectedOutput::outputs.
+   constexpr std::size_t N = 15;   // 3×5
+   auto d_a = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseAnd_Input::data_a, N);
+   auto d_b = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseAnd_Input::data_b, N);
+   // Host buffer for the N int32_t results.
+   auto result_h = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_BitwiseAnd::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   int32_t* res = reinterpret_cast<int32_t*>(alpaka::getPtrNative(result_h));
+   int32_t* ref = Logic_BitwiseAnd_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;
+}
+
+// ── Logic_BitwiseOr: 3×5 int32, BitwiseOr ──────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_BitwiseOr)
+{   // Compares SOFIE_Logic_BitwiseOr output for two int32_t inputs with Logic_BitwiseOr_ExpectedOutput::outputs.
+   constexpr std::size_t N = 15;
+   auto d_a = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseOr_Input::data_a, N);
+   auto d_b = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseOr_Input::data_b, N);
+   // Host buffer for the N int32_t results.
+   auto result_h = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_BitwiseOr::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   int32_t* res = reinterpret_cast<int32_t*>(alpaka::getPtrNative(result_h));
+   int32_t* ref = Logic_BitwiseOr_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;
+}
+
+// ── Logic_BitwiseXor: 3×5 int32, BitwiseXor ────────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_BitwiseXor)
+{   // Compares SOFIE_Logic_BitwiseXor output for two int32_t inputs with Logic_BitwiseXor_ExpectedOutput::outputs.
+   constexpr std::size_t N = 15;
+   auto d_a = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseXor_Input::data_a, N);
+   auto d_b = makeDeviceBuf<int32_t>(host, device, queue,
+                                     Logic_BitwiseXor_Input::data_b, N);
+   // Host buffer for the N int32_t results.
+   auto result_h = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_BitwiseXor::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_a, d_b);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   int32_t* res = reinterpret_cast<int32_t*>(alpaka::getPtrNative(result_h));
+   int32_t* ref = Logic_BitwiseXor_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;
+}
+
+// ── Logic_BitwiseNot: 2×3×4 int32, BitwiseNot ──────────────────────────────
+TEST_F(SofieAlpakaTest, Logic_BitwiseNot)
+{   // Compares SOFIE_Logic_BitwiseNot output for one int32_t input with Logic_BitwiseNot_ExpectedOutput::outputs.
+   constexpr std::size_t N = 24;   // 2×3×4
+   auto d_input = makeDeviceBuf<int32_t>(host, device, queue,
+                                         Logic_BitwiseNot_Input::data_a, N);
+   // Host buffer for the N int32_t results.
+   auto result_h = alpaka::allocBuf<int32_t, Idx>(host, Ext1D::all(Idx{N}));
+   {
+      SOFIE_Logic_BitwiseNot::Session<alpaka::TagGpuCudaRt> session;   // default-constructed: no weight-file argument
+      auto result = session.infer(d_input);
+      alpaka::wait(queue);
+      cudaDeviceSynchronize();   // device-wide sync before the result is read back
+      alpaka::memcpy(queue, result_h, result);
+      alpaka::wait(queue);       // copy-back finishes before `result` goes out of scope
+   }
+
+   int32_t* res = reinterpret_cast<int32_t*>(alpaka::getPtrNative(result_h));
+   int32_t* ref = Logic_BitwiseNot_ExpectedOutput::outputs;   // NOTE(review): assumed to hold at least N values - confirm
+   for (std::size_t i = 0; i < N; ++i)
+      EXPECT_EQ(res[i], ref[i]) << "  index=" << i;
+}
diff --git a/src/SOFIE_core/test/TestCustomModelsFromROOT.cxx b/test/TestCustomModelsFromROOT.cxx
similarity index 100%
rename from src/SOFIE_core/test/TestCustomModelsFromROOT.cxx
rename to test/TestCustomModelsFromROOT.cxx
diff --git a/src/SOFIE_core/test/TestSofieModels.cxx b/test/TestSofieModels.cxx
similarity index 100%
rename from src/SOFIE_core/test/TestSofieModels.cxx
rename to test/TestSofieModels.cxx
diff --git a/src/SOFIE_core/test/input_models/Abs.onnx b/test/input_models/Abs.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Abs.onnx
rename to test/input_models/Abs.onnx
diff --git a/src/SOFIE_core/test/input_models/Add.onnx b/test/input_models/Add.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Add.onnx
rename to test/input_models/Add.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast1.onnx b/test/input_models/AddBroadcast1.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast1.onnx
rename to test/input_models/AddBroadcast1.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast2.onnx b/test/input_models/AddBroadcast2.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast2.onnx
rename to test/input_models/AddBroadcast2.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast3.onnx b/test/input_models/AddBroadcast3.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast3.onnx
rename to test/input_models/AddBroadcast3.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast4.onnx b/test/input_models/AddBroadcast4.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast4.onnx
rename to test/input_models/AddBroadcast4.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast5.onnx b/test/input_models/AddBroadcast5.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast5.onnx
rename to test/input_models/AddBroadcast5.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast6.onnx b/test/input_models/AddBroadcast6.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast6.onnx
rename to test/input_models/AddBroadcast6.onnx
diff --git a/src/SOFIE_core/test/input_models/AddBroadcast7.onnx b/test/input_models/AddBroadcast7.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AddBroadcast7.onnx
rename to test/input_models/AddBroadcast7.onnx
diff --git a/src/SOFIE_core/test/input_models/AvgPool.onnx b/test/input_models/AvgPool.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/AvgPool.onnx
rename to test/input_models/AvgPool.onnx
diff --git a/test/input_models/BatchNorm.onnx b/test/input_models/BatchNorm.onnx
new file mode 100644
index 0000000..f03cd9a
Binary files /dev/null and b/test/input_models/BatchNorm.onnx differ
diff --git a/test/input_models/BatchNormRelu.onnx b/test/input_models/BatchNormRelu.onnx
new file mode 100644
index 0000000..badf2c2
Binary files /dev/null and b/test/input_models/BatchNormRelu.onnx differ
diff --git a/src/SOFIE_core/test/input_models/Cast.onnx b/test/input_models/Cast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Cast.onnx
rename to test/input_models/Cast.onnx
diff --git a/test/input_models/Clip.onnx b/test/input_models/Clip.onnx
new file mode 100644
index 0000000..a91d748
Binary files /dev/null and b/test/input_models/Clip.onnx differ
diff --git a/src/SOFIE_core/test/input_models/ComplexTopK.onnx b/test/input_models/ComplexTopK.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ComplexTopK.onnx
rename to test/input_models/ComplexTopK.onnx
diff --git a/src/SOFIE_core/test/input_models/Concat_0D.onnx b/test/input_models/Concat_0D.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Concat_0D.onnx
rename to test/input_models/Concat_0D.onnx
diff --git a/src/SOFIE_core/test/input_models/Constant.onnx b/test/input_models/Constant.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Constant.onnx
rename to test/input_models/Constant.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvTranspose1d.onnx b/test/input_models/ConvTranspose1d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvTranspose1d.onnx
rename to test/input_models/ConvTranspose1d.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvTranspose2d.onnx b/test/input_models/ConvTranspose2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvTranspose2d.onnx
rename to test/input_models/ConvTranspose2d.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx b/test/input_models/ConvTransposeBias2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvTransposeBias2d.onnx
rename to test/input_models/ConvTransposeBias2d.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx b/test/input_models/ConvTransposeBias2dBatched.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvTransposeBias2dBatched.onnx
rename to test/input_models/ConvTransposeBias2dBatched.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx b/test/input_models/ConvWithAsymmetricPadding.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithAsymmetricPadding.onnx
rename to test/input_models/ConvWithAsymmetricPadding.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx b/test/input_models/ConvWithAutopadSameLower.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithAutopadSameLower.onnx
rename to test/input_models/ConvWithAutopadSameLower.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithPadding.onnx b/test/input_models/ConvWithPadding.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithPadding.onnx
rename to test/input_models/ConvWithPadding.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx b/test/input_models/ConvWithStridesNoPadding.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithStridesNoPadding.onnx
rename to test/input_models/ConvWithStridesNoPadding.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx b/test/input_models/ConvWithStridesPadding.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithStridesPadding.onnx
rename to test/input_models/ConvWithStridesPadding.onnx
diff --git a/src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx b/test/input_models/ConvWithoutPadding.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ConvWithoutPadding.onnx
rename to test/input_models/ConvWithoutPadding.onnx
diff --git a/src/SOFIE_core/test/input_models/Cos.onnx b/test/input_models/Cos.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Cos.onnx
rename to test/input_models/Cos.onnx
diff --git a/src/SOFIE_core/test/input_models/Div.onnx b/test/input_models/Div.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Div.onnx
rename to test/input_models/Div.onnx
diff --git a/src/SOFIE_core/test/input_models/Einsum_3.onnx b/test/input_models/Einsum_3.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Einsum_3.onnx
rename to test/input_models/Einsum_3.onnx
diff --git a/src/SOFIE_core/test/input_models/Einsum_4.onnx b/test/input_models/Einsum_4.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Einsum_4.onnx
rename to test/input_models/Einsum_4.onnx
diff --git a/src/SOFIE_core/test/input_models/Einsum_dotprod.onnx b/test/input_models/Einsum_dotprod.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Einsum_dotprod.onnx
rename to test/input_models/Einsum_dotprod.onnx
diff --git a/src/SOFIE_core/test/input_models/Einsum_matmul.onnx b/test/input_models/Einsum_matmul.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Einsum_matmul.onnx
rename to test/input_models/Einsum_matmul.onnx
diff --git a/src/SOFIE_core/test/input_models/Elu.onnx b/test/input_models/Elu.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Elu.onnx
rename to test/input_models/Elu.onnx
diff --git a/src/SOFIE_core/test/input_models/Equal.onnx b/test/input_models/Equal.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Equal.onnx
rename to test/input_models/Equal.onnx
diff --git a/src/SOFIE_core/test/input_models/Erf.onnx b/test/input_models/Erf.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Erf.onnx
rename to test/input_models/Erf.onnx
diff --git a/src/SOFIE_core/test/input_models/Exp.onnx b/test/input_models/Exp.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Exp.onnx
rename to test/input_models/Exp.onnx
diff --git a/src/SOFIE_core/test/input_models/ExpandDiffSize.onnx b/test/input_models/ExpandDiffSize.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ExpandDiffSize.onnx
rename to test/input_models/ExpandDiffSize.onnx
diff --git a/src/SOFIE_core/test/input_models/ExpandSameSize.onnx b/test/input_models/ExpandSameSize.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ExpandSameSize.onnx
rename to test/input_models/ExpandSameSize.onnx
diff --git a/src/SOFIE_core/test/input_models/EyeLike.onnx b/test/input_models/EyeLike.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/EyeLike.onnx
rename to test/input_models/EyeLike.onnx
diff --git a/test/input_models/GNN_model.onnx b/test/input_models/GNN_model.onnx
new file mode 100644
index 0000000..833e34d
Binary files /dev/null and b/test/input_models/GNN_model.onnx differ
diff --git a/src/SOFIE_core/test/input_models/GRUBatchwise.onnx b/test/input_models/GRUBatchwise.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GRUBatchwise.onnx
rename to test/input_models/GRUBatchwise.onnx
diff --git a/src/SOFIE_core/test/input_models/GRUBidirectional.onnx b/test/input_models/GRUBidirectional.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GRUBidirectional.onnx
rename to test/input_models/GRUBidirectional.onnx
diff --git a/src/SOFIE_core/test/input_models/GRUDefaults.onnx b/test/input_models/GRUDefaults.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GRUDefaults.onnx
rename to test/input_models/GRUDefaults.onnx
diff --git a/src/SOFIE_core/test/input_models/GRUInitialBias.onnx b/test/input_models/GRUInitialBias.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GRUInitialBias.onnx
rename to test/input_models/GRUInitialBias.onnx
diff --git a/src/SOFIE_core/test/input_models/GRUSeqLength.onnx b/test/input_models/GRUSeqLength.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GRUSeqLength.onnx
rename to test/input_models/GRUSeqLength.onnx
diff --git a/src/SOFIE_core/test/input_models/Gather2d.onnx b/test/input_models/Gather2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Gather2d.onnx
rename to test/input_models/Gather2d.onnx
diff --git a/src/SOFIE_core/test/input_models/GatherAxis0.onnx b/test/input_models/GatherAxis0.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GatherAxis0.onnx
rename to test/input_models/GatherAxis0.onnx
diff --git a/src/SOFIE_core/test/input_models/GatherAxis1.onnx b/test/input_models/GatherAxis1.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GatherAxis1.onnx
rename to test/input_models/GatherAxis1.onnx
diff --git a/src/SOFIE_core/test/input_models/GatherAxis2.onnx b/test/input_models/GatherAxis2.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GatherAxis2.onnx
rename to test/input_models/GatherAxis2.onnx
diff --git a/src/SOFIE_core/test/input_models/GatherAxis3.onnx b/test/input_models/GatherAxis3.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GatherAxis3.onnx
rename to test/input_models/GatherAxis3.onnx
diff --git a/test/input_models/GatherND_Batch.onnx b/test/input_models/GatherND_Batch.onnx
new file mode 100644
index 0000000..4d146c6
Binary files /dev/null and b/test/input_models/GatherND_Batch.onnx differ
diff --git a/test/input_models/GatherND_Ex1.onnx b/test/input_models/GatherND_Ex1.onnx
new file mode 100644
index 0000000..bc1a910
Binary files /dev/null and b/test/input_models/GatherND_Ex1.onnx differ
diff --git a/test/input_models/GatherND_Ex2.onnx b/test/input_models/GatherND_Ex2.onnx
new file mode 100644
index 0000000..4cd511c
Binary files /dev/null and b/test/input_models/GatherND_Ex2.onnx differ
diff --git a/test/input_models/GatherND_Ex3.onnx b/test/input_models/GatherND_Ex3.onnx
new file mode 100644
index 0000000..917008f
Binary files /dev/null and b/test/input_models/GatherND_Ex3.onnx differ
diff --git a/test/input_models/GatherND_Ex4.onnx b/test/input_models/GatherND_Ex4.onnx
new file mode 100644
index 0000000..d3006a2
Binary files /dev/null and b/test/input_models/GatherND_Ex4.onnx differ
diff --git a/test/input_models/GatherND_Ex5.onnx b/test/input_models/GatherND_Ex5.onnx
new file mode 100644
index 0000000..be1ba0d
Binary files /dev/null and b/test/input_models/GatherND_Ex5.onnx differ
diff --git a/test/input_models/GatherND_NegativeIndices.onnx b/test/input_models/GatherND_NegativeIndices.onnx
new file mode 100644
index 0000000..5fa05aa
Binary files /dev/null and b/test/input_models/GatherND_NegativeIndices.onnx differ
diff --git a/src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx b/test/input_models/GatherNegativeIndices.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GatherNegativeIndices.onnx
rename to test/input_models/GatherNegativeIndices.onnx
diff --git a/src/SOFIE_core/test/input_models/Greater.onnx b/test/input_models/Greater.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Greater.onnx
rename to test/input_models/Greater.onnx
diff --git a/src/SOFIE_core/test/input_models/GreaterOrEqual.onnx b/test/input_models/GreaterOrEqual.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/GreaterOrEqual.onnx
rename to test/input_models/GreaterOrEqual.onnx
diff --git a/test/input_models/IsInf.onnx b/test/input_models/IsInf.onnx
new file mode 100644
index 0000000..b47fe82
Binary files /dev/null and b/test/input_models/IsInf.onnx differ
diff --git a/test/input_models/IsNaN.onnx b/test/input_models/IsNaN.onnx
new file mode 100644
index 0000000..d1a6e05
Binary files /dev/null and b/test/input_models/IsNaN.onnx differ
diff --git a/src/SOFIE_core/test/input_models/LSTMBatchwise.onnx b/test/input_models/LSTMBatchwise.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LSTMBatchwise.onnx
rename to test/input_models/LSTMBatchwise.onnx
diff --git a/src/SOFIE_core/test/input_models/LSTMBidirectional.onnx b/test/input_models/LSTMBidirectional.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LSTMBidirectional.onnx
rename to test/input_models/LSTMBidirectional.onnx
diff --git a/src/SOFIE_core/test/input_models/LSTMDefaults.onnx b/test/input_models/LSTMDefaults.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LSTMDefaults.onnx
rename to test/input_models/LSTMDefaults.onnx
diff --git a/src/SOFIE_core/test/input_models/LSTMInitialBias.onnx b/test/input_models/LSTMInitialBias.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LSTMInitialBias.onnx
rename to test/input_models/LSTMInitialBias.onnx
diff --git a/src/SOFIE_core/test/input_models/LSTMPeepholes.onnx b/test/input_models/LSTMPeepholes.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LSTMPeepholes.onnx
rename to test/input_models/LSTMPeepholes.onnx
diff --git a/test/input_models/LayerNorm.onnx b/test/input_models/LayerNorm.onnx
new file mode 100644
index 0000000..97142e7
Binary files /dev/null and b/test/input_models/LayerNorm.onnx differ
diff --git a/test/input_models/LayerNorm3D.onnx b/test/input_models/LayerNorm3D.onnx
new file mode 100644
index 0000000..c29afc0
Binary files /dev/null and b/test/input_models/LayerNorm3D.onnx differ
diff --git a/test/input_models/LayerNormScaleBias.onnx b/test/input_models/LayerNormScaleBias.onnx
new file mode 100644
index 0000000..99ea540
Binary files /dev/null and b/test/input_models/LayerNormScaleBias.onnx differ
diff --git a/src/SOFIE_core/test/input_models/LayerNormalization2d.onnx b/test/input_models/LayerNormalization2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LayerNormalization2d.onnx
rename to test/input_models/LayerNormalization2d.onnx
diff --git a/src/SOFIE_core/test/input_models/LayerNormalization4d.onnx b/test/input_models/LayerNormalization4d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LayerNormalization4d.onnx
rename to test/input_models/LayerNormalization4d.onnx
diff --git a/src/SOFIE_core/test/input_models/Less.onnx b/test/input_models/Less.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Less.onnx
rename to test/input_models/Less.onnx
diff --git a/src/SOFIE_core/test/input_models/LessOrEqual.onnx b/test/input_models/LessOrEqual.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LessOrEqual.onnx
rename to test/input_models/LessOrEqual.onnx
diff --git a/src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx b/test/input_models/LinearWithLeakyRelu.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LinearWithLeakyRelu.onnx
rename to test/input_models/LinearWithLeakyRelu.onnx
diff --git a/src/SOFIE_core/test/input_models/LinearWithSelu.onnx b/test/input_models/LinearWithSelu.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LinearWithSelu.onnx
rename to test/input_models/LinearWithSelu.onnx
diff --git a/src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx b/test/input_models/LinearWithSigmoid.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/LinearWithSigmoid.onnx
rename to test/input_models/LinearWithSigmoid.onnx
diff --git a/src/SOFIE_core/test/input_models/Linear_16.onnx b/test/input_models/Linear_16.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Linear_16.onnx
rename to test/input_models/Linear_16.onnx
diff --git a/src/SOFIE_core/test/input_models/Linear_32.onnx b/test/input_models/Linear_32.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Linear_32.onnx
rename to test/input_models/Linear_32.onnx
diff --git a/src/SOFIE_core/test/input_models/Linear_64.onnx b/test/input_models/Linear_64.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Linear_64.onnx
rename to test/input_models/Linear_64.onnx
diff --git a/src/SOFIE_core/test/input_models/Log.onnx b/test/input_models/Log.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Log.onnx
rename to test/input_models/Log.onnx
diff --git a/test/input_models/LogicModelGenerator.py b/test/input_models/LogicModelGenerator.py
new file mode 100644
index 0000000..adb5b16
--- /dev/null
+++ b/test/input_models/LogicModelGenerator.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Generate ONNX test models for the SOFIE Logic operators and write
+the corresponding C++ reference headers.
+
+Models created
+──────────────
+  Logic_And.onnx          bool  input  4×4,   And
+  Logic_Or.onnx           bool  input  4×4,   Or
+  Logic_Xor.onnx          bool  input  4×4,   Xor
+  Logic_BitwiseAnd.onnx   int32 input  3×5,   BitwiseAnd
+  Logic_BitwiseOr.onnx    int32 input  3×5,   BitwiseOr
+  Logic_BitwiseXor.onnx   int32 input  3×5,   BitwiseXor
+  Logic_BitwiseNot.onnx   int32 input  2×3×4, BitwiseNot
+
+Usage:
+    cd <repo>/test/input_models
+    python3 LogicModelGenerator.py
+"""
+
+import os
+import numpy as np
+import onnx
+from onnx import helper, TensorProto, numpy_helper
+
+OUT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory that contains this script
+REF_DIR = os.path.join(OUT_DIR, "references")  # sub-directory of OUT_DIR named "references"
+os.makedirs(REF_DIR, exist_ok=True)  # create it when missing; an existing directory is not an error
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+
+def ref_header(name, data, dtype_str="bool"):
+    """Return the text of a C++ header holding the flattened expected output.
+
+    name      -- prefix of the emitted ``{name}_ExpectedOutput`` namespace.
+    data      -- array exposing ``flatten()``; each element is written via ``int()``.
+    dtype_str -- C++ element type of the array; "bool" is emitted as ``uint8_t``.
+    """
+    flat = data.flatten()
+    # Only the element type depends on dtype_str; values are formatted identically.
+    arr_type = "uint8_t" if dtype_str == "bool" else dtype_str
+    vals = ", ".join(str(int(v)) for v in flat)
+    return (
+        f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n"
+        f"#pragma once\n"
+        f"#include <cstdint>\n"
+        f"namespace {name}_ExpectedOutput {{\n"
+        f"   static {arr_type} outputs[{flat.size}] = {{{vals}}};\n"
+        f"}} // namespace {name}_ExpectedOutput\n"
+    )
+
+
+def inp_header(name, data, dtype_str="bool"):
+    """Return the text of a C++ header holding one flattened input array.
+
+    Emits ``{name}_Input::data_a``; dtype_str "bool" maps to ``uint8_t``, any
+    other value is used verbatim as the C++ element type.
+    """
+    flat = data.flatten()
+    arr_type = "uint8_t" if dtype_str == "bool" else dtype_str
+    vals = ", ".join(str(int(v)) for v in flat)
+    return (
+        f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n"
+        f"#pragma once\n"
+        f"#include <cstdint>\n"
+        f"namespace {name}_Input {{\n"
+        f"   static {arr_type} data_a[{flat.size}] = {{{vals}}};\n"
+        f"}} // namespace {name}_Input\n"
+    )
+
+
+def inp_header2(name, data_a, data_b, dtype_str="bool"):
+    """Return the text of a C++ header holding two flattened input arrays.
+
+    data_a and data_b become ``{name}_Input::data_a`` / ``data_b``; both use the
+    element type given by dtype_str ("bool" is emitted as ``uint8_t``).
+    """
+    flat_a = data_a.flatten()
+    flat_b = data_b.flatten()
+    # Only the element type depends on dtype_str; values are formatted identically.
+    arr_type = "uint8_t" if dtype_str == "bool" else dtype_str
+    vals_a = ", ".join(str(int(v)) for v in flat_a)
+    vals_b = ", ".join(str(int(v)) for v in flat_b)
+    return (
+        f"// Auto-generated by LogicModelGenerator.py — DO NOT EDIT\n"
+        f"#pragma once\n"
+        f"#include <cstdint>\n"
+        f"namespace {name}_Input {{\n"
+        f"   static {arr_type} data_a[{flat_a.size}] = {{{vals_a}}};\n"
+        f"   static {arr_type} data_b[{flat_b.size}] = {{{vals_b}}};\n"
+        f"}} // namespace {name}_Input\n"
+    )
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Model builders
+# ─────────────────────────────────────────────────────────────────────────────
+
+def make_binary_model(op_name, shape, onnx_dtype, name):
+    """Build a single-op binary model (A op B -> Y).
+
+    op_name names the ONNX operator of the only node; both inputs and the output
+    share ``shape`` and ``onnx_dtype``; ``name`` becomes the graph name.
+    Returns the ModelProto (opset 18, IR version 7) after onnx.checker validation.
+    """
+    a = helper.make_tensor_value_info("input_a", onnx_dtype, shape)
+    b = helper.make_tensor_value_info("input_b", onnx_dtype, shape)
+    y = helper.make_tensor_value_info("output",  onnx_dtype, shape)
+
+    node = helper.make_node(op_name, inputs=["input_a", "input_b"], outputs=["output"])
+    graph = helper.make_graph([node], name, [a, b], [y])
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 18)])
+    model.ir_version = 7
+    onnx.checker.check_model(model)
+    return model
+
+
+def make_unary_model(op_name, shape, onnx_dtype, name):
+    """Return a checked single-node model computing ``output = op_name(input)``.
+
+    Input and output share ``shape`` and ``onnx_dtype``; opset 18, IR version 7."""
+    tensors = [helper.make_tensor_value_info(t, onnx_dtype, shape) for t in ("input", "output")]
+    op = helper.make_node(op_name, inputs=["input"], outputs=["output"])
+    unary_graph = helper.make_graph([op], name, tensors[:1], tensors[1:])
+    proto = helper.make_model(unary_graph, opset_imports=[helper.make_opsetid("", 18)])
+    proto.ir_version = 7
+    onnx.checker.check_model(proto)
+    return proto
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Test cases
+# ─────────────────────────────────────────────────────────────────────────────
+
+rng = np.random.default_rng(42)  # fixed seed: regenerating yields the same inputs and references
+# NOTE: the arrays below are drawn from `rng` in sequence; reordering the draws changes every value.
+BOOL_SHAPE  = [4, 4]
+INT_SHAPE   = [3, 5]
+INT3D_SHAPE = [2, 3, 4]
+
+# Bool inputs (stored as uint8 in SOFIE): random values 0 or 1
+bool_a = rng.integers(0, 2, BOOL_SHAPE).astype(np.uint8)
+bool_b = rng.integers(0, 2, BOOL_SHAPE).astype(np.uint8)
+
+# Int32 inputs for the binary bitwise cases, drawn from [-100, 100)
+int_a = rng.integers(-100, 100, INT_SHAPE, dtype=np.int32)
+int_b = rng.integers(-100, 100, INT_SHAPE, dtype=np.int32)
+# Int32 input for the unary BitwiseNot model, same value range
+int3d = rng.integers(-100, 100, INT3D_SHAPE, dtype=np.int32)
+
+BINARY_CASES = [
+    # One tuple per model: (model_name, onnx_op, onnx_dtype, shape, a, b, ref, dtype_str)
+    ("Logic_And",       "And",        TensorProto.BOOL,  BOOL_SHAPE,  bool_a, bool_b,
+        np.logical_and(bool_a, bool_b).astype(np.uint8), "bool"),
+    ("Logic_Or",        "Or",         TensorProto.BOOL,  BOOL_SHAPE,  bool_a, bool_b,
+        np.logical_or(bool_a, bool_b).astype(np.uint8),  "bool"),
+    ("Logic_Xor",       "Xor",        TensorProto.BOOL,  BOOL_SHAPE,  bool_a, bool_b,
+        np.logical_xor(bool_a, bool_b).astype(np.uint8), "bool"),
+    ("Logic_BitwiseAnd","BitwiseAnd", TensorProto.INT32, INT_SHAPE,   int_a,  int_b,
+        (int_a & int_b).astype(np.int32), "int32_t"),
+    ("Logic_BitwiseOr", "BitwiseOr",  TensorProto.INT32, INT_SHAPE,   int_a,  int_b,
+        (int_a | int_b).astype(np.int32), "int32_t"),
+    ("Logic_BitwiseXor","BitwiseXor", TensorProto.INT32, INT_SHAPE,   int_a,  int_b,
+        (int_a ^ int_b).astype(np.int32), "int32_t"),
+]
+# For every case write the .onnx model, the expected-output header and the input header.
+for (name, onnx_op, onnx_dtype, shape, a, b, ref, dtype_str) in BINARY_CASES:
+    model = make_binary_model(onnx_op, shape, onnx_dtype, name)
+    onnx_path = os.path.join(OUT_DIR, f"{name}.onnx")
+    onnx.save(model, onnx_path)
+    print(f"Saved {onnx_path}")
+    # The header text contains a non-ASCII dash, so the encoding is pinned instead of the locale default.
+    ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx")
+    with open(ref_path, "w", encoding="utf-8") as f:
+        f.write(ref_header(name, ref, dtype_str))
+    print(f"  → reference {ref_path}")
+    # Both operands go into one header as data_a / data_b.
+    inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx")
+    with open(inp_path, "w", encoding="utf-8") as f:
+        f.write(inp_header2(name, a, b, dtype_str))
+    print(f"  → input ref {inp_path}")
+
+
+# Unary BitwiseNot
+#
+# Produces the same three artefacts as each binary case: the .onnx model,
+# the expected-output header and the input header.
+
+name = "Logic_BitwiseNot"
+model = make_unary_model("BitwiseNot", INT3D_SHAPE, TensorProto.INT32, name)
+onnx_path = os.path.join(OUT_DIR, f"{name}.onnx")
+onnx.save(model, onnx_path)
+print(f"Saved {onnx_path}")
+
+# Reference values come from NumPy's bitwise inversion of the input.
+ref_not = (~int3d).astype(np.int32)
+ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx")
+# The header text contains a non-ASCII dash; pin the encoding rather than
+# depending on the locale default.
+with open(ref_path, "w", encoding="utf-8") as f:
+    f.write(ref_header(name, ref_not, "int32_t"))
+print(f"  → reference {ref_path}")
+
+inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx")
+with open(inp_path, "w", encoding="utf-8") as f:
+    # Single-input version: inp_header() emits the data_a array inside the
+    # {name}_Input namespace, byte-for-byte the text that was previously
+    # assembled by hand at this point.
+    f.write(inp_header(name, int3d, "int32_t"))
+print(f"  → input ref {inp_path}")
+
+# Reached only when no step above raised.
+print("\nAll Logic test models and references generated successfully.")
diff --git a/test/input_models/Logic_And.onnx b/test/input_models/Logic_And.onnx
new file mode 100644
index 0000000..ea0dbce
Binary files /dev/null and b/test/input_models/Logic_And.onnx differ
diff --git a/test/input_models/Logic_BitwiseAnd.onnx b/test/input_models/Logic_BitwiseAnd.onnx
new file mode 100644
index 0000000..a7bf522
Binary files /dev/null and b/test/input_models/Logic_BitwiseAnd.onnx differ
diff --git a/test/input_models/Logic_BitwiseNot.onnx b/test/input_models/Logic_BitwiseNot.onnx
new file mode 100644
index 0000000..6ec0a35
Binary files /dev/null and b/test/input_models/Logic_BitwiseNot.onnx differ
diff --git a/test/input_models/Logic_BitwiseOr.onnx b/test/input_models/Logic_BitwiseOr.onnx
new file mode 100644
index 0000000..49ae37e
Binary files /dev/null and b/test/input_models/Logic_BitwiseOr.onnx differ
diff --git a/test/input_models/Logic_BitwiseXor.onnx b/test/input_models/Logic_BitwiseXor.onnx
new file mode 100644
index 0000000..aba2037
Binary files /dev/null and b/test/input_models/Logic_BitwiseXor.onnx differ
diff --git a/test/input_models/Logic_Or.onnx b/test/input_models/Logic_Or.onnx
new file mode 100644
index 0000000..563e9ec
Binary files /dev/null and b/test/input_models/Logic_Or.onnx differ
diff --git a/test/input_models/Logic_Xor.onnx b/test/input_models/Logic_Xor.onnx
new file mode 100644
index 0000000..c7067e1
Binary files /dev/null and b/test/input_models/Logic_Xor.onnx differ
diff --git a/src/SOFIE_core/test/input_models/Max.onnx b/test/input_models/Max.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Max.onnx
rename to test/input_models/Max.onnx
diff --git a/src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx b/test/input_models/MaxMultidirectionalBroadcast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MaxMultidirectionalBroadcast.onnx
rename to test/input_models/MaxMultidirectionalBroadcast.onnx
diff --git a/src/SOFIE_core/test/input_models/MaxPool1d.onnx b/test/input_models/MaxPool1d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MaxPool1d.onnx
rename to test/input_models/MaxPool1d.onnx
diff --git a/src/SOFIE_core/test/input_models/MaxPool2d.onnx b/test/input_models/MaxPool2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MaxPool2d.onnx
rename to test/input_models/MaxPool2d.onnx
diff --git a/src/SOFIE_core/test/input_models/MaxPool3d.onnx b/test/input_models/MaxPool3d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MaxPool3d.onnx
rename to test/input_models/MaxPool3d.onnx
diff --git a/src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx b/test/input_models/MeanMultidirectionalBroadcast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MeanMultidirectionalBroadcast.onnx
rename to test/input_models/MeanMultidirectionalBroadcast.onnx
diff --git a/src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx b/test/input_models/MinMultidirectionalBroadcast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/MinMultidirectionalBroadcast.onnx
rename to test/input_models/MinMultidirectionalBroadcast.onnx
diff --git a/src/SOFIE_core/test/input_models/Mul.onnx b/test/input_models/Mul.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Mul.onnx
rename to test/input_models/Mul.onnx
diff --git a/src/SOFIE_core/test/input_models/Neg.onnx b/test/input_models/Neg.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Neg.onnx
rename to test/input_models/Neg.onnx
diff --git a/test/input_models/Not.onnx b/test/input_models/Not.onnx
new file mode 100644
index 0000000..b29ca99
Binary files /dev/null and b/test/input_models/Not.onnx differ
diff --git a/src/SOFIE_core/test/input_models/Pad.onnx b/test/input_models/Pad.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Pad.onnx
rename to test/input_models/Pad.onnx
diff --git a/src/SOFIE_core/test/input_models/Pow.onnx b/test/input_models/Pow.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Pow.onnx
rename to test/input_models/Pow.onnx
diff --git a/src/SOFIE_core/test/input_models/Pow_broadcast.onnx b/test/input_models/Pow_broadcast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Pow_broadcast.onnx
rename to test/input_models/Pow_broadcast.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNBatchwise.onnx b/test/input_models/RNNBatchwise.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNBatchwise.onnx
rename to test/input_models/RNNBatchwise.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNBidirectional.onnx b/test/input_models/RNNBidirectional.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNBidirectional.onnx
rename to test/input_models/RNNBidirectional.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx b/test/input_models/RNNBidirectionalBatchwise.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNBidirectionalBatchwise.onnx
rename to test/input_models/RNNBidirectionalBatchwise.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNDefaults.onnx b/test/input_models/RNNDefaults.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNDefaults.onnx
rename to test/input_models/RNNDefaults.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNSeqLength.onnx b/test/input_models/RNNSeqLength.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNSeqLength.onnx
rename to test/input_models/RNNSeqLength.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNSequence.onnx b/test/input_models/RNNSequence.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNSequence.onnx
rename to test/input_models/RNNSequence.onnx
diff --git a/src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx b/test/input_models/RNNSequenceBatchwise.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RNNSequenceBatchwise.onnx
rename to test/input_models/RNNSequenceBatchwise.onnx
diff --git a/src/SOFIE_core/test/input_models/RandomNormal.onnx b/test/input_models/RandomNormal.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RandomNormal.onnx
rename to test/input_models/RandomNormal.onnx
diff --git a/src/SOFIE_core/test/input_models/RandomUniform.onnx b/test/input_models/RandomUniform.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RandomUniform.onnx
rename to test/input_models/RandomUniform.onnx
diff --git a/src/SOFIE_core/test/input_models/RangeFloat.onnx b/test/input_models/RangeFloat.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RangeFloat.onnx
rename to test/input_models/RangeFloat.onnx
diff --git a/src/SOFIE_core/test/input_models/RangeInt.onnx b/test/input_models/RangeInt.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/RangeInt.onnx
rename to test/input_models/RangeInt.onnx
diff --git a/src/SOFIE_core/test/input_models/Reciprocal.onnx b/test/input_models/Reciprocal.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Reciprocal.onnx
rename to test/input_models/Reciprocal.onnx
diff --git a/test/input_models/ReduceL2.onnx b/test/input_models/ReduceL2.onnx
new file mode 100644
index 0000000..1aadbc8
Binary files /dev/null and b/test/input_models/ReduceL2.onnx differ
diff --git a/test/input_models/ReduceL2Large.onnx b/test/input_models/ReduceL2Large.onnx
new file mode 100644
index 0000000..75d4fc7
Binary files /dev/null and b/test/input_models/ReduceL2Large.onnx differ
diff --git a/test/input_models/ReduceMax.onnx b/test/input_models/ReduceMax.onnx
new file mode 100644
index 0000000..fc837e8
Binary files /dev/null and b/test/input_models/ReduceMax.onnx differ
diff --git a/test/input_models/ReduceMaxModelGenerator.py b/test/input_models/ReduceMaxModelGenerator.py
new file mode 100644
index 0000000..305948f
--- /dev/null
+++ b/test/input_models/ReduceMaxModelGenerator.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Generate ONNX test models for the SOFIE ReduceMax operator and write
+the corresponding C++ reference headers.
+
+Models created
+──────────────
+  ReduceMax.onnx            [1,2,3] float, axis=1, keepdims=0  (kLast / matches ReduceMean)
+  ReduceMax_axis0.onnx      [3,4]   float, axis=0, keepdims=0  (kFirst)
+  ReduceMax_mid.onnx        [2,3,4] float, axis=1, keepdims=0  (kMiddle)
+
+Usage:
+    cd <repo>/test/input_models
+    python3 ReduceMaxModelGenerator.py
+"""
+
+import os
+import numpy as np
+import onnx
+from onnx import helper, TensorProto
+
+OUT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory that contains this script
+REF_DIR = os.path.join(OUT_DIR, "references")  # sub-directory of OUT_DIR named "references"
+os.makedirs(REF_DIR, exist_ok=True)  # create it when missing; an existing directory is not an error
+
+
+def make_reducemax_model(input_shape, axes, keepdims, name):
+    """Build a single ReduceMax node model (opset 13, axes as attribute).
+
+    input_shape -- static shape of the float input tensor.
+    axes        -- axes to reduce; negative values count from the last axis.
+    keepdims    -- truthy keeps each reduced axis with extent 1, falsy drops it.
+    name        -- graph name.
+    Returns the ModelProto (IR version 7) after onnx.checker validation.
+    """
+    x = helper.make_tensor_value_info("input", TensorProto.FLOAT, input_shape)
+    # range(rank)[ax] maps a negative axis to its non-negative index and still
+    # raises IndexError for an out-of-range one, so -1 and rank-1 agree.
+    reduced = {range(len(input_shape))[ax] for ax in axes}
+    if keepdims:
+        out_shape = [1 if i in reduced else d for i, d in enumerate(input_shape)]
+    else:
+        out_shape = [d for i, d in enumerate(input_shape) if i not in reduced]
+    y = helper.make_tensor_value_info("output", TensorProto.FLOAT, out_shape)
+
+    node = helper.make_node("ReduceMax", inputs=["input"], outputs=["output"],
+                            axes=axes, keepdims=keepdims)
+    graph = helper.make_graph([node], name, [x], [y])
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+    model.ir_version = 7
+    onnx.checker.check_model(model)
+    return model
+
+
+def ref_header(name, data):
+    """Render flattened ``data`` as C++ header text defining ``{name}_ExpectedOutput::output``."""
+    literals = ["{:.8f}f".format(v) for v in data.flatten()]  # 8 decimals plus the float suffix
+    return "".join([
+        "// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT\n",
+        "#pragma once\n",
+        f"namespace {name}_ExpectedOutput {{\n",
+        f"   static float output[{len(literals)}] = {{{', '.join(literals)}}};\n",
+        f"}} // namespace {name}_ExpectedOutput\n",
+    ])
+
+
+rng = np.random.default_rng(42)  # fixed seed: regenerating yields the same random inputs
+# NOTE: the two random inputs are drawn from `rng` in list order; reordering CASES changes their values.
+CASES = [
+    # (name,               input_shape,   axes,  keepdims,  input_data)
+    # axis=1 on [1,2,3]  →  labelled kLast; NOTE(review): axis 1 is the size-2 middle axis here — confirm the path
+    ("ReduceMax",          [1, 2, 3],     [1],   0,   np.array([[[5., 2., 3.], [5., 5., 4.]]], dtype=np.float32)),
+    # axis=0 on [3,4]    →  reduces first dim → kFirst code path
+    ("ReduceMax_axis0",    [3, 4],        [0],   0,   rng.standard_normal((3, 4)).astype(np.float32)),
+    # axis=1 on [2,3,4]  →  reduces middle dim → kMiddle code path
+    ("ReduceMax_mid",      [2, 3, 4],     [1],   0,   rng.standard_normal((2, 3, 4)).astype(np.float32)),
+]
+
+for (name, shape, axes, keepdims, x) in CASES:
+    model = make_reducemax_model(shape, axes, keepdims, name)
+    onnx_path = os.path.join(OUT_DIR, f"{name}.onnx")
+    onnx.save(model, onnx_path)
+    print(f"Saved {onnx_path}")
+    # Only the expected output is exported below; the input `x` itself is not written by this script.
+    y = np.max(x, axis=tuple(axes), keepdims=bool(keepdims))
+    ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx")
+    with open(ref_path, "w", encoding="utf-8") as f:  # header has a non-ASCII dash: pin the encoding
+        f.write(ref_header(name, y))
+    print(f"  → reference {ref_path}  shape={list(y.shape)}")
+
+print("\nAll ReduceMax test models and references generated successfully.")
diff --git a/test/input_models/ReduceMax_axis0.onnx b/test/input_models/ReduceMax_axis0.onnx
new file mode 100644
index 0000000..632fbab
Binary files /dev/null and b/test/input_models/ReduceMax_axis0.onnx differ
diff --git a/test/input_models/ReduceMax_mid.onnx b/test/input_models/ReduceMax_mid.onnx
new file mode 100644
index 0000000..d49a222
Binary files /dev/null and b/test/input_models/ReduceMax_mid.onnx differ
diff --git a/src/SOFIE_core/test/input_models/ReduceMean.onnx b/test/input_models/ReduceMean.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ReduceMean.onnx
rename to test/input_models/ReduceMean.onnx
diff --git a/src/SOFIE_core/test/input_models/ReduceProd.onnx b/test/input_models/ReduceProd.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ReduceProd.onnx
rename to test/input_models/ReduceProd.onnx
diff --git a/src/SOFIE_core/test/input_models/ReduceSum.onnx b/test/input_models/ReduceSum.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ReduceSum.onnx
rename to test/input_models/ReduceSum.onnx
diff --git a/src/SOFIE_core/test/input_models/ReduceSumSquare.onnx b/test/input_models/ReduceSumSquare.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ReduceSumSquare.onnx
rename to test/input_models/ReduceSumSquare.onnx
diff --git a/src/SOFIE_core/test/input_models/ScatterElements.onnx b/test/input_models/ScatterElements.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/ScatterElements.onnx
rename to test/input_models/ScatterElements.onnx
diff --git a/src/SOFIE_core/test/input_models/Shape.onnx b/test/input_models/Shape.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Shape.onnx
rename to test/input_models/Shape.onnx
diff --git a/src/SOFIE_core/test/input_models/Sin.onnx b/test/input_models/Sin.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Sin.onnx
rename to test/input_models/Sin.onnx
diff --git a/src/SOFIE_core/test/input_models/Slice.onnx b/test/input_models/Slice.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Slice.onnx
rename to test/input_models/Slice.onnx
diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx b/test/input_models/Slice_Default_Axis.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Slice_Default_Axis.onnx
rename to test/input_models/Slice_Default_Axis.onnx
diff --git a/src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx b/test/input_models/Slice_Default_Steps.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Slice_Default_Steps.onnx
rename to test/input_models/Slice_Default_Steps.onnx
diff --git a/src/SOFIE_core/test/input_models/Slice_Neg.onnx b/test/input_models/Slice_Neg.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Slice_Neg.onnx
rename to test/input_models/Slice_Neg.onnx
diff --git a/src/SOFIE_core/test/input_models/Softmax1d.onnx b/test/input_models/Softmax1d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Softmax1d.onnx
rename to test/input_models/Softmax1d.onnx
diff --git a/src/SOFIE_core/test/input_models/Softmax2d.onnx b/test/input_models/Softmax2d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Softmax2d.onnx
rename to test/input_models/Softmax2d.onnx
diff --git a/src/SOFIE_core/test/input_models/Softmax3d.onnx b/test/input_models/Softmax3d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Softmax3d.onnx
rename to test/input_models/Softmax3d.onnx
diff --git a/src/SOFIE_core/test/input_models/Softmax4d.onnx b/test/input_models/Softmax4d.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Softmax4d.onnx
rename to test/input_models/Softmax4d.onnx
diff --git a/test/input_models/Softplus.onnx b/test/input_models/Softplus.onnx
new file mode 100644
index 0000000..2f6a69f
--- /dev/null
+++ b/test/input_models/Softplus.onnx
@@ -0,0 +1,11 @@
+onnx-example:S
+
+inputoutput"SoftplusAbsZ
+input
+
+
+b
+output
+
+
+B
\ No newline at end of file
diff --git a/src/SOFIE_core/test/input_models/Split_0.onnx b/test/input_models/Split_0.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Split_0.onnx
rename to test/input_models/Split_0.onnx
diff --git a/src/SOFIE_core/test/input_models/Split_1.onnx b/test/input_models/Split_1.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Split_1.onnx
rename to test/input_models/Split_1.onnx
diff --git a/src/SOFIE_core/test/input_models/Split_2.onnx b/test/input_models/Split_2.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Split_2.onnx
rename to test/input_models/Split_2.onnx
diff --git a/src/SOFIE_core/test/input_models/Sqrt.onnx b/test/input_models/Sqrt.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Sqrt.onnx
rename to test/input_models/Sqrt.onnx
diff --git a/src/SOFIE_core/test/input_models/Sub.onnx b/test/input_models/Sub.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Sub.onnx
rename to test/input_models/Sub.onnx
diff --git a/src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx b/test/input_models/SumMultidirectionalBroadcast.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/SumMultidirectionalBroadcast.onnx
rename to test/input_models/SumMultidirectionalBroadcast.onnx
diff --git a/src/SOFIE_core/test/input_models/Tanh.onnx b/test/input_models/Tanh.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Tanh.onnx
rename to test/input_models/Tanh.onnx
diff --git a/src/SOFIE_core/test/input_models/Tile5D.onnx b/test/input_models/Tile5D.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Tile5D.onnx
rename to test/input_models/Tile5D.onnx
diff --git a/src/SOFIE_core/test/input_models/TopK.onnx b/test/input_models/TopK.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/TopK.onnx
rename to test/input_models/TopK.onnx
diff --git a/test/input_models/Transpose.onnx b/test/input_models/Transpose.onnx
new file mode 100644
index 0000000..0e08157
Binary files /dev/null and b/test/input_models/Transpose.onnx differ
diff --git a/test/input_models/TriluModelGenerator.py b/test/input_models/TriluModelGenerator.py
new file mode 100644
index 0000000..de110c4
--- /dev/null
+++ b/test/input_models/TriluModelGenerator.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Generate ONNX test models for the SOFIE Trilu operator and write
+the corresponding C++ reference headers.
+
+Models created
+──────────────
+  Trilu_upper.onnx         4×4 float, upper=1, k=0  (standard upper triangular)
+  Trilu_lower.onnx         4×4 float, upper=0, k=0  (standard lower triangular)
+  Trilu_k2.onnx            3×5 float, upper=1, k=2  (offset shifts diagonal up)
+  Trilu_kn1.onnx           3×5 float, upper=0, k=-1 (offset shifts diagonal down)
+  Trilu_3D.onnx            2×3×4 float, upper=1, k=0 (batched)
+
+All k values are embedded as ONNX Constant nodes (scalar int64) so they are
+resolved statically by SOFIE.
+
+Usage:
+    cd <repo>/test/input_models
+    python3 TriluModelGenerator.py
+"""
+
+import os
+import numpy as np
+import onnx
+from onnx import helper, TensorProto, numpy_helper
+
+OUT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+REF_DIR = os.path.join(OUT_DIR, "references")  # "references" sub-directory of OUT_DIR
+os.makedirs(REF_DIR, exist_ok=True)  # executed at import time; no error if it already exists
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+
+def make_trilu_model(input_shape: list[int],
+                     upper: int,
+                     k: int,
+                     name: str) -> onnx.ModelProto:
+    """Assemble a two-node ONNX graph: Constant(k) feeding a Trilu node.
+
+    Args:
+        input_shape: static shape shared by the float "input" and "output".
+        upper: value stored in the Trilu node's ``upper`` attribute.
+        k: diagonal offset, stored as a 0-d int64 Constant tensor.
+        name: name given to the ONNX graph.
+
+    Returns:
+        The ModelProto (opset 14, IR version 7) after onnx.checker validation.
+    """
+    offset_name = "k_const"
+
+    # Trilu preserves the shape, so one shape describes both graph tensors.
+    graph_in, graph_out = (
+        helper.make_tensor_value_info(tensor, TensorProto.FLOAT, input_shape)
+        for tensor in ("input", "output")
+    )
+
+    # The offset travels through a Constant node, not a graph input.
+    offset_value = numpy_helper.from_array(np.array(k, dtype=np.int64),
+                                           name=offset_name)
+    const_op = helper.make_node("Constant", inputs=[], outputs=[offset_name],
+                                value=offset_value)
+    trilu_op = helper.make_node("Trilu", inputs=["input", offset_name],
+                                outputs=["output"], upper=upper)
+
+    graph = helper.make_graph(
+        nodes=[const_op, trilu_op],
+        name=name,
+        inputs=[graph_in],
+        outputs=[graph_out],
+    )
+
+    opset = helper.make_opsetid("", 14)
+    model = helper.make_model(graph, opset_imports=[opset])
+    model.ir_version = 7
+    onnx.checker.check_model(model)
+    return model
+
+
+def ref_header(name: str, data: np.ndarray) -> str:
+    """Render `data`, flattened, as a C++ header in namespace <name>_ExpectedOutput."""
+    literals = [f"{value:.8f}f" for value in data.flatten()]  # fixed 8 decimals + 'f'
+    namespace = f"{name}_ExpectedOutput"
+    return "\n".join([
+        "// Auto-generated by TriluModelGenerator.py — DO NOT EDIT",
+        "#pragma once",
+        f"namespace {namespace} {{",
+        f"   static float outputs[{len(literals)}] = {{{', '.join(literals)}}};",
+        f"}} // namespace {namespace}",
+    ]) + "\n"
+
+
+def trilu_ref(x: np.ndarray, upper: int, k: int) -> np.ndarray:
+    """Reference result: np.triu when `upper` is truthy, np.tril otherwise.
+
+    The result is cast to float32 regardless of the dtype of `x`.
+    """
+    return (np.triu if upper else np.tril)(x, k=k).astype(np.float32)
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# Test cases
+# ─────────────────────────────────────────────────────────────────────────────
+
+rng = np.random.default_rng(42)  # fixed seed so regenerated inputs are reproducible
+# Inputs are drawn from `rng` in list order, so reordering CASES changes the data.
+CASES = [
+    # (name,               shape,       upper, k,  input_values)
+    ("Trilu_upper",        [4, 4],      1,  0,  rng.standard_normal((4, 4)).astype(np.float32)),
+    ("Trilu_lower",        [4, 4],      0,  0,  rng.standard_normal((4, 4)).astype(np.float32)),
+    ("Trilu_k2",           [3, 5],      1,  2,  rng.standard_normal((3, 5)).astype(np.float32)),
+    ("Trilu_kn1",          [3, 5],      0, -1,  rng.standard_normal((3, 5)).astype(np.float32)),
+    ("Trilu_3D",           [2, 3, 4],   1,  0,  rng.standard_normal((2, 3, 4)).astype(np.float32)),
+]
+
+for (name, shape, upper, k, x) in CASES:  # one .onnx model and two headers per case
+    # ── ONNX model: written to OUT_DIR as <name>.onnx ────────────────────────
+    model = make_trilu_model(shape, upper, k, name)
+    onnx_path = os.path.join(OUT_DIR, f"{name}.onnx")
+    onnx.save(model, onnx_path)
+    print(f"Saved {onnx_path}")
+
+    # ── Reference output (UTF-8 pinned: the header text has a non-ASCII dash) ─
+    y = trilu_ref(x, upper, k)
+    ref_path = os.path.join(REF_DIR, f"{name}.ref.hxx")
+    with open(ref_path, "w", encoding="utf-8") as f:
+        f.write(ref_header(name, y))
+    print(f"  → reference  {ref_path}  shape={list(y.shape)}")
+
+    # Also save the input so tests can reconstruct it (same UTF-8 pinning as above).
+    inp_path = os.path.join(REF_DIR, f"{name}_input.ref.hxx")
+    with open(inp_path, "w", encoding="utf-8") as f:
+        flat_in = x.flatten()
+        vals_in = ", ".join(f"{v:.8f}f" for v in flat_in)
+        f.write(
+            f"// Auto-generated by TriluModelGenerator.py — DO NOT EDIT\n"
+            f"#pragma once\n"
+            f"namespace {name}_Input {{\n"
+            f"   static float data[{flat_in.size}] = {{{vals_in}}};\n"
+            f"}} // namespace {name}_Input\n"
+        )
+    print(f"  → input ref  {inp_path}")
+
+print("\nAll Trilu test models and references generated successfully.")
diff --git a/test/input_models/Trilu_3D.onnx b/test/input_models/Trilu_3D.onnx
new file mode 100644
index 0000000..0a17c1b
Binary files /dev/null and b/test/input_models/Trilu_3D.onnx differ
diff --git a/test/input_models/Trilu_k2.onnx b/test/input_models/Trilu_k2.onnx
new file mode 100644
index 0000000..c484241
Binary files /dev/null and b/test/input_models/Trilu_k2.onnx differ
diff --git a/test/input_models/Trilu_kn1.onnx b/test/input_models/Trilu_kn1.onnx
new file mode 100644
index 0000000..c9865c3
Binary files /dev/null and b/test/input_models/Trilu_kn1.onnx differ
diff --git a/test/input_models/Trilu_lower.onnx b/test/input_models/Trilu_lower.onnx
new file mode 100644
index 0000000..9ac93d6
Binary files /dev/null and b/test/input_models/Trilu_lower.onnx differ
diff --git a/test/input_models/Trilu_upper.onnx b/test/input_models/Trilu_upper.onnx
new file mode 100644
index 0000000..637567b
Binary files /dev/null and b/test/input_models/Trilu_upper.onnx differ
diff --git a/src/SOFIE_core/test/input_models/Where.onnx b/test/input_models/Where.onnx
similarity index 100%
rename from src/SOFIE_core/test/input_models/Where.onnx
rename to test/input_models/Where.onnx
diff --git a/src/SOFIE_core/test/input_models/references/Add.ref.hxx b/test/input_models/references/Add.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Add.ref.hxx
rename to test/input_models/references/Add.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx b/test/input_models/references/AddBroadcast1.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast1.ref.hxx
rename to test/input_models/references/AddBroadcast1.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx b/test/input_models/references/AddBroadcast2.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast2.ref.hxx
rename to test/input_models/references/AddBroadcast2.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx b/test/input_models/references/AddBroadcast3.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast3.ref.hxx
rename to test/input_models/references/AddBroadcast3.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx b/test/input_models/references/AddBroadcast4.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast4.ref.hxx
rename to test/input_models/references/AddBroadcast4.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx b/test/input_models/references/AddBroadcast5.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast5.ref.hxx
rename to test/input_models/references/AddBroadcast5.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx b/test/input_models/references/AddBroadcast6.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast6.ref.hxx
rename to test/input_models/references/AddBroadcast6.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx b/test/input_models/references/AddBroadcast7.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AddBroadcast7.ref.hxx
rename to test/input_models/references/AddBroadcast7.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx b/test/input_models/references/AvgPool.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/AvgPool.ref.hxx
rename to test/input_models/references/AvgPool.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Cast.ref.hxx b/test/input_models/references/Cast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Cast.ref.hxx
rename to test/input_models/references/Cast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx b/test/input_models/references/ComplexTopK.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ComplexTopK.ref.hxx
rename to test/input_models/references/ComplexTopK.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Constant.ref.hxx b/test/input_models/references/Constant.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Constant.ref.hxx
rename to test/input_models/references/Constant.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx b/test/input_models/references/ConvTranspose1d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvTranspose1d.ref.hxx
rename to test/input_models/references/ConvTranspose1d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx b/test/input_models/references/ConvTranspose2d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvTranspose2d.ref.hxx
rename to test/input_models/references/ConvTranspose2d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx b/test/input_models/references/ConvTranspose3d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvTranspose3d.ref.hxx
rename to test/input_models/references/ConvTranspose3d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx b/test/input_models/references/ConvTransposeBias2d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2d.ref.hxx
rename to test/input_models/references/ConvTransposeBias2d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx b/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvTransposeBias2dBatched.ref.hxx
rename to test/input_models/references/ConvTransposeBias2dBatched.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx b/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithAsymmetricPadding.ref.hxx
rename to test/input_models/references/ConvWithAsymmetricPadding.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx b/test/input_models/references/ConvWithAutopadSameLower.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithAutopadSameLower.ref.hxx
rename to test/input_models/references/ConvWithAutopadSameLower.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx b/test/input_models/references/ConvWithPadding.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithPadding.ref.hxx
rename to test/input_models/references/ConvWithPadding.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx b/test/input_models/references/ConvWithStridesNoPadding.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithStridesNoPadding.ref.hxx
rename to test/input_models/references/ConvWithStridesNoPadding.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx b/test/input_models/references/ConvWithStridesPadding.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithStridesPadding.ref.hxx
rename to test/input_models/references/ConvWithStridesPadding.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx b/test/input_models/references/ConvWithoutPadding.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ConvWithoutPadding.ref.hxx
rename to test/input_models/references/ConvWithoutPadding.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Div.ref.hxx b/test/input_models/references/Div.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Div.ref.hxx
rename to test/input_models/references/Div.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Elu.ref.hxx b/test/input_models/references/Elu.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Elu.ref.hxx
rename to test/input_models/references/Elu.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Equal.ref.hxx b/test/input_models/references/Equal.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Equal.ref.hxx
rename to test/input_models/references/Equal.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Erf.ref.hxx b/test/input_models/references/Erf.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Erf.ref.hxx
rename to test/input_models/references/Erf.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Exp.ref.hxx b/test/input_models/references/Exp.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Exp.ref.hxx
rename to test/input_models/references/Exp.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx b/test/input_models/references/ExpandDiffSize.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ExpandDiffSize.ref.hxx
rename to test/input_models/references/ExpandDiffSize.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx b/test/input_models/references/ExpandSameSize.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ExpandSameSize.ref.hxx
rename to test/input_models/references/ExpandSameSize.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx b/test/input_models/references/EyeLike.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/EyeLike.ref.hxx
rename to test/input_models/references/EyeLike.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx b/test/input_models/references/GRUBatchwise.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GRUBatchwise.ref.hxx
rename to test/input_models/references/GRUBatchwise.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx b/test/input_models/references/GRUBidirectional.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GRUBidirectional.ref.hxx
rename to test/input_models/references/GRUBidirectional.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx b/test/input_models/references/GRUDefaults.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GRUDefaults.ref.hxx
rename to test/input_models/references/GRUDefaults.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx b/test/input_models/references/GRUInitialBias.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GRUInitialBias.ref.hxx
rename to test/input_models/references/GRUInitialBias.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx b/test/input_models/references/GRUSeqLength.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GRUSeqLength.ref.hxx
rename to test/input_models/references/GRUSeqLength.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx b/test/input_models/references/Gather2d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Gather2d.ref.hxx
rename to test/input_models/references/Gather2d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx b/test/input_models/references/GatherAxis0.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GatherAxis0.ref.hxx
rename to test/input_models/references/GatherAxis0.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx b/test/input_models/references/GatherAxis1.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GatherAxis1.ref.hxx
rename to test/input_models/references/GatherAxis1.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx b/test/input_models/references/GatherAxis2.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GatherAxis2.ref.hxx
rename to test/input_models/references/GatherAxis2.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx b/test/input_models/references/GatherAxis3.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GatherAxis3.ref.hxx
rename to test/input_models/references/GatherAxis3.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx b/test/input_models/references/GatherNegativeIndices.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GatherNegativeIndices.ref.hxx
rename to test/input_models/references/GatherNegativeIndices.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Greater.ref.hxx b/test/input_models/references/Greater.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Greater.ref.hxx
rename to test/input_models/references/Greater.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx b/test/input_models/references/GreaterOrEqual.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/GreaterOrEqual.ref.hxx
rename to test/input_models/references/GreaterOrEqual.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx b/test/input_models/references/LSTMBatchwise.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LSTMBatchwise.ref.hxx
rename to test/input_models/references/LSTMBatchwise.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx b/test/input_models/references/LSTMBidirectional.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LSTMBidirectional.ref.hxx
rename to test/input_models/references/LSTMBidirectional.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx b/test/input_models/references/LSTMDefaults.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LSTMDefaults.ref.hxx
rename to test/input_models/references/LSTMDefaults.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx b/test/input_models/references/LSTMInitialBias.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LSTMInitialBias.ref.hxx
rename to test/input_models/references/LSTMInitialBias.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx b/test/input_models/references/LSTMPeepholes.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LSTMPeepholes.ref.hxx
rename to test/input_models/references/LSTMPeepholes.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx b/test/input_models/references/LayerNormalization2d.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LayerNormalization2d.hxx
rename to test/input_models/references/LayerNormalization2d.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx b/test/input_models/references/LayerNormalization4d.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LayerNormalization4d.hxx
rename to test/input_models/references/LayerNormalization4d.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Less.ref.hxx b/test/input_models/references/Less.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Less.ref.hxx
rename to test/input_models/references/Less.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx b/test/input_models/references/LessOrEqual.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LessOrEqual.ref.hxx
rename to test/input_models/references/LessOrEqual.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx b/test/input_models/references/LinearWithLeakyRelu.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LinearWithLeakyRelu.ref.hxx
rename to test/input_models/references/LinearWithLeakyRelu.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx b/test/input_models/references/LinearWithSelu.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LinearWithSelu.ref.hxx
rename to test/input_models/references/LinearWithSelu.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx b/test/input_models/references/LinearWithSigmoid.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/LinearWithSigmoid.ref.hxx
rename to test/input_models/references/LinearWithSigmoid.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx b/test/input_models/references/Linear_16.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Linear_16.ref.hxx
rename to test/input_models/references/Linear_16.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx b/test/input_models/references/Linear_32.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Linear_32.ref.hxx
rename to test/input_models/references/Linear_32.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx b/test/input_models/references/Linear_64.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Linear_64.ref.hxx
rename to test/input_models/references/Linear_64.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Log.ref.hxx b/test/input_models/references/Log.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Log.ref.hxx
rename to test/input_models/references/Log.ref.hxx
diff --git a/test/input_models/references/Logic_And.ref.hxx b/test/input_models/references/Logic_And.ref.hxx
new file mode 100644
index 0000000..02b36dc
--- /dev/null
+++ b/test/input_models/references/Logic_And.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_And_ExpectedOutput {
+   static uint8_t outputs[16] = {0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0};
+} // namespace Logic_And_ExpectedOutput
diff --git a/test/input_models/references/Logic_And_input.ref.hxx b/test/input_models/references/Logic_And_input.ref.hxx
new file mode 100644
index 0000000..0caf6a7
--- /dev/null
+++ b/test/input_models/references/Logic_And_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_And_Input {
+   static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1};
+   static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0};
+} // namespace Logic_And_Input
diff --git a/test/input_models/references/Logic_BitwiseAnd.ref.hxx b/test/input_models/references/Logic_BitwiseAnd.ref.hxx
new file mode 100644
index 0000000..d8c16af
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseAnd.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseAnd_ExpectedOutput {
+   static int32_t outputs[15] = {14, 2, 5, 32, 66, 64, -47, 26, -92, 1, 40, 32, 32, 84, 48};
+} // namespace Logic_BitwiseAnd_ExpectedOutput
diff --git a/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx b/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx
new file mode 100644
index 0000000..99c049b
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseAnd_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseAnd_Input {
+   static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11};
+   static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48};
+} // namespace Logic_BitwiseAnd_Input
diff --git a/test/input_models/references/Logic_BitwiseNot.ref.hxx b/test/input_models/references/Logic_BitwiseNot.ref.hxx
new file mode 100644
index 0000000..a8de28f
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseNot.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseNot_ExpectedOutput {
+   static int32_t outputs[24] = {26, -94, 17, 34, -82, 25, 84, 6, -60, 62, 7, 74, -38, 4, 33, 54, -13, -34, -89, 12, 67, -67, -26, -41};
+} // namespace Logic_BitwiseNot_ExpectedOutput
diff --git a/test/input_models/references/Logic_BitwiseNot_input.ref.hxx b/test/input_models/references/Logic_BitwiseNot_input.ref.hxx
new file mode 100644
index 0000000..34408f7
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseNot_input.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseNot_Input {
+   static int32_t data_a[24] = {-27, 93, -18, -35, 81, -26, -85, -7, 59, -63, -8, -75, 37, -5, -34, -55, 12, 33, 88, -13, -68, 66, 25, 40};
+} // namespace Logic_BitwiseNot_Input
diff --git a/test/input_models/references/Logic_BitwiseOr.ref.hxx b/test/input_models/references/Logic_BitwiseOr.ref.hxx
new file mode 100644
index 0000000..25ff8ac
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseOr.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseOr_ExpectedOutput {
+   static int32_t outputs[15] = {-18, 43, 127, -69, -57, -27, -5, -1, -67, 59, -70, -14, -83, 94, -11};
+} // namespace Logic_BitwiseOr_ExpectedOutput
diff --git a/test/input_models/references/Logic_BitwiseOr_input.ref.hxx b/test/input_models/references/Logic_BitwiseOr_input.ref.hxx
new file mode 100644
index 0000000..c77c05e
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseOr_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseOr_Input {
+   static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11};
+   static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48};
+} // namespace Logic_BitwiseOr_Input
diff --git a/test/input_models/references/Logic_BitwiseXor.ref.hxx b/test/input_models/references/Logic_BitwiseXor.ref.hxx
new file mode 100644
index 0000000..0885a6a
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseXor.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseXor_ExpectedOutput {
+   static int32_t outputs[15] = {-32, 41, 122, -101, -123, -91, 42, -27, 25, 58, -110, -46, -115, 10, -59};
+} // namespace Logic_BitwiseXor_ExpectedOutput
diff --git a/test/input_models/references/Logic_BitwiseXor_input.ref.hxx b/test/input_models/references/Logic_BitwiseXor_input.ref.hxx
new file mode 100644
index 0000000..dae7b9c
--- /dev/null
+++ b/test/input_models/references/Logic_BitwiseXor_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_BitwiseXor_Input {
+   static int32_t data_a[15] = {-82, 10, 77, -88, 71, 65, -45, 26, -67, 51, 40, -30, -87, 94, -11};
+   static int32_t data_b[15] = {78, 35, 55, 51, -62, -28, -7, -1, -92, 9, -70, 48, 36, 84, 48};
+} // namespace Logic_BitwiseXor_Input
diff --git a/test/input_models/references/Logic_Or.ref.hxx b/test/input_models/references/Logic_Or.ref.hxx
new file mode 100644
index 0000000..311de40
--- /dev/null
+++ b/test/input_models/references/Logic_Or.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_Or_ExpectedOutput {
+   static uint8_t outputs[16] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+} // namespace Logic_Or_ExpectedOutput
diff --git a/test/input_models/references/Logic_Or_input.ref.hxx b/test/input_models/references/Logic_Or_input.ref.hxx
new file mode 100644
index 0000000..6d3bd98
--- /dev/null
+++ b/test/input_models/references/Logic_Or_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_Or_Input {
+   static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1};
+   static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0};
+} // namespace Logic_Or_Input
diff --git a/test/input_models/references/Logic_Xor.ref.hxx b/test/input_models/references/Logic_Xor.ref.hxx
new file mode 100644
index 0000000..af57d25
--- /dev/null
+++ b/test/input_models/references/Logic_Xor.ref.hxx
@@ -0,0 +1,6 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_Xor_ExpectedOutput {
+   static uint8_t outputs[16] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1};
+} // namespace Logic_Xor_ExpectedOutput
diff --git a/test/input_models/references/Logic_Xor_input.ref.hxx b/test/input_models/references/Logic_Xor_input.ref.hxx
new file mode 100644
index 0000000..51d62e3
--- /dev/null
+++ b/test/input_models/references/Logic_Xor_input.ref.hxx
@@ -0,0 +1,7 @@
+// Auto-generated by LogicModelGenerator.py — DO NOT EDIT
+#pragma once
+#include <cstdint>
+namespace Logic_Xor_Input {
+   static uint8_t data_a[16] = {0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1};
+   static uint8_t data_b[16] = {1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0};
+} // namespace Logic_Xor_Input
diff --git a/src/SOFIE_core/test/input_models/references/Max.ref.hxx b/test/input_models/references/Max.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Max.ref.hxx
rename to test/input_models/references/Max.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx
rename to test/input_models/references/MaxMultidirectionalBroadcast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx b/test/input_models/references/MaxPool1d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MaxPool1d.ref.hxx
rename to test/input_models/references/MaxPool1d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx b/test/input_models/references/MaxPool2d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MaxPool2d.ref.hxx
rename to test/input_models/references/MaxPool2d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx b/test/input_models/references/MaxPool3d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MaxPool3d.ref.hxx
rename to test/input_models/references/MaxPool3d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx
rename to test/input_models/references/MeanMultidirectionalBroadcast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx b/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/MinMultidirectionalBroadcast.ref.hxx
rename to test/input_models/references/MinMultidirectionalBroadcast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Mul.ref.hxx b/test/input_models/references/Mul.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Mul.ref.hxx
rename to test/input_models/references/Mul.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Neg.ref.hxx b/test/input_models/references/Neg.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Neg.ref.hxx
rename to test/input_models/references/Neg.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Pow.ref.hxx b/test/input_models/references/Pow.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Pow.ref.hxx
rename to test/input_models/references/Pow.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx b/test/input_models/references/Pow_broadcast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Pow_broadcast.ref.hxx
rename to test/input_models/references/Pow_broadcast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx b/test/input_models/references/RNNBatchwise.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNBatchwise.ref.hxx
rename to test/input_models/references/RNNBatchwise.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx b/test/input_models/references/RNNBidirectional.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNBidirectional.ref.hxx
rename to test/input_models/references/RNNBidirectional.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx b/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNBidirectionalBatchwise.ref.hxx
rename to test/input_models/references/RNNBidirectionalBatchwise.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx b/test/input_models/references/RNNDefaults.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNDefaults.ref.hxx
rename to test/input_models/references/RNNDefaults.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx b/test/input_models/references/RNNSeqLength.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNSeqLength.ref.hxx
rename to test/input_models/references/RNNSeqLength.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx b/test/input_models/references/RNNSequence.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNSequence.ref.hxx
rename to test/input_models/references/RNNSequence.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx b/test/input_models/references/RNNSequenceBatchwise.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RNNSequenceBatchwise.ref.hxx
rename to test/input_models/references/RNNSequenceBatchwise.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx b/test/input_models/references/RangeFloat.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RangeFloat.ref.hxx
rename to test/input_models/references/RangeFloat.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx b/test/input_models/references/RangeInt.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/RangeInt.ref.hxx
rename to test/input_models/references/RangeInt.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx b/test/input_models/references/Reciprocal.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Reciprocal.ref.hxx
rename to test/input_models/references/Reciprocal.ref.hxx
diff --git a/test/input_models/references/ReduceL2.ref.hxx b/test/input_models/references/ReduceL2.ref.hxx
new file mode 100644
index 0000000..079b68b
--- /dev/null
+++ b/test/input_models/references/ReduceL2.ref.hxx
@@ -0,0 +1,9 @@
+namespace ReduceL2_ExpectedOutput{
+   // Input [1,2,3] = {5,2,3,5,5,4}, ReduceL2 over axis=1, keepdims=0 → shape [1,3]
+   // col0: sqrt(5^2+5^2)=sqrt(50), col1: sqrt(2^2+5^2)=sqrt(29), col2: sqrt(3^2+4^2)=5
+   float output[] = {
+      7.0710678118654755f,
+      5.385164807134504f,
+      5.0f
+   };
+} // namespace ReduceL2_ExpectedOutput
diff --git a/test/input_models/references/ReduceMax.ref.hxx b/test/input_models/references/ReduceMax.ref.hxx
new file mode 100644
index 0000000..b986048
--- /dev/null
+++ b/test/input_models/references/ReduceMax.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace ReduceMax_ExpectedOutput {
+   static float output[3] = {5.00000000f, 5.00000000f, 4.00000000f};
+} // namespace ReduceMax_ExpectedOutput
diff --git a/test/input_models/references/ReduceMax_axis0.ref.hxx b/test/input_models/references/ReduceMax_axis0.ref.hxx
new file mode 100644
index 0000000..1d16a5a
--- /dev/null
+++ b/test/input_models/references/ReduceMax_axis0.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace ReduceMax_axis0_ExpectedOutput {
+   static float output[4] = {0.30471709f, -0.85304391f, 0.87939799f, 0.94056469f};
+} // namespace ReduceMax_axis0_ExpectedOutput
diff --git a/test/input_models/references/ReduceMax_mid.ref.hxx b/test/input_models/references/ReduceMax_mid.ref.hxx
new file mode 100644
index 0000000..4d88eaf
--- /dev/null
+++ b/test/input_models/references/ReduceMax_mid.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by ReduceMaxModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace ReduceMax_mid_ExpectedOutput {
+   static float output[8] = {0.36875078f, 1.12724125f, 1.22254133f, -0.04992591f, 0.41273260f, 0.43082100f, 2.14164758f, 1.12897229f};
+} // namespace ReduceMax_mid_ExpectedOutput
diff --git a/src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx b/test/input_models/references/ReduceMean.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ReduceMean.ref.hxx
rename to test/input_models/references/ReduceMean.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx b/test/input_models/references/ReduceProd.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/ReduceProd.ref.hxx
rename to test/input_models/references/ReduceProd.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Shape.ref.hxx b/test/input_models/references/Shape.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Shape.ref.hxx
rename to test/input_models/references/Shape.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Slice.ref.hxx b/test/input_models/references/Slice.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Slice.ref.hxx
rename to test/input_models/references/Slice.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx b/test/input_models/references/Slice_Default_Axis.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Slice_Default_Axis.ref.hxx
rename to test/input_models/references/Slice_Default_Axis.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx b/test/input_models/references/Slice_Default_Steps.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Slice_Default_Steps.ref.hxx
rename to test/input_models/references/Slice_Default_Steps.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx b/test/input_models/references/Slice_Neg.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Slice_Neg.ref.hxx
rename to test/input_models/references/Slice_Neg.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx b/test/input_models/references/Softmax1d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Softmax1d.ref.hxx
rename to test/input_models/references/Softmax1d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx b/test/input_models/references/Softmax2d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Softmax2d.ref.hxx
rename to test/input_models/references/Softmax2d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx b/test/input_models/references/Softmax3d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Softmax3d.ref.hxx
rename to test/input_models/references/Softmax3d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx b/test/input_models/references/Softmax4d.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Softmax4d.ref.hxx
rename to test/input_models/references/Softmax4d.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx b/test/input_models/references/Sqrt.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Sqrt.ref.hxx
rename to test/input_models/references/Sqrt.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Sub.ref.hxx b/test/input_models/references/Sub.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Sub.ref.hxx
rename to test/input_models/references/Sub.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx b/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/SumMultidirectionalBroadcast.ref.hxx
rename to test/input_models/references/SumMultidirectionalBroadcast.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Tanh.ref.hxx b/test/input_models/references/Tanh.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Tanh.ref.hxx
rename to test/input_models/references/Tanh.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx b/test/input_models/references/Tile5D.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/Tile5D.ref.hxx
rename to test/input_models/references/Tile5D.ref.hxx
diff --git a/src/SOFIE_core/test/input_models/references/TopK.ref.hxx b/test/input_models/references/TopK.ref.hxx
similarity index 100%
rename from src/SOFIE_core/test/input_models/references/TopK.ref.hxx
rename to test/input_models/references/TopK.ref.hxx
diff --git a/test/input_models/references/Trilu_3D.ref.hxx b/test/input_models/references/Trilu_3D.ref.hxx
new file mode 100644
index 0000000..ac94083
--- /dev/null
+++ b/test/input_models/references/Trilu_3D.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_3D_ExpectedOutput {
+   static float outputs[24] = {0.16275306f, 0.58622235f, 0.71122658f, 0.79334724f, 0.00000000f, -0.46235180f, 0.85797590f, -0.19130433f, 0.00000000f, 0.00000000f, -0.91945231f, 0.49716073f, 0.14242573f, 0.69048536f, -0.42725265f, 0.15853970f, 0.00000000f, -0.30934653f, 0.45677525f, -0.66192591f, 0.00000000f, 0.00000000f, -1.19583964f, 0.48697248f};
+} // namespace Trilu_3D_ExpectedOutput
diff --git a/test/input_models/references/Trilu_3D_input.ref.hxx b/test/input_models/references/Trilu_3D_input.ref.hxx
new file mode 100644
index 0000000..257553f
--- /dev/null
+++ b/test/input_models/references/Trilu_3D_input.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_3D_Input {
+   static float data[24] = {0.16275306f, 0.58622235f, 0.71122658f, 0.79334724f, -0.34872508f, -0.46235180f, 0.85797590f, -0.19130433f, -1.27568626f, -1.13328719f, -0.91945231f, 0.49716073f, 0.14242573f, 0.69048536f, -0.42725265f, 0.15853970f, 0.62559038f, -0.30934653f, 0.45677525f, -0.66192591f, -0.36305386f, -0.38173789f, -1.19583964f, 0.48697248f};
+} // namespace Trilu_3D_Input
diff --git a/test/input_models/references/Trilu_k2.ref.hxx b/test/input_models/references/Trilu_k2.ref.hxx
new file mode 100644
index 0000000..b8828e0
--- /dev/null
+++ b/test/input_models/references/Trilu_k2.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_k2_ExpectedOutput {
+   static float outputs[15] = {0.00000000f, 0.00000000f, 0.61597943f, 1.12897229f, -0.11394746f, 0.00000000f, 0.00000000f, 0.00000000f, 0.74325418f, 0.54315424f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 0.87142879f};
+} // namespace Trilu_k2_ExpectedOutput
diff --git a/test/input_models/references/Trilu_k2_input.ref.hxx b/test/input_models/references/Trilu_k2_input.ref.hxx
new file mode 100644
index 0000000..7078b76
--- /dev/null
+++ b/test/input_models/references/Trilu_k2_input.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_k2_Input {
+   static float data[15] = {-0.51224273f, -0.81377274f, 0.61597943f, 1.12897229f, -0.11394746f, -0.84015650f, -0.82448119f, 0.65059280f, 0.74325418f, 0.54315424f, -0.66550970f, 0.23216133f, 0.11668581f, 0.21868859f, 0.87142879f};
+} // namespace Trilu_k2_Input
diff --git a/test/input_models/references/Trilu_kn1.ref.hxx b/test/input_models/references/Trilu_kn1.ref.hxx
new file mode 100644
index 0000000..52fc33c
--- /dev/null
+++ b/test/input_models/references/Trilu_kn1.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_kn1_ExpectedOutput {
+   static float outputs[15] = {0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, -1.45715582f, 0.00000000f, 0.00000000f, 0.00000000f, 0.00000000f, 1.49494135f, -0.86583114f, 0.00000000f, 0.00000000f, 0.00000000f};
+} // namespace Trilu_kn1_ExpectedOutput
diff --git a/test/input_models/references/Trilu_kn1_input.ref.hxx b/test/input_models/references/Trilu_kn1_input.ref.hxx
new file mode 100644
index 0000000..4852760
--- /dev/null
+++ b/test/input_models/references/Trilu_kn1_input.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_kn1_Input {
+   static float data[15] = {0.22359554f, 0.67891353f, 0.06757907f, 0.28911939f, 0.63128823f, -1.45715582f, -0.31967121f, -0.47037265f, -0.63887787f, -0.27514225f, 1.49494135f, -0.86583114f, 0.96827835f, -1.68286979f, -0.33488503f};
+} // namespace Trilu_kn1_Input
diff --git a/test/input_models/references/Trilu_lower.ref.hxx b/test/input_models/references/Trilu_lower.ref.hxx
new file mode 100644
index 0000000..dcacea7
--- /dev/null
+++ b/test/input_models/references/Trilu_lower.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_lower_ExpectedOutput {
+   static float outputs[16] = {0.36875078f, 0.00000000f, 0.00000000f, 0.00000000f, -0.18486236f, -0.68092954f, 0.00000000f, 0.00000000f, -0.42832783f, -0.35213354f, 0.53230917f, 0.00000000f, 0.41273260f, 0.43082100f, 2.14164758f, -0.40641502f};
+} // namespace Trilu_lower_ExpectedOutput
diff --git a/test/input_models/references/Trilu_lower_input.ref.hxx b/test/input_models/references/Trilu_lower_input.ref.hxx
new file mode 100644
index 0000000..ca44378
--- /dev/null
+++ b/test/input_models/references/Trilu_lower_input.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_lower_Input {
+   static float data[16] = {0.36875078f, -0.95888263f, 0.87845027f, -0.04992591f, -0.18486236f, -0.68092954f, 1.22254133f, -0.15452948f, -0.42832783f, -0.35213354f, 0.53230917f, 0.36544406f, 0.41273260f, 0.43082100f, 2.14164758f, -0.40641502f};
+} // namespace Trilu_lower_Input
diff --git a/test/input_models/references/Trilu_upper.ref.hxx b/test/input_models/references/Trilu_upper.ref.hxx
new file mode 100644
index 0000000..920861b
--- /dev/null
+++ b/test/input_models/references/Trilu_upper.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_upper_ExpectedOutput {
+   static float outputs[16] = {0.30471709f, -1.03998411f, 0.75045121f, 0.94056469f, 0.00000000f, -1.30217946f, 0.12784040f, -0.31624261f, 0.00000000f, 0.00000000f, 0.87939799f, 0.77779192f, 0.00000000f, 0.00000000f, 0.00000000f, -0.85929245f};
+} // namespace Trilu_upper_ExpectedOutput
diff --git a/test/input_models/references/Trilu_upper_input.ref.hxx b/test/input_models/references/Trilu_upper_input.ref.hxx
new file mode 100644
index 0000000..cae440f
--- /dev/null
+++ b/test/input_models/references/Trilu_upper_input.ref.hxx
@@ -0,0 +1,5 @@
+// Auto-generated by TriluModelGenerator.py — DO NOT EDIT
+#pragma once
+namespace Trilu_upper_Input {
+   static float data[16] = {0.30471709f, -1.03998411f, 0.75045121f, 0.94056469f, -1.95103514f, -1.30217946f, 0.12784040f, -0.31624261f, -0.01680116f, -0.85304391f, 0.87939799f, 0.77779192f, 0.06603070f, 1.12724125f, 0.46750933f, -0.85929245f};
+} // namespace Trilu_upper_Input
diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt
new file mode 100644
index 0000000..36cfc55
--- /dev/null
+++ b/utils/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_library(utils INTERFACE)
+# INTERFACE library: headers come from this source directory when building, from the install include dir afterwards.
+target_include_directories(utils INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+# Add the target to the SOFIETargets export set.
+install(TARGETS utils
+  EXPORT SOFIETargets
+)
+# Install the SOFIE/ directory of this source tree into the include destination.
+install(
+    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
diff --git a/utils/SOFIE/RTensor.hxx b/utils/SOFIE/RTensor.hxx
new file mode 100644
index 0000000..db82dc9
--- /dev/null
+++ b/utils/SOFIE/RTensor.hxx
@@ -0,0 +1,628 @@
+#ifndef SOFIE_RTENSOR
+#define SOFIE_RTENSOR
+
+#include <vector>
+#include <cstddef>     // std::size_t
+#include <cstdint>
+#include <stdexcept>   // std::runtime_error
+#include <sstream>     // std::stringstream
+#include <memory>      // std::shared_ptr
+#include <type_traits> // std::is_convertible
+#include <algorithm>   // std::reverse
+#include <iterator>    // std::random_access_iterator_tag
+
+namespace SOFIE {
+
+/// Memory layout of the contiguous storage: row-major (last index has stride 1) or column-major (first index has stride 1)
+enum class MemoryLayout : uint8_t {
+   RowMajor = 0x01,
+   ColumnMajor = 0x02
+};
+
+namespace Internal {
+
+/// \brief Get size of tensor from shape vector
+/// \param[in] shape Shape vector
+/// \return Number of elements: the product of all dimensions,
+///         or 0 when the shape has no dimensions at all
+template <typename T>
+inline std::size_t GetSizeFromShape(const T &shape)
+{
+   std::size_t total = (shape.size() == 0) ? 0 : 1;
+   for (const auto &dim : shape) {
+      total *= dim;
+   }
+   return total;
+}
+
+/// \brief Compute strides from shape vector.
+/// \param[in] shape Shape vector
+/// \param[in] layout Memory layout
+/// \return Strides vector, one entry per dimension, counted in elements
+///
+/// The strides drive the multi-dimensional indexing, e.g. shape {2, 3, 4} yields
+/// {12, 4, 1} for row-major and {1, 2, 6} for column-major storage. See also:
+/// https://en.wikipedia.org/wiki/Row-_and_column-major_order
+/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html
+template <typename T>
+inline std::vector<std::size_t> ComputeStridesFromShape(const T &shape, MemoryLayout layout)
+{
+   const auto rank = shape.size();
+   T result(rank);
+   switch (layout) {
+   case MemoryLayout::RowMajor:
+      // The last dimension has stride 1; accumulate towards the first one.
+      for (std::size_t d = rank; d-- > 0;) {
+         result[d] = (d + 1 == rank) ? 1 : result[d + 1] * shape[d + 1];
+      }
+      break;
+   case MemoryLayout::ColumnMajor:
+      // The first dimension has stride 1; accumulate towards the last one.
+      for (std::size_t d = 0; d < rank; ++d) {
+         result[d] = (d == 0) ? 1 : result[d - 1] * shape[d - 1];
+      }
+      break;
+   default: {
+      // Only reachable with a value that is not one of the two enumerators.
+      std::stringstream msg;
+      msg << "Memory layout type is not valid for calculating strides.";
+      throw std::runtime_error(msg.str());
+   }
+   }
+   return result;
+}
+
+/// \brief Compute indices from global index (handles row-major and column-major layouts)
+/// \param[in] shape Shape vector
+/// \param[in] layout Memory layout
+/// \param[in] idx Global index
+/// \return Index vector; every entry is within its dimension as long as idx is below the total size
+template <typename T>
+inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx)
+{
+    const auto strides = ComputeStridesFromShape(shape, layout);
+    T indices(shape.size());
+    auto r = idx;
+    for (std::size_t i = 0, n = shape.size(); i < n; i++) {
+        const auto d = (layout == MemoryLayout::ColumnMajor) ? n - 1 - i : i; // visit the largest stride first
+        indices[d] = r / strides[d]; // no int() cast: it truncated indices above INT_MAX
+        r %= strides[d];
+    }
+    return indices;
+}
+
+/// \brief Compute global index from indices
+/// \param[in] strides Strides vector
+/// \param[in] idx Index vector
+/// \return Global index, i.e. the sum over all dimensions d of strides[d] * idx[d]
+template <typename U, typename V>
+inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx)
+{
+   std::size_t offset = 0;
+   // Only idx.size() entries are read, so strides needs at least that many elements.
+   for (std::size_t d = 0; d < idx.size(); ++d) {
+      offset += strides[d] * idx[d];
+   }
+   return offset;
+}
+
+/// \brief Logical AND over a pack of bool-convertible trait types (e.g. std::is_convertible<...>); true for an empty pack
+template <class... Ts>
+struct and_types : std::true_type {
+};
+/// Recursive case: the first type converted to bool, AND-ed with the conjunction of the remaining types
+template <class T0, class... Ts>
+struct and_types<T0, Ts...> : std::integral_constant<bool, T0() && and_types<Ts...>()> {
+};
+
+/// \brief Copy slice of a tensor recursively from here to there
+/// \param[in] here Source tensor
+/// \param[out] there Target tensor receiving the slice; indexed relative to mins
+/// \param[in] mins Minimum of indices for each dimension (inclusive)
+/// \param[in] maxs Maximum of indices for each dimension (exclusive)
+/// \param[in] idx Current indices; passed by value, so each recursion level works on its own copy
+/// \param[in] active Dimension iterated by this call; the last dimension ends the recursion
+///
+/// Every call loops over the slice range of one dimension and recurses into the next one.
+/// At the last dimension the element is written to the target at the indices shifted by mins.
+template <typename T>
+void RecursiveCopy(const T &here, T &there,
+                   const std::vector<std::size_t> &mins, const std::vector<std::size_t> &maxs,
+                   std::vector<std::size_t> idx, std::size_t active)
+{
+   const bool innermost = (active + 1 == idx.size());
+   for (idx[active] = mins[active]; idx[active] < maxs[active]; ++idx[active]) {
+      if (!innermost) {
+         Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1);
+         continue;
+      }
+      // Translate the source indices into the coordinates of the slice.
+      auto idxThere = idx;
+      for (std::size_t j = 0; j < idxThere.size(); j++) {
+         idxThere[j] -= mins[j];
+      }
+      there(idxThere) = here(idx);
+   }
+}
+
+} // namespace SOFIE::Internal
+
+/// \class SOFIE::RTensor
+/// \brief RTensor is a container with contiguous memory and shape information.
+/// \tparam T Data-type of the tensor
+///
+/// An RTensor is a vector-like container, which has additional shape information.
+/// The elements of the multi-dimensional container can be accessed by their
+/// indices in a coherent way without taking care about the one-dimensional memory
+/// layout of the contiguous storage. This also allows to manipulate the shape
+/// of the container without moving the actual elements in memory. Another feature
+/// is that an RTensor can own the underlying contiguous memory but can also represent
+/// only a view on existing data without owning it.
+template <typename V, typename C = std::vector<V>>
+class RTensor {
+public:
+   // Typedefs
+   using Value_t = V;
+   using Shape_t = std::vector<std::size_t>;
+   using Index_t = Shape_t;
+   using Slice_t = std::vector<Shape_t>;
+   using Container_t = C;
+
+private:
+   Shape_t fShape;
+   Shape_t fStrides;
+   std::size_t fSize;
+   MemoryLayout fLayout;
+   Value_t *fData;
+   std::shared_ptr<Container_t> fContainer;
+
+protected:
+   void ReshapeInplace(const Shape_t &shape);
+
+public:
+   // Constructors
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+   }
+
+   /// \brief Construct a tensor as view on data
+   /// \param[in] data Pointer to data contiguous in memory
+   /// \param[in] shape Shape vector
+   /// \param[in] strides Strides vector
+   /// \param[in] layout Memory layout
+   RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+   }
+
+   /// \brief Construct a tensor owning externally provided data
+   /// \param[in] container Shared pointer to data container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(std::shared_ptr<Container_t> container, Shape_t shape,
+           MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout), fContainer(container)
+   {
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fData = std::data(*fContainer);
+   }
+
+   /// \brief Construct a tensor owning data initialized with new container
+   /// \param[in] shape Shape vector
+   /// \param[in] layout Memory layout
+   RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor)
+      : fShape(shape), fLayout(layout)
+   {
+      // TODO: Document how data pointer is determined using STL iterator interface.
+      // TODO: Sanitize given container type with type traits
+      fSize = Internal::GetSizeFromShape(shape);
+      fStrides = Internal::ComputeStridesFromShape(shape, layout);
+      fContainer = std::make_shared<Container_t>(fSize);
+      fData = std::data(*fContainer);
+   }
+
+   // Access elements
+   Value_t &operator()(const Index_t &idx);
+   const Value_t &operator() (const Index_t &idx) const;
+   template <typename... Idx> Value_t &operator()(Idx... idx);
+   template <typename... Idx> const Value_t &operator() (Idx... idx) const;
+
+   // Access properties
+   std::size_t GetSize() const { return fSize; }
+   const Shape_t &GetShape() const { return fShape; }
+   const Shape_t &GetStrides() const { return fStrides; }
+   Value_t *GetData() { return fData; }
+   const Value_t *GetData() const { return fData; }
+   std::shared_ptr<Container_t> GetContainer() { return fContainer; }
+   const std::shared_ptr<Container_t> GetContainer() const { return fContainer; }
+   MemoryLayout GetMemoryLayout() const { return fLayout; }
+   bool IsView() const { return fContainer == nullptr; }
+   bool IsOwner() const { return !IsView(); }
+
+   // Copy
+   // Copies all elements into a new tensor with the requested memory layout.
+   RTensor<Value_t, Container_t> Copy(MemoryLayout layout = MemoryLayout::RowMajor) const;
+
+   // Transformations
+   // Transpose: swaps the memory layout and reverses shape and strides; the
+   //            result is built from the data pointer of this tensor.
+   RTensor<Value_t, Container_t> Transpose() const;
+   // Squeeze: removes all dimensions of size one (keeps one if all are one).
+   RTensor<Value_t, Container_t> Squeeze() const;
+   // ExpandDims: inserts a dimension of size one at position idx; a negative
+   //             idx is counted from the end of the expanded shape.
+   RTensor<Value_t, Container_t> ExpandDims(int idx) const;
+   // Reshape: returns a copy of this object with a new shape of the same
+   //          overall size; throws std::runtime_error if the sizes differ.
+   RTensor<Value_t, Container_t> Reshape(const Shape_t &shape) const;
+   // Resize: returns a new tensor of the given shape and copies as many
+   //         elements as fit into both the old and the new tensor.
+   RTensor<Value_t, Container_t> Resize(const Shape_t &shape);
+   // Slice: returns the subset selected by one pair of indices per dimension;
+   //        the result is squeezed.
+   RTensor<Value_t, Container_t> Slice(const Slice_t &slice);
+
+   // Iterator class
+   /// \brief Random access iterator over the elements of an RTensor
+   /// The iterator holds a pointer to the tensor and a global index. On
+   /// dereferencing, the global index is translated into a multi-dimensional
+   /// index with the shape and memory layout of the tensor, and the element is
+   /// fetched through RTensor::operator().
+   /// Comparisons only look at the global index; comparing iterators obtained
+   /// from different tensors is not meaningful.
+   class Iterator {
+   private:
+      // Held as pointer (not reference) so that the iterator stays
+      // copy-assignable, which random access iterators are required to be.
+      RTensor<Value_t, Container_t> *fTensor;
+      typename Index_t::value_type fGlobalIndex;
+   public:
+      using iterator_category = std::random_access_iterator_tag;
+      using value_type = Value_t;
+      using difference_type = std::ptrdiff_t;
+      using pointer = Value_t *;
+      using reference = Value_t &;
+
+      /// Construct an iterator on tensor \p x positioned at global index \p idx
+      Iterator(RTensor<Value_t, Container_t>& x, typename Index_t::value_type idx) : fTensor(&x), fGlobalIndex(idx) {}
+      Iterator& operator++() { fGlobalIndex++; return *this; }
+      Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; }
+      Iterator& operator--() { fGlobalIndex--; return *this; }
+      Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; }
+      Iterator operator+(difference_type rhs) const { return Iterator(*fTensor, fGlobalIndex + rhs); }
+      Iterator operator-(difference_type rhs) const { return Iterator(*fTensor, fGlobalIndex - rhs); }
+      /// Signed distance between two iterators, computed from their global indices
+      difference_type operator-(const Iterator& rhs) const
+      {
+         return static_cast<difference_type>(fGlobalIndex) - static_cast<difference_type>(rhs.fGlobalIndex);
+      }
+      Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; }
+      Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; }
+      /// Access the element at the current position
+      Value_t& operator*() const
+      {
+         auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor->GetShape(), fTensor->GetMemoryLayout(), fGlobalIndex);
+         return (*fTensor)(idx);
+      }
+      bool operator==(const Iterator& rhs) const { return fGlobalIndex == rhs.fGlobalIndex; }
+      bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
+      bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.fGlobalIndex; }
+      bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.fGlobalIndex; }
+      bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.fGlobalIndex; }
+      bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.fGlobalIndex; }
+      /// \return Current position as global index
+      typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; }
+   };
+
+   // Iterator interface
+   // Positions are global indices running from 0 (begin) to fSize (end).
+   // TODO: Document that the iterator always iterates following the physical memory layout.
+   Iterator begin() noexcept
+   {
+      Iterator first(*this, 0);
+      return first;
+   }
+   Iterator end() noexcept
+   {
+      Iterator pastLast(*this, fSize);
+      return pastLast;
+   }
+};
+
+/// \brief Reshape tensor in place
+/// \param[in] shape Shape vector
+/// \throws std::runtime_error if the size implied by \p shape differs from the
+///         current size of the tensor
+/// Reshape tensor without changing the overall size. The shape is replaced and
+/// the strides are recomputed from the new shape and the current memory layout.
+template <typename Value_t, typename Container_t>
+inline void RTensor<Value_t, Container_t>::ReshapeInplace(const Shape_t &shape)
+{
+   const auto size = Internal::GetSizeFromShape(shape);
+   if (size != fSize) {
+      std::stringstream ss;
+      ss << "Cannot reshape tensor with size " << fSize << " into shape {";
+      for (std::size_t i = 0; i < shape.size(); i++) {
+         ss << (i == 0 ? " " : ", ") << shape[i];
+      }
+      // Always close the brace, also for an empty shape
+      ss << " }.";
+      throw std::runtime_error(ss.str());
+   }
+
+   // Compute new strides from shape before touching any member
+   auto strides = Internal::ComputeStridesFromShape(shape, fLayout);
+   fShape = shape;
+   fStrides = strides;
+}
+
+
+/// \brief Element access by index vector
+/// \param[in] idx Index vector
+/// \return Reference to the addressed element
+/// The global index computed from the strides and \p idx is used as offset
+/// from the data pointer.
+template <typename Value_t, typename Container_t>
+inline Value_t &RTensor<Value_t, Container_t>::operator()(const Index_t &idx)
+{
+   return fData[Internal::ComputeGlobalIndex(fStrides, idx)];
+}
+
+/// \brief Read-only element access by index vector
+/// \param[in] idx Index vector
+/// \return Const reference to the addressed element
+/// The global index computed from the strides and \p idx is used as offset
+/// from the data pointer.
+template <typename Value_t, typename Container_t>
+inline const Value_t &RTensor<Value_t, Container_t>::operator() (const Index_t &idx) const
+{
+   return fData[Internal::ComputeGlobalIndex(fStrides, idx)];
+}
+
+/// \brief Element access by a list of indices
+/// \param[in] idx Indices, one per dimension
+/// \return Reference to the addressed element
+/// Every index is cast to std::size_t; the resulting index vector is handed to
+/// the index-vector overload.
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+Value_t &RTensor<Value_t, Container_t>::operator()(Idx... idx)
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   const Index_t indices = {static_cast<std::size_t>(idx)...};
+   return (*this)(indices);
+}
+
+/// \brief Read-only element access by a list of indices
+/// \param[in] idx Indices, one per dimension
+/// \return Const reference to the addressed element
+/// Every index is cast to std::size_t; the resulting index vector is handed to
+/// the const index-vector overload.
+template <typename Value_t, typename Container_t>
+template <typename... Idx>
+const Value_t &RTensor<Value_t, Container_t>::operator() (Idx... idx) const
+{
+   static_assert(Internal::and_types<std::is_convertible<Idx, std::size_t>...>{},
+                 "Indices are not convertible to std::size_t.");
+   const Index_t indices = {static_cast<std::size_t>(idx)...};
+   return (*this)(indices);
+}
+
+/// \brief Transpose
+/// \returns New RTensor
+/// \throws std::runtime_error if the memory layout is neither row-major nor
+///         column-major
+/// The transposed tensor is built from the data pointer of this tensor with the
+/// opposite memory layout and with shape and strides in reversed order, so the
+/// underlying data is not touched.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Transpose() const
+{
+   // Only the two known layouts can be swapped
+   const bool isRowMajor = (fLayout == MemoryLayout::RowMajor);
+   const bool isColumnMajor = (fLayout == MemoryLayout::ColumnMajor);
+   if (!isRowMajor && !isColumnMajor) {
+      throw std::runtime_error("Memory layout is not known.");
+   }
+   MemoryLayout swapped = isRowMajor ? MemoryLayout::ColumnMajor : MemoryLayout::RowMajor;
+
+   // New tensor on the same data pointer with the swapped layout
+   RTensor<Value_t, Container_t> transposed(fData, fShape, fStrides, swapped);
+
+   // Shape and strides run in the opposite order after transposition
+   std::reverse(transposed.fShape.begin(), transposed.fShape.end());
+   std::reverse(transposed.fStrides.begin(), transposed.fStrides.end());
+
+   return transposed;
+}
+
+/// \brief Squeeze dimensions
+/// \returns New RTensor
+/// Squeeze removes the dimensions of size one from the shape, together with
+/// their strides. The result is a copy of this object carrying the reduced
+/// shape and strides.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Squeeze() const
+{
+   // Keep only the dimensions (and matching strides) that are not of size one
+   const std::size_t rank = fShape.size();
+   Shape_t keptShape;
+   Shape_t keptStrides;
+   for (std::size_t dim = 0; dim < rank; ++dim) {
+      if (fShape[dim] == 1) {
+         continue;
+      }
+      keptShape.push_back(fShape[dim]);
+      keptStrides.push_back(fStrides[dim]);
+   }
+
+   // A tensor consisting only of size-one dimensions keeps a single one.
+   // An initially empty shape stays empty.
+   if (keptShape.empty() && rank != 0) {
+      keptShape.push_back(1);
+      keptStrides.push_back(1);
+   }
+
+   // Copy this object and attach the reduced shape and strides
+   RTensor<Value_t, Container_t> squeezed(*this);
+   squeezed.fShape = keptShape;
+   squeezed.fStrides = keptStrides;
+   return squeezed;
+}
+
+/// \brief Expand dimensions
+/// \param[in] idx Index in shape vector where dimension is added; a negative
+///            value is counted from the end of the expanded shape
+/// \returns New RTensor
+/// \throws std::runtime_error if \p idx lies outside of the expanded shape
+/// Inserts a dimension of one into the shape. The strides of the result are
+/// recomputed from the expanded shape and the current memory layout.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::ExpandDims(int idx) const
+{
+   const int rank = fShape.size();
+
+   // Map a negative position onto the expanded shape, then validate
+   const int position = (idx < 0) ? rank + 1 + idx : idx;
+   if (position < 0) {
+      throw std::runtime_error("Given negative index is invalid.");
+   }
+   if (position > rank) {
+      throw std::runtime_error("Given index is invalid.");
+   }
+
+   // Shape with the additional dimension of size one
+   auto expanded = fShape;
+   expanded.insert(expanded.begin() + position, 1);
+   auto expandedStrides = Internal::ComputeStridesFromShape(expanded, fLayout);
+
+   // Copy this object and attach the new shape and strides
+   RTensor<Value_t, Container_t> result(*this);
+   result.fShape = expanded;
+   result.fStrides = expandedStrides;
+   return result;
+}
+
+/// \brief Reshape tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Returns a copy of this object that has been reshaped in place with
+/// ReshapeInplace; the overall size is not changed.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Reshape(const Shape_t &shape) const
+{
+   RTensor<Value_t, Container_t> reshaped(*this);
+   reshaped.ReshapeInplace(shape);
+   return reshaped;
+}
+
+/// \brief Resize tensor
+/// \param[in] shape Shape vector
+/// \returns New RTensor
+/// Creates a new tensor with the given shape and the memory layout of this
+/// tensor, then copies as many elements as fit into both tensors, starting at
+/// the data pointers.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Resize(const Shape_t &shape)
+{
+   RTensor<Value_t, Container_t> resized(shape, fLayout);
+
+   // Number of elements available in both the old and the new tensor
+   const std::size_t count = std::min(GetSize(), resized.GetSize());
+
+   Value_t *source = GetData();
+   std::copy(source, source + count, resized.GetData());
+
+   return resized;
+}
+
+/// \brief Create a slice of the tensor
+/// \param[in] slice Slice vector
+/// \returns New RTensor
+/// \throws std::runtime_error if the number of slice entries differs from the
+///         number of dimensions, if an entry holds fewer than two indices, or
+///         if an entry does not satisfy begin <= end <= size of the dimension
+/// A slice is a subset of the tensor defined by a vector of pairs of indices.
+/// Entry i selects the half-open range [slice[i][0], slice[i][1]) of
+/// dimension i. The result is a copy of this object whose data pointer refers
+/// to the first selected element; it is squeezed before being returned.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Slice(const Slice_t &slice)
+{
+   // Sanitize size of slice
+   const auto sliceSize = slice.size();
+   const auto shapeSize = fShape.size();
+   if (sliceSize != shapeSize) {
+      std::stringstream ss;
+      ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ").";
+      throw std::runtime_error(ss.str());
+   }
+
+   // Sanitize slice indices. Without this check an inverted range would
+   // underflow the unsigned shape and an oversized range would address
+   // elements outside of the tensor.
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      if (slice[i].size() < 2) {
+         std::stringstream ss;
+         ss << "Slice for dimension " << i << " needs a begin and an end index.";
+         throw std::runtime_error(ss.str());
+      }
+      const auto first = slice[i][0];
+      const auto last = slice[i][1];
+      if (first > last || last > fShape[i]) {
+         std::stringstream ss;
+         ss << "Slice [" << first << ", " << last << ") is invalid for dimension " << i << " with size "
+            << fShape[i] << ".";
+         throw std::runtime_error(ss.str());
+      }
+   }
+
+   // Convert -1 in slice to proper pair of indices
+   // TODO
+
+   // Recompute shape and size
+   Shape_t shape(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      shape[i] = slice[i][1] - slice[i][0];
+   }
+   auto size = Internal::GetSizeFromShape(shape);
+
+   // Determine first element contributing to the slice and get the data pointer
+   Shape_t idx(sliceSize);
+   for (std::size_t i = 0; i < sliceSize; i++) {
+      idx[i] = slice[i][0];
+   }
+   Value_t *data = &operator()(idx);
+
+   // Create copy and modify properties; the strides of this tensor are kept
+   RTensor<Value_t, Container_t> x(*this);
+   x.fData = data;
+   x.fShape = shape;
+   x.fSize = size;
+
+   // Squeeze tensor and return
+   return x.Squeeze();
+}
+
+/// \brief Copy RTensor to new object
+/// \param[in] layout Memory layout of the new RTensor
+/// \returns New RTensor
+/// A new tensor with the shape of this tensor and the requested layout is
+/// created, and all elements are transferred with Internal::RecursiveCopy.
+/// Note that the default target layout is row-major.
+template <typename Value_t, typename Container_t>
+inline RTensor<Value_t, Container_t> RTensor<Value_t, Container_t>::Copy(MemoryLayout layout) const
+{
+   // Destination tensor with the same shape
+   RTensor<Value_t, Container_t> result(fShape, layout);
+
+   // Index range to copy: from all zeros up to the full shape
+   const Shape_t lower(fShape.size());
+   const Shape_t upper = fShape;
+   Shape_t cursor = lower;
+   Internal::RecursiveCopy(*this, result, lower, upper, cursor, 0);
+
+   return result;
+}
+
+/// \brief Pretty printing
+/// \param[in] os Output stream
+/// \param[in] x RTensor
+/// \return Modified output stream
+/// Tensors of rank one are printed as "{ a, b, c }" and tensors of rank two as
+/// "{ { a, b } { c, d } }". Any other rank prints a placeholder message.
+/// The tensor is taken by const reference so that const tensors and
+/// temporaries can be printed as well.
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const RTensor<T> &x)
+{
+   const auto shapeSize = x.GetShape().size();
+   if (shapeSize == 1) {
+      os << "{ ";
+      const auto size = x.GetSize();
+      for (std::size_t i = 0; i < size; i++) {
+         os << x({i});
+         // Separator between elements, none after the last one
+         if (i + 1 != size)
+            os << ", ";
+      }
+      os << " }";
+   } else if (shapeSize == 2) {
+      os << "{";
+      // Refer to the shape instead of copying it
+      const auto &shape = x.GetShape();
+      for (std::size_t i = 0; i < shape[0]; i++) {
+         os << " { ";
+         for (std::size_t j = 0; j < shape[1]; j++) {
+            os << x({i, j});
+            if (j + 1 < shape[1]) {
+               os << ", ";
+            } else {
+               os << " ";
+            }
+         }
+         os << "}";
+      }
+      os << " }";
+   } else {
+      os << "{ printing not yet implemented for this rank }";
+   }
+   return os;
+}
+   
+} // namespace SOFIE
+
+namespace cling {
+/// \brief Render an RTensor as string
+/// \param[in] x Pointer to the tensor to print
+/// \return Text produced by streaming the tensor with operator<<
+template <typename T>
+std::string printValue(SOFIE::RTensor<T> *x)
+{
+   std::ostringstream out;
+   out << *x;
+   return out.str();
+}
+} // namespace cling
+
+#endif // SOFIE_RTENSOR