From 8cd86509ae4e376a8e161e1eee2e22c178a944b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 09:52:39 +0000 Subject: [PATCH 1/9] ci: do not build kernels test image inside a buildx container This adds a lot of extra serialization/deserialization, but for no gain, since we are not doing multi-platform builds or registry caching. --- .github/workflows/build_kernel.yaml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index 7b5ebd87..fdd34148 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -118,17 +118,11 @@ jobs: name: built-kernels-x86_64-linux path: . - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - name: Build Docker image - uses: docker/build-push-action@v6 - with: - context: . - file: nix-builder/tests/Dockerfile.test-kernel - platforms: linux/amd64 - load: true - push: false - tags: kernel-builder:latest + run: | + docker build \ + -t kernel-builder:latest \ + -f nix-builder/tests/Dockerfile.test-kernel . - name: Run Tests run: | From 27d42033e5d1a31119e6fc75e199ea5c1b304e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 09:53:55 +0000 Subject: [PATCH 2/9] ci: kernel tests workflow must fail when one of the tests fails --- nix-builder/tests/run-tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/nix-builder/tests/run-tests.sh b/nix-builder/tests/run-tests.sh index b28325b7..3cb5a7c2 100644 --- a/nix-builder/tests/run-tests.sh +++ b/nix-builder/tests/run-tests.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -euo pipefail # Expand to build variant directories. EXTRA_DATA_PATH=$(echo extra-data/torch*) From f462a9d489e5c403853e0b00c58d073ecac5e43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 10:17:23 +0000 Subject: [PATCH 3/9] Fixup undefined PYTHONPATH --- nix-builder/tests/run-tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nix-builder/tests/run-tests.sh b/nix-builder/tests/run-tests.sh index 3cb5a7c2..968a5df7 100644 --- a/nix-builder/tests/run-tests.sh +++ b/nix-builder/tests/run-tests.sh @@ -10,13 +10,13 @@ CUTLASS_TVM_FFI_PATH=$(echo cutlass-gemm-tvm-ffi-kernel/tvm-ffi*) SILU_MUL_PATH=$(echo silu-and-mul-kernel/torch*) RELU_CPU_PATH=$(echo relu-kernel-cpu/torch*) -PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH:$PYTHONPATH" \ +PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH" \ .venv/bin/pytest extra_data_tests relu_tests relu_tvm_ffi_tests cutlass_gemm_tests cutlass_gemm_tvm_ffi_tests # We only care about importing, the kernel is trivial. -PYTHONPATH="$SILU_MUL_PATH:$PYTHONPATH" \ +PYTHONPATH="$SILU_MUL_PATH" \ .venv/bin/python -c "import silu_and_mul" -PYTHONPATH="$RELU_CPU_PATH:$PYTHONPATH" \ +PYTHONPATH="$RELU_CPU_PATH" \ CUDA_VISIBLE_DEVICES="" \ .venv/bin/pytest relu_tests From dc3edaef30a54a10770291a4c347e371368dc58c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 10:38:27 +0000 Subject: [PATCH 4/9] ci: avoid storing dnf cache --- nix-builder/tests/Dockerfile.test-kernel | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/nix-builder/tests/Dockerfile.test-kernel b/nix-builder/tests/Dockerfile.test-kernel index aa46ad1a..47fc3831 100644 --- a/nix-builder/tests/Dockerfile.test-kernel +++ b/nix-builder/tests/Dockerfile.test-kernel @@ -14,12 +14,10 @@ ENV DEBIAN_FRONTEND=noninteractive \ NVIDIA_VISIBLE_DEVICES=all \ NVIDIA_DRIVER_CAPABILITIES=compute,utility -# Install system dependencies -RUN dnf install -y \ - curl - -# Install uv package manager -RUN curl -LsSf https://astral.sh/uv/install.sh | sh +# Install uv. +RUN dnf install -y curl && \ + curl -LsSf https://astral.sh/uv/install.sh | sh && \ + dnf clean all # Set working directory WORKDIR /app From 51de3b76979927ba1b8d0e1ad2886065fb5547fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 10:47:45 +0000 Subject: [PATCH 5/9] ci: fix tests --- examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py | 0 examples/kernels/cutlass-gemm/tests/__init__.py | 0 nix-builder/tests/Dockerfile.test-kernel | 5 +++-- 3 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py create mode 100644 examples/kernels/cutlass-gemm/tests/__init__.py diff --git a/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py b/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/kernels/cutlass-gemm/tests/__init__.py b/examples/kernels/cutlass-gemm/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nix-builder/tests/Dockerfile.test-kernel b/nix-builder/tests/Dockerfile.test-kernel index 47fc3831..df98e0d9 100644 --- a/nix-builder/tests/Dockerfile.test-kernel +++ b/nix-builder/tests/Dockerfile.test-kernel @@ -57,8 +57,8 @@ RUN CUDA_MAJOR_MINOR=$(echo ${CUDA_VERSION} | cut -d'.' -f1,2) && \ uv add "torch==${TORCH_VERSION}"; \ fi -# add pytest for runtime tests -RUN uv add numpy pytest +# Add additional dependencies. +RUN uv add "apache-tvm-ffi~=0.1.9" numpy pytest # Copy kernels and tests COPY relu-kernel ./relu-kernel @@ -67,6 +67,7 @@ COPY relu-kernel-cpu ./relu-kernel-cpu COPY cutlass-gemm-kernel ./cutlass-gemm-kernel COPY cutlass-gemm-tvm-ffi-kernel ./cutlass-gemm-tvm-ffi-kernel COPY silu-and-mul-kernel ./silu-and-mul-kernel +COPY extra-data ./extra-data COPY examples/kernels/extra-data/tests ./extra_data_tests COPY examples/kernels/relu/tests ./relu_tests COPY examples/kernels/relu-tvm-ffi/tests ./relu_tvm_ffi_tests From 2f89d91c9b7a4f87618691cfda88cceb2eaf0ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 11:31:56 +0000 Subject: [PATCH 6/9] Fix compat module --- kernel-builder/src/pyproject/templates/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel-builder/src/pyproject/templates/compat.py b/kernel-builder/src/pyproject/templates/compat.py index 03dbc1af..a9b2672c 100644 --- a/kernel-builder/src/pyproject/templates/compat.py +++ b/kernel-builder/src/pyproject/templates/compat.py @@ -1,10 +1,10 @@ import ctypes +import importlib.util import sys - -import importlib from pathlib import Path from types import ModuleType + def _import_from_path(file_path: Path) -> ModuleType: # We cannot use the module name as-is, after adding it to `sys.modules`, # it would also be used for other imports. So, we make a module name that From 6072d503d272c7db1c36ea15790e32b677c00661 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 12:12:54 +0000 Subject: [PATCH 7/9] ci: parallelize build of example/test kernels --- .github/workflows/build_kernel.yaml | 53 +--------- examples/kernels/flake.nix | 152 ++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 49 deletions(-) create mode 100644 examples/kernels/flake.nix diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index fdd34148..a08cc3a1 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -39,55 +39,10 @@ jobs: - name: Nix info run: nix-shell -p nix-info --run "nix-info -m" - - name: Build relu kernel - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy relu kernel - run: cp -rL examples/kernels/relu/result relu-kernel - - - name: Build relu-tvm-ffi kernel - run: ( cd examples/kernels/relu-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} ) - - name: Copy relu-tvm-ffi kernel - run: cp -rL examples/kernels/relu-tvm-ffi/result relu-tvm-ffi-kernel - - - name: Build extra-data kernel - run: ( cd examples/kernels/extra-data && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy extra-data kernel - run: cp -rL examples/kernels/extra-data/result extra-data - - - name: Build relu kernel (CPU) - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cpu-${{ matrix.arch }} ) - - name: Copy relu kernel (CPU) - run: cp -rL examples/kernels/relu/result relu-kernel-cpu - - - name: Build cutlass GEMM kernel - run: ( cd examples/kernels/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy cutlass GEMM kernel - run: cp -rL examples/kernels/cutlass-gemm/result cutlass-gemm-kernel - - - name: Build cutlass-gemm-tvm-ffi kernel - run: ( cd examples/kernels/cutlass-gemm-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} ) - - name: Copy cutlass-gemm-tvm-ffi kernel - run: cp -rL examples/kernels/cutlass-gemm-tvm-ffi/result cutlass-gemm-tvm-ffi-kernel - - - name: Build relu-backprop-compile kernel - run: ( cd examples/kernels/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy relu-backprop-compile kernel - run: cp -rL examples/kernels/relu-backprop-compile/result relu-backprop-compile-kernel - - # Just test that we build with the extra torchVersions argument. - - name: Build relu kernel (specific Torch version) - run: ( cd examples/kernels/relu-specific-torch && nix build . ) - - - name: Build relu kernel (compiler flags) - run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - - name: Test that we can build a test shell (e.g. that gcc corresponds to CUDA-required) - run: ( cd examples/kernels/relu && nix build .#devShells.${{ matrix.arch }}.test ) - - - name: Build silu-and-mul kernel - run: ( cd examples/kernels/silu-and-mul && nix build .\#redistributable.torch-cuda ) - - name: Copy silu-and-mul kernel - run: cp -rL examples/kernels/silu-and-mul/result silu-and-mul-kernel + - name: Build all example kernels + run: nix build -L ./examples/kernels#ci-build + - name: Copy kernel artifacts + run: cp -rL result/* . - name: Upload kernel artifacts uses: actions/upload-artifact@v6 diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix new file mode 100644 index 00000000..62ffc139 --- /dev/null +++ b/examples/kernels/flake.nix @@ -0,0 +1,152 @@ +{ + description = "All example kernels"; + + inputs = { + kernel-builder.url = "path:../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + let + inherit (kernel-builder.inputs) flake-utils nixpkgs; + inherit (kernel-builder.inputs.nixpkgs) lib; + + cudaVersion = "cu126"; + torchVersion = "29"; + tvmFfiVersion = "01"; + + # All example kernels to build in CI. + # + # - name: name in the output path + # - path: kernel flake path + # - drv (system -> flakeOutputs -> derivation): the derivation for the given + # system and flake outputs. + # - torchVersions: optional override for the torchVersions argument + ciKernels = [ + { + name = "relu-kernel"; + path = ./relu; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "relu-tvm-ffi-kernel"; + path = ./relu-tvm-ffi; + drv = sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + } + { + name = "extra-data"; + path = ./extra-data; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "relu-kernel-cpu"; + path = ./relu; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"}; + } + { + name = "cutlass-gemm-kernel"; + path = ./cutlass-gemm; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "cutlass-gemm-tvm-ffi-kernel"; + path = ./cutlass-gemm-tvm-ffi; + drv = sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + } + { + name = "relu-backprop-compile-kernel"; + path = ./relu-backprop-compile; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "silu-and-mul-kernel"; + path = ./silu-and-mul; + drv = sys: out: out.packages.${sys}.redistributable.torch-cuda; + } + { + # Tests that we can build with the extra torchVersions argument. + name = "relu-specific-torch"; + path = ./relu-specific-torch; + drv = sys: out: out.packages.${sys}.default; + torchVersions = _defaultVersions: [ + { + torchVersion = "2.9"; + cudaVersion = "12.8"; + systems = [ + "x86_64-linux" + "aarch64-linux" + ]; + bundleBuild = true; + } + ]; + } + { + name = "relu-compiler-flags"; + path = ./relu-compiler-flags; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + # Check that we can build a test shell (e.g. gcc is compatible with + # CUDA requirements). + name = "relu-test-shell"; + path = ./relu; + drv = sys: out: out.devShells.${sys}.test; + } + ]; + + mkKernelOutputs = + { + path, + torchVersions ? null, + }: + kernel-builder.lib.genKernelFlakeOutputs ( + { + inherit self path; + } + // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; } + ); + + ciKernelOutputs = map ( + kernel: + kernel + // { + outputs = mkKernelOutputs { + inherit (kernel) path; + torchVersions = kernel.torchVersions or null; + }; + } + ) ciKernels; + in + flake-utils.lib.eachSystem + [ + "x86_64-linux" + "aarch64-linux" + ] + ( + system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + resolvedKernels = map (kernel: { + inherit (kernel) name; + drv = kernel.drv system kernel.outputs; + }) ciKernelOutputs; + + ci-build = pkgs.linkFarm "ci-kernels" ( + map (kernel: { + inherit (kernel) name; + path = kernel.drv; + }) resolvedKernels + ); + in + { + packages = { + inherit ci-build; + default = ci-build; + }; + } + ); +} From ca838264f6be1da72805340f6b9515638f1bc48b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 12:13:39 +0000 Subject: [PATCH 8/9] ci: set Nix max-jobs to 8 Should not be an issue for these small kernels. --- .github/workflows/build_kernel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index a08cc3a1..d97fc1e9 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -27,7 +27,7 @@ jobs: - uses: DeterminateSystems/nix-installer-action@main with: extra-conf: | - max-jobs = 4 + max-jobs = 8 cores = 12 sandbox-fallback = false - uses: cachix/cachix-action@v16 From 31f0601fa1b4cc3e64a22f94a040eea8c0b50d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 25 Mar 2026 12:14:17 +0000 Subject: [PATCH 9/9] nix fmt --- examples/kernels/flake.nix | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix index 62ffc139..919fe6ad 100644 --- a/examples/kernels/flake.nix +++ b/examples/kernels/flake.nix @@ -29,17 +29,20 @@ { name = "relu-kernel"; path = ./relu; - drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; } { name = "relu-tvm-ffi-kernel"; path = ./relu-tvm-ffi; - drv = sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; } { name = "extra-data"; path = ./extra-data; - drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; } { name = "relu-kernel-cpu"; @@ -49,17 +52,20 @@ { name = "cutlass-gemm-kernel"; path = ./cutlass-gemm; - drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; } { name = "cutlass-gemm-tvm-ffi-kernel"; path = ./cutlass-gemm-tvm-ffi; - drv = sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; } { name = "relu-backprop-compile-kernel"; path = ./relu-backprop-compile; - drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; } { name = "silu-and-mul-kernel"; @@ -86,7 +92,8 @@ { name = "relu-compiler-flags"; path = ./relu-compiler-flags; - drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; } { # Check that we can build a test shell (e.g. gcc is compatible with