diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml
index 7b5ebd87..d97fc1e9 100644
--- a/.github/workflows/build_kernel.yaml
+++ b/.github/workflows/build_kernel.yaml
@@ -27,7 +27,7 @@ jobs:
       - uses: DeterminateSystems/nix-installer-action@main
         with:
           extra-conf: |
-            max-jobs = 4
+            max-jobs = 8
             cores = 12
             sandbox-fallback = false
       - uses: cachix/cachix-action@v16
@@ -39,55 +39,10 @@ jobs:
       - name: Nix info
         run: nix-shell -p nix-info --run "nix-info -m"
 
-      - name: Build relu kernel
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} )
-      - name: Copy relu kernel
-        run: cp -rL examples/kernels/relu/result relu-kernel
-
-      - name: Build relu-tvm-ffi kernel
-        run: ( cd examples/kernels/relu-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} )
-      - name: Copy relu-tvm-ffi kernel
-        run: cp -rL examples/kernels/relu-tvm-ffi/result relu-tvm-ffi-kernel
-
-      - name: Build extra-data kernel
-        run: ( cd examples/kernels/extra-data && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} )
-      - name: Copy extra-data kernel
-        run: cp -rL examples/kernels/extra-data/result extra-data
-
-      - name: Build relu kernel (CPU)
-        run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cpu-${{ matrix.arch }} )
-      - name: Copy relu kernel (CPU)
-        run: cp -rL examples/kernels/relu/result relu-kernel-cpu
-
-      - name: Build cutlass GEMM kernel
-        run: ( cd examples/kernels/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} )
-      - name: Copy cutlass GEMM kernel
-        run: cp -rL examples/kernels/cutlass-gemm/result cutlass-gemm-kernel
-
-      - name: Build cutlass-gemm-tvm-ffi kernel
-        run: ( cd examples/kernels/cutlass-gemm-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} )
-      - name: Copy cutlass-gemm-tvm-ffi kernel
-        run: cp -rL examples/kernels/cutlass-gemm-tvm-ffi/result cutlass-gemm-tvm-ffi-kernel
-
-      - name: Build relu-backprop-compile kernel
-        run: ( cd examples/kernels/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} )
-      - name: Copy relu-backprop-compile kernel
-        run: cp -rL examples/kernels/relu-backprop-compile/result relu-backprop-compile-kernel
-
-      # Just test that we build with the extra torchVersions argument.
-      - name: Build relu kernel (specific Torch version)
-        run: ( cd examples/kernels/relu-specific-torch && nix build . )
-
-      - name: Build relu kernel (compiler flags)
-        run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} )
-
-      - name: Test that we can build a test shell (e.g. that gcc corresponds to CUDA-required)
-        run: ( cd examples/kernels/relu && nix build .#devShells.${{ matrix.arch }}.test )
-
-      - name: Build silu-and-mul kernel
-        run: ( cd examples/kernels/silu-and-mul && nix build .\#redistributable.torch-cuda )
-      - name: Copy silu-and-mul kernel
-        run: cp -rL examples/kernels/silu-and-mul/result silu-and-mul-kernel
+      - name: Build all example kernels
+        run: nix build -L ./examples/kernels#ci-build
+      - name: Copy kernel artifacts
+        run: cp -rL result/* .
 
       - name: Upload kernel artifacts
         uses: actions/upload-artifact@v6
@@ -118,17 +73,11 @@ jobs:
           name: built-kernels-x86_64-linux
           path: .
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
       - name: Build Docker image
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: nix-builder/tests/Dockerfile.test-kernel
-          platforms: linux/amd64
-          load: true
-          push: false
-          tags: kernel-builder:latest
+        run: |
+          docker build \
+            -t kernel-builder:latest \
+            -f nix-builder/tests/Dockerfile.test-kernel .
 
       - name: Run Tests
         run: |
diff --git a/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py b/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/kernels/cutlass-gemm/tests/__init__.py b/examples/kernels/cutlass-gemm/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix
new file mode 100644
index 00000000..919fe6ad
--- /dev/null
+++ b/examples/kernels/flake.nix
@@ -0,0 +1,159 @@
+{
+  description = "All example kernels";
+
+  inputs = {
+    kernel-builder.url = "path:../..";
+  };
+
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    let
+      inherit (kernel-builder.inputs) flake-utils nixpkgs;
+      inherit (kernel-builder.inputs.nixpkgs) lib;
+
+      cudaVersion = "cu126";
+      torchVersion = "29";
+      tvmFfiVersion = "01";
+
+      # Systems for which the `ci-build` package is exposed.
+      ciSystems = [
+        "x86_64-linux"
+        "aarch64-linux"
+      ];
+
+      # Derivation selectors (system -> flakeOutputs -> derivation) shared by the
+      # kernel entries below, so that each variant name is spelled out only once.
+
+      # Torch build variant for `cudaVersion`.
+      torchCuda =
+        sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"};
+
+      # Torch build variant for CPU.
+      torchCpu = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"};
+
+      # TVM FFI build variant for `cudaVersion`.
+      tvmFfiCuda =
+        sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"};
+
+      # All example kernels to build in CI.
+      #
+      # - name: name in the output path
+      # - path: kernel flake path
+      # - drv (system -> flakeOutputs -> derivation): the derivation for the given
+      #        system and flake outputs.
+      # - torchVersions: optional override for the torchVersions argument
+      ciKernels = [
+        {
+          name = "relu-kernel";
+          path = ./relu;
+          drv = torchCuda;
+        }
+        {
+          name = "relu-tvm-ffi-kernel";
+          path = ./relu-tvm-ffi;
+          drv = tvmFfiCuda;
+        }
+        {
+          name = "extra-data";
+          path = ./extra-data;
+          drv = torchCuda;
+        }
+        {
+          name = "relu-kernel-cpu";
+          path = ./relu;
+          drv = torchCpu;
+        }
+        {
+          name = "cutlass-gemm-kernel";
+          path = ./cutlass-gemm;
+          drv = torchCuda;
+        }
+        {
+          name = "cutlass-gemm-tvm-ffi-kernel";
+          path = ./cutlass-gemm-tvm-ffi;
+          drv = tvmFfiCuda;
+        }
+        {
+          name = "relu-backprop-compile-kernel";
+          path = ./relu-backprop-compile;
+          drv = torchCuda;
+        }
+        {
+          name = "silu-and-mul-kernel";
+          path = ./silu-and-mul;
+          drv = sys: out: out.packages.${sys}.redistributable.torch-cuda;
+        }
+        {
+          # Tests that we can build with the extra torchVersions argument.
+          name = "relu-specific-torch";
+          path = ./relu-specific-torch;
+          drv = sys: out: out.packages.${sys}.default;
+          torchVersions = _defaultVersions: [
+            {
+              torchVersion = "2.9";
+              cudaVersion = "12.8";
+              systems = [
+                "x86_64-linux"
+                "aarch64-linux"
+              ];
+              bundleBuild = true;
+            }
+          ];
+        }
+        {
+          name = "relu-compiler-flags";
+          path = ./relu-compiler-flags;
+          drv = torchCuda;
+        }
+        {
+          # Check that we can build a test shell (e.g. gcc is compatible with
+          # CUDA requirements).
+          name = "relu-test-shell";
+          path = ./relu;
+          drv = sys: out: out.devShells.${sys}.test;
+        }
+      ];
+
+      # `ciKernels`, each extended with an `outputs` attribute holding the flake
+      # outputs generated for it; `torchVersions` is only forwarded when set.
+      ciKernelOutputs = map (
+        kernel:
+        kernel
+        // {
+          outputs = kernel-builder.lib.genKernelFlakeOutputs (
+            {
+              inherit self;
+              inherit (kernel) path;
+            }
+            // lib.optionalAttrs ((kernel.torchVersions or null) != null) {
+              inherit (kernel) torchVersions;
+            }
+          );
+        }
+      ) ciKernels;
+    in
+    flake-utils.lib.eachSystem ciSystems (
+      system:
+      let
+        pkgs = nixpkgs.legacyPackages.${system};
+
+        # Link farm with one entry per kernel, named after the kernel's `name`
+        # and pointing to the derivation that its `drv` selects for `system`.
+        ci-build = pkgs.linkFarm "ci-kernels" (
+          map (kernel: {
+            inherit (kernel) name;
+            path = kernel.drv system kernel.outputs;
+          }) ciKernelOutputs
+        );
+      in
+      {
+        packages = {
+          inherit ci-build;
+          default = ci-build;
+        };
+      }
+    );
+}
diff --git a/kernel-builder/src/pyproject/templates/compat.py b/kernel-builder/src/pyproject/templates/compat.py
index 03dbc1af..a9b2672c 100644
--- a/kernel-builder/src/pyproject/templates/compat.py
+++ b/kernel-builder/src/pyproject/templates/compat.py
@@ -1,10 +1,10 @@
 import ctypes
+import importlib.util
 import sys
-
-import importlib
 from pathlib import Path
 from types import ModuleType
 
+
 def _import_from_path(file_path: Path) -> ModuleType:
     # We cannot use the module name as-is, after adding it to `sys.modules`,
     # it would also be used for other imports. So, we make a module name that
diff --git a/nix-builder/tests/Dockerfile.test-kernel b/nix-builder/tests/Dockerfile.test-kernel
index aa46ad1a..df98e0d9 100644
--- a/nix-builder/tests/Dockerfile.test-kernel
+++ b/nix-builder/tests/Dockerfile.test-kernel
@@ -14,12 +14,10 @@ ENV DEBIAN_FRONTEND=noninteractive \
     NVIDIA_VISIBLE_DEVICES=all \
     NVIDIA_DRIVER_CAPABILITIES=compute,utility
 
-# Install system dependencies
-RUN dnf install -y \
-    curl
-
-# Install uv package manager
-RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+# Install uv (download, then run, so that a failed download fails the build).
+RUN dnf install -y curl && \
+    curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh && \
+    sh /tmp/uv-install.sh && rm -f /tmp/uv-install.sh && dnf clean all
 
 # Set working directory
 WORKDIR /app
@@ -59,8 +57,8 @@ RUN CUDA_MAJOR_MINOR=$(echo ${CUDA_VERSION} | cut -d'.' -f1,2) && \
     uv add "torch==${TORCH_VERSION}"; \
     fi
 
-# add pytest for runtime tests
-RUN uv add numpy pytest
+# Add additional dependencies.
+RUN uv add "apache-tvm-ffi~=0.1.9" numpy pytest
 
 # Copy kernels and tests
 COPY relu-kernel ./relu-kernel
@@ -69,6 +67,7 @@ COPY relu-kernel-cpu ./relu-kernel-cpu
 COPY cutlass-gemm-kernel ./cutlass-gemm-kernel
 COPY cutlass-gemm-tvm-ffi-kernel ./cutlass-gemm-tvm-ffi-kernel
 COPY silu-and-mul-kernel ./silu-and-mul-kernel
+COPY extra-data ./extra-data
 COPY examples/kernels/extra-data/tests ./extra_data_tests
 COPY examples/kernels/relu/tests ./relu_tests
 COPY examples/kernels/relu-tvm-ffi/tests ./relu_tvm_ffi_tests
diff --git a/nix-builder/tests/run-tests.sh b/nix-builder/tests/run-tests.sh
index b28325b7..968a5df7 100644
--- a/nix-builder/tests/run-tests.sh
+++ b/nix-builder/tests/run-tests.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+set -euo pipefail
 
 # Expand to build variant directories.
 EXTRA_DATA_PATH=$(echo extra-data/torch*)
@@ -9,13 +10,13 @@ CUTLASS_TVM_FFI_PATH=$(echo cutlass-gemm-tvm-ffi-kernel/tvm-ffi*)
 SILU_MUL_PATH=$(echo silu-and-mul-kernel/torch*)
 RELU_CPU_PATH=$(echo relu-kernel-cpu/torch*)
 
-PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH:$PYTHONPATH" \
+PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH" \
   .venv/bin/pytest extra_data_tests relu_tests relu_tvm_ffi_tests cutlass_gemm_tests cutlass_gemm_tvm_ffi_tests
 
 # We only care about importing, the kernel is trivial.
-PYTHONPATH="$SILU_MUL_PATH:$PYTHONPATH" \
+PYTHONPATH="$SILU_MUL_PATH" \
   .venv/bin/python -c "import silu_and_mul"
 
-PYTHONPATH="$RELU_CPU_PATH:$PYTHONPATH" \
+PYTHONPATH="$RELU_CPU_PATH" \
    CUDA_VISIBLE_DEVICES="" \
   .venv/bin/pytest relu_tests