diff --git a/.github/workflows/build_kernel.yaml b/.github/workflows/build_kernel.yaml index 7b5ebd87..d97fc1e9 100644 --- a/.github/workflows/build_kernel.yaml +++ b/.github/workflows/build_kernel.yaml @@ -27,7 +27,7 @@ jobs: - uses: DeterminateSystems/nix-installer-action@main with: extra-conf: | - max-jobs = 4 + max-jobs = 8 cores = 12 sandbox-fallback = false - uses: cachix/cachix-action@v16 @@ -39,55 +39,10 @@ jobs: - name: Nix info run: nix-shell -p nix-info --run "nix-info -m" - - name: Build relu kernel - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy relu kernel - run: cp -rL examples/kernels/relu/result relu-kernel - - - name: Build relu-tvm-ffi kernel - run: ( cd examples/kernels/relu-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} ) - - name: Copy relu-tvm-ffi kernel - run: cp -rL examples/kernels/relu-tvm-ffi/result relu-tvm-ffi-kernel - - - name: Build extra-data kernel - run: ( cd examples/kernels/extra-data && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy extra-data kernel - run: cp -rL examples/kernels/extra-data/result extra-data - - - name: Build relu kernel (CPU) - run: ( cd examples/kernels/relu && nix build .\#redistributable.torch29-cxx11-cpu-${{ matrix.arch }} ) - - name: Copy relu kernel (CPU) - run: cp -rL examples/kernels/relu/result relu-kernel-cpu - - - name: Build cutlass GEMM kernel - run: ( cd examples/kernels/cutlass-gemm && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy cutlass GEMM kernel - run: cp -rL examples/kernels/cutlass-gemm/result cutlass-gemm-kernel - - - name: Build cutlass-gemm-tvm-ffi kernel - run: ( cd examples/kernels/cutlass-gemm-tvm-ffi && nix build .\#redistributable.tvm-ffi01-cu126-${{ matrix.arch }} ) - - name: Copy cutlass-gemm-tvm-ffi kernel - run: cp -rL examples/kernels/cutlass-gemm-tvm-ffi/result cutlass-gemm-tvm-ffi-kernel - - - name: Build relu-backprop-compile kernel - run: ( cd examples/kernels/relu-backprop-compile && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - name: Copy relu-backprop-compile kernel - run: cp -rL examples/kernels/relu-backprop-compile/result relu-backprop-compile-kernel - - # Just test that we build with the extra torchVersions argument. - - name: Build relu kernel (specific Torch version) - run: ( cd examples/kernels/relu-specific-torch && nix build . ) - - - name: Build relu kernel (compiler flags) - run: ( cd examples/kernels/relu-compiler-flags && nix build .\#redistributable.torch29-cxx11-cu126-${{ matrix.arch }} ) - - - name: Test that we can build a test shell (e.g. that gcc corresponds to CUDA-required) - run: ( cd examples/kernels/relu && nix build .#devShells.${{ matrix.arch }}.test ) - - - name: Build silu-and-mul kernel - run: ( cd examples/kernels/silu-and-mul && nix build .\#redistributable.torch-cuda ) - - name: Copy silu-and-mul kernel - run: cp -rL examples/kernels/silu-and-mul/result silu-and-mul-kernel + - name: Build all example kernels + run: nix build -L ./examples/kernels#ci-build + - name: Copy kernel artifacts + run: cp -rL result/* . - name: Upload kernel artifacts uses: actions/upload-artifact@v6 @@ -118,17 +73,11 @@ jobs: name: built-kernels-x86_64-linux path: . - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - name: Build Docker image - uses: docker/build-push-action@v6 - with: - context: . - file: nix-builder/tests/Dockerfile.test-kernel - platforms: linux/amd64 - load: true - push: false - tags: kernel-builder:latest + run: | + docker build \ + -t kernel-builder:latest \ + -f nix-builder/tests/Dockerfile.test-kernel . - name: Run Tests run: | diff --git a/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py b/examples/kernels/cutlass-gemm-tvm-ffi/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/kernels/cutlass-gemm/tests/__init__.py b/examples/kernels/cutlass-gemm/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/kernels/flake.nix b/examples/kernels/flake.nix new file mode 100644 index 00000000..919fe6ad --- /dev/null +++ b/examples/kernels/flake.nix @@ -0,0 +1,159 @@ +{ + description = "All example kernels"; + + inputs = { + kernel-builder.url = "path:../.."; + }; + + outputs = + { + self, + kernel-builder, + }: + let + inherit (kernel-builder.inputs) flake-utils nixpkgs; + inherit (kernel-builder.inputs.nixpkgs) lib; + + cudaVersion = "cu126"; + torchVersion = "29"; + tvmFfiVersion = "01"; + + # All example kernels to build in CI. + # + # - name: name in the output path + # - path: kernel flake path + # - drv (system -> flakeOutputs -> derivation): the derivation for the given + # system and flake outputs. + # - torchVersions: optional override for the torchVersions argument + ciKernels = [ + { + name = "relu-kernel"; + path = ./relu; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "relu-tvm-ffi-kernel"; + path = ./relu-tvm-ffi; + drv = + sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + } + { + name = "extra-data"; + path = ./extra-data; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "relu-kernel-cpu"; + path = ./relu; + drv = sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-cpu-${sys}"}; + } + { + name = "cutlass-gemm-kernel"; + path = ./cutlass-gemm; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "cutlass-gemm-tvm-ffi-kernel"; + path = ./cutlass-gemm-tvm-ffi; + drv = + sys: out: out.packages.${sys}.redistributable.${"tvm-ffi${tvmFfiVersion}-${cudaVersion}-${sys}"}; + } + { + name = "relu-backprop-compile-kernel"; + path = ./relu-backprop-compile; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + name = "silu-and-mul-kernel"; + path = ./silu-and-mul; + drv = sys: out: out.packages.${sys}.redistributable.torch-cuda; + } + { + # Tests that we can build with the extra torchVersions argument. + name = "relu-specific-torch"; + path = ./relu-specific-torch; + drv = sys: out: out.packages.${sys}.default; + torchVersions = _defaultVersions: [ + { + torchVersion = "2.9"; + cudaVersion = "12.8"; + systems = [ + "x86_64-linux" + "aarch64-linux" + ]; + bundleBuild = true; + } + ]; + } + { + name = "relu-compiler-flags"; + path = ./relu-compiler-flags; + drv = + sys: out: out.packages.${sys}.redistributable.${"torch${torchVersion}-cxx11-${cudaVersion}-${sys}"}; + } + { + # Check that we can build a test shell (e.g. gcc is compatible with + # CUDA requirements). + name = "relu-test-shell"; + path = ./relu; + drv = sys: out: out.devShells.${sys}.test; + } + ]; + + mkKernelOutputs = + { + path, + torchVersions ? null, + }: + kernel-builder.lib.genKernelFlakeOutputs ( + { + inherit self path; + } + // lib.optionalAttrs (torchVersions != null) { inherit torchVersions; } + ); + + ciKernelOutputs = map ( + kernel: + kernel + // { + outputs = mkKernelOutputs { + inherit (kernel) path; + torchVersions = kernel.torchVersions or null; + }; + } + ) ciKernels; + in + flake-utils.lib.eachSystem + [ + "x86_64-linux" + "aarch64-linux" + ] + ( + system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + resolvedKernels = map (kernel: { + inherit (kernel) name; + drv = kernel.drv system kernel.outputs; + }) ciKernelOutputs; + + ci-build = pkgs.linkFarm "ci-kernels" ( + map (kernel: { + inherit (kernel) name; + path = kernel.drv; + }) resolvedKernels + ); + in + { + packages = { + inherit ci-build; + default = ci-build; + }; + } + ); +} diff --git a/kernel-builder/src/pyproject/templates/compat.py b/kernel-builder/src/pyproject/templates/compat.py index 03dbc1af..a9b2672c 100644 --- a/kernel-builder/src/pyproject/templates/compat.py +++ b/kernel-builder/src/pyproject/templates/compat.py @@ -1,10 +1,10 @@ import ctypes +import importlib.util import sys - -import importlib from pathlib import Path from types import ModuleType + def _import_from_path(file_path: Path) -> ModuleType: # We cannot use the module name as-is, after adding it to `sys.modules`, # it would also be used for other imports. So, we make a module name that diff --git a/nix-builder/tests/Dockerfile.test-kernel b/nix-builder/tests/Dockerfile.test-kernel index aa46ad1a..df98e0d9 100644 --- a/nix-builder/tests/Dockerfile.test-kernel +++ b/nix-builder/tests/Dockerfile.test-kernel @@ -14,12 +14,10 @@ ENV DEBIAN_FRONTEND=noninteractive \ NVIDIA_VISIBLE_DEVICES=all \ NVIDIA_DRIVER_CAPABILITIES=compute,utility -# Install system dependencies -RUN dnf install -y \ - curl - -# Install uv package manager -RUN curl -LsSf https://astral.sh/uv/install.sh | sh +# Install uv. +RUN dnf install -y curl && \ + curl -LsSf https://astral.sh/uv/install.sh | sh && \ + dnf clean all # Set working directory WORKDIR /app @@ -59,8 +57,8 @@ RUN CUDA_MAJOR_MINOR=$(echo ${CUDA_VERSION} | cut -d'.' -f1,2) && \ uv add "torch==${TORCH_VERSION}"; \ fi -# add pytest for runtime tests -RUN uv add numpy pytest +# Add additional dependencies. +RUN uv add "apache-tvm-ffi~=0.1.9" numpy pytest # Copy kernels and tests COPY relu-kernel ./relu-kernel @@ -69,6 +67,7 @@ COPY relu-kernel-cpu ./relu-kernel-cpu COPY cutlass-gemm-kernel ./cutlass-gemm-kernel COPY cutlass-gemm-tvm-ffi-kernel ./cutlass-gemm-tvm-ffi-kernel COPY silu-and-mul-kernel ./silu-and-mul-kernel +COPY extra-data ./extra-data COPY examples/kernels/extra-data/tests ./extra_data_tests COPY examples/kernels/relu/tests ./relu_tests COPY examples/kernels/relu-tvm-ffi/tests ./relu_tvm_ffi_tests diff --git a/nix-builder/tests/run-tests.sh b/nix-builder/tests/run-tests.sh index b28325b7..968a5df7 100644 --- a/nix-builder/tests/run-tests.sh +++ b/nix-builder/tests/run-tests.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -euo pipefail # Expand to build variant directories. EXTRA_DATA_PATH=$(echo extra-data/torch*) @@ -9,13 +10,13 @@ CUTLASS_TVM_FFI_PATH=$(echo cutlass-gemm-tvm-ffi-kernel/tvm-ffi*) SILU_MUL_PATH=$(echo silu-and-mul-kernel/torch*) RELU_CPU_PATH=$(echo relu-kernel-cpu/torch*) -PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH:$PYTHONPATH" \ +PYTHONPATH="$EXTRA_DATA_PATH:$RELU_PATH:$RELU_TVM_FFI_PATH:$CUTLASS_PATH:$CUTLASS_TVM_FFI_PATH" \ .venv/bin/pytest extra_data_tests relu_tests relu_tvm_ffi_tests cutlass_gemm_tests cutlass_gemm_tvm_ffi_tests # We only care about importing, the kernel is trivial. -PYTHONPATH="$SILU_MUL_PATH:$PYTHONPATH" \ +PYTHONPATH="$SILU_MUL_PATH" \ .venv/bin/python -c "import silu_and_mul" -PYTHONPATH="$RELU_CPU_PATH:$PYTHONPATH" \ +PYTHONPATH="$RELU_CPU_PATH" \ CUDA_VISIBLE_DEVICES="" \ .venv/bin/pytest relu_tests