From 9ab214909f6261314086b9d5594db484078c46b2 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Sat, 20 Dec 2025 00:29:59 +0530 Subject: [PATCH] [MLIR][NVPTX] Add intrinsics and Ops to read smem-sizes This patch adds three intrinsics and their corresponding Ops representing the PTX special-register read instructions that report the shared-memory sizes. Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 30 +++++++++++++++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 11 +++++-- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 8 +++++ .../CodeGen/NVPTX/intrinsics-sm90-ptx81.ll | 18 +++++++++++ llvm/test/CodeGen/NVPTX/intrinsics.ll | 28 +++++++++++++++++ mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 6 ++++ mlir/test/Target/LLVMIR/nvvmir.mlir | 6 ++++ 7 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 59a5c9c91e620..f6038372b6b70 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -264,6 +264,36 @@ map in the following way to CUDA builtins: ``gridDim`` ``@llvm.nvvm.read.ptx.sreg.nctaid.*`` ============ ===================================== +'``llvm.nvvm.read.ptx.sreg.*_smem_size``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare i32 @llvm.nvvm.read.ptx.sreg.total_smem_size() + declare i32 @llvm.nvvm.read.ptx.sreg.aggr_smem_size() + declare i32 @llvm.nvvm.read.ptx.sreg.dynamic_smem_size() + +Overview: +""""""""" + +The '``@llvm.nvvm.read.ptx.sreg.total_smem_size``' intrinsic reads the +PTX special register that holds the total amount of shared memory +allocated per CTA for the kernel at launch. + +The reported value includes both statically allocated and dynamically +requested shared memory, but excludes any shared memory reserved for +system use. The size is reported as a multiple of the architecture-specific +shared memory allocation granularity.
For targets sm_8x and newer, +this granularity is 128 bytes. + +The '``aggr_smem_size``' variant returns the aggregate shared memory size, +including the portion reserved for system software use. + +The '``dynamic_smem_size``' variant returns the amount of dynamic shared +memory allocated per CTA for the kernel at launch time. Barriers -------- diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index bddbf4ea3c185..911de5e14e9db 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2303,8 +2303,8 @@ foreach vec = [TV_I8, TV_I16, TV_I32, // // Accessing special registers. // -class PTXReadSRegIntrinsicNB_r32 properties = []> - : PureIntrinsic<[llvm_i32_ty], [], [NoUndef] # properties>; +class PTXReadSRegIntrinsicNB_r32 properties = [], string name = ""> + : PureIntrinsic<[llvm_i32_ty], [], [NoUndef] # properties, name>; class PTXReadSRegIntrinsic_r32 properties = []> : PTXReadSRegIntrinsicNB_r32, NVVMBuiltin; @@ -2406,6 +2406,13 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32; +def int_nvvm_read_ptx_sreg_aggr_smem_size : + PTXReadSRegIntrinsicNB_r32<[], "llvm.nvvm.read.ptx.sreg.aggr_smem_size">; +def int_nvvm_read_ptx_sreg_dynamic_smem_size : + PTXReadSRegIntrinsicNB_r32<[], "llvm.nvvm.read.ptx.sreg.dynamic_smem_size">; + // // SHUFFLE // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b145e1d53f46c..0bf4ab0bff8eb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -4761,6 +4761,14 @@ def INT_PTX_SREG_CLUSTER_NCTARANK: int_nvvm_read_ptx_sreg_cluster_nctarank, [hasSM<90>, hasPTX<78>]>; +def INT_PTX_SREG_TOTAL_SMEM_SIZE : + PTX_READ_SREG_R32<"total_smem_size", int_nvvm_read_ptx_sreg_total_smem_size>; +def INT_PTX_SREG_DYNAMIC_SMEM_SIZE : + PTX_READ_SREG_R32<"dynamic_smem_size", int_nvvm_read_ptx_sreg_dynamic_smem_size>; +def 
INT_PTX_SREG_AGGR_SMEM_SIZE : + PTX_READ_SREG_R32<"aggr_smem_size", + int_nvvm_read_ptx_sreg_aggr_smem_size, + [hasSM<90>, hasPTX<81>]>; def SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>; def SREG_WARPID : PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>; diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll new file mode 100644 index 0000000000000..25efcc9931aff --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| %ptxas-verify -arch=sm_90 %} + +define i32 @test_aggr_smem_size() { +; CHECK-LABEL: test_aggr_smem_size( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %aggr_smem_size; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %a = tail call i32 @llvm.nvvm.read.ptx.sreg.aggr_smem_size() + ret i32 %a +} + +declare i32 @llvm.nvvm.read.ptx.sreg.aggr_smem_size() diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 00eb8e293e0fd..7a63d81274559 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -318,6 +318,32 @@ define i64 @test_steadycounter() { ret i64 %ret } +define i32 @test_total_smem_size() { +; CHECK-LABEL: test_total_smem_size( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %total_smem_size; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %a = tail call i32 @llvm.nvvm.read.ptx.sreg.total_smem_size() + ret i32 %a +} + +define i32 @test_dynamic_smem_size() { +; CHECK-LABEL: 
test_dynamic_smem_size( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: mov.u32 %r1, %dynamic_smem_size; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %a = tail call i32 @llvm.nvvm.read.ptx.sreg.dynamic_smem_size() + ret i32 %a +} + declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare float @llvm.nvvm.sqrt.f(float) @@ -335,3 +361,5 @@ declare void @llvm.nvvm.exit() declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer() declare i64 @llvm.readcyclecounter() declare i64 @llvm.readsteadycounter() +declare i32 @llvm.nvvm.read.ptx.sreg.total_smem_size() +declare i32 @llvm.nvvm.read.ptx.sreg.dynamic_smem_size() diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ed9dad4389453..16133a2c135b7 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -353,6 +353,12 @@ def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sre def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>; def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; +//===----------------------------------------------------------------------===// +// Various configurations of Shared memory sizes +def NVVM_TotalSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.total.smem.size">; +def NVVM_DynamicSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.dynamic.smem.size">; +def NVVM_AggrSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.aggr.smem.size", [NVVMRequiresSM<90>]>; + //===----------------------------------------------------------------------===// // Clock registers def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">; diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 9e4aadac69896..fbcf911082c57 100644 --- 
a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -156,6 +156,12 @@ llvm.func @nvvm_special_regs() -> i32 { %76 = nvvm.read.ptx.sreg.tid.x range : i32 // CHECK: %77 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %77 = nvvm.read.ptx.sreg.tid.x range : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.total_smem_size() + %78 = nvvm.read.ptx.sreg.total.smem.size : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.dynamic_smem_size() + %79 = nvvm.read.ptx.sreg.dynamic.smem.size : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.aggr_smem_size() + %80 = nvvm.read.ptx.sreg.aggr.smem.size : i32 llvm.return %1 : i32 }