Skip to content

Conversation

@durga4github
Copy link
Contributor

This patch adds three intrinsics and their corresponding Ops
representing the PTX special-register read instructions
that report shared-memory sizes: %total_smem_size,
%dynamic_smem_size, and %aggr_smem_size.

This patch adds three intrinsics and their corresponding Ops
representing the PTX special-register read instructions
that report the shared-memory sizes.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
@llvmbot
Copy link
Member

llvmbot commented Dec 19, 2025

@llvm/pr-subscribers-mlir-llvm
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-nvptx

Author: Durgadoss R (durga4github)

Changes

This patch adds three intrinsics and their corresponding Ops
representing the PTX special-register read instructions
that report various configurations of shared-memory sizes.


Full diff: https://github.com/llvm/llvm-project/pull/173089.diff

6 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+4)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+8)
  • (added) llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll (+18)
  • (modified) llvm/test/CodeGen/NVPTX/intrinsics.ll (+28)
  • (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+6)
  • (modified) mlir/test/Target/LLVMIR/nvvmir.mlir (+6)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bddbf4ea3c185..bc62cb1ca5fcb 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2406,6 +2406,10 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GR
 def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
 def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
 
+def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_aggr_smem_size  : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_dynamic_smem_size : PTXReadSRegIntrinsicNB_r32;
+
 //
 // SHUFFLE
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b145e1d53f46c..0bf4ab0bff8eb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4761,6 +4761,14 @@ def  INT_PTX_SREG_CLUSTER_NCTARANK:
                          int_nvvm_read_ptx_sreg_cluster_nctarank,
                          [hasSM<90>, hasPTX<78>]>;
 
+def INT_PTX_SREG_TOTAL_SMEM_SIZE :
+    PTX_READ_SREG_R32<"total_smem_size", int_nvvm_read_ptx_sreg_total_smem_size>;
+def INT_PTX_SREG_DYNAMIC_SMEM_SIZE :
+    PTX_READ_SREG_R32<"dynamic_smem_size", int_nvvm_read_ptx_sreg_dynamic_smem_size>;
+def INT_PTX_SREG_AGGR_SMEM_SIZE :
+    PTX_READ_SREG_R32<"aggr_smem_size",
+                      int_nvvm_read_ptx_sreg_aggr_smem_size,
+                      [hasSM<90>, hasPTX<81>]>;
 
 def SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
 def SREG_WARPID : PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
new file mode 100644
index 0000000000000..4a79e94ce98cf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| %ptxas-verify -arch=sm_90 %}
+
+define i32 @test_aggr_smem_size() {
+; CHECK-LABEL: test_aggr_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %aggr_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+  ret i32 %a
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 00eb8e293e0fd..7d29b82eeaa76 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -318,6 +318,32 @@ define i64 @test_steadycounter() {
   ret i64 %ret
 }
 
+define i32 @test_total_smem_size() {
+; CHECK-LABEL: test_total_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %total_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+  ret i32 %a
+}
+
+define i32 @test_dynamic_smem_size() {
+; CHECK-LABEL: test_dynamic_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %dynamic_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+  ret i32 %a
+}
+
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
 declare float @llvm.nvvm.sqrt.f(float)
@@ -335,3 +361,5 @@ declare void @llvm.nvvm.exit()
 declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
 declare i64 @llvm.readcyclecounter()
 declare i64 @llvm.readsteadycounter()
+declare i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+declare i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index ed9dad4389453..16133a2c135b7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -353,6 +353,12 @@ def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sre
 def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>;
 def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
 
+//===----------------------------------------------------------------------===//
+// Various configurations of Shared memory sizes
+def NVVM_TotalSmemSize   : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.total.smem.size">;
+def NVVM_DynamicSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.dynamic.smem.size">;
+def NVVM_AggrSmemSize    : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.aggr.smem.size", [NVVMRequiresSM<90>]>;
+
 //===----------------------------------------------------------------------===//
 // Clock registers
 def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 9e4aadac69896..13f795e2118f5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -156,6 +156,12 @@ llvm.func @nvvm_special_regs() -> i32 {
   %76 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 0> : i32
   // CHECK: %77 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %77 = nvvm.read.ptx.sreg.tid.x range <i32, 4294967295, 4294967295> : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+  %78 = nvvm.read.ptx.sreg.total.smem.size : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+  %79 = nvvm.read.ptx.sreg.dynamic.smem.size : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+  %80 = nvvm.read.ptx.sreg.aggr.smem.size : i32
   llvm.return %1 : i32
 }
 

@llvmbot
Copy link
Member

llvmbot commented Dec 19, 2025

@llvm/pr-subscribers-mlir

Author: Durgadoss R (durga4github)

Changes

This patch adds three intrinsics and their corresponding Ops
representing the PTX special-register read instructions
that report various configurations of shared-memory sizes.


Full diff: https://github.com/llvm/llvm-project/pull/173089.diff

6 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+4)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+8)
  • (added) llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll (+18)
  • (modified) llvm/test/CodeGen/NVPTX/intrinsics.ll (+28)
  • (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+6)
  • (modified) mlir/test/Target/LLVMIR/nvvmir.mlir (+6)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bddbf4ea3c185..bc62cb1ca5fcb 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2406,6 +2406,10 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GR
 def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
 def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
 
+def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_aggr_smem_size  : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_dynamic_smem_size : PTXReadSRegIntrinsicNB_r32;
+
 //
 // SHUFFLE
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b145e1d53f46c..0bf4ab0bff8eb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4761,6 +4761,14 @@ def  INT_PTX_SREG_CLUSTER_NCTARANK:
                          int_nvvm_read_ptx_sreg_cluster_nctarank,
                          [hasSM<90>, hasPTX<78>]>;
 
+def INT_PTX_SREG_TOTAL_SMEM_SIZE :
+    PTX_READ_SREG_R32<"total_smem_size", int_nvvm_read_ptx_sreg_total_smem_size>;
+def INT_PTX_SREG_DYNAMIC_SMEM_SIZE :
+    PTX_READ_SREG_R32<"dynamic_smem_size", int_nvvm_read_ptx_sreg_dynamic_smem_size>;
+def INT_PTX_SREG_AGGR_SMEM_SIZE :
+    PTX_READ_SREG_R32<"aggr_smem_size",
+                      int_nvvm_read_ptx_sreg_aggr_smem_size,
+                      [hasSM<90>, hasPTX<81>]>;
 
 def SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
 def SREG_WARPID : PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
new file mode 100644
index 0000000000000..4a79e94ce98cf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| %ptxas-verify -arch=sm_90 %}
+
+define i32 @test_aggr_smem_size() {
+; CHECK-LABEL: test_aggr_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %aggr_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+  ret i32 %a
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 00eb8e293e0fd..7d29b82eeaa76 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -318,6 +318,32 @@ define i64 @test_steadycounter() {
   ret i64 %ret
 }
 
+define i32 @test_total_smem_size() {
+; CHECK-LABEL: test_total_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %total_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+  ret i32 %a
+}
+
+define i32 @test_dynamic_smem_size() {
+; CHECK-LABEL: test_dynamic_smem_size(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.u32 %r1, %dynamic_smem_size;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
+  %a = tail call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+  ret i32 %a
+}
+
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
 declare float @llvm.nvvm.sqrt.f(float)
@@ -335,3 +361,5 @@ declare void @llvm.nvvm.exit()
 declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
 declare i64 @llvm.readcyclecounter()
 declare i64 @llvm.readsteadycounter()
+declare i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+declare i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index ed9dad4389453..16133a2c135b7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -353,6 +353,12 @@ def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sre
 def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>;
 def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
 
+//===----------------------------------------------------------------------===//
+// Various configurations of Shared memory sizes
+def NVVM_TotalSmemSize   : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.total.smem.size">;
+def NVVM_DynamicSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.dynamic.smem.size">;
+def NVVM_AggrSmemSize    : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.aggr.smem.size", [NVVMRequiresSM<90>]>;
+
 //===----------------------------------------------------------------------===//
 // Clock registers
 def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 9e4aadac69896..13f795e2118f5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -156,6 +156,12 @@ llvm.func @nvvm_special_regs() -> i32 {
   %76 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 0> : i32
   // CHECK: %77 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %77 = nvvm.read.ptx.sreg.tid.x range <i32, 4294967295, 4294967295> : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+  %78 = nvvm.read.ptx.sreg.total.smem.size : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+  %79 = nvvm.read.ptx.sreg.dynamic.smem.size : i32
+  // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+  %80 = nvvm.read.ptx.sreg.aggr.smem.size : i32
   llvm.return %1 : i32
 }
 

@durga4github durga4github requested a review from grypp December 19, 2025 19:51
Copy link
Member

@AlexMaclean AlexMaclean left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please update NVPTXUsage.rst with these intrinsics as well.

def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;

def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: perhaps we should give these things names like "nvvm.read.ptx.sreg.total_smem_size". The underscore is more consistent with PTX, while the "." seems to imply that each word is a subfield.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants