-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[MLIR][NVPTX] Add intrinsics and Ops to read smem-sizes #173089
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
This patch adds three intrinsics and their corresponding Ops representing the PTX special-register read instructions that report the shared-memory sizes. Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
|
@llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-backend-nvptx Author: Durgadoss R (durga4github). Changes: This patch adds three intrinsics and their corresponding Ops representing the PTX special-register read instructions that report various configurations of shared-memory sizes. Full diff: https://github.com/llvm/llvm-project/pull/173089.diff (6 files affected):
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bddbf4ea3c185..bc62cb1ca5fcb 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2406,6 +2406,10 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GR
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_aggr_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_dynamic_smem_size : PTXReadSRegIntrinsicNB_r32;
+
//
// SHUFFLE
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b145e1d53f46c..0bf4ab0bff8eb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4761,6 +4761,14 @@ def INT_PTX_SREG_CLUSTER_NCTARANK:
int_nvvm_read_ptx_sreg_cluster_nctarank,
[hasSM<90>, hasPTX<78>]>;
+def INT_PTX_SREG_TOTAL_SMEM_SIZE :
+ PTX_READ_SREG_R32<"total_smem_size", int_nvvm_read_ptx_sreg_total_smem_size>;
+def INT_PTX_SREG_DYNAMIC_SMEM_SIZE :
+ PTX_READ_SREG_R32<"dynamic_smem_size", int_nvvm_read_ptx_sreg_dynamic_smem_size>;
+def INT_PTX_SREG_AGGR_SMEM_SIZE :
+ PTX_READ_SREG_R32<"aggr_smem_size",
+ int_nvvm_read_ptx_sreg_aggr_smem_size,
+ [hasSM<90>, hasPTX<81>]>;
def SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
def SREG_WARPID : PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
new file mode 100644
index 0000000000000..4a79e94ce98cf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| %ptxas-verify -arch=sm_90 %}
+
+define i32 @test_aggr_smem_size() {
+; CHECK-LABEL: test_aggr_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %aggr_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+ ret i32 %a
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 00eb8e293e0fd..7d29b82eeaa76 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -318,6 +318,32 @@ define i64 @test_steadycounter() {
ret i64 %ret
}
+define i32 @test_total_smem_size() {
+; CHECK-LABEL: test_total_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %total_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+ ret i32 %a
+}
+
+define i32 @test_dynamic_smem_size() {
+; CHECK-LABEL: test_dynamic_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %dynamic_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+ ret i32 %a
+}
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.nvvm.sqrt.f(float)
@@ -335,3 +361,5 @@ declare void @llvm.nvvm.exit()
declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
declare i64 @llvm.readcyclecounter()
declare i64 @llvm.readsteadycounter()
+declare i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+declare i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index ed9dad4389453..16133a2c135b7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -353,6 +353,12 @@ def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sre
def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>;
def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
+//===----------------------------------------------------------------------===//
+// Various configurations of Shared memory sizes
+def NVVM_TotalSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.total.smem.size">;
+def NVVM_DynamicSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.dynamic.smem.size">;
+def NVVM_AggrSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.aggr.smem.size", [NVVMRequiresSM<90>]>;
+
//===----------------------------------------------------------------------===//
// Clock registers
def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 9e4aadac69896..13f795e2118f5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -156,6 +156,12 @@ llvm.func @nvvm_special_regs() -> i32 {
%76 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 0> : i32
// CHECK: %77 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%77 = nvvm.read.ptx.sreg.tid.x range <i32, 4294967295, 4294967295> : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+ %78 = nvvm.read.ptx.sreg.total.smem.size : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+ %79 = nvvm.read.ptx.sreg.dynamic.smem.size : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+ %80 = nvvm.read.ptx.sreg.aggr.smem.size : i32
llvm.return %1 : i32
}
|
|
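@llvm/pr-subscribers-mlir Author: Durgadoss R (durga4github). Changes: This patch adds three intrinsics and their corresponding Ops representing the PTX special-register read instructions that report various configurations of shared-memory sizes. Full diff: https://github.com/llvm/llvm-project/pull/173089.diff (6 files affected):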
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bddbf4ea3c185..bc62cb1ca5fcb 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2406,6 +2406,10 @@ defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32<MAX_GR
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_aggr_smem_size : PTXReadSRegIntrinsicNB_r32;
+def int_nvvm_read_ptx_sreg_dynamic_smem_size : PTXReadSRegIntrinsicNB_r32;
+
//
// SHUFFLE
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b145e1d53f46c..0bf4ab0bff8eb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -4761,6 +4761,14 @@ def INT_PTX_SREG_CLUSTER_NCTARANK:
int_nvvm_read_ptx_sreg_cluster_nctarank,
[hasSM<90>, hasPTX<78>]>;
+def INT_PTX_SREG_TOTAL_SMEM_SIZE :
+ PTX_READ_SREG_R32<"total_smem_size", int_nvvm_read_ptx_sreg_total_smem_size>;
+def INT_PTX_SREG_DYNAMIC_SMEM_SIZE :
+ PTX_READ_SREG_R32<"dynamic_smem_size", int_nvvm_read_ptx_sreg_dynamic_smem_size>;
+def INT_PTX_SREG_AGGR_SMEM_SIZE :
+ PTX_READ_SREG_R32<"aggr_smem_size",
+ int_nvvm_read_ptx_sreg_aggr_smem_size,
+ [hasSM<90>, hasPTX<81>]>;
def SREG_LANEID : PTX_READ_SREG_R32<"laneid", int_nvvm_read_ptx_sreg_laneid>;
def SREG_WARPID : PTX_READ_SREG_R32<"warpid", int_nvvm_read_ptx_sreg_warpid>;
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
new file mode 100644
index 0000000000000..4a79e94ce98cf
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90-ptx81.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx81| %ptxas-verify -arch=sm_90 %}
+
+define i32 @test_aggr_smem_size() {
+; CHECK-LABEL: test_aggr_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %aggr_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+ ret i32 %a
+}
+
+declare i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll
index 00eb8e293e0fd..7d29b82eeaa76 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll
@@ -318,6 +318,32 @@ define i64 @test_steadycounter() {
ret i64 %ret
}
+define i32 @test_total_smem_size() {
+; CHECK-LABEL: test_total_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %total_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+ ret i32 %a
+}
+
+define i32 @test_dynamic_smem_size() {
+; CHECK-LABEL: test_dynamic_smem_size(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.u32 %r1, %dynamic_smem_size;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: ret;
+ %a = tail call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+ ret i32 %a
+}
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare float @llvm.nvvm.sqrt.f(float)
@@ -335,3 +361,5 @@ declare void @llvm.nvvm.exit()
declare i64 @llvm.nvvm.read.ptx.sreg.globaltimer()
declare i64 @llvm.readcyclecounter()
declare i64 @llvm.readsteadycounter()
+declare i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+declare i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index ed9dad4389453..16133a2c135b7 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -353,6 +353,12 @@ def NVVM_ClusterDimBlocksZOp : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sre
def NVVM_ClusterId : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank", [NVVMRequiresSM<90>]>;
def NVVM_ClusterDim : NVVM_PureSpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">;
+//===----------------------------------------------------------------------===//
+// Various configurations of Shared memory sizes
+def NVVM_TotalSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.total.smem.size">;
+def NVVM_DynamicSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.dynamic.smem.size">;
+def NVVM_AggrSmemSize : NVVM_PureSpecialRegisterOp<"read.ptx.sreg.aggr.smem.size", [NVVMRequiresSM<90>]>;
+
//===----------------------------------------------------------------------===//
// Clock registers
def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">;
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index 9e4aadac69896..13f795e2118f5 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -156,6 +156,12 @@ llvm.func @nvvm_special_regs() -> i32 {
%76 = nvvm.read.ptx.sreg.tid.x range <i32, 0, 0> : i32
// CHECK: %77 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%77 = nvvm.read.ptx.sreg.tid.x range <i32, 4294967295, 4294967295> : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.total.smem.size()
+ %78 = nvvm.read.ptx.sreg.total.smem.size : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.dynamic.smem.size()
+ %79 = nvvm.read.ptx.sreg.dynamic.smem.size : i32
+ // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.aggr.smem.size()
+ %80 = nvvm.read.ptx.sreg.aggr.smem.size : i32
llvm.return %1 : i32
}
|
AlexMaclean
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please update NVPTXUsage.rst with these intrinsics as well.
| def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32; | ||
| def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32; | ||
|
|
||
| def int_nvvm_read_ptx_sreg_total_smem_size : PTXReadSRegIntrinsicNB_r32; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: perhaps we should give these things names like "nvvm.read.ptx.sreg.total_smem_size". The underscore is more consistent with PTX, while the "." seems to imply that each word is a subfield.
This patch adds three intrinsics and their corresponding Ops
representing the PTX special-register read instructions
that report various configurations of shared-memory sizes.