diff --git a/lib/PTO/IR/PTO.cpp b/lib/PTO/IR/PTO.cpp index 4047bc7a6..874ba04e6 100644 --- a/lib/PTO/IR/PTO.cpp +++ b/lib/PTO/IR/PTO.cpp @@ -10677,9 +10677,17 @@ static LogicalResult verifyFrontendInitCommon(InitOpT op, "globaltensor pipe init expects only 'gm_slot_tensor' and no " "'gm_slot_buffer', 'c2v_consumer_buf', or 'v2c_consumer_buf'"); } - if (op.getLocalSlotNumAttr()) - return op.emitOpError( - "globaltensor pipe init does not use 'local_slot_num'"); + if (auto localSlotNumAttr = op.getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return op.emitOpError("expects 'local_slot_num' to be greater than 0"); + int32_t loweredSlotNum = dirMask == 3 ? 4 : 8; + if (localSlotNum > loweredSlotNum) { + return op.emitOpError() + << "expects 'local_slot_num' to be less than or equal to " + << loweredSlotNum << " for dir_mask = " << static_cast<int>(dirMask); + } + } if (getTargetArch(op.getOperation()) == PTOArch::A5) { return op.emitOpError( "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); @@ -11437,12 +11445,24 @@ LogicalResult InitializeL2G2LPipeOp::verify() { : std::nullopt))) return failure(); + bool hasGmSlotTensor = + getGmAddr() && isa<TensorViewType>(getGmAddr().getType()); + if (!getLocalAddr()) { if (getPeerLocalAddr()) return emitOpError("'peer_local_addr' requires 'local_addr'"); - if (getLocalSlotNumAttr()) + if (getLocalSlotNumAttr() && !hasGmSlotTensor) return emitOpError( - "'local_slot_num' is only allowed when 'local_addr' is present"); + "'local_slot_num' is only allowed when 'local_addr' is present " + "or the pipe init uses a globaltensor slot"); + if (auto localSlotNumAttr = getLocalSlotNumAttr()) { + int32_t localSlotNum = localSlotNumAttr.getInt(); + if (localSlotNum <= 0) + return emitOpError("expects 'local_slot_num' to be greater than 0"); + if (static_cast<uint32_t>(localSlotNum) > getSlotNum()) + return emitOpError( + "expects 'local_slot_num' to be less than or equal to slot_num"); + } return 
success(); } diff --git a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp index 162e7e9b5..fe00dd457 100644 --- a/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp +++ b/lib/PTO/Transforms/PTOLowerFrontendPipeOpsPass.cpp @@ -126,9 +126,9 @@ static FailureOr<Value> createFrontendPipe(InitOpT initOp, IRRewriter &rewriter, "globaltensor pipe entries are supported for a2/a3 l2g2l pipes"); auto pipe = rewriter.create<PipeOpT>( - loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, IntegerAttr{}, - IntegerAttr{}, noSplitAttr, initOp.getGmSlotTensor(), Value{}, - Value{}); + loc, pipeTy, dirAttr, slotSizeAttr, slotNumAttr, + initOp.getLocalSlotNumAttr(), IntegerAttr{}, noSplitAttr, + initOp.getGmSlotTensor(), Value{}, Value{}); propagateFrontendIdAttr(initOp, pipe.getOperation(), rewriter); return pipe.getPipe(); } diff --git a/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto new file mode 100644 index 000000000..22132bb9b --- /dev/null +++ b/test/lit/pto/tpush_tpop_globaltensor_local_slot_num_a3.pto @@ -0,0 +1,66 @@ +// RUN: ptoas --pto-arch=a3 %s 2>&1 | FileCheck %s + +// Verify that `local_slot_num` is accepted on the gm_slot_tensor (address-based) +// form of pto.aic_initialize_pipe / pto.aiv_initialize_pipe and that it flows +// through to the lowered TPipe<..., LocalSlotNum, ...> template instantiation. +// +// Without an override, ptoas keeps the current default of LocalSlotNum=SlotNum. +// With local_slot_num=2, the lowered TPipe must use 2, which matches the +// `LocalSlotNum=2` template default in +// `include/pto/npu/a2a3/TPush.hpp` and the manual flash-attention kernel in +// `kernels/manual/common/flash_atten/fa_performance_kernel.cpp`.
+ +module { + func.func @cube_kernel( + %gm_slot_buffer : !pto.ptr, + %src : !pto.tile_buf) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aic_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.talloc_to_aiv {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tstore ins(%src : !pto.tile_buf) + outs(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + pto.tpush_to_aiv(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } + + func.func @vector_kernel( + %gm_slot_buffer : !pto.ptr, + %dst : !pto.tile_buf) + attributes {pto.kernel_kind = #pto.kernel_kind} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %gm_slots = pto.make_tensor_view %gm_slot_buffer, + shape = [%c16, %c16], strides = [%c16, %c1] + : !pto.tensor_view<16x16xf32> + pto.aiv_initialize_pipe {id = 0, dir_mask = 1, slot_size = 1024, local_slot_num = 2} + (gm_slot_tensor = %gm_slots : !pto.tensor_view<16x16xf32>) + + %entry = pto.tpop_from_aic {id = 0, split = 0} + -> !pto.tensor_view<16x16xf32> + %entry_partition = pto.partition_view %entry, + offsets = [%c0, %c0], sizes = [%c16, %c16] + : !pto.tensor_view<16x16xf32> -> !pto.partition_tensor_view<16x16xf32> + pto.tload ins(%entry_partition : !pto.partition_tensor_view<16x16xf32>) + outs(%dst : !pto.tile_buf) + pto.tfree_from_aic(%entry : !pto.tensor_view<16x16xf32>) {id = 0, split = 0} + func.return + } +} + +// CHECK-LABEL: AICORE void cube_kernel +// CHECK: 
TPipe<0, Direction::DIR_C2V, 1024, 8, 2, +// CHECK-LABEL: AICORE void vector_kernel +// CHECK: TPipe<0, Direction::DIR_C2V, 1024, 8, 2,