From 2de4a1039f477c920269847b154a94c7ec3256a3 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <liqiangxl@gmail.com>
Date: Tue, 3 Feb 2026 07:08:58 -0800
Subject: [PATCH 1/6] add 2d inputs and copy transpose to transpose benchmark

---
 benchmarks/python/test_transpose.py | 96 +++++++++++++++++++++--------
 tests/cpp/test_transpose.cpp        | 89 ++++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 26 deletions(-)

diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index 363f32ed8be..79da68eee18 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -12,14 +12,14 @@
 def transpose_fusion(
     fd: FusionDefinition,
     dtype: DataType,
+    is_copy_transpose: bool,
     axes: list,
+    rank: int,
 ):
-    T0 = fd.define_tensor(
-        shape=[-1, -1, -1], contiguity=[True, True, True], dtype=dtype, is_cpu=False
-    )
-    T1 = fd.define_tensor(
-        shape=[-1, -1, -1], contiguity=[True, True, True], dtype=dtype, is_cpu=False
-    )
+    shape = [-1] * rank
+    contiguity = [True] * rank
+    T0 = fd.define_tensor(shape=shape, contiguity=contiguity, dtype=dtype, is_cpu=False)
+    T1 = fd.define_tensor(shape=shape, contiguity=contiguity, dtype=dtype, is_cpu=False)
 
     if dtype in PROMOTE_DTYPES:
         T0 = fd.ops.cast(T0, dtype=DataType.Float)
@@ -34,25 +34,56 @@ def transpose_fusion(
     S6 = fd.define_scalar(0.00000, dtype=DataType.Double)
     T7 = fd.ops.gt(T5, S6)
     T9 = fd.ops.where(T7, T5, S6)
-
-    fd.add_output(T9)
-
-
-def transpose_fwd_fn(inputs: list):  # [input1, input2, dim0, dim1]
-    return torch.nn.functional.relu(
-        torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
-    )
-
-
-@pytest.mark.parametrize("size", generate_input_sizes(dims=3))
+    # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler.
+    # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enfource output to be contiguous and then transpose scheduler will be used.
+    if is_copy_transpose:
+        T10 = fd.ops.segment_set(T9)
+        fd.add_output(T10)
+    else:
+        fd.add_output(T9)
+
+
+# Without contiguous, transpose returns a view with swapped strides.
+# contiguous() materializes a contiguous copy of the result.
+# When compiled with thunder, contiguous version will use nvFuser's transpose scheduler, otherwise it will use the pointwise scheduler.
+def transpose_fwd_fn(inputs: list):  # [input1, input2, dim0, dim1, is_copy_transpose]
+    is_copy_transpose = inputs[4]
+    if is_copy_transpose:
+        return torch.nn.functional.relu(
+            torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
+        ).contiguous()
+    else:
+        return torch.nn.functional.relu(
+            torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
+        )
+
+
+def _generate_transpose_params():
+    params = []
+    for dims in (2, 3):
+        sizes = generate_input_sizes(dims=dims)
+        axes_list = [(0, 1)] if dims == 2 else [(0, 1), (0, 2), (1, 2)]
+        for size in sizes:
+            for axes in axes_list:
+                params.append((size, axes, dims))
+    return params
+
+
+@pytest.mark.parametrize("size,axes,dims", _generate_transpose_params())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-@pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
+@pytest.mark.parametrize(
+    "is_copy_transpose",
+    [True, False],
+    ids=["copy_transpose", "view_transpose"],
+)
 @pytest.mark.pointwise
 def test_transpose_nvf_benchmark(
     benchmark,
     size: tuple,
+    is_copy_transpose: bool,
     dtype: torch.dtype,
-    axes: list,
+    axes: tuple,
+    dims: int,
     disable_validation: bool,
     disable_benchmarking: bool,
 ):
@@ -65,10 +96,18 @@ def test_transpose_nvf_benchmark(
     )
 
     with FusionDefinition() as fd:
-        transpose_fusion(fd, torch_dtype_to_nvfuser_dtype(dtype), permute_axes)
+        transpose_fusion(
+            fd,
+            torch_dtype_to_nvfuser_dtype(dtype),
+            is_copy_transpose,
+            permute_axes,
+            rank=dims,
+        )
 
     if not disable_validation:
-        eager_output = transpose_fwd_fn([input1, input2, axes[0], axes[1]])
+        eager_output = transpose_fwd_fn(
+            [input1, input2, axes[0], axes[1], is_copy_transpose]
+        )
         fd.validate([input1, input2], [eager_output])
 
     if not disable_benchmarking:
@@ -76,15 +115,20 @@ def test_transpose_nvf_benchmark(
 
 
 @pytest.mark.parametrize("executor", DEFAULT_EXECUTORS)
-@pytest.mark.parametrize("size", generate_input_sizes(dims=3))
+@pytest.mark.parametrize("size,axes,dims", _generate_transpose_params())
 @pytest.mark.parametrize("dtype", FLOAT_DTYPES)
-@pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)])
-@pytest.mark.pointwise
+@pytest.mark.parametrize(
+    "is_copy_transpose",
+    [True, False],
+    ids=["copy_transpose", "view_transpose"],
+)
 def test_transpose_baseline_benchmark(
     benchmark,
     size: tuple,
     dtype: torch.dtype,
-    axes: list,
+    is_copy_transpose: bool,
+    axes: tuple,
+    dims: int,
     executor: str,
 ):
     if executor == "torchcompile":
@@ -98,5 +142,5 @@ def test_transpose_baseline_benchmark(
     run_benchmark(
         benchmark,
         benchmark_fn,
-        [input1, input2, axes[0], axes[1]],
+        [input1, input2, axes[0], axes[1], is_copy_transpose],
     )
diff --git a/tests/cpp/test_transpose.cpp b/tests/cpp/test_transpose.cpp
index 4e3864e6a4a..baf46c2fab2 100644
--- a/tests/cpp/test_transpose.cpp
+++ b/tests/cpp/test_transpose.cpp
@@ -1407,4 +1407,93 @@ TEST_F(TransposeTest, DanglingBroadcastIssue4957) {
   testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
 }
 
+TEST_F(TransposeTest, SwizzleNoBankConflict) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  FusionGuard fg(fusion_ptr.get());
+  Fusion& fusion = *fusion_ptr;
+
+  auto dtype = DataType::Float;
+  auto tv0 = makeContigConcreteTensor({262144, 5120}, dtype);
+  fusion.addInput(tv0);
+  auto tv1 = transpose(tv0, 0, 1);
+  fusion.addOutput(tv1);
+
+  auto options =
+      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
+  at::Tensor input0 = at::randn({262144, 5120}, options);
+
+  auto input_cache = tv0->cacheAfter();
+  auto output_cache = tv1->cacheBefore();
+  input_cache->setMemoryType(MemoryType::Shared);
+
+  // Step-1, tiling and parallelizing non-tile dimensions
+  int64_t tile_size1 = 32, tile_size2 = 32;
+  // Group 1 (output-side layout [y, x]).
+  for (auto tv : {output_cache, tv1}) {
+    // [y, x] -> [y/tile_size2, tile_size2, x/tile_size1, tile_size1]
+    tv->split(1, tile_size1);
+    tv->split(0, tile_size2);
+    // [x/tile_size1, y/tile_size2, tile_size1, tile_size2]
+    tv->reorder({{0, 1}, {1, 3}, {2, 0}, {3, 2}});
+    // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2]
+    tv->merge(0);
+    tv->split(0, 1);
+    tv->axis(1)->parallelize(ParallelType::Unswitch);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+  }
+  // Group 2 (input-side layout [x, y]).
+  for (auto tv : {tv0, input_cache}) {
+    // [x, y] -> [x/tile_size1, tile_size1, y/tile_size2, tile_size2]
+    tv->split(1, tile_size2);
+    tv->split(0, tile_size1);
+    // [x/tile_size1, y/tile_size2, tile_size1, tile_size2]
+    tv->reorder({{1, 2}, {2, 1}});
+    // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2]
+    tv->merge(0);
+    tv->split(0, 1);
+    tv->axis(1)->parallelize(ParallelType::Unswitch);
+    tv->axis(0)->parallelize(ParallelType::BIDx);
+  }
+
+  // Step-2, schedule input shared cache to avoid bank conflict
+  int64_t pos = 2;
+  int64_t vectorize_factor = 4, threads_per_block = 128;
+  // Schedule input shared cache.
+  // [BIDx, Unswitch, tile_size1, tile_size2]
+  input_cache->split(3, vectorize_factor);
+  // [BIDx, Unswitch, tile_size1, tile_size2/vectorize_factor,
+  // vectorize_factor]
+  input_cache->split(2, vectorize_factor);
+  // [BIDx, Unswitch, tile_size1/vectorize_factor, vectorize_factor,
+  // tile_size2/vectorize_factor, vectorize_factor]
+  input_cache->swizzle(SwizzleType::XOR, 2, 4);
+  input_cache->merge(2);
+  input_cache->merge(2);
+  input_cache->split(2, threads_per_block);
+  // [BIDx, Unswitch, Unroll, TIDx, Vectorize]
+  input_cache->setAllocationDomain(input_cache->getLoopDomain(), true);
+  input_cache->axis(2)->parallelize(ParallelType::Unroll);
+  input_cache->axis(3)->parallelize(ParallelType::TIDx);
+  input_cache->axis(4)->parallelize(ParallelType::Vectorize);
+
+  // Step-3, schedule output cache
+  for (auto tv : {output_cache, tv1}) {
+    tv->reorder({{-2, -1}});
+    // [..., tile2, tile1]
+    tv->merge(pos);
+    tv->split(pos, vectorize_factor);
+    tv->split(pos, threads_per_block);
+    tv->axis(2)->parallelize(ParallelType::Unroll);
+    tv->axis(3)->parallelize(ParallelType::TIDx);
+    if (tv == tv1) {
+      tv->axis(4)->parallelize(ParallelType::Vectorize);
+    }
+  }
+  inlineMost();
+  KernelExecutor ke;
+  ke.compile(&fusion, {input0});
+  auto outputs = ke.run({input0});
+  testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser

From 180c9b3d5024c397b433d362f62822a55f7d19fd Mon Sep 17 00:00:00 2001
From: Liqiang Lu <liqiangxl@gmail.com>
Date: Tue, 3 Feb 2026 07:10:25 -0800
Subject: [PATCH 2/6] clean: remove SwizzleNoBankConflict test from test_transpose.cpp

---
 tests/cpp/test_transpose.cpp | 89 ------------------------------------
 1 file changed, 89 deletions(-)

diff --git a/tests/cpp/test_transpose.cpp b/tests/cpp/test_transpose.cpp
index baf46c2fab2..4e3864e6a4a 100644
--- a/tests/cpp/test_transpose.cpp
+++ b/tests/cpp/test_transpose.cpp
@@ -1407,93 +1407,4 @@ TEST_F(TransposeTest, DanglingBroadcastIssue4957) {
   testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__);
 }
 
-TEST_F(TransposeTest, SwizzleNoBankConflict) {
-  auto fusion_ptr = std::make_unique<Fusion>();
-  FusionGuard fg(fusion_ptr.get());
-  Fusion& fusion = *fusion_ptr;
-
-  auto dtype = DataType::Float;
-  auto tv0 = makeContigConcreteTensor({262144, 5120}, dtype);
-  fusion.addInput(tv0);
-  auto tv1 = transpose(tv0, 0, 1);
-  fusion.addOutput(tv1);
-
-  auto options =
-      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
-  at::Tensor input0 = at::randn({262144, 5120}, options);
-
-  auto input_cache = tv0->cacheAfter();
-  auto output_cache = tv1->cacheBefore();
-  input_cache->setMemoryType(MemoryType::Shared);
-
-  // Step-1, tiling and parallelizing non-tile dimensions
-  int64_t tile_size1 = 32, tile_size2 = 32;
-  // Group 1 (output-side layout [y, x]).
-  for (auto tv : {output_cache, tv1}) {
-    // [y, x] -> [y/tile_size2, tile_size2, x/tile_size1, tile_size1]
-    tv->split(1, tile_size1);
-    tv->split(0, tile_size2);
-    // [x/tile_size1, y/tile_size2, tile_size1, tile_size2]
-    tv->reorder({{0, 1}, {1, 3}, {2, 0}, {3, 2}});
-    // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2]
-    tv->merge(0);
-    tv->split(0, 1);
-    tv->axis(1)->parallelize(ParallelType::Unswitch);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-  }
-  // Group 2 (input-side layout [x, y]).
-  for (auto tv : {tv0, input_cache}) {
-    // [x, y] -> [x/tile_size1, tile_size1, y/tile_size2, tile_size2]
-    tv->split(1, tile_size2);
-    tv->split(0, tile_size1);
-    // [x/tile_size1, y/tile_size2, tile_size1, tile_size2]
-    tv->reorder({{1, 2}, {2, 1}});
-    // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2]
-    tv->merge(0);
-    tv->split(0, 1);
-    tv->axis(1)->parallelize(ParallelType::Unswitch);
-    tv->axis(0)->parallelize(ParallelType::BIDx);
-  }
-
-  // Step-2, schedule input shared cache to avoid bank conflict
-  int64_t pos = 2;
-  int64_t vectorize_factor = 4, threads_per_block = 128;
-  // Schedule input shared cache.
-  // [BIDx, Unswitch, tile_size1, tile_size2]
-  input_cache->split(3, vectorize_factor);
-  // [BIDx, Unswitch, tile_size1, tile_size2/vectorize_factor,
-  // vectorize_factor]
-  input_cache->split(2, vectorize_factor);
-  // [BIDx, Unswitch, tile_size1/vectorize_factor, vectorize_factor,
-  // tile_size2/vectorize_factor, vectorize_factor]
-  input_cache->swizzle(SwizzleType::XOR, 2, 4);
-  input_cache->merge(2);
-  input_cache->merge(2);
-  input_cache->split(2, threads_per_block);
-  // [BIDx, Unswitch, Unroll, TIDx, Vectorize]
-  input_cache->setAllocationDomain(input_cache->getLoopDomain(), true);
-  input_cache->axis(2)->parallelize(ParallelType::Unroll);
-  input_cache->axis(3)->parallelize(ParallelType::TIDx);
-  input_cache->axis(4)->parallelize(ParallelType::Vectorize);
-
-  // Step-3, schedule output cache
-  for (auto tv : {output_cache, tv1}) {
-    tv->reorder({{-2, -1}});
-    // [..., tile2, tile1]
-    tv->merge(pos);
-    tv->split(pos, vectorize_factor);
-    tv->split(pos, threads_per_block);
-    tv->axis(2)->parallelize(ParallelType::Unroll);
-    tv->axis(3)->parallelize(ParallelType::TIDx);
-    if (tv == tv1) {
-      tv->axis(4)->parallelize(ParallelType::Vectorize);
-    }
-  }
-  inlineMost();
-  KernelExecutor ke;
-  ke.compile(&fusion, {input0});
-  auto outputs = ke.run({input0});
-  testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__);
-}
-
 } // namespace nvfuser

From 56de94a4ce4a3719e1b75733e384013d5d51f0e0 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <liqiangxl@gmail.com>
Date: Tue, 3 Feb 2026 07:24:17 -0800
Subject: [PATCH 3/6] clean: compute relu(transpose(...)) once in transpose_fwd_fn

---
 benchmarks/python/test_transpose.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index 79da68eee18..66f1aee1b27 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -47,15 +47,14 @@ def transpose_fusion(
 # contiguous() materializes a contiguous copy of the result.
 # When compiled with thunder, contiguous version will use nvFuser's transpose scheduler, otherwise it will use the pointwise scheduler.
 def transpose_fwd_fn(inputs: list):  # [input1, input2, dim0, dim1, is_copy_transpose]
+    relu_transpose_result = torch.nn.functional.relu(
+        torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
+    )
     is_copy_transpose = inputs[4]
     if is_copy_transpose:
-        return torch.nn.functional.relu(
-            torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
-        ).contiguous()
+        return relu_transpose_result.contiguous()
     else:
-        return torch.nn.functional.relu(
-            torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3])
-        )
+        return relu_transpose_result
 
 
 def _generate_transpose_params():

From 1ceac27fcf1058a9f156974ddd8da6966031a831 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Date: Tue, 3 Feb 2026 10:47:45 -0500
Subject: [PATCH 4/6] Apply suggestions from code review

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 benchmarks/python/test_transpose.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index 66f1aee1b27..6ee4c5377d0 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -35,7 +35,7 @@ def transpose_fusion(
     T7 = fd.ops.gt(T5, S6)
     T9 = fd.ops.where(T7, T5, S6)
     # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler.
-    # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enfource output to be contiguous and then transpose scheduler will be used.
+    # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enforce output to be contiguous and then transpose scheduler will be used.
     if is_copy_transpose:
         T10 = fd.ops.segment_set(T9)
         fd.add_output(T10)

From bb136c3b60d34686a82f5d50a27cc32b72e3d6e5 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Date: Tue, 3 Feb 2026 14:27:54 -0500
Subject: [PATCH 5/6] Update benchmarks/python/test_transpose.py

Co-authored-by: Priya Mishra <52657555+Priya2698@users.noreply.github.com>
---
 benchmarks/python/test_transpose.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index 6ee4c5377d0..0a26d4b2dc4 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -35,7 +35,7 @@ def transpose_fusion(
     T7 = fd.ops.gt(T5, S6)
     T9 = fd.ops.where(T7, T5, S6)
     # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler.
-    # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enforce output to be contiguous and then transpose scheduler will be used.
+    #we can also expose OptimizationPassGuard to python frontend and disable presegmentation passes to enforce output to be contiguous and then transpose scheduler will be used.
     if is_copy_transpose:
         T10 = fd.ops.segment_set(T9)
         fd.add_output(T10)

From e1dcbe7ead89f05db3a36df3d9f42a18fc5ea719 Mon Sep 17 00:00:00 2001
From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Date: Tue, 3 Feb 2026 14:48:40 -0500
Subject: [PATCH 6/6] Update benchmarks/python/test_transpose.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 benchmarks/python/test_transpose.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py
index 0a26d4b2dc4..11b66774708 100644
--- a/benchmarks/python/test_transpose.py
+++ b/benchmarks/python/test_transpose.py
@@ -35,7 +35,7 @@ def transpose_fusion(
     T7 = fd.ops.gt(T5, S6)
     T9 = fd.ops.where(T7, T5, S6)
     # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler.
-    #we can also expose OptimizationPassGuard to python frontend and disable presegmentation passes to enforce output to be contiguous and then transpose scheduler will be used.
+    # Alternatively, expose OptimizationPassGuard to the Python frontend and disable presegmentation passes to force a contiguous output, so that the transpose scheduler is used.
     if is_copy_transpose:
         T10 = fd.ops.segment_set(T9)
         fd.add_output(T10)