From 2de4a1039f477c920269847b154a94c7ec3256a3 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 3 Feb 2026 07:08:58 -0800 Subject: [PATCH 1/6] add 2d inputs and copy transpose to transpose benchmark --- benchmarks/python/test_transpose.py | 96 +++++++++++++++++++++-------- tests/cpp/test_transpose.cpp | 89 ++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 26 deletions(-) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index 363f32ed8be..79da68eee18 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -12,14 +12,14 @@ def transpose_fusion( fd: FusionDefinition, dtype: DataType, + is_copy_transpose: bool, axes: list, + rank: int, ): - T0 = fd.define_tensor( - shape=[-1, -1, -1], contiguity=[True, True, True], dtype=dtype, is_cpu=False - ) - T1 = fd.define_tensor( - shape=[-1, -1, -1], contiguity=[True, True, True], dtype=dtype, is_cpu=False - ) + shape = [-1] * rank + contiguity = [True] * rank + T0 = fd.define_tensor(shape=shape, contiguity=contiguity, dtype=dtype, is_cpu=False) + T1 = fd.define_tensor(shape=shape, contiguity=contiguity, dtype=dtype, is_cpu=False) if dtype in PROMOTE_DTYPES: T0 = fd.ops.cast(T0, dtype=DataType.Float) @@ -34,25 +34,56 @@ def transpose_fusion( S6 = fd.define_scalar(0.00000, dtype=DataType.Double) T7 = fd.ops.gt(T5, S6) T9 = fd.ops.where(T7, T5, S6) - - fd.add_output(T9) - - -def transpose_fwd_fn(inputs: list): # [input1, input2, dim0, dim1] - return torch.nn.functional.relu( - torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) - ) - - -@pytest.mark.parametrize("size", generate_input_sizes(dims=3)) + # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler. + # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enfource output to be contiguous and then transpose scheduler will be used. + if is_copy_transpose: + T10 = fd.ops.segment_set(T9) + fd.add_output(T10) + else: + fd.add_output(T9) + + +# Without contiguous, transpose returns a view with swapped strides. +# contiguous() materializes a contiguous copy of the result. +# When compiled with thunder, contiguous version will use nvFuser's transpose scheduler, otherwise it will use the pointwise scheduler. +def transpose_fwd_fn(inputs: list): # [input1, input2, dim0, dim1, is_copy_transpose] + is_copy_transpose = inputs[4] + if is_copy_transpose: + return torch.nn.functional.relu( + torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) + ).contiguous() + else: + return torch.nn.functional.relu( + torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) + ) + + +def _generate_transpose_params(): + params = [] + for dims in (2, 3): + sizes = generate_input_sizes(dims=dims) + axes_list = [(0, 1)] if dims == 2 else [(0, 1), (0, 2), (1, 2)] + for size in sizes: + for axes in axes_list: + params.append((size, axes, dims)) + return params + + +@pytest.mark.parametrize("size,axes,dims", _generate_transpose_params()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) -@pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)]) +@pytest.mark.parametrize( + "is_copy_transpose", + [True, False], + ids=["copy_transpose", "view_transpose"], +) @pytest.mark.pointwise def test_transpose_nvf_benchmark( benchmark, size: tuple, + is_copy_transpose: bool, dtype: torch.dtype, - axes: list, + axes: tuple, + dims: int, disable_validation: bool, disable_benchmarking: bool, ): @@ -65,10 +96,18 @@ def test_transpose_nvf_benchmark( ) with FusionDefinition() as fd: - transpose_fusion(fd, torch_dtype_to_nvfuser_dtype(dtype), permute_axes) + transpose_fusion( + fd, + torch_dtype_to_nvfuser_dtype(dtype), + is_copy_transpose, + permute_axes, + rank=dims, + ) if not disable_validation: - eager_output = transpose_fwd_fn([input1, input2, axes[0], axes[1]]) + eager_output = transpose_fwd_fn( + [input1, input2, axes[0], axes[1], is_copy_transpose] + ) fd.validate([input1, input2], [eager_output]) if not disable_benchmarking: @@ -76,15 +115,20 @@ def test_transpose_nvf_benchmark( @pytest.mark.parametrize("executor", DEFAULT_EXECUTORS) -@pytest.mark.parametrize("size", generate_input_sizes(dims=3)) +@pytest.mark.parametrize("size,axes,dims", _generate_transpose_params()) @pytest.mark.parametrize("dtype", FLOAT_DTYPES) -@pytest.mark.parametrize("axes", [(0, 1), (0, 2), (1, 2)]) -@pytest.mark.pointwise +@pytest.mark.parametrize( + "is_copy_transpose", + [True, False], + ids=["copy_transpose", "view_transpose"], +) def test_transpose_baseline_benchmark( benchmark, size: tuple, dtype: torch.dtype, - axes: list, + is_copy_transpose: bool, + axes: tuple, + dims: int, executor: str, ): if executor == "torchcompile": @@ -98,5 +142,5 @@ def test_transpose_baseline_benchmark( run_benchmark( benchmark, benchmark_fn, - [input1, input2, axes[0], axes[1]], + [input1, input2, axes[0], axes[1], is_copy_transpose], ) diff --git a/tests/cpp/test_transpose.cpp b/tests/cpp/test_transpose.cpp index 4e3864e6a4a..baf46c2fab2 100644 --- a/tests/cpp/test_transpose.cpp +++ b/tests/cpp/test_transpose.cpp @@ -1407,4 +1407,93 @@ TEST_F(TransposeTest, DanglingBroadcastIssue4957) { testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } +TEST_F(TransposeTest, SwizzleNoBankConflict) { + auto fusion_ptr = std::make_unique(); + FusionGuard fg(fusion_ptr.get()); + Fusion& fusion = *fusion_ptr; + + auto dtype = DataType::Float; + auto tv0 = makeContigConcreteTensor({262144, 5120}, dtype); + fusion.addInput(tv0); + auto tv1 = transpose(tv0, 0, 1); + fusion.addOutput(tv1); + + auto options = + at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({262144, 5120}, options); + + auto input_cache = tv0->cacheAfter(); + auto output_cache = tv1->cacheBefore(); + input_cache->setMemoryType(MemoryType::Shared); + + // Step-1, tiling and parallelizing non-tile dimensions + int64_t tile_size1 = 32, tile_size2 = 32; + // Group 1 (output-side layout [y, x]). + for (auto tv : {output_cache, tv1}) { + // [y, x] -> [y/tile_size2, tile_size2, x/tile_size1, tile_size1] + tv->split(1, tile_size1); + tv->split(0, tile_size2); + // [x/tile_size1, y/tile_size2, tile_size1, tile_size2] + tv->reorder({{0, 1}, {1, 3}, {2, 0}, {3, 2}}); + // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2] + tv->merge(0); + tv->split(0, 1); + tv->axis(1)->parallelize(ParallelType::Unswitch); + tv->axis(0)->parallelize(ParallelType::BIDx); + } + // Group 2 (input-side layout [x, y]). + for (auto tv : {tv0, input_cache}) { + // [x, y] -> [x/tile_size1, tile_size1, y/tile_size2, tile_size2] + tv->split(1, tile_size2); + tv->split(0, tile_size1); + // [x/tile_size1, y/tile_size2, tile_size1, tile_size2] + tv->reorder({{1, 2}, {2, 1}}); + // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2] + tv->merge(0); + tv->split(0, 1); + tv->axis(1)->parallelize(ParallelType::Unswitch); + tv->axis(0)->parallelize(ParallelType::BIDx); + } + + // Step-2, schedule input shared cache to avoid bank conflict + int64_t pos = 2; + int64_t vectorize_factor = 4, threads_per_block = 128; + // Schedule input shared cache. + // [BIDx, Unswitch, tile_size1, tile_size2] + input_cache->split(3, vectorize_factor); + // [BIDx, Unswitch, tile_size1, tile_size2/vectorize_factor, + // vectorize_factor] + input_cache->split(2, vectorize_factor); + // [BIDx, Unswitch, tile_size1/vectorize_factor, vectorize_factor, + // tile_size2/vectorize_factor, vectorize_factor] + input_cache->swizzle(SwizzleType::XOR, 2, 4); + input_cache->merge(2); + input_cache->merge(2); + input_cache->split(2, threads_per_block); + // [BIDx, Unswitch, Unroll, TIDx, Vectorize] + input_cache->setAllocationDomain(input_cache->getLoopDomain(), true); + input_cache->axis(2)->parallelize(ParallelType::Unroll); + input_cache->axis(3)->parallelize(ParallelType::TIDx); + input_cache->axis(4)->parallelize(ParallelType::Vectorize); + + // Step-3, schedule output cache + for (auto tv : {output_cache, tv1}) { + tv->reorder({{-2, -1}}); + // [..., tile2, tile1] + tv->merge(pos); + tv->split(pos, vectorize_factor); + tv->split(pos, threads_per_block); + tv->axis(2)->parallelize(ParallelType::Unroll); + tv->axis(3)->parallelize(ParallelType::TIDx); + if (tv == tv1) { + tv->axis(4)->parallelize(ParallelType::Vectorize); + } + } + inlineMost(); + KernelExecutor ke; + ke.compile(&fusion, {input0}); + auto outputs = ke.run({input0}); + testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__); +} + } // namespace nvfuser From 180c9b3d5024c397b433d362f62822a55f7d19fd Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 3 Feb 2026 07:10:25 -0800 Subject: [PATCH 2/6] clean --- tests/cpp/test_transpose.cpp | 89 ------------------------------------ 1 file changed, 89 deletions(-) diff --git a/tests/cpp/test_transpose.cpp b/tests/cpp/test_transpose.cpp index baf46c2fab2..4e3864e6a4a 100644 --- a/tests/cpp/test_transpose.cpp +++ b/tests/cpp/test_transpose.cpp @@ -1407,93 +1407,4 @@ TEST_F(TransposeTest, DanglingBroadcastIssue4957) { testValidate(executor_cache.fusion(), outputs, {t0}, __LINE__, __FILE__); } -TEST_F(TransposeTest, SwizzleNoBankConflict) { - auto fusion_ptr = std::make_unique(); - FusionGuard fg(fusion_ptr.get()); - Fusion& fusion = *fusion_ptr; - - auto dtype = DataType::Float; - auto tv0 = makeContigConcreteTensor({262144, 5120}, dtype); - fusion.addInput(tv0); - auto tv1 = transpose(tv0, 0, 1); - fusion.addOutput(tv1); - - auto options = - at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({262144, 5120}, options); - - auto input_cache = tv0->cacheAfter(); - auto output_cache = tv1->cacheBefore(); - input_cache->setMemoryType(MemoryType::Shared); - - // Step-1, tiling and parallelizing non-tile dimensions - int64_t tile_size1 = 32, tile_size2 = 32; - // Group 1 (output-side layout [y, x]). - for (auto tv : {output_cache, tv1}) { - // [y, x] -> [y/tile_size2, tile_size2, x/tile_size1, tile_size1] - tv->split(1, tile_size1); - tv->split(0, tile_size2); - // [x/tile_size1, y/tile_size2, tile_size1, tile_size2] - tv->reorder({{0, 1}, {1, 3}, {2, 0}, {3, 2}}); - // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2] - tv->merge(0); - tv->split(0, 1); - tv->axis(1)->parallelize(ParallelType::Unswitch); - tv->axis(0)->parallelize(ParallelType::BIDx); - } - // Group 2 (input-side layout [x, y]). - for (auto tv : {tv0, input_cache}) { - // [x, y] -> [x/tile_size1, tile_size1, y/tile_size2, tile_size2] - tv->split(1, tile_size2); - tv->split(0, tile_size1); - // [x/tile_size1, y/tile_size2, tile_size1, tile_size2] - tv->reorder({{1, 2}, {2, 1}}); - // [x/tile_size1 * y/tile_size2, tile_size1, tile_size2] - tv->merge(0); - tv->split(0, 1); - tv->axis(1)->parallelize(ParallelType::Unswitch); - tv->axis(0)->parallelize(ParallelType::BIDx); - } - - // Step-2, schedule input shared cache to avoid bank conflict - int64_t pos = 2; - int64_t vectorize_factor = 4, threads_per_block = 128; - // Schedule input shared cache. - // [BIDx, Unswitch, tile_size1, tile_size2] - input_cache->split(3, vectorize_factor); - // [BIDx, Unswitch, tile_size1, tile_size2/vectorize_factor, - // vectorize_factor] - input_cache->split(2, vectorize_factor); - // [BIDx, Unswitch, tile_size1/vectorize_factor, vectorize_factor, - // tile_size2/vectorize_factor, vectorize_factor] - input_cache->swizzle(SwizzleType::XOR, 2, 4); - input_cache->merge(2); - input_cache->merge(2); - input_cache->split(2, threads_per_block); - // [BIDx, Unswitch, Unroll, TIDx, Vectorize] - input_cache->setAllocationDomain(input_cache->getLoopDomain(), true); - input_cache->axis(2)->parallelize(ParallelType::Unroll); - input_cache->axis(3)->parallelize(ParallelType::TIDx); - input_cache->axis(4)->parallelize(ParallelType::Vectorize); - - // Step-3, schedule output cache - for (auto tv : {output_cache, tv1}) { - tv->reorder({{-2, -1}}); - // [..., tile2, tile1] - tv->merge(pos); - tv->split(pos, vectorize_factor); - tv->split(pos, threads_per_block); - tv->axis(2)->parallelize(ParallelType::Unroll); - tv->axis(3)->parallelize(ParallelType::TIDx); - if (tv == tv1) { - tv->axis(4)->parallelize(ParallelType::Vectorize); - } - } - inlineMost(); - KernelExecutor ke; - ke.compile(&fusion, {input0}); - auto outputs = ke.run({input0}); - testValidate(&fusion, outputs, {input0}, __LINE__, __FILE__); -} - } // namespace nvfuser From 56de94a4ce4a3719e1b75733e384013d5d51f0e0 Mon Sep 17 00:00:00 2001 From: Liqiang Lu Date: Tue, 3 Feb 2026 07:24:17 -0800 Subject: [PATCH 3/6] clean --- benchmarks/python/test_transpose.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index 79da68eee18..66f1aee1b27 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -47,15 +47,14 @@ def transpose_fusion( # contiguous() materializes a contiguous copy of the result. # When compiled with thunder, contiguous version will use nvFuser's transpose scheduler, otherwise it will use the pointwise scheduler. def transpose_fwd_fn(inputs: list): # [input1, input2, dim0, dim1, is_copy_transpose] + relu_transpose_result = torch.nn.functional.relu( + torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) + ) is_copy_transpose = inputs[4] if is_copy_transpose: - return torch.nn.functional.relu( - torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) - ).contiguous() + return relu_transpose_result.contiguous() else: - return torch.nn.functional.relu( - torch.transpose(inputs[0] + inputs[1], inputs[2], inputs[3]) - ) + return relu_transpose_result def _generate_transpose_params(): From 1ceac27fcf1058a9f156974ddd8da6966031a831 Mon Sep 17 00:00:00 2001 From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com> Date: Tue, 3 Feb 2026 10:47:45 -0500 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- benchmarks/python/test_transpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index 66f1aee1b27..6ee4c5377d0 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -35,7 +35,7 @@ def transpose_fusion( T7 = fd.ops.gt(T5, S6) T9 = fd.ops.where(T7, T5, S6) # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler. - # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enfource output to be contiguous and then transpose scheduler will be used. + # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enforce output to be contiguous and then transpose scheduler will be used. if is_copy_transpose: T10 = fd.ops.segment_set(T9) fd.add_output(T10) From bb136c3b60d34686a82f5d50a27cc32b72e3d6e5 Mon Sep 17 00:00:00 2001 From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com> Date: Tue, 3 Feb 2026 14:27:54 -0500 Subject: [PATCH 5/6] Update benchmarks/python/test_transpose.py Co-authored-by: Priya Mishra <52657555+Priya2698@users.noreply.github.com> --- benchmarks/python/test_transpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index 6ee4c5377d0..0a26d4b2dc4 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -35,7 +35,7 @@ def transpose_fusion( T7 = fd.ops.gt(T5, S6) T9 = fd.ops.where(T7, T5, S6) # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler. - # we can also expose OptimizationPassGuard to python frontend and disable presegment passes to enforce output to be contiguous and then transpose scheduler will be used. + #we can also expose OptimizationPassGuard to python frontend and disable presegmentation passes to enforce output to be contiguous and then transpose scheduler will be used. if is_copy_transpose: T10 = fd.ops.segment_set(T9) fd.add_output(T10) From e1dcbe7ead89f05db3a36df3d9f42a18fc5ea719 Mon Sep 17 00:00:00 2001 From: Liqiang Lu <116412316+liqiangxl@users.noreply.github.com> Date: Tue, 3 Feb 2026 14:48:40 -0500 Subject: [PATCH 6/6] Update benchmarks/python/test_transpose.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- benchmarks/python/test_transpose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/python/test_transpose.py b/benchmarks/python/test_transpose.py index 0a26d4b2dc4..11b66774708 100644 --- a/benchmarks/python/test_transpose.py +++ b/benchmarks/python/test_transpose.py @@ -35,7 +35,7 @@ def transpose_fusion( T7 = fd.ops.gt(T5, S6) T9 = fd.ops.where(T7, T5, S6) # add segmenter set to avoid presegment passes setting the output as a view of the input without any data movement. It leads to pointwise instead of transpose scheduler. - #we can also expose OptimizationPassGuard to python frontend and disable presegmentation passes to enforce output to be contiguous and then transpose scheduler will be used. + # we can also expose OptimizationPassGuard to python frontend and disable presegmentation passes to enforce output to be contiguous and then transpose scheduler will be used. if is_copy_transpose: T10 = fd.ops.segment_set(T9) fd.add_output(T10)