Microbenchmarking, Torch+CSV-based #478
Open

matthiasdiener wants to merge 37 commits into dev from mdiener/ci-microbench
Commits (37, all by matthiasdiener)
- `8a0ea47` initial impl
- `4270296` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `6ddb77d` put into benchmarks subfolder
- `fb2b3f3` restructure comment
- `d4e9b1e` misc updates
- `95358f4` python fix
- `a675d17` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `d0a320d` another embedded python fix
- `6f45853` replace py code
- `e5eaf10` Merge branch 'dev' into mdiener/ci-microbench
- `55e7eb5` restore disabled parts of CI
- `7072e82` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `9c771b4` add attention, casting, normalization
- `64e8da8` add timestamp and commit ID
- `c986c97` add FP8 GEMM
- `4f6dc86` fix name
- `811e329` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `bd6c3e7` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `c9d6d4d` updates casting
- `4bc11df` Merge branch 'dev' into mdiener/ci-microbench
- `de21a77` remove attention
- `1d6f869` fix grouped gemm
- `12b4218` remove CI part
- `75c8291` use adaptive_autorange, cleanups
- `2e6da68` add csv to asv converter
- `6353411` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `a1c6453` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `33a3137` Merge remote-tracking branch 'upstream/dev' into mdiener/ci-microbench
- `aa8997c` refactor
- `fefaf13` remove asv converter
- `7f2669d` cleanups, misc fixes
- `117c2d7` Merge remote-tracking branch 'origin/dev' into mdiener/ci-microbench
- `bc824d7` Merge remote-tracking branch 'upstream/dev' into mdiener/ci-microbench
- `d4e116a` simplifications, address review comments
- `372e6df` Llama 3.1
- `284adda` address reviewer comments
- `ca1f442` add readme
`README.md` (new file, 66 lines):

````markdown
# Transformer Engine Microbenchmarks

This directory contains lightweight Python microbenchmarks for selected
Transformer Engine kernels and helper scripts for comparing benchmark CSVs.

## Benchmarks

- `benchmark_gemm.py`: dense BF16 GEMM benchmark
- `benchmark_gemm_fp8.py`: dense FP8 GEMM benchmark using `fp8_autocast`
- `benchmark_grouped_gemm.py`: grouped GEMM benchmark for MoE-style shapes
- `benchmark_casting.py`: BF16 `<->` FP8 casting benchmark
- `benchmark_normalization.py`: LayerNorm and RMSNorm benchmark

Run a benchmark directly from this directory. Pass `--csv` to write results.
When no filename is provided, `run_benchmarks` derives the CSV name from the
benchmark script file name.

```bash
python benchmark_gemm.py --csv
python benchmark_grouped_gemm.py --csv grouped_results.csv
```

## Shared configuration

Common benchmark settings live in `utils.py`.

- `M_SIZE_LIST`: default token-count sweep for dense and elementwise kernels
- `DTYPE_LIST`: shared dtype sweep for TE activation benchmarks
- `MODEL_CONFIGS`: dense GEMM model shapes
- `MODEL_HIDDEN_SIZES`: hidden sizes for elementwise kernels

Grouped GEMM keeps its own smaller M sweep because its working set scales with
expert count `B` in addition to `M`.

## Adding a benchmark

Use `run_benchmarks(test_cases, bench_fn, param_columns)`.

- `test_cases` is a list of dictionaries containing benchmark inputs.
- `param_columns` lists the case fields that should appear in stdout headers
  and CSV output.
- `bench_fn(**case)` must return a list of metric records created by
  `make_metric_record(...)` or `make_forward_backward_metric_records(...)`.

Each metric record represents one benchmark line such as `GEMM Forward`. The
runner prints that line to stdout and expands it into two CSV columns:

- `<label> Time (ms)`
- `<label> <unit>`

For example, a `GEMM Forward` metric with unit `TFLOPS` becomes:

- `GEMM Forward Time (ms)`
- `GEMM Forward TFLOPS`

## Comparing results

Use `compare_results.py` to compare two CSV files from the same benchmark
family:

```bash
python compare_results.py baseline.csv candidate.csv --bench-name GEMM
```

The script auto-detects metric columns, computes speedups for overlapping rows,
and reports rows that exist only in the baseline or only in the candidate.
````
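For illustration, a new benchmark following the README's recipe might look like the sketch below. This is a hypothetical example, not a file from this PR: the elementwise add kernel and `_generate_test_cases` are invented, while `time_func`, `compute_gbps`, `make_metric_record`, and `run_benchmarks` are the helpers exposed by the PR's `utils.py`.

```python
# Hypothetical minimal benchmark in the style of this PR (not part of the PR).
import torch
from utils import time_func, compute_gbps, make_metric_record, run_benchmarks


def _generate_test_cases():
    # Invented sweep, for illustration only.
    return [{"Case": "demo/add", "M": m, "hidden_size": 4096}
            for m in (1024, 4096, 16384)]


def bench_add(Case, M, hidden_size):
    x = torch.randn(M, hidden_size, dtype=torch.bfloat16, device="cuda")
    y = torch.randn_like(x)
    ms = time_func(lambda: x + y)
    # Two BF16 reads + one BF16 write, 2 bytes per element.
    total_bytes = x.numel() * 2 * 3
    return [make_metric_record("Add", ms, "GB/s", compute_gbps(total_bytes, ms))]


if __name__ == "__main__":
    run_benchmarks(
        test_cases=_generate_test_cases(),
        bench_fn=bench_add,
        param_columns=["Case", "M", "hidden_size"],
    )
```

Under the column rules above, this sketch would produce the CSV columns `Case, M, hidden_size, Add Time (ms), Add GB/s`.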
`benchmark_casting.py` (new file, 86 lines):

```python
#!/usr/bin/env python
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
"""
FP8 casting micro-benchmark.

Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for
both E4M3 (activations/weights) and E5M2 (gradients) formats.

These casts are memory-bound; we report GB/s (input + output bytes).
Output: benchmark_casting.csv (written to cwd)
"""

import torch
import transformer_engine
import transformer_engine_torch as tex
from transformer_engine.pytorch import Float8Quantizer
from utils import (
    MODEL_HIDDEN_SIZES, M_SIZE_LIST,
    time_func, compute_gbps, make_metric_record, run_benchmarks,
)

TE_FP8_E4M3 = tex.DType.kFloat8E4M3
TE_FP8_E5M2 = tex.DType.kFloat8E5M2

CAST_LABEL = "Cast"

CAST_CONFIGS = [
    # (name, direction, fp8_dtype)
    ("BF16-to-FP8-E4M3", "quantize", TE_FP8_E4M3),
    ("FP8-E4M3-to-BF16", "dequantize", TE_FP8_E4M3),
    ("BF16-to-FP8-E5M2", "quantize", TE_FP8_E5M2),
    ("FP8-E5M2-to-BF16", "dequantize", TE_FP8_E5M2),
]


def _generate_cast_test_cases():
    test_cases = []
    for model_name, hidden in MODEL_HIDDEN_SIZES:
        for cast_name, direction, fp8_dtype in CAST_CONFIGS:
            for M in M_SIZE_LIST:
                test_cases.append({
                    "Case": f"{model_name}/{cast_name}",
                    "M": M,
                    "hidden_size": hidden,
                    "direction": direction,
                    "fp8_dtype": fp8_dtype,
                    "dtype_str": cast_name,
                })
    return test_cases


def bench_cast(Case, M, hidden_size, direction, fp8_dtype, dtype_str):
    device = "cuda"

    numel = M * hidden_size
    scale = torch.ones(1, dtype=torch.float32, device=device)
    amax = torch.zeros(1, dtype=torch.float32, device=device)
    quantizer = Float8Quantizer(scale, amax, fp8_dtype)

    if direction == "quantize":
        x = torch.randn(M, hidden_size, dtype=torch.bfloat16, device=device)
        out = quantizer(x)
        cast_func = lambda: quantizer.quantize(x, out=out)
        total_bytes = numel * (2 + 1)  # BF16 read + FP8 write
    else:
        x = torch.randn(M, hidden_size, dtype=torch.bfloat16, device=device)
        fp8_tensor = quantizer(x)
        cast_func = lambda: fp8_tensor.dequantize()
        total_bytes = numel * (1 + 2)  # FP8 read + BF16 write

    ms = time_func(cast_func, method="blocked")
    gbps = compute_gbps(total_bytes, ms)

    return [make_metric_record(CAST_LABEL, ms, "GB/s", gbps)]


if __name__ == "__main__":
    run_benchmarks(
        test_cases=_generate_cast_test_cases(),
        bench_fn=bench_cast,
        param_columns=["Case", "M", "hidden_size", "dtype_str"],
    )
```
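As a sanity check of the traffic model, the arithmetic for one quantize case works out as follows. This assumes `compute_gbps` is simply total bytes divided by elapsed time, which matches the `compute_gbps(total_bytes, ms)` call above but is not a verified definition, and the timing value is made up:

```python
# Worked bandwidth arithmetic for a hypothetical quantize case.
M, hidden = 4096, 8192
numel = M * hidden                  # 33,554,432 elements
quantize_bytes = numel * (2 + 1)    # BF16 read (2 B) + FP8 write (1 B), ~100.7 MB
ms = 0.05                           # hypothetical measured time
gbps = quantize_bytes / 1e9 / (ms / 1e3)
print(gbps)                         # ~2013 GB/s under these assumptions
```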
`benchmark_gemm.py` (new file, 63 lines):

```python
#!/usr/bin/env python
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################

import torch
import transformer_engine.pytorch as te
from utils import (
    generate_gemm_test_cases,
    time_func, compute_tflops, make_forward_backward_metric_records, run_benchmarks,
)

BENCHMARK_LABEL = "GEMM"


def bench_gemm(Case, M, N, K, dtype):
    device = "cuda"

    linear = te.Linear(K, N, bias=False).to(device=device, dtype=dtype)
    x = torch.randn(M, K, dtype=dtype, device=device, requires_grad=True)

    fwd_func = lambda: linear(x)
    out = fwd_func()
    grad_out = torch.randn_like(out)

    def fwd_bwd_func():
        out = linear(x)
        out.backward(grad_out)
        x.grad = None
        linear.weight.grad = None

    fwd_bwd_func()

    fwd_flops = 2 * M * N * K
    bwd_flops = 2 * fwd_flops  # dX + dW

    fwd_ms = time_func(fwd_func)
    fwd_bwd_ms = time_func(fwd_bwd_func)
    bwd_ms = fwd_bwd_ms - fwd_ms

    fwd_tflops = compute_tflops(fwd_flops, fwd_ms)
    bwd_tflops = compute_tflops(bwd_flops, bwd_ms)

    return make_forward_backward_metric_records(
        BENCHMARK_LABEL,
        "TFLOPS",
        fwd_ms,
        fwd_tflops,
        bwd_ms,
        bwd_tflops,
        backward_derived=True,
    )


if __name__ == "__main__":
    run_benchmarks(
        test_cases=generate_gemm_test_cases(),
        bench_fn=bench_gemm,
        param_columns=["Case", "M", "N", "K", "dtype"],
    )
```
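The FLOP model here is the standard one for a dense layer `y = x @ W.T` with `x` of shape `(M, K)` and weight of shape `(N, K)`: the forward pass is one GEMM costing `2*M*N*K` FLOPs, and the backward pass is two GEMMs of the same cost (`dx = dy @ W` and `dW = dy.T @ x`), hence `bwd_flops = 2 * fwd_flops`. Note also that the backward time is derived by subtraction (`fwd_bwd_ms - fwd_ms`) rather than timed in isolation, which appears to be what `backward_derived=True` signals. A worked example with invented numbers:

```python
# FLOP accounting for a hypothetical case; the shape and timing are made up.
M, N, K = 8192, 4096, 4096
fwd_flops = 2 * M * N * K                    # ~0.275 TFLOP per forward pass
bwd_flops = 2 * fwd_flops                    # dx GEMM + dW GEMM
fwd_ms = 0.9                                 # hypothetical measured time
tflops = fwd_flops / 1e12 / (fwd_ms / 1e3)   # ~305 TFLOPS under these assumptions
```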
`benchmark_gemm_fp8.py` (new file, 79 lines):

```python
#!/usr/bin/env python
###############################################################################
# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
###############################################################################
"""
FP8 GEMM micro-benchmark using te.Linear under fp8_autocast.

Same model shapes as benchmark_gemm.py.
Output: benchmark_gemm_fp8.csv (written to cwd)
"""

import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format
from utils import (
    generate_gemm_test_cases,
    time_func, compute_tflops, make_forward_backward_metric_records, run_benchmarks,
)

RECIPES = {
    "hybrid": DelayedScaling(
        fp8_format=Format.HYBRID,
        amax_history_len=16,
        amax_compute_algo="max",
    ),
}

FP8_RECIPE = RECIPES["hybrid"]

BENCHMARK_LABEL = "FP8 GEMM"


def bench_fp8_gemm(Case, M, N, K, dtype):
    device = "cuda"

    linear = te.Linear(K, N, bias=False).to(device=device, dtype=dtype)
    x = torch.randn(M, K, dtype=dtype, device=device, requires_grad=True)
    grad_out = torch.randn(M, N, dtype=dtype, device=device)

    def fwd_func():
        with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
            return linear(x)

    def fwd_bwd_func():
        with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
            out = linear(x)
        out.backward(grad_out)
        x.grad = None
        linear.weight.grad = None

    fwd_flops = 2 * M * N * K
    bwd_flops = 2 * fwd_flops

    fwd_ms = time_func(fwd_func)
    fwd_bwd_ms = time_func(fwd_bwd_func)
    bwd_ms = fwd_bwd_ms - fwd_ms

    fwd_tflops = compute_tflops(fwd_flops, fwd_ms)
    bwd_tflops = compute_tflops(bwd_flops, bwd_ms)

    return make_forward_backward_metric_records(
        BENCHMARK_LABEL,
        "TFLOPS",
        fwd_ms,
        fwd_tflops,
        bwd_ms,
        bwd_tflops,
        backward_derived=True,
    )


if __name__ == "__main__":
    run_benchmarks(
        test_cases=generate_gemm_test_cases(),
        bench_fn=bench_fp8_gemm,
        param_columns=["Case", "M", "N", "K", "dtype"],
    )
```
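For context on the recipe (based on the public Transformer Engine API rather than anything in this diff): `Format.HYBRID` uses the E4M3 FP8 format for forward-pass tensors and E5M2 for gradients, and `DelayedScaling` derives scaling factors from a rolling amax history, here 16 steps reduced with `max`. A minimal standalone use of the same recipe might look like:

```python
# Standalone sketch of the hybrid delayed-scaling recipe used above.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

recipe = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=16,
                        amax_compute_algo="max")
layer = te.Linear(4096, 4096, bias=False).to(device="cuda", dtype=torch.bfloat16)
x = torch.randn(64, 4096, device="cuda", dtype=torch.bfloat16, requires_grad=True)
with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
    y = layer(x)        # GEMM executes in FP8 under the hybrid recipe
y.sum().backward()      # backward runs outside the autocast region
```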
Review comment: Do we really need this to be a separate benchmark entirely, or can we combine it with `benchmark_gemm.py` and include it as e.g. a parameterization option?

Reply (matthiasdiener): I kept it separate for now because it has FP8-specific recipe/autocast setup and a separate output result.
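For what the parameterization option might look like (a hypothetical sketch of the reviewer's suggestion, not code from this PR): `te.fp8_autocast` takes an `enabled` flag, so the two benchmarks could in principle share one benchmark body with a per-case field:

```python
# Hypothetical merged benchmark body; use_fp8 would come from each test case.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

FP8_RECIPE = DelayedScaling(fp8_format=Format.HYBRID, amax_history_len=16,
                            amax_compute_algo="max")

def forward(linear, x, use_fp8):
    # enabled=False makes fp8_autocast a no-op, giving the plain BF16 path.
    with te.fp8_autocast(enabled=use_fp8, fp8_recipe=FP8_RECIPE):
        return linear(x)

linear = te.Linear(4096, 4096, bias=False).to(device="cuda", dtype=torch.bfloat16)
x = torch.randn(64, 4096, device="cuda", dtype=torch.bfloat16)
y_bf16 = forward(linear, x, use_fp8=False)
y_fp8 = forward(linear, x, use_fp8=True)
```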