diff --git a/benchmarks/microbenchmarks/README.md b/benchmarks/microbenchmarks/README.md
new file mode 100644
index 000000000..1f8ca3ee6
--- /dev/null
+++ b/benchmarks/microbenchmarks/README.md
@@ -0,0 +1,66 @@
+# Transformer Engine Microbenchmarks
+
+This directory contains lightweight Python microbenchmarks for selected
+Transformer Engine kernels and helper scripts for comparing benchmark CSVs.
+
+## Benchmarks
+
+- `benchmark_gemm.py`: dense BF16 GEMM benchmark
+- `benchmark_gemm_fp8.py`: dense FP8 GEMM benchmark using `fp8_autocast`
+- `benchmark_grouped_gemm.py`: grouped GEMM benchmark for MoE-style shapes
+- `benchmark_casting.py`: BF16 `<->` FP8 casting benchmark
+- `benchmark_normalization.py`: LayerNorm and RMSNorm benchmark
+
+Run a benchmark directly from this directory. Pass `--csv` to write results.
+When no filename is provided, `run_benchmarks` derives the CSV name from the
+benchmark script file name.
+
+```bash
+python benchmark_gemm.py --csv
+python benchmark_grouped_gemm.py --csv grouped_results.csv
+```
+
+## Shared configuration
+
+Common benchmark settings live in `utils.py`.
+
+- `M_SIZE_LIST`: default token-count sweep for dense and elementwise kernels
+- `DTYPE_LIST`: shared dtype sweep for TE activation benchmarks
+- `MODEL_CONFIGS`: dense GEMM model shapes
+- `MODEL_HIDDEN_SIZES`: hidden sizes for elementwise kernels
+
+Grouped GEMM keeps its own smaller M sweep because its working set scales with
+expert count `B` in addition to `M`.
+
+## Adding a benchmark
+
+Use `run_benchmarks(test_cases, bench_fn, param_columns)`.
+
+- `test_cases` is a list of dictionaries containing benchmark inputs.
+- `param_columns` lists the case fields that should appear in stdout headers
+  and CSV output.
+- `bench_fn(**case)` must return a list of metric records created by
+  `make_metric_record(...)` or `make_forward_backward_metric_records(...)`.
+
+Each metric record represents one benchmark line such as `GEMM Forward`. The
+runner prints that line to stdout and expands it into two CSV columns:
+
+- `
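The `run_benchmarks` contract described in the README can be illustrated with a minimal, self-contained sketch. The stand-in `run_benchmarks`, `make_metric_record`, and `bench_fn` below are hypothetical stdlib-only approximations of the structure; the real helpers live in `benchmarks/microbenchmarks/utils.py` and their exact signatures and column layout may differ.

```python
# Hypothetical stand-ins illustrating the run_benchmarks contract; the real
# helpers live in benchmarks/microbenchmarks/utils.py and may differ.
import csv
import io
import time


def make_metric_record(name, time_ms):
    # One record per benchmark line (e.g. "GEMM Forward"); the runner
    # expands each record into CSV columns.
    return {"name": name, "time_ms": time_ms}


def bench_fn(m, n, k):
    # A real benchmark would launch a TE kernel here; this stand-in just
    # times a trivial loop so the example runs anywhere.
    start = time.perf_counter()
    _ = sum(range(m * n // max(k, 1)))
    elapsed_ms = (time.perf_counter() - start) * 1e3
    return [make_metric_record("GEMM Forward", elapsed_ms)]


def run_benchmarks(test_cases, bench_fn, param_columns):
    # Minimal sketch of the runner: print a stdout header per case and
    # collect one CSV row per metric record.
    out = io.StringIO()
    writer = csv.writer(out)
    writer.writerow(param_columns + ["metric", "time_ms"])
    for case in test_cases:
        print(", ".join(f"{c}={case[c]}" for c in param_columns))
        for record in bench_fn(**case):
            writer.writerow(
                [case[c] for c in param_columns]
                + [record["name"], f"{record['time_ms']:.3f}"]
            )
    return out.getvalue()


test_cases = [
    {"m": 1024, "n": 4096, "k": 4096},
    {"m": 2048, "n": 4096, "k": 4096},
]
csv_text = run_benchmarks(test_cases, bench_fn, ["m", "n", "k"])
print(csv_text.splitlines()[0])  # -> m,n,k,metric,time_ms
```

Keeping `bench_fn` decoupled from the runner this way is what lets each benchmark script define only its shapes and kernel launch while sharing the stdout/CSV plumbing.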