diff --git a/tests/jax/test_distributed_fused_attn.py b/tests/jax/test_distributed_fused_attn.py index 50c5de1db7..39efabc598 100644 --- a/tests/jax/test_distributed_fused_attn.py +++ b/tests/jax/test_distributed_fused_attn.py @@ -75,6 +75,7 @@ def impl_test_self_attn( if not is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, QKVLayout.BS3HD, @@ -227,6 +228,7 @@ def test_cross_attn( if not is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, QKVLayout.BSHD_BS2HD, @@ -368,6 +370,7 @@ def impl_test_context_parallel_attn( def check_has_backend_for_mask(mask_type): return is_fused_attn_kernel_available( is_training, + batch, dtype, dtype, qkv_layout, diff --git a/tests/jax/test_fused_attn.py b/tests/jax/test_fused_attn.py index 1fb0108068..88c485db81 100644 --- a/tests/jax/test_fused_attn.py +++ b/tests/jax/test_fused_attn.py @@ -444,8 +444,9 @@ def _check_configs(self): "is either BSHD_BSHD_BSHD or THD_THD_THD" ) - self.backend = FusedAttnHelper( + self.backend, message = FusedAttnHelper( self.is_training, + self.batch_size, self.dtype, self.dtype, self.qkv_layout, @@ -460,9 +461,10 @@ def _check_configs(self): self.head_dim_qk, self.head_dim_v, (-1, -1) if self.window_size is None else self.window_size, + self.attn_mask_type.is_bottom_right(), ).get_fused_attn_backend() if self.backend != NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen: - pytest.skip("Unsupported inputs combination or device compute capability.") + pytest.skip(message) if ( self.attn_bias_type == AttnBiasType.POST_SCALE_BIAS diff --git a/tests/pytorch/attention/test_attention.py b/tests/pytorch/attention/test_attention.py index 32ea1694ee..681dbea2c8 100644 --- a/tests/pytorch/attention/test_attention.py +++ b/tests/pytorch/attention/test_attention.py @@ -1775,12 +1775,23 @@ def test_dpa_fp8_extra_state(model, dtype): config = model_configs_fp8_extra_state[model] # Test backend availability is_training = True + fp8_recipe = recipe.DelayedScaling( + margin=0, + fp8_format=recipe.Format.HYBRID, + amax_history_len=1, + amax_compute_algo="most_recent", + fp8_dpa=True, + ) + fp8_meta = {} + fp8_meta["recipe"] = fp8_recipe available_backends, _, fused_attn_backends = get_available_attention_backends( config, qkv_dtype=torch.float8_e4m3fn, qkv_layout="sb3hd", is_training=is_training, deterministic=_deterministic, + fp8=True, + fp8_meta=fp8_meta, ) flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends if not fused_attn_supported and not flash_attn_supported: @@ -2567,13 +2578,25 @@ def test_custom_mha_fp8_vs_f16(dtype, model): Both paths take F16 input and output. 
QKV layout is bs3hd""" config = model_configs_fp8[model] + os.environ["NVTE_UnfusedDPA_Emulate_FP8"] = "1" # Test backend availability is_training = True + fp8_meta = {} + fp8_recipe = recipe.DelayedScaling( + margin=0, + fp8_format=recipe.Format.HYBRID, + amax_history_len=1, + amax_compute_algo="most_recent", + fp8_dpa=True, + ) + fp8_meta["recipe"] = fp8_recipe available_backends, _, fused_attn_backends = get_available_attention_backends( config, qkv_dtype=torch.float8_e4m3fn, qkv_layout="bs3hd", + fp8=True, + fp8_meta=fp8_meta, is_training=is_training, deterministic=_deterministic, ) @@ -2651,6 +2674,7 @@ def _run_custom_mha_fp8(dtype, config, backend): fp8_format=recipe.Format.HYBRID, amax_history_len=1, amax_compute_algo="most_recent", + fp8_dpa=True, ) mha = Custom_MHA_FP8(config).to(dtype=dtype, device="cuda") diff --git a/tests/pytorch/utils.py b/tests/pytorch/utils.py index 2ee18aaf57..dfa0b62cfc 100644 --- a/tests/pytorch/utils.py +++ b/tests/pytorch/utils.py @@ -300,6 +300,10 @@ def __init__( self.attn_type = "self" if (self.max_seqlen_q == self.max_seqlen_kv) else "cross" self.bias_shape = bias_shape self.window_size = check_set_window_size(self.attn_mask_type, window_size) + self.bottom_right_diagonal = self.attn_mask_type in { + "causal_bottom_right", + "padding_causal_bottom_right", + } self.context_parallel = context_parallel self.cp_comm_type = cp_comm_type self.return_max_logit = return_max_logit @@ -376,6 +380,7 @@ def test(): head_dim_v=config.head_dim_v, attn_mask_type=config.attn_mask_type, window_size=config.window_size, + bottom_right_diagonal=config.bottom_right_diagonal, alibi_slopes_shape=alibi_slopes_shape, core_attention_bias_type=config.attn_bias_type, core_attention_bias_shape=core_attention_bias_shape, diff --git a/transformer_engine/common/fused_attn/fused_attn.cpp b/transformer_engine/common/fused_attn/fused_attn.cpp index d2eb1a831c..0fbd9a21ae 100644 --- a/transformer_engine/common/fused_attn/fused_attn.cpp +++ b/transformer_engine/common/fused_attn/fused_attn.cpp @@ -225,304 +225,156 @@ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout) { } } +namespace { + +// per-thread storage for the diagnostic string +// re-used (cleared + re-populated) on every call to nvte_get_fused_attn_backend_v2 on this thread +thread_local std::string fused_attn_backend_message_buffer; + +// Stash `reason` in the thread-local buffer and, if the caller asked for a diagnostic, +// publish a NUL-terminated pointer to it via `*message`. Safe to call with `message == nullptr`. 
+void set_message(const char **message, std::string reason) { + fused_attn_backend_message_buffer = std::move(reason); + if (message != nullptr) { + *message = fused_attn_backend_message_buffer.c_str(); + } +} + +} // namespace + // select a backend for fused attention -NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( - bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, - float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, - size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, - int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) { +NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend_v2(const NVTEFusedAttnConfig *cfg, + const char **message) { using namespace transformer_engine; - NVTE_Fused_Attn_Backend backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - const int device_id = cuda::current_device(); - const int sm_arch_ = cuda::sm_arch(device_id); - NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); - NVTE_QKV_Format qkv_format = nvte_get_qkv_format(qkv_layout); - NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); - NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); - NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); - auto cudnn_runtime_version = cudnnGetVersion(); + set_message(message, ""); + NVTE_CHECK(cfg != nullptr, "NVTEFusedAttnConfig pointer must not be NULL."); + NVTE_CHECK(cfg->struct_size == sizeof(NVTEFusedAttnConfig), + "NVTEFusedAttnConfig::struct_size must equal sizeof(NVTEFusedAttnConfig); " + "did you forget NVTE_FUSED_ATTN_CONFIG_INIT?"); + + cudnnHandle_t handle = cudnnExecutionPlanManager::Instance().GetHandle(); + const NVTE_QKV_Format qkv_format = nvte_get_qkv_format(cfg->qkv_layout); + const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(cfg->qkv_layout); + const auto cudnn_runtime_version = cudnnGetVersion(); - // For ragged offsets we only support 32-bit prior to cuDNN 9.5 - // Only used when THD format is requested. 
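For reference, a minimal caller-side sketch of the new probe entry point (illustrative only, not part of this patch): it assumes a BF16 self-attention config in the BS3HD layout with placeholder sizes, and the helper name probe_bf16_self_attn is hypothetical. It also shows the lifetime rule set_message establishes above: the pointer published through `message` stays valid only until the next call on the same thread, so it is copied into a std::string immediately.

#include <string>
#include <transformer_engine/fused_attn.h>

// Illustrative probe; layout, dtypes, and sizes below are placeholders.
static bool probe_bf16_self_attn(std::string *reason_out) {
  NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT;
  cfg.qkv_layout = NVTE_QKV_Layout::NVTE_BS3HD;  // packed QKV, BSHD format
  cfg.o_format = NVTE_QKV_Format::NVTE_BSHD;
  cfg.bias_type = NVTE_Bias_Type::NVTE_NO_BIAS;
  cfg.attn_mask_type = NVTE_Mask_Type::NVTE_CAUSAL_MASK;
  cfg.bottom_right_diagonal = false;  // only *_bottom_right mask types set this flag
  cfg.attn_scale = 0.125f;            // 1/sqrt(head_dim)
  cfg.qkv_dtype = NVTEDType::kNVTEBFloat16;
  cfg.o_dtype = NVTEDType::kNVTEBFloat16;
  cfg.batch_size = 2;
  cfg.num_attn_heads = 16;
  cfg.num_gqa_groups = 16;
  cfg.max_seqlen_q = 512;
  cfg.max_seqlen_kv = 512;
  cfg.head_dim_qk = 64;
  cfg.head_dim_v = 64;
  cfg.is_training = false;  // forward-only probe; bwd fields may stay at their defaults

  const char *why = nullptr;
  NVTE_Fused_Attn_Backend backend = nvte_get_fused_attn_backend_v2(&cfg, &why);
  // `why` points into a per-thread buffer owned by the library; copy it before the next call.
  if (reason_out != nullptr && why != nullptr) *reason_out = why;
  return backend != NVTE_Fused_Attn_Backend::NVTE_No_Backend;
}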
+ // THD + 64-bit ragged offsets require cuDNN >= 9.5 const bool requires_64bit_ragged_offset = - (qkv_format == NVTE_THD && fused_attn::get_ragged_offset_dtype( - layout_group, num_attn_heads, num_gqa_groups, max_seqlen_q, - max_seqlen_kv, head_dim_qk, head_dim_v) == DType::kInt64); - const bool supported_ragged_offset_size = - (!requires_64bit_ragged_offset || cudnn_runtime_version >= 90500); - - if ((q_dtype == NVTEDType::kNVTEFloat8E4M3 || q_dtype == NVTEDType::kNVTEFloat8E5M2) && - sm_arch_ >= 90 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - ( - // 9.2.1: {bshd, sbhd}, any seqlen, d=128, {no_mask, causal} - (cudnn_runtime_version >= 90201 && sm_arch_ < 100 && max_seqlen_q % 128 == 0 && - max_seqlen_kv % 128 == 0 && head_dim_qk == 128 && head_dim_v == 128 && - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) || - // 9.7: {bshd, sbhd}, any seqlen, d<=256 for sm90 and d<=128 for sm100, {padding, padding_causal} - (cudnn_runtime_version >= 90700 && - // TODO (cyang): add is_training to nvte_get_fused_attn_backend - // sm90: fwd d<=256, bwd d=128 only - // sm100: fwd d<=128, bwd d<=128 - ((sm_arch_ < 100 && (!is_training) && head_dim_qk <= 256 && head_dim_v <= 256) || - (sm_arch_ < 100 && is_training && head_dim_qk == 128 && head_dim_v == 128) || - (sm_arch_ >= 100 && head_dim_qk <= 128 && head_dim_v <= 128)) && - head_dim_qk % 16 == 0 && head_dim_v % 16 == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)) || - // 9.21: d_qk=192, d_v=128 - (cudnn_runtime_version >= 92100 && sm_arch_ >= 100 && head_dim_qk <= 192 && - head_dim_v <= 128 && head_dim_qk % 16 == 0 && head_dim_v % 16 == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK))) && - // pre-9.21: {bshd, sbhd}, {vanilla} - // 9.21+: {bshd, sbhd, bhsd}, {vanilla, off-by-one, learnable} - ((cudnn_runtime_version < 92100 && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && - softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX) || - (cudnn_runtime_version >= 92100 && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD || - qkv_format == NVTE_QKV_Format::NVTE_BHSD))) && - !requires_64bit_ragged_offset && - // 9.10.0: known bugs with SDPA FP8 - (cudnn_runtime_version != 91000) && !return_max_logit) { - backend = NVTE_Fused_Attn_Backend::NVTE_FP8; - } else if ((q_dtype == NVTEDType::kNVTEFloat16) || (q_dtype == NVTEDType::kNVTEBFloat16)) { - bool flag_arb = false; - if ( - // TODO(cyang): replace with cudnn-frontend check_support for cleaner logic and better error messaging - // architecture - ((cudnn_runtime_version < 8903 && (sm_arch_ == 80 || sm_arch_ == 90)) || - (cudnn_runtime_version >= 8903 && sm_arch_ >= 80 && sm_arch_ < 100) || - (cudnn_runtime_version >= 90700 && sm_arch_ >= 100)) && - // sequence length - ((cudnn_runtime_version < 90000 && max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0) || - (cudnn_runtime_version >= 90000)) && - // number of heads - ((cudnn_runtime_version < 8907 && num_attn_heads == num_gqa_groups) || - (cudnn_runtime_version >= 8907)) && - // head dimension - // multiples of 8 - (head_dim_qk % 8 == 0 && head_dim_v % 8 == 0 && - // <= 128 - ((head_dim_qk <= 128 && 
head_dim_v <= 128) || - // 9.1: <= 256 + Hopper + fprop - // 9.5: <= 256 + Hopper + bprop - (head_dim_qk <= 256 && head_dim_v <= 256 && - ((!is_training && sm_arch_ == 90 && cudnn_runtime_version >= 90100) || - (is_training && sm_arch_ == 90 && cudnn_runtime_version >= 90500))) || - // 9.9: any head_dim + Blackwell + fprop + non_paged + sq > 1 - (!is_training && sm_arch_ >= 100 && cudnn_runtime_version >= 90900 && max_seqlen_q > 1 && - layout_group != NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD) || - // 9.10.2: any head_dim + any arch + fprop + paged - // 9.10.2: any head_dim + any arch + fprop + non_paged + sq > 1 - // 9.10.2: any head_dim + any arch + fprop + non_paged + sq = 1 + {no_mask, padding, BRCM, padding_BRCM} - (!is_training && cudnn_runtime_version >= 91002 && - (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD || max_seqlen_q > 1 || - (max_seqlen_q == 1 && attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK))) || - // 9.11: d_qk = 192, d_v = 128 + Blackwell + bprop + non-paged - (head_dim_qk == 192 && head_dim_v == 128 && is_training && sm_arch_ >= 100 && - cudnn_runtime_version >= 91100)) && - // 9.11+ bug: 128 < d_qk <= 256, 128 < d_v <= 256 + Hopper + bprop + MLA - // Conditional to temporarily use blanket cudnn_runtime_version >= 9.11 until fixed - (!((cudnn_runtime_version >= 91100) && is_training && sm_arch_ == 90 && - head_dim_qk >= 128 && head_dim_v >= 128 && !(head_dim_qk == 192 && head_dim_v == 128) && - head_dim_qk != head_dim_v))) && - // bias type - ((cudnn_runtime_version < 8906 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS) || - (cudnn_runtime_version >= 8906 && - (bias_type == NVTE_Bias_Type::NVTE_NO_BIAS || - (bias_type == NVTE_Bias_Type::NVTE_ALIBI && - attn_mask_type != NVTE_Mask_Type::NVTE_NO_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && - attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - sm_arch_ >= 90) || - (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS && sm_arch_ >= 90))) || - (cudnn_runtime_version >= 90000 && - (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS && sm_arch_ >= 80))) && - // mask type - // pre-8.9.6: causal - ((cudnn_runtime_version < 8906 && attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK) || - // 8.9.6: {bshd, sbhd} + {no_mask, causal, padding, padding_causal} - (cudnn_runtime_version >= 8906 && - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD) && - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK)) || - // 9.1: adds thd + {padding, padding_causal} - (cudnn_runtime_version >= 90100 && qkv_format == NVTE_QKV_Format::NVTE_THD && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK)) || - // 9.3: adds {bshd, sbhd} + causal_bottom_right + self/cross-attn (sq <= skv) - (cudnn_runtime_version >= 90300 && - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD) && - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.5: adds {paged_kv_bshd, paged_kv_sbhd} + {padding, 
padding_causal, padding_causal_bottom_right} - (cudnn_runtime_version >= 90500 && - layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv)) && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.6: adds {bshd, sbhd, thd} + padding_causal_bottom_right + self/cross-attn (sq <= skv) - (cudnn_runtime_version >= 90600 && - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q % 64 == 0 && max_seqlen_kv % 64 == 0 && max_seqlen_q <= max_seqlen_kv && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - // 9.7: removes s_q/s_kv % 64 = 0 for {causal_bottom_right, padding_causal_bottom_right} - // for any q_format/kv_format, and paged/non-paged - (cudnn_runtime_version >= 90700 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - ((attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && dropout == 0.0) || - ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) && - max_seqlen_q <= max_seqlen_kv)))) && - // bias + mask combination - (!(cudnn_runtime_version >= 8906 && - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) && - bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS)) && - // qkv format - (qkv_format == NVTE_QKV_Format::NVTE_SBHD || qkv_format == NVTE_QKV_Format::NVTE_BSHD || - qkv_format == NVTE_QKV_Format::NVTE_BHSD || - (qkv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90 && - ((cudnn_runtime_version >= 90100 && num_attn_heads == num_gqa_groups) || - cudnn_runtime_version >= 90600)) || - ((q_format == NVTE_QKV_Format::NVTE_SBHD || q_format == NVTE_QKV_Format::NVTE_BSHD || - q_format == NVTE_QKV_Format::NVTE_BHSD || - (q_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90) || - kv_format == NVTE_QKV_Format::NVTE_SBHD || kv_format == NVTE_QKV_Format::NVTE_BSHD || - kv_format == NVTE_QKV_Format::NVTE_BHSD || - (kv_format == NVTE_QKV_Format::NVTE_THD && sm_arch_ >= 90)) && - cudnn_runtime_version >= 90700)) && - // sliding window - // pre-9.2: full attn, causal - ((cudnn_runtime_version < 90200 && window_size_left == -1 && - (window_size_right == -1 || window_size_right == 0)) || - // 9.2: SWA (left, 0) + top-left diagonal + {bshd, sbhd} - (cudnn_runtime_version >= 90200 && - ((window_size_left == -1 && window_size_right == -1 && - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK) || - ((window_size_left == -1 || window_size_left >= 0) && window_size_right == 0 && - (attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - max_seqlen_q == max_seqlen_kv)) && - max_seqlen_q <= max_seqlen_kv && dropout == 0.0 && - bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - (qkv_format == NVTE_QKV_Format::NVTE_BSHD || - qkv_format == NVTE_QKV_Format::NVTE_SBHD)))) || - // 9.6: SWA (left, 0) + top-left/bottom-right diagonal + 
{bshd, sbhd, thd} - (cudnn_runtime_version >= 90600 && - ((window_size_left == -1 && (window_size_right == -1 || window_size_right == 0)) || - ((window_size_left >= 0 || window_size_left == -1) && - (window_size_right >= 0 || window_size_right == -1) && - ((attn_mask_type == NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK && - // TODO(cyang): fix bug for BRCM + cross-attention on sm100 - (sm_arch_ < 100 || (sm_arch_ >= 100 && ((max_seqlen_q == max_seqlen_kv && - cudnn_runtime_version <= 90700) || - cudnn_runtime_version > 90700)))) || - attn_mask_type == NVTE_Mask_Type::NVTE_NO_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_MASK || - attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK || - (attn_mask_type == NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK && - (sm_arch_ < 100 || (sm_arch_ >= 100 && ((max_seqlen_q == max_seqlen_kv && - cudnn_runtime_version <= 90700) || - cudnn_runtime_version > 90700))))) && - max_seqlen_q <= max_seqlen_kv && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS && - dropout == 0.0)))) && - // check 64-bit ragged offset support - (supported_ragged_offset_size) && - // 9.10.0/9.10.1: known bugs with SDPA F16 - (cudnn_runtime_version != 91000) && (cudnn_runtime_version != 91001) && - // softmax type - // pre-9.13.1: vanilla - // 9.13.1+: vanilla, off-by-one, learnable - (cudnn_runtime_version >= 91301 || - (cudnn_runtime_version < 91301 && - softmax_type == NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX)) && - // determinism on Blackwell - // pre-9.18.1: fwd: deterministic; bwd: non-deterministic - // 9.18.1+: fwd: deterministic; bwd: non-deterministic/deterministic - (sm_arch_ < 100 || - (sm_arch_ >= 100 && (!is_training || - (is_training && !deterministic && - (dropout == 0.0 || bias_type == NVTE_Bias_Type::NVTE_NO_BIAS)) || - (is_training && deterministic && cudnn_runtime_version >= 91801 && - dropout == 0.0 && bias_type == NVTE_Bias_Type::NVTE_NO_BIAS))))) { - flag_arb = true; + (qkv_format == NVTE_THD && + fused_attn::get_ragged_offset_dtype(layout_group, cfg->num_attn_heads, cfg->num_gqa_groups, + cfg->max_seqlen_q, cfg->max_seqlen_kv, cfg->head_dim_qk, + cfg->head_dim_v) == DType::kInt64); + if (requires_64bit_ragged_offset && cudnn_runtime_version < 90500) { + set_message(message, + "Configuration requires 64-bit ragged offsets, which require " + "cuDNN >= 9.5."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + + // THD requires padding-style mask + if (qkv_format == NVTE_QKV_Format::NVTE_THD && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { + set_message(message, + "THD format requires PADDING / PADDING_CAUSAL / PADDING_CAUSAL_BOTTOM_RIGHT mask."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + + const bool is_fp8 = (cfg->qkv_dtype == NVTEDType::kNVTEFloat8E4M3 || + cfg->qkv_dtype == NVTEDType::kNVTEFloat8E5M2); + const bool is_f16_or_bf16 = + (cfg->qkv_dtype == NVTEDType::kNVTEFloat16 || cfg->qkv_dtype == NVTEDType::kNVTEBFloat16); + + if (is_fp8) { + if (cfg->return_max_logit) { + set_message(message, "FP8 fused attention does not support return_max_logit=True."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if (flag_arb) { - backend = NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; + if (qkv_format != NVTE_QKV_Format::NVTE_BSHD && qkv_format != NVTE_QKV_Format::NVTE_SBHD && + qkv_format != NVTE_QKV_Format::NVTE_BHSD) { + set_message(message, "FP8 
fused attention supports BSHD/SBHD/BHSD formats, found " + + std::to_string(static_cast(qkv_format)) + "."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if (cudnn_runtime_version < 8900 && - backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: FP16/BF16 fused attention is supported by cuDNN 8.9.0+." - " Please upgrade your cuDNN version if possible." - << std::endl; + std::string fwd_reason = is_supported_fp8_fwd(cfg, handle); + if (!fwd_reason.empty()) { + set_message(message, std::move(fwd_reason)); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if ((cudnn_runtime_version == 91400) && (max_seqlen_kv > 1024) && (window_size_left != -1) && - (attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_CAUSAL_BOTTOM_RIGHT_MASK)) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of attention mask (non-causal) and " - "max_seqlen_kv (> 1024) does not support fused attention for cuDNN 9.14.0. " - " Please upgrade your cuDNN version if possible." - << std::endl; + if (cfg->is_training) { + std::string bwd_reason = is_supported_fp8_bwd(cfg, handle); + if (!bwd_reason.empty()) { + set_message(message, std::move(bwd_reason)); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } } - if ((cudnn_runtime_version <= 91500) && is_training && + return NVTE_Fused_Attn_Backend::NVTE_FP8; + } + + if (is_f16_or_bf16) { + if (cudnn_runtime_version <= 91500 && cfg->is_training && (qkv_format == NVTE_QKV_Format::NVTE_BSHD || qkv_format == NVTE_QKV_Format::NVTE_SBHD) && - (max_seqlen_kv % 128 != 0) && cuda_graph && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK) && - (attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK)) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of attention mask (non-padding)," - " max_seqlen_kv (not divisible by 128), and qkv_format (BSHD/SBHD) for" - " backward fused attention with graph capture requires cuDNN 9.15.1+. " - "Please upgrade your cuDNN version if possible." - << std::endl; + (cfg->max_seqlen_kv % 128 != 0) && cfg->cuda_graph && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_MASK && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_MASK && + cfg->attn_mask_type != NVTE_Mask_Type::NVTE_PADDING_CAUSAL_BOTTOM_RIGHT_MASK) { + set_message(message, "Known cuDNN <= 9.15 issue with CUDA graph. Please upgrade cuDNN."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; + } + std::string fwd_reason = is_supported_f16_fwd(cfg, handle); + if (!fwd_reason.empty()) { + set_message(message, std::move(fwd_reason)); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } - if (backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen && sm_arch_ == 120) { - if (cudnn_runtime_version < 91801) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of sm_arch_ == 120 and cudnn_runtime_version < " - "91801 is not supported. " - << " Please upgrade your cuDNN version if possible." << std::endl; - } else if (deterministic && is_training) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Deterministic fused attention on SM120 is not supported." 
- << std::endl; - } else { - // Known missing support for T3HD/TH3D layouts on SM120 - const bool is_t3hd_or_th3d = - (qkv_layout == NVTE_QKV_Layout::NVTE_T3HD || qkv_layout == NVTE_QKV_Layout::NVTE_TH3D); - if (is_t3hd_or_th3d) { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; - std::cout << "Warning: Given combination of T3HD/TH3D layouts on SM120 is not supported. " - << " Please consider using other THD layouts if possible." << std::endl; - } + if (cfg->is_training) { + std::string bwd_reason = is_supported_f16_bwd(cfg, handle); + if (!bwd_reason.empty()) { + set_message(message, std::move(bwd_reason)); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; } } - } else { - backend = NVTE_Fused_Attn_Backend::NVTE_No_Backend; + return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen; } - return backend; + + set_message(message, "Unsupported QKV dtype qkv_dtype=" + std::to_string(cfg->qkv_dtype) + " ."); + return NVTE_Fused_Attn_Backend::NVTE_No_Backend; +} + +// Deprecated: thin wrapper preserving the historical narrow signature. New callers should +// construct an NVTEFusedAttnConfig and call nvte_get_fused_attn_backend_v2 directly to access +// the additional fields (attn_scale, format/layout fields, scaling_mode, paged-KV/bias shape, +// dO/dQKV dtypes, etc.) that this wrapper cannot express. +NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend( + bool is_training, NVTEDType q_dtype, NVTEDType kv_dtype, NVTE_QKV_Layout qkv_layout, + NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type, + float dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q, + size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left, + int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) { + (void)is_training; + NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT; + cfg.qkv_layout = qkv_layout; + cfg.bias_type = bias_type; + cfg.attn_mask_type = attn_mask_type; + cfg.softmax_type = softmax_type; + cfg.attn_scale = 1.0f; // legacy default; matches the value pre-PR probes hardcoded + cfg.dropout = dropout; + cfg.max_seqlen_q = max_seqlen_q; + cfg.max_seqlen_kv = max_seqlen_kv; + cfg.window_size_left = window_size_left; + cfg.window_size_right = window_size_right; + cfg.cuda_graph = cuda_graph; + NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type."); + cfg.qkv_dtype = q_dtype; + cfg.o_dtype = q_dtype; // legacy: O dtype matches Q dtype + cfg.batch_size = 1; // legacy: pre-PR probes assumed batch=1 + cfg.num_attn_heads = num_attn_heads; + cfg.num_gqa_groups = num_gqa_groups; + cfg.head_dim_qk = head_dim_qk; + cfg.head_dim_v = head_dim_v; + cfg.is_training = false; // legacy wrapper cannot express dO/dQKV dtypes; skip bwd probe + cfg.return_max_logit = return_max_logit; + cfg.deterministic = deterministic; + return nvte_get_fused_attn_backend_v2(&cfg, /*message=*/nullptr); } // NVTE fused attention FWD with separate Q, K and V @@ -607,11 +459,56 @@ void nvte_fused_attn_fwd(const NVTETensor Q, const NVTETensor K, const NVTETenso auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); const NVTEDType Q_type = static_cast(input_Q->data.dtype); const NVTEDType KV_type = static_cast(input_K->data.dtype); + NVTE_CHECK(Q_type == KV_type, "Q and KV must have the same data type."); + const NVTEDType O_type = static_cast(output_O->data.dtype); + const NVTEScalingMode scaling_mode = input_Q->scaling_mode; + + size_t bias_b = 0, bias_h = 0, bias_sq = 0, bias_skv = 0; + if 
(input_Bias->data.dptr != nullptr && input_Bias->data.shape.size() >= 4) { + bias_b = input_Bias->data.shape[0]; + bias_h = input_Bias->data.shape[1]; + bias_sq = input_Bias->data.shape[2]; + bias_skv = input_Bias->data.shape[3]; + } - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - is_training, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, - h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, - return_max_logit, cuda_graph, false); + NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT; + cfg.qkv_layout = qkv_layout; + cfg.o_format = o_format; + cfg.qkv_scale_inv_format = qkv_scale_inv_format; + cfg.bias_type = bias_type; + cfg.attn_mask_type = attn_mask_type; + cfg.softmax_type = softmax_type; + cfg.scaling_mode = scaling_mode; + cfg.attn_scale = attn_scale; + cfg.dropout = dropout; + cfg.max_seqlen_q = max_seqlen_q; + cfg.max_seqlen_kv = max_seqlen_kv; + cfg.window_size_left = window_size_left; + cfg.window_size_right = window_size_right; + cfg.bottom_right_diagonal = bottom_right_diagonal; + cfg.cuda_graph = cuda_graph; + cfg.qkv_dtype = Q_type; + cfg.o_dtype = O_type; + cfg.batch_size = b; + cfg.num_attn_heads = h_q; + cfg.num_gqa_groups = h_kv; + cfg.head_dim_qk = d_qk; + cfg.head_dim_v = d_v; + cfg.num_pages_k = static_cast(num_pages_k); + cfg.num_pages_v = static_cast(num_pages_v); + cfg.page_size_k = static_cast(page_size_k); + cfg.page_size_v = static_cast(page_size_v); + cfg.max_pages_per_seq_k = static_cast(max_pages_per_seq_k); + cfg.max_pages_per_seq_v = static_cast(max_pages_per_seq_v); + cfg.bias_batch_size = bias_b; + cfg.bias_num_heads = bias_h; + cfg.bias_seqlen_q = bias_sq; + cfg.bias_seqlen_kv = bias_skv; + cfg.is_training = false; + cfg.return_max_logit = return_max_logit; + cfg.deterministic = false; + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend_v2(&cfg, /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { fused_attn_arbitrary_seqlen_fwd( @@ -688,11 +585,45 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso auto handle = cudnnExecutionPlanManager::Instance().GetHandle(); const NVTEDType Q_type = static_cast(input_Q->data.dtype); const NVTEDType KV_type = static_cast(input_K->data.dtype); + NVTE_CHECK(Q_type == KV_type, "Q and KV must have the same data type."); + const NVTEDType O_type = static_cast(input_O->data.dtype); + const NVTEDType dO_type = static_cast(input_dO->data.dtype); + const NVTEDType dQKV_type = static_cast(output_dQ->data.dtype); + const NVTEScalingMode scaling_mode = input_Q->scaling_mode; - NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend( - true, Q_type, KV_type, qkv_layout, bias_type, attn_mask_type, softmax_type, dropout, h_q, - h_kv, max_seqlen_q, max_seqlen_kv, d_qk, d_v, window_size_left, window_size_right, false, - cuda_graph, deterministic); + NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT; + cfg.qkv_layout = qkv_layout; + cfg.o_format = o_format; + cfg.do_format = do_format; + cfg.dqkv_layout = dqkv_layout; + cfg.qkv_scale_inv_format = qkv_scale_inv_format; + cfg.do_scale_inv_format = do_scale_inv_format; + cfg.bias_type = bias_type; + cfg.attn_mask_type = attn_mask_type; + cfg.softmax_type = softmax_type; + cfg.scaling_mode = scaling_mode; + cfg.attn_scale = attn_scale; + cfg.dropout = dropout; + cfg.max_seqlen_q = max_seqlen_q; + cfg.max_seqlen_kv = max_seqlen_kv; + 
cfg.window_size_left = window_size_left; + cfg.window_size_right = window_size_right; + cfg.bottom_right_diagonal = bottom_right_diagonal; + cfg.cuda_graph = cuda_graph; + cfg.qkv_dtype = Q_type; + cfg.o_dtype = O_type; + cfg.do_dtype = dO_type; + cfg.dqkv_dtype = dQKV_type; + cfg.batch_size = b; + cfg.num_attn_heads = h_q; + cfg.num_gqa_groups = h_kv; + cfg.head_dim_qk = d_qk; + cfg.head_dim_v = d_v; + cfg.is_training = true; + cfg.return_max_logit = false; + cfg.deterministic = deterministic; + NVTE_Fused_Attn_Backend fused_attention_backend = + nvte_get_fused_attn_backend_v2(&cfg, /*message=*/nullptr); if (fused_attention_backend == NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) { size_t i = 0; diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu index 6df7ad35c8..9cdee256ed 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.cu @@ -1333,4 +1333,138 @@ void fused_attn_arbitrary_seqlen_bwd( NVTE_ERROR("Unexpected workspace_size."); } } + +std::string is_supported_f16_fwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle) { + const size_t num_gqa_groups = cfg->num_gqa_groups; + const size_t head_dim_qk = cfg->head_dim_qk; + const size_t head_dim_v = cfg->head_dim_v; + const bool is_training = cfg->is_training; + const bool return_max_logit = cfg->return_max_logit; + const float attn_scale = cfg->attn_scale; + const float p_dropout = cfg->dropout; + const NVTE_QKV_Layout qkv_layout = cfg->qkv_layout; + const NVTE_QKV_Format o_format = cfg->o_format; + const NVTE_Bias_Type bias_type = cfg->bias_type; + const NVTE_Mask_Type mask_type = cfg->attn_mask_type; + const NVTE_Softmax_Type softmax_type = cfg->softmax_type; + const int64_t window_size_left = cfg->window_size_left; + const int64_t window_size_right = cfg->window_size_right; + const bool bottom_right_diagonal = cfg->bottom_right_diagonal; + const DType qkv_dtype = static_cast(cfg->qkv_dtype); + const auto b = static_cast(cfg->batch_size); + const auto h = static_cast(cfg->num_attn_heads); + const auto sq = static_cast(cfg->max_seqlen_q); + const auto skv = static_cast(cfg->max_seqlen_kv); + + const NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); + const NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); + const bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD); + const bool is_ragged_kv = (kv_format == NVTE_QKV_Format::NVTE_THD); + const NVTE_QKV_Layout_Group layout_group = nvte_get_qkv_layout_group(qkv_layout); + const bool is_paged_kv = (layout_group == NVTE_QKV_Layout_Group::NVTE_Paged_KV_HD_HD_HD); + const bool has_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + const int64_t max_b = (is_ragged_q || is_ragged_kv) ? b : 0; + const int64_t max_t_q = is_ragged_q ? b * sq : 0; + const int64_t max_t_kv = is_ragged_kv ? b * skv : 0; + const int64_t num_pages_k = is_paged_kv ? b : 0; + const int64_t num_pages_v = is_paged_kv ? b : 0; + const int64_t page_size_k = is_paged_kv ? skv : 0; + const int64_t page_size_v = is_paged_kv ? skv : 0; + const int64_t max_pages_per_seq_k = is_paged_kv ? 1 : 0; + const int64_t max_pages_per_seq_v = is_paged_kv ? 1 : 0; + const int64_t bias_b = has_bias ? b : 0; + const int64_t bias_h = has_bias ? h : 0; + const int64_t bias_sq = has_bias ? sq : 0; + const int64_t bias_skv = has_bias ? 
skv : 0; + + size_t workspace_size = 0; + try { + fused_attn::fused_attn_arbitrary_seqlen_fwd_impl( + b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), + static_cast(head_dim_v), max_b, max_t_q, max_t_kv, num_pages_k, num_pages_v, + page_size_k, page_size_v, max_pages_per_seq_k, max_pages_per_seq_v, bias_b, bias_h, bias_sq, + bias_skv, is_training, return_max_logit, attn_scale, p_dropout, qkv_layout, o_format, + bias_type, mask_type, softmax_type, window_size_left, window_size_right, + bottom_right_diagonal, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrBias=*/nullptr, + /*devPtrSoftmaxOffset=*/nullptr, /*devPtrS1=*/nullptr, /*devPtrS2=*/nullptr, + /*devPtrO=*/nullptr, /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, + /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, + /*devPtrPageTableK=*/nullptr, /*devPtrPageTableV=*/nullptr, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(qkv_dtype), + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return ""; + } catch (const std::exception &e) { + return e.what(); + } catch (...) { + return "is_supported_f16_fwd: unknown failure."; + } +} + +std::string is_supported_f16_bwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle) { + const size_t num_gqa_groups = cfg->num_gqa_groups; + const size_t head_dim_qk = cfg->head_dim_qk; + const size_t head_dim_v = cfg->head_dim_v; + const float attn_scale = cfg->attn_scale; + const float p_dropout = cfg->dropout; + const NVTE_QKV_Layout qkv_layout = cfg->qkv_layout; + const NVTE_QKV_Format o_format = cfg->o_format; + const NVTE_QKV_Format do_format = cfg->do_format; + const NVTE_QKV_Layout dqkv_layout = cfg->dqkv_layout; + const NVTE_Bias_Type bias_type = cfg->bias_type; + const NVTE_Mask_Type mask_type = cfg->attn_mask_type; + const NVTE_Softmax_Type softmax_type = cfg->softmax_type; + const int64_t window_size_left = cfg->window_size_left; + const int64_t window_size_right = cfg->window_size_right; + const bool bottom_right_diagonal = cfg->bottom_right_diagonal; + const bool deterministic = cfg->deterministic; + const DType qkv_dtype = static_cast(cfg->qkv_dtype); + const auto b = static_cast(cfg->batch_size); + const auto h = static_cast(cfg->num_attn_heads); + const auto sq = static_cast(cfg->max_seqlen_q); + const auto skv = static_cast(cfg->max_seqlen_kv); + + const NVTE_QKV_Format q_format = nvte_get_q_format(qkv_layout); + const NVTE_QKV_Format kv_format = nvte_get_kv_format(qkv_layout); + const bool is_ragged_q = (q_format == NVTE_QKV_Format::NVTE_THD); + const bool is_ragged_kv = (kv_format == NVTE_QKV_Format::NVTE_THD); + const bool has_bias = (bias_type == NVTE_Bias_Type::NVTE_POST_SCALE_BIAS); + + const int64_t max_b = (is_ragged_q || is_ragged_kv) ? b : 0; + const int64_t max_t_q = is_ragged_q ? b * sq : 0; + const int64_t max_t_kv = is_ragged_kv ? b * skv : 0; + const int64_t bias_b = has_bias ? b : 0; + const int64_t bias_h = has_bias ? h : 0; + const int64_t bias_sq = has_bias ? sq : 0; + const int64_t bias_skv = has_bias ? 
skv : 0; + + size_t workspace_size = 0; + try { + fused_attn::fused_attn_arbitrary_seqlen_bwd_impl( + b, h, static_cast(num_gqa_groups), sq, skv, static_cast(head_dim_qk), + static_cast(head_dim_v), max_b, max_t_q, max_t_kv, bias_b, bias_h, bias_sq, + bias_skv, attn_scale, p_dropout, qkv_layout, o_format, do_format, dqkv_layout, bias_type, + mask_type, softmax_type, window_size_left, window_size_right, bottom_right_diagonal, + deterministic, /*devPtrQ=*/nullptr, /*devPtrKTranspose=*/nullptr, + /*devPtrVTranspose=*/nullptr, /*devPtrO=*/nullptr, /*devPtrSoftmaxStats=*/nullptr, + /*devPtrBias=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, /*devPtrdQ=*/nullptr, + /*devPtrdK=*/nullptr, /*devPtrdV=*/nullptr, /*devPtrdO=*/nullptr, + /*devPtrdBias=*/nullptr, /*devPtrdSoftmaxOffset=*/nullptr, + /*devPtrDropoutSeed=*/nullptr, /*devPtrDropoutOffset=*/nullptr, + /*devPtrCuSeqlensQ=*/nullptr, /*devPtrCuSeqlensKV=*/nullptr, + /*devPtrSeqOffsetsQ=*/nullptr, /*devPtrSeqOffsetsKV=*/nullptr, + get_cudnn_fe_dtype(qkv_dtype), + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return ""; + } catch (const std::exception &e) { + return e.what(); + } catch (...) { + return "is_supported_f16_bwd: unknown failure."; + } +} + } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h index 8f79b5bb4a..5d27e82278 100644 --- a/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h +++ b/transformer_engine/common/fused_attn/fused_attn_f16_arbitrary_seqlen.h @@ -13,6 +13,8 @@ #include +#include + #include "common/common.h" #include "transformer_engine/fused_attn.h" @@ -47,6 +49,16 @@ void fused_attn_arbitrary_seqlen_bwd( const Tensor *cu_seqlens_kv_padded, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); +// check if a given configuration is supported for F16/BF16 forward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. +std::string is_supported_f16_fwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle); + +// check if a given configuration is supported for F16/BF16 backward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. 
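The expected consumer of these two probes is the dispatcher in fused_attn.cpp; a compressed sketch of that contract follows (not additional patch code; cfg, handle, message, and set_message are the names used in that file): the backward probe runs only for training configurations, and the first non-empty reason is what nvte_get_fused_attn_backend_v2 reports through its message out-parameter.

// Sketch of the dispatch contract (mirrors nvte_get_fused_attn_backend_v2 in fused_attn.cpp).
std::string reason = is_supported_f16_fwd(cfg, handle);
if (reason.empty() && cfg->is_training) {
  reason = is_supported_f16_bwd(cfg, handle);  // backward graph is only probed when training
}
if (!reason.empty()) {
  set_message(message, std::move(reason));     // surfaced via the `message` out-parameter
  return NVTE_Fused_Attn_Backend::NVTE_No_Backend;
}
return NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen;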
+std::string is_supported_f16_bwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle); + } // namespace transformer_engine #endif // TRANSFORMER_ENGINE_COMMON_FUSED_ATTN_FUSED_ATTN_ARBITRARY_SEQLEN_H_ diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.cu b/transformer_engine/common/fused_attn/fused_attn_fp8.cu index eab1ae02e6..180bee2ab0 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.cu +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.cu @@ -1324,4 +1324,123 @@ void fused_attn_fp8_bwd( return; } } + +std::string is_supported_fp8_fwd(const NVTEFusedAttnConfig* cfg, cudnnHandle_t handle) { + const size_t batch = cfg->batch_size; + const size_t num_attn_heads = cfg->num_attn_heads; + const size_t num_gqa_groups = cfg->num_gqa_groups; + const size_t max_seqlen_q = cfg->max_seqlen_q; + const size_t max_seqlen_kv = cfg->max_seqlen_kv; + const size_t head_dim_qk = cfg->head_dim_qk; + const size_t head_dim_v = cfg->head_dim_v; + const bool is_training = cfg->is_training; + const float attn_scale = cfg->attn_scale; + const float p_dropout = cfg->dropout; + const NVTE_QKV_Layout qkv_layout = cfg->qkv_layout; + const NVTE_QKV_Format o_format = cfg->o_format; + const NVTE_QKV_Format qkv_scale_inv_format = cfg->qkv_scale_inv_format; + const NVTE_Bias_Type bias_type = cfg->bias_type; + const NVTE_Mask_Type mask_type = cfg->attn_mask_type; + const NVTE_Softmax_Type softmax_type = cfg->softmax_type; + const int64_t window_size_left = cfg->window_size_left; + const int64_t window_size_right = cfg->window_size_right; + const bool bottom_right_diagonal = cfg->bottom_right_diagonal; + const DType qkv_dtype = static_cast(cfg->qkv_dtype); + const DType o_dtype = static_cast(cfg->o_dtype); + const NVTEScalingMode scaling_mode = cfg->scaling_mode; + + size_t workspace_size = 0; + try { + fused_attn::fused_attn_fp8_fwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), is_training, attn_scale, p_dropout, qkv_layout, o_format, + bias_type, mask_type, softmax_type, window_size_left, window_size_right, + bottom_right_diagonal, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, + /*devPtrSoftmaxOffset=*/nullptr, /*devPtrM=*/nullptr, /*devPtrO=*/nullptr, + /*devPtrDescaleQ=*/nullptr, /*devPtrDescaleK=*/nullptr, /*devPtrDescaleV=*/nullptr, + /*devPtrDescaleS=*/nullptr, /*devPtrScaleS=*/nullptr, /*devPtrScaleO=*/nullptr, + /*devPtrAmaxO=*/nullptr, /*devPtrAmaxS=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, + /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, + /*devPtrDropoutOffset=*/nullptr, get_cudnn_fe_dtype(qkv_dtype), get_cudnn_fe_dtype(o_dtype), + scaling_mode, qkv_scale_inv_format, + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + return ""; + } catch (const std::exception& e) { + return e.what(); + } catch (...) 
{ + return "is_supported_fp8_fwd: unknown failure."; + } +} + +std::string is_supported_fp8_bwd(const NVTEFusedAttnConfig* cfg, cudnnHandle_t handle) { + const size_t batch = cfg->batch_size; + const size_t num_attn_heads = cfg->num_attn_heads; + const size_t num_gqa_groups = cfg->num_gqa_groups; + const size_t max_seqlen_q = cfg->max_seqlen_q; + const size_t max_seqlen_kv = cfg->max_seqlen_kv; + const size_t head_dim_qk = cfg->head_dim_qk; + const size_t head_dim_v = cfg->head_dim_v; + const float attn_scale = cfg->attn_scale; + const float p_dropout = cfg->dropout; + const NVTE_QKV_Layout qkv_layout = cfg->qkv_layout; + const NVTE_QKV_Format o_format = cfg->o_format; + const NVTE_QKV_Format do_format = cfg->do_format; + const NVTE_QKV_Layout dqkv_layout = cfg->dqkv_layout; + const NVTE_QKV_Format qkv_scale_inv_format = cfg->qkv_scale_inv_format; + const NVTE_QKV_Format do_scale_inv_format = cfg->do_scale_inv_format; + const NVTE_Bias_Type bias_type = cfg->bias_type; + const NVTE_Mask_Type mask_type = cfg->attn_mask_type; + const NVTE_Softmax_Type softmax_type = cfg->softmax_type; + const int64_t window_size_left = cfg->window_size_left; + const int64_t window_size_right = cfg->window_size_right; + const bool bottom_right_diagonal = cfg->bottom_right_diagonal; + const bool deterministic = cfg->deterministic; + const DType qkv_dtype = static_cast(cfg->qkv_dtype); + const DType o_dtype = static_cast(cfg->o_dtype); + const DType do_dtype = static_cast(cfg->do_dtype); + const DType dqkv_dtype = static_cast(cfg->dqkv_dtype); + const NVTEScalingMode scaling_mode = cfg->scaling_mode; + + const cudnn_frontend::DataType_t qkv_t = get_cudnn_fe_dtype(qkv_dtype); + const cudnn_frontend::DataType_t o_t = get_cudnn_fe_dtype(o_dtype); + const cudnn_frontend::DataType_t do_t = get_cudnn_fe_dtype(do_dtype); + const cudnn_frontend::DataType_t dqkv_t = get_cudnn_fe_dtype(dqkv_dtype); + size_t workspace_size = 0; + try { + fused_attn::fused_attn_fp8_bwd_impl( + static_cast(batch), static_cast(num_attn_heads), + static_cast(num_gqa_groups), static_cast(max_seqlen_q), + static_cast(max_seqlen_kv), static_cast(head_dim_qk), + static_cast(head_dim_v), attn_scale, p_dropout, qkv_layout, o_format, do_format, + dqkv_layout, bias_type, mask_type, softmax_type, window_size_left, window_size_right, + bottom_right_diagonal, deterministic, + /*devPtrQ=*/nullptr, /*devPtrK=*/nullptr, /*devPtrV=*/nullptr, /*devPtrM=*/nullptr, + /*devPtrO=*/nullptr, /*devPtrdO=*/nullptr, /*devPtrSoftmaxOffset=*/nullptr, + /*devPtrdQ=*/nullptr, /*devPtrdK=*/nullptr, /*devPtrdV=*/nullptr, + /*devPtrdSoftmaxOffset=*/nullptr, /*devPtrDescaleQ=*/nullptr, + /*devPtrDescaleK=*/nullptr, /*devPtrDescaleV=*/nullptr, /*devPtrDescaleO=*/nullptr, + /*devPtrDescaledO=*/nullptr, /*devPtrDescaleS=*/nullptr, /*devPtrDescaledP=*/nullptr, + /*devPtrScaleS=*/nullptr, /*devPtrScaledP=*/nullptr, /*devPtrScaledQ=*/nullptr, + /*devPtrScaledK=*/nullptr, /*devPtrScaledV=*/nullptr, /*devPtrAmaxdP=*/nullptr, + /*devPtrAmaxdQ=*/nullptr, /*devPtrAmaxdK=*/nullptr, /*devPtrAmaxdV=*/nullptr, + /*devPtrQ_t=*/nullptr, /*devPtrK_t=*/nullptr, /*devPtrdO_f16=*/nullptr, + /*devPtrdO_t=*/nullptr, /*devPtrDescaleQ_t=*/nullptr, /*devPtrDescaleK_t=*/nullptr, + /*devPtrDescaledO_t=*/nullptr, /*devPtrcuSeqlensQ=*/nullptr, + /*devPtrcuSeqlensKV=*/nullptr, /*devPtrDropoutSeed=*/nullptr, + /*devPtrDropoutOffset=*/nullptr, qkv_t, o_t, do_t, dqkv_t, scaling_mode, + qkv_scale_inv_format, do_scale_inv_format, + /*workspace=*/nullptr, &workspace_size, + /*stream=*/static_cast(0), handle); + 
return ""; + } catch (const std::exception& e) { + return e.what(); + } catch (...) { + return "is_supported_fp8_bwd: unknown failure."; + } +} + } // namespace transformer_engine diff --git a/transformer_engine/common/fused_attn/fused_attn_fp8.h b/transformer_engine/common/fused_attn/fused_attn_fp8.h index b9660128ca..fc60987cf3 100644 --- a/transformer_engine/common/fused_attn/fused_attn_fp8.h +++ b/transformer_engine/common/fused_attn/fused_attn_fp8.h @@ -8,6 +8,8 @@ * \brief Functions for fused attention for FP8 */ +#include + #include "transformer_engine/fused_attn.h" #include "transformer_engine/transformer_engine.h" @@ -39,4 +41,14 @@ void fused_attn_fp8_bwd( const Tensor *output_dK, const Tensor *output_dV, Tensor *output_dSoftmaxOffset, const Tensor *cu_seqlens_q, const Tensor *cu_seqlens_kv, const Tensor *rng_state, Tensor *workspace, cudaStream_t stream, cudnnHandle_t handle); + +// check if a given configuration is supported for FP8 forward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. +std::string is_supported_fp8_fwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle); + +// check if a given configuration is supported for FP8 backward; +// if it is, cache the graph built for this config, and return an empty string; +// if not, return a diagnostic message in the form of a string. +std::string is_supported_fp8_bwd(const NVTEFusedAttnConfig *cfg, cudnnHandle_t handle); } // namespace transformer_engine diff --git a/transformer_engine/common/include/transformer_engine/fused_attn.h b/transformer_engine/common/include/transformer_engine/fused_attn.h index d9d2786623..dba7dd68d6 100644 --- a/transformer_engine/common/include/transformer_engine/fused_attn.h +++ b/transformer_engine/common/include/transformer_engine/fused_attn.h @@ -11,6 +11,8 @@ #ifndef TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_ #define TRANSFORMER_ENGINE_FUSED_ATTN_FP8_H_ +#include + #include "stdint.h" #include "transformer_engine.h" @@ -194,7 +196,119 @@ NVTE_QKV_Format nvte_get_q_format(NVTE_QKV_Layout qkv_layout); */ NVTE_QKV_Format nvte_get_kv_format(NVTE_QKV_Layout qkv_layout); +/*! \struct NVTEFusedAttnConfig + * \brief Attention configuration. + * + * Versioning rules: + * - ``struct_size`` MUST be set to ``sizeof(NVTEFusedAttnConfig)`` by the + * caller (use ``NVTE_FUSED_ATTN_CONFIG_INIT``). + * - New fields may only be appended at the end; existing fields are never + * reordered, removed, or resized. The library reads only fields that are + * in range according to ``struct_size`` and uses safe defaults otherwise. + */ +typedef struct NVTEFusedAttnConfig { + size_t struct_size; /*!< MUST equal sizeof(NVTEFusedAttnConfig). */ + uint32_t reserved0; /*!< Padding for layout stability; set to 0. */ + uint32_t reserved1; /*!< Padding for layout stability; set to 0. */ + + NVTE_QKV_Layout qkv_layout; /*!< QKV tensors' layout. */ + NVTE_QKV_Format o_format; /*!< Output O tensor format. */ + NVTE_QKV_Format do_format; /*!< Output-grad dO tensor format (bwd). */ + NVTE_QKV_Layout dqkv_layout; /*!< Gradient dQKV tensor layout (bwd). */ + NVTE_QKV_Format qkv_scale_inv_format; /*!< QKV scale_inv tensor format (FP8). */ + NVTE_QKV_Format do_scale_inv_format; /*!< dO scale_inv tensor format (FP8 bwd). */ + NVTE_Bias_Type bias_type; /*!< Attention bias type. */ + NVTE_Mask_Type attn_mask_type; /*!< Attention mask type. */ + NVTE_Softmax_Type softmax_type; /*!< Attention softmax type. 
*/ + NVTEScalingMode scaling_mode; /*!< Scaling mode (e.g. delayed, MXFP8). */ + float attn_scale; /*!< Pre-softmax attention scale factor. */ + float dropout; /*!< Dropout probability. */ + size_t max_seqlen_q; /*!< Max sequence length for Q. */ + size_t max_seqlen_kv; /*!< Max sequence length for K, V. */ + int64_t window_size_left; /*!< Sliding window size (left half); -1 = unlimited. */ + int64_t window_size_right; /*!< Sliding window size (right half); -1 = unlimited. */ + bool bottom_right_diagonal; /*!< Whether causal mask aligns to the bottom-right diagonal. */ + bool cuda_graph; /*!< Whether CUDA graph capture is enabled. */ + + NVTEDType qkv_dtype; /*!< Data type of Tensors Q, K, V. Q and K/V must share a dtype. */ + NVTEDType o_dtype; /*!< Data type of Tensor O. */ + NVTEDType do_dtype; /*!< Data type of Tensor dO (bwd). */ + NVTEDType dqkv_dtype; /*!< Data type of Tensors dQ, dK, dV (bwd). */ + size_t batch_size; /*!< Batch size. */ + size_t num_attn_heads; /*!< Number of heads in Q. */ + size_t num_gqa_groups; /*!< Number of heads in K, V. */ + size_t head_dim_qk; /*!< Head dimension of Q, K. */ + size_t head_dim_v; /*!< Head dimension of V. */ + + size_t num_pages_k; /*!< Total number of K cache pages. */ + size_t num_pages_v; /*!< Total number of V cache pages. */ + size_t page_size_k; /*!< Tokens per K cache page. */ + size_t page_size_v; /*!< Tokens per V cache page. */ + size_t max_pages_per_seq_k; /*!< Max K pages per sequence in the batch. */ + size_t max_pages_per_seq_v; /*!< Max V pages per sequence in the batch. */ + + size_t bias_batch_size; /*!< Bias broadcast dim for batch. */ + size_t bias_num_heads; /*!< Bias broadcast dim for heads. */ + size_t bias_seqlen_q; /*!< Bias broadcast dim for Q sequence length. */ + size_t bias_seqlen_kv; /*!< Bias broadcast dim for K/V sequence length. */ + + bool is_training; /*!< Whether the model is in training mode. */ + bool return_max_logit; /*!< Whether to produce Max along with Stats (fwd-only). */ + bool deterministic; /*!< Whether determinism is required (bwd-only). */ +} NVTEFusedAttnConfig; + +/*! \brief Default-initialize an ``NVTEFusedAttnConfig``. + * + * Sets ``struct_size`` and the categorical fields (layouts, formats, masks, + * window sizes, scaling mode) to safe NOT_SET / no-op defaults. Numeric and + * tensor-derived fields, paged-KV shape, bias broadcast shape, and direction + * flags all default to zero/false; callers must set the fields relevant to + * their query. + */ +#define NVTE_FUSED_ATTN_CONFIG_INIT \ + { \ + .struct_size = sizeof(NVTEFusedAttnConfig), \ + .qkv_layout = NVTE_QKV_Layout_NOT_SET, \ + .o_format = NVTE_QKV_Format_NOT_SET, \ + .do_format = NVTE_QKV_Format_NOT_SET, \ + .dqkv_layout = NVTE_QKV_Layout_NOT_SET, \ + .qkv_scale_inv_format = NVTE_QKV_Format_NOT_SET, \ + .do_scale_inv_format = NVTE_QKV_Format_NOT_SET, \ + .bias_type = NVTE_NO_BIAS, \ + .attn_mask_type = NVTE_NO_MASK, \ + .softmax_type = NVTE_VANILLA_SOFTMAX, \ + .scaling_mode = NVTE_DELAYED_TENSOR_SCALING, \ + .window_size_left = -1, \ + .window_size_right = -1, \ + } + +/*! \brief Get fused attention backend based on input parameters. + * + * This call exercises cudnn-frontend's support checks by building (and caching) + * the cuDNN execution graph for the supported configurations. The configuration + * parameters are a superset of those of ``nvte_fused_attn_fwd`` and + * ``nvte_fused_attn_bwd`` to maintain a consistent signature between graph + * building and runtime calls. + * + * \param[in] cfg Attention configuration. 
Must be initialized + * with ``NVTE_FUSED_ATTN_CONFIG_INIT`` and have + * ``cfg->struct_size`` set to ``sizeof(NVTEFusedAttnConfig)``. + * \param[out] message Empty on success, otherwise a diagnostic string describing + * why the configuration was rejected. The string pointer + * refers to a per-thread buffer owned by the library and + * remains valid only until the next call to + * ``nvte_get_fused_attn_backend_v2`` on the same thread; + * callers that need to retain the message across further + * calls must copy it. Pass NULL to skip diagnostics. + * + * \return Backend able to execute this configuration, or ``NVTE_No_Backend`` if none. + */ +NVTE_Fused_Attn_Backend nvte_get_fused_attn_backend_v2(const NVTEFusedAttnConfig *cfg, + const char **message); + /*! \brief Get fused attention backend based on input parameters. + * + * \deprecated This function has been deprecated in favor of nvte_get_fused_attn_backend_v2. * * \param[in] is_training Whether the model is in training mode. * \param[in] q_dtype The data type of Tensor Q. diff --git a/transformer_engine/common/util/pybind_helper.h b/transformer_engine/common/util/pybind_helper.h index ef7687e3e9..950bb7778f 100644 --- a/transformer_engine/common/util/pybind_helper.h +++ b/transformer_engine/common/util/pybind_helper.h @@ -82,6 +82,13 @@ .value("NVTE_F16_arbitrary_seqlen", NVTE_Fused_Attn_Backend::NVTE_F16_arbitrary_seqlen) \ .value("NVTE_FP8", NVTE_Fused_Attn_Backend::NVTE_FP8) \ .value("NVTE_No_Backend", NVTE_Fused_Attn_Backend::NVTE_No_Backend); \ + pybind11::enum_(m, "NVTEScalingMode", pybind11::module_local()) \ + .value("NVTE_DELAYED_TENSOR_SCALING", NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING) \ + .value("NVTE_MXFP8_1D_SCALING", NVTEScalingMode::NVTE_MXFP8_1D_SCALING) \ + .value("NVTE_BLOCK_SCALING_1D", NVTEScalingMode::NVTE_BLOCK_SCALING_1D) \ + .value("NVTE_BLOCK_SCALING_2D", NVTEScalingMode::NVTE_BLOCK_SCALING_2D) \ + .value("NVTE_NVFP4_1D_SCALING", NVTEScalingMode::NVTE_NVFP4_1D_SCALING) \ + .value("NVTE_INVALID_SCALING", NVTEScalingMode::NVTE_INVALID_SCALING); \ pybind11::enum_( \ m, "Float8BlockScaleTensorFormat", pybind11::module_local()) \ .value("GEMM_READY", transformer_engine::Float8BlockScaleTensorFormat::GEMM_READY) \ diff --git a/transformer_engine/jax/attention.py b/transformer_engine/jax/attention.py index f54a043fd2..e4fce42ce7 100644 --- a/transformer_engine/jax/attention.py +++ b/transformer_engine/jax/attention.py @@ -13,6 +13,7 @@ import jax.numpy as jnp from transformer_engine_jax import NVTE_Bias_Type +from transformer_engine_jax import NVTE_Fused_Attn_Backend from transformer_engine_jax import NVTE_Mask_Type from transformer_engine_jax import NVTE_QKV_Layout from transformer_engine_jax import NVTE_QKV_Format @@ -325,6 +326,7 @@ def canonicalize_attn_mask_type(attn_mask_type: str): def is_fused_attn_kernel_available( is_training, + batch_size, q_dtype, kv_dtype, qkv_layout, @@ -339,15 +341,26 @@ def is_fused_attn_kernel_available( head_dim_qk, head_dim_v, window_size: Optional[Tuple[int, int]] = None, + bottom_right_diagonal: Optional[bool] = None, + return_reason: bool = False, ): """ - To check whether the fused attention kernel is supported + To check whether the fused attention kernel is supported. + + When ``return_reason`` is ``True``, returns ``(available, message)`` where ``message`` is + the diagnostic string for the reason why the fused attention kernel is not supported (empty on success). 
""" window_size_tuple = (-1, -1) if window_size is None else window_size def make_helper(attn_mask_type): + bottom_right = ( + attn_mask_type.is_bottom_right() + if bottom_right_diagonal is None + else bottom_right_diagonal + ) return tex.FusedAttnHelper( is_training, + batch_size, q_dtype, kv_dtype, qkv_layout, @@ -362,9 +375,15 @@ def make_helper(attn_mask_type): head_dim_qk, head_dim_v, window_size_tuple, + bottom_right, ) - return make_helper(attn_mask_type).is_fused_attn_kernel_available() + helper = make_helper(attn_mask_type) + if return_reason: + backend, message = helper.get_fused_attn_backend() + available = backend != NVTE_Fused_Attn_Backend.NVTE_No_Backend + return available, message + return helper.is_fused_attn_kernel_available() def _obtain_batch_and_max_seqlen(qkv, qkv_layout): diff --git a/transformer_engine/jax/cpp_extensions/attention.py b/transformer_engine/jax/cpp_extensions/attention.py index 489bfde997..a895d8eac3 100644 --- a/transformer_engine/jax/cpp_extensions/attention.py +++ b/transformer_engine/jax/cpp_extensions/attention.py @@ -16,7 +16,12 @@ from jax.experimental.custom_partitioning import SdyShardingRule import transformer_engine_jax -from transformer_engine_jax import NVTE_Fused_Attn_Backend +from transformer_engine_jax import ( + NVTE_Fused_Attn_Backend, + NVTE_QKV_Format, + NVTE_QKV_Layout, + NVTEScalingMode, +) from transformer_engine.jax.attention import ( AttnBiasType, AttnMaskType, @@ -108,6 +113,7 @@ class FusedAttnHelper: """ is_training: bool + batch_size: int q_dtype: jnp.dtype kv_dtype: jnp.dtype qkv_layout: QKVLayout @@ -122,21 +128,44 @@ class FusedAttnHelper: head_dim_qk: int head_dim_v: int window_size: Tuple[int, int] + bottom_right_diagonal: bool + attn_scale: float = 1.0 def is_fused_attn_kernel_available(self): - """Check if there is available fused attention kernel""" - return self.get_fused_attn_backend() != NVTE_Fused_Attn_Backend.NVTE_No_Backend + """Check if there is available fused attention kernel. + + Use ``get_fused_attn_backend()`` directly to also get the diagnostic message + explaining why a configuration was rejected. + """ + backend, _ = self.get_fused_attn_backend() + return backend != NVTE_Fused_Attn_Backend.NVTE_No_Backend def get_fused_attn_backend(self): - """Get the fused attention kernel backend""" + """Get the fused attention kernel backend. + + Returns a ``(backend, message)`` tuple. ``message`` is empty on success, otherwise a + diagnostic string describing why the configuration was rejected when backend = NVTE_No_Backend. 
+        """
+        q_type = jax_dtype_to_te_dtype(self.q_dtype)
         return transformer_engine_jax.get_fused_attn_backend(
             self.is_training,
-            jax_dtype_to_te_dtype(self.q_dtype),
+            self.batch_size,
+            q_type,
             jax_dtype_to_te_dtype(self.kv_dtype),
+            q_type,
+            q_type,
+            q_type,
+            NVTEScalingMode.NVTE_INVALID_SCALING,
             self.qkv_layout.value,
+            NVTE_QKV_Format.NVTE_QKV_Format_NOT_SET,
+            NVTE_QKV_Format.NVTE_QKV_Format_NOT_SET,
+            NVTE_QKV_Layout.NVTE_QKV_Layout_NOT_SET,
+            NVTE_QKV_Format.NVTE_QKV_Format_NOT_SET,
+            NVTE_QKV_Format.NVTE_QKV_Format_NOT_SET,
             self.attn_bias_type.value,
             self.attn_mask_type.value,
             self.softmax_type.value,
+            self.attn_scale,
             self.dropout_probability,
             self.q_num_heads,
             self.kv_num_heads,
@@ -146,6 +175,7 @@ def get_fused_attn_backend(self):
             self.head_dim_v,
             self.window_size[0],
             self.window_size[1],
+            self.bottom_right_diagonal,
             not self.is_non_deterministic_allowed(),
         )

@@ -335,8 +365,10 @@ def abstract(
         out_aval = q_aval.update(shape=output_shape, dtype=q_dtype)

         # backend determines the softmax buffer shape/dtype
-        backend = FusedAttnHelper(
+        input_batch = reduce(operator.mul, batch_shape)
+        backend, message = FusedAttnHelper(
             config.is_training,
+            input_batch,
             q_dtype,
             k_dtype,
             config.qkv_layout,
@@ -351,6 +383,8 @@ def abstract(
             q_head_dim,
             v_head_dim,
             config.window_size,
+            config.bottom_right_diagonal,
+            attn_scale=float(config.scaling_factor),
         ).get_fused_attn_backend()

         if backend == NVTE_Fused_Attn_Backend.NVTE_F16_arbitrary_seqlen:
@@ -369,7 +403,7 @@ def abstract(
             )
             softmax_dtype = dtypes.canonicalize_dtype(jnp.float32)
         else:
-            raise ValueError(f"Unsupported {backend=}")
+            raise ValueError(f"Unsupported {backend=}: {message}")
         softmax_aux_aval = q_aval.update(shape=softmax_shape, dtype=softmax_dtype)

         # JAX does not enable 64-bit int by default so we get XLA to allocate x8 memory with
diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h
index 2ecfedc8a2..b2adb3b042 100644
--- a/transformer_engine/jax/csrc/extensions.h
+++ b/transformer_engine/jax/csrc/extensions.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 

 #include "common/common.h"
@@ -146,12 +147,16 @@
 XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnForwardHandler);
 XLA_FFI_DECLARE_HANDLER_SYMBOL(FusedAttnBackwardHandler);

-NVTE_Fused_Attn_Backend GetFusedAttnBackend(
-    bool is_training, DType q_dtype, DType kv_dtype, NVTE_QKV_Layout qkv_layout,
+std::tuple<NVTE_Fused_Attn_Backend, std::string> GetFusedAttnBackend(
+    bool is_training, size_t batch_size, DType q_dtype, DType kv_dtype, DType o_dtype,
+    DType do_dtype, DType dqkv_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout,
+    NVTE_QKV_Format o_format, NVTE_QKV_Format do_format, NVTE_QKV_Layout dqkv_layout,
+    NVTE_QKV_Format qkv_scale_inv_format, NVTE_QKV_Format do_scale_inv_format,
     NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
-    float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen,
-    size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left,
-    int64_t window_size_right, bool deterministic);
+    float attn_scale, float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads,
+    size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim,
+    int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal,
+    bool deterministic);

 pybind11::tuple GetFusedAttnForwardWorkspaceSizes(
     size_t input_batch, size_t bias_batch, size_t q_max_seqlen, size_t kv_max_seqlen,
diff --git a/transformer_engine/jax/csrc/extensions/attention.cpp b/transformer_engine/jax/csrc/extensions/attention.cpp
index ed136d7b9e..573186b78d 100644
--- a/transformer_engine/jax/csrc/extensions/attention.cpp
+++ b/transformer_engine/jax/csrc/extensions/attention.cpp
@@ -11,18 +11,62 @@
 namespace transformer_engine {
 namespace jax {

-NVTE_Fused_Attn_Backend GetFusedAttnBackend(
-    bool is_training, DType q_dtype, DType kv_dtype, NVTE_QKV_Layout qkv_layout,
+std::tuple<NVTE_Fused_Attn_Backend, std::string> GetFusedAttnBackend(
+    bool is_training, size_t batch_size, DType q_dtype, DType kv_dtype, DType o_dtype,
+    DType do_dtype, DType dqkv_dtype, NVTEScalingMode scaling_mode, NVTE_QKV_Layout qkv_layout,
+    NVTE_QKV_Format o_format, NVTE_QKV_Format do_format, NVTE_QKV_Layout dqkv_layout,
+    NVTE_QKV_Format qkv_scale_inv_format, NVTE_QKV_Format do_scale_inv_format,
     NVTE_Bias_Type bias_type, NVTE_Mask_Type mask_type, NVTE_Softmax_Type softmax_type,
-    float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads, size_t q_max_seqlen,
-    size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim, int64_t window_size_left,
-    int64_t window_size_right, bool deterministic) {
-  auto backend = nvte_get_fused_attn_backend(
-      is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), qkv_layout,
-      bias_type, mask_type, softmax_type, dropout_probability, q_attn_heads, kv_attn_heads,
-      q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right,
-      false, false, deterministic);
-  return backend;
+    float attn_scale, float dropout_probability, size_t q_attn_heads, size_t kv_attn_heads,
+    size_t q_max_seqlen, size_t kv_max_seqlen, size_t qk_head_dim, size_t v_head_dim,
+    int64_t window_size_left, int64_t window_size_right, bool bottom_right_diagonal,
+    bool deterministic) {
+  if (o_format == NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET) {
+    o_format = nvte_get_q_format(qkv_layout);
+  }
+  if (do_format == NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET) {
+    do_format = o_format;
+  }
+  if (dqkv_layout == NVTE_QKV_Layout::NVTE_QKV_Layout_NOT_SET) {
+    dqkv_layout = qkv_layout;
+  }
+  NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type.");
+
+  NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT;
+  cfg.qkv_layout = qkv_layout;
+  cfg.o_format = o_format;
+  cfg.do_format = do_format;
+  cfg.dqkv_layout = dqkv_layout;
+  cfg.qkv_scale_inv_format = qkv_scale_inv_format;
+  cfg.do_scale_inv_format = do_scale_inv_format;
+  cfg.bias_type = bias_type;
+  cfg.attn_mask_type = mask_type;
+  cfg.softmax_type = softmax_type;
+  cfg.scaling_mode = scaling_mode;
+  cfg.attn_scale = attn_scale;
+  cfg.dropout = dropout_probability;
+  cfg.max_seqlen_q = q_max_seqlen;
+  cfg.max_seqlen_kv = kv_max_seqlen;
+  cfg.window_size_left = window_size_left;
+  cfg.window_size_right = window_size_right;
+  cfg.bottom_right_diagonal = bottom_right_diagonal;
+  cfg.cuda_graph = false;
+  cfg.qkv_dtype = static_cast<NVTEDType>(q_dtype);
+  cfg.o_dtype = static_cast<NVTEDType>(o_dtype);
+  cfg.do_dtype = static_cast<NVTEDType>(do_dtype);
+  cfg.dqkv_dtype = static_cast<NVTEDType>(dqkv_dtype);
+  cfg.batch_size = batch_size;
+  cfg.num_attn_heads = q_attn_heads;
+  cfg.num_gqa_groups = kv_attn_heads;
+  cfg.head_dim_qk = qk_head_dim;
+  cfg.head_dim_v = v_head_dim;
+  cfg.is_training = is_training;
+  cfg.return_max_logit = false;
+  cfg.deterministic = deterministic;
+
+  const char *message = nullptr;
+  auto backend = nvte_get_fused_attn_backend_v2(&cfg, &message);
+  return {backend, message != nullptr ?
std::string(message) : std::string()}; } /* @@ -261,11 +305,13 @@ static void FusedAttnForwardImpl( /* Prepare RNG state */ auto rng_state_tensor = TensorWrapper(rng_state, std::vector{2}, DType::kInt64); - auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), qkv_layout, - bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, - q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, - false, false, deterministic); + auto [backend, _fwd_msg] = GetFusedAttnBackend( + is_training, input_batch, dtype, dtype, dtype, dtype, dtype, NVTE_INVALID_SCALING, qkv_layout, + NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + NVTE_QKV_Layout::NVTE_QKV_Layout_NOT_SET, NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, bias_type, mask_type, softmax_type, scaling_factor, + dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, + v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, deterministic); nvte_populate_rng_state_async(rng_state, seed, q_max_seqlen, kv_max_seqlen, backend, stream); /* Auxiliary tensors (to be propagated to the backward pass later) */ @@ -537,11 +583,13 @@ static void FusedAttnBackwardImpl( /* Auxiliary tensors (propagated from the forward pass) */ NVTETensorPack aux_input_tensors; nvte_tensor_pack_create(&aux_input_tensors); - auto backend = nvte_get_fused_attn_backend( - is_training, static_cast(dtype), static_cast(dtype), qkv_layout, - bias_type, mask_type, softmax_type, dropout_probability, attn_heads, num_gqa_groups, - q_max_seqlen, kv_max_seqlen, qk_head_dim, v_head_dim, window_size_left, window_size_right, - false, false, deterministic); + auto [backend, _bwd_msg] = GetFusedAttnBackend( + is_training, input_batch, dtype, dtype, dtype, dtype, dtype, NVTE_INVALID_SCALING, qkv_layout, + NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + NVTE_QKV_Layout::NVTE_QKV_Layout_NOT_SET, NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, + NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET, bias_type, mask_type, softmax_type, scaling_factor, + dropout_probability, attn_heads, num_gqa_groups, q_max_seqlen, kv_max_seqlen, qk_head_dim, + v_head_dim, window_size_left, window_size_right, bottom_right_diagonal, deterministic); PrepareFusedAttnBackwardAuxTensors(&aux_input_tensors, input_batch, bias_batch, attn_heads, bias_heads, q_max_seqlen, kv_max_seqlen, dtype, backend, softmax_aux, rng_state, bias, softmax_offset); diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp index 70d0403b3e..bdfec12b8b 100644 --- a/transformer_engine/jax/csrc/extensions/pybind.cpp +++ b/transformer_engine/jax/csrc/extensions/pybind.cpp @@ -160,12 +160,14 @@ PYBIND11_MODULE(transformer_engine_jax, m) { .value("NVTE_BSHD_BSHD_BSHD", NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD) .value("NVTE_T3HD", NVTE_QKV_Layout::NVTE_T3HD) .value("NVTE_THD_T2HD", NVTE_QKV_Layout::NVTE_THD_T2HD) - .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD); + .value("NVTE_THD_THD_THD", NVTE_QKV_Layout::NVTE_THD_THD_THD) + .value("NVTE_QKV_Layout_NOT_SET", NVTE_QKV_Layout::NVTE_QKV_Layout_NOT_SET); pybind11::enum_(m, "NVTE_QKV_Format", pybind11::module_local()) .value("NVTE_SBHD", NVTE_QKV_Format::NVTE_SBHD) .value("NVTE_BSHD", NVTE_QKV_Format::NVTE_BSHD) - .value("NVTE_THD", NVTE_QKV_Format::NVTE_THD); + .value("NVTE_THD", 
NVTE_QKV_Format::NVTE_THD)
+      .value("NVTE_QKV_Format_NOT_SET", NVTE_QKV_Format::NVTE_QKV_Format_NOT_SET);

   pybind11::enum_<NVTE_Softmax_Type>(m, "NVTE_Softmax_Type", pybind11::module_local())
       .value("NVTE_VANILLA_SOFTMAX", NVTE_Softmax_Type::NVTE_VANILLA_SOFTMAX)
@@ -206,6 +208,14 @@
       .value("NVFP4_2D_SCALING", JAXX_Scaling_Mode::NVFP4_2D_SCALING)
       .export_values();

+  pybind11::enum_<NVTEScalingMode>(m, "NVTEScalingMode", pybind11::module_local())
+      .value("NVTE_DELAYED_TENSOR_SCALING", NVTEScalingMode::NVTE_DELAYED_TENSOR_SCALING)
+      .value("NVTE_MXFP8_1D_SCALING", NVTEScalingMode::NVTE_MXFP8_1D_SCALING)
+      .value("NVTE_BLOCK_SCALING_1D", NVTEScalingMode::NVTE_BLOCK_SCALING_1D)
+      .value("NVTE_BLOCK_SCALING_2D", NVTEScalingMode::NVTE_BLOCK_SCALING_2D)
+      .value("NVTE_NVFP4_1D_SCALING", NVTEScalingMode::NVTE_NVFP4_1D_SCALING)
+      .value("NVTE_INVALID_SCALING", NVTEScalingMode::NVTE_INVALID_SCALING);
+
   pybind11::enum_<JAXX_Quantize_Layout>(m, "JAXX_Quantize_Layout", pybind11::module_local())
       .value("ROWWISE", JAXX_Quantize_Layout::ROWWISE)
       .value("COLWISE", JAXX_Quantize_Layout::COLWISE)
diff --git a/transformer_engine/jax/flax/transformer.py b/transformer_engine/jax/flax/transformer.py
index a2e7920843..35a48442d2 100644
--- a/transformer_engine/jax/flax/transformer.py
+++ b/transformer_engine/jax/flax/transformer.py
@@ -748,6 +748,8 @@ def __call__(
         enable_fused_attn = int(os.getenv("NVTE_FUSED_ATTN", "1"))

         sequence_dim = 0 if self.transpose_batch_sequence else 1
+        batch_dim = 1 - sequence_dim
+        batch_size = query.shape[batch_dim]
         seqlen_q = query.shape[sequence_dim]
         if qkv_layout == QKVLayout.BS3HD:
             seqlen_kv = seqlen_q
@@ -760,9 +762,10 @@
             head_dim_qk = self.head_dim
             head_dim_v = self.head_dim

-        has_fused_attn_kernel = is_fused_attn_kernel_available(
+        has_fused_attn_kernel, fused_attn_reject_reason = is_fused_attn_kernel_available(
             # This needs to be fixed: TE-Jax has historically correlated training mode with deterministic mode.
not deterministic, + batch_size, input_dtype, # self._assert_dtypes enforces Q, K, V, bias to have the same dtype so using input_dtype as kv dtype is sufficient input_dtype, @@ -778,18 +781,17 @@ def __call__( head_dim_qk, head_dim_v, self.window_size, + return_reason=True, ) use_fused_attn = enable_fused_attn and has_fused_attn_kernel if enable_fused_attn and not has_fused_attn_kernel: + reason = fused_attn_reject_reason or "(no diagnostic message available)" warnings.warn( - "Fused attention is not enabled because there is no available kernel.\n" - "Fall back to the unfused attention.\n" - "Please try to update the cuDNN and TE to the latest version.\n" - f"{qkv_layout=}\n{attn_bias_type=}\n{attn_mask_type=}\n" - f"{self.attention_dropout=}\n{self.num_attention_heads=}\n{self.window_size=}\n" - f"{self.num_gqa_groups=}\n{seqlen_q=}\n{seqlen_kv=}\n{head_dim_qk=}\n{head_dim_v=}\n" + "Falling back to the unfused attention backend as fused attention does not" + f" support:\n{qkv_layout=}\n{attn_bias_type=}\n{attn_mask_type=}\n{self.attention_dropout=}\n{self.num_attention_heads=}\n{self.window_size=}\n{self.num_gqa_groups=}\n{seqlen_q=}\n{seqlen_kv=}\n{head_dim_qk=}\n{head_dim_v=}\nReason" + f" for this rejection: {reason}\n" ) dropout_rng = None diff --git a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py index b38b66c3e6..beec39fd66 100644 --- a/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py +++ b/transformer_engine/pytorch/attention/dot_product_attention/dot_product_attention.py @@ -430,6 +430,7 @@ def __init__( softmax_scale = 1.0 / math.sqrt( kv_channels if isinstance(kv_channels, int) else kv_channels[0] ) + self.softmax_scale = softmax_scale self.deterministic = ( not bool(int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1"))) @@ -1563,6 +1564,7 @@ def forward( return_max_logit=self.return_max_logit, cuda_graph=is_graph_capturing(), num_splits=num_splits, + softmax_scale=self.softmax_scale, ) global _attention_backends if is_in_onnx_export_mode(): diff --git a/transformer_engine/pytorch/attention/dot_product_attention/utils.py b/transformer_engine/pytorch/attention/dot_product_attention/utils.py index 1f1637cecd..cf9b021e59 100644 --- a/transformer_engine/pytorch/attention/dot_product_attention/utils.py +++ b/transformer_engine/pytorch/attention/dot_product_attention/utils.py @@ -23,6 +23,7 @@ import transformer_engine as te from transformer_engine.pytorch.cpp_extensions.fused_attn import ( QKVLayout, + QKVFormat, AttnBiasType, AttnMaskType, SoftmaxType, @@ -256,6 +257,9 @@ class AttentionParams: Whether support for cuda graph capture is needed or not. num_splits : int, default = 1 The number of kernels to split attention to. + softmax_scale : float, default = 1.0 + Pre-softmax attention scale. Plumbed through to the cuDNN graph cache key so that the + backend probe builds the same execution graph the runtime call later reuses. 
""" qkv_type: Union[torch.Tensor, Float8Tensor] = torch.Tensor @@ -289,6 +293,7 @@ class AttentionParams: return_max_logit: bool = False cuda_graph: bool = False num_splits: int = 1 + softmax_scale: float = 1.0 def __eq__(self, other): """ @@ -367,6 +372,7 @@ def get_attention_backend( return_max_logit = attention_params.return_max_logit cuda_graph = attention_params.cuda_graph num_splits = attention_params.num_splits + softmax_scale = attention_params.softmax_scale # Run config logger = logging.getLogger("DotProductAttention") @@ -1222,17 +1228,56 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt if use_fused_attention: q_type = TE_DType[qkv_dtype] kv_type = q_type + o_type = q_type + do_type = q_type + dqkv_type = q_type + scaling_mode = tex.NVTEScalingMode.NVTE_INVALID_SCALING + qkv_scale_inv_format = None + do_scale_inv_format = None if fp8 and fp8_meta["recipe"].fp8_dpa: - q_type = get_fp8_te_dtype(fp8_meta["recipe"], fprop_tensor=True) + recipe = fp8_meta["recipe"] + q_type = get_fp8_te_dtype(recipe, fprop_tensor=True) kv_type = q_type - fused_attention_backend = tex.get_fused_attn_backend( + cs_o_in_f16 = os.getenv("NVTE_DPA_FP8CS_O_in_F16", "1") == "1" + if recipe.mxfp8(): + scaling_mode = tex.NVTEScalingMode.NVTE_MXFP8_1D_SCALING + o_type = TE_DType[torch.bfloat16] + do_type = TE_DType[torch.bfloat16] + dqkv_type = TE_DType[torch.bfloat16] + qkv_scale_inv_format = "bhsd" + do_scale_inv_format = "bhsd" + elif recipe.float8_current_scaling() and cs_o_in_f16: + scaling_mode = tex.NVTEScalingMode.NVTE_DELAYED_TENSOR_SCALING + o_type = TE_DType[torch.bfloat16] + do_type = TE_DType[torch.bfloat16] + dqkv_type = TE_DType[torch.bfloat16] + else: + scaling_mode = tex.NVTEScalingMode.NVTE_DELAYED_TENSOR_SCALING + o_type = q_type + do_type = o_type + dqkv_type = q_type + o_format = q_format + do_format = o_format + dqkv_layout = qkv_layout + fused_attention_backend, reject_message = tex.get_fused_attn_backend( is_training, + batch_size, q_type, kv_type, + o_type, + do_type, + dqkv_type, + scaling_mode, QKVLayout[qkv_layout], + QKVFormat[o_format], + QKVFormat[do_format], + QKVLayout[dqkv_layout], + QKVFormat[qkv_scale_inv_format], + QKVFormat[do_scale_inv_format], AttnBiasType[fu_core_attention_bias_type], AttnMaskType[attn_mask_type], SoftmaxType[softmax_type], + softmax_scale, attention_dropout, num_heads, num_gqa_groups, @@ -1242,12 +1287,16 @@ def _is_fa3_supported(num_heads, num_gqa_groups, head_dim_qk, head_dim_v, qkv_dt head_dim_v, window_size[0], window_size[1], + bottom_right_diagonal, return_max_logit, cuda_graph, deterministic, ) if fused_attention_backend == FusedAttnBackend["No_Backend"]: - logger.debug("Disabling FusedAttention as no backend supports the provided input") + logger.debug( + "Disabling FusedAttention: %s", + reject_message, + ) use_fused_attention = False fused_attention_backend = None # Filter: Determinism diff --git a/transformer_engine/pytorch/csrc/extensions.h b/transformer_engine/pytorch/csrc/extensions.h index 9b10a9c5a4..1140cb5b02 100644 --- a/transformer_engine/pytorch/csrc/extensions.h +++ b/transformer_engine/pytorch/csrc/extensions.h @@ -75,12 +75,18 @@ std::tuple moe_unpermute_bwd(at::Tensor input_bwd, at::T * Attention **************************************************************************************************/ -NVTE_Fused_Attn_Backend get_fused_attn_backend( - bool is_training, const DType q_dtype, const DType kv_dtype, NVTE_QKV_Layout qkv_layout, - NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, 
NVTE_Softmax_Type softmax_type,
-    float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
-    size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic);
+// Returns (backend, reason). `reason` is empty on success; otherwise it is a diagnostic string
+// describing why the configuration was rejected (i.e. when the returned backend is NVTE_No_Backend).
+std::tuple<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend(
+    bool is_training, size_t batch_size, const DType q_dtype, const DType kv_dtype,
+    const DType o_dtype, const DType do_dtype, const DType dqkv_dtype, NVTEScalingMode scaling_mode,
+    NVTE_QKV_Layout qkv_layout, NVTE_QKV_Format o_format, NVTE_QKV_Format do_format,
+    NVTE_QKV_Layout dqkv_layout, NVTE_QKV_Format qkv_scale_inv_format,
+    NVTE_QKV_Format do_scale_inv_format, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
+    NVTE_Softmax_Type softmax_type, float attn_scale, float p_dropout, size_t num_attn_heads,
+    size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk,
+    size_t head_dim_v, int64_t window_size_left, int64_t window_size_right,
+    bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic);

 std::vector fused_attn_fwd(
     size_t max_seqlen_q, size_t max_seqlen_kv, bool is_training, float attn_scale, float p_dropout,
diff --git a/transformer_engine/pytorch/csrc/extensions/attention.cpp b/transformer_engine/pytorch/csrc/extensions/attention.cpp
index 7e8018b3fd..afcdcae015 100644
--- a/transformer_engine/pytorch/csrc/extensions/attention.cpp
+++ b/transformer_engine/pytorch/csrc/extensions/attention.cpp
@@ -40,18 +40,52 @@ void mha_fill(const transformer_engine::TensorWrapper &self, const at::Tensor &s
 namespace transformer_engine::pytorch {

 // get the fused attention backend
-NVTE_Fused_Attn_Backend get_fused_attn_backend(
-    bool is_training, const DType q_dtype, const DType kv_dtype, NVTE_QKV_Layout qkv_layout,
-    NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type, NVTE_Softmax_Type softmax_type,
-    float p_dropout, size_t num_attn_heads, size_t num_gqa_groups, size_t max_seqlen_q,
-    size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
-    int64_t window_size_right, bool return_max_logit, bool cuda_graph, bool deterministic) {
-  NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
-      is_training, static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), qkv_layout,
-      bias_type, attn_mask_type, softmax_type, p_dropout, num_attn_heads, num_gqa_groups,
-      max_seqlen_q, max_seqlen_kv, head_dim_qk, head_dim_v, window_size_left, window_size_right,
-      return_max_logit, cuda_graph, deterministic);
-  return fused_attention_backend;
+std::tuple<NVTE_Fused_Attn_Backend, std::string> get_fused_attn_backend(
+    bool is_training, size_t batch_size, const DType q_dtype, const DType kv_dtype,
+    const DType o_dtype, const DType do_dtype, const DType dqkv_dtype, NVTEScalingMode scaling_mode,
+    NVTE_QKV_Layout qkv_layout, NVTE_QKV_Format o_format, NVTE_QKV_Format do_format,
+    NVTE_QKV_Layout dqkv_layout, NVTE_QKV_Format qkv_scale_inv_format,
+    NVTE_QKV_Format do_scale_inv_format, NVTE_Bias_Type bias_type, NVTE_Mask_Type attn_mask_type,
+    NVTE_Softmax_Type softmax_type, float attn_scale, float p_dropout, size_t num_attn_heads,
+    size_t num_gqa_groups, size_t max_seqlen_q, size_t max_seqlen_kv, size_t head_dim_qk,
+    size_t head_dim_v, int64_t window_size_left, int64_t window_size_right,
+    bool bottom_right_diagonal, bool return_max_logit, bool cuda_graph, bool deterministic) {
+  NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT;
+  cfg.qkv_layout = qkv_layout;
+  cfg.o_format = o_format;
+  cfg.do_format = do_format;
+  cfg.dqkv_layout = dqkv_layout;
+  cfg.qkv_scale_inv_format = qkv_scale_inv_format;
+  cfg.do_scale_inv_format = do_scale_inv_format;
+  cfg.bias_type = bias_type;
+  cfg.attn_mask_type = attn_mask_type;
+  cfg.softmax_type = softmax_type;
+  cfg.scaling_mode = scaling_mode;
+  cfg.attn_scale = attn_scale;
+  cfg.dropout = p_dropout;
+  cfg.max_seqlen_q = max_seqlen_q;
+  cfg.max_seqlen_kv = max_seqlen_kv;
+  cfg.window_size_left = window_size_left;
+  cfg.window_size_right = window_size_right;
+  cfg.bottom_right_diagonal = bottom_right_diagonal;
+  cfg.cuda_graph = cuda_graph;
+  NVTE_CHECK(q_dtype == kv_dtype, "Q and KV must have the same data type.");
+  cfg.qkv_dtype = static_cast<NVTEDType>(q_dtype);
+  cfg.o_dtype = static_cast<NVTEDType>(o_dtype);
+  cfg.do_dtype = static_cast<NVTEDType>(do_dtype);
+  cfg.dqkv_dtype = static_cast<NVTEDType>(dqkv_dtype);
+  cfg.batch_size = batch_size;
+  cfg.num_attn_heads = num_attn_heads;
+  cfg.num_gqa_groups = num_gqa_groups;
+  cfg.head_dim_qk = head_dim_qk;
+  cfg.head_dim_v = head_dim_v;
+  cfg.is_training = is_training;
+  cfg.return_max_logit = return_max_logit;
+  cfg.deterministic = deterministic;
+
+  const char *message = nullptr;
+  NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend_v2(&cfg, &message);
+  return {fused_attention_backend, message != nullptr ? std::string(message) : std::string()};
 }

 // helper function for S and dP quantizers
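Not part of the patch: below is a minimal, caller-side sketch of the v2 probe API introduced above. The include path, the concrete shape/dtype values, and the bf16 BS3HD causal configuration are illustrative assumptions; only the type, macro, field, and function names come from the declarations in this diff.

#include <cstdio>

#include <transformer_engine/fused_attn.h>  // assumed public header exposing the fused-attention C API

int main() {
  // Start from the initializer so struct_size and the categorical NOT_SET/no-op defaults are set.
  NVTEFusedAttnConfig cfg = NVTE_FUSED_ATTN_CONFIG_INIT;

  // Describe the configuration to probe (field names are those of the struct added in this patch;
  // the values here are arbitrary example numbers).
  cfg.qkv_layout = NVTE_BS3HD;
  cfg.o_format = nvte_get_q_format(cfg.qkv_layout);
  cfg.do_format = cfg.o_format;
  cfg.dqkv_layout = cfg.qkv_layout;
  cfg.qkv_dtype = kNVTEBFloat16;
  cfg.o_dtype = kNVTEBFloat16;
  cfg.do_dtype = kNVTEBFloat16;
  cfg.dqkv_dtype = kNVTEBFloat16;
  cfg.attn_mask_type = NVTE_CAUSAL_MASK;
  cfg.attn_scale = 0.125f;  // 1/sqrt(head_dim_qk)
  cfg.batch_size = 2;
  cfg.num_attn_heads = 16;
  cfg.num_gqa_groups = 16;
  cfg.max_seqlen_q = 2048;
  cfg.max_seqlen_kv = 2048;
  cfg.head_dim_qk = 64;
  cfg.head_dim_v = 64;
  cfg.is_training = true;

  // Probe for a backend; on rejection the returned diagnostic explains which check failed.
  const char *message = nullptr;
  NVTE_Fused_Attn_Backend backend = nvte_get_fused_attn_backend_v2(&cfg, &message);
  if (backend == NVTE_No_Backend) {
    std::printf("fused attention rejected: %s\n", message);
  } else {
    std::printf("selected fused attention backend: %d\n", static_cast<int>(backend));
  }
  // `message` points to a per-thread buffer; copy it before calling the probe again on this thread.
  return 0;
}

This is the same flow the PyTorch and JAX wrappers in this diff follow: populate only the fields relevant to the query, call the probe once, and surface the returned string when the result is NVTE_No_Backend.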