Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Fuser/auto_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def __init__(
ignore_router_config: bool = False,
use_router_cache: bool = True,
no_cusolver: bool = False,
test_timeout_s: int = 30,
test_timeout_s: int = 300,
test_code: str | None = None,
) -> None:
self.ka_model = ka_model
Expand Down
2 changes: 1 addition & 1 deletion Fuser/config/autoagent_default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ target_platform: cuda
ignore_router_config: false
use_router_cache: true
no_cusolver: false
test_timeout_s: 30
test_timeout_s: 300
test_code: null
2 changes: 1 addition & 1 deletion Fuser/dispatch_kernel_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def run(
target_platform: str = "cuda",
max_iters: int = 10,
no_cusolver: bool = False,
test_timeout_s: int = 30,
test_timeout_s: int = 300,
) -> Path:
"""Dispatch subgraphs to KernelAgent with optional parallelism.
Expand Down
2 changes: 1 addition & 1 deletion Fuser/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def run_pipeline(
verify: bool = True,
compose_max_iters: int = 5,
target_platform: str = "cuda",
test_timeout_s: int = 30,
test_timeout_s: int = 300,
) -> dict:
# Select default KernelAgent model if not provided: prefer GPT-5 for Level 2/3
if dispatch_model is None:
Expand Down
38 changes: 36 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Every stage writes artifacts to a run directory under `.optimize/<run_id>/`, inc
- Linux or macOS
- **GPU Requirements (one of the following):**
- **CUDA**: NVIDIA GPU with CUDA support
- **ROCm**: AMD GPU with ROCm 6.x+ (e.g., Instinct MI300X)
- **XPU**: Intel GPU with oneAPI support (Arc, Data Center GPUs, or integrated Xe graphics)
- Triton (installed separately: `pip install triton` or nightly from source)
- PyTorch (https://pytorch.org/get-started/locally/)
Expand All @@ -42,6 +43,29 @@ pip install -e .

### Platform-Specific PyTorch Installation

#### AMD ROCm (AMD GPUs)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
```

**Note:** AMD ROCm support requires:
- ROCm 6.x installed and `rocprofv3` (or `rocprof`) on `$PATH`
- Compatible AMD GPU (e.g., Instinct MI300X)

For optimization, use the bundled config as a quickstart:
```bash
python examples/run_opt_manager.py \
--kernel-dir examples/optimize_01_matvec/ \
--config examples/configs/amd.yaml
```

Verify your ROCm installation:
```python
import torch
print(torch.cuda.is_available()) # True if ROCm PyTorch detects GPU
print(torch.version.hip) # Should print the HIP/ROCm version
```

#### Intel XPU (Intel GPUs)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
Expand Down Expand Up @@ -225,8 +249,18 @@ KernelAgent supports multiple GPU platforms for Triton kernel execution:
| Platform | Device String | Flag | Status |
|----------|---------------|------|--------|
| NVIDIA CUDA | `cuda` | `--target-platform cuda` (default) | Fully supported |
| AMD ROCm | `rocm` | `--target-platform rocm` | Supported |
| Intel XPU | `xpu` | `--target-platform xpu` | Supported |

### AMD ROCm Notes

When targeting AMD ROCm, KernelAgent automatically:
- Uses `rocprofv3` (or `rocprof` as fallback) for hardware profiling
- Applies ROCm-specific Triton block/wave occupancy hints
- Generates appropriate device availability checks

See `examples/configs/amd.yaml` for a ready-to-use MI300X configuration.

### Intel XPU Notes

When targeting Intel XPU, KernelAgent automatically:
Expand All @@ -237,9 +271,9 @@ When targeting Intel XPU, KernelAgent automatically:

### Verifying Platform Setup
```python
# Check CUDA availability
# Check CUDA/ROCm availability
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA/ROCm available:", torch.cuda.is_available())

# Check XPU availability
print("XPU available:", hasattr(torch, 'xpu') and torch.xpu.is_available())
Expand Down
39 changes: 39 additions & 0 deletions examples/configs/amd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# AMD platform config for MI300X
#
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --strategy amd \
# --config examples/configs/amd.yaml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# --config examples/configs/amd.yaml
# --strategy amd

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You actually flagged a typo in nvidia.yaml too (it's also supposed to be strategy)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can actually remove the --config arg; strategy will resolve it for us


strategy: beam_search
num_workers: 4
strategy_config:
num_top_kernels: 2
num_bottlenecks: 2
openai_model: gpt-5
high_reasoning_effort: true

# Worker configuration
benchmark_warmup: 25
benchmark_repeat: 100
divergence_threshold: 50.0
target_platform: rocm
gpu_name: "AMD Instinct MI300X"

platform:
# Manager-level components
verifier: rocm
benchmarker: rocm
worker_runner: rocm
# Worker-level components
specs_provider: rocm
profiler: rocm
roofline_analyzer: rocm
bottleneck_analyzer: rocm
rag_prescriber: rocm

templates:
kernel_optimization: triton_kernel_agent/templates/kernel_optimization.j2
reflexion_prompt: triton_kernel_agent/templates/reflexion_prompt.j2
triton_guidelines: triton_kernel_agent/templates/triton_guidelines.j2
1 change: 1 addition & 0 deletions examples/configs/nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --strategy nvidia \
# --config examples/configs/nvidia.yaml

strategy: beam_search
Expand Down
2 changes: 1 addition & 1 deletion examples/run_opt_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
_CONFIGS_DIR = Path(__file__).resolve().parent / "configs"

# Available strategies and their config files.
_STRATEGIES = ["beam_search", "greedy", "noop", "nvidia"]
_STRATEGIES = ["beam_search", "greedy", "noop", "nvidia", "amd"]


def _run_strategy(
Expand Down
95 changes: 93 additions & 2 deletions kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@

This module contains the GPU hardware specifications database used for
performance analysis and bottleneck identification. Updated to include
specific SKU variants for multi-SKU GPUs like A100 and H100.
specific SKU variants for multi-SKU GPUs like A100 and H100, and AMD
Instinct GPUs for ROCm support.

Sources:
- NVIDIA official specifications and datasheets
- AMD official specifications and datasheets
- TechPowerUp GPU Database
- Manufacturer datasheets

Last Updated: January 2026
Last Updated: March 2026
"""

from types import MappingProxyType
Expand Down Expand Up @@ -181,6 +183,95 @@
"form_factor": "PCIe",
"tdp_w": 360,
},
# -----------------------------------------------------------------------
# AMD Instinct GPU SKUs (ROCm / HIP)
# -----------------------------------------------------------------------
# AMD Instinct MI300X (CDNA3 / gfx942)
# Sources: AMD product page, Hot Chips 35 (2023)
"AMD Instinct MI300X": {
"name": "AMD Instinct MI300X",
"architecture": "CDNA3",
"gfx_target": "gfx942",
"peak_fp32_tflops": 163.4,
"peak_fp16_tflops": 1307.4, # BF16/FP16 matrix (without sparsity)
"peak_bf16_tflops": 1307.4,
"peak_memory_bw_gbps": 5300, # 5.3 TB/s HBM3
"cu_count": 304, # Compute Units (AMD equiv of SM)
"sm_count": 304, # Alias for compatibility
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048, # Alias for compatibility
"wavefront_size": 64,
"l1_cache_kb": 32, # L1 per CU (vector L1D)
"l2_cache_mb": 256, # Total Infinity Cache (across all dies)
"memory_gb": 192,
"memory_type": "HBM3",
"form_factor": "OAM",
"tdp_w": 750,
},
# AMD Instinct MI300A (CDNA3 / gfx942, APU variant)
"AMD Instinct MI300A": {
"name": "AMD Instinct MI300A",
"architecture": "CDNA3",
"gfx_target": "gfx942",
"peak_fp32_tflops": 122.6,
"peak_fp16_tflops": 980.6,
"peak_bf16_tflops": 980.6,
"peak_memory_bw_gbps": 3200, # Unified HBM3 (shared with CPU)
"cu_count": 228,
"sm_count": 228,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 32,
"l2_cache_mb": 192,
"memory_gb": 128,
"memory_type": "HBM3",
"form_factor": "OAM",
"tdp_w": 550,
},
# AMD Instinct MI350X (CDNA4 / gfx950)
# Sources: AMD press release (Nov 2024), estimated specs
"AMD Instinct MI350X": {
"name": "AMD Instinct MI350X",
"architecture": "CDNA4",
"gfx_target": "gfx950",
"peak_fp32_tflops": 288.0,
"peak_fp16_tflops": 2304.0, # BF16 matrix estimate
"peak_bf16_tflops": 2304.0,
"peak_memory_bw_gbps": 8000, # ~8 TB/s HBM3E
"cu_count": 304,
"sm_count": 304,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 32,
"l2_cache_mb": 256,
"memory_gb": 288,
"memory_type": "HBM3E",
"form_factor": "OAM",
"tdp_w": 1000,
},
# AMD Instinct MI250X (CDNA2 / gfx90a)
"AMD Instinct MI250X": {
"name": "AMD Instinct MI250X",
"architecture": "CDNA2",
"gfx_target": "gfx90a",
"peak_fp32_tflops": 47.9,
"peak_fp16_tflops": 383.0,
"peak_bf16_tflops": 383.0,
"peak_memory_bw_gbps": 3277, # HBM2e
"cu_count": 220,
"sm_count": 220,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 16,
"l2_cache_mb": 32,
"memory_gb": 128,
"memory_type": "HBM2e",
"form_factor": "OAM",
"tdp_w": 560,
},
}

# Make database read-only to prevent accidental modification
Expand Down
Loading
Loading