Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Fuser/auto_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def __init__(
ignore_router_config: bool = False,
use_router_cache: bool = True,
no_cusolver: bool = False,
test_timeout_s: int = 30,
test_timeout_s: int = 300,
test_code: str | None = None,
) -> None:
self.ka_model = ka_model
Expand Down
2 changes: 1 addition & 1 deletion Fuser/config/autoagent_default.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@ target_platform: cuda
ignore_router_config: false
use_router_cache: true
no_cusolver: false
test_timeout_s: 30
test_timeout_s: 300
test_code: null
2 changes: 1 addition & 1 deletion Fuser/dispatch_kernel_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ def run(
target_platform: str = "cuda",
max_iters: int = 10,
no_cusolver: bool = False,
test_timeout_s: int = 30,
test_timeout_s: int = 300,
) -> Path:
"""Dispatch subgraphs to KernelAgent with optional parallelism.
Expand Down
2 changes: 1 addition & 1 deletion Fuser/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def run_pipeline(
verify: bool = True,
compose_max_iters: int = 5,
target_platform: str = "cuda",
test_timeout_s: int = 30,
test_timeout_s: int = 300,
) -> dict:
# Select default KernelAgent model if not provided: prefer GPT-5 for Level 2/3
if dispatch_model is None:
Expand Down
38 changes: 36 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Every stage writes artifacts to a run directory under `.optimize/<run_id>/`, inc
- Linux or macOS
- **GPU Requirements (one of the following):**
- **CUDA**: NVIDIA GPU with CUDA support
- **ROCm**: AMD GPU with ROCm 6.x+ (e.g., Instinct MI300X)
- **XPU**: Intel GPU with oneAPI support (Arc, Data Center GPUs, or integrated Xe graphics)
- Triton (installed separately: `pip install triton` or nightly from source)
- PyTorch (https://pytorch.org/get-started/locally/)
Expand All @@ -42,6 +43,29 @@ pip install -e .

### Platform-Specific PyTorch Installation

#### AMD ROCm (AMD GPUs)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
```

**Note:** AMD ROCm support requires:
- ROCm 6.x installed and `rocprofv3` (or `rocprof`) on `$PATH`
- Compatible AMD GPU (e.g., Instinct MI300X)

For optimization, use the bundled config as a quickstart:
```bash
python examples/run_opt_manager.py \
--kernel-dir examples/optimize_01_matvec/ \
--config examples/configs/amd.yaml
```

Verify your ROCm installation:
```python
import torch
print(torch.cuda.is_available()) # True if ROCm PyTorch detects GPU
print(torch.version.hip) # Should print the HIP/ROCm version
```

#### Intel XPU (Intel GPUs)
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
Expand Down Expand Up @@ -225,8 +249,18 @@ KernelAgent supports multiple GPU platforms for Triton kernel execution:
| Platform | Device String | Flag | Status |
|----------|---------------|------|--------|
| NVIDIA CUDA | `cuda` | `--target-platform cuda` (default) | Fully supported |
| AMD ROCm | `rocm` | `--target-platform rocm` | Supported |
| Intel XPU | `xpu` | `--target-platform xpu` | Supported |

### AMD ROCm Notes

When targeting AMD ROCm, KernelAgent automatically:
- Uses `rocprofv3` (or `rocprof` as fallback) for hardware profiling
- Applies ROCm-specific Triton block/wave occupancy hints
- Generates appropriate device availability checks

See `examples/configs/amd.yaml` for a ready-to-use MI300X configuration.

### Intel XPU Notes

When targeting Intel XPU, KernelAgent automatically:
Expand All @@ -237,9 +271,9 @@ When targeting Intel XPU, KernelAgent automatically:

### Verifying Platform Setup
```python
# Check CUDA availability
# Check CUDA/ROCm availability
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA/ROCm available:", torch.cuda.is_available())

# Check XPU availability
print("XPU available:", hasattr(torch, 'xpu') and torch.xpu.is_available())
Expand Down
39 changes: 39 additions & 0 deletions examples/configs/amd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# AMD platform config for MI300X
#
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --strategy amd \
# --config examples/configs/amd.yaml
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# --config examples/configs/amd.yaml
# --strategy amd

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You actually flagged a typo in nvidia.yaml too (it's also supposed to be strategy)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can actually remove the --config arg; strategy will resolve it for us


strategy: beam_search
num_workers: 4
strategy_config:
num_top_kernels: 2
num_bottlenecks: 2
openai_model: gpt-5
high_reasoning_effort: true

# Worker configuration
benchmark_warmup: 25
benchmark_repeat: 100
divergence_threshold: 50.0
target_platform: rocm
gpu_name: "AMD Instinct MI300X"

platform:
# Manager-level components
verifier: rocm
benchmarker: rocm
worker_runner: rocm
# Worker-level components
specs_provider: rocm
profiler: rocm
roofline_analyzer: rocm
bottleneck_analyzer: rocm
rag_prescriber: rocm

templates:
kernel_optimization: triton_kernel_agent/templates/kernel_optimization.j2
reflexion_prompt: triton_kernel_agent/templates/reflexion_prompt.j2
triton_guidelines: triton_kernel_agent/templates/triton_guidelines.j2
1 change: 1 addition & 0 deletions examples/configs/nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --strategy nvidia \
# --config examples/configs/nvidia.yaml

strategy: beam_search
Expand Down
2 changes: 1 addition & 1 deletion examples/run_opt_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
_CONFIGS_DIR = Path(__file__).resolve().parent / "configs"

# Available strategies and their config files.
_STRATEGIES = ["beam_search", "greedy", "noop", "nvidia"]
_STRATEGIES = ["beam_search", "greedy", "noop", "nvidia", "amd"]


def _run_strategy(
Expand Down
95 changes: 93 additions & 2 deletions kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@

This module contains the GPU hardware specifications database used for
performance analysis and bottleneck identification. Updated to include
specific SKU variants for multi-SKU GPUs like A100 and H100.
specific SKU variants for multi-SKU GPUs like A100 and H100, and AMD
Instinct GPUs for ROCm support.

Sources:
- NVIDIA official specifications and datasheets
- AMD official specifications and datasheets
- TechPowerUp GPU Database
- Manufacturer datasheets

Last Updated: January 2026
Last Updated: March 2026
"""

from types import MappingProxyType
Expand Down Expand Up @@ -181,6 +183,95 @@
"form_factor": "PCIe",
"tdp_w": 360,
},
# -----------------------------------------------------------------------
# AMD Instinct GPU SKUs (ROCm / HIP)
# -----------------------------------------------------------------------
# AMD Instinct MI300X (CDNA3 / gfx942)
# Sources: AMD product page, Hot Chips 35 (2023)
"AMD Instinct MI300X": {
"name": "AMD Instinct MI300X",
"architecture": "CDNA3",
"gfx_target": "gfx942",
"peak_fp32_tflops": 163.4,
"peak_fp16_tflops": 1307.4, # BF16/FP16 matrix (without sparsity)
"peak_bf16_tflops": 1307.4,
"peak_memory_bw_gbps": 5300, # 5.3 TB/s HBM3
"cu_count": 304, # Compute Units (AMD equiv of SM)
"sm_count": 304, # Alias for compatibility
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048, # Alias for compatibility
"wavefront_size": 64,
"l1_cache_kb": 32, # L1 per CU (vector L1D)
"l2_cache_mb": 256, # Total Infinity Cache (across all dies)
"memory_gb": 192,
"memory_type": "HBM3",
"form_factor": "OAM",
"tdp_w": 750,
},
# AMD Instinct MI300A (CDNA3 / gfx942, APU variant)
"AMD Instinct MI300A": {
"name": "AMD Instinct MI300A",
"architecture": "CDNA3",
"gfx_target": "gfx942",
"peak_fp32_tflops": 122.6,
"peak_fp16_tflops": 980.6,
"peak_bf16_tflops": 980.6,
"peak_memory_bw_gbps": 3200, # Unified HBM3 (shared with CPU)
"cu_count": 228,
"sm_count": 228,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 32,
"l2_cache_mb": 192,
"memory_gb": 128,
"memory_type": "HBM3",
"form_factor": "OAM",
"tdp_w": 550,
},
# AMD Instinct MI350X (CDNA4 / gfx950)
# Sources: AMD press release (Nov 2024), estimated specs
"AMD Instinct MI350X": {
"name": "AMD Instinct MI350X",
"architecture": "CDNA4",
"gfx_target": "gfx950",
"peak_fp32_tflops": 288.0,
"peak_fp16_tflops": 2304.0, # BF16 matrix estimate
"peak_bf16_tflops": 2304.0,
"peak_memory_bw_gbps": 8000, # ~8 TB/s HBM3E
"cu_count": 304,
"sm_count": 304,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 32,
"l2_cache_mb": 256,
"memory_gb": 288,
"memory_type": "HBM3E",
"form_factor": "OAM",
"tdp_w": 1000,
},
# AMD Instinct MI250X (CDNA2 / gfx90a)
"AMD Instinct MI250X": {
"name": "AMD Instinct MI250X",
"architecture": "CDNA2",
"gfx_target": "gfx90a",
"peak_fp32_tflops": 47.9,
"peak_fp16_tflops": 383.0,
"peak_bf16_tflops": 383.0,
"peak_memory_bw_gbps": 3277, # HBM2e
"cu_count": 220,
"sm_count": 220,
"max_threads_per_cu": 2048,
"max_threads_per_sm": 2048,
"wavefront_size": 64,
"l1_cache_kb": 16,
"l2_cache_mb": 32,
"memory_gb": 128,
"memory_type": "HBM2e",
"form_factor": "OAM",
"tdp_w": 560,
},
}

# Make database read-only to prevent accidental modification
Expand Down
Loading
Loading