From 9c5980e8e2ec1d2db28031853b802093ba51a643 Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung"
Date: Sat, 1 Feb 2025 11:24:34 -0600
Subject: [PATCH] Additional tuning for grouped page attention kernel.

Changed:
- waves_per_eu
---
 .../sglang/srt/layers/attention/triton_ops/decode_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/decode_attention.py b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
index 86bc15c1cb5..4ccf38b0588 100644
--- a/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -436,7 +436,7 @@ def _decode_grouped_att_m_fwd(
     if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
-        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
 
     _fwd_grouped_kernel_stage1[grid](
         q,
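
For context on how these kwargs are consumed: the sketch below launches a trivial Triton kernel with the same **extra_kargs splat the patched call site uses. The kernel _scale_demo_kernel, its inputs, and the grid are made up for illustration; waves_per_eu, matrix_instr_nonkdim, and kpack are the AMD-backend launch options documented at the two URLs in the diff, and they only take effect on a ROCm build of Triton (a CUDA backend would reject them).

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def _scale_demo_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
        # Hypothetical kernel, standing in for _fwd_grouped_kernel_stage1.
        pid = tl.program_id(0)
        offs = pid * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask)
        tl.store(out_ptr + offs, x * 2.0, mask=mask)


    x = torch.randn(1024, device="cuda")  # "cuda" also maps to ROCm devices in PyTorch
    out = torch.empty_like(x)

    # AMD-backend-only launch options, mirroring the patched line. Per the ROCm
    # tuning guide linked in the diff, waves_per_eu hints the target occupancy
    # per execution unit; dropping it from 4 to 1 relaxes that hint, leaving
    # more registers available to each wave, which is the tuning this patch applies.
    extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}

    grid = (triton.cdiv(x.numel(), 256),)
    _scale_demo_kernel[grid](x, out, x.numel(), BLOCK=256, num_warps=4, **extra_kargs)

Unrecognized keyword arguments on a Triton kernel launch are forwarded to the backend as compiler options, which is why the call site can pass the dict through unconditionally under is_hip_ without touching the kernel body.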