From 9c5980e8e2ec1d2db28031853b802093ba51a643 Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung"
Date: Sat, 1 Feb 2025 11:24:34 -0600
Subject: [PATCH] Additional tuning for grouped page attention kernel.

Changed:
- waves_per_eu
---
 .../sglang/srt/layers/attention/triton_ops/decode_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/srt/layers/attention/triton_ops/decode_attention.py b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
index 86bc15c1cb5..4ccf38b0588 100644
--- a/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
+++ b/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -436,7 +436,7 @@ def _decode_grouped_att_m_fwd(
     if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
-        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
 
     _fwd_grouped_kernel_stage1[grid](
         q,
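
For context on how these kwargs are consumed: the sketch below launches a trivial Triton kernel with the same **extra_kargs splat the patched call site uses. The kernel _scale_demo_kernel, its inputs, and the grid are made up for illustration; waves_per_eu, matrix_instr_nonkdim, and kpack are the AMD-backend launch options documented at the two URLs in the diff, and they only take effect on a ROCm build of Triton (a CUDA backend would reject them).

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def _scale_demo_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
        # Hypothetical kernel, standing in for _fwd_grouped_kernel_stage1.
        pid = tl.program_id(0)
        offs = pid * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        x = tl.load(x_ptr + offs, mask=mask)
        tl.store(out_ptr + offs, x * 2.0, mask=mask)


    x = torch.randn(1024, device="cuda")  # "cuda" also maps to ROCm devices in PyTorch
    out = torch.empty_like(x)

    # AMD-backend-only launch options, mirroring the patched line. Per the ROCm
    # tuning guide linked in the diff, waves_per_eu hints the target occupancy
    # per execution unit; dropping it from 4 to 1 relaxes that hint, leaving
    # more registers available to each wave, which is the tuning this patch applies.
    extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}

    grid = (triton.cdiv(x.numel(), 256),)
    _scale_demo_kernel[grid](x, out, x.numel(), BLOCK=256, num_warps=4, **extra_kargs)

Unrecognized keyword arguments on a Triton kernel launch are forwarded to the backend as compiler options, which is why the call site can pass the dict through unconditionally under is_hip_ without touching the kernel body.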