diff --git a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py index bf8d52cb1..41471067d 100644 --- a/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py +++ b/fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py @@ -2991,7 +2991,7 @@ def get_full_non_persistent_tuning_space(): - # For now we see better perf with num_stages=0 for all gemm configs we care + # For now we use num_stages=2 for all gemm configs we care about # But keep this explicit so that we do not forget we may need to set it to # other values in the future - num_stage_range = [0] + num_stage_range = [2] waves_per_eu_range = [0] matrix_instr_nonkdim_range = [16, 32] kpack_range = [1, 2]