ROCm · sanyalington · Jan 17, 2025 · Jan 20, 2025 · Jan 20, 2025 · Jan 21, 2025
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
@@ -9,8 +9,9 @@
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
                         create_kv_caches_with_random)
 
-NUM_BLOCKS = 1024 * 1024
+NUM_BLOCKS = 128 * 1024
 PARTITION_SIZE = 512
+PARTITION_SIZE_ROCM = 256
 
 
 @torch.inference_mode()
@@ -78,9 +79,12 @@ def main(
     # Prepare for the paged attention kernel.
     output = torch.empty_like(query)
     if version == "v2":
-        if current_platform.is_rocm() and not args.custom_paged_attn:
+        if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024
+            if not args.custom_paged_attn:
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -163,6 +167,8 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                         kv_cache_dtype,
                         k_scale,
                         v_scale,
+                        None,
+                        PARTITION_SIZE,
                     )
             else:
                 raise ValueError(f"Invalid version: {version}")
@@ -176,13 +182,13 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     # Warmup.
     print("Warming up...")
     run_benchmark = run_cuda_benchmark
-    run_benchmark(num_iters=3, profile=False)
+    run_benchmark(num_iters=500, profile=False)
 
     # Benchmark.
     if do_profile:
         latency = run_benchmark(num_iters=1, profile=True)
     else:
-        latency = run_benchmark(num_iters=1000, profile=False)
+        latency = run_benchmark(num_iters=10000, profile=False)
     print(f"Kernel running time: {latency * 1000000:.3f} us")