Speed Up Benchmarks in Test (#7129)

gfx-rs · Feb 13, 2025 · 03a01df
1 parent 2f50426
commit 03a01df
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 16 deletions.
diff --git a/benches/benches/bind_groups.rs b/benches/benches/bind_groups.rs
@@ -7,7 +7,17 @@ use criterion::{criterion_group, Criterion, Throughput};
 use nanorand::{Rng, WyRand};
 use std::sync::LazyLock;
 
-use crate::DeviceState;
+use crate::{is_test, DeviceState};
+
+// Creating 50_000 textures takes a considerable amount of time with syncval enabled.
+//
+// We greatly reduce the number of textures for the test case to keep the runtime
+// reasonable for testing.
+const MAX_TEXTURE_COUNT_BENCHMARK: u32 = 50_000;
+const TEXTURE_COUNTS_BENCHMARK: &[u32] = &[5, 50, 500, 5_000, 50_000];
+
+const MAX_TEXTURE_COUNT_TEST: u32 = 5;
+const TEXTURE_COUNTS_TEST: &[u32] = &[5];
 
 struct BindGroupState {
     device_state: DeviceState,
@@ -19,16 +29,20 @@ impl BindGroupState {
     fn new() -> Self {
         let device_state = DeviceState::new();
 
-        const TEXTURE_COUNT: u32 = 50_000;
+        let texture_count = if is_test() {
+            MAX_TEXTURE_COUNT_TEST
+        } else {
+            MAX_TEXTURE_COUNT_BENCHMARK
+        };
 
         // Performance gets considerably worse if the resources are shuffled.
         //
         // This more closely matches the real-world use case where resources have no
         // well defined usage order.
         let mut random = WyRand::new_seed(0x8BADF00D);
 
-        let mut texture_views = Vec::with_capacity(TEXTURE_COUNT as usize);
-        for i in 0..TEXTURE_COUNT {
+        let mut texture_views = Vec::with_capacity(texture_count as usize);
+        for i in 0..texture_count {
             let texture = device_state
                 .device
                 .create_texture(&wgpu::TextureDescriptor {
@@ -64,7 +78,13 @@ fn run_bench(ctx: &mut Criterion) {
 
     let mut group = ctx.benchmark_group("Bind Group Creation");
 
-    for count in [5, 50, 500, 5_000, 50_000] {
+    let count_list = if is_test() {
+        TEXTURE_COUNTS_TEST
+    } else {
+        TEXTURE_COUNTS_BENCHMARK
+    };
+
+    for &count in count_list {
         group.throughput(Throughput::Elements(count as u64));
         group.bench_with_input(
             format!("{} Element Bind Group", count),

diff --git a/benches/benches/computepass.rs b/benches/benches/computepass.rs
@@ -8,12 +8,12 @@ use nanorand::{Rng, WyRand};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;
 
-use crate::DeviceState;
+use crate::{is_test, DeviceState};
 
 fn dispatch_count() -> usize {
     // When testing we only want to run a very lightweight version of the benchmark
     // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
         8
     } else {
         10_000
@@ -28,13 +28,21 @@ fn dispatch_count() -> usize {
 fn dispatch_count_bindless() -> usize {
     // On CI we only want to run a very lightweight version of the benchmark
     // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
         8
     } else {
         1_000
     }
 }
 
+fn thread_count_list() -> &'static [usize] {
+    if is_test() {
+        &[2]
+    } else {
+        &[2, 4, 8]
+    }
+}
+
 // Must match the number of textures in the computepass.wgsl shader
 const TEXTURES_PER_DISPATCH: usize = 2;
 const STORAGE_TEXTURES_PER_DISPATCH: usize = 2;
@@ -437,7 +445,7 @@ fn run_bench(ctx: &mut Criterion) {
     group.throughput(Throughput::Elements(dispatch_count as _));
 
     for time_submit in [false, true] {
-        for cpasses in [1, 2, 4, 8] {
+        for &cpasses in thread_count_list() {
             let dispatch_per_pass = dispatch_count / cpasses;
 
             let label = if time_submit {
@@ -493,7 +501,7 @@ fn run_bench(ctx: &mut Criterion) {
     let mut group = ctx.benchmark_group("Computepass: Multi Threaded");
     group.throughput(Throughput::Elements(dispatch_count as _));
 
-    for threads in [2, 4, 8] {
+    for &threads in thread_count_list() {
         let dispatch_per_pass = dispatch_count / threads;
         group.bench_function(
             format!("{threads} threads x {dispatch_per_pass} dispatch"),

diff --git a/benches/benches/renderpass.rs b/benches/benches/renderpass.rs
@@ -8,18 +8,26 @@ use nanorand::{Rng, WyRand};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;
 
-use crate::DeviceState;
+use crate::{is_test, DeviceState};
 
 fn draw_count() -> usize {
     // When testing we only want to run a very lightweight version of the benchmark
     // to ensure that it does not break.
-    if std::env::var("NEXTEST").is_ok() {
+    if is_test() {
         8
     } else {
         10_000
     }
 }
 
+fn thread_count_list() -> &'static [usize] {
+    if is_test() {
+        &[2]
+    } else {
+        &[1, 2, 4, 8]
+    }
+}
+
 // Must match the number of textures in the renderpass.wgsl shader
 const TEXTURES_PER_DRAW: usize = 7;
 const VERTEX_BUFFERS_PER_DRAW: usize = 2;
@@ -438,7 +446,7 @@ fn run_bench(ctx: &mut Criterion) {
     group.throughput(Throughput::Elements(draw_count as _));
 
     for time_submit in [false, true] {
-        for rpasses in [1, 2, 4, 8] {
+        for &rpasses in thread_count_list() {
             let draws_per_pass = draw_count / rpasses;
 
             let label = if time_submit {
@@ -499,7 +507,7 @@ fn run_bench(ctx: &mut Criterion) {
     let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
     group.throughput(Throughput::Elements(draw_count as _));
 
-    for threads in [2, 4, 8] {
+    for &threads in thread_count_list() {
         let draws_per_pass = draw_count / threads;
         group.bench_function(format!("{threads} threads x {draws_per_pass} draws"), |b| {
             LazyLock::force(&state);

diff --git a/benches/benches/resource_creation.rs b/benches/benches/resource_creation.rs
@@ -4,7 +4,15 @@ use criterion::{criterion_group, Criterion, Throughput};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::sync::LazyLock;
 
-use crate::DeviceState;
+use crate::{is_test, DeviceState};
+
+fn thread_count_list() -> &'static [usize] {
+    if is_test() {
+        &[2]
+    } else {
+        &[1, 2, 4, 8]
+    }
+}
 
 fn run_bench(ctx: &mut Criterion) {
     let state = LazyLock::new(DeviceState::new);
@@ -14,7 +22,7 @@ fn run_bench(ctx: &mut Criterion) {
     let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
     group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));
 
-    for threads in [1, 2, 4, 8] {
+    for &threads in thread_count_list() {
         let resources_per_thread = RESOURCES_TO_CREATE / threads;
         group.bench_function(
             format!("{threads} threads x {resources_per_thread} resource"),

diff --git a/benches/benches/root.rs b/benches/benches/root.rs
@@ -7,6 +7,10 @@ mod renderpass;
 mod resource_creation;
 mod shader;
 
+fn is_test() -> bool {
+    std::env::var("NEXTEST").is_ok()
+}
+
 struct DeviceState {
     adapter_info: wgpu::AdapterInfo,
     device: wgpu::Device,