From 766ecdcdcb93bdc245fb45961acd00662c9cfa44 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 14:07:06 -0500 Subject: [PATCH 1/7] Add warnings about too few or too many samples --- pyperf/_bench.py | 29 +++++++++++++++++++++++++++++ pyperf/_cli.py | 13 +++++++++++++ 2 files changed, 42 insertions(+) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 5d9bcf34..a0cc80cf 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -424,6 +424,35 @@ def median_abs_dev(self): raise ValueError("MAD must be >= 0") return value + def required_nsamples(self): + """ + Determines the number of samples that would be required to have 95% + certainty that the samples have a variance of less than 1%. + + This is described in this Wikipedia article about estimating the sampling of + a mean: + + https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean + """ + # Get the means of the values per run + values = [] + for run in self._runs: + values.append(statistics.mean(run.values)) + + total = math.fsum(values) + mean = total / len(values) + stddev = statistics.stdev(values) + # Normalize the stddev so we can target "percentage changed" rather than + # absolute time + sigma = stddev / mean + + # 95% certainty + Z = 1.96 + # 1% variation + W = 0.01 + + return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) + def percentile(self, p): if not (0 <= p <= 100): raise ValueError("p must be in the range [0; 100]") diff --git a/pyperf/_cli.py b/pyperf/_cli.py index fba0ada8..41dc6f49 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -413,6 +413,8 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append + required_nsamples = bench.required_samples() + # Display a warning if the standard deviation is greater than 10% # of the mean if len(values) >= 2: @@ -421,6 +423,10 @@ def format_checks(bench, lines=None): if percent >= 10.0: warn("the standard deviation (%s) is %.0f%% of the mean (%s)" % (bench.format_value(stdev), 
percent, bench.format_value(mean))) + else: + # display a warning if the number of samples isn't enough to get a stable result + if required_nsamples > len(bench._runs): + warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") # Minimum and maximum, detect obvious outliers for minimum, value in ( @@ -457,6 +463,13 @@ def format_checks(bench, lines=None): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") + if required_nsamples < len(bench._runs) * 0.75: + lines.append("Benchmark was run more times than necessary to get a stable result.") + lines.append( + "Consider passing processes=%d to the Runner constructor to save time." % + required_nsamples + ) + # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata for run in bench._runs: cpu_config = run._metadata.get('cpu_config') From 0e3dbc196825a56f6f3ab89e6c9ff6f2495ce805 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 14:20:57 -0500 Subject: [PATCH 2/7] Fix typo --- pyperf/_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 41dc6f49..db270617 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -413,7 +413,7 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append - required_nsamples = bench.required_samples() + required_nsamples = bench.required_nsamples() # Display a warning if the standard deviation is greater than 10% # of the mean From 7c47af18feb2f4030469c7ffd94209715c12c42c Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 15:03:04 -0500 Subject: [PATCH 3/7] Fix tests --- pyperf/__main__.py | 5 ++++- pyperf/_bench.py | 6 +++++- pyperf/_cli.py | 10 ++++++++-- pyperf/tests/test_perf_cli.py | 15 ++++++++++++--- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pyperf/__main__.py b/pyperf/__main__.py index aee19d18..639ffeb4 100644 
--- a/pyperf/__main__.py +++ b/pyperf/__main__.py @@ -491,10 +491,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False, empty_line(output) output.extend(lines) + contains_warning = False for line in output: + if line.startswith("WARNING:"): + contains_warning = True print(line) - if not output and only_checks: + if not contains_warning and only_checks: if len(data) == 1: print("The benchmark seems to be stable") else: diff --git a/pyperf/_bench.py b/pyperf/_bench.py index a0cc80cf..6278e8a9 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -437,7 +437,11 @@ def required_nsamples(self): # Get the means of the values per run values = [] for run in self._runs: - values.append(statistics.mean(run.values)) + if len(run.values): + values.append(statistics.mean(run.values)) + + if len(values) < 2: + return None total = math.fsum(values) mean = total / len(values) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index db270617..266031e9 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -425,7 +425,10 @@ def format_checks(bench, lines=None): % (bench.format_value(stdev), percent, bench.format_value(mean))) else: # display a warning if the number of samples isn't enough to get a stable result - if required_nsamples > len(bench._runs): + if ( + required_nsamples is not None and + required_nsamples > len(bench._runs) + ): warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") # Minimum and maximum, detect obvious outliers @@ -463,7 +466,10 @@ def format_checks(bench, lines=None): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") - if required_nsamples < len(bench._runs) * 0.75: + if ( + required_nsamples is not None and + required_nsamples < len(bench._runs) * 0.75 + ): lines.append("Benchmark was run more times than necessary to get a stable result.") lines.append( "Consider passing processes=%d to the Runner 
constructor to save time." % diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 6423c52b..8ce62cc8 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -478,11 +478,16 @@ def test_hist(self): 22.8 ms: 3 ############## 22.9 ms: 4 ################### 22.9 ms: 4 ################### + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'hist', TELCO, env=env) def test_show(self): expected = (""" + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. + Mean +- std dev: 22.5 ms +- 0.2 ms """) self.check_command(expected, 'show', TELCO) @@ -518,6 +523,8 @@ def test_stats(self): 100th percentile: 22.9 ms (+2% of the mean) -- maximum Number of outlier (out of 22.0 ms..23.0 ms): 0 + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. 
""") self.check_command(expected, 'stats', TELCO) @@ -628,8 +635,10 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) - self.assertEqual(stdout.rstrip(), - 'The benchmark seems to be stable') + self.assertTrue( + 'The benchmark seems to be stable' in + stdout.rstrip() + ) def test_command(self): command = [sys.executable, '-c', 'pass'] @@ -689,7 +698,7 @@ def _check_track_memory(self, track_option): '[1,2]*1000', '-o', tmp_name) bench = pyperf.Benchmark.load(tmp_name) - + self._check_track_memory_bench(bench, loops=5) def test_track_memory(self): From a685a14d1efc588111dc00d4f6350d6a4fa6818e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 18 Dec 2024 14:03:54 -0500 Subject: [PATCH 4/7] Address comments in the PR --- doc/api.rst | 13 +++++++++++++ pyperf/__main__.py | 3 ++- pyperf/_bench.py | 20 ++++++++++++++------ pyperf/_cli.py | 19 ++++++++++--------- pyperf/tests/test_perf_cli.py | 15 ++++++++------- 5 files changed, 47 insertions(+), 23 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 5647d157..538ceec9 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -345,6 +345,19 @@ Benchmark class Raise an exception if the benchmark has no values. + .. method:: required_nprocesses() + + Determines the number of separate process runs that would be required to achieve stable results. Specifically, the target is to have 95% certainty that there is a variance of less than 1%. If the result is greater than the number of processes recorded in the input data, the value is meaningless and only means "more samples are required". + + The method used is described in this Wikipedia article about estimating the sampling of a mean: + + https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean + .. method:: update_metadata(metadata: dict) Update metadata of all runs of the benchmark. 
diff --git a/pyperf/__main__.py b/pyperf/__main__.py index 639ffeb4..572253ae 100644 --- a/pyperf/__main__.py +++ b/pyperf/__main__.py @@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False, dump=dump, checks=checks, result=result, - display_runs_args=display_runs_args) + display_runs_args=display_runs_args, + only_checks=only_checks) if bench_lines: empty_line(lines) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 6278e8a9..5faa7361 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -424,17 +424,23 @@ def median_abs_dev(self): raise ValueError("MAD must be >= 0") return value - def required_nsamples(self): + def required_nprocesses(self): """ - Determines the number of samples that would be required to have 95% - certainty that the samples have a variance of less than 1%. + Determines the number of separate process runs that would be required to + achieve stable results. Specifically, the target is to have 95% + certainty that there is a variance of less than 1%. If the result is + greater than the number of processes recorded in the input data, the + value is meaningless and only means "more samples are required". - This is described in this Wikipedia article about estimating the sampling of - a mean: + The method used is described in this Wikipedia article about estimating + the sampling of a mean: https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean """ - # Get the means of the values per run + # Get the means of the values per process. The values within the process + # often vary considerably (e.g. due to cache effects), but the variances + # between processes should be fairly consistent. Additionally, this + # value is intended to be advice for the number of processes to run. 
values = [] for run in self._runs: if len(run.values): @@ -446,6 +452,7 @@ def required_nsamples(self): total = math.fsum(values) mean = total / len(values) stddev = statistics.stdev(values) + # Normalize the stddev so we can target "percentage changed" rather than # absolute time sigma = stddev / mean @@ -455,6 +462,7 @@ def required_nsamples(self): # 1% variation W = 0.01 + # (4Z²σ²)/(W²) return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) def percentile(self, p): diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 266031e9..6625e19d 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -400,7 +400,7 @@ def value_bucket(value): return lines -def format_checks(bench, lines=None): +def format_checks(bench, lines=None, check_too_many_processes=False): if lines is None: lines = [] @@ -413,7 +413,7 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append - required_nsamples = bench.required_nsamples() + required_nprocesses = bench.required_nprocesses() # Display a warning if the standard deviation is greater than 10% # of the mean @@ -426,8 +426,8 @@ def format_checks(bench, lines=None): else: # display a warning if the number of samples isn't enough to get a stable result if ( - required_nsamples is not None and - required_nsamples > len(bench._runs) + required_nprocesses is not None and + required_nprocesses > len(bench._runs) ): warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") @@ -467,13 +467,14 @@ def format_checks(bench, lines=None): lines.append("Use --quiet option to hide these warnings.") if ( - required_nsamples is not None and - required_nsamples < len(bench._runs) * 0.75 + check_too_many_processes and + required_nprocesses is not None and + required_nprocesses < len(bench._runs) * 0.75 ): lines.append("Benchmark was run more times than necessary to get a stable result.") lines.append( "Consider passing processes=%d to the Runner constructor to save time." 
% - required_nsamples + required_nprocesses ) # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata @@ -568,7 +569,7 @@ def format_result(bench): def format_benchmark(bench, checks=True, metadata=False, dump=False, stats=False, hist=False, show_name=False, - result=True, display_runs_args=None): + result=True, display_runs_args=None, only_checks=False): lines = [] if metadata: @@ -587,7 +588,7 @@ def format_benchmark(bench, checks=True, metadata=False, format_stats(bench, lines=lines) if checks: - format_checks(bench, lines=lines) + format_checks(bench, lines=lines, check_too_many_processes=only_checks) if result: empty_line(lines) diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 8ce62cc8..0e01380f 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -478,16 +478,11 @@ def test_hist(self): 22.8 ms: 3 ############## 22.9 ms: 4 ################### 22.9 ms: 4 ################### - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'hist', TELCO, env=env) def test_show(self): expected = (""" - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. - Mean +- std dev: 22.5 ms +- 0.2 ms """) self.check_command(expected, 'show', TELCO) @@ -523,8 +518,6 @@ def test_stats(self): 100th percentile: 22.9 ms (+2% of the mean) -- maximum Number of outlier (out of 22.0 ms..23.0 ms): 0 - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'stats', TELCO) @@ -635,6 +628,14 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) + self.assertTrue( + textwrap.dedent( + """ + Benchmark was run more times than necessary to get a stable result. 
+ Consider passing processes=7 to the Runner constructor to save time. + """ + ).strip() in stdout.rstrip() + ) self.assertTrue( 'The benchmark seems to be stable' in stdout.rstrip() From 56dfad1f8f106eaefad86f9bd2d53f17ee3fe4ea Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 18 Dec 2024 14:07:27 -0500 Subject: [PATCH 5/7] Don't compute required_nprocesses unless necessary --- pyperf/_cli.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 6625e19d..10686261 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -412,8 +412,7 @@ def format_checks(bench, lines=None, check_too_many_processes=False): mean = bench.mean() warnings = [] warn = warnings.append - - required_nprocesses = bench.required_nprocesses() + required_nprocesses = None # Display a warning if the standard deviation is greater than 10% # of the mean @@ -425,6 +424,7 @@ def format_checks(bench, lines=None, check_too_many_processes=False): % (bench.format_value(stdev), percent, bench.format_value(mean))) else: # display a warning if the number of samples isn't enough to get a stable result + required_nprocesses = bench.required_nprocesses() if ( required_nprocesses is not None and required_nprocesses > len(bench._runs) @@ -466,16 +466,18 @@ def format_checks(bench, lines=None, check_too_many_processes=False): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") - if ( - check_too_many_processes and - required_nprocesses is not None and - required_nprocesses < len(bench._runs) * 0.75 - ): - lines.append("Benchmark was run more times than necessary to get a stable result.") - lines.append( - "Consider passing processes=%d to the Runner constructor to save time." 
% - required_nprocesses - ) + if check_too_many_processes: + if required_nprocesses is None: + required_nprocesses = bench.required_nprocesses() + if ( + required_nprocesses is not None and + required_nprocesses < len(bench._runs) * 0.75 + ): + lines.append("Benchmark was run more times than necessary to get a stable result.") + lines.append( + "Consider passing processes=%d to the Runner constructor to save time." % + required_nprocesses + ) # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata for run in bench._runs: From 980003e384c93701e36decee63c3a45d6e71e5eb Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 19 Dec 2024 09:32:31 -0500 Subject: [PATCH 6/7] Update pyperf/_bench.py Co-authored-by: Victor Stinner --- pyperf/_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 5faa7361..20e90349 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -463,7 +463,7 @@ def required_nprocesses(self): W = 0.01 # (4Z²σ²)/(W²) - return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) + return math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)) def percentile(self, p): if not (0 <= p <= 100): From a02bff90fb2f9c6e6fe27813aa024036d8221b11 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 19 Dec 2024 09:57:29 -0500 Subject: [PATCH 7/7] Use assertIn --- pyperf/tests/test_perf_cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 0e01380f..fbb94037 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -628,16 +628,16 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) - self.assertTrue( + self.assertIn( textwrap.dedent( """ Benchmark was run more times than necessary to get a stable result. Consider passing processes=7 to the Runner constructor to save time. 
""" - ).strip() in stdout.rstrip() + ).strip(), stdout.rstrip() ) - self.assertTrue( - 'The benchmark seems to be stable' in + self.assertIn( + 'The benchmark seems to be stable', stdout.rstrip() )