From 766ecdcdcb93bdc245fb45961acd00662c9cfa44 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 14:07:06 -0500 Subject: [PATCH 1/7] Add warnings about too few or too many samples --- pyperf/_bench.py | 29 +++++++++++++++++++++++++++++ pyperf/_cli.py | 13 +++++++++++++ 2 files changed, 42 insertions(+) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 5d9bcf34..a0cc80cf 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -424,6 +424,35 @@ def median_abs_dev(self): raise ValueError("MAD must be >= 0") return value + def required_nsamples(self): + """ + Determines the number of samples that would be required to have 95% + certainty that the samples have a variance of less than 1%. + + This is described in this Wikipedia article about estimating the sampling of + a mean: + + https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean + """ + # Get the means of the values per run + values = [] + for run in self._runs: + values.append(statistics.mean(run.values)) + + total = math.fsum(values) + mean = total / len(values) + stddev = statistics.stdev(values) + # Normalize the stddev so we can target "percentage changed" rather than + # absolute time + sigma = stddev / mean + + # 95% certainty + Z = 1.96 + # 1% variation + W = 0.01 + + return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) + def percentile(self, p): if not (0 <= p <= 100): raise ValueError("p must be in the range [0; 100]") diff --git a/pyperf/_cli.py b/pyperf/_cli.py index fba0ada8..41dc6f49 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -413,6 +413,8 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append + required_nsamples = bench.required_samples() + # Display a warning if the standard deviation is greater than 10% # of the mean if len(values) >= 2: @@ -421,6 +423,10 @@ def format_checks(bench, lines=None): if percent >= 10.0: warn("the standard deviation (%s) is %.0f%% of the mean (%s)" % (bench.format_value(stdev), 
percent, bench.format_value(mean))) + else: + # display a warning if the number of samples isn't enough to get a stable result + if required_nsamples > len(bench._runs): + warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") # Minimum and maximum, detect obvious outliers for minimum, value in ( @@ -457,6 +463,13 @@ def format_checks(bench, lines=None): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") + if required_nsamples < len(bench._runs) * 0.75: + lines.append("Benchmark was run more times than necessary to get a stable result.") + lines.append( + "Consider passing processes=%d to the Runner constructor to save time." % + required_nsamples + ) + # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata for run in bench._runs: cpu_config = run._metadata.get('cpu_config') From 0e3dbc196825a56f6f3ab89e6c9ff6f2495ce805 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 14:20:57 -0500 Subject: [PATCH 2/7] Fix typo --- pyperf/_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 41dc6f49..db270617 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -413,7 +413,7 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append - required_nsamples = bench.required_samples() + required_nsamples = bench.required_nsamples() # Display a warning if the standard deviation is greater than 10% # of the mean From 7c47af18feb2f4030469c7ffd94209715c12c42c Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 17 Dec 2024 15:03:04 -0500 Subject: [PATCH 3/7] Fix tests --- pyperf/__main__.py | 5 ++++- pyperf/_bench.py | 6 +++++- pyperf/_cli.py | 10 ++++++++-- pyperf/tests/test_perf_cli.py | 15 ++++++++++++--- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pyperf/__main__.py b/pyperf/__main__.py index aee19d18..639ffeb4 100644 
--- a/pyperf/__main__.py +++ b/pyperf/__main__.py @@ -491,10 +491,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False, empty_line(output) output.extend(lines) + contains_warning = False for line in output: + if line.startswith("WARNING:"): + contains_warning = True print(line) - if not output and only_checks: + if not contains_warning and only_checks: if len(data) == 1: print("The benchmark seems to be stable") else: diff --git a/pyperf/_bench.py b/pyperf/_bench.py index a0cc80cf..6278e8a9 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -437,7 +437,11 @@ def required_nsamples(self): # Get the means of the values per run values = [] for run in self._runs: - values.append(statistics.mean(run.values)) + if len(run.values): + values.append(statistics.mean(run.values)) + + if len(values) < 2: + return None total = math.fsum(values) mean = total / len(values) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index db270617..266031e9 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -425,7 +425,10 @@ def format_checks(bench, lines=None): % (bench.format_value(stdev), percent, bench.format_value(mean))) else: # display a warning if the number of samples isn't enough to get a stable result - if required_nsamples > len(bench._runs): + if ( + required_nsamples is not None and + required_nsamples > len(bench._runs) + ): warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") # Minimum and maximum, detect obvious outliers @@ -463,7 +466,10 @@ def format_checks(bench, lines=None): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") - if required_nsamples < len(bench._runs) * 0.75: + if ( + required_nsamples is not None and + required_nsamples < len(bench._runs) * 0.75 + ): lines.append("Benchmark was run more times than necessary to get a stable result.") lines.append( "Consider passing processes=%d to the Runner 
constructor to save time." % diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 6423c52b..8ce62cc8 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -478,11 +478,16 @@ def test_hist(self): 22.8 ms: 3 ############## 22.9 ms: 4 ################### 22.9 ms: 4 ################### + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'hist', TELCO, env=env) def test_show(self): expected = (""" + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. + Mean +- std dev: 22.5 ms +- 0.2 ms """) self.check_command(expected, 'show', TELCO) @@ -518,6 +523,8 @@ def test_stats(self): 100th percentile: 22.9 ms (+2% of the mean) -- maximum Number of outlier (out of 22.0 ms..23.0 ms): 0 + Benchmark was run more times than necessary to get a stable result. + Consider passing processes=7 to the Runner constructor to save time. 
""") self.check_command(expected, 'stats', TELCO) @@ -628,8 +635,10 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) - self.assertEqual(stdout.rstrip(), - 'The benchmark seems to be stable') + self.assertTrue( + 'The benchmark seems to be stable' in + stdout.rstrip() + ) def test_command(self): command = [sys.executable, '-c', 'pass'] @@ -689,7 +698,7 @@ def _check_track_memory(self, track_option): '[1,2]*1000', '-o', tmp_name) bench = pyperf.Benchmark.load(tmp_name) - + self._check_track_memory_bench(bench, loops=5) def test_track_memory(self): From a685a14d1efc588111dc00d4f6350d6a4fa6818e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 18 Dec 2024 14:03:54 -0500 Subject: [PATCH 4/7] Address comments in the PR --- doc/api.rst | 13 +++++++++++++ pyperf/__main__.py | 3 ++- pyperf/_bench.py | 20 ++++++++++++++------ pyperf/_cli.py | 19 ++++++++++--------- pyperf/tests/test_perf_cli.py | 15 ++++++++------- 5 files changed, 47 insertions(+), 23 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 5647d157..538ceec9 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -345,6 +345,19 @@ Benchmark class Raise an exception if the benchmark has no values. + .. method:: required_nprocesses() + + Determines the number of separate process runs that would be required to achieve stable results. Specifically, the target is to have 95% certainty that there is a variance of less than 1%. If the result is greater than the number of processes recorded in the input data, the value is meaningless and only means "more samples are required". + + The method used is described in this Wikipedia article about estimating the sampling of a mean: + + https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean + .. method:: update_metadata(metadata: dict) Update metadata of all runs of the benchmark. 
diff --git a/pyperf/__main__.py b/pyperf/__main__.py index 639ffeb4..572253ae 100644 --- a/pyperf/__main__.py +++ b/pyperf/__main__.py @@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False, dump=dump, checks=checks, result=result, - display_runs_args=display_runs_args) + display_runs_args=display_runs_args, + only_checks=only_checks) if bench_lines: empty_line(lines) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 6278e8a9..5faa7361 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -424,17 +424,23 @@ def median_abs_dev(self): raise ValueError("MAD must be >= 0") return value - def required_nsamples(self): + def required_nprocesses(self): """ - Determines the number of samples that would be required to have 95% - certainty that the samples have a variance of less than 1%. + Determines the number of separate process runs that would be required to + achieve stable results. Specifically, the target is to have 95% + certainty that there is a variance of less than 1%. If the result is + greater than the number of processes recorded in the input data, the + value is meaningless and only means "more samples are required". - This is described in this Wikipedia article about estimating the sampling of - a mean: + The method used is described in this Wikipedia article about estimating + the sampling of a mean: https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean """ - # Get the means of the values per run + # Get the means of the values per process. The values within the process + # often vary considerably (e.g. due to cache effects), but the variances + # between processes should be fairly consistent. Additionally, this + # value is intended to be advice for the number of processes to run. 
values = [] for run in self._runs: if len(run.values): @@ -446,6 +452,7 @@ def required_nsamples(self): total = math.fsum(values) mean = total / len(values) stddev = statistics.stdev(values) + # Normalize the stddev so we can target "percentage changed" rather than # absolute time sigma = stddev / mean @@ -455,6 +462,7 @@ def required_nsamples(self): # 1% variation W = 0.01 + # (4Z²σ²)/(W²) return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) def percentile(self, p): diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 266031e9..6625e19d 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -400,7 +400,7 @@ def value_bucket(value): return lines -def format_checks(bench, lines=None): +def format_checks(bench, lines=None, check_too_many_processes=False): if lines is None: lines = [] @@ -413,7 +413,7 @@ def format_checks(bench, lines=None): warnings = [] warn = warnings.append - required_nsamples = bench.required_nsamples() + required_nprocesses = bench.required_nprocesses() # Display a warning if the standard deviation is greater than 10% # of the mean @@ -426,8 +426,8 @@ def format_checks(bench, lines=None): else: # display a warning if the number of samples isn't enough to get a stable result if ( - required_nsamples is not None and - required_nsamples > len(bench._runs) + required_nprocesses is not None and + required_nprocesses > len(bench._runs) ): warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)") @@ -467,13 +467,14 @@ def format_checks(bench, lines=None): lines.append("Use --quiet option to hide these warnings.") if ( - required_nsamples is not None and - required_nsamples < len(bench._runs) * 0.75 + check_too_many_processes and + required_nprocesses is not None and + required_nprocesses < len(bench._runs) * 0.75 ): lines.append("Benchmark was run more times than necessary to get a stable result.") lines.append( "Consider passing processes=%d to the Runner constructor to save time." 
% - required_nsamples + required_nprocesses ) # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata @@ -568,7 +569,7 @@ def format_result(bench): def format_benchmark(bench, checks=True, metadata=False, dump=False, stats=False, hist=False, show_name=False, - result=True, display_runs_args=None): + result=True, display_runs_args=None, only_checks=False): lines = [] if metadata: @@ -587,7 +588,7 @@ def format_benchmark(bench, checks=True, metadata=False, format_stats(bench, lines=lines) if checks: - format_checks(bench, lines=lines) + format_checks(bench, lines=lines, check_too_many_processes=only_checks) if result: empty_line(lines) diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 8ce62cc8..0e01380f 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -478,16 +478,11 @@ def test_hist(self): 22.8 ms: 3 ############## 22.9 ms: 4 ################### 22.9 ms: 4 ################### - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'hist', TELCO, env=env) def test_show(self): expected = (""" - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. - Mean +- std dev: 22.5 ms +- 0.2 ms """) self.check_command(expected, 'show', TELCO) @@ -523,8 +518,6 @@ def test_stats(self): 100th percentile: 22.9 ms (+2% of the mean) -- maximum Number of outlier (out of 22.0 ms..23.0 ms): 0 - Benchmark was run more times than necessary to get a stable result. - Consider passing processes=7 to the Runner constructor to save time. """) self.check_command(expected, 'stats', TELCO) @@ -635,6 +628,14 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) + self.assertTrue( + textwrap.dedent( + """ + Benchmark was run more times than necessary to get a stable result. 
+ Consider passing processes=7 to the Runner constructor to save time. + """ + ).strip() in stdout.rstrip() + ) self.assertTrue( 'The benchmark seems to be stable' in stdout.rstrip() From 56dfad1f8f106eaefad86f9bd2d53f17ee3fe4ea Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 18 Dec 2024 14:07:27 -0500 Subject: [PATCH 5/7] Don't compute required_nprocesses unless necessary --- pyperf/_cli.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pyperf/_cli.py b/pyperf/_cli.py index 6625e19d..10686261 100644 --- a/pyperf/_cli.py +++ b/pyperf/_cli.py @@ -412,8 +412,7 @@ def format_checks(bench, lines=None, check_too_many_processes=False): mean = bench.mean() warnings = [] warn = warnings.append - - required_nprocesses = bench.required_nprocesses() + required_nprocesses = None # Display a warning if the standard deviation is greater than 10% # of the mean @@ -425,6 +424,7 @@ def format_checks(bench, lines=None, check_too_many_processes=False): % (bench.format_value(stdev), percent, bench.format_value(mean))) else: # display a warning if the number of samples isn't enough to get a stable result + required_nprocesses = bench.required_nprocesses() if ( required_nprocesses is not None and required_nprocesses > len(bench._runs) @@ -466,16 +466,18 @@ def format_checks(bench, lines=None, check_too_many_processes=False): lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.") lines.append("Use --quiet option to hide these warnings.") - if ( - check_too_many_processes and - required_nprocesses is not None and - required_nprocesses < len(bench._runs) * 0.75 - ): - lines.append("Benchmark was run more times than necessary to get a stable result.") - lines.append( - "Consider passing processes=%d to the Runner constructor to save time." 
% - required_nprocesses - ) + if check_too_many_processes: + if required_nprocesses is None: + required_nprocesses = bench.required_nprocesses() + if ( + required_nprocesses is not None and + required_nprocesses < len(bench._runs) * 0.75 + ): + lines.append("Benchmark was run more times than necessary to get a stable result.") + lines.append( + "Consider passing processes=%d to the Runner constructor to save time." % + required_nprocesses + ) # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata for run in bench._runs: From 980003e384c93701e36decee63c3a45d6e71e5eb Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 19 Dec 2024 09:32:31 -0500 Subject: [PATCH 6/7] Update pyperf/_bench.py Co-authored-by: Victor Stinner --- pyperf/_bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyperf/_bench.py b/pyperf/_bench.py index 5faa7361..20e90349 100644 --- a/pyperf/_bench.py +++ b/pyperf/_bench.py @@ -463,7 +463,7 @@ def required_nprocesses(self): W = 0.01 # (4Z²σ²)/(W²) - return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))) + return math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)) def percentile(self, p): if not (0 <= p <= 100): From a02bff90fb2f9c6e6fe27813aa024036d8221b11 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 19 Dec 2024 09:57:29 -0500 Subject: [PATCH 7/7] Use assertIn --- pyperf/tests/test_perf_cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py index 0e01380f..fbb94037 100644 --- a/pyperf/tests/test_perf_cli.py +++ b/pyperf/tests/test_perf_cli.py @@ -628,16 +628,16 @@ def test_slowest(self): def test_check_stable(self): stdout = self.run_command('check', TELCO) - self.assertTrue( + self.assertIn( textwrap.dedent( """ Benchmark was run more times than necessary to get a stable result. Consider passing processes=7 to the Runner constructor to save time. 
""" - ).strip() in stdout.rstrip() + ).strip(), stdout.rstrip() ) - self.assertTrue( - 'The benchmark seems to be stable' in + self.assertIn( + 'The benchmark seems to be stable', stdout.rstrip() )