diff --git a/doc/api.rst b/doc/api.rst
index 5647d15..538ceec 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -345,6 +345,19 @@ Benchmark class
 
       Raise an exception if the benchmark has no values.
 
+   .. method:: required_nprocesses()
+
+      Determines the number of separate process runs that would be required
+      to achieve stable results. Specifically, the target is to have 95%
+      certainty that there is less than 1% variation. If the result is greater
+      than the number of processes recorded in the input data, the value is
+      meaningless and only means "more samples are required".
+
+      The method used is described in the Wikipedia article on estimating
+      the sample size needed to estimate a mean:
+
+      https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+
    .. method:: update_metadata(metadata: dict)
 
       Update metadata of all runs of the benchmark.
diff --git a/pyperf/__main__.py b/pyperf/__main__.py
index aee19d1..572253a 100644
--- a/pyperf/__main__.py
+++ b/pyperf/__main__.py
@@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                                        dump=dump,
                                        checks=checks,
                                        result=result,
-                                       display_runs_args=display_runs_args)
+                                       display_runs_args=display_runs_args,
+                                       only_checks=only_checks)
 
         if bench_lines:
             empty_line(lines)
@@ -491,10 +492,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
             empty_line(output)
         output.extend(lines)
 
+    contains_warning = False
     for line in output:
+        if line.startswith("WARNING:"):
+            contains_warning = True
         print(line)
 
-    if not output and only_checks:
+    if not contains_warning and only_checks:
         if len(data) == 1:
             print("The benchmark seems to be stable")
         else:
diff --git a/pyperf/_bench.py b/pyperf/_bench.py
index 5d9bcf3..20e9034 100644
--- a/pyperf/_bench.py
+++ b/pyperf/_bench.py
@@ -424,6 +424,47 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
+    def required_nprocesses(self):
+        """
+        Determines the number of separate process runs that would be required
+        to achieve stable results. Specifically, the target is to have 95%
+        certainty that there is less than 1% variation. If the result is
+        greater than the number of processes recorded in the input data, the
+        value is meaningless and only means "more samples are required".
+
+        The method used is described in the Wikipedia article on estimating
+        the sample size needed to estimate a mean:
+
+        https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+        """
+        # Get the mean of the values for each process. Values within a single
+        # process often vary considerably (e.g. due to cache effects), but the
+        # per-process means should be fairly consistent. Additionally, this
+        # value is intended as advice for the number of processes to run.
+        values = []
+        for run in self._runs:
+            if len(run.values):
+                values.append(statistics.mean(run.values))
+
+        if len(values) < 2:
+            return None
+
+        total = math.fsum(values)
+        mean = total / len(values)
+        stddev = statistics.stdev(values)
+
+        # Normalize the stddev so that the target can be expressed as a
+        # percentage of the mean rather than an absolute time
+        sigma = stddev / mean
+
+        # Z-score for 95% confidence
+        Z = 1.96
+        # Desired confidence interval width: 1% of the mean
+        W = 0.01
+
+        # n = (4 * Z² * σ²) / W², rounded up to the next integer
+        return math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))
+
     def percentile(self, p):
         if not (0 <= p <= 100):
             raise ValueError("p must be in the range [0; 100]")
diff --git a/pyperf/_cli.py b/pyperf/_cli.py
index fba0ada..1068626 100644
--- a/pyperf/_cli.py
+++ b/pyperf/_cli.py
@@ -400,7 +400,7 @@ def value_bucket(value):
     return lines
 
 
-def format_checks(bench, lines=None):
+def format_checks(bench, lines=None, check_too_many_processes=False):
     if lines is None:
         lines = []
 
@@ -412,6 +412,7 @@ def format_checks(bench, lines=None):
     mean = bench.mean()
     warnings = []
     warn = warnings.append
+    required_nprocesses = None
 
     # Display a warning if the standard deviation is greater than 10%
     # of the mean
@@ -421,6 +422,14 @@ def format_checks(bench, lines=None):
         if percent >= 10.0:
             warn("the standard deviation (%s) is %.0f%% of the mean (%s)"
                  % (bench.format_value(stdev), percent, bench.format_value(mean)))
+        else:
+            # Display a warning if the number of samples isn't enough to get a stable result
+            required_nprocesses = bench.required_nprocesses()
+            if (
+                required_nprocesses is not None and
+                required_nprocesses > len(bench._runs)
+            ):
+                warn("Not enough samples to get a stable result (95% certainty of less than 1% variation)")
 
     # Minimum and maximum, detect obvious outliers
     for minimum, value in (
@@ -457,6 +466,19 @@ def format_checks(bench, lines=None):
         lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.")
         lines.append("Use --quiet option to hide these warnings.")
 
+    if check_too_many_processes:
+        if required_nprocesses is None:
+            required_nprocesses = bench.required_nprocesses()
+        if (
+            required_nprocesses is not None and
+            required_nprocesses < len(bench._runs) * 0.75
+        ):
+            lines.append("Benchmark was run more times than necessary to get a stable result.")
+            lines.append(
+                "Consider passing processes=%d to the Runner constructor to save time." %
+                required_nprocesses
+            )
+
     # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
     for run in bench._runs:
         cpu_config = run._metadata.get('cpu_config')
@@ -549,7 +571,7 @@ def format_result(bench):
 
 def format_benchmark(bench, checks=True, metadata=False,
                      dump=False, stats=False, hist=False, show_name=False,
-                     result=True, display_runs_args=None):
+                     result=True, display_runs_args=None, only_checks=False):
     lines = []
 
     if metadata:
@@ -568,7 +590,7 @@ def format_benchmark(bench, checks=True, metadata=False,
         format_stats(bench, lines=lines)
 
     if checks:
-        format_checks(bench, lines=lines)
+        format_checks(bench, lines=lines, check_too_many_processes=only_checks)
 
     if result:
         empty_line(lines)
diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py
index 6423c52..fbb9403 100644
--- a/pyperf/tests/test_perf_cli.py
+++ b/pyperf/tests/test_perf_cli.py
@@ -628,8 +628,18 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
-        self.assertEqual(stdout.rstrip(),
-                         'The benchmark seems to be stable')
+        self.assertIn(
+            textwrap.dedent(
+                """
+                Benchmark was run more times than necessary to get a stable result.
+                Consider passing processes=7 to the Runner constructor to save time.
+                """
+            ).strip(), stdout.rstrip()
+        )
+        self.assertIn(
+            'The benchmark seems to be stable',
+            stdout.rstrip()
+        )
 
     def test_command(self):
         command = [sys.executable, '-c', 'pass']
@@ -689,7 +699,7 @@ def _check_track_memory(self, track_option):
                            '[1,2]*1000',
                            '-o', tmp_name)
             bench = pyperf.Benchmark.load(tmp_name)
-
+
             self._check_track_memory_bench(bench, loops=5)
 
     def test_track_memory(self):
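Note on the computation added in pyperf/_bench.py: the patch implements the sample-size estimate n = ceil(4 * Z^2 * sigma^2 / W^2), where sigma is the standard deviation of the per-process means divided by the overall mean, Z = 1.96 (95% confidence) and W = 0.01 (a 1%-wide interval). A minimal standalone sketch of that formula follows; the function name required_samples is illustrative only and not part of pyperf:

    import math

    def required_samples(sigma, z=1.96, w=0.01):
        # n = (4 * Z**2 * sigma**2) / W**2, rounded up to the next integer
        return math.ceil((4 * z ** 2 * sigma ** 2) / (w ** 2))

    # A relative standard deviation of 5% gives ceil(384.16) = 385 process runs
    print(required_samples(0.05))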
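Usage sketch for the new method against a saved result file; "bench.json" is a placeholder path, Benchmark.load() and get_nrun() are existing pyperf APIs, and required_nprocesses() is the method added by this patch:

    import pyperf

    bench = pyperf.Benchmark.load("bench.json")  # placeholder path
    needed = bench.required_nprocesses()
    if needed is None:
        print("fewer than 2 runs with values: cannot estimate")
    elif needed > bench.get_nrun():
        print("unstable: roughly %d process runs would be needed" % needed)
    else:
        print("stable: %d runs recorded, about %d needed" % (bench.get_nrun(), needed))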