diff --git a/.buildkite/pipeline_lib.sh b/.buildkite/pipeline_lib.sh index ced606e47a..bf61b3cdea 100644 --- a/.buildkite/pipeline_lib.sh +++ b/.buildkite/pipeline_lib.sh @@ -53,8 +53,12 @@ gen_bazel_test_steps() { echo " - \"bazel-testlogs.tar.gz\"" echo " timeout_in_minutes: 20" echo " retry:" + echo " manual:" + echo " permit_on_passed: true" echo " automatic:" echo " - exit_status: -1 # Agent was lost" echo " - exit_status: 255 # Forced agent shutdown" + echo " - exit_status: 3 # Test may be flaky or it just didn't pass" + echo " limit: 2" done } diff --git a/acceptance/router_benchmark/test.py b/acceptance/router_benchmark/test.py index f72d66e4ee..df2f753c22 100644 --- a/acceptance/router_benchmark/test.py +++ b/acceptance/router_benchmark/test.py @@ -20,7 +20,7 @@ from collections import namedtuple from plumbum import cli -from plumbum.cmd import docker, whoami +from plumbum.cmd import docker, whoami, cat from plumbum import cmd from acceptance.common import base @@ -34,11 +34,11 @@ # Those values are valid expectations only when running in the CI environment. TEST_CASES = { - 'in': 270000, - 'out': 240000, - 'in_transit': 270000, - 'out_transit': 240000, - 'br_transit': 260000, + 'in': 290000, + 'out': 290000, + 'in_transit': 250000, + 'out_transit': 250000, + 'br_transit': 290000, } @@ -60,6 +60,12 @@ def mac_for_ip(ip: str) -> str: return 'f0:0d:ca:fe:{:02x}:{:02x}'.format(int(ipBytes[2]), int(ipBytes[3])) +# Dump the cpu info into the log just to inform our thoughts on performance variability. +def log_cpu_info(): + cpu_info = cat('/proc/cpuinfo') + logger.info(f"CPU INFO BEGINS\n{cpu_info}\nCPU_INFO ENDS") + + class RouterBMTest(base.TestBase): """ Tests that the implementation of a router has sufficient performance in terms of packets @@ -148,6 +154,9 @@ def create_interface(self, req: IntfReq, ns: str): def setup_prepare(self): super().setup_prepare() + # As the name implies. + log_cpu_info() + # get the config where the router can find it. 
shutil.copytree("acceptance/router_benchmark/conf/", self.artifacts / "conf") @@ -221,13 +230,16 @@ def teardown(self): def execBrLoad(self, case: str, mapArgs: list[str], count: int) -> str: brload = self.get_executable("brload") + # For num-streams, attempt to distribute uniformly on many possible number of cores. + # 840 is a multiple of 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 15, 20, 21, 24, 28, ... + # We can't go too far down that road; the test has to build one packet for each stream. return sudo(brload.executable, "run", "--artifacts", self.artifacts, *mapArgs, "--case", case, "--num-packets", str(count), - "--num-streams", "2") + "--num-streams", "840") def runTestCase(self, case: str, mapArgs: list[str]): logger.info(f"==> Starting load {case}") @@ -242,9 +254,8 @@ def runTestCase(self, case: str, mapArgs: list[str]): # The raw metrics are expressed in terms of core*seconds. We convert to machine*seconds # which allows us to provide a projected packet/s; ...more intuitive than packets/core*s. - # We're interested only in br_transit traffic. We measure the rate over 10s. For best - # results we sample the end of the middle 10s of the run. "beg" is the start time of the - # real action and "end" is the end time. + # We measure the rate over 10s. For best results we sample the end of the middle 10s of the + # run. "beg" is the start time of the real action and "end" is the end time. sampleTime = (int(beg) + int(end) + 10) / 2 promQuery = urlencode({ 'time': f'{sampleTime}', @@ -300,9 +311,9 @@ def _run(self): for label, intf in self.intfMap.items(): mapArgs.extend(["--interface", f"{label}={intf.name},{intf.mac},{intf.peerMac}"]) - # Run one (10% size) test as warm-up to trigger the frequency scaling, else the first test + # Run one (30% size) test as warm-up to trigger the frequency scaling, else the first test # gets much lower performance. 
- self.execBrLoad(list(TEST_CASES)[0], mapArgs, 1000000) + self.execBrLoad(list(TEST_CASES)[0], mapArgs, 3000000) # At long last, run the tests rateMap = {} diff --git a/router/config/config.go b/router/config/config.go index 711bd73b61..c4d78bc684 100644 --- a/router/config/config.go +++ b/router/config/config.go @@ -74,11 +74,24 @@ func (cfg *RouterConfig) Validate() error { func (cfg *RouterConfig) InitDefaults() { // NumProcessors is the number of goroutines used to handle the processing queue. - // By default, there are as many as cores allowed by Go and other goroutines displace - // the packet processors sporadically. It may be either good or bad to create more - // processors (plus the other goroutines) than there are cores... experience will tell. + // It has been observed that allowing the packet processors to starve the other tasks was + // counterproductive. We get much better results by setting two cores aside for other go + // routines (such as for input and output). It remains to be seen if even more cores need to be + // set aside on large core-count systems. + if cfg.NumProcessors == 0 { - cfg.NumProcessors = runtime.GOMAXPROCS(0) + // Do what we think is best, so in most cases there's no need for an explicit config. + maxProcs := runtime.GOMAXPROCS(0) + if maxProcs > 3 { + // Two for I/Os, two or more for processing. + cfg.NumProcessors = maxProcs - 2 + } else if maxProcs > 1 { + // I/Os <= processing. + cfg.NumProcessors = maxProcs - 1 + } else { + // No choice. + cfg.NumProcessors = maxProcs + } } if cfg.NumSlowPathProcessors == 0 {