Skip to content

Commit

Permalink
router: make the benchmark and the router more adaptive to core count (
Browse files Browse the repository at this point in the history
…#4456)

In the router. Compute a better default for the number of processors based on multiple observations: given enough streams to occupy all processors, performance is degraded if there are as many processors as cores. Best performance is obtained by setting aside two cores for other tasks. Making this the default simplifies the benchmarking process by not having to configure an override for each type of target machine being tested.

In the benchmark. Run the test with many streams. It reflects reality and allows a very parallel router to perform at its best without distorting the performance of a less-parallel one. Make sure that the number of streams is likely to be a multiple of the number of cores so the cores are loaded evenly in spite of there being typically fewer streams than in real life. (TBD, learn the router's internal processors count and just use that).

As a side effect:
* Adjust CI's expectations for the benchmark
* Add automatic retries (2) for failed CI tests.
* Allow retrying successful CI tests (helps investigating performance variations).
* Add cpu_info to the router_benchmark output (same reason).
  • Loading branch information
jiceatscion authored Dec 12, 2023
1 parent f338000 commit b878f8f
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 16 deletions.
4 changes: 4 additions & 0 deletions .buildkite/pipeline_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,12 @@ gen_bazel_test_steps() {
echo " - \"bazel-testlogs.tar.gz\""
echo " timeout_in_minutes: 20"
echo " retry:"
echo " manual:"
echo " permit_on_passed: true"
echo " automatic:"
echo " - exit_status: -1 # Agent was lost"
echo " - exit_status: 255 # Forced agent shutdown"
echo " - exit_status: 3 # Test may be flaky or it just didn't pass"
echo " limit: 2"
done
}
35 changes: 23 additions & 12 deletions acceptance/router_benchmark/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from collections import namedtuple
from plumbum import cli
from plumbum.cmd import docker, whoami
from plumbum.cmd import docker, whoami, cat
from plumbum import cmd

from acceptance.common import base
Expand All @@ -34,11 +34,11 @@

# Those values are valid expectations only when running in the CI environment.
TEST_CASES = {
'in': 270000,
'out': 240000,
'in_transit': 270000,
'out_transit': 240000,
'br_transit': 260000,
'in': 290000,
'out': 290000,
'in_transit': 250000,
'out_transit': 250000,
'br_transit': 290000,
}


Expand All @@ -60,6 +60,12 @@ def mac_for_ip(ip: str) -> str:
return 'f0:0d:ca:fe:{:02x}:{:02x}'.format(int(ipBytes[2]), int(ipBytes[3]))


# Dump the cpu info into the log just to inform our thoughts on performance variability.
def log_cpu_info():
    """Log the full contents of /proc/cpuinfo.

    Purely informational: records the CPU model/frequency of the machine running
    the benchmark so that performance variations between runs can be explained.
    """
    cpu_info = cat('/proc/cpuinfo')
    # Matched BEGIN/END markers so the (long) dump is easy to locate and extract.
    logger.info(f"CPU INFO BEGINS\n{cpu_info}\nCPU INFO ENDS")


class RouterBMTest(base.TestBase):
"""
Tests that the implementation of a router has sufficient performance in terms of packets
Expand Down Expand Up @@ -148,6 +154,9 @@ def create_interface(self, req: IntfReq, ns: str):
def setup_prepare(self):
super().setup_prepare()

    # As the name implies.
log_cpu_info()

# get the config where the router can find it.
shutil.copytree("acceptance/router_benchmark/conf/", self.artifacts / "conf")

Expand Down Expand Up @@ -221,13 +230,16 @@ def teardown(self):

def execBrLoad(self, case: str, mapArgs: list[str], count: int) -> str:
    """Run one brload traffic case against the router under test.

    Args:
        case: name of the brload test case (e.g. "br_transit").
        mapArgs: extra "--interface label=name,mac,peerMac" arguments.
        count: total number of packets to send.

    Returns:
        The captured output of the brload run.
    """
    brload = self.get_executable("brload")
    # For num-streams, attempt to distribute uniformly on many possible number of cores.
    # 840 is a multiple of 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 15, 20, 21, 24, 28, ...
    # We can't go too far down that road; the test has to build one packet for each stream.
    return sudo(brload.executable,
                "run",
                "--artifacts", self.artifacts,
                *mapArgs,
                "--case", case,
                "--num-packets", str(count),
                "--num-streams", "840")

def runTestCase(self, case: str, mapArgs: list[str]):
logger.info(f"==> Starting load {case}")
Expand All @@ -242,9 +254,8 @@ def runTestCase(self, case: str, mapArgs: list[str]):

# The raw metrics are expressed in terms of core*seconds. We convert to machine*seconds
# which allows us to provide a projected packet/s; ...more intuitive than packets/core*s.
# We're interested only in br_transit traffic. We measure the rate over 10s. For best
# results we sample the end of the middle 10s of the run. "beg" is the start time of the
# real action and "end" is the end time.
# We measure the rate over 10s. For best results we sample the end of the middle 10s of the
# run. "beg" is the start time of the real action and "end" is the end time.
sampleTime = (int(beg) + int(end) + 10) / 2
promQuery = urlencode({
'time': f'{sampleTime}',
Expand Down Expand Up @@ -300,9 +311,9 @@ def _run(self):
for label, intf in self.intfMap.items():
mapArgs.extend(["--interface", f"{label}={intf.name},{intf.mac},{intf.peerMac}"])

# Run one (10% size) test as warm-up to trigger the frequency scaling, else the first test
# Run one (30% size) test as warm-up to trigger the frequency scaling, else the first test
# gets much lower performance.
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 1000000)
self.execBrLoad(list(TEST_CASES)[0], mapArgs, 3000000)

# At long last, run the tests
rateMap = {}
Expand Down
21 changes: 17 additions & 4 deletions router/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,24 @@ func (cfg *RouterConfig) Validate() error {
func (cfg *RouterConfig) InitDefaults() {

// NumProcessors is the number of goroutines used to handle the processing queue.
// By default, there are as many as cores allowed by Go and other goroutines displace
// the packet processors sporadically. It may be either good or bad to create more
// processors (plus the other goroutines) than there are cores... experience will tell.
// It has been observed that allowing the packet processors to starve the other tasks was
// counterproductive. We get much better results by setting two cores aside for other go
// routines (such as for input and output). It remains to be seen if even more cores need to be
// set aside on large core-count systems.

if cfg.NumProcessors == 0 {
cfg.NumProcessors = runtime.GOMAXPROCS(0)
// Do what we think is best, so in most cases there's no need for an explicit config.
maxProcs := runtime.GOMAXPROCS(0)
if maxProcs > 3 {
// Two for I/Os, two or more for processing.
cfg.NumProcessors = maxProcs - 2
} else if maxProcs > 1 {
// I/Os <= processing.
cfg.NumProcessors = maxProcs - 1
} else {
// No choice.
cfg.NumProcessors = maxProcs
}
}

if cfg.NumSlowPathProcessors == 0 {
Expand Down

0 comments on commit b878f8f

Please sign in to comment.