diff --git a/content/assets/zero-alloc-call-sched/app-naive/window.go b/content/assets/zero-alloc-call-sched/app-naive/window.go new file mode 100644 index 0000000..0fb542a --- /dev/null +++ b/content/assets/zero-alloc-call-sched/app-naive/window.go @@ -0,0 +1,68 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. +// +// The code below is produced by Changkun Ou . + +package app + +import ( + mainthread "x/mainthread-opt2" + "x/thread" + + "github.com/go-gl/glfw/v3.3/glfw" +) + +// Init initializes an app environment. +func Init() (err error) { + mainthread.Call(func() { err = glfw.Init() }) + return +} + +// Terminate terminates the entire application. +func Terminate() { + mainthread.Call(glfw.Terminate) +} + +// Win is a window. +type Win struct { + win *glfw.Window + th *thread.Thread +} + +// NewWindow constructs a new graphical window. +func NewWindow() (*Win, error) { + var ( + w = &Win{} + err error + ) + mainthread.Call(func() { + w.win, err = glfw.CreateWindow(640, 480, "golang.design/research", nil, nil) + if err != nil { + return + } + }) + if err != nil { + return nil, err + } + + w.win.MakeContextCurrent() + return w, nil +} + +// Run runs the given window and blocks until it is destroied. +func (w *Win) Run() { + for !w.win.ShouldClose() { + mainthread.Call(func() { + w.win.SwapBuffers() + // This function must be called from the main thread. + glfw.WaitEventsTimeout(1.0 / 30) + }) + } + // This function must be called from the mainthread. + mainthread.Call(w.win.Destroy) +} + +// Stop stops and closes the given window. 
+func (w *Win) Stop() { + w.win.SetShouldClose(true) +} diff --git a/content/assets/zero-alloc-call-sched/app/window.go b/content/assets/zero-alloc-call-sched/app/window.go index 1c0ee44..73d6801 100644 --- a/content/assets/zero-alloc-call-sched/app/window.go +++ b/content/assets/zero-alloc-call-sched/app/window.go @@ -41,6 +41,9 @@ func NewWindow() (*Win, error) { return } }) + if err != nil { + return nil, err + } // This function can be called from any thread. w.th.Call(w.win.MakeContextCurrent) diff --git a/content/assets/zero-alloc-call-sched/cmd/app1/main.go b/content/assets/zero-alloc-call-sched/cmd/app1/main.go new file mode 100644 index 0000000..5874379 --- /dev/null +++ b/content/assets/zero-alloc-call-sched/cmd/app1/main.go @@ -0,0 +1,79 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. +// +// The code below is produced by Changkun Ou . + +package main + +import ( + "flag" + "fmt" + "os" + "runtime/trace" + "time" + app "x/app-naive" + mainthread "x/mainthread-opt2" +) + +func main() { + mainthread.Init(fn) +} + +func fn() { + d := parseArgs() + + err := app.Init() + if err != nil { + panic(err) + } + defer app.Terminate() + w, err := app.NewWindow() + if err != nil { + panic(err) + } + + done := make(chan struct{}, 2) + go func() { + f, _ := os.Create(*traceF) + defer f.Close() + trace.Start(f) + defer trace.Stop() + time.Sleep(d) + w.Stop() + }() + go func() { + w.Run() + done <- struct{}{} + }() + <-done +} + +var ( + run *bool + traceF *string + traceT *string +) + +func parseArgs() time.Duration { + run = flag.Bool("run", false, "start test") + traceF = flag.String("trace", "trace.out", "trace file, default: trace.out") + traceT = flag.String("d", "2s", "trace duration, default: 10s") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, `usage: go run main.go -run [-trace FILENAME -d DURATION] +options: +`) + flag.PrintDefaults() + } + flag.Parse() + if !*run { + flag.Usage() + os.Exit(2) + } + + d, err := 
time.ParseDuration(*traceT) + if err != nil { + flag.Usage() + os.Exit(2) + } + return d +} diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt1/bench-2021-01-25-21:02:29.txt b/content/assets/zero-alloc-call-sched/mainthread-opt1/bench-2021-01-25-21:02:29.txt new file mode 100644 index 0000000..af92abe --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt1/bench-2021-01-25-21:02:29.txt @@ -0,0 +1,25 @@ +goos: darwin +goarch: arm64 +pkg: x/mainthread-opt1 +BenchmarkDirectCall-8 1000000000 0.9476 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9479 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9567 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9528 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9468 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9480 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9538 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9506 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9607 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9489 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 2752809 439.6 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2750701 438.7 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2725908 440.3 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2723350 439.4 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2754843 439.0 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2733372 446.4 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2738767 440.6 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2741732 439.8 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2748228 439.6 ns/op 24 B/op 1 allocs/op +BenchmarkMainThreadCall-8 2717796 439.7 ns/op 24 B/op 1 allocs/op +PASS +ok x/mainthread-opt1 28.462s diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread.go 
b/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread.go new file mode 100644 index 0000000..816b9d4 --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread.go @@ -0,0 +1,57 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. +// +// The code below is produced by Changkun Ou . + +package mainthread + +import ( + "runtime" + "sync" +) + +func init() { + runtime.LockOSThread() +} + +var ( + funcQ = make(chan func(), runtime.GOMAXPROCS(0)) + donePool = sync.Pool{New: func() interface{} { + return make(chan struct{}) + }} +) + +// Init initializes the functionality of running arbitrary subsequent +// functions be called on the main system thread. +// +// Init must be called in the main.main function. +func Init(main func()) { + done := donePool.Get().(chan struct{}) + defer donePool.Put(done) + + go func() { + main() + done <- struct{}{} + }() + + for { + select { + case f := <-funcQ: + f() + case <-done: + return + } + } +} + +// Call calls f on the main thread and blocks until f finishes. +func Call(f func()) { + done := donePool.Get().(chan struct{}) + defer donePool.Put(done) + + funcQ <- func() { + f() + done <- struct{}{} + } + <-done +} diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread_test.go b/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread_test.go new file mode 100644 index 0000000..d1ae518 --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt1/mainthread_test.go @@ -0,0 +1,31 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. +// +// The code below is produced by Changkun Ou . 
+ +package mainthread_test + +import ( + "testing" + mainthread "x/mainthread-opt1" +) + +var f = func() {} + +func BenchmarkDirectCall(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + f() + } +} + +func BenchmarkMainThreadCall(b *testing.B) { + mainthread.Init(func() { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + mainthread.Call(f) + } + }) +} diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt2/bench-2021-01-25-21:08:53.txt b/content/assets/zero-alloc-call-sched/mainthread-opt2/bench-2021-01-25-21:08:53.txt new file mode 100644 index 0000000..04b26d5 --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt2/bench-2021-01-25-21:08:53.txt @@ -0,0 +1,25 @@ +goos: darwin +goarch: arm64 +pkg: x/mainthread-opt2 +BenchmarkDirectCall-8 1000000000 0.9449 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9478 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9559 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9471 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9576 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9493 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9468 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9581 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9493 ns/op 0 B/op 0 allocs/op +BenchmarkDirectCall-8 1000000000 0.9480 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3299989 364.9 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3277143 365.3 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3275292 371.2 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3293971 366.1 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3299977 364.0 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3024205 367.1 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3294015 375.2 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3288907 364.9 ns/op 0 B/op 0 allocs/op 
+BenchmarkMainThreadCall-8 3285415 366.0 ns/op 0 B/op 0 allocs/op +BenchmarkMainThreadCall-8 3081945 368.7 ns/op 0 B/op 0 allocs/op +PASS +ok x/mainthread-opt2 26.493s diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread.go b/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread.go new file mode 100644 index 0000000..c1aab44 --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread.go @@ -0,0 +1,60 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. +// +// The code below is produced by Changkun Ou . + +package mainthread + +import ( + "runtime" + "sync" +) + +func init() { + runtime.LockOSThread() +} + +var ( + funcQ = make(chan funcdata, runtime.GOMAXPROCS(0)) + donePool = sync.Pool{New: func() interface{} { + return make(chan struct{}) + }} +) + +type funcdata struct { + fn func() + done chan struct{} +} + +// Init initializes the functionality of running arbitrary subsequent +// functions be called on the main system thread. +// +// Init must be called in the main.main function. +func Init(main func()) { + done := donePool.Get().(chan struct{}) + defer donePool.Put(done) + + go func() { + main() + done <- struct{}{} + }() + + for { + select { + case fdata := <-funcQ: + fdata.fn() + fdata.done <- struct{}{} + case <-done: + return + } + } +} + +// Call calls f on the main thread and blocks until f finishes. +func Call(f func()) { + done := donePool.Get().(chan struct{}) + defer donePool.Put(done) + + funcQ <- funcdata{fn: f, done: done} + <-done +} diff --git a/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread_test.go b/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread_test.go new file mode 100644 index 0000000..e23e53b --- /dev/null +++ b/content/assets/zero-alloc-call-sched/mainthread-opt2/mainthread_test.go @@ -0,0 +1,31 @@ +// Copyright (c) 2021 The golang.design Initiative Authors. +// All rights reserved. 
+// +// The code below is produced by Changkun Ou . + +package mainthread_test + +import ( + "testing" + mainthread "x/mainthread-opt2" +) + +var f = func() {} + +func BenchmarkDirectCall(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + f() + } +} + +func BenchmarkMainThreadCall(b *testing.B) { + mainthread.Init(func() { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + mainthread.Call(f) + } + }) +} diff --git a/content/assets/zero-alloc-call-sched/naive-sched-delay.png b/content/assets/zero-alloc-call-sched/naive-sched-delay.png new file mode 100644 index 0000000..520d04c Binary files /dev/null and b/content/assets/zero-alloc-call-sched/naive-sched-delay.png differ diff --git a/content/assets/zero-alloc-call-sched/naive-sched-trace.png b/content/assets/zero-alloc-call-sched/naive-sched-trace.png new file mode 100644 index 0000000..9d0792d Binary files /dev/null and b/content/assets/zero-alloc-call-sched/naive-sched-trace.png differ diff --git a/content/assets/zero-alloc-call-sched/opt.png b/content/assets/zero-alloc-call-sched/opt.png new file mode 100644 index 0000000..910f65e Binary files /dev/null and b/content/assets/zero-alloc-call-sched/opt.png differ diff --git a/content/assets/zero-alloc-call-sched/opt2-trace-end.png b/content/assets/zero-alloc-call-sched/opt2-trace-end.png new file mode 100644 index 0000000..06e264f Binary files /dev/null and b/content/assets/zero-alloc-call-sched/opt2-trace-end.png differ diff --git a/content/assets/zero-alloc-call-sched/opt2-trace-start.png b/content/assets/zero-alloc-call-sched/opt2-trace-start.png new file mode 100644 index 0000000..e653fee Binary files /dev/null and b/content/assets/zero-alloc-call-sched/opt2-trace-start.png differ diff --git a/content/posts/zero-alloc-call-sched.md b/content/posts/zero-alloc-call-sched.md index dda275b..0296a98 100644 --- a/content/posts/zero-alloc-call-sched.md +++ b/content/posts/zero-alloc-call-sched.md @@ -14,35 +14,602 @@ 
draft: true Author(s): [Changkun Ou](https://changkun.de) -GUI programming in Go is a little bit tricky. The infamous issue regarding interacting with the legacy GUI frameworks is that most of the graphics related APIs must be called from the main thread. This basically violates the concurrent nature of Go: A goroutine may be arbitrarily and randomly scheduled or rescheduled on different running threads, i.e., the same pice of code will be called from different threads over time, even without evolving the `go` keyword. +GUI programming in Go is a little bit tricky. The infamous issue +regarding interacting with the legacy GUI frameworks is that most of +the graphics related APIs must be called from the main thread. +This basically violates the concurrent nature of Go: A goroutine may be +arbitrarily and randomly scheduled or rescheduled on different running +threads, i.e., the same piece of code will be called from different +threads over time, even without involving the `go` keyword. - ## Background -TODO: +In multi-threaded programming, operating systems provide space, +the so-called Thread Local Storage (TLS) for each thread of a process +to store their private and local content. In the era where multithreaded +programming and scheduling algorithms were not rich enough, +the TLS feature was very useful to avoid data races since this storage is +purely local and guaranteed by the operating system. + +For example, a graphics rendering backend such as OpenGL Context was +designed to store the rendering context of each thread on TLS; +In macOS, the famous GUI framework Cocoa also requires rendering user +interfaces on a specific thread, that is the so-called *main thread*. ## The Main Thread -TODO: +In Go, as we all know, a goroutine will be scheduled on different +threads due to its internal work-stealing scheduler [^work-steal] [^go11sched]. + +With a work-stealing scheduler, goroutines are not promised to run on a specific +thread forever.
Instead, whenever a goroutine goes to sleep, or enters a +system call, or the Go runtime proactively interrupts the execution of +that goroutine, it is likely to be rescheduled to a different thread. +Therefore, if a rendering (OpenGL) context is stored on the old thread, +switching to a new thread will cause the loss of that old context, too. +**Because such an interruption can happen at any time and anywhere, +it is impossible to check if the goroutine remains on the same thread +when the execution resumes.** + +The original intention of designing such a scheduler is to eliminate +the concept of system threads and multiplex them. In this way, users won't +suffer from paying the cost of thread switch/sleep, whereas threads are +always in full power mode, constantly running tasks either from the +user or the runtime. + +### Method `runtime.LockOSThread` and Package `mainthread` + +If GUI applications must interact with the OS on the main thread, +how can we achieve the goal of running on a specific thread permanently? +Luckily, there is a method called `LockOSThread` offered by the +`runtime` package, which provides the exact feature we want: + +```go +// LockOSThread wires the calling goroutine to its current operating system thread. +// The calling goroutine will always execute in that thread, +// and no other goroutine will execute in it, +// until the calling goroutine has made as many calls to +// UnlockOSThread as to LockOSThread. +// If the calling goroutine exits without unlocking the thread, +// the thread will be terminated. +// +// All init functions are run on the startup thread. Calling LockOSThread +// from an init function will cause the main function to be invoked on +// that thread. +// +// A goroutine should call LockOSThread before calling OS services or +// non-Go library functions that depend on per-thread state. +func LockOSThread() +``` + +As the documentation of `LockOSThread` says: All `init` functions are run on +the startup thread.
Calling `LockOSThread` from an `init` function will +cause the main function to be invoked on that thread. + +If you think about that carefully, you will immediately realize this gives +us the opportunity to identify, at least, the main thread. +When we would like to wrap thread scheduling as a package `mainthread`, +we can do something like the following: + +```go +package mainthread // import "x/mainthread" + +import "runtime" + +func init() { + runtime.LockOSThread() +} + +// Init initializes the functionality of running arbitrary subsequent +// functions be called on the main system thread. +// +// Init must be called in the main.main function. +func Init(main func()) + +// Call calls f on the main thread and blocks until f finishes. +func Call(f func()) +``` + +As a user of such a package, one can: + +```go +package main + +func main() { + mainthread.Init(fn) +} + +func fn() { + // ... do what ever we want to do in main ... +} + + +func gn() { + // Wherever gn is running, the call will be executed on the main thread. + mainthread.Call(func() { + // ... do whatever we want to run on the main thread ... + }) +} +``` + +Once we have solved the problem of API design, the next question is: +How can we implement `Init` and `Call`? + +Well, it is not that difficult. Recall that we use the `Init` method +to obtain full control of the main thread, so we should never +give up such a power. Thus, creating another goroutine to run +what we initially want to run, and using a channel to receive +the calls that we would like to schedule on the main thread +becomes our only option: + +```go +// funcQ is a global channel that responsible for receiving function +// calls that needs to run on the main thread. +var funcQ = make(chan func(), runtime.GOMAXPROCS(0)) + +func Init(main func()) { + done := make(chan struct{}) + go func() { + main() + + // main function terminates, signal and terminate + // the main thread too.
+ done <- struct{}{} + }() + + for { + select { + case f := <-funcQ: + f() + case <-done: + return + } + } +} +``` + +Since we have the global `funcQ`, scheduling a function via that channel +becomes an easy work: + +```go +// Call calls f on the main thread and blocks until f finishes. +func Call(f func()) { + done := make(chan struct{}) + funcQ <- func() { + f() + done <- struct{}{} + } + <-done +} +``` + +To use such a package, one can use `mainthread.Call` to schedule +a call to be executed on the main thread: + +```go +package main + +import "x/mainthread" + +func main() { + mainthread.Init(fn) +} + +func fn() { + done := make(chan struct{}) + go gn(done) + <-done +} + +func gn(done chan<- struct{}) { + mainthread.Call(func() { + println("call on the main thread.") + }) + done <- struct{}{} +} +``` + +### Creating A Window with `glfw` using `mainthread` + +Whenever we need to wrap a window package, such as initializing `glfw` [^glfw]: + +```go +package app // import "x/app" + +import ( + "x/mainthread" + + "github.com/go-gl/glfw/v3.3/glfw" +) + +// Init initializes an app environment. +func Init() (err error) { + mainthread.Call(func() { err = glfw.Init() }) + return +} + +// Terminate terminates the entire application. +func Terminate() { + mainthread.Call(glfw.Terminate) +} +``` + +and make sure critical calls like `glfw.WaitEventsTimeout` inside +the rendering loop always be executed from the main thread: + +```go +package app // import "x/app" + +// Win is a window. +type Win struct { + win *glfw.Window +} + +// NewWindow constructs a new graphical window. +func NewWindow() (*Win, error) { + var ( + w = &Win{} + err error + ) + mainthread.Call(func() { + w.win, err = glfw.CreateWindow(640, 480, "golang.design/research", nil, nil) + if err != nil { + return + } + }) + if err != nil { + return nil, err + } + w.win.MakeContextCurrent() + return w, nil +} + +// Run runs the given window and blocks until it is destroied. 
+func (w *Win) Run() { + for !w.win.ShouldClose() { + mainthread.Call(func() { + w.win.SwapBuffers() + + // This function must be called from the main thread. + glfw.WaitEventsTimeout(1.0 / 30) + }) + } + // This function must be called from the mainthread. + mainthread.Call(w.win.Destroy) +} +``` + +A user of the `app` package can now get rid of the mental +overhead of thinking about when and how to call a function +on the main thread: + +```go +package main + +import ( + "x/app" + "x/mainthread" +) + +func main() { + mainthread.Init(fn) +} + +func fn() { + err := app.Init() + if err != nil { + panic(err) + } + defer app.Terminate() + w, err := app.NewWindow() + if err != nil { + panic(err) + } + w.Run() +} +``` + +![](../assets/zero-alloc-call-sched/app.png) + +Now, we have an empty solid window that will never crash randomly 😄. ## Cost Analysis and Optimization -TODO: +After implementing a first iteration of the `mainthread` package, +we might immediately wonder about the performance of this package. +One question could be: + +_If a function is sent from a thread to the main thread, what's the +latency when calling such a function?_ + +Let's write a few benchmark tests that can measure the performance of +such a call. The idea is very simple: we need a baseline to identify +the initial cost of calling a function, then measure the completion +time when we schedule the same function call on the main thread: + +```go +var f = func() {} + +// Baseline: call f() directly. +func BenchmarkDirectCall(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + f() + } +} + +// MainthreadCall: call f() on the mainthread.
+func BenchmarkMainThreadCall(b *testing.B) { + mainthread.Init(func() { + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + mainthread.Call(f) + } + }) +} +``` + +Be careful with micro benchmarks here, as we discussed in our previous +research [^bench-time], let's use the `golang.design/s/bench` +tool [^bench-tool] for benchmarking: + +``` +$ bench +goos: darwin +goarch: arm64 +pkg: x/mainthread-naive +... + +name time/op +DirectCall-8 0.95ns ±1% +MainThreadCall-8 448ns ±0% + +name alloc/op +DirectCall-8 0.00B +MainThreadCall-8 120B ±0% + +name allocs/op +DirectCall-8 0.00 +MainThreadCall-8 2.00 ±0% +``` + +The benchmark result indicates that calling an empty function directly in Go +costs `1ns`, whereas scheduling the same empty function to the main thread +takes `448ns`. Thus the cost is `447ns`. If we visualize the trace information, we can quickly see that sending a function to a channel contributes a notable cost: + +![](./../assets/zero-alloc-call-sched/naive-sched-delay.png) + +Moreover, when we talk about cost, +we actually care about the cost of CPU as well as memory consumption. +According to the second report regarding `alloc/op`, the result shows +scheduling an empty function to the main thread will cost `120B` of allocation. + +Allocation of `120B` per operation might not be a big deal from our first impression. +However, if we consider the actual use case of this package, i.e. managing GUI rendering +calls, either CPU or memory allocation can be propagated to a huge cost over time. +If we are dealing with rendering, especially graphical rendering, the refresh rate +is typically at least 25fps, ideally 30fps or even higher. + +This means, every 5 minutes, without considering mouse buttons, movements, +and keystrokes, a GUI application will allocate at least: + +$$ +5 \times 60\times 30 \times 120 \text{ byte} = 1080000 \text{ byte} \approx 1.03 \text{ MiB} +$$ + +A direct impact of an excessive allocation behavior is on the runtime garbage +collector and the scavenger.
With a higher allocation rate, the garbage collector +is triggered more often, and the scavenger releases memory to the OS more often. +Because more work is produced for the GC, the GC will also consume more +CPU from the system. It is fair to say the entire application is caught in +a vicious circle. + +The following is the trace information of the above application running for 6 minutes; the total heap allocation is actually 1.41 MiB (2113536-630784 byte), pretty close to what we predicted before. + +![](./../assets/zero-alloc-call-sched/naive-sched-trace.png) + +How can we deal with these issues? How can we optimize the existing naive +implementation? + + ## Optimal Threading Control -TODO: +The first optimization: + +``` +$ bench +goos: darwin +goarch: arm64 +pkg: x/mainthread-opt1 + +name time/op +DirectCall-8 0.95ns ±1% +MainThreadCall-8 440ns ±0% + +name alloc/op +DirectCall-8 0.00B +MainThreadCall-8 24.0B ±0% + +name allocs/op +DirectCall-8 0.00 +MainThreadCall-8 1.00 ±0% +``` + +``` +name old time/op new time/op delta +DirectCall-8 0.95ns ±1% 0.95ns ±1% ~ (p=0.631 n=10+10) +MainThreadCall-8 448ns ±0% 440ns ±0% -1.83% (p=0.000 n=9+9) + +name old alloc/op new alloc/op delta +DirectCall-8 0.00B 0.00B ~ (all equal) +MainThreadCall-8 120B ±0% 24B ±0% -80.00% (p=0.000 n=10+10) + +name old allocs/op new allocs/op delta +DirectCall-8 0.00 0.00 ~ (all equal) +MainThreadCall-8 2.00 ±0% 1.00 ±0% -50.00% (p=0.000 n=10+10) +``` + +The second optimization: + +``` +$ bench +goos: darwin +goarch: arm64 +pkg: x/mainthread-opt2 + +name time/op +DirectCall-8 0.95ns ±1% +MainThreadCall-8 366ns ±1% + +name alloc/op +DirectCall-8 0.00B +MainThreadCall-8 0.00B + +name allocs/op +DirectCall-8 0.00 +MainThreadCall-8 0.00 +``` + +``` +name old time/op new time/op delta +DirectCall-8 0.95ns ±1% 0.95ns ±1% ~ (p=0.617 n=10+10) +MainThreadCall-8 440ns ±0% 366ns ±1% -16.64% (p=0.000 n=9+9) + +name old alloc/op new alloc/op delta +DirectCall-8 0.00B 0.00B ~ (all equal) +MainThreadCall-8 24.0B ±0%
0.0B -100.00% (p=0.000 n=10+10) + +name old allocs/op new allocs/op delta +DirectCall-8 0.00 0.00 ~ (all equal) +MainThreadCall-8 1.00 ±0% 0.00 -100.00% (p=0.000 n=10+10) +``` + +Comparing to the naive implementation: + +``` +name old time/op new time/op delta +DirectCall-8 0.95ns ±1% 0.95ns ±1% ~ (p=0.896 n=10+10) +MainThreadCall-8 448ns ±0% 366ns ±1% -18.17% (p=0.000 n=9+9) + +name old alloc/op new alloc/op delta +DirectCall-8 0.00B 0.00B ~ (all equal) +MainThreadCall-8 120B ±0% 0B -100.00% (p=0.000 n=10+10) + +name old allocs/op new allocs/op delta +DirectCall-8 0.00 0.00 ~ (all equal) +MainThreadCall-8 2.00 ±0% 0.00 -100.00% (p=0.000 n=10+10) +``` ## Verification and Discussion TODO: +``` +bench: run benchmarks under 90% cpufreq... +bench: go test -run=^$ -bench=. -count=10 +goos: linux +goarch: amd64 +pkg: x/mainthread +cpu: Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz + +name time/op +Call-8 373ns ± 0% +CallV-8 375ns ± 0% + +name alloc/op +Call-8 0.00B +CallV-8 0.00B + +name allocs/op +Call-8 0.00 +CallV-8 0.00 +``` + +Before v.s. 
After: + +``` +name old time/op new time/op delta +Call-8 398ns ± 0% 373ns ± 0% -6.31% (p=0.000 n=10+9) +CallV-8 375ns ± 0% 375ns ± 0% ~ (p=0.323 n=10+10) + +name old alloc/op new alloc/op delta +Call-8 96.0B ± 0% 0.0B -100.00% (p=0.000 n=10+10) +CallV-8 0.00B 0.00B ~ (all equal) + +name old allocs/op new allocs/op delta +Call-8 1.00 ± 0% 0.00 -100.00% (p=0.000 n=10+10) +CallV-8 0.00 0.00 ~ (all equal) +``` + +688128-679936 = 8192 + +```go +// src/runtime/malloc.go +func newobject(typ *_type) unsafe.Pointer { + f := FuncForPC(getcallerpc()) // add this + l, ll := f.FileLine(getcallerpc()) // add this + println(typ.size, f.Name(), l, ll) // add this + return mallocgc(typ.size, typ, true) +} +``` + +``` +16 x/app-naive.(*Win).Run /Users/changkun/dev/golang.design/research/content/assets/zero-alloc-call-sched/app-naive/window.go 55 +88 runtime.acquireSudog /Users/changkun/dev/godev/go-github/src/runtime/proc.go 375 +16 x/app-naive.(*Win).Run /Users/changkun/dev/golang.design/research/content/assets/zero-alloc-call-sched/app-naive/window.go 55 +... +``` + +```go +func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { + ... + gp := getg() + mysg := acquireSudog() + ... +} + +//go:nosplit +func acquireSudog() *sudog { + mp := acquirem() + pp := mp.p.ptr() + if len(pp.sudogcache) == 0 { + lock(&sched.sudoglock) + for len(pp.sudogcache) < cap(pp.sudogcache)/2 && sched.sudogcache != nil { + s := sched.sudogcache + sched.sudogcache = s.next + s.next = nil + pp.sudogcache = append(pp.sudogcache, s) + } + unlock(&sched.sudoglock) + if len(pp.sudogcache) == 0 { + pp.sudogcache = append(pp.sudogcache, new(sudog)) // here + } + } + ... +} +``` + + + ## Conclusion TODO: +[^mainthread] +[^thread] + ## References -TODO: \ No newline at end of file +[^work-steal]: Robert D. Blumofe and Charles E. Leiserson. 1999. "Scheduling multithreaded computations by work stealing." J. ACM 46, 5 (September 1999), 720-748. 
https://dl.acm.org/citation.cfm?id=324234 +[^go11sched]: Dmitry Vyukov. "Scalable Go Scheduler Design Doc." May 2, 2012. https://golang.org/s/go11sched +[^glfw]: The glfw Library. https://www.glfw.org/ +[^bench-time]: Changkun Ou. "Eliminating A Source of Measurement Errors in Benchmarks +." 30.09.2020. https://golang.design/research/bench-time/ +[^bench-tool]: Changkun Ou. "bench: Reliable performance measurement for Go programs. All in one design." https://golang.design/s/bench +[^mainthread]: Changkun Ou. "Package golang.design/x/mainthread." https://golang.design/s/mainthread +[^thread]: Changkun Ou. "Package golang.design/x/thread." https://golang.design/s/thread \ No newline at end of file