article: revise the benchmarks for pointer-params

For #1
golang-design · Nov 4, 2020 · 6109308 · 6109308
1 parent 6a33110
commit 6109308
Show file tree

Hide file tree

Showing 8 changed files with 275 additions and 115 deletions.
diff --git a/pointer-params.md b/pointer-params.md
@@ -6,62 +6,61 @@ Last updated: 2020-10-27
 
 ## Introduction
 
-We are aware of that using pointers for passing parameters can avoid data copy, which will benefit the prformance. But there are always some edge cases you might need concern.
+We are aware that using pointers for passing parameters can avoid data copy,
+which will benefit the performance. Nevertheless, there are always some
+edge cases we might need to be concerned about.
 
-Let's check this example:
+Let's take this as an example:
 
 ```go
 // vec.go
-type vec1 struct {
+type vec struct {
 	x, y, z, w float64
 }
 
-func (v vec1) add(u vec1) vec1 {
-	return vec1{v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w}
+func (v vec) addv(u vec) vec {
+	return vec{v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w}
 }
 
-type vec2 struct {
-	x, y, z, w float64
-}
-
-func (v *vec2) add(u *vec2) *vec2 {
-	v.x += u.x
-	v.y += u.y
-	v.z += u.z
-	v.w += u.w
+func (v *vec) addp(u *vec) *vec {
+	v.x, v.y, v.z, v.w = v.x+u.x, v.y+u.y, v.z+u.z, v.w+u.w
 	return v
 }
 ```
 
-Which `add` implementation runs faster?
-Intuitively, we might think that `vec2` is faster because its parameter `u` uses pointer and there should have no copies on the data, whereas `vec1` involves data copy both when passing and returning.
+Which vector addition runs faster?
+Intuitively, we might consider that `vec.addp` is faster than `vec.addv`
+because its parameter `u` uses pointer form. There should be no copies
+of the data, whereas `vec.addv` involves data copy both when passing and
+returning.
 
-However, if we write a benchmark:
+However, if we do a micro-benchmark:
 
 ```go
 func BenchmarkVec(b *testing.B) {
-	b.ReportAllocs()
-	b.Run("vec1", func(b *testing.B) {
-		v1 := vec1{1, 2, 3, 4}
-		v2 := vec1{4, 5, 6, 7}
+	b.Run("addv", func(b *testing.B) {
+		v1 := vec{1, 2, 3, 4}
+		v2 := vec{4, 5, 6, 7}
+		b.ReportAllocs()
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			if i%2 == 0 {
-				v1 = v1.add(v2)
+				v1 = v1.addv(v2)
 			} else {
-				v2 = v2.add(v1)
+				v2 = v2.addv(v1)
 			}
 		}
 	})
-	b.Run("vec2", func(b *testing.B) {
-		v1 := vec2{1, 2, 3, 4}
-		v2 := vec2{4, 5, 6, 7}
+	b.Run("addp", func(b *testing.B) {
+		v1 := &vec{1, 2, 3, 4}
+		v2 := &vec{4, 5, 6, 7}
+		b.ReportAllocs()
 		b.ResetTimer()
 		for i := 0; i < b.N; i++ {
 			if i%2 == 0 {
-				v1.add(&v2)
+				v1 = v1.addp(v2)
 			} else {
-				v2.add(&v1)
+				v2 = v2.addp(v1)
 			}
 		}
 	})
@@ -79,8 +78,16 @@ The `benchstat` will give you the following result:
 
 ```
 name         time/op
-Vec/vec1-16  0.25ns ± 1%
-Vec/vec2-16  2.20ns ± 0%
+Vec/addv-16  0.25ns ± 2%
+Vec/addp-16  2.20ns ± 0%
+
+name         alloc/op
+Vec/addv-16   0.00B     
+Vec/addp-16   0.00B     
+
+name         allocs/op
+Vec/addv-16    0.00     
+Vec/addp-16    0.00
 ```
 
 How is this happening?
@@ -89,136 +96,122 @@ How is this happening?
 
 This is all because of compiler optimization, and mostly because of inlining.
 
-If we disable inline from the `add`:
+If we disable inlining for `addv` and `addp`:
 
 ```go
-// vec.go
-type vec1 struct {
-	x, y, z, w float64
-}
-
 //go:noinline
-func (v vec1) add(u vec1) vec1 {
-	return vec1{v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w}
-}
-
-type vec2 struct {
-	x, y, z, w float64
+func (v vec) addv(u vec) vec {
+	return vec{v.x + u.x, v.y + u.y, v.z + u.z, v.w + u.w}
 }
 
 //go:noinline
-func (v *vec2) add(u *vec2) *vec2 {
-	v.x += u.x
-	v.y += u.y
-	v.z += u.z
-	v.w += u.w
+func (v *vec) addp(u *vec) *vec {
+	v.x, v.y, v.z, v.w = v.x+u.x, v.y+u.y, v.z+u.z, v.w+u.w
 	return v
 }
 ```
 
-Run the benchmark and compare the perf with the previous one:
+Then run the benchmark and compare the perf with the previous one:
 
 ```sh
 $ perflock -governor 80% go test -v -run=none -bench=. -count=10 | tee old.txt
 $ benchstat old.txt new.txt
-name         old time/op  new time/op  delta
-Vec/vec1-16  4.92ns ± 1%  0.25ns ± 1%  -95.01%  (p=0.000 n=10+9)
-Vec/vec2-16  2.89ns ± 1%  2.20ns ± 0%  -23.77%  (p=0.000 n=10+8)
+name         old time/op    new time/op    delta
+Vec/addv-16    4.99ns ± 1%    0.25ns ± 2%  -95.05%  (p=0.000 n=9+10)
+Vec/addp-16    3.35ns ± 1%    2.20ns ± 0%  -34.37%  (p=0.000 n=10+8)
 ```
 
-The inline optimization transforms the code:
+The inline optimization transforms the `vec.addv`:
 
 ```go
-v1 := vec1{1, 2, 3, 4}
-v2 := vec1{4, 5, 6, 7}
-v1 = v1.add(v2)
+v1 := vec{1, 2, 3, 4}
+v2 := vec{4, 5, 6, 7}
+v1 = v1.addv(v2)
 ```
 
 to a direct assign statement:
 
 ```go
-v1 := vec1{1, 2, 3, 4}
-v2 := vec1{4, 5, 6, 7}
-v1 = vec1{1+4, 2+5, 3+6, 4+7}
+v1 := vec{1, 2, 3, 4}
+v2 := vec{4, 5, 6, 7}
+v1 = vec{1+4, 2+5, 3+6, 4+7}
 ```
 
-And for the `vec2`'s case:
+And for the case of `vec.addp`:
 
 ```go
-v1 := vec2{1, 2, 3, 4}
-v2 := vec2{4, 5, 6, 7}
-v1 = v1.add(v2)
+v1 := &vec{1, 2, 3, 4}
+v2 := &vec{4, 5, 6, 7}
+v1 = v1.addp(v2)
 ```
 
 to a direct manipulation:
 
 ```go
-v1 := vec2{1, 2, 3, 4}
-v2 := vec2{4, 5, 6, 7}
-v1.x += v2.x
-v1.y += v2.y
-v1.z += v2.z
-v1.w += v2.w
+v1 := vec{1, 2, 3, 4}
+v2 := vec{4, 5, 6, 7}
+v1.x, v1.y, v1.z, v1.w = v1.x+v2.x, v1.y+v2.y, v1.z+v2.z, v1.w+v2.w 
 ```
 
-## Unoptimized Move Semantics
+## Addressing Modes
 
 If we check the compiled assembly, the reason quickly becomes clear:
 
 ```sh
-$ go tool compile -S vec.go > vec.s
+$ mkdir asm && go tool compile -S vec.go > asm/vec.s
 ```
 
 The dumped assembly code is as follows:
 
 ```asm
-"".vec1.add STEXT nosplit size=89 args=0x60 locals=0x0
-	0x0000 00000 (vec.go:8)	TEXT	"".vec1.add(SB), NOSPLIT|ABIInternal, $0-96
-	0x0000 00000 (vec.go:8)	FUNCDATA	$0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
-	0x0000 00000 (vec.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
-	0x0000 00000 (vec.go:9)	MOVSD	"".u+40(SP), X0
-	0x0006 00006 (vec.go:9)	MOVSD	"".v+8(SP), X1
-	0x000c 00012 (vec.go:9)	ADDSD	X1, X0
-	0x0010 00016 (vec.go:9)	MOVSD	X0, "".~r1+72(SP)
-	0x0016 00022 (vec.go:9)	MOVSD	"".u+48(SP), X0
-	0x001c 00028 (vec.go:9)	MOVSD	"".v+16(SP), X1
-	0x0022 00034 (vec.go:9)	ADDSD	X1, X0
-	0x0026 00038 (vec.go:9)	MOVSD	X0, "".~r1+80(SP)
-	0x002c 00044 (vec.go:9)	MOVSD	"".u+56(SP), X0
-	0x0032 00050 (vec.go:9)	MOVSD	"".v+24(SP), X1
-	0x0038 00056 (vec.go:9)	ADDSD	X1, X0
-	0x003c 00060 (vec.go:9)	MOVSD	X0, "".~r1+88(SP)
-	0x0042 00066 (vec.go:9)	MOVSD	"".u+64(SP), X0
-	0x0048 00072 (vec.go:9)	MOVSD	"".v+32(SP), X1
-	0x004e 00078 (vec.go:9)	ADDSD	X1, X0
-	0x0052 00082 (vec.go:9)	MOVSD	X0, "".~r1+96(SP)
-	0x0058 00088 (vec.go:9)	RET
-"".(*vec2).add STEXT nosplit size=73 args=0x18 locals=0x0
-	0x0000 00000 (vec.go:17)	TEXT	"".(*vec2).add(SB), NOSPLIT|ABIInternal, $0-24
-	0x0000 00000 (vec.go:17)	FUNCDATA	$0, gclocals·8f9cec06d1ae35cc9900c511c5e4bdab(SB)
-	0x0000 00000 (vec.go:17)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
-	0x0000 00000 (vec.go:18)	MOVQ	"".u+16(SP), AX
-	0x0005 00005 (vec.go:18)	MOVSD	(AX), X0
-	0x0009 00009 (vec.go:18)	MOVQ	"".v+8(SP), CX
-	0x000e 00014 (vec.go:18)	ADDSD	(CX), X0
-	0x0012 00018 (vec.go:18)	MOVSD	X0, (CX)
-	0x0016 00022 (vec.go:19)	MOVSD	8(AX), X0
-	0x001b 00027 (vec.go:19)	ADDSD	8(CX), X0
-	0x0020 00032 (vec.go:19)	MOVSD	X0, 8(CX)
-	0x0025 00037 (vec.go:20)	MOVSD	16(CX), X0
-	0x002a 00042 (vec.go:20)	ADDSD	16(AX), X0
-	0x002f 00047 (vec.go:20)	MOVSD	X0, 16(CX)
-	0x0034 00052 (vec.go:21)	MOVSD	24(AX), X0
-	0x0039 00057 (vec.go:21)	ADDSD	24(CX), X0
-	0x003e 00062 (vec.go:21)	MOVSD	X0, 24(CX)
-	0x0043 00067 (vec.go:22)	MOVQ	CX, "".~r1+24(SP)
-	0x0048 00072 (vec.go:22)	RET
+"".vec.addv STEXT nosplit size=89 args=0x60 locals=0x0 funcid=0x0
+	0x0000 00000 (vec.go:7)	TEXT	"".vec.addv(SB), NOSPLIT|ABIInternal, $0-96
+	0x0000 00000 (vec.go:7)	FUNCDATA	$0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
+	0x0000 00000 (vec.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
+	0x0000 00000 (vec.go:8)	MOVSD	"".u+40(SP), X0
+	0x0006 00006 (vec.go:8)	MOVSD	"".v+8(SP), X1
+	0x000c 00012 (vec.go:8)	ADDSD	X1, X0
+	0x0010 00016 (vec.go:8)	MOVSD	X0, "".~r1+72(SP)
+	0x0016 00022 (vec.go:8)	MOVSD	"".u+48(SP), X0
+	0x001c 00028 (vec.go:8)	MOVSD	"".v+16(SP), X1
+	0x0022 00034 (vec.go:8)	ADDSD	X1, X0
+	0x0026 00038 (vec.go:8)	MOVSD	X0, "".~r1+80(SP)
+	0x002c 00044 (vec.go:8)	MOVSD	"".u+56(SP), X0
+	0x0032 00050 (vec.go:8)	MOVSD	"".v+24(SP), X1
+	0x0038 00056 (vec.go:8)	ADDSD	X1, X0
+	0x003c 00060 (vec.go:8)	MOVSD	X0, "".~r1+88(SP)
+	0x0042 00066 (vec.go:8)	MOVSD	"".u+64(SP), X0
+	0x0048 00072 (vec.go:8)	MOVSD	"".v+32(SP), X1
+	0x004e 00078 (vec.go:8)	ADDSD	X1, X0
+	0x0052 00082 (vec.go:8)	MOVSD	X0, "".~r1+96(SP)
+	0x0058 00088 (vec.go:8)	RET
+"".(*vec).addp STEXT nosplit size=73 args=0x18 locals=0x0 funcid=0x0
+	0x0000 00000 (vec.go:11)	TEXT	"".(*vec).addp(SB), NOSPLIT|ABIInternal, $0-24
+	0x0000 00000 (vec.go:11)	FUNCDATA	$0, gclocals·522734ad228da40e2256ba19cf2bc72c(SB)
+	0x0000 00000 (vec.go:11)	FUNCDATA	$1, gclocals·69c1753bd5f81501d95132d08af04464(SB)
+	0x0000 00000 (vec.go:12)	MOVQ	"".u+16(SP), AX
+	0x0005 00005 (vec.go:12)	MOVSD	(AX), X0
+	0x0009 00009 (vec.go:12)	MOVQ	"".v+8(SP), CX
+	0x000e 00014 (vec.go:12)	ADDSD	(CX), X0
+	0x0012 00018 (vec.go:12)	MOVSD	8(AX), X1
+	0x0017 00023 (vec.go:12)	ADDSD	8(CX), X1
+	0x001c 00028 (vec.go:12)	MOVSD	16(CX), X2
+	0x0021 00033 (vec.go:12)	ADDSD	16(AX), X2
+	0x0026 00038 (vec.go:12)	MOVSD	24(AX), X3
+	0x002b 00043 (vec.go:12)	ADDSD	24(CX), X3
+	0x0030 00048 (vec.go:12)	MOVSD	X0, (CX)
+	0x0034 00052 (vec.go:12)	MOVSD	X1, 8(CX)
+	0x0039 00057 (vec.go:12)	MOVSD	X2, 16(CX)
+	0x003e 00062 (vec.go:12)	MOVSD	X3, 24(CX)
+	0x0043 00067 (vec.go:13)	MOVQ	CX, "".~r1+24(SP)
+	0x0048 00072 (vec.go:13)	RET
 ```
 
-The `add` implementation of `vec1` uses values from the previous stack frame and writes the result directly to the return;
-whereas `vec2` needs MOVQ that copies the parameter to different registers (e.g., copy pointers to AX and CX,), then write back to the return.
-
-The unexpected move cost in `vec2` is the additional two `MOVQ` instructions and read operations on the two pointer addresses.
+The `addv` implementation uses values from the previous stack frame and
+writes the result directly to the return value; whereas `addp` needs MOVQ
+instructions that copy the parameters into registers (e.g., copying the
+pointers to AX and CX), then writes back when returning. Therefore, another unexpected cost in
+`addp` is caused by the indirect addressing mode for accessing the memory unit.
 
 ## Further Reading Suggestions
 

diff --git a/pointer-params/Makefile b/pointer-params/Makefile
@@ -0,0 +1,6 @@
+GOVERSION=$(shell go version | awk '{print $$3}')
+all:
+	perflock -governor 80% go test -v -run=none -bench=. -count=10 | tee $(GOVERSION).txt
+	benchstat $(GOVERSION).txt
+asm:
+	mkdir asm && go tool compile -S vec.go > asm/vec.s
diff --git a/pointer-params/asm/vec.s b/pointer-params/asm/vec.s
@@ -0,0 +1,52 @@
+"".vec.addv STEXT nosplit size=89 args=0x60 locals=0x0 funcid=0x0
+	0x0000 00000 (vec.go:7)	TEXT	"".vec.addv(SB), NOSPLIT|ABIInternal, $0-96
+	0x0000 00000 (vec.go:7)	FUNCDATA	$0, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
+	0x0000 00000 (vec.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
+	0x0000 00000 (vec.go:8)	MOVSD	"".u+40(SP), X0
+	0x0006 00006 (vec.go:8)	MOVSD	"".v+8(SP), X1
+	0x000c 00012 (vec.go:8)	ADDSD	X1, X0
+	0x0010 00016 (vec.go:8)	MOVSD	X0, "".~r1+72(SP)
+	0x0016 00022 (vec.go:8)	MOVSD	"".u+48(SP), X0
+	0x001c 00028 (vec.go:8)	MOVSD	"".v+16(SP), X1
+	0x0022 00034 (vec.go:8)	ADDSD	X1, X0
+	0x0026 00038 (vec.go:8)	MOVSD	X0, "".~r1+80(SP)
+	0x002c 00044 (vec.go:8)	MOVSD	"".u+56(SP), X0
+	0x0032 00050 (vec.go:8)	MOVSD	"".v+24(SP), X1
+	0x0038 00056 (vec.go:8)	ADDSD	X1, X0
+	0x003c 00060 (vec.go:8)	MOVSD	X0, "".~r1+88(SP)
+	0x0042 00066 (vec.go:8)	MOVSD	"".u+64(SP), X0
+	0x0048 00072 (vec.go:8)	MOVSD	"".v+32(SP), X1
+	0x004e 00078 (vec.go:8)	ADDSD	X1, X0
+	0x0052 00082 (vec.go:8)	MOVSD	X0, "".~r1+96(SP)
+	0x0058 00088 (vec.go:8)	RET
+	0x0000 f2 0f 10 44 24 28 f2 0f 10 4c 24 08 f2 0f 58 c1  ...D$(...L$...X.
+	0x0010 f2 0f 11 44 24 48 f2 0f 10 44 24 30 f2 0f 10 4c  ...D$H...D$0...L
+	0x0020 24 10 f2 0f 58 c1 f2 0f 11 44 24 50 f2 0f 10 44  $...X....D$P...D
+	0x0030 24 38 f2 0f 10 4c 24 18 f2 0f 58 c1 f2 0f 11 44  $8...L$...X....D
+	0x0040 24 58 f2 0f 10 44 24 40 f2 0f 10 4c 24 20 f2 0f  $X...D$@...L$ ..
+	0x0050 58 c1 f2 0f 11 44 24 60 c3                       X....D$`.
+"".(*vec).addp STEXT nosplit size=73 args=0x18 locals=0x0 funcid=0x0
+	0x0000 00000 (vec.go:11)	TEXT	"".(*vec).addp(SB), NOSPLIT|ABIInternal, $0-24
+	0x0000 00000 (vec.go:11)	FUNCDATA	$0, gclocals·522734ad228da40e2256ba19cf2bc72c(SB)
+	0x0000 00000 (vec.go:11)	FUNCDATA	$1, gclocals·69c1753bd5f81501d95132d08af04464(SB)
+	0x0000 00000 (vec.go:12)	MOVQ	"".u+16(SP), AX
+	0x0005 00005 (vec.go:12)	MOVSD	(AX), X0
+	0x0009 00009 (vec.go:12)	MOVQ	"".v+8(SP), CX
+	0x000e 00014 (vec.go:12)	ADDSD	(CX), X0
+	0x0012 00018 (vec.go:12)	MOVSD	8(AX), X1
+	0x0017 00023 (vec.go:12)	ADDSD	8(CX), X1
+	0x001c 00028 (vec.go:12)	MOVSD	16(CX), X2
+	0x0021 00033 (vec.go:12)	ADDSD	16(AX), X2
+	0x0026 00038 (vec.go:12)	MOVSD	24(AX), X3
+	0x002b 00043 (vec.go:12)	ADDSD	24(CX), X3
+	0x0030 00048 (vec.go:12)	MOVSD	X0, (CX)
+	0x0034 00052 (vec.go:12)	MOVSD	X1, 8(CX)
+	0x0039 00057 (vec.go:12)	MOVSD	X2, 16(CX)
+	0x003e 00062 (vec.go:12)	MOVSD	X3, 24(CX)
+	0x0043 00067 (vec.go:13)	MOVQ	CX, "".~r1+24(SP)
+	0x0048 00072 (vec.go:13)	RET
+	0x0000 48 8b 44 24 10 f2 0f 10 00 48 8b 4c 24 08 f2 0f  H.D$.....H.L$...
+	0x0010 58 01 f2 0f 10 48 08 f2 0f 58 49 08 f2 0f 10 51  X....H...XI....Q
+	0x0020 10 f2 0f 58 50 10 f2 0f 10 58 18 f2 0f 58 59 18  ...XP....X...XY.
+	0x0030 f2 0f 11 01 f2 0f 11 49 08 f2 0f 11 51 10 f2 0f  .......I....Q...
+	0x0040 11 59 18 48 89 4c 24 18 c3                       .Y.H.L$..
diff --git a/pointer-params/go.mod b/pointer-params/go.mod
@@ -0,0 +1,3 @@
+module pparam
+
+go 1.16