dec/arith: boost shl10VU/shr10VU performance

These functions now use the "constant division by multiplication" algorithm. This is made possible by the small range of possible values for the shift count (18 at most) and a precoomputed table of magic numbers (uses the same algorithm as the Go compiler). The amd64 assembler version of shl10VU now has a slight performance gain over the go version (about 5% less time per op). The performance boost is significative: name old time/op new time/op delta Shl10VU 9.06µs ± 0% 3.32µs ± 2% -63.36% (p=0.008 n=5+5) This improves (*Decimal).Add/Sub performance as well (when the mantissae of the operands need to be aligned) and Decimal -> Int(64) conversions.
db47h · May 25, 2020 · de46678 · de46678
1 parent 39c92df
commit de46678
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 25 deletions.
diff --git a/dec_arith.go b/dec_arith.go
@@ -132,6 +132,67 @@ func decMaxPow(b Word) (p Word, n int) {
 	return Word(decMaxPow64[i]), int(decMaxPow64[i+1])
 }
 
+// pow10DivTab64 contains the "magic" numbers for fast division by 10**n
+// where 1 <= n < 19, x / 10**n = ((x >> pre) * m) >> (_W + post).
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf
+// generated using Go's src/cmd/compile/internal/ssa/magic.go and rewritegeneric.go rules
+var pow10DivTab64 = [...]magic{
+	{10, 0xcccccccccccccccd, 0, 3},
+	{100, 0xa3d70a3d70a3d70b, 1, 5},
+	{1000, 0x83126e978d4fdf3c, 1, 8},
+	{10000, 0xd1b71758e219652c, 0, 13},
+	{100000, 0xa7c5ac471b478424, 1, 15},
+	{1000000, 0x8637bd05af6c69b6, 0, 19},
+	{10000000, 0xd6bf94d5e57a42bd, 1, 22},
+	{100000000, 0xabcc77118461cefd, 0, 26},
+	{1000000000, 0x89705f4136b4a598, 1, 28},
+	{10000000000, 0xdbe6fecebdedd5bf, 0, 33},
+	{100000000000, 0xafebff0bcb24aaff, 0, 36},
+	{1000000000000, 0x8cbccc096f5088cc, 0, 39},
+	{10000000000000, 0xe12e13424bb40e14, 1, 42},
+	{100000000000000, 0xb424dc35095cd810, 1, 45},
+	{1000000000000000, 0x901d7cf73ab0acda, 1, 48},
+	{10000000000000000, 0xe69594bec44de15c, 1, 52},
+	{100000000000000000, 0xb877aa3236a4b44a, 1, 55},
+	{1000000000000000000, 0x9392ee8e921d5d08, 1, 58},
+	{10000000000000000000, 0xec1e4a7db69561a6, 1, 62},
+}
+
+var pow10DivTab32 = [...]magic{
+	{10, 0xcccccccd, 0, 3},
+	{100, 0xa3d70a3e, 1, 5},
+	{1000, 0x83126e98, 0, 9},
+	{10000, 0xd1b71759, 0, 13},
+	{100000, 0xa7c5ac48, 1, 15},
+	{1000000, 0x8637bd06, 0, 19},
+	{10000000, 0xd6bf94d6, 0, 23},
+	{100000000, 0xabcc7712, 0, 26},
+	{1000000000, 0x89705f42, 1, 28},
+}
+
+type magic struct {
+	d    uint64 // divisor
+	m    uint64 // multiplier
+	pre  byte   // pre-shift
+	post byte   // post-shift
+}
+
+func divisorPow10(n uint) magic {
+	if debugDecimal && n == 0 {
+		panic("divisorPow10: 10**0 is not a valid divisor")
+	}
+	if _W == 32 {
+		return pow10DivTab32[n-1]
+	}
+	return pow10DivTab64[n-1]
+}
+
+func (m magic) div(n Word) (q, r Word) {
+	h, _ := bits.Mul(uint(n)>>m.pre, uint(m.m))
+	q = Word(h) >> m.post
+	return q, n - q*Word(m.d)
+}
+
 //-----------------------------------------------------------------------------
 // Arithmetic primitives
 //
@@ -231,12 +292,12 @@ func shl10VU_g(z, x []Word, s uint) (r Word) {
 	if len(z) == 0 || len(x) == 0 {
 		return
 	}
-	d, m := pow10(_DW-s), pow10(s)
+	d, m := divisorPow10(_DW-s), pow10(s)
 	var h, l Word
-	r, l = divWW(0, x[len(x)-1], d)
+	r, l = d.div(x[len(x)-1])
 	for i := len(z) - 1; i > 0; i-- {
 		t := l
-		h, l = divWW(0, x[i-1], d)
+		h, l = d.div(x[i-1])
 		z[i] = t*m + h
 	}
 	z[0] = l * m
@@ -255,11 +316,11 @@ func shr10VU_g(z, x []Word, s uint) (r Word) {
 	}
 
 	var h, l Word
-	d, m := pow10(s), pow10(_DW-s)
-	h, r = divWW(0, x[0], d)
+	d, m := divisorPow10(s), pow10(_DW-s)
+	h, r = d.div(x[0])
 	for i := 1; i < len(z) && i < len(x); i++ {
 		t := h
-		h, l = divWW(0, x[i], d)
+		h, l = d.div(x[i])
 		z[i-1] = t + l*m
 	}
 	z[len(z)-1] = h
@@ -304,9 +365,8 @@ func div10VWW_g(z, x []Word, y, xn Word) (r Word) {
 // This function uses the algorithm from "Division by invariant integers using
 // multiplication" by Torbjörn Granlund & Peter L. Montgomery.
 //
-// See https://doi.org/10.1145/773473.178249, section 8, Dividing udword by uword.
-//
-// On 386 and amd64, this is over 2 times faster than calling bits.Div(n1, n0, _BD).
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf, section 8, Dividing udword
+// by uword.
 //
 // In the article, some equations show an addition or subtraction of 2**N, which
 // is a no-op. In the comments below, these have been removed for the sake of

diff --git a/dec_arith_amd64.s b/dec_arith_amd64.s
@@ -516,6 +516,7 @@ CE:
 
 // func shl10VU(z, x []Word, s uint) (c Word)
 TEXT ·shl10VU(SB),NOSPLIT,$0
+	// TODO(db47h): unroll loop
 	MOVQ z_len+8(FP), SI	// i = z
 	SUBQ $1, SI				// i--
 	JL X8b					// i < 0	(n <= 0)
@@ -528,34 +529,53 @@ TEXT ·shl10VU(SB),NOSPLIT,$0
 	JEQ X8c		// copy if s = 0
 
 	MOVQ $_DW, CX
-	LEAQ ·pow10tab(SB), DI
+	LEAQ ·pow10DivTab64(SB), DI
 	SUBQ BX, CX
-	MOVQ 0(DI)(BX*8), R12 // m = pow10(s)
-	MOVQ 0(DI)(CX*8), R11 // d = pow10(_DW-s)
+	LEAQ -3(BX)(BX*2), BX
+	LEAQ -3(CX)(CX*2), CX
 
-	XORQ DX, DX
+	MOVQ 0(DI)(BX*8), R12 // m = pow10DivTab64(s-1).d
+	LEAQ 0(DI)(CX*8), R11 // d = &pow10DivTab64(_DW-s-1)
+
+	// r, l = x[len(x)-1] / d
+	MOVQ 0(R11), R13		// d
+	MOVQ 8(R11), R14		// m
 	MOVQ 0(R8)(SI*8), AX
-	DIVQ R11		// AX:DX = x[len(x)-1] / d
-	MOVQ AX, BX		// save r
+	MOVQ AX, BX				// x[i]
+	MOVWLZX 16(R11), CX		// post|pre
+	SHRQ CX, AX				// x[i] >> pre
+	MULQ R14				// *m
+	RORW $8, CX				// post
 	MOVQ DX, AX
+	SHRQ CX, AX				// r
+	MOVQ AX, c+56(FP)		// save r
+	MULQ R13				// DX:AX = r*d
+	SUBQ AX, BX				// l = x[i]-r*d
+	MOVQ BX, AX				// AX = l
 
 	TESTQ SI, SI
 	JEQ X8a
 L8:
 	MULQ R12
-	MOVQ AX, CX		// z[i] = l*m
-	XORQ DX, DX
-	MOVQ -8(R8)(SI*8), AX
-	DIVQ R11		// q, r = div(0, x[i-1], d)
-	ADDQ AX, CX		// z[i] += q
-	MOVQ DX, AX
-	MOVQ CX, 0(R10)(SI*8)
+	MOVQ AX, R9				// z[i] = l*m
+	MOVQ -8(R8)(SI*8), AX	
+	MOVQ AX, BX				// x[i-1]
+	RORW $8, CX				// pre
+	SHRQ CX, AX				// x[i-1] >> pre
+	MULQ R14				// *m
+	RORW $8, CX				// post
+	MOVQ DX, AX			
+	SHRQ CX, AX				// h
+	ADDQ AX, R9				// z[i] += h
+	MOVQ R9, 0(R10)(SI*8)
+	MULQ R13				// DX:AX = d*h
+	SUBQ AX, BX				// l = x[i-1]-d*h
+	MOVQ BX, AX
 	SUBQ $1, SI
 	JG L8
 X8a:
 	MULQ R12
 	MOVQ AX, 0(R10)(SI*8)
-	MOVQ BX, c+56(FP)
 	RET
 X8b:
 	MOVQ $0, c+56(FP)
@@ -570,6 +590,7 @@ X8c:
 
 // func shr10VU(z, x []Word, s uint) (c Word)
 TEXT ·shr10VU(SB),NOSPLIT,$0
+	// TODO(db47h): implement constant division by multiplication, unroll loop.
 	MOVQ z_len+8(FP), DI
 	SUBQ $1, DI		// n--
 	JL X9b			// n < 0	(n <= 0)

diff --git a/dec_arith_test.go b/dec_arith_test.go
@@ -519,9 +519,17 @@ func BenchmarkAddMul10VVW(b *testing.B) {
 }
 
 func BenchmarkShl10VU(b *testing.B) {
-	x := dec(rnd10V(1000))
-	z := dec(nil).make(1000)
+	x := dec(rnd10V(100000))
+	z := dec(nil).make(100000)
 	for i := 0; i < b.N; i++ {
 		z.shl(x, 8)
 	}
 }
+
+func BenchmarkShr10VU(b *testing.B) {
+	x := dec(rnd10V(100000))
+	z := dec(nil).make(100000)
+	for i := 0; i < b.N; i++ {
+		z.shr(x, 8)
+	}
+}
diff --git a/dec_test.go b/dec_test.go
@@ -745,6 +745,16 @@ func TestGoIssue37499(t *testing.T) {
 	}
 }
 
+var benchU uint
+
+func BenchmarkDecDigit(b *testing.B) {
+	x := dec(rnd10V(100))
+	for i := 0; i < b.N; i++ {
+		n := i % (100 * _DW)
+		benchU = x.digit(uint(n))
+	}
+}
+
 // TODO(bd47h): move this to decimal_test
 func benchmarkDecimalDiv(b *testing.B, aSize, bSize int) {
 	aa := rndDec1(aSize)