From de466780ea99e8f5037d55179c0b0cde7a412ffc Mon Sep 17 00:00:00 2001
From: Denis Bernard <db047h@gmail.com>
Date: Mon, 25 May 2020 23:26:49 +0200
Subject: [PATCH] dec/arith: boost shl10VU/shr10VU performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions now use the "constant division by multiplication"
algorithm. This is made possible by the small range of possible values
for the shift count (18 at most) and a precoomputed table of magic
numbers (uses the same algorithm as the Go compiler). The amd64
assembler version of shl10VU now has a slight performance gain over the
go version (about 5% less time per op).

The performance boost is significative:

    name     old time/op  new time/op  delta
    Shl10VU  9.06µs ± 0%  3.32µs ± 2%  -63.36%  (p=0.008 n=5+5)

This improves (*Decimal).Add/Sub performance as well (when the mantissae
of the operands need to be aligned) and Decimal -> Int(64) conversions.
---
 dec_arith.go      | 78 +++++++++++++++++++++++++++++++++++++++++------
 dec_arith_amd64.s | 49 ++++++++++++++++++++---------
 dec_arith_test.go | 12 ++++++--
 dec_test.go       | 10 ++++++
 4 files changed, 124 insertions(+), 25 deletions(-)

diff --git a/dec_arith.go b/dec_arith.go
index f60bbf2..5101f08 100644
--- a/dec_arith.go
+++ b/dec_arith.go
@@ -132,6 +132,67 @@ func decMaxPow(b Word) (p Word, n int) {
 	return Word(decMaxPow64[i]), int(decMaxPow64[i+1])
 }
 
+// pow10DivTab64 contains the "magic" numbers for fast division by 10**n
+// where 1 <= n < 19, x / 10**n = ((x >> pre) * m) >> (_W + post).
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf
+// generated using Go's src/cmd/compile/internal/ssa/magic.go and rewritegeneric.go rules
+var pow10DivTab64 = [...]magic{
+	{10, 0xcccccccccccccccd, 0, 3},
+	{100, 0xa3d70a3d70a3d70b, 1, 5},
+	{1000, 0x83126e978d4fdf3c, 1, 8},
+	{10000, 0xd1b71758e219652c, 0, 13},
+	{100000, 0xa7c5ac471b478424, 1, 15},
+	{1000000, 0x8637bd05af6c69b6, 0, 19},
+	{10000000, 0xd6bf94d5e57a42bd, 1, 22},
+	{100000000, 0xabcc77118461cefd, 0, 26},
+	{1000000000, 0x89705f4136b4a598, 1, 28},
+	{10000000000, 0xdbe6fecebdedd5bf, 0, 33},
+	{100000000000, 0xafebff0bcb24aaff, 0, 36},
+	{1000000000000, 0x8cbccc096f5088cc, 0, 39},
+	{10000000000000, 0xe12e13424bb40e14, 1, 42},
+	{100000000000000, 0xb424dc35095cd810, 1, 45},
+	{1000000000000000, 0x901d7cf73ab0acda, 1, 48},
+	{10000000000000000, 0xe69594bec44de15c, 1, 52},
+	{100000000000000000, 0xb877aa3236a4b44a, 1, 55},
+	{1000000000000000000, 0x9392ee8e921d5d08, 1, 58},
+	{10000000000000000000, 0xec1e4a7db69561a6, 1, 62},
+}
+
+var pow10DivTab32 = [...]magic{
+	{10, 0xcccccccd, 0, 3},
+	{100, 0xa3d70a3e, 1, 5},
+	{1000, 0x83126e98, 0, 9},
+	{10000, 0xd1b71759, 0, 13},
+	{100000, 0xa7c5ac48, 1, 15},
+	{1000000, 0x8637bd06, 0, 19},
+	{10000000, 0xd6bf94d6, 0, 23},
+	{100000000, 0xabcc7712, 0, 26},
+	{1000000000, 0x89705f42, 1, 28},
+}
+
+type magic struct {
+	d    uint64 // divisor
+	m    uint64 // multiplier
+	pre  byte   // pre-shift
+	post byte   // post-shift
+}
+
+func divisorPow10(n uint) magic {
+	if debugDecimal && n == 0 {
+		panic("divisorPow10: 10**0 is not a valid divisor")
+	}
+	if _W == 32 {
+		return pow10DivTab32[n-1]
+	}
+	return pow10DivTab64[n-1]
+}
+
+func (m magic) div(n Word) (q, r Word) {
+	h, _ := bits.Mul(uint(n)>>m.pre, uint(m.m))
+	q = Word(h) >> m.post
+	return q, n - q*Word(m.d)
+}
+
 //-----------------------------------------------------------------------------
 // Arithmetic primitives
 //
@@ -231,12 +292,12 @@ func shl10VU_g(z, x []Word, s uint) (r Word) {
 	if len(z) == 0 || len(x) == 0 {
 		return
 	}
-	d, m := pow10(_DW-s), pow10(s)
+	d, m := divisorPow10(_DW-s), pow10(s)
 	var h, l Word
-	r, l = divWW(0, x[len(x)-1], d)
+	r, l = d.div(x[len(x)-1])
 	for i := len(z) - 1; i > 0; i-- {
 		t := l
-		h, l = divWW(0, x[i-1], d)
+		h, l = d.div(x[i-1])
 		z[i] = t*m + h
 	}
 	z[0] = l * m
@@ -255,11 +316,11 @@ func shr10VU_g(z, x []Word, s uint) (r Word) {
 	}
 
 	var h, l Word
-	d, m := pow10(s), pow10(_DW-s)
-	h, r = divWW(0, x[0], d)
+	d, m := divisorPow10(s), pow10(_DW-s)
+	h, r = d.div(x[0])
 	for i := 1; i < len(z) && i < len(x); i++ {
 		t := h
-		h, l = divWW(0, x[i], d)
+		h, l = d.div(x[i])
 		z[i-1] = t + l*m
 	}
 	z[len(z)-1] = h
@@ -304,9 +365,8 @@ func div10VWW_g(z, x []Word, y, xn Word) (r Word) {
 // This function uses the algorithm from "Division by invariant integers using
 // multiplication" by Torbjörn Granlund & Peter L. Montgomery.
 //
-// See https://doi.org/10.1145/773473.178249, section 8, Dividing udword by uword.
-//
-// On 386 and amd64, this is over 2 times faster than calling bits.Div(n1, n0, _BD).
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf, section 8, Dividing udword
+// by uword.
 //
 // In the article, some equations show an addition or subtraction of 2**N, which
 // is a no-op. In the comments below, these have been removed for the sake of
diff --git a/dec_arith_amd64.s b/dec_arith_amd64.s
index ab5dad2..1aaaf13 100644
--- a/dec_arith_amd64.s
+++ b/dec_arith_amd64.s
@@ -516,6 +516,7 @@ CE:
 
 // func shl10VU(z, x []Word, s uint) (c Word)
 TEXT ·shl10VU(SB),NOSPLIT,$0
+	// TODO(db47h): unroll loop
 	MOVQ z_len+8(FP), SI	// i = z
 	SUBQ $1, SI				// i--
 	JL X8b					// i < 0	(n <= 0)
@@ -528,34 +529,53 @@ TEXT ·shl10VU(SB),NOSPLIT,$0
 	JEQ X8c		// copy if s = 0
 
 	MOVQ $_DW, CX
-	LEAQ ·pow10tab(SB), DI
+	LEAQ ·pow10DivTab64(SB), DI
 	SUBQ BX, CX
-	MOVQ 0(DI)(BX*8), R12 // m = pow10(s)
-	MOVQ 0(DI)(CX*8), R11 // d = pow10(_DW-s)
+	LEAQ -3(BX)(BX*2), BX
+	LEAQ -3(CX)(CX*2), CX
 
-	XORQ DX, DX
+	MOVQ 0(DI)(BX*8), R12 // m = pow10DivTab64(s-1).d
+	LEAQ 0(DI)(CX*8), R11 // d = &pow10DivTab64(_DW-s-1)
+
+	// r, l = x[len(x)-1] / d
+	MOVQ 0(R11), R13		// d
+	MOVQ 8(R11), R14		// m
 	MOVQ 0(R8)(SI*8), AX
-	DIVQ R11		// AX:DX = x[len(x)-1] / d
-	MOVQ AX, BX		// save r
+	MOVQ AX, BX				// x[i]
+	MOVWLZX 16(R11), CX		// post|pre
+	SHRQ CX, AX				// x[i] >> pre
+	MULQ R14				// *m
+	RORW $8, CX				// post
 	MOVQ DX, AX
+	SHRQ CX, AX				// r
+	MOVQ AX, c+56(FP)		// save r
+	MULQ R13				// DX:AX = r*d
+	SUBQ AX, BX				// l = x[i]-r*d
+	MOVQ BX, AX				// AX = l
 
 	TESTQ SI, SI
 	JEQ X8a
 L8:
 	MULQ R12
-	MOVQ AX, CX		// z[i] = l*m
-	XORQ DX, DX
-	MOVQ -8(R8)(SI*8), AX
-	DIVQ R11		// q, r = div(0, x[i-1], d)
-	ADDQ AX, CX		// z[i] += q
-	MOVQ DX, AX
-	MOVQ CX, 0(R10)(SI*8)
+	MOVQ AX, R9				// z[i] = l*m
+	MOVQ -8(R8)(SI*8), AX	
+	MOVQ AX, BX				// x[i-1]
+	RORW $8, CX				// pre
+	SHRQ CX, AX				// x[i-1] >> pre
+	MULQ R14				// *m
+	RORW $8, CX				// post
+	MOVQ DX, AX			
+	SHRQ CX, AX				// h
+	ADDQ AX, R9				// z[i] += h
+	MOVQ R9, 0(R10)(SI*8)
+	MULQ R13				// DX:AX = d*h
+	SUBQ AX, BX				// l = x[i-1]-d*h
+	MOVQ BX, AX
 	SUBQ $1, SI
 	JG L8
 X8a:
 	MULQ R12
 	MOVQ AX, 0(R10)(SI*8)
-	MOVQ BX, c+56(FP)
 	RET
 X8b:
 	MOVQ $0, c+56(FP)
@@ -570,6 +590,7 @@ X8c:
 
 // func shr10VU(z, x []Word, s uint) (c Word)
 TEXT ·shr10VU(SB),NOSPLIT,$0
+	// TODO(db47h): implement constant division by multiplication, unroll loop.
 	MOVQ z_len+8(FP), DI
 	SUBQ $1, DI		// n--
 	JL X9b			// n < 0	(n <= 0)
diff --git a/dec_arith_test.go b/dec_arith_test.go
index fe51a71..24edb54 100644
--- a/dec_arith_test.go
+++ b/dec_arith_test.go
@@ -519,9 +519,17 @@ func BenchmarkAddMul10VVW(b *testing.B) {
 }
 
 func BenchmarkShl10VU(b *testing.B) {
-	x := dec(rnd10V(1000))
-	z := dec(nil).make(1000)
+	x := dec(rnd10V(100000))
+	z := dec(nil).make(100000)
 	for i := 0; i < b.N; i++ {
 		z.shl(x, 8)
 	}
 }
+
+func BenchmarkShr10VU(b *testing.B) {
+	x := dec(rnd10V(100000))
+	z := dec(nil).make(100000)
+	for i := 0; i < b.N; i++ {
+		z.shr(x, 8)
+	}
+}
diff --git a/dec_test.go b/dec_test.go
index 1f5e33c..6cf1c0b 100644
--- a/dec_test.go
+++ b/dec_test.go
@@ -745,6 +745,16 @@ func TestGoIssue37499(t *testing.T) {
 	}
 }
 
+var benchU uint
+
+func BenchmarkDecDigit(b *testing.B) {
+	x := dec(rnd10V(100))
+	for i := 0; i < b.N; i++ {
+		n := i % (100 * _DW)
+		benchU = x.digit(uint(n))
+	}
+}
+
 // TODO(bd47h): move this to decimal_test
 func benchmarkDecimalDiv(b *testing.B, aSize, bSize int) {
 	aa := rndDec1(aSize)