From de466780ea99e8f5037d55179c0b0cde7a412ffc Mon Sep 17 00:00:00 2001 From: Denis Bernard Date: Mon, 25 May 2020 23:26:49 +0200 Subject: [PATCH] dec/arith: boost shl10VU/shr10VU performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions now use the "constant division by multiplication" algorithm. This is made possible by the small range of possible values for the shift count (18 at most) and a precoomputed table of magic numbers (uses the same algorithm as the Go compiler). The amd64 assembler version of shl10VU now has a slight performance gain over the go version (about 5% less time per op). The performance boost is significative: name old time/op new time/op delta Shl10VU 9.06µs ± 0% 3.32µs ± 2% -63.36% (p=0.008 n=5+5) This improves (*Decimal).Add/Sub performance as well (when the mantissae of the operands need to be aligned) and Decimal -> Int(64) conversions. --- dec_arith.go | 78 +++++++++++++++++++++++++++++++++++++++++------ dec_arith_amd64.s | 49 ++++++++++++++++++++--------- dec_arith_test.go | 12 ++++++-- dec_test.go | 10 ++++++ 4 files changed, 124 insertions(+), 25 deletions(-) diff --git a/dec_arith.go b/dec_arith.go index f60bbf2..5101f08 100644 --- a/dec_arith.go +++ b/dec_arith.go @@ -132,6 +132,67 @@ func decMaxPow(b Word) (p Word, n int) { return Word(decMaxPow64[i]), int(decMaxPow64[i+1]) } +// pow10DivTab64 contains the "magic" numbers for fast division by 10**n +// where 1 <= n < 19, x / 10**n = ((x >> pre) * m) >> (_W + post). +// See https://gmplib.org/~tege/divcnst-pldi94.pdf +// generated using Go's src/cmd/compile/internal/ssa/magic.go and rewritegeneric.go rules +var pow10DivTab64 = [...]magic{ + {10, 0xcccccccccccccccd, 0, 3}, + {100, 0xa3d70a3d70a3d70b, 1, 5}, + {1000, 0x83126e978d4fdf3c, 1, 8}, + {10000, 0xd1b71758e219652c, 0, 13}, + {100000, 0xa7c5ac471b478424, 1, 15}, + {1000000, 0x8637bd05af6c69b6, 0, 19}, + {10000000, 0xd6bf94d5e57a42bd, 1, 22}, + {100000000, 0xabcc77118461cefd, 0, 26}, + {1000000000, 0x89705f4136b4a598, 1, 28}, + {10000000000, 0xdbe6fecebdedd5bf, 0, 33}, + {100000000000, 0xafebff0bcb24aaff, 0, 36}, + {1000000000000, 0x8cbccc096f5088cc, 0, 39}, + {10000000000000, 0xe12e13424bb40e14, 1, 42}, + {100000000000000, 0xb424dc35095cd810, 1, 45}, + {1000000000000000, 0x901d7cf73ab0acda, 1, 48}, + {10000000000000000, 0xe69594bec44de15c, 1, 52}, + {100000000000000000, 0xb877aa3236a4b44a, 1, 55}, + {1000000000000000000, 0x9392ee8e921d5d08, 1, 58}, + {10000000000000000000, 0xec1e4a7db69561a6, 1, 62}, +} + +var pow10DivTab32 = [...]magic{ + {10, 0xcccccccd, 0, 3}, + {100, 0xa3d70a3e, 1, 5}, + {1000, 0x83126e98, 0, 9}, + {10000, 0xd1b71759, 0, 13}, + {100000, 0xa7c5ac48, 1, 15}, + {1000000, 0x8637bd06, 0, 19}, + {10000000, 0xd6bf94d6, 0, 23}, + {100000000, 0xabcc7712, 0, 26}, + {1000000000, 0x89705f42, 1, 28}, +} + +type magic struct { + d uint64 // divisor + m uint64 // multiplier + pre byte // pre-shift + post byte // post-shift +} + +func divisorPow10(n uint) magic { + if debugDecimal && n == 0 { + panic("divisorPow10: 10**0 is not a valid divisor") + } + if _W == 32 { + return pow10DivTab32[n-1] + } + return pow10DivTab64[n-1] +} + +func (m magic) div(n Word) (q, r Word) { + h, _ := bits.Mul(uint(n)>>m.pre, uint(m.m)) + q = Word(h) >> m.post + return q, n - q*Word(m.d) +} + //----------------------------------------------------------------------------- // Arithmetic primitives // @@ -231,12 +292,12 @@ func shl10VU_g(z, x []Word, s uint) (r Word) { if len(z) == 0 || len(x) == 0 { return } - d, m := pow10(_DW-s), pow10(s) + d, m := divisorPow10(_DW-s), pow10(s) var h, l Word - r, l = divWW(0, x[len(x)-1], d) + r, l = d.div(x[len(x)-1]) for i := len(z) - 1; i > 0; i-- { t := l - h, l = divWW(0, x[i-1], d) + h, l = d.div(x[i-1]) z[i] = t*m + h } z[0] = l * m @@ -255,11 +316,11 @@ func shr10VU_g(z, x []Word, s uint) (r Word) { } var h, l Word - d, m := pow10(s), pow10(_DW-s) - h, r = divWW(0, x[0], d) + d, m := divisorPow10(s), pow10(_DW-s) + h, r = d.div(x[0]) for i := 1; i < len(z) && i < len(x); i++ { t := h - h, l = divWW(0, x[i], d) + h, l = d.div(x[i]) z[i-1] = t + l*m } z[len(z)-1] = h @@ -304,9 +365,8 @@ func div10VWW_g(z, x []Word, y, xn Word) (r Word) { // This function uses the algorithm from "Division by invariant integers using // multiplication" by Torbjörn Granlund & Peter L. Montgomery. // -// See https://doi.org/10.1145/773473.178249, section 8, Dividing udword by uword. -// -// On 386 and amd64, this is over 2 times faster than calling bits.Div(n1, n0, _BD). +// See https://gmplib.org/~tege/divcnst-pldi94.pdf, section 8, Dividing udword +// by uword. // // In the article, some equations show an addition or subtraction of 2**N, which // is a no-op. In the comments below, these have been removed for the sake of diff --git a/dec_arith_amd64.s b/dec_arith_amd64.s index ab5dad2..1aaaf13 100644 --- a/dec_arith_amd64.s +++ b/dec_arith_amd64.s @@ -516,6 +516,7 @@ CE: // func shl10VU(z, x []Word, s uint) (c Word) TEXT ·shl10VU(SB),NOSPLIT,$0 + // TODO(db47h): unroll loop MOVQ z_len+8(FP), SI // i = z SUBQ $1, SI // i-- JL X8b // i < 0 (n <= 0) @@ -528,34 +529,53 @@ TEXT ·shl10VU(SB),NOSPLIT,$0 JEQ X8c // copy if s = 0 MOVQ $_DW, CX - LEAQ ·pow10tab(SB), DI + LEAQ ·pow10DivTab64(SB), DI SUBQ BX, CX - MOVQ 0(DI)(BX*8), R12 // m = pow10(s) - MOVQ 0(DI)(CX*8), R11 // d = pow10(_DW-s) + LEAQ -3(BX)(BX*2), BX + LEAQ -3(CX)(CX*2), CX - XORQ DX, DX + MOVQ 0(DI)(BX*8), R12 // m = pow10DivTab64(s-1).d + LEAQ 0(DI)(CX*8), R11 // d = &pow10DivTab64(_DW-s-1) + + // r, l = x[len(x)-1] / d + MOVQ 0(R11), R13 // d + MOVQ 8(R11), R14 // m MOVQ 0(R8)(SI*8), AX - DIVQ R11 // AX:DX = x[len(x)-1] / d - MOVQ AX, BX // save r + MOVQ AX, BX // x[i] + MOVWLZX 16(R11), CX // post|pre + SHRQ CX, AX // x[i] >> pre + MULQ R14 // *m + RORW $8, CX // post MOVQ DX, AX + SHRQ CX, AX // r + MOVQ AX, c+56(FP) // save r + MULQ R13 // DX:AX = r*d + SUBQ AX, BX // l = x[i]-r*d + MOVQ BX, AX // AX = l TESTQ SI, SI JEQ X8a L8: MULQ R12 - MOVQ AX, CX // z[i] = l*m - XORQ DX, DX - MOVQ -8(R8)(SI*8), AX - DIVQ R11 // q, r = div(0, x[i-1], d) - ADDQ AX, CX // z[i] += q - MOVQ DX, AX - MOVQ CX, 0(R10)(SI*8) + MOVQ AX, R9 // z[i] = l*m + MOVQ -8(R8)(SI*8), AX + MOVQ AX, BX // x[i-1] + RORW $8, CX // pre + SHRQ CX, AX // x[i-1] >> pre + MULQ R14 // *m + RORW $8, CX // post + MOVQ DX, AX + SHRQ CX, AX // h + ADDQ AX, R9 // z[i] += h + MOVQ R9, 0(R10)(SI*8) + MULQ R13 // DX:AX = d*h + SUBQ AX, BX // l = x[i-1]-d*h + MOVQ BX, AX SUBQ $1, SI JG L8 X8a: MULQ R12 MOVQ AX, 0(R10)(SI*8) - MOVQ BX, c+56(FP) RET X8b: MOVQ $0, c+56(FP) @@ -570,6 +590,7 @@ X8c: // func shr10VU(z, x []Word, s uint) (c Word) TEXT ·shr10VU(SB),NOSPLIT,$0 + // TODO(db47h): implement constant division by multiplication, unroll loop. MOVQ z_len+8(FP), DI SUBQ $1, DI // n-- JL X9b // n < 0 (n <= 0) diff --git a/dec_arith_test.go b/dec_arith_test.go index fe51a71..24edb54 100644 --- a/dec_arith_test.go +++ b/dec_arith_test.go @@ -519,9 +519,17 @@ func BenchmarkAddMul10VVW(b *testing.B) { } func BenchmarkShl10VU(b *testing.B) { - x := dec(rnd10V(1000)) - z := dec(nil).make(1000) + x := dec(rnd10V(100000)) + z := dec(nil).make(100000) for i := 0; i < b.N; i++ { z.shl(x, 8) } } + +func BenchmarkShr10VU(b *testing.B) { + x := dec(rnd10V(100000)) + z := dec(nil).make(100000) + for i := 0; i < b.N; i++ { + z.shr(x, 8) + } +} diff --git a/dec_test.go b/dec_test.go index 1f5e33c..6cf1c0b 100644 --- a/dec_test.go +++ b/dec_test.go @@ -745,6 +745,16 @@ func TestGoIssue37499(t *testing.T) { } } +var benchU uint + +func BenchmarkDecDigit(b *testing.B) { + x := dec(rnd10V(100)) + for i := 0; i < b.N; i++ { + n := i % (100 * _DW) + benchU = x.digit(uint(n)) + } +} + // TODO(bd47h): move this to decimal_test func benchmarkDecimalDiv(b *testing.B, aSize, bSize int) { aa := rndDec1(aSize)