Skip to content

Commit

Permalink
dec/arith: boost shl10VU/shr10VU performance
Browse files Browse the repository at this point in the history
These functions now use the "constant division by multiplication"
algorithm. This is made possible by the small range of possible values
for the shift count (18 at most) and a precoomputed table of magic
numbers (uses the same algorithm as the Go compiler). The amd64
assembler version of shl10VU now has a slight performance gain over the
go version (about 5% less time per op).

The performance boost is significative:

    name     old time/op  new time/op  delta
    Shl10VU  9.06µs ± 0%  3.32µs ± 2%  -63.36%  (p=0.008 n=5+5)

This improves (*Decimal).Add/Sub performance as well (when the mantissae
of the operands need to be aligned) and Decimal -> Int(64) conversions.
  • Loading branch information
db47h committed May 25, 2020
1 parent 39c92df commit de46678
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 25 deletions.
78 changes: 69 additions & 9 deletions dec_arith.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,67 @@ func decMaxPow(b Word) (p Word, n int) {
return Word(decMaxPow64[i]), int(decMaxPow64[i+1])
}

// pow10DivTab64 contains the "magic" numbers for fast division by 10**n
// where 1 <= n < 19, x / 10**n = ((x >> pre) * m) >> (_W + post).
// See https://gmplib.org/~tege/divcnst-pldi94.pdf
// generated using Go's src/cmd/compile/internal/ssa/magic.go and rewritegeneric.go rules
var pow10DivTab64 = [...]magic{
{10, 0xcccccccccccccccd, 0, 3},
{100, 0xa3d70a3d70a3d70b, 1, 5},
{1000, 0x83126e978d4fdf3c, 1, 8},
{10000, 0xd1b71758e219652c, 0, 13},
{100000, 0xa7c5ac471b478424, 1, 15},
{1000000, 0x8637bd05af6c69b6, 0, 19},
{10000000, 0xd6bf94d5e57a42bd, 1, 22},
{100000000, 0xabcc77118461cefd, 0, 26},
{1000000000, 0x89705f4136b4a598, 1, 28},
{10000000000, 0xdbe6fecebdedd5bf, 0, 33},
{100000000000, 0xafebff0bcb24aaff, 0, 36},
{1000000000000, 0x8cbccc096f5088cc, 0, 39},
{10000000000000, 0xe12e13424bb40e14, 1, 42},
{100000000000000, 0xb424dc35095cd810, 1, 45},
{1000000000000000, 0x901d7cf73ab0acda, 1, 48},
{10000000000000000, 0xe69594bec44de15c, 1, 52},
{100000000000000000, 0xb877aa3236a4b44a, 1, 55},
{1000000000000000000, 0x9392ee8e921d5d08, 1, 58},
{10000000000000000000, 0xec1e4a7db69561a6, 1, 62},
}

var pow10DivTab32 = [...]magic{
{10, 0xcccccccd, 0, 3},
{100, 0xa3d70a3e, 1, 5},
{1000, 0x83126e98, 0, 9},
{10000, 0xd1b71759, 0, 13},
{100000, 0xa7c5ac48, 1, 15},
{1000000, 0x8637bd06, 0, 19},
{10000000, 0xd6bf94d6, 0, 23},
{100000000, 0xabcc7712, 0, 26},
{1000000000, 0x89705f42, 1, 28},
}

type magic struct {
d uint64 // divisor
m uint64 // multiplier
pre byte // pre-shift
post byte // post-shift
}

func divisorPow10(n uint) magic {
if debugDecimal && n == 0 {
panic("divisorPow10: 10**0 is not a valid divisor")
}
if _W == 32 {
return pow10DivTab32[n-1]
}
return pow10DivTab64[n-1]
}

func (m magic) div(n Word) (q, r Word) {
h, _ := bits.Mul(uint(n)>>m.pre, uint(m.m))
q = Word(h) >> m.post
return q, n - q*Word(m.d)
}

//-----------------------------------------------------------------------------
// Arithmetic primitives
//
Expand Down Expand Up @@ -231,12 +292,12 @@ func shl10VU_g(z, x []Word, s uint) (r Word) {
if len(z) == 0 || len(x) == 0 {
return
}
d, m := pow10(_DW-s), pow10(s)
d, m := divisorPow10(_DW-s), pow10(s)
var h, l Word
r, l = divWW(0, x[len(x)-1], d)
r, l = d.div(x[len(x)-1])
for i := len(z) - 1; i > 0; i-- {
t := l
h, l = divWW(0, x[i-1], d)
h, l = d.div(x[i-1])
z[i] = t*m + h
}
z[0] = l * m
Expand All @@ -255,11 +316,11 @@ func shr10VU_g(z, x []Word, s uint) (r Word) {
}

var h, l Word
d, m := pow10(s), pow10(_DW-s)
h, r = divWW(0, x[0], d)
d, m := divisorPow10(s), pow10(_DW-s)
h, r = d.div(x[0])
for i := 1; i < len(z) && i < len(x); i++ {
t := h
h, l = divWW(0, x[i], d)
h, l = d.div(x[i])
z[i-1] = t + l*m
}
z[len(z)-1] = h
Expand Down Expand Up @@ -304,9 +365,8 @@ func div10VWW_g(z, x []Word, y, xn Word) (r Word) {
// This function uses the algorithm from "Division by invariant integers using
// multiplication" by Torbjörn Granlund & Peter L. Montgomery.
//
// See https://doi.org/10.1145/773473.178249, section 8, Dividing udword by uword.
//
// On 386 and amd64, this is over 2 times faster than calling bits.Div(n1, n0, _BD).
// See https://gmplib.org/~tege/divcnst-pldi94.pdf, section 8, Dividing udword
// by uword.
//
// In the article, some equations show an addition or subtraction of 2**N, which
// is a no-op. In the comments below, these have been removed for the sake of
Expand Down
49 changes: 35 additions & 14 deletions dec_arith_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ CE:

// func shl10VU(z, x []Word, s uint) (c Word)
TEXT ·shl10VU(SB),NOSPLIT,$0
// TODO(db47h): unroll loop
MOVQ z_len+8(FP), SI // i = z
SUBQ $1, SI // i--
JL X8b // i < 0 (n <= 0)
Expand All @@ -528,34 +529,53 @@ TEXT ·shl10VU(SB),NOSPLIT,$0
JEQ X8c // copy if s = 0

MOVQ $_DW, CX
LEAQ ·pow10tab(SB), DI
LEAQ ·pow10DivTab64(SB), DI
SUBQ BX, CX
MOVQ 0(DI)(BX*8), R12 // m = pow10(s)
MOVQ 0(DI)(CX*8), R11 // d = pow10(_DW-s)
LEAQ -3(BX)(BX*2), BX
LEAQ -3(CX)(CX*2), CX

XORQ DX, DX
MOVQ 0(DI)(BX*8), R12 // m = pow10DivTab64(s-1).d
LEAQ 0(DI)(CX*8), R11 // d = &pow10DivTab64(_DW-s-1)

// r, l = x[len(x)-1] / d
MOVQ 0(R11), R13 // d
MOVQ 8(R11), R14 // m
MOVQ 0(R8)(SI*8), AX
DIVQ R11 // AX:DX = x[len(x)-1] / d
MOVQ AX, BX // save r
MOVQ AX, BX // x[i]
MOVWLZX 16(R11), CX // post|pre
SHRQ CX, AX // x[i] >> pre
MULQ R14 // *m
RORW $8, CX // post
MOVQ DX, AX
SHRQ CX, AX // r
MOVQ AX, c+56(FP) // save r
MULQ R13 // DX:AX = r*d
SUBQ AX, BX // l = x[i]-r*d
MOVQ BX, AX // AX = l

TESTQ SI, SI
JEQ X8a
L8:
MULQ R12
MOVQ AX, CX // z[i] = l*m
XORQ DX, DX
MOVQ -8(R8)(SI*8), AX
DIVQ R11 // q, r = div(0, x[i-1], d)
ADDQ AX, CX // z[i] += q
MOVQ DX, AX
MOVQ CX, 0(R10)(SI*8)
MOVQ AX, R9 // z[i] = l*m
MOVQ -8(R8)(SI*8), AX
MOVQ AX, BX // x[i-1]
RORW $8, CX // pre
SHRQ CX, AX // x[i-1] >> pre
MULQ R14 // *m
RORW $8, CX // post
MOVQ DX, AX
SHRQ CX, AX // h
ADDQ AX, R9 // z[i] += h
MOVQ R9, 0(R10)(SI*8)
MULQ R13 // DX:AX = d*h
SUBQ AX, BX // l = x[i-1]-d*h
MOVQ BX, AX
SUBQ $1, SI
JG L8
X8a:
MULQ R12
MOVQ AX, 0(R10)(SI*8)
MOVQ BX, c+56(FP)
RET
X8b:
MOVQ $0, c+56(FP)
Expand All @@ -570,6 +590,7 @@ X8c:

// func shr10VU(z, x []Word, s uint) (c Word)
TEXT ·shr10VU(SB),NOSPLIT,$0
// TODO(db47h): implement constant division by multiplication, unroll loop.
MOVQ z_len+8(FP), DI
SUBQ $1, DI // n--
JL X9b // n < 0 (n <= 0)
Expand Down
12 changes: 10 additions & 2 deletions dec_arith_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -519,9 +519,17 @@ func BenchmarkAddMul10VVW(b *testing.B) {
}

func BenchmarkShl10VU(b *testing.B) {
x := dec(rnd10V(1000))
z := dec(nil).make(1000)
x := dec(rnd10V(100000))
z := dec(nil).make(100000)
for i := 0; i < b.N; i++ {
z.shl(x, 8)
}
}

func BenchmarkShr10VU(b *testing.B) {
x := dec(rnd10V(100000))
z := dec(nil).make(100000)
for i := 0; i < b.N; i++ {
z.shr(x, 8)
}
}
10 changes: 10 additions & 0 deletions dec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,16 @@ func TestGoIssue37499(t *testing.T) {
}
}

var benchU uint

func BenchmarkDecDigit(b *testing.B) {
x := dec(rnd10V(100))
for i := 0; i < b.N; i++ {
n := i % (100 * _DW)
benchU = x.digit(uint(n))
}
}

// TODO(bd47h): move this to decimal_test
func benchmarkDecimalDiv(b *testing.B, aSize, bSize int) {
aa := rndDec1(aSize)
Expand Down

0 comments on commit de46678

Please sign in to comment.