From 5821f1083ea8a07f9b0c13a29afbeccd89466914 Mon Sep 17 00:00:00 2001 From: Denis Bernard Date: Thu, 14 May 2020 02:21:15 +0200 Subject: [PATCH] dec/amd64: implement add10VW/sub10VW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit name go time/op asm time/op delta Sub10VW/1 10.1ns ± 2% 5.6ns ± 1% -45.03% Sub10VW/2 11.7ns ± 1% 6.1ns ± 1% -48.01% Sub10VW/3 13.2ns ± 2% 8.1ns ± 0% -39.03% Sub10VW/4 14.6ns ± 0% 8.4ns ± 0% -42.77% Sub10VW/5 14.9ns ± 1% 8.8ns ± 0% -40.66% Sub10VW/10 15.1ns ± 0% 10.6ns ± 3% -30.12% Sub10VW/100 116ns ± 1% 45ns ± 6% -61.62% Sub10VW/1000 1.22µs ± 1% 0.54µs ±14% -55.85% Sub10VW/10000 11.9µs ± 0% 5.4µs ± 1% -54.85% Sub10VW/100000 122µs ± 0% 62µs ± 0% -49.45% The Go implementation can check if the carry is zero and switch to copy() for free (no need for separate add10VW and add10VWlarge variants). In the assembly version, I chose to keep a single implementation of the function and switch to a memcpy whenever the carry is 0 (checked every 4 Words). Considering that the carry is almost always 0, this logic is the likely cause of the smaller speedup between 5 and 15 Words. Also, past 1000 Words, the performance gains slowly shrink. The very likely cause is the simplistic memcpy implementation vs. runtime·memmove. 
--- arith_test.go | 2 +- dec.go | 4 +- dec_arith.go | 26 ++-- dec_arith_amd64.s | 367 +++++++++++++++++++++++++++++---------------- dec_arith_decl.go | 10 +- dec_arith_test.go | 373 +++++++++++++++++++++++++--------------------- decimal_test.go | 4 +- go.mod | 2 - testdata/bench | 12 -- testdata/benchasm | 14 ++ testdata/benchgit | 23 +-- 11 files changed, 494 insertions(+), 343 deletions(-) delete mode 100755 testdata/bench create mode 100755 testdata/benchasm diff --git a/arith_test.go b/arith_test.go index 4b5a1f0..00f39c9 100644 --- a/arith_test.go +++ b/arith_test.go @@ -14,7 +14,7 @@ import ( var isRaceBuilder bool func init() { - flag.BoolVar(&isRaceBuilder, "rb", true, "race builder") + flag.BoolVar(&isRaceBuilder, "rb", false, "race builder") } type funVV func(z, x, y []Word) (c Word) diff --git a/dec.go b/dec.go index 5962c58..6c4a05d 100644 --- a/dec.go +++ b/dec.go @@ -64,7 +64,7 @@ func (x dec) digit(i uint) uint { return 0 } // 0 <= j < len(x) - return (uint(x[j]) / pow10(i)) % 10 + return uint(x[j]/pow10(i)) % 10 } func (z dec) make(n int) dec { @@ -166,7 +166,7 @@ func (x dec) sticky(i uint) uint { return 1 } } - if uint(x[j])%pow10(i) != 0 { + if x[j]%pow10(i) != 0 { return 1 } return 0 diff --git a/dec_arith.go b/dec_arith.go index c1d6d29..75a3338 100644 --- a/dec_arith.go +++ b/dec_arith.go @@ -25,7 +25,7 @@ var pow10s = [...]uint64{ 10000000000000000, 100000000000000000, 1000000000000000000, 10000000000000000000, } -func pow10(n uint) uint { return uint(pow10s[n]) } +func pow10(n uint) Word { return Word(pow10s[n]) } var maxDigits = [...]uint{ 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, @@ -54,7 +54,7 @@ func decDigits64(x uint64) (n uint) { func decDigits32(x uint) (n uint) { n = maxDigits[bits.Len(x)] - if x < pow10(n-1) { + if x < uint(pow10(n-1)) { n-- } return n @@ -148,12 +148,9 @@ func div10WW_g(u1, u0, v Word) (q, r Word) { func add10WWW_g(x, y, cIn Word) (s, c Word) { r, cc := bits.Add(uint(x), uint(y), uint(cIn)) - // if cc != 0 
|| r > _DB-1 { - // cc = 1 - // r -= _DB - // } - // c1 := uint(int(r-_DB) >> 63) var c1 uint + // this simple if statement is compiled without jumps + // at least on amd64. if r >= _DB { c1 = 1 } @@ -187,7 +184,10 @@ func sub10VV_g(z, x, y []Word) (c Word) { } // add10VW adds y to x. The resulting carry c is either 0 or 1. -func add10VW_g(z, x dec, y Word) (c Word) { +func add10VW_g(z, x []Word, y Word) (c Word) { + if len(z) == 0 { + return y + } z[0], c = add10WWW_g(x[0], y, 0) // propagate carry for i := 1; i < len(z) && i < len(x); i++ { @@ -219,7 +219,7 @@ func sub10VW_g(z, x []Word, y Word) (c Word) { } // shl10VU sets z to x*(10**s), s < _WD -func shl10VU_g(z, x dec, s uint) (r Word) { +func shl10VU_g(z, x []Word, s uint) (r Word) { if s == 0 { copy(z, x) return @@ -227,7 +227,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) { if len(z) == 0 || len(x) == 0 { return } - d, m := Word(pow10(_DW-s)), Word(pow10(s)) + d, m := pow10(_DW-s), pow10(s) var h, l Word r, l = divWW(0, x[len(x)-1], d) for i := len(z) - 1; i > 0; i-- { @@ -241,7 +241,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) { } // shr10VU sets z to x/(10**s) -func shr10VU_g(z, x dec, s uint) (r Word) { +func shr10VU_g(z, x []Word, s uint) (r Word) { if s == 0 { copy(z, x) return @@ -251,7 +251,7 @@ func shr10VU_g(z, x dec, s uint) (r Word) { } var h, l Word - d, m := Word(pow10(s)), Word(pow10(_DW-s)) + d, m := pow10(s), pow10(_DW-s) h, r = divWW(0, x[0], Word(d)) for i := 1; i < len(z) && i < len(x); i++ { t := h @@ -259,7 +259,7 @@ func shr10VU_g(z, x dec, s uint) (r Word) { z[i-1] = t + l*m } z[len(z)-1] = h - return r + return r * m } func mulAdd10VWW_g(z, x []Word, y, r Word) (c Word) { diff --git a/dec_arith_amd64.s b/dec_arith_amd64.s index c095405..2d3d685 100644 --- a/dec_arith_amd64.s +++ b/dec_arith_amd64.s @@ -127,8 +127,7 @@ U1: // n >= 0 SBBQ BX, BX ORQ BX, CX LEAQ 1(DX), AX - MOVQ CX, BX - ANDQ BX, AX + ANDQ CX, AX SUBQ AX, R11 ADDQ CX, CX // restore CF ADCQ 8(R9)(SI*8), R12 @@ -137,8 
+136,7 @@ U1: // n >= 0 SBBQ BX, BX ORQ BX, CX LEAQ 1(DX), AX - MOVQ CX, BX - ANDQ BX, AX + ANDQ CX, AX SUBQ AX, R12 ADDQ CX, CX // restore CF ADCQ 16(R9)(SI*8), R13 @@ -147,8 +145,7 @@ U1: // n >= 0 SBBQ BX, BX ORQ BX, CX LEAQ 1(DX), AX - MOVQ CX, BX - ANDQ BX, AX + ANDQ CX, AX SUBQ AX, R13 ADDQ CX, CX // restore CF ADCQ 24(R9)(SI*8), R14 @@ -157,8 +154,7 @@ U1: // n >= 0 SBBQ BX, BX ORQ BX, CX LEAQ 1(DX), AX - MOVQ CX, BX - ANDQ BX, AX + ANDQ CX, AX SUBQ AX, R14 MOVQ R11, 0(R10)(SI*8) MOVQ R12, 8(R10)(SI*8) @@ -181,20 +177,19 @@ L1: // n > 0 SBBQ BX, BX ORQ BX, CX LEAQ 1(DX), AX - MOVQ CX, BX - ANDQ BX, AX + ANDQ CX, AX SUBQ AX, R11 MOVQ R11, 0(R10)(SI*8) - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- + INCQ SI // i++ + DECQ DI // n-- JG L1 // if n > 0 goto L1 E1: NEGQ CX MOVQ CX, c+72(FP) // return c RET -// func add10VV(z, x, y []Word) (c Word) +// func sub10VV(z, x, y []Word) (c Word) TEXT ·sub10VV(SB),NOSPLIT,$0 MOVQ z_len+8(FP), DI MOVQ x+24(FP), R8 @@ -261,126 +256,244 @@ L2: // n > 0 ADDQ AX, R11 MOVQ R11, 0(R10)(SI*8) - ADDQ $1, SI // i++ - SUBQ $1, DI // n-- + INCQ SI // i++ + DECQ DI // n-- JG L2 // if n > 0 goto L2 E2: NEGQ CX MOVQ CX, c+72(FP) // return c RET -// // func addVW(z, x []Word, y Word) (c Word) -// TEXT ·addVW(SB),NOSPLIT,$0 -// MOVQ z_len+8(FP), DI -// CMPQ DI, $32 -// JG large -// MOVQ x+24(FP), R8 -// MOVQ y+48(FP), CX // c = y -// MOVQ z+0(FP), R10 -// -// MOVQ $0, SI // i = 0 -// -// // s/JL/JMP/ below to disable the unrolled loop -// SUBQ $4, DI // n -= 4 -// JL V3 // if n < 4 goto V3 -// -// U3: // n >= 0 -// // regular loop body unrolled 4x -// MOVQ 0(R8)(SI*8), R11 -// MOVQ 8(R8)(SI*8), R12 -// MOVQ 16(R8)(SI*8), R13 -// MOVQ 24(R8)(SI*8), R14 -// ADDQ CX, R11 -// ADCQ $0, R12 -// ADCQ $0, R13 -// ADCQ $0, R14 -// SBBQ CX, CX // save CF -// NEGQ CX -// MOVQ R11, 0(R10)(SI*8) -// MOVQ R12, 8(R10)(SI*8) -// MOVQ R13, 16(R10)(SI*8) -// MOVQ R14, 24(R10)(SI*8) -// -// ADDQ $4, SI // i += 4 -// SUBQ $4, DI // n -= 4 -// JGE U3 // if n >= 0 
goto U3 -// -// V3: ADDQ $4, DI // n += 4 -// JLE E3 // if n <= 0 goto E3 -// -// L3: // n > 0 -// ADDQ 0(R8)(SI*8), CX -// MOVQ CX, 0(R10)(SI*8) -// SBBQ CX, CX // save CF -// NEGQ CX -// -// ADDQ $1, SI // i++ -// SUBQ $1, DI // n-- -// JG L3 // if n > 0 goto L3 -// -// E3: MOVQ CX, c+56(FP) // return c -// RET -// large: -// JMP ·addVWlarge(SB) -// -// -// // func subVW(z, x []Word, y Word) (c Word) -// // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) -// TEXT ·subVW(SB),NOSPLIT,$0 -// MOVQ z_len+8(FP), DI -// CMPQ DI, $32 -// JG large -// MOVQ x+24(FP), R8 -// MOVQ y+48(FP), CX // c = y -// MOVQ z+0(FP), R10 -// -// MOVQ $0, SI // i = 0 -// -// // s/JL/JMP/ below to disable the unrolled loop -// SUBQ $4, DI // n -= 4 -// JL V4 // if n < 4 goto V4 -// -// U4: // n >= 0 -// // regular loop body unrolled 4x -// MOVQ 0(R8)(SI*8), R11 -// MOVQ 8(R8)(SI*8), R12 -// MOVQ 16(R8)(SI*8), R13 -// MOVQ 24(R8)(SI*8), R14 -// SUBQ CX, R11 -// SBBQ $0, R12 -// SBBQ $0, R13 -// SBBQ $0, R14 -// SBBQ CX, CX // save CF -// NEGQ CX -// MOVQ R11, 0(R10)(SI*8) -// MOVQ R12, 8(R10)(SI*8) -// MOVQ R13, 16(R10)(SI*8) -// MOVQ R14, 24(R10)(SI*8) -// -// ADDQ $4, SI // i += 4 -// SUBQ $4, DI // n -= 4 -// JGE U4 // if n >= 0 goto U4 -// -// V4: ADDQ $4, DI // n += 4 -// JLE E4 // if n <= 0 goto E4 -// -// L4: // n > 0 -// MOVQ 0(R8)(SI*8), R11 -// SUBQ CX, R11 -// MOVQ R11, 0(R10)(SI*8) -// SBBQ CX, CX // save CF -// NEGQ CX -// -// ADDQ $1, SI // i++ -// SUBQ $1, DI // n-- -// JG L4 // if n > 0 goto L4 -// -// E4: MOVQ CX, c+56(FP) // return c -// RET -// large: -// JMP ·subVWlarge(SB) -// -// +// func add10VW(z, x []Word, y Word) (c Word) +TEXT ·add10VW(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + MOVQ x+24(FP), R8 + MOVQ y+48(FP), CX // c = y + MOVQ z+0(FP), R10 + + MOVQ $0, SI // i = 0 + MOVQ $_DB, DX + + // Once we start looping, we won't handle the hardware carry since + // x[i] < _DB, so x[i] + 1 < 1<<64-1 always. 
+ // This still needs to be handled for the first element. + + DECQ DI // n-- + JL E3 // abort if n < 0 + ADDQ 0(R8)(SI*8), CX + LEAQ -1(DX), AX + SBBQ BX, BX + CMPQ AX, CX + SBBQ AX, AX + ORQ AX, BX + MOVQ DX, AX + ANDQ BX, AX + SUBQ AX, CX + NEGQ BX // convert to C = 0/1 + MOVQ CX, 0(R10)(SI*8) + MOVQ BX, CX // save c + LEAQ 1(SI), SI // i++ + JG T3 // if c != 0 propagate + SUBQ $4, DI + JL CV3 // if n < 4 goto CV3 + CMPQ R8, R10 + JEQ E3 // don't copy if &x[0] == &z[0] + JMP CU3 + +T3: + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V3 // if n < 4 goto V3 + +U3: // n >= 0 + // regular loop body unrolled 4x + ADDQ 0(R8)(SI*8), CX + CMPQ CX, DX + SBBQ BX, BX + ANDQ BX, CX + MOVQ CX, 0(R10)(SI*8) + LEAQ 1(BX), CX + ADDQ 8(R8)(SI*8), CX + CMPQ CX, DX + SBBQ BX, BX + ANDQ BX, CX + MOVQ CX, 8(R10)(SI*8) + LEAQ 1(BX), CX + ADDQ 16(R8)(SI*8), CX + CMPQ CX, DX + SBBQ BX, BX + ANDQ BX, CX + MOVQ CX, 16(R10)(SI*8) + LEAQ 1(BX), CX + ADDQ 24(R8)(SI*8), CX + CMPQ CX, DX + SBBQ BX, BX + ANDQ BX, CX + MOVQ CX, 24(R10)(SI*8) + LEAQ 1(BX), CX + TESTQ BX, BX + JL C3 + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U3 // if n >= 0 goto U3 + +V3: ADDQ $4, DI // n += 4 + JLE E3 // if n <= 0 goto E3 + +L3: // n > 0 + ADDQ 0(R8)(SI*8), CX + CMPQ CX, DX + SBBQ BX, BX // BX = _DB > CX ? 
-1 : 0 + ANDQ BX, CX // sets CX to 0 if CX >= _DB + MOVQ CX, 0(R10)(SI*8) + LEAQ 1(BX), CX // eqv to NOTQ BX, NEGQ BX, MOVQ BX CX + + INCQ SI // i++ + DECQ DI // n-- + JG L3 // if n > 0 goto L3 + +E3: MOVQ CX, c+56(FP) // return c + RET + +C3: // memcpy + CMPQ R8, R10 // don't copy if &x[0] == &z[0] + JEQ CE3 + ADDQ $4, SI + SUBQ $4, DI + JL CV3 + +CU3: // n >= 4 + MOVQ 0(R8)(SI*8), AX + MOVQ 8(R8)(SI*8), BX + MOVQ 16(R8)(SI*8), CX + MOVQ 24(R8)(SI*8), DX + MOVQ AX, 0(R10)(SI*8) + MOVQ BX, 8(R10)(SI*8) + MOVQ CX, 16(R10)(SI*8) + MOVQ DX, 24(R10)(SI*8) + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE CU3 // if n >= 0 goto C3 +CV3: + ADDQ $4, DI + JLE CE3 +CL3: + MOVQ 0(R8)(SI*8), AX + MOVQ AX, 0(R10)(SI*8) + INCQ SI + DECQ DI + JG CL3 +CE3: + MOVQ $0, c+56(FP) + RET + +// func sub10VW(z, x []Word, y Word) (c Word) +// (same as add10VW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) +TEXT ·sub10VW(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + MOVQ x+24(FP), R8 + MOVQ y+48(FP), CX // c = y + MOVQ z+0(FP), R10 + + XORQ SI, SI // i = 0 + MOVQ $_DB, DX + + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V4 // if n < 4 goto V4 + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVQ 0(R8)(SI*8), BX + SUBQ CX, BX + SBBQ CX, CX + MOVQ DX, AX + ANDQ CX, AX + ADDQ AX, BX + MOVQ BX, 0(R10)(SI*8) + NEGQ CX + MOVQ 8(R8)(SI*8), BX + SUBQ CX, BX + SBBQ CX, CX + MOVQ DX, AX + ANDQ CX, AX + ADDQ AX, BX + MOVQ BX, 8(R10)(SI*8) + NEGQ CX + MOVQ 16(R8)(SI*8), BX + SUBQ CX, BX + SBBQ CX, CX + MOVQ DX, AX + ANDQ CX, AX + ADDQ AX, BX + MOVQ BX, 16(R10)(SI*8) + NEGQ CX + MOVQ 24(R8)(SI*8), BX + SUBQ CX, BX + SBBQ CX, CX + MOVQ DX, AX + ANDQ CX, AX + ADDQ AX, BX + MOVQ BX, 24(R10)(SI*8) + NEGQ CX + JCC C4 + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U4 // if n >= 0 goto U4 + +V4: ADDQ $4, DI // n += 4 + JLE E4 // if n <= 0 goto E4 + +L4: // n > 0 + MOVQ 0(R8)(SI*8), R11 + SUBQ CX, R11 + SBBQ CX, CX + MOVQ DX, AX + ANDQ CX, AX + ADDQ AX, 
R11 + NEGQ CX + MOVQ R11, 0(R10)(SI*8) + + INCQ SI // i++ + DECQ DI // n-- + JG L4 // if n > 0 goto L4 + +E4: MOVQ CX, c+56(FP) // return c + RET + +C4: // memcpy + CMPQ R8, R10 // don't copy if &x[0] == &z[0] + JEQ CE4 + ADDQ $4, SI + SUBQ $4, DI + JL CV4 +CU4: // n >= 4 + MOVQ 0(R8)(SI*8), AX + MOVQ 8(R8)(SI*8), BX + MOVQ 16(R8)(SI*8), CX + MOVQ 24(R8)(SI*8), DX + MOVQ AX, 0(R10)(SI*8) + MOVQ BX, 8(R10)(SI*8) + MOVQ CX, 16(R10)(SI*8) + MOVQ DX, 24(R10)(SI*8) + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE CU4 // if n >= 0 goto C4 +CV4: + ADDQ $4, DI + JLE CE4 +CL4: + MOVQ 0(R8)(SI*8), AX + MOVQ AX, 0(R10)(SI*8) + INCQ SI + DECQ DI + JG CL4 +CE4: + MOVQ $0, c+56(FP) + RET + // // func shlVU(z, x []Word, s uint) (c Word) // TEXT ·shlVU(SB),NOSPLIT,$0 // MOVQ z_len+8(FP), BX // i = z diff --git a/dec_arith_decl.go b/dec_arith_decl.go index 017ff7e..d7bc30b 100644 --- a/dec_arith_decl.go +++ b/dec_arith_decl.go @@ -13,15 +13,11 @@ func div10W(n1, n0 Word) (q, r Word) func sub10VV(z, x, y []Word) (c Word) -// Not implemented yet +func add10VW(z, x []Word, y Word) (c Word) -func add10VW(z, x []Word, y Word) (c Word) { - return add10VW_g(z, x, y) -} +func sub10VW(z, x []Word, y Word) (c Word) -func sub10VW(z, x []Word, y Word) (c Word) { - return sub10VW_g(z, x, y) -} +// Not implemented yet func shl10VU(z, x []Word, s uint) (c Word) { return shl10VU_g(z, x, s) diff --git a/dec_arith_test.go b/dec_arith_test.go index aee6a74..5ee69c6 100644 --- a/dec_arith_test.go +++ b/dec_arith_test.go @@ -153,194 +153,235 @@ func BenchmarkDecSub10VV(b *testing.B) { } } -// TODO(db47h): complete port of the tests +type fun10VW func(z, x []Word, y Word) (c Word) +type arg10VW struct { + z, x dec + y Word + c Word +} -// type funVW func(z, x []Word, y Word) (c Word) -// type argVW struct { -// z, x nat -// y Word -// c Word -// } +var sum10VW = []arg10VW{ + {}, + {nil, nil, 2, 2}, + {dec{0}, dec{0}, 0, 0}, + {dec{1}, dec{0}, 1, 0}, + {dec{1}, dec{1}, 0, 0}, + {dec{0}, dec{_DMax}, 
1, 1}, + {dec{0, 0, 0, 0, 0}, dec{_DMax, _DMax, _DMax, _DMax, _DMax}, 1, 1}, + {dec{585}, dec{314}, 271, 0}, +} -// var sumVW = []argVW{ -// {}, -// {nil, nil, 2, 2}, -// {nat{0}, nat{0}, 0, 0}, -// {nat{1}, nat{0}, 1, 0}, -// {nat{1}, nat{1}, 0, 0}, -// {nat{0}, nat{_M}, 1, 1}, -// {nat{0, 0, 0, 0}, nat{_M, _M, _M, _M}, 1, 1}, -// {nat{585}, nat{314}, 271, 0}, -// } +var lsh10VW = []arg10VW{ + {}, + {dec{0}, dec{0}, 0, 0}, + {dec{0}, dec{0}, 1, 0}, + {dec{0}, dec{0}, 7, 0}, -// var lshVW = []argVW{ -// {}, -// {nat{0}, nat{0}, 0, 0}, -// {nat{0}, nat{0}, 1, 0}, -// {nat{0}, nat{0}, 20, 0}, + {dec{_DMax}, dec{_DMax}, 0, 0}, + {dec{_DMax - _DMax%pow10(1)}, dec{_DMax}, 1, _DMax / pow10(_DW-1)}, + {dec{_DMax - _DMax%pow10(7)}, dec{_DMax}, 7, _DMax / pow10(_DW-7)}, -// {nat{_M}, nat{_M}, 0, 0}, -// {nat{_M << 1 & _M}, nat{_M}, 1, 1}, -// {nat{_M << 20 & _M}, nat{_M}, 20, _M >> (_W - 20)}, + {dec{_DMax, _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 0, 0}, + {dec{_DMax - _DMax%pow10(1), _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 1, _DMax / pow10(_DW-1)}, + {dec{_DMax - _DMax%pow10(7), _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 7, _DMax / pow10(_DW-7)}, +} -// {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, -// {nat{_M << 1 & _M, _M, _M}, nat{_M, _M, _M}, 1, 1}, -// {nat{_M << 20 & _M, _M, _M}, nat{_M, _M, _M}, 20, _M >> (_W - 20)}, -// } +var rsh10VW = []arg10VW{ + {}, + {dec{0}, dec{0}, 0, 0}, + {dec{0}, dec{0}, 1, 0}, + {dec{0}, dec{0}, 7, 0}, -// var rshVW = []argVW{ -// {}, -// {nat{0}, nat{0}, 0, 0}, -// {nat{0}, nat{0}, 1, 0}, -// {nat{0}, nat{0}, 20, 0}, + {dec{_DMax}, dec{_DMax}, 0, 0}, + {dec{_DMax / pow10(1)}, dec{_DMax}, 1, _DMax - _DMax%pow10(_DW-1)}, + {dec{_DMax / pow10(7)}, dec{_DMax}, 7, _DMax - _DMax%pow10(_DW-7)}, -// {nat{_M}, nat{_M}, 0, 0}, -// {nat{_M >> 1}, nat{_M}, 1, _M << (_W - 1) & _M}, -// {nat{_M >> 20}, nat{_M}, 20, _M << (_W - 20) & _M}, + {dec{_DMax, _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 0, 0}, + {dec{_DMax, _DMax, _DMax / pow10(1)}, dec{_DMax, _DMax, 
_DMax}, 1, _DMax - _DMax%pow10(_DW-1)}, + {dec{_DMax, _DMax, _DMax / pow10(7)}, dec{_DMax, _DMax, _DMax}, 7, _DMax - _DMax%pow10(_DW-7)}, +} -// {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, -// {nat{_M, _M, _M >> 1}, nat{_M, _M, _M}, 1, _M << (_W - 1) & _M}, -// {nat{_M, _M, _M >> 20}, nat{_M, _M, _M}, 20, _M << (_W - 20) & _M}, -// } +func testFun10VW(t *testing.T, msg string, f fun10VW, a arg10VW) { + n := len(a.z) + z := make(nat, n+1) + c := f(z[:n], a.x, a.y) + for i, zi := range z[:n] { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %d; want %d", msg, a, i, zi, a.z[i]) + break + } + } + if c != a.c { + t.Errorf("%s%+v\n\tgot c = %d; want %d", msg, a, c, a.c) + } + // TestDecAddSub10VW sets x[len(x)] = some value + // check that it does not get copied. + if z[n] != 0 { + panic("memcpy overflow") + } +} -// func testFunVW(t *testing.T, msg string, f funVW, a argVW) { -// z := make(nat, len(a.z)) -// c := f(z, a.x, a.y) -// for i, zi := range z { -// if zi != a.z[i] { -// t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) -// break -// } -// } -// if c != a.c { -// t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) -// } -// } +func makeFun10VW(f func(z, x []Word, s uint) (c Word)) fun10VW { + return func(z, x []Word, s Word) (c Word) { + return f(z, x, uint(s)) + } +} -// func makeFunVW(f func(z, x []Word, s uint) (c Word)) funVW { -// return func(z, x []Word, s Word) (c Word) { -// return f(z, x, uint(s)) -// } -// } +func TestDecFun10VW(t *testing.T) { + for _, a := range sum10VW { + arg := a + testFun10VW(t, "add10VW_g", add10VW_g, arg) + testFun10VW(t, "add10VW", add10VW, arg) -// func TestFunVW(t *testing.T) { -// for _, a := range sumVW { -// arg := a -// testFunVW(t, "addVW_g", addVW_g, arg) -// testFunVW(t, "addVW", addVW, arg) + arg = arg10VW{a.x, a.z, a.y, a.c} + testFun10VW(t, "sub10VW_g", sub10VW_g, arg) + testFun10VW(t, "sub10VW", sub10VW, arg) + } -// arg = argVW{a.x, a.z, a.y, a.c} -// testFunVW(t, "subVW_g", 
subVW_g, arg) -// testFunVW(t, "subVW", subVW, arg) -// } + shl10VW_g := makeFun10VW(shl10VU_g) + shl10VW := makeFun10VW(shl10VU) + for _, a := range lsh10VW { + arg := a + testFun10VW(t, "shl10VU_g", shl10VW_g, arg) + testFun10VW(t, "shl10VU", shl10VW, arg) + } -// shlVW_g := makeFunVW(shlVU_g) -// shlVW := makeFunVW(shlVU) -// for _, a := range lshVW { -// arg := a -// testFunVW(t, "shlVU_g", shlVW_g, arg) -// testFunVW(t, "shlVU", shlVW, arg) -// } + shr10VW_g := makeFun10VW(shr10VU_g) + shr10VW := makeFun10VW(shr10VU) + for _, a := range rsh10VW { + arg := a + testFun10VW(t, "shr10VU_g", shr10VW_g, arg) + testFun10VW(t, "shr10VU", shr10VW, arg) + } +} -// shrVW_g := makeFunVW(shrVU_g) -// shrVW := makeFunVW(shrVU) -// for _, a := range rshVW { -// arg := a -// testFunVW(t, "shrVU_g", shrVW_g, arg) -// testFunVW(t, "shrVU", shrVW, arg) -// } -// } +// TestDecAddSub10VW tests proper behavior of assembly versions of add10Vw and +// sub10VW on edge cases. +func TestDecAddSub10VW(t *testing.T) { + for n := 0; n < 10; n++ { + for i := 0; i <= n; i++ { + z := dec(nil).make(n) + x := dec(nil).make(n + 1) + // Bounds check. testFun10VW will allocate a larger result slice and + // check that the higher Words are not overwritten. + x[n] = 42 + x = x[:n] + // test _DMax + 1 = 0 + // fill x[:j] with _DMax, z[:j] with 0 + for j := 0; j < i; j++ { + x[j] = _DMax + } + // fill x[j:] and z[:j] with random-ish data + for j := i; j < n; j++ { + x[j] = Word(j + 1) + z[j] = Word(j + 1) + // add carry + if j == i { + z[j]++ + } + } + c := Word(0) + if i == n { + c = 1 + } + testFun10VW(t, "add10VW_asm", add10VW, arg10VW{z, x, 1, c}) + testFun10VW(t, "sub10VW_asm", sub10VW, arg10VW{x, z, 1, c}) + } + } +} -// type argVU struct { -// d []Word // d is a Word slice, the input parameters x and z come from this array. -// l uint // l is the length of the input parameters x and z. -// xp uint // xp is the starting position of the input parameter x, x := d[xp:xp+l]. 
-// zp uint // zp is the starting position of the input parameter z, z := d[zp:zp+l]. -// s uint // s is the shift number. -// r []Word // r is the expected output result z. -// c Word // c is the expected return value. -// m string // message. -// } +type arg10VU struct { + d []Word // d is a Word slice, the input parameters x and z come from this array. + l uint // l is the length of the input parameters x and z. + xp uint // xp is the starting position of the input parameter x, x := d[xp:xp+l]. + zp uint // zp is the starting position of the input parameter z, z := d[zp:zp+l]. + s uint // s is the shift number. + r []Word // r is the expected output result z. + c Word // c is the expected return value. + m string // message. +} -// var argshlVU = []argVU{ -// // test cases for shlVU -// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0}, 7, 0, 0, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "complete overlap of shlVU"}, -// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by half of shlVU"}, -// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by 1 Word of shlVU"}, -// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "no overlap of shlVU"}, -// } +var argshl10VU = []arg10VU{ + // test cases for shlVU + {[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0}, 7, 0, 0, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "complete overlap of shlVU"}, + {[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "partial overlap by half of shlVU"}, + {[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 
0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "partial overlap by 1 Word of shlVU"}, + {[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "no overlap of shlVU"}, +} -// var argshrVU = []argVU{ -// // test cases for shrVU -// {[]Word{0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 1, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "complete overlap of shrVU"}, -// {[]Word{0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 4, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by half of shrVU"}, -// {[]Word{0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 7, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by 1 Word of shrVU"}, -// {[]Word{0, 0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 8, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "no overlap of shrVU"}, -// } +var argshr10VU = []arg10VU{ + // test cases for shrVU + {[]Word{0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 1, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "complete overlap of shrVU"}, + {[]Word{0, 0, 0, 0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 4, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "partial overlap by half of shrVU"}, + {[]Word{0, 0, 0, 0, 0, 0, 0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 7, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "partial overlap by 1 Word of shrVU"}, + {[]Word{0, 0, 0, 0, 0, 0, 0, 0, 99, 
_DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 8, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "no overlap of shrVU"}, +} -// func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) { -// // save a.d for error message, or it will be overwritten. -// b := make([]Word, len(a.d)) -// copy(b, a.d) -// z := a.d[a.zp : a.zp+a.l] -// x := a.d[a.xp : a.xp+a.l] -// c := f(z, x, a.s) -// for i, zi := range z { -// if zi != a.r[i] { -// t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) -// break -// } -// } -// if c != a.c { -// t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) -// } -// } +func testShift10Func(t *testing.T, f func(z, x []Word, s uint) Word, a arg10VU) { + // save a.d for error message, or it will be overwritten. + b := make([]Word, len(a.d)) + copy(b, a.d) + z := a.d[a.zp : a.zp+a.l] + x := a.d[a.xp : a.xp+a.l] + c := f(z, x, a.s) + for i, zi := range z { + if zi != a.r[i] { + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %d; want %d", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) + break + } + } + if c != a.c { + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %d; want %d", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) + } +} -// func TestShiftOverlap(t *testing.T) { -// for _, a := range argshlVU { -// arg := a -// testShiftFunc(t, shlVU, arg) -// } +func TestShift10Overlap(t *testing.T) { + for _, a := range argshl10VU { + arg := a + testShift10Func(t, shl10VU, arg) + } -// for _, a := range argshrVU { -// arg := a -// testShiftFunc(t, shrVU, arg) -// } -// } + for _, a := range argshr10VU { + arg := a + testShift10Func(t, shr10VU, arg) + } +} -// func BenchmarkAddVW(b *testing.B) { -// for _, n := range benchSizes { -// if isRaceBuilder && n > 1e3 { 
-// continue -// } -// x := rndV(n) -// y := rndW() -// z := make([]Word, n) -// b.Run(fmt.Sprint(n), func(b *testing.B) { -// b.SetBytes(int64(n * _S)) -// for i := 0; i < b.N; i++ { -// addVW(z, x, y) -// } -// }) -// } -// } +func BenchmarkAdd10VW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rnd10V(n) + y := rnd10W() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + add10VW(z, x, y) + } + }) + } +} -// func BenchmarkSubVW(b *testing.B) { -// for _, n := range benchSizes { -// if isRaceBuilder && n > 1e3 { -// continue -// } -// x := rndV(n) -// y := rndW() -// z := make([]Word, n) -// b.Run(fmt.Sprint(n), func(b *testing.B) { -// b.SetBytes(int64(n * _S)) -// for i := 0; i < b.N; i++ { -// subVW(z, x, y) -// } -// }) -// } -// } +func BenchmarkSub10VW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rnd10V(n) + y := rnd10W() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + sub10VW(z, x, y) + } + }) + } +} + +// TODO(db47h): complete port of the tests // type funVWW func(z, x []Word, y, r Word) (c Word) // type argVWW struct { diff --git a/decimal_test.go b/decimal_test.go index 2d6b570..e112b7d 100644 --- a/decimal_test.go +++ b/decimal_test.go @@ -34,7 +34,7 @@ func TestDecimal_dnorm(t *testing.T) { again: w := uint(rand.Uint64()) % _DB e := uint(rand.Intn(_DW + 1)) - h, l := mulWW(Word(w), Word(pow10(e))) + h, l := mulWW(Word(w), pow10(e)) // convert h, l from base _B (2**64) to base _BD (10**19) or 2**32 -> 10**9 h, l = div10W(h, l) d := dec{Word(l), Word(h)}.norm() @@ -47,7 +47,7 @@ func TestDecimal_dnorm(t *testing.T) { dd := dec(nil).set(d) s := dnorm(dd) // d should now have a single element with e shifted left - ew := w * pow10(_DW-decDigits(w)) + ew := w * uint(pow10(_DW-decDigits(w))) es := 
int64(uint(len(d)*_DW) - (decDigits(w) + e)) if dd[len(dd)-1] != Word(ew) || s != es { t.Fatalf("%ve%v => dnorm(%v) = %v, %v --- Expected %d, %d", diff --git a/go.mod b/go.mod index 9755fbf..8e60a82 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module github.com/db47h/decimal go 1.14 - -require golang.org/x/tools v0.0.0-20200513122804-866d71a3170a // indirect diff --git a/testdata/bench b/testdata/bench deleted file mode 100755 index c234221..0000000 --- a/testdata/bench +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -TEST="$1" -shift - -go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 -tags decimal_pure_go "$@" | tee bench-go -go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-asm - -benchstat bench-go bench-asm - -rm bench-go -rm bench-asm diff --git a/testdata/benchasm b/testdata/benchasm new file mode 100755 index 0000000..0bcfa9a --- /dev/null +++ b/testdata/benchasm @@ -0,0 +1,14 @@ +#!/bin/bash + +TEST="$1" +shift + +if [ ! -e "bench-go" ]; then + go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 -tags decimal_pure_go "$@" | tee bench-go +fi +go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-asm + +benchstat bench-go bench-asm + +# rm bench-go +# rm bench-asm diff --git a/testdata/benchgit b/testdata/benchgit index 5767a31..cd767dc 100755 --- a/testdata/benchgit +++ b/testdata/benchgit @@ -20,23 +20,24 @@ function doBench { shift local REV="$1" shift - echo $TEST - $REV go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-"$REV" } -if [ "$REV" == "master" ]; then - # compare current against master - git stash - doBench "$TEST" "$REV" "$@" - git stash pop -else - git checkout "$REV" - doBench "$TEST" "$REV" "$@" - git checkout master +if [ ! 
-e "bench-$REV" ]; then + if [ "$REV" == "master" ]; then + # compare current against master + git stash + doBench "$TEST" "$REV" "$@" + git stash pop + else + git checkout "$REV" + doBench "$TEST" "$REV" "$@" + git checkout master + fi fi doBench "$TEST" "current" "$@" benchstat bench-"$REV" bench-current -rm bench-"$REV" bench-current +# rm bench-"$REV" bench-current