diff --git a/arith.go b/arith.go new file mode 100644 index 0000000..3cdf890 --- /dev/null +++ b/arith.go @@ -0,0 +1,216 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file provides Go implementations of elementary multi-precision +// arithmetic operations on word vectors. These have the suffix _g. +// These are needed for platforms without assembly implementations of these routines. +// This file also contains elementary operations that can be implemented +// sufficiently efficiently in Go. + +package decimal + +import "math/bits" + +// A Word represents a single digit of a multi-precision unsigned integer. +type Word uint + +const ( + _S = _W / 8 // word size in bytes + + _W = bits.UintSize // word size in bits + _B = 1 << _W // digit base + _M = _B - 1 // digit mask +) + +// Many of the loops in this file are of the form +// for i := 0; i < len(z) && i < len(x) && i < len(y); i++ +// i < len(z) is the real condition. +// However, checking i < len(x) && i < len(y) as well is faster than +// having the compiler do a bounds check in the body of the loop; +// remarkably it is even faster than hoisting the bounds check +// out of the loop, by doing something like +// _, _ = x[len(z)-1], y[len(z)-1] +// There are other ways to hoist the bounds check out of the loop, +// but the compiler's BCE isn't powerful enough for them (yet?). +// See the discussion in CL 164966. + +// ---------------------------------------------------------------------------- +// Elementary operations on words +// +// These operations are used by the vector operations below. + +// z1<<_W + z0 = x*y +func mulWW_g(x, y Word) (z1, z0 Word) { + hi, lo := bits.Mul(uint(x), uint(y)) + return Word(hi), Word(lo) +} + +// z1<<_W + z0 = x*y + c +func mulAddWWW_g(x, y, c Word) (z1, z0 Word) { + hi, lo := bits.Mul(uint(x), uint(y)) + var cc uint + lo, cc = bits.Add(lo, uint(c), 0) + return Word(hi + cc), Word(lo) +} + +// nlz returns the number of leading zeros in x. +// Wraps bits.LeadingZeros call for convenience. +func nlz(x Word) uint { + return uint(bits.LeadingZeros(uint(x))) +} + +// q = (u1<<_W + u0 - r)/v +func divWW_g(u1, u0, v Word) (q, r Word) { + qq, rr := bits.Div(uint(u1), uint(u0), uint(v)) + return Word(qq), Word(rr) +} + +// The resulting carry c is either 0 or 1. +func addVV_g(z, x, y []Word) (c Word) { + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c)) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +// The resulting carry c is either 0 or 1. +func subVV_g(z, x, y []Word) (c Word) { + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c)) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +// The resulting carry c is either 0 or 1. +func addVW_g(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + zi, cc := bits.Add(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +// addVWlarge is addVW, but intended for large z. +// The only difference is that we check on every iteration +// whether we are done with carries, +// and if so, switch to a much faster copy instead. +// This is only a good idea for large z, +// because the overhead of the check and the function call +// outweigh the benefits when z is small. +func addVWlarge(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + if c == 0 { + copy(z[i:], x[i:]) + return + } + zi, cc := bits.Add(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +func subVW_g(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + zi, cc := bits.Sub(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +// subVWlarge is to subVW as addVWlarge is to addVW. +func subVWlarge(z, x []Word, y Word) (c Word) { + c = y + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + if c == 0 { + copy(z[i:], x[i:]) + return + } + zi, cc := bits.Sub(uint(x[i]), uint(c), 0) + z[i] = Word(zi) + c = Word(cc) + } + return +} + +func shlVU_g(z, x []Word, s uint) (c Word) { + if s == 0 { + copy(z, x) + return + } + if len(z) == 0 { + return + } + s &= _W - 1 // hint to the compiler that shifts by s don't need guard code + ŝ := _W - s + ŝ &= _W - 1 // ditto + c = x[len(z)-1] >> ŝ + for i := len(z) - 1; i > 0; i-- { + z[i] = x[i]<>ŝ + } + z[0] = x[0] << s + return +} + +func shrVU_g(z, x []Word, s uint) (c Word) { + if s == 0 { + copy(z, x) + return + } + if len(z) == 0 { + return + } + s &= _W - 1 // hint to the compiler that shifts by s don't need guard code + ŝ := _W - s + ŝ &= _W - 1 // ditto + c = x[0] << ŝ + for i := 0; i < len(z)-1; i++ { + z[i] = x[i]>>s | x[i+1]<<ŝ + } + z[len(z)-1] = x[len(z)-1] >> s + return +} + +func mulAddVWW_g(z, x []Word, y, r Word) (c Word) { + c = r + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + c, z[i] = mulAddWWW_g(x[i], y, c) + } + return +} + +func addMulVVW_g(z, x []Word, y Word) (c Word) { + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + z1, z0 := mulAddWWW_g(x[i], y, z[i]) + lo, cc := bits.Add(uint(z0), uint(c), 0) + c, z[i] = Word(cc), Word(lo) + c += z1 + } + return +} + +func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) { + r = xn + for i := len(z) - 1; i >= 0; i-- { + z[i], r = divWW_g(r, x[i], y) + } + return +} diff --git a/arith_386.s b/arith_386.s new file mode 100644 index 0000000..f61da2a --- /dev/null +++ b/arith_386.s @@ -0,0 +1,271 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVL x+0(FP), AX + MULL y+4(FP) + MOVL DX, z1+8(FP) + MOVL AX, z0+12(FP) + RET + + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB),NOSPLIT,$0 + MOVL x1+0(FP), DX + MOVL x0+4(FP), AX + DIVL y+8(FP) + MOVL AX, q+12(FP) + MOVL DX, r+16(FP) + RET + + +// func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), CX + MOVL z_len+4(FP), BP + MOVL $0, BX // i = 0 + MOVL $0, DX // c = 0 + JMP E1 + +L1: MOVL (SI)(BX*4), AX + ADDL DX, DX // restore CF + ADCL (CX)(BX*4), AX + SBBL DX, DX // save CF + MOVL AX, (DI)(BX*4) + ADDL $1, BX // i++ + +E1: CMPL BX, BP // i < n + JL L1 + + NEGL DX + MOVL DX, c+36(FP) + RET + + +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SBBL instead of ADCL and label names) +TEXT ·subVV(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), CX + MOVL z_len+4(FP), BP + MOVL $0, BX // i = 0 + MOVL $0, DX // c = 0 + JMP E2 + +L2: MOVL (SI)(BX*4), AX + ADDL DX, DX // restore CF + SBBL (CX)(BX*4), AX + SBBL DX, DX // save CF + MOVL AX, (DI)(BX*4) + ADDL $1, BX // i++ + +E2: CMPL BX, BP // i < n + JL L2 + + NEGL DX + MOVL DX, c+36(FP) + RET + + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), AX // c = y + MOVL z_len+4(FP), BP + MOVL $0, BX // i = 0 + JMP E3 + +L3: ADDL (SI)(BX*4), AX + MOVL AX, (DI)(BX*4) + SBBL AX, AX // save CF + NEGL AX + ADDL $1, BX // i++ + +E3: CMPL BX, BP // i < n + JL L3 + + MOVL AX, c+28(FP) + RET + + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), AX // c = y + MOVL z_len+4(FP), BP + MOVL $0, BX // i = 0 + JMP E4 + +L4: MOVL (SI)(BX*4), DX + SUBL AX, DX + MOVL DX, (DI)(BX*4) + SBBL AX, AX // save CF + NEGL AX + ADDL $1, BX // i++ + +E4: CMPL BX, BP // i < n + JL L4 + + MOVL AX, c+28(FP) + RET + + +// func shlVU(z, x []Word, s uint) (c Word) +TEXT ·shlVU(SB),NOSPLIT,$0 + MOVL z_len+4(FP), BX // i = z + SUBL $1, BX // i-- + JL X8b // i < 0 (n <= 0) + + // n > 0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL s+24(FP), CX + MOVL (SI)(BX*4), AX // w1 = x[n-1] + MOVL $0, DX + SHLL CX, AX, DX // w1>>ŝ + MOVL DX, c+28(FP) + + CMPL BX, $0 + JLE X8a // i <= 0 + + // i > 0 +L8: MOVL AX, DX // w = w1 + MOVL -4(SI)(BX*4), AX // w1 = x[i-1] + SHLL CX, AX, DX // w<>ŝ + MOVL DX, (DI)(BX*4) // z[i] = w<>ŝ + SUBL $1, BX // i-- + JG L8 // i > 0 + + // i <= 0 +X8a: SHLL CX, AX // w1< 0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL s+24(FP), CX + MOVL (SI), AX // w1 = x[0] + MOVL $0, DX + SHRL CX, AX, DX // w1<<ŝ + MOVL DX, c+28(FP) + + MOVL $0, BX // i = 0 + JMP E9 + + // i < n-1 +L9: MOVL AX, DX // w = w1 + MOVL 4(SI)(BX*4), AX // w1 = x[i+1] + SHRL CX, AX, DX // w>>s | w1<<ŝ + MOVL DX, (DI)(BX*4) // z[i] = w>>s | w1<<ŝ + ADDL $1, BX // i++ + +E9: CMPL BX, BP + JL L9 // i < n-1 + + // i >= n-1 +X9a: SHRL CX, AX // w1>>s + MOVL AX, (DI)(BP*4) // z[n-1] = w1>>s + RET + +X9b: MOVL $0, c+28(FP) + RET + + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), BP + MOVL r+28(FP), CX // c = r + MOVL z_len+4(FP), BX + LEAL (DI)(BX*4), DI + LEAL (SI)(BX*4), SI + NEGL BX // i = -n + JMP E5 + +L5: MOVL (SI)(BX*4), AX + MULL BP + ADDL CX, AX + ADCL $0, DX + MOVL AX, (DI)(BX*4) + MOVL DX, CX + ADDL $1, BX // i++ + +E5: CMPL BX, $0 // i < 0 + JL L5 + + MOVL CX, c+32(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL x+12(FP), SI + MOVL y+24(FP), BP + MOVL z_len+4(FP), BX + LEAL (DI)(BX*4), DI + LEAL (SI)(BX*4), SI + NEGL BX // i = -n + MOVL $0, CX // c = 0 + JMP E6 + +L6: MOVL (SI)(BX*4), AX + MULL BP + ADDL CX, AX + ADCL $0, DX + ADDL AX, (DI)(BX*4) + ADCL $0, DX + MOVL DX, CX + ADDL $1, BX // i++ + +E6: CMPL BX, $0 // i < 0 + JL L6 + + MOVL CX, c+28(FP) + RET + + +// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) +TEXT ·divWVW(SB),NOSPLIT,$0 + MOVL z+0(FP), DI + MOVL xn+12(FP), DX // r = xn + MOVL x+16(FP), SI + MOVL y+28(FP), CX + MOVL z_len+4(FP), BX // i = z + JMP E7 + +L7: MOVL (SI)(BX*4), AX + DIVL CX + MOVL AX, (DI)(BX*4) + +E7: SUBL $1, BX // i-- + JGE L7 // i >= 0 + + MOVL DX, r+32(FP) + RET diff --git a/arith_amd64.go b/arith_amd64.go new file mode 100644 index 0000000..7d31807 --- /dev/null +++ b/arith_amd64.go @@ -0,0 +1,9 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +package decimal + +var support_adx = false diff --git a/arith_amd64.s b/arith_amd64.s new file mode 100644 index 0000000..b75639f --- /dev/null +++ b/arith_amd64.s @@ -0,0 +1,551 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVQ x+0(FP), AX + MULQ y+8(FP) + MOVQ DX, z1+16(FP) + MOVQ AX, z0+24(FP) + RET + + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB),NOSPLIT,$0 + MOVQ x1+0(FP), DX + MOVQ x0+8(FP), AX + DIVQ y+16(FP) + MOVQ AX, q+24(FP) + MOVQ DX, r+32(FP) + RET + +// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0. +// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared. +// This is faster than using rotate instructions. + +// func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + MOVQ x+24(FP), R8 + MOVQ y+48(FP), R9 + MOVQ z+0(FP), R10 + + MOVQ $0, CX // c = 0 + MOVQ $0, SI // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V1 // if n < 0 goto V1 + +U1: // n >= 0 + // regular loop body unrolled 4x + ADDQ CX, CX // restore CF + MOVQ 0(R8)(SI*8), R11 + MOVQ 8(R8)(SI*8), R12 + MOVQ 16(R8)(SI*8), R13 + MOVQ 24(R8)(SI*8), R14 + ADCQ 0(R9)(SI*8), R11 + ADCQ 8(R9)(SI*8), R12 + ADCQ 16(R9)(SI*8), R13 + ADCQ 24(R9)(SI*8), R14 + MOVQ R11, 0(R10)(SI*8) + MOVQ R12, 8(R10)(SI*8) + MOVQ R13, 16(R10)(SI*8) + MOVQ R14, 24(R10)(SI*8) + SBBQ CX, CX // save CF + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U1 // if n >= 0 goto U1 + +V1: ADDQ $4, DI // n += 4 + JLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + ADDQ CX, CX // restore CF + MOVQ 0(R8)(SI*8), R11 + ADCQ 0(R9)(SI*8), R11 + MOVQ R11, 0(R10)(SI*8) + SBBQ CX, CX // save CF + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JG L1 // if n > 0 goto L1 + +E1: NEGQ CX + MOVQ CX, c+72(FP) // return c + RET + + +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SBBQ instead of ADCQ and label names) +TEXT ·subVV(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + MOVQ x+24(FP), R8 + MOVQ y+48(FP), R9 + MOVQ z+0(FP), R10 + + MOVQ $0, CX // c = 0 + MOVQ $0, SI // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V2 // if n < 0 goto V2 + +U2: // n >= 0 + // regular loop body unrolled 4x + ADDQ CX, CX // restore CF + MOVQ 0(R8)(SI*8), R11 + MOVQ 8(R8)(SI*8), R12 + MOVQ 16(R8)(SI*8), R13 + MOVQ 24(R8)(SI*8), R14 + SBBQ 0(R9)(SI*8), R11 + SBBQ 8(R9)(SI*8), R12 + SBBQ 16(R9)(SI*8), R13 + SBBQ 24(R9)(SI*8), R14 + MOVQ R11, 0(R10)(SI*8) + MOVQ R12, 8(R10)(SI*8) + MOVQ R13, 16(R10)(SI*8) + MOVQ R14, 24(R10)(SI*8) + SBBQ CX, CX // save CF + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U2 // if n >= 0 goto U2 + +V2: ADDQ $4, DI // n += 4 + JLE E2 // if n <= 0 goto E2 + +L2: // n > 0 + ADDQ CX, CX // restore CF + MOVQ 0(R8)(SI*8), R11 + SBBQ 0(R9)(SI*8), R11 + MOVQ R11, 0(R10)(SI*8) + SBBQ CX, CX // save CF + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JG L2 // if n > 0 goto L2 + +E2: NEGQ CX + MOVQ CX, c+72(FP) // return c + RET + + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + CMPQ DI, $32 + JG large + MOVQ x+24(FP), R8 + MOVQ y+48(FP), CX // c = y + MOVQ z+0(FP), R10 + + MOVQ $0, SI // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V3 // if n < 4 goto V3 + +U3: // n >= 0 + // regular loop body unrolled 4x + MOVQ 0(R8)(SI*8), R11 + MOVQ 8(R8)(SI*8), R12 + MOVQ 16(R8)(SI*8), R13 + MOVQ 24(R8)(SI*8), R14 + ADDQ CX, R11 + ADCQ $0, R12 + ADCQ $0, R13 + ADCQ $0, R14 + SBBQ CX, CX // save CF + NEGQ CX + MOVQ R11, 0(R10)(SI*8) + MOVQ R12, 8(R10)(SI*8) + MOVQ R13, 16(R10)(SI*8) + MOVQ R14, 24(R10)(SI*8) + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U3 // if n >= 0 goto U3 + +V3: ADDQ $4, DI // n += 4 + JLE E3 // if n <= 0 goto E3 + +L3: // n > 0 + ADDQ 0(R8)(SI*8), CX + MOVQ CX, 0(R10)(SI*8) + SBBQ CX, CX // save CF + NEGQ CX + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JG L3 // if n > 0 goto L3 + +E3: MOVQ CX, c+56(FP) // return c + RET +large: + JMP ·addVWlarge(SB) + + +// func subVW(z, x []Word, y Word) (c Word) +// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) +TEXT ·subVW(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), DI + CMPQ DI, $32 + JG large + MOVQ x+24(FP), R8 + MOVQ y+48(FP), CX // c = y + MOVQ z+0(FP), R10 + + MOVQ $0, SI // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUBQ $4, DI // n -= 4 + JL V4 // if n < 4 goto V4 + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVQ 0(R8)(SI*8), R11 + MOVQ 8(R8)(SI*8), R12 + MOVQ 16(R8)(SI*8), R13 + MOVQ 24(R8)(SI*8), R14 + SUBQ CX, R11 + SBBQ $0, R12 + SBBQ $0, R13 + SBBQ $0, R14 + SBBQ CX, CX // save CF + NEGQ CX + MOVQ R11, 0(R10)(SI*8) + MOVQ R12, 8(R10)(SI*8) + MOVQ R13, 16(R10)(SI*8) + MOVQ R14, 24(R10)(SI*8) + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE U4 // if n >= 0 goto U4 + +V4: ADDQ $4, DI // n += 4 + JLE E4 // if n <= 0 goto E4 + +L4: // n > 0 + MOVQ 0(R8)(SI*8), R11 + SUBQ CX, R11 + MOVQ R11, 0(R10)(SI*8) + SBBQ CX, CX // save CF + NEGQ CX + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JG L4 // if n > 0 goto L4 + +E4: MOVQ CX, c+56(FP) // return c + RET +large: + JMP ·subVWlarge(SB) + + +// func shlVU(z, x []Word, s uint) (c Word) +TEXT ·shlVU(SB),NOSPLIT,$0 + MOVQ z_len+8(FP), BX // i = z + SUBQ $1, BX // i-- + JL X8b // i < 0 (n <= 0) + + // n > 0 + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + MOVQ s+48(FP), CX + MOVQ (R8)(BX*8), AX // w1 = x[n-1] + MOVQ $0, DX + SHLQ CX, AX, DX // w1>>ŝ + MOVQ DX, c+56(FP) + + CMPQ BX, $0 + JLE X8a // i <= 0 + + // i > 0 +L8: MOVQ AX, DX // w = w1 + MOVQ -8(R8)(BX*8), AX // w1 = x[i-1] + SHLQ CX, AX, DX // w<>ŝ + MOVQ DX, (R10)(BX*8) // z[i] = w<>ŝ + SUBQ $1, BX // i-- + JG L8 // i > 0 + + // i <= 0 +X8a: SHLQ CX, AX // w1< 0 + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + MOVQ s+48(FP), CX + MOVQ (R8), AX // w1 = x[0] + MOVQ $0, DX + SHRQ CX, AX, DX // w1<<ŝ + MOVQ DX, c+56(FP) + + MOVQ $0, BX // i = 0 + JMP E9 + + // i < n-1 +L9: MOVQ AX, DX // w = w1 + MOVQ 8(R8)(BX*8), AX // w1 = x[i+1] + SHRQ CX, AX, DX // w>>s | w1<<ŝ + MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ + ADDQ $1, BX // i++ + +E9: CMPQ BX, R11 + JL L9 // i < n-1 + + // i >= n-1 +X9a: SHRQ CX, AX // w1>>s + MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s + RET + +X9b: MOVQ $0, c+56(FP) + RET + + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + MOVQ y+48(FP), R9 + MOVQ r+56(FP), CX // c = r + MOVQ z_len+8(FP), R11 + MOVQ $0, BX // i = 0 + + CMPQ R11, $4 + JL E5 + +U5: // i+4 <= n + // regular loop body unrolled 4x + MOVQ (0*8)(R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + MOVQ AX, (0*8)(R10)(BX*8) + MOVQ DX, CX + MOVQ (1*8)(R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + MOVQ AX, (1*8)(R10)(BX*8) + MOVQ DX, CX + MOVQ (2*8)(R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + MOVQ AX, (2*8)(R10)(BX*8) + MOVQ DX, CX + MOVQ (3*8)(R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + MOVQ AX, (3*8)(R10)(BX*8) + MOVQ DX, CX + ADDQ $4, BX // i += 4 + + LEAQ 4(BX), DX + CMPQ DX, R11 + JLE U5 + JMP E5 + +L5: MOVQ (R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + MOVQ AX, (R10)(BX*8) + MOVQ DX, CX + ADDQ $1, BX // i++ + +E5: CMPQ BX, R11 // i < n + JL L5 + + MOVQ CX, c+64(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB),NOSPLIT,$0 + CMPB ·support_adx(SB), $1 + JEQ adx + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + MOVQ y+48(FP), R9 + MOVQ z_len+8(FP), R11 + MOVQ $0, BX // i = 0 + MOVQ $0, CX // c = 0 + MOVQ R11, R12 + ANDQ $-2, R12 + CMPQ R11, $2 + JAE A6 + JMP E6 + +A6: + MOVQ (R8)(BX*8), AX + MULQ R9 + ADDQ (R10)(BX*8), AX + ADCQ $0, DX + ADDQ CX, AX + ADCQ $0, DX + MOVQ DX, CX + MOVQ AX, (R10)(BX*8) + + MOVQ (8)(R8)(BX*8), AX + MULQ R9 + ADDQ (8)(R10)(BX*8), AX + ADCQ $0, DX + ADDQ CX, AX + ADCQ $0, DX + MOVQ DX, CX + MOVQ AX, (8)(R10)(BX*8) + + ADDQ $2, BX + CMPQ BX, R12 + JL A6 + JMP E6 + +L6: MOVQ (R8)(BX*8), AX + MULQ R9 + ADDQ CX, AX + ADCQ $0, DX + ADDQ AX, (R10)(BX*8) + ADCQ $0, DX + MOVQ DX, CX + ADDQ $1, BX // i++ + +E6: CMPQ BX, R11 // i < n + JL L6 + + MOVQ CX, c+56(FP) + RET + +adx: + MOVQ z_len+8(FP), R11 + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + MOVQ y+48(FP), DX + MOVQ $0, BX // i = 0 + MOVQ $0, CX // carry + CMPQ R11, $8 + JAE adx_loop_header + CMPQ BX, R11 + JL adx_short + MOVQ CX, c+56(FP) + RET + +adx_loop_header: + MOVQ R11, R13 + ANDQ $-8, R13 +adx_loop: + XORQ R9, R9 // unset flags + MULXQ (R8), SI, DI + ADCXQ CX,SI + ADOXQ (R10), SI + MOVQ SI,(R10) + + MULXQ 8(R8), AX, CX + ADCXQ DI, AX + ADOXQ 8(R10), AX + MOVQ AX, 8(R10) + + MULXQ 16(R8), SI, DI + ADCXQ CX, SI + ADOXQ 16(R10), SI + MOVQ SI, 16(R10) + + MULXQ 24(R8), AX, CX + ADCXQ DI, AX + ADOXQ 24(R10), AX + MOVQ AX, 24(R10) + + MULXQ 32(R8), SI, DI + ADCXQ CX, SI + ADOXQ 32(R10), SI + MOVQ SI, 32(R10) + + MULXQ 40(R8), AX, CX + ADCXQ DI, AX + ADOXQ 40(R10), AX + MOVQ AX, 40(R10) + + MULXQ 48(R8), SI, DI + ADCXQ CX, SI + ADOXQ 48(R10), SI + MOVQ SI, 48(R10) + + MULXQ 56(R8), AX, CX + ADCXQ DI, AX + ADOXQ 56(R10), AX + MOVQ AX, 56(R10) + + ADCXQ R9, CX + ADOXQ R9, CX + + ADDQ $64, R8 + ADDQ $64, R10 + ADDQ $8, BX + + CMPQ BX, R13 + JL adx_loop + MOVQ z+0(FP), R10 + MOVQ x+24(FP), R8 + CMPQ BX, R11 + JL adx_short + MOVQ CX, c+56(FP) + RET + +adx_short: + MULXQ (R8)(BX*8), SI, DI + ADDQ CX, SI + ADCQ $0, DI + ADDQ SI, (R10)(BX*8) + ADCQ $0, DI + MOVQ DI, CX + ADDQ $1, BX // i++ + + CMPQ BX, R11 + JL adx_short + + MOVQ CX, c+56(FP) + RET + + + +// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) +TEXT ·divWVW(SB),NOSPLIT,$0 + MOVQ z+0(FP), R10 + MOVQ xn+24(FP), DX // r = xn + MOVQ x+32(FP), R8 + MOVQ y+56(FP), R9 + MOVQ z_len+8(FP), BX // i = z + JMP E7 + +L7: MOVQ (R8)(BX*8), AX + DIVQ R9 + MOVQ AX, (R10)(BX*8) + +E7: SUBQ $1, BX // i-- + JGE L7 // i >= 0 + + MOVQ DX, r+64(FP) + RET diff --git a/arith_arm.s b/arith_arm.s new file mode 100644 index 0000000..33aa36f --- /dev/null +++ b/arith_arm.s @@ -0,0 +1,294 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB),NOSPLIT,$0 + ADD.S $0, R0 // clear carry flag + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R4 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + ADD R4<<2, R1, R4 + B E1 +L1: + MOVW.P 4(R2), R5 + MOVW.P 4(R3), R6 + ADC.S R6, R5 + MOVW.P R5, 4(R1) +E1: + TEQ R1, R4 + BNE L1 + + MOVW $0, R0 + MOVW.CS $1, R0 + MOVW R0, c+36(FP) + RET + + +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SBC instead of ADC and label names) +TEXT ·subVV(SB),NOSPLIT,$0 + SUB.S $0, R0 // clear borrow flag + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R4 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + ADD R4<<2, R1, R4 + B E2 +L2: + MOVW.P 4(R2), R5 + MOVW.P 4(R3), R6 + SBC.S R6, R5 + MOVW.P R5, 4(R1) +E2: + TEQ R1, R4 + BNE L2 + + MOVW $0, R0 + MOVW.CC $1, R0 + MOVW R0, c+36(FP) + RET + + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB),NOSPLIT,$0 + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R4 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + ADD R4<<2, R1, R4 + TEQ R1, R4 + BNE L3a + MOVW R3, c+28(FP) + RET +L3a: + MOVW.P 4(R2), R5 + ADD.S R3, R5 + MOVW.P R5, 4(R1) + B E3 +L3: + MOVW.P 4(R2), R5 + ADC.S $0, R5 + MOVW.P R5, 4(R1) +E3: + TEQ R1, R4 + BNE L3 + + MOVW $0, R0 + MOVW.CS $1, R0 + MOVW R0, c+28(FP) + RET + + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW(SB),NOSPLIT,$0 + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R4 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + ADD R4<<2, R1, R4 + TEQ R1, R4 + BNE L4a + MOVW R3, c+28(FP) + RET +L4a: + MOVW.P 4(R2), R5 + SUB.S R3, R5 + MOVW.P R5, 4(R1) + B E4 +L4: + MOVW.P 4(R2), R5 + SBC.S $0, R5 + MOVW.P R5, 4(R1) +E4: + TEQ R1, R4 + BNE L4 + + MOVW $0, R0 + MOVW.CC $1, R0 + MOVW R0, c+28(FP) + RET + + +// func shlVU(z, x []Word, s uint) (c Word) +TEXT ·shlVU(SB),NOSPLIT,$0 + MOVW z_len+4(FP), R5 + TEQ $0, R5 + BEQ X7 + + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + ADD R5<<2, R2, R2 + ADD R5<<2, R1, R5 + MOVW s+24(FP), R3 + TEQ $0, R3 // shift 0 is special + BEQ Y7 + ADD $4, R1 // stop one word early + MOVW $32, R4 + SUB R3, R4 + MOVW $0, R7 + + MOVW.W -4(R2), R6 + MOVW R6<>R4, R6 + MOVW R6, c+28(FP) + B E7 + +L7: + MOVW.W -4(R2), R6 + ORR R6>>R4, R7 + MOVW.W R7, -4(R5) + MOVW R6<>R3, R7 + MOVW R6<>R3, R7 +E6: + TEQ R1, R5 + BNE L6 + + MOVW R7, 0(R1) + RET + +Y6: // copy loop, because shift 0 == shift 32 + MOVW.P 4(R2), R6 + MOVW.P R6, 4(R1) + TEQ R1, R5 + BNE Y6 + +X6: + MOVW $0, R1 + MOVW R1, c+28(FP) + RET + + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R5 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW r+28(FP), R4 + ADD R5<<2, R1, R5 + B E8 + + // word loop +L8: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E8: + TEQ R1, R5 + BNE L8 + + MOVW R4, c+32(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB),NOSPLIT,$0 + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW z_len+4(FP), R5 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + ADD R5<<2, R1, R5 + MOVW $0, R4 + B E9 + + // word loop +L9: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW 0(R1), R4 + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E9: + TEQ R1, R5 + BNE L9 + + MOVW R4, c+28(FP) + RET + + +// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) +TEXT ·divWVW(SB),NOSPLIT,$0 + // ARM has no multiword division, so use portable code. + B ·divWVW_g(SB) + + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB),NOSPLIT,$0 + // ARM has no multiword division, so use portable code. + B ·divWW_g(SB) + + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVW x+0(FP), R1 + MOVW y+4(FP), R2 + MULLU R1, R2, (R4, R3) + MOVW R4, z1+8(FP) + MOVW R3, z0+12(FP) + RET diff --git a/arith_arm64.s b/arith_arm64.s new file mode 100644 index 0000000..18e513e --- /dev/null +++ b/arith_arm64.s @@ -0,0 +1,517 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// TODO: Consider re-implementing using Advanced SIMD +// once the assembler supports those instructions. + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVD x+0(FP), R0 + MOVD y+8(FP), R1 + MUL R0, R1, R2 + UMULH R0, R1, R3 + MOVD R3, z1+16(FP) + MOVD R2, z0+24(FP) + RET + + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB),NOSPLIT,$0 + B ·divWW_g(SB) // ARM64 has no multiword division + + +// func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R10 + ADDS $0, R0 // clear carry flag + TBZ $0, R0, two + MOVD.P 8(R8), R11 + MOVD.P 8(R9), R15 + ADCS R15, R11 + MOVD.P R11, 8(R10) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R8), (R11, R12) + LDP.P 16(R9), (R15, R16) + ADCS R15, R11 + ADCS R16, R12 + STP.P (R11, R12), 16(R10) + SUB $2, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + LDP.P 32(R8), (R11, R12) + LDP -16(R8), (R13, R14) + LDP.P 32(R9), (R15, R16) + LDP -16(R9), (R17, R19) + ADCS R15, R11 + ADCS R16, R12 + ADCS R17, R13 + ADCS R19, R14 + STP.P (R11, R12), 32(R10) + STP (R13, R14), -16(R10) + SUB $4, R0 + B loop +done: + CSET HS, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + + +// func subVV(z, x, y []Word) (c Word) +TEXT ·subVV(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R10 + CMP R0, R0 // set carry flag + TBZ $0, R0, two + MOVD.P 8(R8), R11 + MOVD.P 8(R9), R15 + SBCS R15, R11 + MOVD.P R11, 8(R10) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R8), (R11, R12) + LDP.P 16(R9), (R15, R16) + SBCS R15, R11 + SBCS R16, R12 + STP.P (R11, R12), 16(R10) + SUB $2, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + LDP.P 32(R8), (R11, R12) + LDP -16(R8), (R13, R14) + LDP.P 32(R9), (R15, R16) + LDP -16(R9), (R17, R19) + SBCS R15, R11 + SBCS R16, R12 + SBCS R17, R13 + SBCS R19, R14 + STP.P (R11, R12), 32(R10) + STP (R13, R14), -16(R10) + SUB $4, R0 + B loop +done: + CSET LO, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CBZ R0, len0 // the length of z is 0 + MOVD.P 8(R1), R4 + ADDS R2, R4 // z[0] = x[0] + y, set carry + MOVD.P R4, 8(R3) + SUB $1, R0 + CBZ R0, len1 // the length of z is 1 + TBZ $0, R0, two + MOVD.P 8(R1), R4 // do it once + ADCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +two: // do it twice + TBZ $1, R0, loop + LDP.P 16(R1), (R4, R5) + ADCS $0, R4, R8 // c, z[i] = x[i] + c + ADCS $0, R5, R9 + STP.P (R8, R9), 16(R3) + SUB $2, R0 +loop: // do four times per round + CBZ R0, len1 // careful not to touch the carry flag + LDP.P 32(R1), (R4, R5) + LDP -16(R1), (R6, R7) + ADCS $0, R4, R8 + ADCS $0, R5, R9 + ADCS $0, R6, R10 + ADCS $0, R7, R11 + STP.P (R8, R9), 32(R3) + STP (R10, R11), -16(R3) + SUB $4, R0 + B loop +len1: + CSET HS, R2 // extract carry flag +len0: + MOVD R2, c+56(FP) + RET + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CBZ R0, len0 // the length of z is 0 + MOVD.P 8(R1), R4 + SUBS R2, R4 // z[0] = x[0] - y, set carry + MOVD.P R4, 8(R3) + SUB $1, R0 + CBZ R0, len1 // the length of z is 1 + TBZ $0, R0, two // do it once + MOVD.P 8(R1), R4 + SBCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +two: // do it twice + TBZ $1, R0, loop + LDP.P 16(R1), (R4, R5) + SBCS $0, R4, R8 // c, z[i] = x[i] + c + SBCS $0, R5, R9 + STP.P (R8, R9), 16(R3) + SUB $2, R0 +loop: // do four times per round + CBZ R0, len1 // careful not to touch the carry flag + LDP.P 32(R1), (R4, R5) + LDP -16(R1), (R6, R7) + SBCS $0, R4, R8 + SBCS $0, R5, R9 + SBCS $0, R6, R10 + SBCS $0, R7, R11 + STP.P (R8, R9), 32(R3) + STP (R10, R11), -16(R3) + SUB $4, R0 + B loop +len1: + CSET LO, R2 // extract carry flag +len0: + MOVD R2, c+56(FP) + RET + +// func shlVU(z, x []Word, s uint) (c Word) +// This implementation handles the shift operation from the high word to the low word, +// which may be an error for the case where the low word of x overlaps with the high +// word of z. When calling this function directly, you need to pay attention to this +// situation. +TEXT ·shlVU(SB),NOSPLIT,$0 + LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z) + MOVD x+24(FP), R2 + MOVD s+48(FP), R3 + ADD R1<<3, R0 // R0 = &z[n] + ADD R1<<3, R2 // R2 = &x[n] + CBZ R1, len0 + CBZ R3, copy // if the number of shift is 0, just copy x to z + MOVD $64, R4 + SUB R3, R4 + // handling the most significant element x[n-1] + MOVD.W -8(R2), R6 + LSR R4, R6, R5 // return value + LSL R3, R6, R8 // x[i] << s + SUB $1, R1 +one: TBZ $0, R1, two + MOVD.W -8(R2), R6 + LSR R4, R6, R7 + ORR R8, R7 + LSL R3, R6, R8 + SUB $1, R1 + MOVD.W R7, -8(R0) +two: + TBZ $1, R1, loop + LDP.W -16(R2), (R6, R7) + LSR R4, R7, R10 + ORR R8, R10 + LSL R3, R7 + LSR R4, R6, R9 + ORR R7, R9 + LSL R3, R6, R8 + SUB $2, R1 + STP.W (R9, R10), -16(R0) +loop: + CBZ R1, done + LDP.W -32(R2), (R10, R11) + LDP 16(R2), (R12, R13) + LSR R4, R13, R23 + ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s)) + LSL R3, R13 + LSR R4, R12, R22 + ORR R13, R22 + LSL R3, R12 + LSR R4, R11, R21 + ORR R12, R21 + LSL R3, R11 + LSR R4, R10, R20 + ORR R11, R20 + LSL R3, R10, R8 + STP.W (R20, R21), -32(R0) + STP (R22, R23), 16(R0) + SUB $4, R1 + B loop +done: + MOVD.W R8, -8(R0) // the first element x[0] + MOVD R5, c+56(FP) // the part moved out from x[n-1] + RET +copy: + CMP R0, R2 + BEQ len0 + TBZ $0, R1, ctwo + MOVD.W -8(R2), R4 + MOVD.W R4, -8(R0) + SUB $1, R1 +ctwo: + TBZ $1, R1, cloop + LDP.W -16(R2), (R4, R5) + STP.W (R4, R5), -16(R0) + SUB $2, R1 +cloop: + CBZ R1, len0 + LDP.W -32(R2), (R4, R5) + LDP 16(R2), (R6, R7) + STP.W (R4, R5), -32(R0) + STP (R6, R7), 16(R0) + SUB $4, R1 + B cloop +len0: + MOVD $0, c+56(FP) + RET + +// func shrVU(z, x []Word, s uint) (c Word) +// This implementation handles the shift operation from the low word to the high word, +// which may be an error for the case where the high word of x overlaps with the low +// word of z. When calling this function directly, you need to pay attention to this +// situation. +TEXT ·shrVU(SB),NOSPLIT,$0 + MOVD z+0(FP), R0 + MOVD z_len+8(FP), R1 + MOVD x+24(FP), R2 + MOVD s+48(FP), R3 + MOVD $0, R8 + MOVD $64, R4 + SUB R3, R4 + CBZ R1, len0 + CBZ R3, copy // if the number of shift is 0, just copy x to z + + MOVD.P 8(R2), R20 + LSR R3, R20, R8 + LSL R4, R20 + MOVD R20, c+56(FP) // deal with the first element + SUB $1, R1 + + TBZ $0, R1, two + MOVD.P 8(R2), R6 + LSL R4, R6, R20 + ORR R8, R20 + LSR R3, R6, R8 + MOVD.P R20, 8(R0) + SUB $1, R1 +two: + TBZ $1, R1, loop + LDP.P 16(R2), (R6, R7) + LSL R4, R6, R20 + LSR R3, R6 + ORR R8, R20 + LSL R4, R7, R21 + LSR R3, R7, R8 + ORR R6, R21 + STP.P (R20, R21), 16(R0) + SUB $2, R1 +loop: + CBZ R1, done + LDP.P 32(R2), (R10, R11) + LDP -16(R2), (R12, R13) + LSL R4, R10, R20 + LSR R3, R10 + ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) + LSL R4, R11, R21 + LSR R3, R11 + ORR R10, R21 + LSL R4, R12, R22 + LSR R3, R12 + ORR R11, R22 + LSL R4, R13, R23 + LSR R3, R13, R8 + ORR R12, R23 + STP.P (R20, R21), 32(R0) + STP (R22, R23), -16(R0) + SUB $4, R1 + B loop +done: + MOVD R8, (R0) // deal with the last element + RET +copy: + CMP R0, R2 + BEQ len0 + TBZ $0, R1, ctwo + MOVD.P 8(R2), R3 + MOVD.P R3, 8(R0) + SUB $1, R1 +ctwo: + TBZ $1, R1, cloop + LDP.P 16(R2), (R4, R5) + STP.P (R4, R5), 16(R0) + SUB $2, R1 +cloop: + CBZ R1, len0 + LDP.P 32(R2), (R4, R5) + LDP -16(R2), (R6, R7) + STP.P (R4, R5), 32(R0) + STP (R6, R7), -16(R0) + SUB $4, R1 + B cloop +len0: + MOVD $0, c+56(FP) + RET + + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVD z+0(FP), R1 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R2 + MOVD y+48(FP), R3 + MOVD r+56(FP), R4 + // c, z = x * y + r + TBZ $0, R0, two + MOVD.P 8(R2), R5 + MUL R3, R5, R7 + UMULH R3, R5, R8 + ADDS R4, R7 + ADC $0, R8, R4 // c, z[i] = x[i] * y + r + MOVD.P R7, 8(R1) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R2), (R5, R6) + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R12, R11 + ADC $0, R13, R4 + + STP.P (R10, R11), 16(R1) + SUB $2, R0 +loop: + CBZ R0, done + LDP.P 32(R2), (R5, R6) + LDP -16(R2), (R7, R8) + + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R11, R12 + + MUL R3, R7, R14 + UMULH R3, R7, R15 + ADCS R13, R14 + MUL R3, R8, R16 + UMULH R3, R8, R17 + ADCS R15, R16 + ADC $0, R17, R4 + + STP.P (R10, R12), 32(R1) + STP (R14, R16), -16(R1) + SUB $4, R0 + B loop +done: + MOVD R4, c+64(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R1 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R2 + MOVD y+48(FP), R3 + MOVD $0, R4 + + TBZ $0, R0, two + + MOVD.P 8(R2), R5 + MOVD (R1), R6 + + MUL R5, R3, R7 + UMULH R5, R3, R8 + + ADDS R7, R6 + ADC $0, R8, R4 + + MOVD.P R6, 8(R1) + SUB $1, R0 + +two: + TBZ $1, R0, loop + + LDP.P 16(R2), (R5, R10) + LDP (R1), (R6, R11) + + MUL R10, R3, R13 + UMULH R10, R3, R12 + + MUL R5, R3, R7 + UMULH R5, R3, R8 + + ADDS R4, R6 + ADCS R13, R11 + ADC $0, R12 + + ADDS R7, R6 + ADCS R8, R11 + ADC $0, R12, R4 + + STP.P (R6, R11), 16(R1) + SUB $2, R0 + +// The main loop of this code operates on a block of 4 words every iteration +// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] +// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next +// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. +loop: + CBZ R0, done + + LDP.P 16(R2), (R5, R6) + LDP.P 16(R2), (R7, R8) + + LDP (R1), (R9, R10) + ADDS R4, R9 + MUL R6, R3, R14 + ADCS R14, R10 + MUL R7, R3, R15 + LDP 16(R1), (R11, R12) + ADCS R15, R11 + MUL R8, R3, R16 + ADCS R16, R12 + UMULH R8, R3, R20 + ADC $0, R20 + + MUL R5, R3, R13 + ADDS R13, R9 + UMULH R5, R3, R17 + ADCS R17, R10 + UMULH R6, R3, R21 + STP.P (R9, R10), 16(R1) + ADCS R21, R11 + UMULH R7, R3, R19 + ADCS R19, R12 + STP.P (R11, R12), 16(R1) + ADC $0, R20, R4 + + SUB $4, R0 + B loop + +done: + MOVD R4, c+56(FP) + RET + +// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) +TEXT ·divWVW(SB),NOSPLIT,$0 + B ·divWVW_g(SB) diff --git a/arith_decl.go b/arith_decl.go new file mode 100644 index 0000000..36854cf --- /dev/null +++ b/arith_decl.go @@ -0,0 +1,20 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go,!riscv64 + +package decimal + +// implemented in arith_$GOARCH.s +func mulWW(x, y Word) (z1, z0 Word) +func divWW(x1, x0, y Word) (q, r Word) +func addVV(z, x, y []Word) (c Word) +func subVV(z, x, y []Word) (c Word) +func addVW(z, x []Word, y Word) (c Word) +func subVW(z, x []Word, y Word) (c Word) +func shlVU(z, x []Word, s uint) (c Word) +func shrVU(z, x []Word, s uint) (c Word) +func mulAddVWW(z, x []Word, y, r Word) (c Word) +func addMulVVW(z, x []Word, y Word) (c Word) +func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) diff --git a/arith_decl_pure.go b/arith_decl_pure.go new file mode 100644 index 0000000..b18a0fd --- /dev/null +++ b/arith_decl_pure.go @@ -0,0 +1,61 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build math_big_pure_go riscv64 + +package decimal + +func mulWW(x, y Word) (z1, z0 Word) { + return mulWW_g(x, y) +} + +func divWW(x1, x0, y Word) (q, r Word) { + return divWW_g(x1, x0, y) +} + +func addVV(z, x, y []Word) (c Word) { + return addVV_g(z, x, y) +} + +func subVV(z, x, y []Word) (c Word) { + return subVV_g(z, x, y) +} + +func addVW(z, x []Word, y Word) (c Word) { + // TODO: remove indirect function call when golang.org/issue/30548 is fixed + fn := addVW_g + if len(z) > 32 { + fn = addVWlarge + } + return fn(z, x, y) +} + +func subVW(z, x []Word, y Word) (c Word) { + // TODO: remove indirect function call when golang.org/issue/30548 is fixed + fn := subVW_g + if len(z) > 32 { + fn = subVWlarge + } + return fn(z, x, y) +} + +func shlVU(z, x []Word, s uint) (c Word) { + return shlVU_g(z, x, s) +} + +func shrVU(z, x []Word, s uint) (c Word) { + return shrVU_g(z, x, s) +} + +func mulAddVWW(z, x []Word, y, r Word) (c Word) { + return mulAddVWW_g(z, x, y, r) +} + +func addMulVVW(z, x []Word, y Word) (c Word) { + return addMulVVW_g(z, x, y) +} + +func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { + return divWVW_g(z, xn, x, y) +} diff --git a/arith_decl_s390x.go b/arith_decl_s390x.go new file mode 100644 index 0000000..19a4543 --- /dev/null +++ b/arith_decl_s390x.go @@ -0,0 +1,23 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +package decimal + +func addVV_check(z, x, y []Word) (c Word) +func addVV_vec(z, x, y []Word) (c Word) +func addVV_novec(z, x, y []Word) (c Word) +func subVV_check(z, x, y []Word) (c Word) +func subVV_vec(z, x, y []Word) (c Word) +func subVV_novec(z, x, y []Word) (c Word) +func addVW_check(z, x []Word, y Word) (c Word) +func addVW_vec(z, x []Word, y Word) (c Word) +func addVW_novec(z, x []Word, y Word) (c Word) +func subVW_check(z, x []Word, y Word) (c Word) +func subVW_vec(z, x []Word, y Word) (c Word) +func subVW_novec(z, x []Word, y Word) (c Word) +func hasVectorFacility() bool + +var hasVX = hasVectorFacility() diff --git a/arith_mips64x.s b/arith_mips64x.s new file mode 100644 index 0000000..983510e --- /dev/null +++ b/arith_mips64x.s @@ -0,0 +1,43 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go,mips64 !math_big_pure_go,mips64le + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +TEXT ·mulWW(SB),NOSPLIT,$0 + JMP ·mulWW_g(SB) + +TEXT ·divWW(SB),NOSPLIT,$0 + JMP ·divWW_g(SB) + +TEXT ·addVV(SB),NOSPLIT,$0 + JMP ·addVV_g(SB) + +TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) + +TEXT ·addVW(SB),NOSPLIT,$0 + JMP ·addVW_g(SB) + +TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) + +TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) + +TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) + +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + JMP ·mulAddVWW_g(SB) + +TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) + +TEXT ·divWVW(SB),NOSPLIT,$0 + JMP ·divWVW_g(SB) diff --git a/arith_mipsx.s b/arith_mipsx.s new file mode 100644 index 0000000..54cafbd --- /dev/null +++ b/arith_mipsx.s @@ -0,0 +1,43 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go,mips !math_big_pure_go,mipsle + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +TEXT ·mulWW(SB),NOSPLIT,$0 + JMP ·mulWW_g(SB) + +TEXT ·divWW(SB),NOSPLIT,$0 + JMP ·divWW_g(SB) + +TEXT ·addVV(SB),NOSPLIT,$0 + JMP ·addVV_g(SB) + +TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) + +TEXT ·addVW(SB),NOSPLIT,$0 + JMP ·addVW_g(SB) + +TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) + +TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) + +TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) + +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + JMP ·mulAddVWW_g(SB) + +TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) + +TEXT ·divWVW(SB),NOSPLIT,$0 + JMP ·divWVW_g(SB) diff --git a/arith_ppc64x.s b/arith_ppc64x.s new file mode 100644 index 0000000..dbb168a --- /dev/null +++ b/arith_ppc64x.s @@ -0,0 +1,522 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB), NOSPLIT, $0 + MOVD x+0(FP), R4 + MOVD y+8(FP), R5 + MULHDU R4, R5, R6 + MULLD R4, R5, R7 + MOVD R6, z1+16(FP) + MOVD R7, z0+24(FP) + RET + +// func addVV(z, y, y []Word) (c Word) +// z[i] = x[i] + y[i] for all i, carrying +TEXT ·addVV(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] + + // If z_len = 0, we are done + CMP R0, R7 + MOVD R0, R4 + BEQ done + + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. + MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.90x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). +loop: + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA + ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA + ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done + + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + ADDE R11, R16, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final + + MOVD 8(R8), R11 + MOVD 8(R9), R16 + ADDE R11, R16, R20 + MOVD R20, 8(R10) + +final: + ADDZE R4 // Capture CA + +done: + MOVD R4, c+72(FP) + RET + +// func subVV(z, x, y []Word) (c Word) +// z[i] = x[i] - y[i] for all i, carrying +TEXT ·subVV(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] + + // If z_len = 0, we are done + CMP R0, R7 + MOVD R0, R4 + BEQ done + + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. + MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.92x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). +loop: + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA + SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA + SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done + + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + SUBE R16, R11, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final + + MOVD 8(R8), R11 + MOVD 8(R9), R16 + SUBE R16, R11, R20 + MOVD R20, 8(R10) + +final: + ADDZE R4 + XOR $1, R4 + +done: + MOVD R4, c+72(FP) + RET + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R20, R4, R6 // R6 = x[i] + c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + +loop: + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + ADDZE R20, R24 // R24 = x[i] + CA + ADDZE R21, R25 // R25 = x[i+1] + CA + ADDZE R22, R26 // R26 = x[i+2] + CA + ADDZE R23, R27 // R27 = x[i+3] + CA + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + ADDZE R20, R24 + MOVD R24, 8(R10) + +final: + ADDZE R0, R4 // c = CA +done: + MOVD R4, c+56(FP) + RET + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + SUBC R4, R20, R6 // R6 = x[i] - c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + + // The loop here is almost the same as the one used in s390x, but + // we don't need to capture CA every iteration because we've already + // done that above. +loop: + MOVD 8(R8), R20 + MOVD 16(R8), R21 + MOVD 24(R8), R22 + MOVDU 32(R8), R23 + SUBE R0, R20 + SUBE R0, R21 + SUBE R0, R22 + SUBE R0, R23 + MOVD R20, 8(R10) + MOVD R21, 16(R10) + MOVD R22, 24(R10) + MOVDU R23, 32(R10) + ADD $-4, R11 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + SUBE R0, R20 + MOVD R20, 8(R10) + +final: + // Capture CA + SUBE R4, R4 + NEG R4, R4 + +done: + MOVD R4, c+56(FP) + RET + +TEXT ·shlVU(SB), NOSPLIT, $0 + BR ·shlVU_g(SB) + +TEXT ·shrVU(SB), NOSPLIT, $0 + BR ·shrVU_g(SB) + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y + MOVD r+56(FP), R4 // R4 = r = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 + BEQ done + + MOVD 0(R8), R20 + ADD $-1, R11 + MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) + MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) + ADDC R4, R6 // R6 = z0 + r + ADDZE R7 // R7 = z1 + CA + CMP R0, R11 + MOVD R7, R4 // R4 = c + MOVD R6, 0(R10) // z[i] + BEQ done + + // We will read 4 elements per iteration + SRD $2, R11, R14 // R14 = z_len/4 + DCBT (R8) + CMP R0, R14 + MOVD R14, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + +loop: + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R20 // R20 = z1[i] + ADDC R4, R24 // R24 = z0[i] + c + ADDZE R20 // R7 = z1[i] + CA + MULLD R9, R21, R25 + MULHDU R9, R21, R21 + ADDC R20, R25 + ADDZE R21 + MULLD R9, R22, R26 + MULHDU R9, R22, R22 + ADDC R21, R26 + ADDZE R22 + MULLD R9, R23, R27 + MULHDU R9, R23, R23 + ADDC R22, R27 + ADDZE R23 + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + MOVD R23, R4 // R4 = c + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ done + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R20 // R20 = x[i] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R25 // R25 = z1[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R4, R24 + ADDZE R25 + MOVDU R24, 8(R10) // z[i] + CMP R0, R11 + MOVD R25, R4 // R4 = c + BEQ done // If R11 = 0, we are done + + MOVDU 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + ADDZE R25 + MOVDU R24, 8(R10) + CMP R0, R11 + MOVD R25, R4 + BEQ done + + MOVD 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + ADDZE R25 + MOVD R24, 8(R10) + MOVD R25, R4 + +done: + MOVD R4, c+64(FP) + RET + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y + MOVD z_len+8(FP), R22 // R22 = z_len + + MOVD R0, R3 // R3 will be the index register + CMP R0, R22 + MOVD R0, R4 // R4 = c = 0 + MOVD R22, CTR // Initialize loop counter + BEQ done + +loop: + MOVD (R8)(R3), R20 // Load x[i] + MOVD (R10)(R3), R21 // Load z[i] + MULLD R9, R20, R6 // R6 = Low-order(x[i]*y) + MULHDU R9, R20, R7 // R7 = High-order(x[i]*y) + ADDC R21, R6 // R6 = z0 + ADDZE R7 // R7 = z1 + ADDC R4, R6 // R6 = z0 + c + 0 + ADDZE R7, R4 // c += z1 + MOVD R6, (R10)(R3) // Store z[i] + ADD $8, R3 + BC 16, 0, loop // bdnz + +done: + MOVD R4, c+56(FP) + RET + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB), NOSPLIT, $0 + MOVD x1+0(FP), R4 + MOVD x0+8(FP), R5 + MOVD y+16(FP), R6 + + CMPU R4, R6 + BGE divbigger + + // from the programmer's note in ch. 3 of the ISA manual, p.74 + DIVDEU R6, R4, R3 + DIVDU R6, R5, R7 + MULLD R6, R3, R8 + MULLD R6, R7, R20 + SUB R20, R5, R10 + ADD R7, R3, R3 + SUB R8, R10, R4 + CMPU R4, R10 + BLT adjust + CMPU R4, R6 + BLT end + +adjust: + MOVD $1, R21 + ADD R21, R3, R3 + SUB R6, R4, R4 + +end: + MOVD R3, q+24(FP) + MOVD R4, r+32(FP) + + RET + +divbigger: + MOVD $-1, R7 + MOVD R7, q+24(FP) + MOVD R7, r+32(FP) + RET + +TEXT ·divWVW(SB), NOSPLIT, $0 + BR ·divWVW_g(SB) diff --git a/arith_s390x.s b/arith_s390x.s new file mode 100644 index 0000000..9156d9d --- /dev/null +++ b/arith_s390x.s @@ -0,0 +1,1239 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go,s390x + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET +novector: + MOVB $0, ret+0(FP) // no vx + RET + +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVD x+0(FP), R3 + MOVD y+8(FP), R4 + MULHDU R3, R4 + MOVD R10, z1+16(FP) + MOVD R11, z0+24(FP) + RET + +// func divWW(x1, x0, y Word) (q, r Word) +TEXT ·divWW(SB),NOSPLIT,$0 + MOVD x1+0(FP), R10 + MOVD x0+8(FP), R11 + MOVD y+16(FP), R5 + WORD $0xb98700a5 // dlgr r10,r5 + MOVD R11, q+24(FP) + MOVD R10, r+32(FP) + RET + +// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 +// func addVV(z, x, y []Word) (c Word) + + +TEXT ·addVV(SB),NOSPLIT,$0 + MOVD addvectorfacility+0x00(SB),R1 + BR (R1) + +TEXT ·addVV_check(SB),NOSPLIT, $0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $addvectorfacility+0x00(SB), R1 + MOVD $·addVV_novec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·addVV_novec(SB), 0(R1) + BR ·addVV_novec(SB) +vectorimpl: + MOVD $addvectorfacility+0x00(SB), R1 + MOVD $·addVV_vec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·addVV_vec(SB), 0(R1) + BR ·addVV_vec(SB) + +GLOBL addvectorfacility+0x00(SB), NOPTR, $8 +DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) + +TEXT ·addVV_vec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 + BLT v1 + SUB $12, R3 // n -= 16 + BLT A1 // if n < 0 goto A1 + + MOVD R8, R5 + MOVD R9, R6 + MOVD R2, R7 + // n >= 0 + // regular loop body unrolled 16x + VZERO V0 // c = 0 +UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8 + ADD $64, R5 + VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order + VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order + + + VLM 0(R6), V9, V12 // 64-bytes into V9..V16 + ADD $64, R6 + VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order + VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order + + VACCCQ V1, V9, V0, V25 + VACQ V1, V9, V0, V17 + VACCCQ V2, V10, V25, V26 + VACQ V2, V10, V25, V18 + + + VLM 0(R5), V5, V6 // 32-bytes into V1..V8 + VLM 0(R6), V13, V14 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order + VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order + VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order + VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order + + VACCCQ V3, V11, V26, V27 + VACQ V3, V11, V26, V19 + VACCCQ V4, V12, V27, V28 + VACQ V4, V12, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V1..V8 + VLM 0(R6), V15, V16 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order + VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order + VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order + VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order + + VACCCQ V5, V13, V28, V29 + VACQ V5, V13, V28, V21 + VACCCQ V6, V14, V29, V30 + VACQ V6, V14, V29, V22 + + VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order + VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order + VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order + VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order + + VACCCQ V7, V15, V30, V31 + VACQ V7, V15, V30, V23 + VACCCQ V8, V16, V31, V0 //V0 has carry-over + VACQ V8, V16, V31, V24 + + VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order + VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order + VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order + VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order + VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order + VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order + VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order + VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 + NEG R4, R4 // save cf + +A1: ADD $12, R3 // n += 16 + + + // s/JL/JMP/ below to disable the unrolled loop + BLT v1 // if n < 0 goto v1 + +U1: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R4 // restore CF + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD 8(R9)(R10*1), R11 + ADDE R11, R6 + MOVD 16(R9)(R10*1), R11 + ADDE R11, R7 + MOVD 24(R9)(R10*1), R11 + ADDE R11, R1 + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1 + +v1: ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + ADDC R4, R4 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1 + +E1: NEG R4, R4 + MOVD R4, c+72(FP) // return c + RET + +TEXT ·addVV_novec(SB),NOSPLIT,$0 +novec: + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v1n // if n < 0 goto v1n +U1n: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R4 // restore CF + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD 8(R9)(R10*1), R11 + ADDE R11, R6 + MOVD 16(R9)(R10*1), R11 + ADDE R11, R7 + MOVD 24(R9)(R10*1), R11 + ADDE R11, R1 + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1n // if n >= 0 goto U1n + +v1n: ADD $4, R3 // n += 4 + BLE E1n // if n <= 0 goto E1n + +L1n: // n > 0 + ADDC R4, R4 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + ADDE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + ADDE R4, R4 // save CF + NEG R4, R4 + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1n // if n > 0 goto L1n + +E1n: NEG R4, R4 + MOVD R4, c+72(FP) // return c + RET + + +TEXT ·subVV(SB),NOSPLIT,$0 + MOVD subvectorfacility+0x00(SB),R1 + BR (R1) + +TEXT ·subVV_check(SB),NOSPLIT,$0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $subvectorfacility+0x00(SB), R1 + MOVD $·subVV_novec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·subVV_novec(SB), 0(R1) + BR ·subVV_novec(SB) +vectorimpl: + MOVD $subvectorfacility+0x00(SB), R1 + MOVD $·subVV_vec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·subVV_vec(SB), 0(R1) + BR ·subVV_vec(SB) + +GLOBL subvectorfacility+0x00(SB), NOPTR, $8 +DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) + +// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) +TEXT ·subVV_vec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v1 // if n < 0 goto v1 + SUB $12, R3 // n -= 16 + BLT A1 // if n < 0 goto A1 + + MOVD R8, R5 + MOVD R9, R6 + MOVD R2, R7 + + // n >= 0 + // regular loop body unrolled 16x + VZERO V0 // cf = 0 + MOVD $1, R4 // for 390 subtraction cf starts as 1 (no borrow) + VLVGG $1, R4, V0 //put carry into V0 + +UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V8 + ADD $64, R5 + VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order + VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order + + + VLM 0(R6), V9, V12 // 64-bytes into V9..V16 + ADD $64, R6 + VPDI $0x4,V9,V9,V9 // flip the doublewords to big-endian order + VPDI $0x4,V10,V10,V10 // flip the doublewords to big-endian order + + VSBCBIQ V1, V9, V0, V25 + VSBIQ V1, V9, V0, V17 + VSBCBIQ V2, V10, V25, V26 + VSBIQ V2, V10, V25, V18 + + + VLM 0(R5), V5, V6 // 32-bytes into V1..V8 + VLM 0(R6), V13, V14 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order + VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order + VPDI $0x4,V11,V11,V11 // flip the doublewords to big-endian order + VPDI $0x4,V12,V12,V12 // flip the doublewords to big-endian order + + VSBCBIQ V3, V11, V26, V27 + VSBIQ V3, V11, V26, V19 + VSBCBIQ V4, V12, V27, V28 + VSBIQ V4, V12, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V1..V8 + VLM 0(R6), V15, V16 // 32-bytes into V9..V16 + ADD $32, R5 + ADD $32, R6 + + VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order + VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order + VPDI $0x4,V13,V13,V13 // flip the doublewords to big-endian order + VPDI $0x4,V14,V14,V14 // flip the doublewords to big-endian order + + VSBCBIQ V5, V13, V28, V29 + VSBIQ V5, V13, V28, V21 + VSBCBIQ V6, V14, V29, V30 + VSBIQ V6, V14, V29, V22 + + VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order + VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order + VPDI $0x4,V15,V15,V15 // flip the doublewords to big-endian order + VPDI $0x4,V16,V16,V16 // flip the doublewords to big-endian order + + VSBCBIQ V7, V15, V30, V31 + VSBIQ V7, V15, V30, V23 + VSBCBIQ V8, V16, V31, V0 //V0 has carry-over + VSBIQ V8, V16, V31, V24 + + VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order + VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order + VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order + VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order + VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order + VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order + VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order + VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 + SUB $1, R4 // save cf + +A1: ADD $12, R3 // n += 16 + BLT v1 // if n < 0 goto v1 + +U1: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD 8(R9)(R10*1), R11 + SUBE R11, R6 + MOVD 16(R9)(R10*1), R11 + SUBE R11, R7 + MOVD 24(R9)(R10*1), R11 + SUBE R11, R1 + MOVD R0, R4 + SUBE R4, R4 // save CF + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1n + +v1: ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + SUBE R4, R4 // save CF + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1n + +E1: NEG R4, R4 + MOVD R4, c+72(FP) // return c + RET + + +// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names) +TEXT ·subVV_novec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R2 + + MOVD $0, R4 // c = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v1 // if n < 0 goto v1 + +U1: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD 8(R9)(R10*1), R11 + SUBE R11, R6 + MOVD 16(R9)(R10*1), R11 + SUBE R11, R7 + MOVD 24(R9)(R10*1), R11 + SUBE R11, R1 + MOVD R0, R4 + SUBE R4, R4 // save CF + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + + ADD $32, R10 // i += 4 + SUB $4, R3 // n -= 4 + BGE U1 // if n >= 0 goto U1 + +v1: ADD $4, R3 // n += 4 + BLE E1 // if n <= 0 goto E1 + +L1: // n > 0 + MOVD R0, R11 + SUBC R4, R11 // restore CF + MOVD 0(R8)(R10*1), R5 + MOVD 0(R9)(R10*1), R11 + SUBE R11, R5 + MOVD R5, 0(R2)(R10*1) + MOVD R0, R4 + SUBE R4, R4 // save CF + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L1 // if n > 0 goto L1 + +E1: NEG R4, R4 + MOVD R4, c+72(FP) // return c + RET + +TEXT ·addVW(SB),NOSPLIT,$0 + MOVD addwvectorfacility+0x00(SB),R1 + BR (R1) + +TEXT ·addVW_check(SB),NOSPLIT,$0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $addwvectorfacility+0x00(SB), R1 + MOVD $·addVW_novec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·addVW_novec(SB), 0(R1) + BR ·addVW_novec(SB) +vectorimpl: + MOVD $addwvectorfacility+0x00(SB), R1 + MOVD $·addVW_vec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·addVW_vec(SB), 0(R1) + BR ·addVW_vec(SB) + +GLOBL addwvectorfacility+0x00(SB), NOPTR, $8 +DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB) + + +// func addVW_vec(z, x []Word, y Word) (c Word) +TEXT ·addVW_vec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 + + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + MOVD R8, R5 + MOVD R2, R7 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v10 // if n < 0 goto v10 + SUB $12, R3 + BLT A10 + + // n >= 0 + // regular loop body unrolled 16x + + VZERO V0 // prepare V0 to be final carry register + VZERO V9 // to ensure upper half is zero + VLVGG $1, R4, V9 +UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4 + ADD $64, R5 + VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order + VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order + + + VACCCQ V1, V9, V0, V25 + VACQ V1, V9, V0, V17 + VZERO V9 + VACCCQ V2, V9, V25, V26 + VACQ V2, V9, V25, V18 + + + VLM 0(R5), V5, V6 // 32-bytes into V5..V6 + ADD $32, R5 + + VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order + VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order + + VACCCQ V3, V9, V26, V27 + VACQ V3, V9, V26, V19 + VACCCQ V4, V9, V27, V28 + VACQ V4, V9, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V7..V8 + ADD $32, R5 + + VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order + VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order + + VACCCQ V5, V9, V28, V29 + VACQ V5, V9, V28, V21 + VACCCQ V6, V9, V29, V30 + VACQ V6, V9, V29, V22 + + VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order + VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order + + VACCCQ V7, V9, V30, V31 + VACQ V7, V9, V30, V23 + VACCCQ V8, V9, V31, V0 //V0 has carry-over + VACQ V8, V9, V31, V24 + + VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order + VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order + VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order + VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order + VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order + VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order + VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order + VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 + +A10: ADD $12, R3 // n += 16 + + + // s/JL/JMP/ below to disable the unrolled loop + + BLT v10 // if n < 0 goto v10 + + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R5 + ADDE R0, R6 + ADDE R0, R7 + ADDE R0, R1 + ADDE R0, R0 + MOVD R0, R4 // save CF + SUB R0, R0 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 + +v10: ADD $4, R3 // n += 4 + BLE E10 // if n <= 0 goto E4 + + +L4: // n > 0 + MOVD 0(R8)(R10*1), R5 + ADDC R4, R5 + ADDE R0, R0 + MOVD R0, R4 // save CF + SUB R0, R0 + MOVD R5, 0(R2)(R10*1) + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 + +E10: MOVD R4, c+56(FP) // return c + + RET + + +TEXT ·addVW_novec(SB),NOSPLIT,$0 +//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 + MOVD $0, R0 // make sure it's 0 + MOVD $0, R10 // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v4 // if n < 4 goto v4 + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + ADDC R4, R5 + ADDE R0, R6 + ADDE R0, R7 + ADDE R0, R1 + ADDE R0, R0 + MOVD R0, R4 // save CF + SUB R0, R0 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 + +v4: ADD $4, R3 // n += 4 + BLE E4 // if n <= 0 goto E4 + +L4: // n > 0 + MOVD 0(R8)(R10*1), R5 + ADDC R4, R5 + ADDE R0, R0 + MOVD R0, R4 // save CF + SUB R0, R0 + MOVD R5, 0(R2)(R10*1) + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 + +E4: MOVD R4, c+56(FP) // return c + + RET + +TEXT ·subVW(SB),NOSPLIT,$0 + MOVD subwvectorfacility+0x00(SB),R1 + BR (R1) + +TEXT ·subVW_check(SB),NOSPLIT,$0 + MOVB ·hasVX(SB), R1 + CMPBEQ R1, $1, vectorimpl // vectorfacility = 1, vector supported + MOVD $subwvectorfacility+0x00(SB), R1 + MOVD $·subVW_novec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·subVW_novec(SB), 0(R1) + BR ·subVW_novec(SB) +vectorimpl: + MOVD $subwvectorfacility+0x00(SB), R1 + MOVD $·subVW_vec(SB), R2 + MOVD R2, 0(R1) + //MOVD $·subVW_vec(SB), 0(R1) + BR ·subVW_vec(SB) + +GLOBL subwvectorfacility+0x00(SB), NOPTR, $8 +DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB) + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW_vec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 + + MOVD $0, R0 // make sure it's zero + MOVD $0, R10 // i = 0 + MOVD R8, R5 + MOVD R2, R7 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v11 // if n < 0 goto v11 + SUB $12, R3 + BLT A11 + + VZERO V0 + MOVD $1, R6 // prepare V0 to be final carry register + VLVGG $1, R6, V0 // borrow is initially "no borrow" + VZERO V9 // to ensure upper half is zero + VLVGG $1, R4, V9 + + // n >= 0 + // regular loop body unrolled 16x + + +UU1: VLM 0(R5), V1, V4 // 64-bytes into V1..V4 + ADD $64, R5 + VPDI $0x4,V1,V1,V1 // flip the doublewords to big-endian order + VPDI $0x4,V2,V2,V2 // flip the doublewords to big-endian order + + + VSBCBIQ V1, V9, V0, V25 + VSBIQ V1, V9, V0, V17 + VZERO V9 + VSBCBIQ V2, V9, V25, V26 + VSBIQ V2, V9, V25, V18 + + VLM 0(R5), V5, V6 // 32-bytes into V5..V6 + ADD $32, R5 + + VPDI $0x4,V3,V3,V3 // flip the doublewords to big-endian order + VPDI $0x4,V4,V4,V4 // flip the doublewords to big-endian order + + + VSBCBIQ V3, V9, V26, V27 + VSBIQ V3, V9, V26, V19 + VSBCBIQ V4, V9, V27, V28 + VSBIQ V4, V9, V27, V20 + + VLM 0(R5), V7, V8 // 32-bytes into V7..V8 + ADD $32, R5 + + VPDI $0x4,V5,V5,V5 // flip the doublewords to big-endian order + VPDI $0x4,V6,V6,V6 // flip the doublewords to big-endian order + + VSBCBIQ V5, V9, V28, V29 + VSBIQ V5, V9, V28, V21 + VSBCBIQ V6, V9, V29, V30 + VSBIQ V6, V9, V29, V22 + + VPDI $0x4,V7,V7,V7 // flip the doublewords to big-endian order + VPDI $0x4,V8,V8,V8 // flip the doublewords to big-endian order + + VSBCBIQ V7, V9, V30, V31 + VSBIQ V7, V9, V30, V23 + VSBCBIQ V8, V9, V31, V0 // V0 has carry-over + VSBIQ V8, V9, V31, V24 + + VPDI $0x4,V17,V17,V17 // flip the doublewords to big-endian order + VPDI $0x4,V18,V18,V18 // flip the doublewords to big-endian order + VPDI $0x4,V19,V19,V19 // flip the doublewords to big-endian order + VPDI $0x4,V20,V20,V20 // flip the doublewords to big-endian order + VPDI $0x4,V21,V21,V21 // flip the doublewords to big-endian order + VPDI $0x4,V22,V22,V22 // flip the doublewords to big-endian order + VPDI $0x4,V23,V23,V23 // flip the doublewords to big-endian order + VPDI $0x4,V24,V24,V24 // flip the doublewords to big-endian order + VSTM V17, V24, 0(R7) // 128-bytes into z + ADD $128, R7 + ADD $128, R10 // i += 16 + SUB $16, R3 // n -= 16 + BGE UU1 // if n >= 0 goto U1 + VLGVG $1, V0, R4 // put cf into R4 in case we branch to v10 + SUB $1, R4 // save cf + NEG R4, R4 +A11: ADD $12, R3 // n += 16 + + BLT v11 // if n < 0 goto v11 + + // n >= 0 + // regular loop body unrolled 4x + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + SUBC R4, R5 //SLGR -> SUBC + SUBE R0, R6 //SLBGR -> SUBE + SUBE R0, R7 + SUBE R0, R1 + SUBE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 + +v11: ADD $4, R3 // n += 4 + BLE E11 // if n <= 0 goto E4 + +L4: // n > 0 + + MOVD 0(R8)(R10*1), R5 + SUBC R4, R5 + SUBE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 + +E11: MOVD R4, c+56(FP) // return c + + RET + +//DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) +// func subVW(z, x []Word, y Word) (c Word) +// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names) +TEXT ·subVW_novec(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R3 + MOVD x+24(FP), R8 + MOVD y+48(FP), R4 // c = y + MOVD z+0(FP), R2 + MOVD $0, R0 // make sure it's 0 + MOVD $0, R10 // i = 0 + + // s/JL/JMP/ below to disable the unrolled loop + SUB $4, R3 // n -= 4 + BLT v4 // if n < 4 goto v4 + +U4: // n >= 0 + // regular loop body unrolled 4x + MOVD 0(R8)(R10*1), R5 + MOVD 8(R8)(R10*1), R6 + MOVD 16(R8)(R10*1), R7 + MOVD 24(R8)(R10*1), R1 + SUBC R4, R5 //SLGR -> SUBC + SUBE R0, R6 //SLBGR -> SUBE + SUBE R0, R7 + SUBE R0, R1 + SUBE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + MOVD R6, 8(R2)(R10*1) + MOVD R7, 16(R2)(R10*1) + MOVD R1, 24(R2)(R10*1) + + ADD $32, R10 // i += 4 -> i +=32 + SUB $4, R3 // n -= 4 + BGE U4 // if n >= 0 goto U4 + +v4: ADD $4, R3 // n += 4 + BLE E4 // if n <= 0 goto E4 + +L4: // n > 0 + MOVD 0(R8)(R10*1), R5 + SUBC R4, R5 + SUBE R4, R4 // save CF + NEG R4, R4 + MOVD R5, 0(R2)(R10*1) + + ADD $8, R10 // i++ + SUB $1, R3 // n-- + BGT L4 // if n > 0 goto L4 + +E4: MOVD R4, c+56(FP) // return c + + RET + +// func shlVU(z, x []Word, s uint) (c Word) +TEXT ·shlVU(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R5 + MOVD $0, R0 + SUB $1, R5 // n-- + BLT X8b // n < 0 (n <= 0) + + // n > 0 + MOVD s+48(FP), R4 + CMPBEQ R0, R4, Z80 //handle 0 case beq + MOVD $64, R6 + CMPBEQ R6, R4, Z864 //handle 64 case beq + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + SUB R4, R6, R7 + MOVD (R8)(R5*1), R10 // w1 = x[i-1] + SRD R7, R10, R3 + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E8 + + // i < n-1 +L8: MOVD R10, R3 // w = w1 + MOVD -8(R8)(R5*1), R10 // w1 = x[i+1] + + SLD R4, R3 // w<>ŝ + SRD R7, R10, R6 + OR R6, R3 + MOVD R3, (R2)(R5*1) // z[i] = w<>ŝ + SUB $8, R5 // i-- + +E8: CMPBGT R5, R0, L8 // i < n-1 + + // i >= n-1 +X8a: SLD R4, R10 // w1<= n-1 + MOVD R10, (R2)(R5*1) + RET + +Z864: MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + MOVD (R8)(R5*1), R3 // w1 = x[n-1] + MOVD R3, c+56(FP) // z[i] = x[n-1] + + BR E864 + + // i < n-1 +L864: MOVD -8(R8)(R5*1), R3 + + MOVD R3, (R2)(R5*1) // z[i] = x[n-1] + SUB $8, R5 // i-- + +E864: CMPBGT R5, R0, L864 // i < n-1 + + MOVD R0, (R2) // z[n-1] = 0 + RET + + +// CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6 +// func shrVU(z, x []Word, s uint) (c Word) +TEXT ·shrVU(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R5 + MOVD $0, R0 + SUB $1, R5 // n-- + BLT X9b // n < 0 (n <= 0) + + // n > 0 + MOVD s+48(FP), R4 + CMPBEQ R0, R4, ZB0 //handle 0 case beq + MOVD $64, R6 + CMPBEQ R6, R4, ZB64 //handle 64 case beq + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + SUB R4, R6, R7 + MOVD (R8), R10 // w1 = x[0] + SLD R7, R10, R3 + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E9 + + // i < n-1 +L9: MOVD R10, R3 // w = w1 + MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] + + SRD R4, R3 // w>>s | w1<>s | w1<= n-1 +X9a: SRD R4, R10 // w1>>s + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s + RET + +X9b: MOVD R0, c+56(FP) + RET + +ZB0: MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + + MOVD (R8), R10 // w1 = x[0] + MOVD $0, R3 // R10 << 64 + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E9Z + + // i < n-1 +L9Z: MOVD R10, R3 // w = w1 + MOVD 8(R8)(R1*1), R10 // w1 = x[i+1] + + MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<= n-1 + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s + RET + +ZB64: MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + SLD $3, R5 // n = n*8 + MOVD (R8), R3 // w1 = x[0] + MOVD R3, c+56(FP) + + MOVD $0, R1 // i = 0 + BR E964 + + // i < n-1 +L964: MOVD 8(R8)(R1*1), R3 // w1 = x[i+1] + + MOVD R3, (R2)(R1*1) // z[i] = w>>s | w1<= n-1 + MOVD $0, R10 // w1>>s + MOVD R10, (R2)(R5*1) // z[n-1] = w1>>s + RET + +// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD r+56(FP), R4 // c = r + MOVD z_len+8(FP), R5 + MOVD $0, R1 // i = 0 + MOVD $0, R7 // i*8 = 0 + MOVD $0, R0 // make sure it's zero + BR E5 + +L5: MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + ADDC R4, R11 //add to low order bits + ADDE R0, R6 + MOVD R11, (R2)(R1*1) + MOVD R6, R4 + ADD $8, R1 // i*8 + 8 + ADD $1, R7 // i++ + +E5: CMPBLT R7, R5, L5 // i < n + + MOVD R4, c+64(FP) + RET + +// func addMulVVW(z, x []Word, y Word) (c Word) +// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i +TEXT ·addMulVVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R2 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z_len+8(FP), R5 + + MOVD $0, R1 // i*8 = 0 + MOVD $0, R7 // i = 0 + MOVD $0, R0 // make sure it's zero + MOVD $0, R4 // c = 0 + + MOVD R5, R12 + AND $-2, R12 + CMPBGE R5, $2, A6 + BR E6 + +A6: MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (R2)(R1*1), R10 + ADDC R10, R11 //add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (R2)(R1*1) + + MOVD (8)(R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (8)(R2)(R1*1), R10 + ADDC R10, R11 //add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (8)(R2)(R1*1) + + ADD $16, R1 // i*8 + 8 + ADD $2, R7 // i++ + + CMPBLT R7, R12, A6 + BR E6 + +L6: MOVD (R8)(R1*1), R6 + MULHDU R9, R6 + MOVD (R2)(R1*1), R10 + ADDC R10, R11 //add to low order bits + ADDE R0, R6 + ADDC R4, R11 + ADDE R0, R6 + MOVD R6, R4 + MOVD R11, (R2)(R1*1) + + ADD $8, R1 // i*8 + 8 + ADD $1, R7 // i++ + +E6: CMPBLT R7, R5, L6 // i < n + + MOVD R4, c+56(FP) + RET + +// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) +// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i +TEXT ·divWVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R2 + MOVD xn+24(FP), R10 // r = xn + MOVD x+32(FP), R8 + MOVD y+56(FP), R9 + MOVD z_len+8(FP), R7 // i = z + SLD $3, R7, R1 // i*8 + MOVD $0, R0 // make sure it's zero + BR E7 + +L7: MOVD (R8)(R1*1), R11 + WORD $0xB98700A9 //DLGR R10,R9 + MOVD R11, (R2)(R1*1) + +E7: SUB $1, R7 // i-- + SUB $8, R1 + BGE L7 // i >= 0 + + MOVD R10, r+64(FP) + RET diff --git a/arith_test.go b/arith_test.go new file mode 100644 index 0000000..93bac62 --- /dev/null +++ b/arith_test.go @@ -0,0 +1,463 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package decimal + +import ( + "fmt" + "math/rand" + "testing" +) + +// TODO(db47h): set this to false +var isRaceBuilder = true + +type funVV func(z, x, y []Word) (c Word) +type argVV struct { + z, x, y nat + c Word +} + +var sumVV = []argVV{ + {}, + {nat{0}, nat{0}, nat{0}, 0}, + {nat{1}, nat{1}, nat{0}, 0}, + {nat{0}, nat{_M}, nat{1}, 1}, + {nat{80235}, nat{12345}, nat{67890}, 0}, + {nat{_M - 1}, nat{_M}, nat{_M}, 1}, + {nat{0, 0, 0, 0}, nat{_M, _M, _M, _M}, nat{1, 0, 0, 0}, 1}, + {nat{0, 0, 0, _M}, nat{_M, _M, _M, _M - 1}, nat{1, 0, 0, 0}, 0}, + {nat{0, 0, 0, 0}, nat{_M, 0, _M, 0}, nat{1, _M, 0, _M}, 1}, +} + +func testFunVV(t *testing.T, msg string, f funVV, a argVV) { + z := make(nat, len(a.z)) + c := f(z, a.x, a.y) + for i, zi := range z { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) + break + } + } + if c != a.c { + t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) + } +} + +func TestFunVV(t *testing.T) { + for _, a := range sumVV { + arg := a + testFunVV(t, "addVV_g", addVV_g, arg) + testFunVV(t, "addVV", addVV, arg) + + arg = argVV{a.z, a.y, a.x, a.c} + testFunVV(t, "addVV_g symmetric", addVV_g, arg) + testFunVV(t, "addVV symmetric", addVV, arg) + + arg = argVV{a.x, a.z, a.y, a.c} + testFunVV(t, "subVV_g", subVV_g, arg) + testFunVV(t, "subVV", subVV, arg) + + arg = argVV{a.y, a.z, a.x, a.c} + testFunVV(t, "subVV_g symmetric", subVV_g, arg) + testFunVV(t, "subVV symmetric", subVV, arg) + } +} + +// Always the same seed for reproducible results. +var rnd = rand.New(rand.NewSource(0)) + +func rndW() Word { + return Word(rnd.Int63()<<1 | rnd.Int63n(2)) +} + +func rndV(n int) []Word { + v := make([]Word, n) + for i := range v { + v[i] = rndW() + } + return v +} + +var benchSizes = []int{1, 2, 3, 4, 5, 1e1, 1e2, 1e3, 1e4, 1e5} + +func BenchmarkAddVV(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndV(n) + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + addVV(z, x, y) + } + }) + } +} + +func BenchmarkSubVV(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndV(n) + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + subVV(z, x, y) + } + }) + } +} + +type funVW func(z, x []Word, y Word) (c Word) +type argVW struct { + z, x nat + y Word + c Word +} + +var sumVW = []argVW{ + {}, + {nil, nil, 2, 2}, + {nat{0}, nat{0}, 0, 0}, + {nat{1}, nat{0}, 1, 0}, + {nat{1}, nat{1}, 0, 0}, + {nat{0}, nat{_M}, 1, 1}, + {nat{0, 0, 0, 0}, nat{_M, _M, _M, _M}, 1, 1}, + {nat{585}, nat{314}, 271, 0}, +} + +var lshVW = []argVW{ + {}, + {nat{0}, nat{0}, 0, 0}, + {nat{0}, nat{0}, 1, 0}, + {nat{0}, nat{0}, 20, 0}, + + {nat{_M}, nat{_M}, 0, 0}, + {nat{_M << 1 & _M}, nat{_M}, 1, 1}, + {nat{_M << 20 & _M}, nat{_M}, 20, _M >> (_W - 20)}, + + {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, + {nat{_M << 1 & _M, _M, _M}, nat{_M, _M, _M}, 1, 1}, + {nat{_M << 20 & _M, _M, _M}, nat{_M, _M, _M}, 20, _M >> (_W - 20)}, +} + +var rshVW = []argVW{ + {}, + {nat{0}, nat{0}, 0, 0}, + {nat{0}, nat{0}, 1, 0}, + {nat{0}, nat{0}, 20, 0}, + + {nat{_M}, nat{_M}, 0, 0}, + {nat{_M >> 1}, nat{_M}, 1, _M << (_W - 1) & _M}, + {nat{_M >> 20}, nat{_M}, 20, _M << (_W - 20) & _M}, + + {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, + {nat{_M, _M, _M >> 1}, nat{_M, _M, _M}, 1, _M << (_W - 1) & _M}, + {nat{_M, _M, _M >> 20}, nat{_M, _M, _M}, 20, _M << (_W - 20) & _M}, +} + +func testFunVW(t *testing.T, msg string, f funVW, a argVW) { + z := make(nat, len(a.z)) + c := f(z, a.x, a.y) + for i, zi := range z { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) + break + } + } + if c != a.c { + t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) + } +} + +func makeFunVW(f func(z, x []Word, s uint) (c Word)) funVW { + return func(z, x []Word, s Word) (c Word) { + return f(z, x, uint(s)) + } +} + +func TestFunVW(t *testing.T) { + for _, a := range sumVW { + arg := a + testFunVW(t, "addVW_g", addVW_g, arg) + testFunVW(t, "addVW", addVW, arg) + + arg = argVW{a.x, a.z, a.y, a.c} + testFunVW(t, "subVW_g", subVW_g, arg) + testFunVW(t, "subVW", subVW, arg) + } + + shlVW_g := makeFunVW(shlVU_g) + shlVW := makeFunVW(shlVU) + for _, a := range lshVW { + arg := a + testFunVW(t, "shlVU_g", shlVW_g, arg) + testFunVW(t, "shlVU", shlVW, arg) + } + + shrVW_g := makeFunVW(shrVU_g) + shrVW := makeFunVW(shrVU) + for _, a := range rshVW { + arg := a + testFunVW(t, "shrVU_g", shrVW_g, arg) + testFunVW(t, "shrVU", shrVW, arg) + } +} + +type argVU struct { + d []Word // d is a Word slice, the input parameters x and z come from this array. + l uint // l is the length of the input parameters x and z. + xp uint // xp is the starting position of the input parameter x, x := d[xp:xp+l]. + zp uint // zp is the starting position of the input parameter z, z := d[zp:zp+l]. + s uint // s is the shift number. + r []Word // r is the expected output result z. + c Word // c is the expected return value. + m string // message. +} + +var argshlVU = []argVU{ + // test cases for shlVU + {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0}, 7, 0, 0, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "complete overlap of shlVU"}, + {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by half of shlVU"}, + {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by 1 Word of shlVU"}, + {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "no overlap of shlVU"}, +} + +var argshrVU = []argVU{ + // test cases for shrVU + {[]Word{0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 1, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "complete overlap of shrVU"}, + {[]Word{0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 4, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by half of shrVU"}, + {[]Word{0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 7, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by 1 Word of shrVU"}, + {[]Word{0, 0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 8, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "no overlap of shrVU"}, +} + +func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) { + // save a.d for error message, or it will be overwritten. + b := make([]Word, len(a.d)) + copy(b, a.d) + z := a.d[a.zp : a.zp+a.l] + x := a.d[a.xp : a.xp+a.l] + c := f(z, x, a.s) + for i, zi := range z { + if zi != a.r[i] { + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) + break + } + } + if c != a.c { + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) + } +} + +func TestShiftOverlap(t *testing.T) { + for _, a := range argshlVU { + arg := a + testShiftFunc(t, shlVU, arg) + } + + for _, a := range argshrVU { + arg := a + testShiftFunc(t, shrVU, arg) + } +} + +func BenchmarkAddVW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndW() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + addVW(z, x, y) + } + }) + } +} + +func BenchmarkSubVW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndW() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + subVW(z, x, y) + } + }) + } +} + +type funVWW func(z, x []Word, y, r Word) (c Word) +type argVWW struct { + z, x nat + y, r Word + c Word +} + +var prodVWW = []argVWW{ + {}, + {nat{0}, nat{0}, 0, 0, 0}, + {nat{991}, nat{0}, 0, 991, 0}, + {nat{0}, nat{_M}, 0, 0, 0}, + {nat{991}, nat{_M}, 0, 991, 0}, + {nat{0}, nat{0}, _M, 0, 0}, + {nat{991}, nat{0}, _M, 991, 0}, + {nat{1}, nat{1}, 1, 0, 0}, + {nat{992}, nat{1}, 1, 991, 0}, + {nat{22793}, nat{991}, 23, 0, 0}, + {nat{22800}, nat{991}, 23, 7, 0}, + {nat{0, 0, 0, 22793}, nat{0, 0, 0, 991}, 23, 0, 0}, + {nat{7, 0, 0, 22793}, nat{0, 0, 0, 991}, 23, 7, 0}, + {nat{0, 0, 0, 0}, nat{7893475, 7395495, 798547395, 68943}, 0, 0, 0}, + {nat{991, 0, 0, 0}, nat{7893475, 7395495, 798547395, 68943}, 0, 991, 0}, + {nat{0, 0, 0, 0}, nat{0, 0, 0, 0}, 894375984, 0, 0}, + {nat{991, 0, 0, 0}, nat{0, 0, 0, 0}, 894375984, 991, 0}, + {nat{_M << 1 & _M}, nat{_M}, 1 << 1, 0, _M >> (_W - 1)}, + {nat{_M<<1&_M + 1}, nat{_M}, 1 << 1, 1, _M >> (_W - 1)}, + {nat{_M << 7 & _M}, nat{_M}, 1 << 7, 0, _M >> (_W - 7)}, + {nat{_M<<7&_M + 1<<6}, nat{_M}, 1 << 7, 1 << 6, _M >> (_W - 7)}, + {nat{_M << 7 & _M, _M, _M, _M}, nat{_M, _M, _M, _M}, 1 << 7, 0, _M >> (_W - 7)}, + {nat{_M<<7&_M + 1<<6, _M, _M, _M}, nat{_M, _M, _M, _M}, 1 << 7, 1 << 6, _M >> (_W - 7)}, +} + +func testFunVWW(t *testing.T, msg string, f funVWW, a argVWW) { + z := make(nat, len(a.z)) + c := f(z, a.x, a.y, a.r) + for i, zi := range z { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) + break + } + } + if c != a.c { + t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) + } +} + +// TODO(gri) mulAddVWW and divWVW are symmetric operations but +// their signature is not symmetric. Try to unify. + +type funWVW func(z []Word, xn Word, x []Word, y Word) (r Word) +type argWVW struct { + z nat + xn Word + x nat + y Word + r Word +} + +func testFunWVW(t *testing.T, msg string, f funWVW, a argWVW) { + z := make(nat, len(a.z)) + r := f(z, a.xn, a.x, a.y) + for i, zi := range z { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) + break + } + } + if r != a.r { + t.Errorf("%s%+v\n\tgot r = %#x; want %#x", msg, a, r, a.r) + } +} + +func TestFunVWW(t *testing.T) { + for _, a := range prodVWW { + arg := a + testFunVWW(t, "mulAddVWW_g", mulAddVWW_g, arg) + testFunVWW(t, "mulAddVWW", mulAddVWW, arg) + + if a.y != 0 && a.r < a.y { + arg := argWVW{a.x, a.c, a.z, a.y, a.r} + testFunWVW(t, "divWVW_g", divWVW_g, arg) + testFunWVW(t, "divWVW", divWVW, arg) + } + } +} + +var mulWWTests = []struct { + x, y Word + q, r Word +}{ + {_M, _M, _M - 1, 1}, + // 32 bit only: {0xc47dfa8c, 50911, 0x98a4, 0x998587f4}, +} + +func TestMulWW(t *testing.T) { + for i, test := range mulWWTests { + q, r := mulWW_g(test.x, test.y) + if q != test.q || r != test.r { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) + } + } +} + +var mulAddWWWTests = []struct { + x, y, c Word + q, r Word +}{ + // TODO(agl): These will only work on 64-bit platforms. + // {15064310297182388543, 0xe7df04d2d35d5d80, 13537600649892366549, 13644450054494335067, 10832252001440893781}, + // {15064310297182388543, 0xdab2f18048baa68d, 13644450054494335067, 12869334219691522700, 14233854684711418382}, + {_M, _M, 0, _M - 1, 1}, + {_M, _M, _M, _M, 0}, +} + +func TestMulAddWWW(t *testing.T) { + for i, test := range mulAddWWWTests { + q, r := mulAddWWW_g(test.x, test.y, test.c) + if q != test.q || r != test.r { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) + } + } +} + +func BenchmarkMulAddVWW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + z := make([]Word, n+1) + x := rndV(n) + y := rndW() + r := rndW() + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + mulAddVWW(z, x, y, r) + } + }) + } +} + +func BenchmarkAddMulVVW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndW() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + addMulVVW(z, x, y) + } + }) + } +} diff --git a/arith_wasm.s b/arith_wasm.s new file mode 100644 index 0000000..382597c --- /dev/null +++ b/arith_wasm.s @@ -0,0 +1,40 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !math_big_pure_go + +#include "textflag.h" + +TEXT ·mulWW(SB),NOSPLIT,$0 + JMP ·mulWW_g(SB) + +TEXT ·divWW(SB),NOSPLIT,$0 + JMP ·divWW_g(SB) + +TEXT ·addVV(SB),NOSPLIT,$0 + JMP ·addVV_g(SB) + +TEXT ·subVV(SB),NOSPLIT,$0 + JMP ·subVV_g(SB) + +TEXT ·addVW(SB),NOSPLIT,$0 + JMP ·addVW_g(SB) + +TEXT ·subVW(SB),NOSPLIT,$0 + JMP ·subVW_g(SB) + +TEXT ·shlVU(SB),NOSPLIT,$0 + JMP ·shlVU_g(SB) + +TEXT ·shrVU(SB),NOSPLIT,$0 + JMP ·shrVU_g(SB) + +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + JMP ·mulAddVWW_g(SB) + +TEXT ·addMulVVW(SB),NOSPLIT,$0 + JMP ·addMulVVW_g(SB) + +TEXT ·divWVW(SB),NOSPLIT,$0 + JMP ·divWVW_g(SB) diff --git a/dec.go b/dec.go index 282a5fa..61f46ce 100644 --- a/dec.go +++ b/dec.go @@ -126,7 +126,7 @@ func (x dec) toNat(z []Word) []Word { // r = zz & _B; zz = zz >> _W var r Word for j := len(zz) - 1; j >= 0; j-- { - zz[j], r = mulAddWWW(r, _DB, zz[j]) + zz[j], r = mulAddWWW_g(r, _DB, zz[j]) } zz = zz.norm() z[i] = r @@ -382,10 +382,10 @@ func (q dec) divBasic(u, v dec) { } if ujn != vn1 { var rhat Word - qhat, rhat = div10WW(ujn, u[j+n-1], vn1) + qhat, rhat = div10WW_g(ujn, u[j+n-1], vn1) // x1 | x2 = q̂v_{n-2} vn2 := v[n-2] - x1, x2 := mul10WW(qhat, vn2) + x1, x2 := mul10WW_g(qhat, vn2) // test if q̂v_{n-2} > br̂ + u_{j+n-2} ujn2 := u[j+n-2] for greaterThan(x1, x2, rhat, ujn2) { @@ -396,7 +396,7 @@ func (q dec) divBasic(u, v dec) { if rhat < prevRhat { break } - x1, x2 = mul10WW(qhat, vn2) + x1, x2 = mul10WW_g(qhat, vn2) } } @@ -430,7 +430,7 @@ func greaterThan(x1, x2, y1, y2 Word) bool { // modW returns x % d. func (x dec) modW(d Word) (r Word) { for i := len(x) - 1; i >= 0; i-- { - _, r = div10WW(r, x[i], d) + _, r = div10WW_g(r, x[i], d) } return r } @@ -512,7 +512,7 @@ func (z dec) sqr(x dec) dec { case n == 1: d := x[0] z = z.make(2) - z[1], z[0] = mul10WW(d, d) + z[1], z[0] = mul10WW_g(d, d) return z.norm() } diff --git a/dec_arith.go b/dec_arith.go index 5f9e1ed..1632e50 100644 --- a/dec_arith.go +++ b/dec_arith.go @@ -67,50 +67,6 @@ func nlz10(x Word) uint { return _DW - decDigits64(uint64(x)) } -// shl10VU sets z to x*(10**s), s < _WD -func shl10VU(z, x dec, s uint) (r Word) { - if s == 0 { - copy(z, x) - return - } - if len(z) == 0 || len(x) == 0 { - return - } - d, m := Word(pow10(_DW-s)), Word(pow10(s)) - var h, l Word - r, l = divWW(0, x[len(x)-1], d) - for i := len(z) - 1; i > 0; i-- { - t := l - h, l = divWW(0, x[i-1], d) - z[i] = t*m + h - } - z[0] = l * m - - return r -} - -// shr10VU sets z to x/(10**s) -func shr10VU(z, x dec, s uint) (r Word) { - if s == 0 { - copy(z, x) - return - } - if len(z) == 0 || len(x) == 0 { - return - } - - var h, l Word - d, m := Word(pow10(s)), Word(pow10(_DW-s)) - h, r = divWW(0, x[0], Word(d)) - for i := 1; i < len(z) && i < len(x); i++ { - t := h - h, l = divWW(0, x[i], d) - z[i-1] = t + l*m - } - z[len(z)-1] = h - return r -} - func decTrailingZeros(n uint) uint { var d uint if bits.UintSize > 32 { @@ -172,9 +128,60 @@ func decMaxPow(b Word) (p Word, n int) { return Word(decMaxPow64[i]), int(decMaxPow64[i+1]) } +//----------------------------------------------------------------------------- +// Arithmetic primitives +// + +// z1<<_W + z0 = x*y +func mul10WW_g(x, y Word) (z1, z0 Word) { + hi, lo := bits.Mul(uint(x), uint(y)) + return div10W_g(Word(hi), Word(lo)) +} + +// q = (u1<<_W + u0 - r)/v +func div10WW_g(u1, u0, v Word) (q, r Word) { + // convert to base 2 + hi, lo := mulAddWWW_g(u1, _DB, u0) + // q = (u-r)/v. Since v < _BD => r < _BD + return divWW_g(hi, lo, v) +} + +func add10WWW_g(x, y, cIn Word) (s, c Word) { + r, cc := bits.Add(uint(x), uint(y), uint(cIn)) + if cc != 0 || r >= _DB { + cc = 1 + r -= _DB + } + return Word(r), Word(cc) +} + +// The resulting carry c is either 0 or 1. +func add10VV_g(z, x, y []Word) (c Word) { + for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + z[i], c = add10WWW_g(x[i], y[i], c) + } + return +} + +func sub10WWW_g(x, y, b Word) (d, c Word) { + dd, cc := bits.Sub(uint(x), uint(y), uint(b)) + if cc != 0 { + dd += _DB + } + return Word(dd), Word(cc) +} + +// The resulting carry c is either 0 or 1. +func sub10VV_g(z, x, y []Word) (c Word) { + for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + z[i], c = sub10WWW_g(x[i], y[i], c) + } + return +} + // add10VW adds y to x. The resulting carry c is either 0 or 1. -func add10VW(z, x dec, y Word) (c Word) { - z[0], c = add10WWW(x[0], y, 0) +func add10VW_g(z, x dec, y Word) (c Word) { + z[0], c = add10WWW_g(x[0], y, 0) // propagate carry for i := 1; i < len(z) && i < len(x); i++ { s := x[i] + c @@ -189,15 +196,97 @@ func add10VW(z, x dec, y Word) (c Word) { return } -func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) { +func sub10VW_g(z, x []Word, y Word) (c Word) { + c = y + for i := 0; i < len(z) && i < len(x); i++ { + zi, cc := bits.Sub(uint(x[i]), uint(c), 0) + c = Word(cc) + if c == 0 { + z[i] = Word(zi) + copy(z[i+1:], x[i+1:]) + return + } + z[i] = Word(zi + _DB) + } + return +} + +// shl10VU sets z to x*(10**s), s < _WD +func shl10VU_g(z, x dec, s uint) (r Word) { + if s == 0 { + copy(z, x) + return + } + if len(z) == 0 || len(x) == 0 { + return + } + d, m := Word(pow10(_DW-s)), Word(pow10(s)) + var h, l Word + r, l = divWW(0, x[len(x)-1], d) + for i := len(z) - 1; i > 0; i-- { + t := l + h, l = divWW(0, x[i-1], d) + z[i] = t*m + h + } + z[0] = l * m + + return r +} + +// shr10VU sets z to x/(10**s) +func shr10VU_g(z, x dec, s uint) (r Word) { + if s == 0 { + copy(z, x) + return + } + if len(z) == 0 || len(x) == 0 { + return + } + + var h, l Word + d, m := Word(pow10(s)), Word(pow10(_DW-s)) + h, r = divWW(0, x[0], Word(d)) + for i := 1; i < len(z) && i < len(x); i++ { + t := h + h, l = divWW(0, x[i], d) + z[i-1] = t + l*m + } + z[len(z)-1] = h + return r +} + +func mulAdd10VWW_g(z, x []Word, y, r Word) (c Word) { + c = r + // The comment near the top of this file discusses this for loop condition. + for i := 0; i < len(z) && i < len(x); i++ { + hi, lo := mulAddWWW_g(x[i], y, c) + c, z[i] = div10W_g(hi, lo) + } + return +} + +func addMul10VVW_g(z, x []Word, y Word) (c Word) { + for i := 0; i < len(z) && i < len(x); i++ { + // do x[i] * y + c in base 2 => (hi+cc) * 2**_W + lo + hi, z0 := mulAddWWW_g(x[i], y, z[i]) + lo, cc := bits.Add(uint(z0), uint(c), 0) + c, z[i] = div10W_g(hi+Word(cc), Word(lo)) + } + return +} + +func div10WVW_g(z []Word, xn Word, x []Word, y Word) (r Word) { r = xn for i := len(z) - 1; i >= 0; i-- { - z[i], r = div10WW(r, x[i], y) + // z[i], r = div10WW(r, x[i], y) + // FORCE INLINE + hi, lo := mulAddWWW_g(r, _DB, x[i]) + z[i], r = divWW_g(hi, lo, y) } return } -// divWDB returns the quotient and remainder of a double-Word n divided by _DB: +// div10W_g returns the quotient and remainder of a double-Word n divided by _DB: // // q = n/_DB, r = n%_DB // @@ -215,7 +304,7 @@ func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) { // is a no-op. In the comments below, these have been removed for the sake of // clarity. // -func divWDB(n1, n0 Word) (q, r Word) { +func div10W_g(n1, n0 Word) (q, r Word) { const ( N = _W d = _DB @@ -236,109 +325,16 @@ func divWDB(n1, n0 Word) (q, r Word) { nAdj := n10 + (_n1 & dNorm) // q1 = n2 + HIGH(mP * (n2-_n1) + nAdj) - q1, _ := mulAddWWW(mP, n2-_n1, nAdj) + q1, _ := mulAddWWW_g(mP, n2-_n1, nAdj) q1 += n2 // dr = 2**N*n1 + n0 - 2**N*d + (-1-q1)*d // = (-1-q1) * d + n0 + (1) // 2**N * (n1 - d) (2) // let t = -1 - q1 = (^q1 + 1) - 1 = ^q1 t := ^q1 - drHi, drLo := mulAddWWW(t, d, n0) // (1) - drHi += n1 - d // (2) + drHi, drLo := mulAddWWW_g(t, d, n0) // (1) + drHi += n1 - d // (2) // q = drHi - (-1-q1) // r = drLow + (d & drHi) return drHi - t, drLo + d&drHi } - -// q = (u1<<_W + u0 - r)/v -func div10WW(u1, u0, v Word) (q, r Word) { - // q < _BD if u1 < v < _BD - if debugDecimal && !(u1 < v && v < _DB) { - panic("decimal: integer overflow") - } - // convert to base 2 - hi, lo := mulAddWWW(u1, _DB, u0) - // q = (u-r)/v. Since v < _BD => r < _BD - return divWW(hi, lo, v) -} - -func mulAdd10VWW(z, x []Word, y, r Word) (c Word) { - c = r - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { - c, z[i] = mulAdd10WWW(x[i], y, c) - } - return -} - -// z1*_BD + z0 = x*y + c -func mulAdd10WWW(x, y, c Word) (z1, z0 Word) { - hi, lo := bits.Mul(uint(x), uint(y)) - var cc uint - lo, cc = bits.Add(lo, uint(c), 0) - return divWDB(Word(hi+cc), Word(lo)) -} - -// z1<<_W + z0 = x*y -func mul10WW(x, y Word) (z1, z0 Word) { - hi, lo := bits.Mul(uint(x), uint(y)) - return divWDB(Word(hi), Word(lo)) -} - -func add10WWW(x, y, cIn Word) (s, c Word) { - r, cc := bits.Add(uint(x), uint(y), uint(cIn)) - if cc != 0 || r >= _DB { - cc = 1 - r -= _DB - } - return Word(r), Word(cc) -} - -func addMul10VVW(z, x []Word, y Word) (c Word) { - for i := 0; i < len(z) && i < len(x); i++ { - // do x[i] * y + c in base 2 => (hi+cc) * 2**_W + lo - hi, z0 := mulAddWWW(x[i], y, z[i]) - lo, cc := bits.Add(uint(z0), uint(c), 0) - c, z[i] = divWDB(hi+Word(cc), Word(lo)) - } - return -} - -// The resulting carry c is either 0 or 1. -func add10VV(z, x, y []Word) (c Word) { - for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { - z[i], c = add10WWW(x[i], y[i], c) - } - return -} - -func sub10WWW(x, y, b Word) (d, c Word) { - dd, cc := bits.Sub(uint(x), uint(y), uint(b)) - if cc != 0 { - dd += _DB - } - return Word(dd), Word(cc) -} - -// The resulting carry c is either 0 or 1. -func sub10VV(z, x, y []Word) (c Word) { - for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { - z[i], c = sub10WWW(x[i], y[i], c) - } - return -} - -func sub10VW(z, x []Word, y Word) (c Word) { - c = y - for i := 0; i < len(z) && i < len(x); i++ { - zi, cc := bits.Sub(uint(x[i]), uint(c), 0) - c = Word(cc) - if c == 0 { - z[i] = Word(zi) - copy(z[i+1:], x[i+1:]) - return - } - z[i] = Word(zi + _DB) - } - return -} diff --git a/dec_arith_decl.go b/dec_arith_decl.go new file mode 100644 index 0000000..52737ec --- /dev/null +++ b/dec_arith_decl.go @@ -0,0 +1,15 @@ +// +build !decimal_pure_go + +package decimal + +// implemented in arith_$GOARCH.s +// func add10VV(z, x, y []Word) (c Word) +// func sub10VV(z, x, y []Word) (c Word) +// func add10VW(z, x []Word, y Word) (c Word) +// func sub10VW(z, x []Word, y Word) (c Word) +// func shl10VU(z, x []Word, s uint) (c Word) +// func shr10VU(z, x []Word, s uint) (c Word) +// func mulAdd10VWW(z, x []Word, y, r Word) (c Word) +// func addMul10VVW(z, x []Word, y Word) (c Word) +// func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) +// func div10W(n1, n0 Word) (q, r Word) diff --git a/dec_arith_decl_pure.go b/dec_arith_decl_pure.go new file mode 100644 index 0000000..db68305 --- /dev/null +++ b/dec_arith_decl_pure.go @@ -0,0 +1,41 @@ +package decimal + +func add10VV(z, x, y []Word) (c Word) { + return add10VV_g(z, x, y) +} + +func sub10VV(z, x, y []Word) (c Word) { + return sub10VV_g(z, x, y) +} + +func add10VW(z, x []Word, y Word) (c Word) { + return add10VW_g(z, x, y) +} + +func sub10VW(z, x []Word, y Word) (c Word) { + return sub10VW_g(z, x, y) +} + +func shl10VU(z, x []Word, s uint) (c Word) { + return shl10VU_g(z, x, s) +} + +func shr10VU(z, x []Word, s uint) (c Word) { + return shr10VU_g(z, x, s) +} + +func mulAdd10VWW(z, x []Word, y, r Word) (c Word) { + return mulAdd10VWW_g(z, x, y, r) +} + +func addMul10VVW(z, x []Word, y Word) (c Word) { + return addMul10VVW_g(z, x, y) +} + +func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) { + return div10WVW_g(z, xn, x, y) +} + +func div10W(n1, n0 Word) (q, r Word) { + return div10W_g(n1, n0) +} diff --git a/dec_arith_test.go b/dec_arith_test.go index 18638f1..ff7707c 100644 --- a/dec_arith_test.go +++ b/dec_arith_test.go @@ -1,41 +1,11 @@ package decimal import ( + "fmt" "math/bits" - "reflect" - "strconv" "testing" ) -func TestAdd10VW(t *testing.T) { - td := []struct { - i dec - x Word - o dec - c Word - s int64 - }{ - {dec{_DMax - 1, _DMax}, 2, dec{}, 1, 0}, - {dec{_DMax - 1, _DMax}, 1, dec{_DMax, _DMax}, 0, 0}, - {dec{_DMax - 1, _DMax - 1}, 2, dec{0, _DMax}, 0, 0}, - } - for i, d := range td { - t.Run(strconv.Itoa(i), func(t *testing.T) { - z := d.i - c := add10VW(z, z, d.x) - var s int64 - z = z.norm() - if len(z) > 0 { - s = dnorm(z) - } - if !reflect.DeepEqual(z, d.o) || s != d.s || c != d.c { - t.Fatalf("addW failed: expected z = %v, s = %d, c = %d, got d = %v, s = %v, c = %v", d.o, d.s, d.c, z, s, c) - } - - }) - } -} - func TestDecDigits(t *testing.T) { for i := 0; i < 10000; i++ { n := uint(rnd.Uint64()) @@ -67,10 +37,10 @@ func rnd10V(n int) []Word { return v } -func TestDivWDB(t *testing.T) { +func TestDiv10W(t *testing.T) { h, l := rnd10W(), Word(rnd.Uint64()) for i := 0; i < 1e7; i++ { - q, r := divWDB(h, l) + q, r := div10W(h, l) qq, rr := bits.Div(uint(h), uint(l), _DB) if q != Word(qq) || r != Word(rr) { t.Fatalf("Got (%d,%d)/_DB = %d, %d. Expected %d %d", h, l, q, r, qq, rr) @@ -80,7 +50,7 @@ func TestDivWDB(t *testing.T) { var benchH, benchL Word -func BenchmarkDivWDB_bits(b *testing.B) { +func BenchmarkDiv10W_bits(b *testing.B) { h, l := rnd10W(), Word(rnd.Uint64()) for i := 0; i < b.N; i++ { h, l := bits.Div(uint(h), uint(l), _DB) @@ -88,9 +58,443 @@ func BenchmarkDivWDB_bits(b *testing.B) { } } -func BenchmarkDivWDB_mul(b *testing.B) { +func BenchmarkDiv10W_mul(b *testing.B) { h, l := rnd10W(), Word(rnd.Uint64()) for i := 0; i < b.N; i++ { - benchH, benchL = divWDB(h, l) + benchH, benchL = div10W(h, l) + } +} + +/////////////////////////// + +type fun10VV func(z, x, y []Word) (c Word) +type arg10VV struct { + z, x, y dec + c Word +} + +var sum10VV = []arg10VV{ + {}, + {dec{0}, dec{0}, dec{0}, 0}, + {dec{1}, dec{1}, dec{0}, 0}, + {dec{0}, dec{_DMax}, dec{1}, 1}, + {dec{80235}, dec{12345}, dec{67890}, 0}, + {dec{_DMax - 1}, dec{_DMax}, dec{_DMax}, 1}, + {dec{0, 0, 0, 0}, dec{_DMax, _DMax, _DMax, _DMax}, dec{1, 0, 0, 0}, 1}, + {dec{0, 0, 0, _DMax}, dec{_DMax, _DMax, _DMax, _DMax - 1}, dec{1, 0, 0, 0}, 0}, + {dec{0, 0, 0, 0}, dec{_DMax, 0, _DMax, 0}, dec{1, _DMax, 0, _DMax}, 1}, +} + +func testFun10VV(t *testing.T, msg string, f fun10VV, a arg10VV) { + z := make(nat, len(a.z)) + c := f(z, a.x, a.y) + for i, zi := range z { + if zi != a.z[i] { + t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) + break + } + } + if c != a.c { + t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) } } + +func TestFun10VV(t *testing.T) { + for _, a := range sum10VV { + arg := a + testFun10VV(t, "add10VV_g", add10VV_g, arg) + testFun10VV(t, "add10VV", add10VV, arg) + + arg = arg10VV{a.z, a.y, a.x, a.c} + testFun10VV(t, "add10VV_g symmetric", add10VV_g, arg) + testFun10VV(t, "add10VV symmetric", add10VV, arg) + + arg = arg10VV{a.x, a.z, a.y, a.c} + testFun10VV(t, "sub10VV_g", sub10VV_g, arg) + testFun10VV(t, "sub10VV", sub10VV, arg) + + arg = arg10VV{a.y, a.z, a.x, a.c} + testFun10VV(t, "sub10VV_g symmetric", sub10VV_g, arg) + testFun10VV(t, "sub10VV symmetric", sub10VV, arg) + } +} + +func BenchmarkAdd10VV(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndV(n) + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + addVV(z, x, y) + } + }) + } +} + +// func BenchmarkSubVV(b *testing.B) { +// for _, n := range benchSizes { +// if isRaceBuilder && n > 1e3 { +// continue +// } +// x := rndV(n) +// y := rndV(n) +// z := make([]Word, n) +// b.Run(fmt.Sprint(n), func(b *testing.B) { +// b.SetBytes(int64(n * _W)) +// for i := 0; i < b.N; i++ { +// subVV(z, x, y) +// } +// }) +// } +// } + +// type funVW func(z, x []Word, y Word) (c Word) +// type argVW struct { +// z, x nat +// y Word +// c Word +// } + +// var sumVW = []argVW{ +// {}, +// {nil, nil, 2, 2}, +// {nat{0}, nat{0}, 0, 0}, +// {nat{1}, nat{0}, 1, 0}, +// {nat{1}, nat{1}, 0, 0}, +// {nat{0}, nat{_M}, 1, 1}, +// {nat{0, 0, 0, 0}, nat{_M, _M, _M, _M}, 1, 1}, +// {nat{585}, nat{314}, 271, 0}, +// } + +// var lshVW = []argVW{ +// {}, +// {nat{0}, nat{0}, 0, 0}, +// {nat{0}, nat{0}, 1, 0}, +// {nat{0}, nat{0}, 20, 0}, + +// {nat{_M}, nat{_M}, 0, 0}, +// {nat{_M << 1 & _M}, nat{_M}, 1, 1}, +// {nat{_M << 20 & _M}, nat{_M}, 20, _M >> (_W - 20)}, + +// {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, +// {nat{_M << 1 & _M, _M, _M}, nat{_M, _M, _M}, 1, 1}, +// {nat{_M << 20 & _M, _M, _M}, nat{_M, _M, _M}, 20, _M >> (_W - 20)}, +// } + +// var rshVW = []argVW{ +// {}, +// {nat{0}, nat{0}, 0, 0}, +// {nat{0}, nat{0}, 1, 0}, +// {nat{0}, nat{0}, 20, 0}, + +// {nat{_M}, nat{_M}, 0, 0}, +// {nat{_M >> 1}, nat{_M}, 1, _M << (_W - 1) & _M}, +// {nat{_M >> 20}, nat{_M}, 20, _M << (_W - 20) & _M}, + +// {nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0}, +// {nat{_M, _M, _M >> 1}, nat{_M, _M, _M}, 1, _M << (_W - 1) & _M}, +// {nat{_M, _M, _M >> 20}, nat{_M, _M, _M}, 20, _M << (_W - 20) & _M}, +// } + +// func testFunVW(t *testing.T, msg string, f funVW, a argVW) { +// z := make(nat, len(a.z)) +// c := f(z, a.x, a.y) +// for i, zi := range z { +// if zi != a.z[i] { +// t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) +// break +// } +// } +// if c != a.c { +// t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) +// } +// } + +// func makeFunVW(f func(z, x []Word, s uint) (c Word)) funVW { +// return func(z, x []Word, s Word) (c Word) { +// return f(z, x, uint(s)) +// } +// } + +// func TestFunVW(t *testing.T) { +// for _, a := range sumVW { +// arg := a +// testFunVW(t, "addVW_g", addVW_g, arg) +// testFunVW(t, "addVW", addVW, arg) + +// arg = argVW{a.x, a.z, a.y, a.c} +// testFunVW(t, "subVW_g", subVW_g, arg) +// testFunVW(t, "subVW", subVW, arg) +// } + +// shlVW_g := makeFunVW(shlVU_g) +// shlVW := makeFunVW(shlVU) +// for _, a := range lshVW { +// arg := a +// testFunVW(t, "shlVU_g", shlVW_g, arg) +// testFunVW(t, "shlVU", shlVW, arg) +// } + +// shrVW_g := makeFunVW(shrVU_g) +// shrVW := makeFunVW(shrVU) +// for _, a := range rshVW { +// arg := a +// testFunVW(t, "shrVU_g", shrVW_g, arg) +// testFunVW(t, "shrVU", shrVW, arg) +// } +// } + +// type argVU struct { +// d []Word // d is a Word slice, the input parameters x and z come from this array. +// l uint // l is the length of the input parameters x and z. +// xp uint // xp is the starting position of the input parameter x, x := d[xp:xp+l]. +// zp uint // zp is the starting position of the input parameter z, z := d[zp:zp+l]. +// s uint // s is the shift number. +// r []Word // r is the expected output result z. +// c Word // c is the expected return value. +// m string // message. +// } + +// var argshlVU = []argVU{ +// // test cases for shlVU +// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0}, 7, 0, 0, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "complete overlap of shlVU"}, +// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by half of shlVU"}, +// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by 1 Word of shlVU"}, +// {[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "no overlap of shlVU"}, +// } + +// var argshrVU = []argVU{ +// // test cases for shrVU +// {[]Word{0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 1, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "complete overlap of shrVU"}, +// {[]Word{0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 4, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by half of shrVU"}, +// {[]Word{0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 7, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by 1 Word of shrVU"}, +// {[]Word{0, 0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 8, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "no overlap of shrVU"}, +// } + +// func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) { +// // save a.d for error message, or it will be overwritten. +// b := make([]Word, len(a.d)) +// copy(b, a.d) +// z := a.d[a.zp : a.zp+a.l] +// x := a.d[a.xp : a.xp+a.l] +// c := f(z, x, a.s) +// for i, zi := range z { +// if zi != a.r[i] { +// t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) +// break +// } +// } +// if c != a.c { +// t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) +// } +// } + +// func TestShiftOverlap(t *testing.T) { +// for _, a := range argshlVU { +// arg := a +// testShiftFunc(t, shlVU, arg) +// } + +// for _, a := range argshrVU { +// arg := a +// testShiftFunc(t, shrVU, arg) +// } +// } + +// func BenchmarkAddVW(b *testing.B) { +// for _, n := range benchSizes { +// if isRaceBuilder && n > 1e3 { +// continue +// } +// x := rndV(n) +// y := rndW() +// z := make([]Word, n) +// b.Run(fmt.Sprint(n), func(b *testing.B) { +// b.SetBytes(int64(n * _S)) +// for i := 0; i < b.N; i++ { +// addVW(z, x, y) +// } +// }) +// } +// } + +// func BenchmarkSubVW(b *testing.B) { +// for _, n := range benchSizes { +// if isRaceBuilder && n > 1e3 { +// continue +// } +// x := rndV(n) +// y := rndW() +// z := make([]Word, n) +// b.Run(fmt.Sprint(n), func(b *testing.B) { +// b.SetBytes(int64(n * _S)) +// for i := 0; i < b.N; i++ { +// subVW(z, x, y) +// } +// }) +// } +// } + +// type funVWW func(z, x []Word, y, r Word) (c Word) +// type argVWW struct { +// z, x nat +// y, r Word +// c Word +// } + +// var prodVWW = []argVWW{ +// {}, +// {nat{0}, nat{0}, 0, 0, 0}, +// {nat{991}, nat{0}, 0, 991, 0}, +// {nat{0}, nat{_M}, 0, 0, 0}, +// {nat{991}, nat{_M}, 0, 991, 0}, +// {nat{0}, nat{0}, _M, 0, 0}, +// {nat{991}, nat{0}, _M, 991, 0}, +// {nat{1}, nat{1}, 1, 0, 0}, +// {nat{992}, nat{1}, 1, 991, 0}, +// {nat{22793}, nat{991}, 23, 0, 0}, +// {nat{22800}, nat{991}, 23, 7, 0}, +// {nat{0, 0, 0, 22793}, nat{0, 0, 0, 991}, 23, 0, 0}, +// {nat{7, 0, 0, 22793}, nat{0, 0, 0, 991}, 23, 7, 0}, +// {nat{0, 0, 0, 0}, nat{7893475, 7395495, 798547395, 68943}, 0, 0, 0}, +// {nat{991, 0, 0, 0}, nat{7893475, 7395495, 798547395, 68943}, 0, 991, 0}, +// {nat{0, 0, 0, 0}, nat{0, 0, 0, 0}, 894375984, 0, 0}, +// {nat{991, 0, 0, 0}, nat{0, 0, 0, 0}, 894375984, 991, 0}, +// {nat{_M << 1 & _M}, nat{_M}, 1 << 1, 0, _M >> (_W - 1)}, +// {nat{_M<<1&_M + 1}, nat{_M}, 1 << 1, 1, _M >> (_W - 1)}, +// {nat{_M << 7 & _M}, nat{_M}, 1 << 7, 0, _M >> (_W - 7)}, +// {nat{_M<<7&_M + 1<<6}, nat{_M}, 1 << 7, 1 << 6, _M >> (_W - 7)}, +// {nat{_M << 7 & _M, _M, _M, _M}, nat{_M, _M, _M, _M}, 1 << 7, 0, _M >> (_W - 7)}, +// {nat{_M<<7&_M + 1<<6, _M, _M, _M}, nat{_M, _M, _M, _M}, 1 << 7, 1 << 6, _M >> (_W - 7)}, +// } + +// func testFunVWW(t *testing.T, msg string, f funVWW, a argVWW) { +// z := make(nat, len(a.z)) +// c := f(z, a.x, a.y, a.r) +// for i, zi := range z { +// if zi != a.z[i] { +// t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) +// break +// } +// } +// if c != a.c { +// t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c) +// } +// } + +// // TODO(gri) mulAddVWW and divWVW are symmetric operations but +// // their signature is not symmetric. Try to unify. + +// type funWVW func(z []Word, xn Word, x []Word, y Word) (r Word) +// type argWVW struct { +// z nat +// xn Word +// x nat +// y Word +// r Word +// } + +// func testFunWVW(t *testing.T, msg string, f funWVW, a argWVW) { +// z := make(nat, len(a.z)) +// r := f(z, a.xn, a.x, a.y) +// for i, zi := range z { +// if zi != a.z[i] { +// t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i]) +// break +// } +// } +// if r != a.r { +// t.Errorf("%s%+v\n\tgot r = %#x; want %#x", msg, a, r, a.r) +// } +// } + +// func TestFunVWW(t *testing.T) { +// for _, a := range prodVWW { +// arg := a +// testFunVWW(t, "mulAddVWW_g", mulAddVWW_g, arg) +// testFunVWW(t, "mulAddVWW", mulAddVWW, arg) + +// if a.y != 0 && a.r < a.y { +// arg := argWVW{a.x, a.c, a.z, a.y, a.r} +// testFunWVW(t, "divWVW_g", divWVW_g, arg) +// testFunWVW(t, "divWVW", divWVW, arg) +// } +// } +// } + +// var mulWWTests = []struct { +// x, y Word +// q, r Word +// }{ +// {_M, _M, _M - 1, 1}, +// // 32 bit only: {0xc47dfa8c, 50911, 0x98a4, 0x998587f4}, +// } + +// func TestMulWW(t *testing.T) { +// for i, test := range mulWWTests { +// q, r := mulWW_g(test.x, test.y) +// if q != test.q || r != test.r { +// t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) +// } +// } +// } + +// var mulAddWWWTests = []struct { +// x, y, c Word +// q, r Word +// }{ +// // TODO(agl): These will only work on 64-bit platforms. +// // {15064310297182388543, 0xe7df04d2d35d5d80, 13537600649892366549, 13644450054494335067, 10832252001440893781}, +// // {15064310297182388543, 0xdab2f18048baa68d, 13644450054494335067, 12869334219691522700, 14233854684711418382}, +// {_M, _M, 0, _M - 1, 1}, +// {_M, _M, _M, _M, 0}, +// } + +// func TestMulAddWWW(t *testing.T) { +// for i, test := range mulAddWWWTests { +// q, r := mulAddWWW_g(test.x, test.y, test.c) +// if q != test.q || r != test.r { +// t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) +// } +// } +// } + +// func BenchmarkMulAddVWW(b *testing.B) { +// for _, n := range benchSizes { +// if isRaceBuilder && n > 1e3 { +// continue +// } +// z := make([]Word, n+1) +// x := rndV(n) +// y := rndW() +// r := rndW() +// b.Run(fmt.Sprint(n), func(b *testing.B) { +// b.SetBytes(int64(n * _W)) +// for i := 0; i < b.N; i++ { +// mulAddVWW(z, x, y, r) +// } +// }) +// } +// } + +// func BenchmarkAddMulVVW(b *testing.B) { +// for _, n := range benchSizes { +// if isRaceBuilder && n > 1e3 { +// continue +// } +// x := rndV(n) +// y := rndW() +// z := make([]Word, n) +// b.Run(fmt.Sprint(n), func(b *testing.B) { +// b.SetBytes(int64(n * _W)) +// for i := 0; i < b.N; i++ { +// addMulVVW(z, x, y) +// } +// }) +// } +// } diff --git a/decimal_test.go b/decimal_test.go index 6a32cde..bfd16ef 100644 --- a/decimal_test.go +++ b/decimal_test.go @@ -36,7 +36,7 @@ func TestDnorm(t *testing.T) { e := uint(rand.Intn(_DW + 1)) h, l := mulWW(Word(w), Word(pow10(e))) // convert h, l from base _B (2**64) to base _BD (10**19) or 2**32 -> 10**9 - h, l = divWDB(h, l) + h, l = div10W(h, l) d := dec{Word(l), Word(h)}.norm() if len(d) == 0 { if w == 0 { diff --git a/stdlib.go b/stdlib.go index 37907aa..724096d 100644 --- a/stdlib.go +++ b/stdlib.go @@ -7,14 +7,9 @@ import ( "fmt" "io" "math" - "math/bits" - "math/rand" "strconv" ) -// TODO(db47h): set this to false -const isRaceBuilder = true - const digits = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" // MaxBase is the largest number base accepted for string conversions. @@ -86,17 +81,6 @@ func makeAcc(above bool) Accuracy { return Below } -// A Word represents a single digit of a multi-precision unsigned integer. -type Word uint - -const ( - _S = _W / 8 // word size in bytes - - _W = bits.UintSize // word size in bits - // _B = 1 << _W // digit base - // _M = _B - 1 // digit mask -) - // byteReader is a local wrapper around fmt.ScanState; // it implements the ByteReader interface. type byteReader struct { @@ -122,34 +106,6 @@ func umax32(x, y uint32) uint32 { return y } -// q = (u1<<_W + u0 - r)/v -func divWW(u1, u0, v Word) (q, r Word) { - qq, rr := bits.Div(uint(u1), uint(u0), uint(v)) - return Word(qq), Word(rr) -} - -func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { - r = xn - for i := len(z) - 1; i >= 0; i-- { - z[i], r = divWW(r, x[i], y) - } - return r -} - -// z1<<_W + z0 = x*y + c -func mulAddWWW(x, y, c Word) (z1, z0 Word) { - hi, lo := bits.Mul(uint(x), uint(y)) - var cc uint - lo, cc = bits.Add(lo, uint(c), 0) - return Word(hi + cc), Word(lo) -} - -// z1<<_W + z0 = x*y -func mulWW(x, y Word) (z1, z0 Word) { - hi, lo := bits.Mul(uint(x), uint(y)) - return Word(hi), Word(lo) -} - func same(x, y []Word) bool { return len(x) == len(y) && len(x) > 0 && &x[0] == &y[0] } @@ -283,10 +239,4 @@ func (err ErrNaN) Error() string { return err.msg } -var rnd = rand.New(rand.NewSource(0)) - -// nlz returns the number of leading zeros in x. -// Wraps bits.LeadingZeros call for convenience. -func nlz(x Word) uint { - return uint(bits.LeadingZeros(uint(x))) -} +type nat []Word