From 5cfed50607d16d4adb0b58cc9e8f8ecc4acedfaf Mon Sep 17 00:00:00 2001
From: Denis Bernard <db047h@gmail.com>
Date: Sun, 10 May 2020 03:21:31 +0200
Subject: [PATCH] Implement division by constant _DB using multiplication

Doubles the performance over bits.Div(n1, n0, _DB) on x86 and x86_64.
While it theoretically benefits all base-_DB arithmetic primitives,
it prevents them from being inlined, leading to diminishing returns
until we start manipulating numbers > 100 Words.
---
 dec.go            | 221 +++++++++++++++++++++++++++++++++++++++-------
 dec_arith.go      | 101 +++++++++++++++++----
 dec_arith_test.go |  39 ++++++--
 dec_conv.go       |   4 +-
 dec_test.go       |  27 +++---
 decimal.go        |  20 ++---
 decimal_conv.go   |  10 +--
 decimal_test.go   |  23 +++--
 8 files changed, 347 insertions(+), 98 deletions(-)

diff --git a/dec.go b/dec.go
index da468de..282a5fa 100644
--- a/dec.go
+++ b/dec.go
@@ -8,18 +8,6 @@ import (
 
 const debugDecimal = true
 
-const (
-	// _W * log10(2) = decimal digits per word. 9 decimal digits per 32 bits
-	// word and 19 per 64 bits word.
-	_WD = _W * 30103 / 100000
-	// Decimal base for a word. 1e9 for 32 bits words and 1e19 for 64 bits
-	// words.
-	// We want this value to be a const. This is a dirty hack to avoid
-	// conditional compilation; it will break if bits.UintSize != 32 or 64
-	_BD = 9999999998000000000*(_WD/19) + 1000000000*(_WD/9)
-	_MD = _BD - 1
-)
-
 // dec is an unsigned integer x of the form
 //
 //   x = x[n-1]*_BD^(n-1) + x[n-2]*_BD^(n-2) + ... + x[1]*_BD + x[0]
@@ -56,7 +44,7 @@ func (z dec) norm() dec {
 // digits returns the number of digits of x.
 func (x dec) digits() uint {
 	if i := len(x) - 1; i >= 0 {
-		return uint(i*_WD) + decDigits(uint(x[i]))
+		return uint(i*_DW) + decDigits(uint(x[i]))
 	}
 	return 0
 }
@@ -64,14 +52,14 @@ func (x dec) digits() uint {
 func (x dec) ntz() uint {
 	for i, w := range x {
 		if w != 0 {
-			return uint(i)*_WD + decTrailingZeros(uint(w))
+			return uint(i)*_DW + decTrailingZeros(uint(w))
 		}
 	}
 	return 0
 }
 
 func (x dec) digit(i uint) uint {
-	j, i := bits.Div(0, i, _WD)
+	j, i := bits.Div(0, i, _DW)
 	if j >= uint(len(x)) {
 		return 0
 	}
@@ -110,13 +98,13 @@ func (z dec) setWord(x Word) dec {
 
 func (z dec) setUint64(x uint64) (dec, int32) {
 	dig := int32(decDigits64(x))
-	if w := Word(x); uint64(w) == x && w < _BD {
+	if w := Word(x); uint64(w) == x && w < _DB {
 		return z.setWord(w), dig
 	}
 	// x could be a 2 to 3 words value
-	z = z.make(int(dig+_WD-1) / _WD)
+	z = z.make(int(dig+_DW-1) / _DW)
 	for i := 0; i < len(z); i++ {
-		hi, lo := bits.Div64(0, x, _BD)
+		hi, lo := bits.Div64(0, x, _DB)
 		z[i] = Word(lo)
 		x = hi
 	}
@@ -138,7 +126,7 @@ func (x dec) toNat(z []Word) []Word {
 		// r = zz & _B; zz = zz >> _W
 		var r Word
 		for j := len(zz) - 1; j >= 0; j-- {
-			zz[j], r = mulAddWWW(r, _BD, zz[j])
+			zz[j], r = mulAddWWW(r, _DB, zz[j])
 		}
 		zz = zz.norm()
 		z[i] = r
@@ -156,7 +144,7 @@ func (z dec) setInt(x *big.Int) dec {
 		b[i] = Word(bb[i])
 	}
 	for i := 0; i < len(z); i++ {
-		z[i] = divWVW(b, 0, b, _BD)
+		z[i] = divWVW(b, 0, b, _DB)
 	}
 	z = z.norm()
 	return z
@@ -165,7 +153,7 @@ func (z dec) setInt(x *big.Int) dec {
 // sticky returns 1 if there's a non zero digit within the
 // i least significant digits, otherwise it returns 0.
 func (x dec) sticky(i uint) uint {
-	j, i := bits.Div(0, i, _WD)
+	j, i := bits.Div(0, i, _DW)
 	if j >= uint(len(x)) {
 		if len(x) == 0 {
 			return 0
@@ -327,6 +315,8 @@ func putDec(x *dec) {
 
 var decPool sync.Pool
 
+const divRecursiveThreshold = 100
+
 // q = (uIn-r)/vIn, with 0 <= r < vIn
 // Uses z as storage for q, and u as storage for r if possible.
 // See Knuth, Volume 2, section 4.3.1, Algorithm D.
@@ -339,7 +329,7 @@ func (z dec) divLarge(u, uIn, vIn dec) (q, r dec) {
 	m := len(uIn)
 
 	// D1.
-	d := _BD / (vIn[n-1] + 1)
+	d := _DB / (vIn[n-1] + 1)
 	// do not modify vIn, it may be used by another goroutine simultaneously
 	vp := getDec(n)
 	v := *vp
@@ -356,11 +346,11 @@ func (z dec) divLarge(u, uIn, vIn dec) (q, r dec) {
 	q = z.make(m - n + 1)
 
 	// TODO(db47h): implement divRecursive
-	// if n < divRecursiveThreshold {
-	q.divBasic(u, v)
-	// } else {
-	// 	q.divRecursive(u, v)
-	// }
+	if n < divRecursiveThreshold {
+		q.divBasic(u, v)
+	} else {
+		q.divRecursive(u, v)
+	}
 	putDec(vp)
 
 	q = q.norm()
@@ -385,7 +375,7 @@ func (q dec) divBasic(u, v dec) {
 	vn1 := v[n-1]
 	for j := m; j >= 0; j-- {
 		// D3.
-		qhat := Word(_MD)
+		qhat := Word(_DMax)
 		var ujn Word
 		if j+n < len(u) {
 			ujn = u[j+n]
@@ -475,9 +465,9 @@ func (z dec) shl(x dec, s uint) dec {
 	}
 	// m > 0
 
-	n := m + int(s/_WD)
+	n := m + int(s/_DW)
 	z = z.make(n + 1)
-	z[n] = shl10VU(z[n-m:n], x, s%_WD)
+	z[n] = shl10VU(z[n-m:n], x, s%_DW)
 	z[0 : n-m].clear()
 
 	return z.norm()
@@ -495,14 +485,14 @@ func (z dec) shr(x dec, s uint) dec {
 	}
 
 	m := len(x)
-	n := m - int(s/_WD)
+	n := m - int(s/_DW)
 	if n <= 0 {
 		return z[:0]
 	}
 	// n > 0
 
 	z = z.make(n)
-	shr10VU(z, x[m-n:], s%_WD)
+	shr10VU(z, x[m-n:], s%_DW)
 
 	return z.norm()
 }
@@ -780,3 +770,170 @@ func (z dec) expNN(x, y, m dec) dec {
 
 	return z.norm()
 }
+
+// divRecursive performs word-by-word division of u by v.
+// The quotient is written in pre-allocated z.
+// The remainder overwrites input u.
+//
+// Precondition:
+// - len(z) >= len(u)-len(v)
+//
+// See Burnikel, Ziegler, "Fast Recursive Division", Algorithm 1 and 2.
+func (z dec) divRecursive(u, v dec) {
+	// Recursion depth is less than 2 log2(len(v))
+	// Allocate a slice of temporaries to be reused across recursion.
+	recDepth := 2 * bits.Len(uint(len(v)))
+	// large enough to perform Karatsuba on operands as large as v
+	tmp := getDec(3 * len(v))
+	temps := make([]*dec, recDepth)
+	z.clear()
+	z.divRecursiveStep(u, v, 0, tmp, temps)
+	for _, n := range temps {
+		if n != nil {
+			putDec(n)
+		}
+	}
+	putDec(tmp)
+}
+
+// divRecursiveStep computes the division of u by v.
+// - z must be large enough to hold the quotient
+// - the quotient will overwrite z
+// - the remainder will overwrite u
+func (z dec) divRecursiveStep(u, v dec, depth int, tmp *dec, temps []*dec) {
+	u = u.norm()
+	v = v.norm()
+
+	if len(u) == 0 {
+		z.clear()
+		return
+	}
+	n := len(v)
+	if n < divRecursiveThreshold {
+		z.divBasic(u, v)
+		return
+	}
+	m := len(u) - n
+	if m < 0 {
+		return
+	}
+
+	// Produce the quotient by blocks of B words.
+	// Division by v (length n) is done using a length n/2 division
+	// and a length n/2 multiplication for each block. The final
+	// complexity is driven by multiplication complexity.
+	B := n / 2
+
+	// Allocate a nat for qhat below.
+	if temps[depth] == nil {
+		temps[depth] = getDec(n)
+	} else {
+		*temps[depth] = temps[depth].make(B + 1)
+	}
+
+	j := m
+	for j > B {
+		// Divide u[j-B:j+n] by vIn. Keep remainder in u
+		// for next block.
+		//
+		// The following property will be used (Lemma 2):
+		// if u = u1 << s + u0
+		//    v = v1 << s + v0
+		// then floor(u1/v1) >= floor(u/v)
+		//
+		// Moreover, the difference is at most 2 if len(v1) >= len(u/v)
+		// We choose s = B-1 since len(v)-B >= B+1 >= len(u/v)
+		s := (B - 1)
+		// Except for the first step, the top bits are always
+		// a division remainder, so the quotient length is <= n.
+		uu := u[j-B:]
+
+		qhat := *temps[depth]
+		qhat.clear()
+		qhat.divRecursiveStep(uu[s:B+n], v[s:], depth+1, tmp, temps)
+		qhat = qhat.norm()
+		// Adjust the quotient:
+		//    u = u_h << s + u_l
+		//    v = v_h << s + v_l
+		//  u_h = q̂ v_h + rh
+		//    u = q̂ (v - v_l) + rh << s + u_l
+		// After the above step, u contains a remainder:
+		//    u = rh << s + u_l
+		// and we need to subtract q̂ v_l
+		//
+		// But it may be a bit too large, in which case q̂ needs to be smaller.
+		qhatv := tmp.make(3 * n)
+		qhatv.clear()
+		qhatv = qhatv.mul(qhat, v[:s])
+		for i := 0; i < 2; i++ {
+			e := qhatv.cmp(uu.norm())
+			if e <= 0 {
+				break
+			}
+			sub10VW(qhat, qhat, 1)
+			c := sub10VV(qhatv[:s], qhatv[:s], v[:s])
+			if len(qhatv) > s {
+				sub10VW(qhatv[s:], qhatv[s:], c)
+			}
+			decAddAt(uu[s:], v[s:], 0)
+		}
+		if qhatv.cmp(uu.norm()) > 0 {
+			panic("impossible")
+		}
+		c := sub10VV(uu[:len(qhatv)], uu[:len(qhatv)], qhatv)
+		if c > 0 {
+			sub10VW(uu[len(qhatv):], uu[len(qhatv):], c)
+		}
+		decAddAt(z, qhat, j-B)
+		j -= B
+	}
+
+	// Now u < (v<<B), compute lower bits in the same way.
+	// Choose shift = B-1 again.
+	s := B
+	qhat := *temps[depth]
+	qhat.clear()
+	qhat.divRecursiveStep(u[s:].norm(), v[s:], depth+1, tmp, temps)
+	qhat = qhat.norm()
+	qhatv := tmp.make(3 * n)
+	qhatv.clear()
+	qhatv = qhatv.mul(qhat, v[:s])
+	// Set the correct remainder as before.
+	for i := 0; i < 2; i++ {
+		if e := qhatv.cmp(u.norm()); e > 0 {
+			sub10VW(qhat, qhat, 1)
+			c := sub10VV(qhatv[:s], qhatv[:s], v[:s])
+			if len(qhatv) > s {
+				sub10VW(qhatv[s:], qhatv[s:], c)
+			}
+			decAddAt(u[s:], v[s:], 0)
+		}
+	}
+	if qhatv.cmp(u.norm()) > 0 {
+		panic("impossible")
+	}
+	c := sub10VV(u[0:len(qhatv)], u[0:len(qhatv)], qhatv)
+	if c > 0 {
+		c = sub10VW(u[len(qhatv):], u[len(qhatv):], c)
+	}
+	if c > 0 {
+		panic("impossible")
+	}
+
+	// Done!
+	decAddAt(z, qhat.norm(), 0)
+}
+
+// addAt implements z += x*10**(_WD*i); z must be long enough.
+// (we don't use dec.add because we need z to stay the same
+// slice, and we don't need to normalize z after each addition)
+func decAddAt(z, x dec, i int) {
+	if n := len(x); n > 0 {
+		if c := add10VV(z[i:i+n], z[i:], x); c != 0 {
+			j := i + n
+			if j < len(z) {
+				add10VW(z[j:], z[j:], c)
+			}
+		}
+	}
+}
diff --git a/dec_arith.go b/dec_arith.go
index 34b479a..5f9e1ed 100644
--- a/dec_arith.go
+++ b/dec_arith.go
@@ -4,6 +4,21 @@ import (
 	"math/bits"
 )
 
+const (
+	// _W * log10(2) = decimal digits per word. 9 decimal digits per 32 bits
+	// word and 19 per 64 bits word.
+	_DW = _W * 30103 / 100000
+	// Decimal base for a word. 1e9 for 32 bits words and 1e19 for 64 bits
+	// words.
+	// We want this value to be a const. This is a dirty hack to avoid
+	// conditional compilation; it will break if bits.UintSize != 32 or 64
+	_DB = 9999999998000000000*(_DW/19) + 1000000000*(_DW/9)
+	// Maximum value of a decimal Word
+	_DMax = _DB - 1
+	// Bits per decimal Word: Log2(_DB)+1 = _DW * Log2(10) + 1
+	_DWb = _DW*100000/30103 + 1
+)
+
 var pow10s = [...]uint64{
 	1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000,
 	10000000000, 100000000000, 1000000000000, 10000000000000, 100000000000000, 1000000000000000,
@@ -47,9 +62,9 @@ func decDigits32(x uint) (n uint) {
 
 func nlz10(x Word) uint {
 	if bits.UintSize == 32 {
-		return _WD - decDigits32(uint(x))
+		return _DW - decDigits32(uint(x))
 	}
-	return _WD - decDigits64(uint64(x))
+	return _DW - decDigits64(uint64(x))
 }
 
 // shl10VU sets z to x*(10**s), s < _WD
@@ -61,7 +76,7 @@ func shl10VU(z, x dec, s uint) (r Word) {
 	if len(z) == 0 || len(x) == 0 {
 		return
 	}
-	d, m := Word(pow10(_WD-s)), Word(pow10(s))
+	d, m := Word(pow10(_DW-s)), Word(pow10(s))
 	var h, l Word
 	r, l = divWW(0, x[len(x)-1], d)
 	for i := len(z) - 1; i > 0; i-- {
@@ -85,7 +100,7 @@ func shr10VU(z, x dec, s uint) (r Word) {
 	}
 
 	var h, l Word
-	d, m := Word(pow10(s)), Word(pow10(_WD-s))
+	d, m := Word(pow10(s)), Word(pow10(_DW-s))
 	h, r = divWW(0, x[0], Word(d))
 	for i := 1; i < len(z) && i < len(x); i++ {
 		t := h
@@ -163,7 +178,7 @@ func add10VW(z, x dec, y Word) (c Word) {
 	// propagate carry
 	for i := 1; i < len(z) && i < len(x); i++ {
 		s := x[i] + c
-		if s < _BD {
+		if s < _DB {
 			z[i] = s
 			// copy remaining digits
 			copy(z[i+1:], x[i+1:])
@@ -182,14 +197,67 @@ func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) {
 	return
 }
 
+// divWDB returns the quotient and remainder of a double-Word n divided by _DB:
+//
+// q = n/_DB, r = n%_DB
+//
+// with the dividend bits' upper half in parameter n1 and the lower half in
+// parameter n0. divDecBase panics if n1 > _DMax (quotient overflow).
+//
+// This function uses the algorithm from "Division by invariant integers using
+// multiplication" by Torbjörn Granlund & Peter L. Montgomery.
+//
+// See https://doi.org/10.1145/773473.178249, section 8, Dividing udword by uword.
+//
+// On 386 and amd64, this is over 2 times faster than calling bits.Div(n1, n0, _BD).
+//
+// In the article, some equations show an addition or subtraction of 2**N, which
+// is a no-op. In the comments below, these have been removed for the sake of
+// clarity.
+//
+func divWDB(n1, n0 Word) (q, r Word) {
+	const (
+		N     = _W
+		d     = _DB
+		l     = _DWb
+		mP    = (1<<(N+l)-1)/d - 1<<N // m'
+		dNorm = d << (N - l)
+	)
+	if debugDecimal && n1 > _DMax {
+		panic("decimal: integer overflow")
+	}
+
+	// if N == 64, N == l => n2 == n1 && n10 == n0
+	// go vet complains, but this should be optimized out.
+	n2 := n1<<(N-l) + n0>>l
+	n10 := n0 << (N - l)
+	// -n1 = (n10 < 0 ? -1 : 0)
+	_n1 := Word(int(n10) >> (N - 1))
+	nAdj := n10 + (_n1 & dNorm)
+
+	// q1 = n2 + HIGH(mP * (n2-_n1) + nAdj)
+	q1, _ := mulAddWWW(mP, n2-_n1, nAdj)
+	q1 += n2
+	// dr = 2**N*n1 + n0 - 2**N*d + (-1-q1)*d
+	//    = (-1-q1) * d + n0 +           (1)
+	//      2**N * (n1 - d)              (2)
+	// let t = -1 - q1 = (^q1 + 1) - 1 = ^q1
+	t := ^q1
+	drHi, drLo := mulAddWWW(t, d, n0) // (1)
+	drHi += n1 - d                    // (2)
+	// q = drHi - (-1-q1)
+	// r = drLow + (d & drHi)
+	return drHi - t, drLo + d&drHi
+}
+
 // q = (u1<<_W + u0 - r)/v
 func div10WW(u1, u0, v Word) (q, r Word) {
 	// q < _BD if u1 < v < _BD
-	if debugDecimal && !(u1 < v && v < _BD) {
-		panic("decimal overflow")
+	if debugDecimal && !(u1 < v && v < _DB) {
+		panic("decimal: integer overflow")
 	}
 	// convert to base 2
-	hi, lo := mulAddWWW(u1, _BD, u0)
+	hi, lo := mulAddWWW(u1, _DB, u0)
 	// q = (u-r)/v. Since v < _BD => r < _BD
 	return divWW(hi, lo, v)
 }
@@ -208,22 +276,20 @@ func mulAdd10WWW(x, y, c Word) (z1, z0 Word) {
 	hi, lo := bits.Mul(uint(x), uint(y))
 	var cc uint
 	lo, cc = bits.Add(lo, uint(c), 0)
-	hi, lo = bits.Div(hi+cc, lo, _BD)
-	return Word(hi), Word(lo)
+	return divWDB(Word(hi+cc), Word(lo))
 }
 
 // z1<<_W + z0 = x*y
 func mul10WW(x, y Word) (z1, z0 Word) {
 	hi, lo := bits.Mul(uint(x), uint(y))
-	hi, lo = bits.Div(hi, lo, _BD)
-	return Word(hi), Word(lo)
+	return divWDB(Word(hi), Word(lo))
 }
 
 func add10WWW(x, y, cIn Word) (s, c Word) {
 	r, cc := bits.Add(uint(x), uint(y), uint(cIn))
-	if cc != 0 || r >= _BD {
+	if cc != 0 || r >= _DB {
 		cc = 1
-		r -= _BD
+		r -= _DB
 	}
 	return Word(r), Word(cc)
 }
@@ -233,8 +299,7 @@ func addMul10VVW(z, x []Word, y Word) (c Word) {
 		// do x[i] * y + c in base 2 => (hi+cc) * 2**_W + lo
 		hi, z0 := mulAddWWW(x[i], y, z[i])
 		lo, cc := bits.Add(uint(z0), uint(c), 0)
-		// convert to base _BD
-		c, z[i] = divWW(hi+Word(cc), Word(lo), _BD)
+		c, z[i] = divWDB(hi+Word(cc), Word(lo))
 	}
 	return
 }
@@ -250,7 +315,7 @@ func add10VV(z, x, y []Word) (c Word) {
 func sub10WWW(x, y, b Word) (d, c Word) {
 	dd, cc := bits.Sub(uint(x), uint(y), uint(b))
 	if cc != 0 {
-		dd += _BD
+		dd += _DB
 	}
 	return Word(dd), Word(cc)
 }
@@ -273,7 +338,7 @@ func sub10VW(z, x []Word, y Word) (c Word) {
 			copy(z[i+1:], x[i+1:])
 			return
 		}
-		z[i] = Word(zi + _BD)
+		z[i] = Word(zi + _DB)
 	}
 	return
 }
diff --git a/dec_arith_test.go b/dec_arith_test.go
index 13f764b..18638f1 100644
--- a/dec_arith_test.go
+++ b/dec_arith_test.go
@@ -1,6 +1,7 @@
 package decimal
 
 import (
+	"math/bits"
 	"reflect"
 	"strconv"
 	"testing"
@@ -14,9 +15,9 @@ func TestAdd10VW(t *testing.T) {
 		c Word
 		s int64
 	}{
-		{dec{_BD - 2, _BD - 1}, 2, dec{}, 1, 0},
-		{dec{_BD - 2, _BD - 1}, 1, dec{_BD - 1, _BD - 1}, 0, 0},
-		{dec{_BD - 2, _BD - 2}, 2, dec{0, _BD - 1}, 0, 0},
+		{dec{_DMax - 1, _DMax}, 2, dec{}, 1, 0},
+		{dec{_DMax - 1, _DMax}, 1, dec{_DMax, _DMax}, 0, 0},
+		{dec{_DMax - 1, _DMax - 1}, 2, dec{0, _DMax}, 0, 0},
 	}
 	for i, d := range td {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {
@@ -50,12 +51,12 @@ func TestDecDigits(t *testing.T) {
 
 func BenchmarkDecDigits(b *testing.B) {
 	for i := 0; i < b.N; i++ {
-		benchU = decDigits(uint(rnd.Uint64()) % _BD)
+		benchH = Word(decDigits(uint(rnd.Uint64()) % _DB))
 	}
 }
 
 func rnd10W() Word {
-	return Word(rnd.Uint64() % _BD)
+	return Word(rnd.Uint64() % _DB)
 }
 
 func rnd10V(n int) []Word {
@@ -65,3 +66,31 @@ func rnd10V(n int) []Word {
 	}
 	return v
 }
+
+func TestDivWDB(t *testing.T) {
+	h, l := rnd10W(), Word(rnd.Uint64())
+	for i := 0; i < 1e7; i++ {
+		q, r := divWDB(h, l)
+		qq, rr := bits.Div(uint(h), uint(l), _DB)
+		if q != Word(qq) || r != Word(rr) {
+			t.Fatalf("Got (%d,%d)/_DB = %d, %d. Expected %d %d", h, l, q, r, qq, rr)
+		}
+	}
+}
+
+var benchH, benchL Word
+
+func BenchmarkDivWDB_bits(b *testing.B) {
+	h, l := rnd10W(), Word(rnd.Uint64())
+	for i := 0; i < b.N; i++ {
+		h, l := bits.Div(uint(h), uint(l), _DB)
+		benchH, benchL = Word(h), Word(l)
+	}
+}
+
+func BenchmarkDivWDB_mul(b *testing.B) {
+	h, l := rnd10W(), Word(rnd.Uint64())
+	for i := 0; i < b.N; i++ {
+		benchH, benchL = divWDB(h, l)
+	}
+}
diff --git a/dec_conv.go b/dec_conv.go
index 9201723..3b821ba 100644
--- a/dec_conv.go
+++ b/dec_conv.go
@@ -111,7 +111,7 @@ func (z dec) scan(r io.ByteScanner, base int, fracOk bool) (res dec, b, count in
 
 			// if di is "full", add it to the result
 			if i == n {
-				if bn == _BD {
+				if bn == _DB {
 					// shift left by _WD digits
 					t := z.make(len(z) + 1)
 					copy(t[1:], z)
@@ -233,7 +233,7 @@ func (q dec) convertWords(s []byte, b Word, ndigits int, bb Word, table []decDiv
 		for len(q) > 0 {
 			// extract least significant, base bb "digit"
 			r = q[0]
-			q = q.shr(q, _WD)
+			q = q.shr(q, _DW)
 			for j := 0; j < ndigits && i > 0; j++ {
 				i--
 				// avoid % computation since r%10 == r - int(r/10)*10;
diff --git a/dec_test.go b/dec_test.go
index ef1d62b..931da38 100644
--- a/dec_test.go
+++ b/dec_test.go
@@ -23,10 +23,10 @@ var decCmpTests = []struct {
 	{dec{0}, dec{1}, -1},
 	{dec{1}, dec{0}, 1},
 	{dec{1}, dec{1}, 0},
-	{dec{0, _MD}, dec{1}, 1},
-	{dec{1}, dec{0, _MD}, -1},
-	{dec{1, _MD}, dec{0, _MD}, 1},
-	{dec{0, _MD}, dec{1, _MD}, -1},
+	{dec{0, _DMax}, dec{1}, 1},
+	{dec{1}, dec{0, _DMax}, -1},
+	{dec{1, _DMax}, dec{0, _DMax}, 1},
+	{dec{0, _DMax}, dec{1, _DMax}, -1},
 	{dec{16, 571956, 8794, 68}, dec{837, 9146, 1, 754489}, -1},
 	{dec{34986, 41, 105, 1957}, dec{56, 7458, 104, 1957}, 1},
 }
@@ -51,7 +51,7 @@ var decSumNN = []decArgNN{
 	{dec{1111111110}, dec{123456789}, dec{987654321}},
 	{dec{0, 0, 0, 1}, nil, dec{0, 0, 0, 1}},
 	{dec{0, 0, 0, 1111111110}, dec{0, 0, 0, 123456789}, dec{0, 0, 0, 987654321}},
-	{dec{0, 0, 0, 1}, dec{0, 0, _MD}, dec{0, 0, 1}},
+	{dec{0, 0, 0, 1}, dec{0, 0, _DMax}, dec{0, 0, 1}},
 }
 
 var decProdNN = []decArgNN{
@@ -242,8 +242,8 @@ func BenchmarkDecMul(b *testing.B) {
 }
 
 func TestDecNLZ10(t *testing.T) {
-	var x Word = _MD
-	for i := 0; i <= _WD; i++ {
+	var x Word = _DMax
+	for i := 0; i <= _DW; i++ {
 		if int(nlz10(x)) != i {
 			t.Errorf("failed at %x: got %d want %d", x, nlz10(x), i)
 		}
@@ -262,8 +262,8 @@ var decLeftShiftTests = []decShiftTest{
 	{nil, 1, nil},
 	{decOne, 0, decOne},
 	{decOne, 1, decTen},
-	{dec{_BD / 10}, 1, dec{0}},
-	{dec{_BD / 10, 0}, 1, dec{0, 1}},
+	{dec{_DB / 10}, 1, dec{0}},
+	{dec{_DB / 10, 0}, 1, dec{0, 1}},
 }
 
 func TestDecShiftLeft(t *testing.T) {
@@ -285,8 +285,8 @@ var decRightShiftTests = []decShiftTest{
 	{decOne, 0, decOne},
 	{decOne, 1, nil},
 	{decTen, 1, decOne},
-	{dec{0, 1}, 1, dec{_BD / 10}},
-	{dec{10, 1, 1}, 1, dec{_BD/10 + 1, _BD / 10}},
+	{dec{0, 1}, 1, dec{_DB / 10}},
+	{dec{10, 1, 1}, 1, dec{_DB/10 + 1, _DB / 10}},
 }
 
 func TestDecShiftRight(t *testing.T) {
@@ -813,8 +813,7 @@ func TestDecDiv(t *testing.T) {
 // TODO(bd47h): move this to decimal_test
 func benchmarkDiv(b *testing.B, aSize, bSize int) {
 	aa := rndDec1(aSize)
-	// bb := rndDec1(bSize)
-	bb := dec(nil).setWord(Word(rnd.Intn(_W-1)) + 1)
+	bb := rndDec1(bSize)
 	if aa.cmp(bb) < 0 {
 		aa, bb = bb, aa
 	}
@@ -830,7 +829,7 @@ func benchmarkDiv(b *testing.B, aSize, bSize int) {
 func BenchmarkDiv(b *testing.B) {
 	sizes := []int{
 		10, 20, 50, 100, 200, 500, 1000,
-		// 1e4, 1e5, 1e6, 1e7,
+		1e4, 1e5, 1e6, 1e7,
 	}
 	for _, i := range sizes {
 		j := 2 * i
diff --git a/decimal.go b/decimal.go
index 912446d..0e7d4eb 100644
--- a/decimal.go
+++ b/decimal.go
@@ -120,7 +120,7 @@ func (x *Decimal) MinPrec() uint {
 	if x.form != finite {
 		return 0
 	}
-	return uint(len(x.mant))*_WD - x.mant.ntz()
+	return uint(len(x.mant))*_DW - x.mant.ntz()
 }
 
 // Mode returns the rounding mode of x.
@@ -258,9 +258,9 @@ func (z *Decimal) SetInt(x *big.Int) *Decimal {
 		return z
 	}
 	// x != 0
-	z.mant = z.mant.make((int(prec) + _WD - 1) / _WD).setInt(x)
+	z.mant = z.mant.make((int(prec) + _DW - 1) / _DW).setInt(x)
 	exp := dnorm(z.mant)
-	z.setExpAndRound(int64(len(z.mant))*_WD-exp, 0)
+	z.setExpAndRound(int64(len(z.mant))*_DW-exp, 0)
 	return z
 }
 
@@ -432,8 +432,8 @@ func (x *Decimal) validate() {
 	if m == 0 {
 		panic("nonzero finite number with empty mantissa")
 	}
-	if msw := x.mant[m-1]; !(_BD/10 <= msw && msw < _BD) {
-		panic(fmt.Sprintf("last word of %s is not within [%d %d)", x.Text('e', 0), uint(_BD/10), uint(_BD)))
+	if msw := x.mant[m-1]; !(_DB/10 <= msw && msw < _DB) {
+		panic(fmt.Sprintf("last word of %s is not within [%d %d)", x.Text('e', 0), uint(_DB/10), uint(_DB)))
 	}
 	if x.prec == 0 {
 		panic("zero precision finite number")
@@ -473,7 +473,7 @@ func (z *Decimal) round(sbit uint) {
 	// z.form == finite && len(z.mant) > 0
 	// m > 0 implies z.prec > 0 (checked by validate)
 	m := uint32(len(z.mant)) // present mantissa length in words
-	digits := m * _WD
+	digits := m * _DW
 	if digits <= z.prec {
 		// mantissa fits => nothing to do
 		return
@@ -490,14 +490,14 @@ func (z *Decimal) round(sbit uint) {
 	}
 	sbit &= 1 // be safe and ensure it's a single bit	// cut off extra words
 
-	n := (z.prec + (_WD - 1)) / _WD // mantissa length in words for desired precision
+	n := (z.prec + (_DW - 1)) / _DW // mantissa length in words for desired precision
 	if m > n {
 		copy(z.mant, z.mant[m-n:]) // move n last words to front
 		z.mant = z.mant[:n]
 	}
 
 	// determine number of trailing zero digits (ntz) and compute lsd of mantissa's least-significant word
-	ntz := uint(n*_WD - z.prec) // 0 <= ntz < _W
+	ntz := uint(n*_DW - z.prec) // 0 <= ntz < _W
 	lsd := pow10(ntz)
 
 	// round if result is inexact
@@ -535,7 +535,7 @@ func (z *Decimal) round(sbit uint) {
 				z.exp++
 				// mantissa overflow means that the mantissa before increment
 				// was all nines. In that case, the result is 1**(z.exp+1)
-				z.mant[n-1] = _BD / 10
+				z.mant[n-1] = _DB / 10
 			}
 		}
 	}
@@ -555,7 +555,7 @@ func dnorm(m dec) int64 {
 	if debugDecimal && (len(m) == 0 || m[len(m)-1] == 0) {
 		panic("msw of mantissa is 0")
 	}
-	s := _WD - decDigits(uint(m[len(m)-1]))
+	s := _DW - decDigits(uint(m[len(m)-1]))
 	// partial shift
 	if s > 0 {
 		c := shl10VU(m, m, s)
diff --git a/decimal_conv.go b/decimal_conv.go
index 40355c6..bc598a9 100644
--- a/decimal_conv.go
+++ b/decimal_conv.go
@@ -27,7 +27,7 @@ func (z *Decimal) SetString(s string) (*Decimal, bool) {
 func (z *Decimal) scan(r io.ByteScanner, base int) (f *Decimal, b int, err error) {
 	prec := z.prec
 	if prec == 0 {
-		prec = _WD
+		prec = _DW
 	}
 
 	// A reasonable value in case of an error.
@@ -77,7 +77,7 @@ func (z *Decimal) scan(r io.ByteScanner, base int) (f *Decimal, b int, err error
 
 	// normalize mantissa and determine initial exponent contributions
 	exp2 := int64(0)
-	exp10 := int64(len(z.mant))*_WD - dnorm(z.mant)
+	exp10 := int64(len(z.mant))*_DW - dnorm(z.mant)
 
 	// determine binary or decimal exponent contribution of radix point
 	if fcount < 0 {
@@ -130,7 +130,7 @@ func (z *Decimal) scan(r io.ByteScanner, base int) (f *Decimal, b int, err error
 	// exp2 != 0
 
 	// // apply 2**exp2
-	p := new(Decimal).SetPrec(z.Prec() + _WD) // use more bits for p -- TODO(db47h) what is the right number?
+	p := new(Decimal).SetPrec(z.Prec() + _DW) // use more bits for p -- TODO(db47h) what is the right number?
 	if exp2 < 0 {
 		z.Quo(z, p.pow2(uint64(-exp2)))
 	} else {
@@ -143,7 +143,7 @@ func (z *Decimal) scan(r io.ByteScanner, base int) (f *Decimal, b int, err error
 // pow5 sets z to 5**n and returns z.
 // n must not be negative.
 func (z *Decimal) pow2(n uint64) *Decimal {
-	const m = _WD * 100000 / 30103 // maximum exponent such that 2**m < _BD
+	const m = _DW * 100000 / 30103 // maximum exponent such that 2**m < _BD
 	if n <= m {
 		return z.SetUint64(1 << n)
 	}
@@ -154,7 +154,7 @@ func (z *Decimal) pow2(n uint64) *Decimal {
 
 	// use more bits for f than for z
 	// TODO(db47h) what is the right number?
-	f := new(Decimal).SetPrec(z.Prec() + _WD).SetUint64(2)
+	f := new(Decimal).SetPrec(z.Prec() + _DW).SetUint64(2)
 
 	for n > 0 {
 		if n&1 != 0 {
diff --git a/decimal_test.go b/decimal_test.go
index 6e2e58c..6a32cde 100644
--- a/decimal_test.go
+++ b/decimal_test.go
@@ -2,7 +2,6 @@ package decimal
 
 import (
 	"math/big"
-	"math/bits"
 	"math/rand"
 	"reflect"
 	"strconv"
@@ -23,21 +22,21 @@ var intData = []struct {
 	{"1234567890123456789_0123456789012345678_9012345678901234567_8901234567890123456_78901234567890", 0, 90,
 		dec{7890123456789000000, 8901234567890123456, 9012345678901234567, 123456789012345678, 1234567890123456789},
 		90, 90},
-	{"1235", 0, 0, dec{1235000000000000000}, _WD, 4},
+	{"1235", 0, 0, dec{1235000000000000000}, _DW, 4},
 	{"1235", 0, 3, dec{1240000000000000000}, 3, 4},
 	{"1245", 0, 3, dec{1240000000000000000}, 3, 4},
 	{"12451", 0, 3, dec{1250000000000000000}, 3, 5},
-	{"0", 0, 0, nil, _WD, 0},
+	{"0", 0, 0, nil, _DW, 0},
 }
 
 func TestDnorm(t *testing.T) {
 	for i := 0; i < 10000; i++ {
 	again:
-		w := uint(rand.Uint64()) % _BD
-		e := uint(rand.Intn(_WD + 1))
-		h, l := bits.Mul(w, pow10(e))
+		w := uint(rand.Uint64()) % _DB
+		e := uint(rand.Intn(_DW + 1))
+		h, l := mulWW(Word(w), Word(pow10(e)))
 		// convert h, l from base _B (2**64) to base _BD (10**19) or 2**32 -> 10**9
-		h, l = bits.Div(h, l, _BD)
+		h, l = divWDB(h, l)
 		d := dec{Word(l), Word(h)}.norm()
 		if len(d) == 0 {
 			if w == 0 {
@@ -48,8 +47,8 @@ func TestDnorm(t *testing.T) {
 		dd := dec(nil).set(d)
 		s := dnorm(dd)
 		// d should now have a single element with e shifted left
-		ew := w * pow10(_WD-decDigits(w))
-		es := int64(uint(len(d)*_WD) - (decDigits(w) + e))
+		ew := w * pow10(_DW-decDigits(w))
+		es := int64(uint(len(d)*_DW) - (decDigits(w) + e))
 		if dd[len(dd)-1] != Word(ew) || s != es {
 			t.Fatalf("%ve%v => dnorm(%v) = %v, %v --- Expected %d, %d",
 				w, e, d, dd, s, w, es)
@@ -99,11 +98,11 @@ func TestDecimal_SetString(t *testing.T) {
 func BenchmarkDnorm(b *testing.B) {
 	d := dec(nil).make(1000)
 	for i := range d {
-		d[i] = Word(rand.Uint64()) % _BD
+		d[i] = Word(rand.Uint64()) % _DB
 	}
 	for i := 0; i < b.N; i++ {
-		d[0] = Word(rand.Uint64()) % _BD
-		d[len(d)-1] = Word(rand.Uint64()) % _BD
+		d[0] = Word(rand.Uint64()) % _DB
+		d[len(d)-1] = Word(rand.Uint64()) % _DB
 		benchU = uint(dnorm(d))
 	}
 }