From f86aba1d0da2664f87d89a515b2686d610f30107 Mon Sep 17 00:00:00 2001
From: Denis Bernard <db047h@gmail.com>
Date: Fri, 8 May 2020 03:43:34 +0200
Subject: [PATCH] dec: implement add, sub, div, and modW.

Import nat_test.go from stdlib => dec_test.go.
---
 dec.go            | 345 +++++++++++++++++--
 dec_arith.go      |  82 ++++-
 dec_arith_test.go |  20 +-
 dec_conv.go       |   5 +
 dec_conv_test.go  |  43 +--
 dec_test.go       | 833 ++++++++++++++++++++++++++++++++++++++++++----
 decimal_test.go   |   3 -
 stdlib.go         |  14 +-
 8 files changed, 1212 insertions(+), 133 deletions(-)

diff --git a/dec.go b/dec.go
index dd3fcec..fc51b65 100644
--- a/dec.go
+++ b/dec.go
@@ -14,9 +14,10 @@ const (
 	_WD = _W * 30103 / 100000
 	// Decimal base for a word. 1e9 for 32 bits words and 1e19 for 64 bits
 	// words.
-	// TODO(db47h): We want this value to be a const. This is a dirty hack to
-	// avoid conditional compilation that will break if bits.UintSize>64
+	// We want this value to be a const. This is a dirty hack to avoid
+	// conditional compilation; it will break if bits.UintSize != 32 or 64
 	_BD = 9999999998000000000*(_WD/19) + 1000000000*(_WD/9)
+	_MD = _BD - 1
 )
 
 // dec is an unsigned integer x of the form
@@ -34,6 +35,7 @@ type dec []Word
 
 var (
 	decOne = dec{1}
+	decTen = dec{10}
 )
 
 func (z dec) clear() {
@@ -158,6 +160,60 @@ func (x dec) sticky(i uint) uint {
 	return 0
 }
 
+func (z dec) add(x, y dec) dec {
+	m := len(x)
+	n := len(y)
+
+	switch {
+	case m < n:
+		return z.add(y, x)
+	case m == 0:
+		// n == 0 because m >= n; result is 0
+		return z[:0]
+	case n == 0:
+		// result is x
+		return z.set(x)
+	}
+	// m > 0
+
+	z = z.make(m + 1)
+	c := add10VV(z[0:n], x, y)
+	if m > n {
+		c = add10VW(z[n:m], x[n:], c)
+	}
+	z[m] = c
+
+	return z.norm()
+}
+
+func (z dec) sub(x, y dec) dec {
+	m := len(x)
+	n := len(y)
+
+	switch {
+	case m < n:
+		panic("underflow")
+	case m == 0:
+		// n == 0 because m >= n; result is 0
+		return z[:0]
+	case n == 0:
+		// result is x
+		return z.set(x)
+	}
+	// m > 0
+
+	z = z.make(m)
+	c := sub10VV(z[0:n], x, y)
+	if m > n {
+		c = sub10VW(z[n:], x[n:], c)
+	}
+	if c != 0 {
+		panic("underflow")
+	}
+
+	return z.norm()
+}
+
 func (x dec) cmp(y dec) (r int) {
 	m := len(x)
 	n := len(y)
@@ -205,6 +261,166 @@ func (z dec) divW(x dec, y Word) (q dec, r Word) {
 	return
 }
 
+func (z dec) div(z2, u, v dec) (q, r dec) {
+	if len(v) == 0 {
+		panic("division by zero")
+	}
+
+	if u.cmp(v) < 0 {
+		q = z[:0]
+		r = z2.set(u)
+		return
+	}
+
+	if len(v) == 1 {
+		var r2 Word
+		q, r2 = z.divW(u, v[0])
+		r = z2.setWord(r2)
+		return
+	}
+
+	q, r = z.divLarge(z2, u, v)
+	return
+}
+
+// getDec returns a *dec of len n. The contents may not be zero.
+// The pool holds *dec to avoid allocation when converting to interface{}.
+func getDec(n int) *dec {
+	var z *dec
+	if v := decPool.Get(); v != nil {
+		z = v.(*dec)
+	}
+	if z == nil {
+		z = new(dec)
+	}
+	*z = z.make(n)
+	return z
+}
+
+func putDec(x *dec) {
+	decPool.Put(x)
+}
+
+var decPool sync.Pool
+
+// q = (uIn-r)/vIn, with 0 <= r < vIn
+// Uses z as storage for q, and u as storage for r if possible.
+// See Knuth, Volume 2, section 4.3.1, Algorithm D.
+// Preconditions:
+//    len(vIn) >= 2
+//    len(uIn) >= len(vIn)
+//    u must not alias z
+func (z dec) divLarge(u, uIn, vIn dec) (q, r dec) {
+	n := len(vIn)
+	m := len(uIn)
+
+	// D1.
+	d := _BD / (vIn[n-1] + 1)
+	// do not modify vIn, it may be used by another goroutine simultaneously
+	vp := getDec(n)
+	v := *vp
+	mulAdd10VWW(v, vIn, d, 0)
+
+	// u may safely alias uIn or vIn, the value of uIn is used to set u and vIn was already used
+	u = u.make(m + 1)
+	u[m] = mulAdd10VWW(u[:m], uIn, d, 0)
+
+	// z may safely alias uIn or vIn, both values were used already
+	if alias(z, u) {
+		z = nil // z is an alias for u - cannot reuse
+	}
+	q = z.make(m - n + 1)
+
+	// TODO(db47h): implement divRecursive
+	// if n < divRecursiveThreshold {
+	q.divBasic(u, v)
+	// } else {
+	// 	q.divRecursive(u, v)
+	// }
+	putDec(vp)
+
+	q = q.norm()
+	r, _ = u.divW(u, d)
+	r = r.norm()
+	return q, r
+}
+
+// divBasic performs word-by-word division of u by v.
+// The quotient is written in pre-allocated q.
+// The remainder overwrites input u.
+//
+// Precondition:
+// - len(q) >= len(u)-len(v)
+func (q dec) divBasic(u, v dec) {
+	n := len(v)
+	m := len(u) - n
+
+	qhatvp := getDec(n + 1)
+	qhatv := *qhatvp
+	// D2.
+	vn1 := v[n-1]
+	for j := m; j >= 0; j-- {
+		// D3.
+		qhat := Word(_MD)
+		var ujn Word
+		if j+n < len(u) {
+			ujn = u[j+n]
+		}
+		if ujn != vn1 {
+			var rhat Word
+			qhat, rhat = div10WW(ujn, u[j+n-1], vn1)
+			// x1 | x2 = q̂v_{n-2}
+			vn2 := v[n-2]
+			x1, x2 := mul10WW(qhat, vn2)
+			// test if q̂v_{n-2} > br̂ + u_{j+n-2}
+			ujn2 := u[j+n-2]
+			for greaterThan(x1, x2, rhat, ujn2) {
+				qhat--
+				prevRhat := rhat
+				rhat += vn1
+				// v[n-1] >= 0, so this tests for overflow.
+				if rhat < prevRhat {
+					break
+				}
+				x1, x2 = mul10WW(qhat, vn2)
+			}
+		}
+
+		// D4.
+		qhatv[n] = mulAdd10VWW(qhatv[0:n], v, qhat, 0)
+		qhl := len(qhatv)
+		if j+qhl > len(u) && qhatv[n] == 0 {
+			qhl--
+		}
+		c := sub10VV(u[j:j+qhl], u[j:], qhatv)
+		if c != 0 {
+			c := add10VV(u[j:j+n], u[j:], v)
+			u[j+n] += c
+			qhat--
+		}
+
+		if j == m && m == len(q) && qhat == 0 {
+			continue
+		}
+		q[j] = qhat
+	}
+
+	putDec(qhatvp)
+}
+
+// greaterThan reports whether (x1*_BD + x2) > (y1*_BD + y2)
+func greaterThan(x1, x2, y1, y2 Word) bool {
+	return x1 > y1 || x1 == y1 && x2 > y2
+}
+
+// modW returns x % d.
+func (x dec) modW(d Word) (r Word) {
+	for i := len(x) - 1; i >= 0; i-- {
+		_, r = div10WW(r, x[i], d)
+	}
+	return r
+}
+
 func (z dec) mulAddWW(x dec, y, r Word) dec {
 	m := len(x)
 	if m == 0 || y == 0 {
@@ -267,26 +483,6 @@ func (z dec) shr(x dec, s uint) dec {
 	return z.norm()
 }
 
-// getDec returns a *dec of len n. The contents may not be zero.
-// The pool holds *dec to avoid allocation when converting to interface{}.
-func getDec(n int) *dec {
-	var z *dec
-	if v := decPool.Get(); v != nil {
-		z = v.(*dec)
-	}
-	if z == nil {
-		z = new(dec)
-	}
-	*z = z.make(n)
-	return z
-}
-
-func putDec(x *dec) {
-	decPool.Put(x)
-}
-
-var decPool sync.Pool
-
 // Operands that are shorter than basicSqrThreshold are squared using
 // "grade school" multiplication; for operands longer than karatsubaSqrThreshold
 // we use the Karatsuba algorithm optimized for x == y.
@@ -448,3 +644,108 @@ func (z dec) mul(x, y dec) dec {
 
 	// return z.norm()
 }
+
+func (z dec) expNN(x, y, m dec) dec {
+	panic("not implemented")
+}
+
+// // If m != 0 (i.e., len(m) != 0), expNN sets z to x**y mod m;
+// // otherwise it sets z to x**y. The result is the value of z.
+// func (z dec) expNN(x, y, m dec) dec {
+// 	if alias(z, x) || alias(z, y) {
+// 		// We cannot allow in-place modification of x or y.
+// 		z = nil
+// 	}
+
+// 	// x**y mod 1 == 0
+// 	if len(m) == 1 && m[0] == 1 {
+// 		return z.setWord(0)
+// 	}
+// 	// m == 0 || m > 1
+
+// 	// x**0 == 1
+// 	if len(y) == 0 {
+// 		return z.setWord(1)
+// 	}
+// 	// y > 0
+
+// 	// x**1 mod m == x mod m
+// 	if len(y) == 1 && y[0] == 1 && len(m) != 0 {
+// 		_, z = dec(nil).div(z, x, m)
+// 		return z
+// 	}
+// 	// y > 1
+
+// 	if len(m) != 0 {
+// 		// We likely end up being as long as the modulus.
+// 		z = z.make(len(m))
+// 	}
+// 	z = z.set(x)
+
+// 	// If the base is non-trivial and the exponent is large, we use
+// 	// 4-bit, windowed exponentiation. This involves precomputing 14 values
+// 	// (x^2...x^15) but then reduces the number of multiply-reduces by a
+// 	// third. Even for a 32-bit exponent, this reduces the number of
+// 	// operations. Uses Montgomery method for odd moduli.
+// 	if x.cmp(decOne) > 0 && len(y) > 1 && len(m) > 0 {
+// 		if m[0]&1 == 1 {
+// 			return z.expNNMontgomery(x, y, m)
+// 		}
+// 		return z.expNNWindowed(x, y, m)
+// 	}
+
+// 	v := y[len(y)-1] // v > 0 because y is normalized and y > 0
+// 	shift := nlz(v) + 1
+// 	v <<= shift
+// 	var q nat
+
+// 	const mask = 1 << (_W - 1)
+
+// 	// We walk through the bits of the exponent one by one. Each time we
+// 	// see a bit, we square, thus doubling the power. If the bit is a one,
+// 	// we also multiply by x, thus adding one to the power.
+
+// 	w := _W - int(shift)
+// 	// zz and r are used to avoid allocating in mul and div as
+// 	// otherwise the arguments would alias.
+// 	var zz, r nat
+// 	for j := 0; j < w; j++ {
+// 		zz = zz.sqr(z)
+// 		zz, z = z, zz
+
+// 		if v&mask != 0 {
+// 			zz = zz.mul(z, x)
+// 			zz, z = z, zz
+// 		}
+
+// 		if len(m) != 0 {
+// 			zz, r = zz.div(r, z, m)
+// 			zz, r, q, z = q, z, zz, r
+// 		}
+
+// 		v <<= 1
+// 	}
+
+// 	for i := len(y) - 2; i >= 0; i-- {
+// 		v = y[i]
+
+// 		for j := 0; j < _W; j++ {
+// 			zz = zz.sqr(z)
+// 			zz, z = z, zz
+
+// 			if v&mask != 0 {
+// 				zz = zz.mul(z, x)
+// 				zz, z = z, zz
+// 			}
+
+// 			if len(m) != 0 {
+// 				zz, r = zz.div(r, z, m)
+// 				zz, r, q, z = q, z, zz, r
+// 			}
+
+// 			v <<= 1
+// 		}
+// 	}
+
+// 	return z.norm()
+// }
diff --git a/dec_arith.go b/dec_arith.go
index db31db8..9041faf 100644
--- a/dec_arith.go
+++ b/dec_arith.go
@@ -45,7 +45,7 @@ func decDigits32(x uint) (n uint) {
 	return n
 }
 
-func nldz(x Word) uint {
+func nlz10(x Word) uint {
 	if bits.UintSize == 32 {
 		return _WD - decDigits32(uint(x))
 	}
@@ -159,34 +159,41 @@ func decMaxPow(b Word) (p Word, n int) {
 
 // add10VW adds y to x. The resulting carry c is either 0 or 1.
 func add10VW(z, x dec, y Word) (c Word) {
-	z[0], c = add10WW(x[0], y)
+	z[0], c = add10WWW(x[0], y, 0)
 	// propagate carry
 	for i := 1; i < len(z) && i < len(x); i++ {
 		s := x[i] + c
-		if s >= _BD {
-			z[i] = 0
-			continue
-		}
-		// c = 0 from this point
-		z[i] = s
-		// copy remaining digits if not adding in-place
-		if !same(z, x) {
+		if s < _BD {
+			z[i] = s
+			// copy remaining digits
 			copy(z[i+1:], x[i+1:])
+			return 0
 		}
-		return 0
+		z[i] = 0
 	}
-	return c
+	return
 }
 
 func div10WVW(z []Word, xn Word, x []Word, y Word) (r Word) {
 	r = xn
 	for i := len(z) - 1; i >= 0; i-- {
-		h, l := mulAddWWW(r, _BD, x[i])
-		z[i], r = divWW(h, l, y)
+		z[i], r = div10WW(r, x[i], y)
 	}
 	return
 }
 
+// q = (u1<<_W + u0 - r)/v
+func div10WW(u1, u0, v Word) (q, r Word) {
+	// q < _BD if u1 < v < _BD
+	if debugDecimal && !(u1 < v && v < _BD) {
+		panic("decimal overflow")
+	}
+	// convert to base 2
+	hi, lo := mulAddWWW(u1, _BD, u0)
+	// q = (u-r)/v. Since v < _BD => r < _BD
+	return divWW(hi, lo, v)
+}
+
 func mulAdd10VWW(z, x []Word, y, r Word) (c Word) {
 	c = r
 	// The comment near the top of this file discusses this for loop condition.
@@ -212,11 +219,11 @@ func mul10WW(x, y Word) (z1, z0 Word) {
 	return Word(hi), Word(lo)
 }
 
-func add10WW(x, y Word) (s, c Word) {
-	r, cc := bits.Add(uint(x), uint(y), 0)
-	if r >= _BD {
-		r -= _BD
+func add10WWW(x, y, cIn Word) (s, c Word) {
+	r, cc := bits.Add(uint(x), uint(y), uint(cIn))
+	if cc != 0 || r >= _BD {
 		cc = 1
+		r -= _BD
 	}
 	return Word(r), Word(cc)
 }
@@ -231,3 +238,42 @@ func addMul10VVW(z, x []Word, y Word) (c Word) {
 	}
 	return
 }
+
+// The resulting carry c is either 0 or 1.
+func add10VV(z, x, y []Word) (c Word) {
+	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
+		z[i], c = add10WWW(x[i], y[i], c)
+	}
+	return
+}
+
+func sub10WWW(x, y, b Word) (d, c Word) {
+	dd, cc := bits.Sub(uint(x), uint(y), uint(b))
+	if cc != 0 {
+		dd += _BD
+	}
+	return Word(dd), Word(cc)
+}
+
+// The resulting carry c is either 0 or 1.
+func sub10VV(z, x, y []Word) (c Word) {
+	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
+		z[i], c = sub10WWW(x[i], y[i], c)
+	}
+	return
+}
+
+func sub10VW(z, x []Word, y Word) (c Word) {
+	c = y
+	for i := 0; i < len(z) && i < len(x); i++ {
+		zi, cc := bits.Sub(uint(x[i]), uint(c), 0)
+		c = Word(cc)
+		if c == 0 {
+			z[i] = Word(zi)
+			copy(z[i+1:], x[i+1:])
+			return
+		}
+		z[i] = Word(zi + _BD)
+	}
+	return
+}
diff --git a/dec_arith_test.go b/dec_arith_test.go
index 9a3a597..13f764b 100644
--- a/dec_arith_test.go
+++ b/dec_arith_test.go
@@ -1,11 +1,9 @@
 package decimal
 
 import (
-	"math/rand"
 	"reflect"
 	"strconv"
 	"testing"
-	"time"
 )
 
 func TestAdd10VW(t *testing.T) {
@@ -38,9 +36,8 @@ func TestAdd10VW(t *testing.T) {
 }
 
 func TestDecDigits(t *testing.T) {
-	rand.Seed(time.Now().UnixNano())
 	for i := 0; i < 10000; i++ {
-		n := uint(rand.Uint64())
+		n := uint(rnd.Uint64())
 		d := uint(0)
 		for m := n; m != 0; m /= 10 {
 			d++
@@ -52,8 +49,19 @@ func TestDecDigits(t *testing.T) {
 }
 
 func BenchmarkDecDigits(b *testing.B) {
-	rand.Seed(0xdeadbeefbadf00d)
 	for i := 0; i < b.N; i++ {
-		benchU = decDigits(uint(rand.Uint64()) % _BD)
+		benchU = decDigits(uint(rnd.Uint64()) % _BD)
 	}
 }
+
+func rnd10W() Word {
+	return Word(rnd.Uint64() % _BD)
+}
+
+func rnd10V(n int) []Word {
+	v := make([]Word, n)
+	for i := range v {
+		v[i] = rnd10W()
+	}
+	return v
+}
diff --git a/dec_conv.go b/dec_conv.go
index fe3c417..9201723 100644
--- a/dec_conv.go
+++ b/dec_conv.go
@@ -271,6 +271,11 @@ type decDivisor struct {
 	ndigits int // digit length of divisor in terms of output base digits
 }
 
+// expWW computes x**y
+func (z dec) expWW(x, y Word) dec {
+	return z.expNN(dec(nil).setWord(x), dec(nil).setWord(y), nil)
+}
+
 func divisors(m int, b Word, ndigits int, bb Word) []decDivisor {
 	return nil
 }
diff --git a/dec_conv_test.go b/dec_conv_test.go
index 6790c46..4f87f4e 100644
--- a/dec_conv_test.go
+++ b/dec_conv_test.go
@@ -5,6 +5,7 @@
 package decimal
 
 import (
+	"fmt"
 	"io"
 	"math"
 	"strings"
@@ -360,27 +361,27 @@ func BenchmarkDecStringPiParallel(b *testing.B) {
 // 	}
 // }
 
-// func BenchmarkString(b *testing.B) {
-// 	const x = 10
-// 	for _, base := range []int{2, 8, 10, 16} {
-// 		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
-// 			if isRaceBuilder && y > 1000 {
-// 				continue
-// 			}
-// 			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
-// 				b.StopTimer()
-// 				var z dec
-// 				z = z.expWW(x, y)
-// 				z.utoa(base) // warm divisor cache
-// 				b.StartTimer()
-
-// 				for i := 0; i < b.N; i++ {
-// 					_ = z.utoa(base)
-// 				}
-// 			})
-// 		}
-// 	}
-// }
+func BenchmarkString(b *testing.B) {
+	const x = 10
+	for _, base := range []int{2, 8, 10, 16} {
+		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			if isRaceBuilder && y > 1000 {
+				continue
+			}
+			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
+				b.StopTimer()
+				var z dec
+				z = z.expWW(x, y)
+				z.utoa(base) // warm divisor cache
+				b.StartTimer()
+
+				for i := 0; i < b.N; i++ {
+					_ = z.utoa(base)
+				}
+			})
+		}
+	}
+}
 
 // func BenchmarkLeafSize(b *testing.B) {
 // 	for n := 0; n <= 16; n++ {
diff --git a/dec_test.go b/dec_test.go
index ce82d08..29bc5d1 100644
--- a/dec_test.go
+++ b/dec_test.go
@@ -1,79 +1,788 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 package decimal
 
 import (
-	"math/bits"
-	"math/rand"
-	"strconv"
+	"fmt"
+	"runtime"
+	"strings"
 	"testing"
-	"time"
 )
 
-func TestDec_ntz(t *testing.T) {
-	rand.Seed(time.Now().UnixNano())
-	for i := 0; i < 10000; i++ {
-		w := uint(rand.Uint64()) % _BD
-		e := uint(rand.Intn(_WD + 1))
-		h, l := bits.Mul(w, pow10(e))
-		h, l = bits.Div(h, l, _BD)
-		d := dec{Word(l), Word(h)}.norm()
-		// adjust e if w == 0 or w%10 == 0
-		if w == 0 {
-			e = 0
-		} else {
-			e += decTrailingZeros(w)
-		}
-		if d.ntz() != e {
-			t.Fatalf("dec{%v}.ntz() = %d, expected %d", d, d.ntz(), e)
-		}
-	}
-}
-
-func TestDec_digit(t *testing.T) {
-	data := []struct {
-		d dec
-		n uint
-		r uint
-	}{
-		{dec{123}, 0, 3},
-		{dec{123}, 2, 1},
-		{dec{123}, 3, 0},
-		{dec{0, 1234567891234567891}, 37, 1},
-		{dec{0, 1234567891234567891}, 36, 2},
-		{dec{0, 1234567891234567891}, 38, 0},
-	}
-	for di, d := range data {
-		t.Run(strconv.Itoa(di), func(t *testing.T) {
-			if dig := d.d.digit(d.n); dig != d.r {
-				t.Fatalf("%v.digit(%d) = %d, expected %d", d.d, d.n, dig, d.r)
-			}
-		})
+var cmpTests = []struct {
+	x, y dec
+	r    int
+}{
+	{nil, nil, 0},
+	{nil, dec(nil), 0},
+	{dec(nil), nil, 0},
+	{dec(nil), dec(nil), 0},
+	{dec{0}, dec{0}, 0},
+	{dec{0}, dec{1}, -1},
+	{dec{1}, dec{0}, 1},
+	{dec{1}, dec{1}, 0},
+	{dec{0, _MD}, dec{1}, 1},
+	{dec{1}, dec{0, _MD}, -1},
+	{dec{1, _MD}, dec{0, _MD}, 1},
+	{dec{0, _MD}, dec{1, _MD}, -1},
+	{dec{16, 571956, 8794, 68}, dec{837, 9146, 1, 754489}, -1},
+	{dec{34986, 41, 105, 1957}, dec{56, 7458, 104, 1957}, 1},
+}
+
+func TestCmp(t *testing.T) {
+	for i, a := range cmpTests {
+		r := a.x.cmp(a.y)
+		if r != a.r {
+			t.Errorf("#%d got r = %v; want %v", i, r, a.r)
+		}
+	}
+}
+
+type funNN func(z, x, y dec) dec
+type argNN struct {
+	z, x, y dec
+}
+
+var sumNN = []argNN{
+	{},
+	{dec{1}, nil, dec{1}},
+	{dec{1111111110}, dec{123456789}, dec{987654321}},
+	{dec{0, 0, 0, 1}, nil, dec{0, 0, 0, 1}},
+	{dec{0, 0, 0, 1111111110}, dec{0, 0, 0, 123456789}, dec{0, 0, 0, 987654321}},
+	{dec{0, 0, 0, 1}, dec{0, 0, _MD}, dec{0, 0, 1}},
+}
+
+var prodNN = []argNN{
+	{},
+	{nil, nil, nil},
+	{nil, dec{991}, nil},
+	{dec{991}, dec{991}, dec{1}},
+	{dec{991 * 991}, dec{991}, dec{991}},
+	{dec{0, 0, 991 * 991}, dec{0, 991}, dec{0, 991}},
+	{dec{1 * 991, 2 * 991, 3 * 991, 4 * 991}, dec{1, 2, 3, 4}, dec{991}},
+	{dec{4, 11, 20, 30, 20, 11, 4}, dec{1, 2, 3, 4}, dec{4, 3, 2, 1}},
+	// 3^100 * 3^28 = 3^128
+	{
+		decFromString("11790184577738583171520872861412518665678211592275841109096961"),
+		decFromString("515377520732011331036461129765621272702107522001"),
+		decFromString("22876792454961"),
+	},
+	// z = 111....1 (70000 digits)
+	// x = 10^(99*700) + ... + 10^1400 + 10^700 + 1
+	// y = 111....1 (700 digits, larger than Karatsuba threshold on 32-bit and 64-bit)
+	{
+		decFromString(strings.Repeat("1", 70000)),
+		decFromString("1" + strings.Repeat(strings.Repeat("0", 699)+"1", 99)),
+		decFromString(strings.Repeat("1", 700)),
+	},
+	// z = 111....1 (20000 digits)
+	// x = 10^10000 + 1
+	// y = 111....1 (10000 digits)
+	{
+		decFromString(strings.Repeat("1", 20000)),
+		decFromString("1" + strings.Repeat("0", 9999) + "1"),
+		decFromString(strings.Repeat("1", 10000)),
+	},
+}
+
+func decFromString(s string) dec {
+	x, _, _, err := dec(nil).scan(strings.NewReader(s), 0, false)
+	if err != nil {
+		panic(err)
+	}
+	return x
+}
+
+func TestSet(t *testing.T) {
+	for _, a := range sumNN {
+		z := dec(nil).set(a.z)
+		if z.cmp(a.z) != 0 {
+			t.Errorf("got z = %v; want %v", z, a.z)
+		}
+	}
+}
+
+func testFunNN(t *testing.T, msg string, f funNN, a argNN) {
+	z := f(nil, a.x, a.y)
+	if z.cmp(a.z) != 0 {
+		t.Errorf("%s%+v\n\tgot z = %v; want %v", msg, a, z, a.z)
+	}
+}
+
+func TestFunNN(t *testing.T) {
+	for _, a := range sumNN {
+		arg := a
+		testFunNN(t, "add", dec.add, arg)
+
+		arg = argNN{a.z, a.y, a.x}
+		testFunNN(t, "add symmetric", dec.add, arg)
+
+		arg = argNN{a.x, a.z, a.y}
+		testFunNN(t, "sub", dec.sub, arg)
+
+		arg = argNN{a.y, a.z, a.x}
+		testFunNN(t, "sub symmetric", dec.sub, arg)
+	}
+
+	for _, a := range prodNN {
+		arg := a
+		testFunNN(t, "mul", dec.mul, arg)
+
+		arg = argNN{a.z, a.y, a.x}
+		testFunNN(t, "mul symmetric", dec.mul, arg)
+	}
+}
+
+// var mulRangesN = []struct {
+// 	a, b uint64
+// 	prod string
+// }{
+// 	{0, 0, "0"},
+// 	{1, 1, "1"},
+// 	{1, 2, "2"},
+// 	{1, 3, "6"},
+// 	{10, 10, "10"},
+// 	{0, 100, "0"},
+// 	{0, 1e9, "0"},
+// 	{1, 0, "1"},                    // empty range
+// 	{100, 1, "1"},                  // empty range
+// 	{1, 10, "3628800"},             // 10!
+// 	{1, 20, "2432902008176640000"}, // 20!
+// 	{1, 100,
+// 		"933262154439441526816992388562667004907159682643816214685929" +
+// 			"638952175999932299156089414639761565182862536979208272237582" +
+// 			"51185210916864000000000000000000000000", // 100!
+// 	},
+// }
+
+// func TestMulRangeN(t *testing.T) {
+// 	for i, r := range mulRangesN {
+// 		prod := string(nat(nil).mulRange(r.a, r.b).utoa(10))
+// 		if prod != r.prod {
+// 			t.Errorf("#%d: got %s; want %s", i, prod, r.prod)
+// 		}
+// 	}
+// }
+
+// allocBytes returns the number of bytes allocated by invoking f.
+func allocBytes(f func()) uint64 {
+	var stats runtime.MemStats
+	runtime.ReadMemStats(&stats)
+	t := stats.TotalAlloc
+	f()
+	runtime.ReadMemStats(&stats)
+	return stats.TotalAlloc - t
+}
+
+// TestMulUnbalanced tests that multiplying numbers of different lengths
+// does not cause deep recursion and in turn allocate too much memory.
+// Test case for issue 3807.
+func TestMulUnbalanced(t *testing.T) {
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
+	x := rndDec(50000)
+	y := rndDec(40)
+	allocSize := allocBytes(func() {
+		dec(nil).mul(x, y)
+	})
+	inputSize := uint64(len(x)+len(y)) * _S
+	if ratio := allocSize / uint64(inputSize); ratio > 10 {
+		t.Errorf("multiplication uses too much memory (%d > %d times the size of inputs)", allocSize, ratio)
+	}
+}
+
+// rndDec returns a random dec value >= 0 of (usually) n words in length.
+// In extremely unlikely cases it may be smaller than n words if the top-
+// most words are 0.
+func rndDec(n int) dec {
+	return dec(rnd10V(n)).norm()
+}
+
+// rndDec1 is like rndDec but the result is guaranteed to be > 0.
+func rndDec1(n int) dec {
+	x := dec(rnd10V(n)).norm()
+	if len(x) == 0 {
+		x.setWord(1)
 	}
+	return x
 }
 
-func TestDecSetUint64(t *testing.T) {
-	data := []struct {
-		in  uint64
-		exp int32
-	}{
-		{_BD + 1, int32(_WD + 1)},
-		{_BD - 1, int32(_WD)},
-		{9999, 4},
+func BenchmarkMul(b *testing.B) {
+	mulx := rndDec(1e4)
+	muly := rndDec(1e4)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		var z dec
+		z.mul(mulx, muly)
 	}
+}
+
+func benchmarkDecMul(b *testing.B, nwords int) {
+	x := rndDec(nwords)
+	y := rndDec(nwords)
 	var z dec
-	for _, d := range data {
-		out := fmt.Sprintf("%d", d.in)
-		t.Run(out, func(t *testing.T) {
-			var exp int32
-			z, exp = z.setUint64(d.in)
-			if a := string(z.utoa(10)); a != out {
-				t.Fatalf("expected mantissa %v, got %v", out, a)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		z.mul(x, y)
+	}
+}
+
+var mulBenchSizes = []int{10, 100, 1000, 10000, 100000}
+
+func BenchmarkDecMul(b *testing.B) {
+	for _, n := range mulBenchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			benchmarkDecMul(b, n)
+		})
+	}
+}
+
+func TestNLZ(t *testing.T) {
+	var x Word = _MD
+	for i := 0; i <= _WD; i++ {
+		if int(nlz10(x)) != i {
+			t.Errorf("failed at %x: got %d want %d", x, nlz10(x), i)
+		}
+		x /= 10
+	}
+}
+
+type shiftTest struct {
+	in    dec
+	shift uint
+	out   dec
+}
+
+var leftShiftTests = []shiftTest{
+	{nil, 0, nil},
+	{nil, 1, nil},
+	{decOne, 0, decOne},
+	{decOne, 1, decTen},
+	{dec{_BD / 10}, 1, dec{0}},
+	{dec{_BD / 10, 0}, 1, dec{0, 1}},
+}
+
+func TestShiftLeft(t *testing.T) {
+	for i, test := range leftShiftTests {
+		var z dec
+		z = z.shl(test.in, test.shift)
+		for j, d := range test.out {
+			if j >= len(z) || z[j] != d {
+				t.Errorf("#%d: got: %v want: %v", i, z, test.out)
+				break
 			}
-			if exp != d.exp {
-				t.Fatalf("expected exponent %v, got %v", d.exp, exp)
+		}
+	}
+}
+
+var rightShiftTests = []shiftTest{
+	{nil, 0, nil},
+	{nil, 1, nil},
+	{decOne, 0, decOne},
+	{decOne, 1, nil},
+	{decTen, 1, decOne},
+	{dec{0, 1}, 1, dec{_BD / 10}},
+	{dec{10, 1, 1}, 1, dec{_BD/10 + 1, _BD / 10}},
+}
+
+func TestShiftRight(t *testing.T) {
+	for i, test := range rightShiftTests {
+		var z dec
+		z = z.shr(test.in, test.shift)
+		for j, d := range test.out {
+			if j >= len(z) || z[j] != d {
+				t.Errorf("#%d: got: %v want: %v", i, z, test.out)
+				break
 			}
-			if l := (exp + _WD - 1) / _WD; l != int32(len(z)) {
-				t.Fatalf("expected length %v, got %v", l, len(z))
+		}
+	}
+}
+
+func BenchmarkZeroShifts(b *testing.B) {
+	x := rndDec(800)
+
+	b.Run("Shl", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			var z dec
+			z.shl(x, 0)
+		}
+	})
+	b.Run("ShlSame", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			x.shl(x, 0)
+		}
+	})
+
+	b.Run("Shr", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			var z dec
+			z.shr(x, 0)
+		}
+	})
+	b.Run("ShrSame", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			x.shr(x, 0)
+		}
+	})
+}
+
+type modWTest struct {
+	in       string
+	dividend string
+	out      string
+}
+
+var modWTests32 = []modWTest{
+	{"23492635982634928349238759823742", "252341", "220170"},
+}
+
+var modWTests64 = []modWTest{
+	{"6527895462947293856291561095690465243862946", "524326975699234", "375066989628668"},
+}
+
+func runModWTests(t *testing.T, tests []modWTest) {
+	for i, test := range tests {
+		in := decFromString(test.in)
+		d := decFromString(test.dividend)
+		out := decFromString(test.out)
+
+		r := in.modW(d[0])
+		if r != out[0] {
+			t.Errorf("#%d failed: got %d want %s", i, r, out.utoa(10))
+		}
+	}
+}
+
+func TestModW(t *testing.T) {
+	if _W >= 32 {
+		runModWTests(t, modWTests32)
+	}
+	if _W >= 64 {
+		runModWTests(t, modWTests64)
+	}
+}
+
+// var montgomeryTests = []struct {
+// 	x, y, m      string
+// 	k0           uint64
+// 	out32, out64 string
+// }{
+// 	{
+// 		"0xffffffffffffffffffffffffffffffffffffffffffffffffe",
+// 		"0xffffffffffffffffffffffffffffffffffffffffffffffffe",
+// 		"0xfffffffffffffffffffffffffffffffffffffffffffffffff",
+// 		1,
+// 		"0x1000000000000000000000000000000000000000000",
+// 		"0x10000000000000000000000000000000000",
+// 	},
+// 	{
+// 		"0x000000000ffffff5",
+// 		"0x000000000ffffff0",
+// 		"0x0000000010000001",
+// 		0xff0000000fffffff,
+// 		"0x000000000bfffff4",
+// 		"0x0000000003400001",
+// 	},
+// 	{
+// 		"0x0000000080000000",
+// 		"0x00000000ffffffff",
+// 		"0x1000000000000001",
+// 		0xfffffffffffffff,
+// 		"0x0800000008000001",
+// 		"0x0800000008000001",
+// 	},
+// 	{
+// 		"0x0000000080000000",
+// 		"0x0000000080000000",
+// 		"0xffffffff00000001",
+// 		0xfffffffeffffffff,
+// 		"0xbfffffff40000001",
+// 		"0xbfffffff40000001",
+// 	},
+// 	{
+// 		"0x0000000080000000",
+// 		"0x0000000080000000",
+// 		"0x00ffffff00000001",
+// 		0xfffffeffffffff,
+// 		"0xbfffff40000001",
+// 		"0xbfffff40000001",
+// 	},
+// 	{
+// 		"0x0000000080000000",
+// 		"0x0000000080000000",
+// 		"0x0000ffff00000001",
+// 		0xfffeffffffff,
+// 		"0xbfff40000001",
+// 		"0xbfff40000001",
+// 	},
+// 	{
+// 		"0x3321ffffffffffffffffffffffffffff00000000000022222623333333332bbbb888c0",
+// 		"0x3321ffffffffffffffffffffffffffff00000000000022222623333333332bbbb888c0",
+// 		"0x33377fffffffffffffffffffffffffffffffffffffffffffff0000000000022222eee1",
+// 		0xdecc8f1249812adf,
+// 		"0x04eb0e11d72329dc0915f86784820fc403275bf2f6620a20e0dd344c5cd0875e50deb5",
+// 		"0x0d7144739a7d8e11d72329dc0915f86784820fc403275bf2f61ed96f35dd34dbb3d6a0",
+// 	},
+// 	{
+// 		"0x10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffffffffffff00000000000022222223333333333444444444",
+// 		"0x10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffffffffffff999999999999999aaabbbbbbbbcccccccccccc",
+// 		"0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff33377fffffffffffffffffffffffffffffffffffffffffffff0000000000022222eee1",
+// 		0xdecc8f1249812adf,
+// 		"0x5c0d52f451aec609b15da8e5e5626c4eaa88723bdeac9d25ca9b961269400410ca208a16af9c2fb07d7a11c7772cba02c22f9711078d51a3797eb18e691295293284d988e349fa6deba46b25a4ecd9f715",
+// 		"0x92fcad4b5c0d52f451aec609b15da8e5e5626c4eaa88723bdeac9d25ca9b961269400410ca208a16af9c2fb07d799c32fe2f3cc5422f9711078d51a3797eb18e691295293284d8f5e69caf6decddfe1df6",
+// 	},
+// }
+
+// func TestMontgomery(t *testing.T) {
+// 	one := NewInt(1)
+// 	_B := new(Int).Lsh(one, _W)
+// 	for i, test := range montgomeryTests {
+// 		x := decFromString(test.x)
+// 		y := decFromString(test.y)
+// 		m := decFromString(test.m)
+// 		for len(x) < len(m) {
+// 			x = append(x, 0)
+// 		}
+// 		for len(y) < len(m) {
+// 			y = append(y, 0)
+// 		}
+
+// 		if x.cmp(m) > 0 {
+// 			_, r := nat(nil).div(nil, x, m)
+// 			t.Errorf("#%d: x > m (0x%s > 0x%s; use 0x%s)", i, x.utoa(16), m.utoa(16), r.utoa(16))
+// 		}
+// 		if y.cmp(m) > 0 {
+// 			_, r := nat(nil).div(nil, x, m)
+// 			t.Errorf("#%d: y > m (0x%s > 0x%s; use 0x%s)", i, y.utoa(16), m.utoa(16), r.utoa(16))
+// 		}
+
+// 		var out nat
+// 		if _W == 32 {
+// 			out = decFromString(test.out32)
+// 		} else {
+// 			out = decFromString(test.out64)
+// 		}
+
+// 		// t.Logf("#%d: len=%d\n", i, len(m))
+
+// 		// check output in table
+// 		xi := &Int{abs: x}
+// 		yi := &Int{abs: y}
+// 		mi := &Int{abs: m}
+// 		p := new(Int).Mod(new(Int).Mul(xi, new(Int).Mul(yi, new(Int).ModInverse(new(Int).Lsh(one, uint(len(m))*_W), mi))), mi)
+// 		if out.cmp(p.abs.norm()) != 0 {
+// 			t.Errorf("#%d: out in table=0x%s, computed=0x%s", i, out.utoa(16), p.abs.norm().utoa(16))
+// 		}
+
+// 		// check k0 in table
+// 		k := new(Int).Mod(&Int{abs: m}, _B)
+// 		k = new(Int).Sub(_B, k)
+// 		k = new(Int).Mod(k, _B)
+// 		k0 := Word(new(Int).ModInverse(k, _B).Uint64())
+// 		if k0 != Word(test.k0) {
+// 			t.Errorf("#%d: k0 in table=%#x, computed=%#x\n", i, test.k0, k0)
+// 		}
+
+// 		// check montgomery with correct k0 produces correct output
+// 		z := nat(nil).montgomery(x, y, m, k0, len(m))
+// 		z = z.norm()
+// 		if z.cmp(out) != 0 {
+// 			t.Errorf("#%d: got 0x%s want 0x%s", i, z.utoa(16), out.utoa(16))
+// 		}
+// 	}
+// }
+
+// var expNNTests = []struct {
+// 	x, y, m string
+// 	out     string
+// }{
+// 	{"0", "0", "0", "1"},
+// 	{"0", "0", "1", "0"},
+// 	{"1", "1", "1", "0"},
+// 	{"2", "1", "1", "0"},
+// 	{"2", "2", "1", "0"},
+// 	{"10", "100000000000", "1", "0"},
+// 	{"0x8000000000000000", "2", "", "0x40000000000000000000000000000000"},
+// 	{"0x8000000000000000", "2", "6719", "4944"},
+// 	{"0x8000000000000000", "3", "6719", "5447"},
+// 	{"0x8000000000000000", "1000", "6719", "1603"},
+// 	{"0x8000000000000000", "1000000", "6719", "3199"},
+// 	{
+// 		"2938462938472983472983659726349017249287491026512746239764525612965293865296239471239874193284792387498274256129746192347",
+// 		"298472983472983471903246121093472394872319615612417471234712061",
+// 		"29834729834729834729347290846729561262544958723956495615629569234729836259263598127342374289365912465901365498236492183464",
+// 		"23537740700184054162508175125554701713153216681790245129157191391322321508055833908509185839069455749219131480588829346291",
+// 	},
+// 	{
+// 		"11521922904531591643048817447554701904414021819823889996244743037378330903763518501116638828335352811871131385129455853417360623007349090150042001944696604737499160174391019030572483602867266711107136838523916077674888297896995042968746762200926853379",
+// 		"426343618817810911523",
+// 		"444747819283133684179",
+// 		"42",
+// 	},
+// }
+
+// func TestExpNN(t *testing.T) {
+// 	for i, test := range expNNTests {
+// 		x := decFromString(test.x)
+// 		y := decFromString(test.y)
+// 		out := decFromString(test.out)
+
+// 		var m nat
+// 		if len(test.m) > 0 {
+// 			m = decFromString(test.m)
+// 		}
+
+// 		z := nat(nil).expNN(x, y, m)
+// 		if z.cmp(out) != 0 {
+// 			t.Errorf("#%d got %s want %s", i, z.utoa(10), out.utoa(10))
+// 		}
+// 	}
+// }
+
+// func BenchmarkExp3Power(b *testing.B) {
+// 	const x = 3
+// 	for _, y := range []Word{
+// 		0x10, 0x40, 0x100, 0x400, 0x1000, 0x4000, 0x10000, 0x40000, 0x100000, 0x400000,
+// 	} {
+// 		b.Run(fmt.Sprintf("%#x", y), func(b *testing.B) {
+// 			var z nat
+// 			for i := 0; i < b.N; i++ {
+// 				z.expWW(x, y)
+// 			}
+// 		})
+// 	}
+// }
+
+// func fibo(n int) nat {
+// 	switch n {
+// 	case 0:
+// 		return nil
+// 	case 1:
+// 		return nat{1}
+// 	}
+// 	f0 := fibo(0)
+// 	f1 := fibo(1)
+// 	var f2 nat
+// 	for i := 1; i < n; i++ {
+// 		f2 = f2.add(f0, f1)
+// 		f0, f1, f2 = f1, f2, f0
+// 	}
+// 	return f1
+// }
+
+// var fiboNums = []string{
+// 	"0",
+// 	"55",
+// 	"6765",
+// 	"832040",
+// 	"102334155",
+// 	"12586269025",
+// 	"1548008755920",
+// 	"190392490709135",
+// 	"23416728348467685",
+// 	"2880067194370816120",
+// 	"354224848179261915075",
+// }
+
+// func TestFibo(t *testing.T) {
+// 	for i, want := range fiboNums {
+// 		n := i * 10
+// 		got := string(fibo(n).utoa(10))
+// 		if got != want {
+// 			t.Errorf("fibo(%d) failed: got %s want %s", n, got, want)
+// 		}
+// 	}
+// }
+
+// func BenchmarkFibo(b *testing.B) {
+// 	for i := 0; i < b.N; i++ {
+// 		fibo(1e0)
+// 		fibo(1e1)
+// 		fibo(1e2)
+// 		fibo(1e3)
+// 		fibo(1e4)
+// 		fibo(1e5)
+// 	}
+// }
+
+// var bitTests = []struct {
+// 	x    string
+// 	i    uint
+// 	want uint
+// }{
+// 	{"0", 0, 0},
+// 	{"0", 1, 0},
+// 	{"0", 1000, 0},
+
+// 	{"0x1", 0, 1},
+// 	{"0x10", 0, 0},
+// 	{"0x10", 3, 0},
+// 	{"0x10", 4, 1},
+// 	{"0x10", 5, 0},
+
+// 	{"0x8000000000000000", 62, 0},
+// 	{"0x8000000000000000", 63, 1},
+// 	{"0x8000000000000000", 64, 0},
+
+// 	{"0x3" + strings.Repeat("0", 32), 127, 0},
+// 	{"0x3" + strings.Repeat("0", 32), 128, 1},
+// 	{"0x3" + strings.Repeat("0", 32), 129, 1},
+// 	{"0x3" + strings.Repeat("0", 32), 130, 0},
+// }
+
+// func TestBit(t *testing.T) {
+// 	for i, test := range bitTests {
+// 		x := decFromString(test.x)
+// 		if got := x.bit(test.i); got != test.want {
+// 			t.Errorf("#%d: %s.bit(%d) = %v; want %v", i, test.x, test.i, got, test.want)
+// 		}
+// 	}
+// }
+
+var stickyTests = []struct {
+	x    string
+	i    uint
+	want uint
+}{
+	{"0", 0, 0},
+	{"0", 1, 0},
+	{"0", 1000, 0},
+
+	{"1", 0, 0},
+	{"1", 1, 1},
+
+	{"1001101010000", 0, 0},
+	{"1001101010000", 4, 0},
+	{"1001101010000", 5, 1},
+
+	{"1000000000000000000", 18, 0},
+	{"1000000000000000000", 29, 1},
+
+	{"1" + strings.Repeat("0", 100), 100, 0},
+	{"1" + strings.Repeat("0", 100), 101, 1},
+}
+
+func TestSticky(t *testing.T) {
+	for i, test := range stickyTests {
+		x := decFromString(test.x)
+		if got := x.sticky(test.i); got != test.want {
+			t.Errorf("#%d: %s.sticky(%d) = %v; want %v", i, test.x, test.i, got, test.want)
+		}
+		if test.want == 1 {
+			// all subsequent i's should also return 1
+			for d := uint(1); d <= 3; d++ {
+				if got := x.sticky(test.i + d); got != 1 {
+					t.Errorf("#%d: %s.sticky(%d) = %v; want %v", i, test.x, test.i+d, got, 1)
+				}
 			}
+		}
+	}
+}
+
+func testSqr(t *testing.T, x dec) {
+	got := make(dec, 2*len(x))
+	want := make(dec, 2*len(x))
+	got = got.sqr(x)
+	want = want.mul(x, x)
+	if got.cmp(want) != 0 {
+		t.Errorf("basicSqr(%v), got %v, want %v", x, got, want)
+	}
+}
+
+func TestSqr(t *testing.T) {
+	for _, a := range prodNN {
+		if a.x != nil {
+			testSqr(t, a.x)
+		}
+		if a.y != nil {
+			testSqr(t, a.y)
+		}
+		if a.z != nil {
+			testSqr(t, a.z)
+		}
+	}
+}
+
+func benchmarkDecSqr(b *testing.B, nwords int) {
+	x := rndDec(nwords)
+	var z dec
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		z.sqr(x)
+	}
+}
+
+var sqrBenchSizes = []int{
+	1, 2, 3, 5, 8, 10, 20, 30, 50, 80,
+	100, 200, 300, 500, 800,
+	1000, 10000, 100000,
+}
+
+func BenchmarkDecSqr(b *testing.B) {
+	for _, n := range sqrBenchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
+		b.Run(fmt.Sprintf("%d", n), func(b *testing.B) {
+			benchmarkDecSqr(b, n)
 		})
 	}
 }
+
+// func BenchmarkNatSetBytes(b *testing.B) {
+// 	const maxLength = 128
+// 	lengths := []int{
+// 		// No remainder:
+// 		8, 24, maxLength,
+// 		// With remainder:
+// 		7, 23, maxLength - 1,
+// 	}
+// 	n := make(nat, maxLength/_W) // ensure n doesn't need to grow during the test
+// 	buf := make([]byte, maxLength)
+// 	for _, l := range lengths {
+// 		b.Run(fmt.Sprint(l), func(b *testing.B) {
+// 			for i := 0; i < b.N; i++ {
+// 				n.setBytes(buf[:l])
+// 			}
+// 		})
+// 	}
+// }
+
+func TestNatDiv(t *testing.T) {
+	sizes := []int{
+		1, 2, 5, 8, 15, 25, 40, 65, 100,
+		200, 500, 800, 1500, 2500, 4000, 6500, 10000,
+	}
+	for _, i := range sizes {
+		for _, j := range sizes {
+			a := rndDec1(i)
+			b := rndDec1(j)
+			// the test requires b >= 2
+			if len(b) == 1 && b[0] == 1 {
+				b[0] = 2
+			}
+			// choose a remainder c < b
+			c := rndDec1(len(b))
+			if len(c) == len(b) && c[len(c)-1] >= b[len(b)-1] {
+				c[len(c)-1] = 0
+				c = c.norm()
+			}
+			// compute x = a*b+c
+			x := dec(nil).mul(a, b)
+			x = x.add(x, c)
+
+			var q, r dec
+			q, r = q.div(r, x, b)
+			if q.cmp(a) != 0 {
+				t.Fatalf("wrong quotient: got %s; want %s for %s/%s", q.utoa(10), a.utoa(10), x.utoa(10), b.utoa(10))
+			}
+			if r.cmp(c) != 0 {
+				t.Fatalf("wrong remainder: got %s; want %s for %s/%s", r.utoa(10), c.utoa(10), x.utoa(10), b.utoa(10))
+			}
+		}
+	}
+}
diff --git a/decimal_test.go b/decimal_test.go
index 8284923..6e2e58c 100644
--- a/decimal_test.go
+++ b/decimal_test.go
@@ -7,7 +7,6 @@ import (
 	"reflect"
 	"strconv"
 	"testing"
-	"time"
 )
 
 var benchU uint
@@ -32,7 +31,6 @@ var intData = []struct {
 }
 
 func TestDnorm(t *testing.T) {
-	rand.Seed(time.Now().UnixNano())
 	for i := 0; i < 10000; i++ {
 	again:
 		w := uint(rand.Uint64()) % _BD
@@ -99,7 +97,6 @@ func TestDecimal_SetString(t *testing.T) {
 }
 
 func BenchmarkDnorm(b *testing.B) {
-	rand.Seed(0xdeadbeefbadf00d)
 	d := dec(nil).make(1000)
 	for i := range d {
 		d[i] = Word(rand.Uint64()) % _BD
diff --git a/stdlib.go b/stdlib.go
index 6dfff78..64aedba 100644
--- a/stdlib.go
+++ b/stdlib.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"math"
 	"math/bits"
+	"math/rand"
 	"strconv"
 )
 
@@ -86,7 +87,7 @@ func makeAcc(above bool) Accuracy {
 type Word uint
 
 const (
-	// _S = _W / 8 // word size in bytes
+	_S = _W / 8 // word size in bytes
 
 	_W = bits.UintSize // word size in bits
 	// _B = 1 << _W       // digit base
@@ -140,6 +141,12 @@ func mulAddWWW(x, y, c Word) (z1, z0 Word) {
 	return Word(hi + cc), Word(lo)
 }
 
+// z1<<_W + z0 = x*y
+func mulWW(x, y Word) (z1, z0 Word) {
+	hi, lo := bits.Mul(uint(x), uint(y))
+	return Word(hi), Word(lo)
+}
+
 func same(x, y []Word) bool {
 	return len(x) == len(y) && len(x) > 0 && &x[0] == &y[0]
 }
@@ -272,3 +279,8 @@ type ErrNaN struct {
 func (err ErrNaN) Error() string {
 	return err.msg
 }
+
+var rnd = rand.New(rand.NewSource(0))
+
+// TODO(db47h): set this to false
+const isRaceBuilder = true