dec: add calibration tests and update thresholds

db47h · May 22, 2020 · af127d2 · af127d2
1 parent 00e29bc
commit af127d2
Show file tree

Hide file tree

Showing 3 changed files with 196 additions and 18 deletions.
diff --git a/dec.go b/dec.go
@@ -10,6 +10,20 @@ import (
 	"sync"
 )
 
+// The following thresholds are hugely different from their counterparts
+// in math/big.
+
+// Operands that are shorter than decKaratsubaThreshold are multiplied using
+// "grade school" multiplication; for longer operands the Karatsuba algorithm
+// is used.
+var decKaratsubaThreshold = 30 // computed by calibrate_test.go
+
+// Operands that are shorter than decBasicSqrThreshold are squared using
+// "grade school" multiplication; for operands longer than decKaratsubaSqrThreshold
+// we use the Karatsuba algorithm optimized for x == y.
+var decBasicSqrThreshold = 10     // computed by calibrate_test.go
+var decKaratsubaSqrThreshold = 50 // computed by calibrate_test.go
+
 // dec is an unsigned integer x of the form
 //
 //   x = x[n-1]*_BD^(n-1) + x[n-2]*_BD^(n-2) + ... + x[1]*_BD + x[0]
@@ -569,12 +583,12 @@ func (z dec) sqr(x dec) dec {
 		z = nil // z is an alias for x - cannot reuse
 	}
 
-	if n < basicSqrThreshold {
+	if n < decBasicSqrThreshold {
 		z = z.make(2 * n)
 		decBasicMul(z, x, x)
 		return z.norm()
 	}
-	if n < karatsubaSqrThreshold {
+	if n < decKaratsubaSqrThreshold {
 		z = z.make(2 * n)
 		decBasicSqr(z, x)
 		return z.norm()
@@ -584,7 +598,7 @@ func (z dec) sqr(x dec) dec {
 
 	// z = (x1*b + x0)^2 = x1^2*b^2 + 2*x1*x0*b + x0^2
 
-	k := karatsubaLen(n, karatsubaSqrThreshold)
+	k := karatsubaLen(n, decKaratsubaSqrThreshold)
 
 	x0 := x[0:k]
 	z = z.make(max(6*k, 2*n))
@@ -639,7 +653,7 @@ func decBasicSqr(z, x dec) {
 func decKaratsubaSqr(z, x dec) {
 	n := len(x)
 
-	if n&1 != 0 || n < karatsubaSqrThreshold || n < 2 {
+	if n&1 != 0 || n < decKaratsubaSqrThreshold || n < 2 {
 		decBasicSqr(z[:2*n], x)
 		return
 	}
@@ -698,7 +712,7 @@ func (z dec) mul(x, y dec) dec {
 	}
 
 	// use basic multiplication if the numbers are small
-	if n < karatsubaThreshold {
+	if n < decKaratsubaThreshold {
 		z = z.make(m + n)
 		decBasicMul(z, x, y)
 		return z.norm()
@@ -711,7 +725,7 @@ func (z dec) mul(x, y dec) dec {
 	//   y = yh*b + y0  (0 <= y0 < b)
 	//   b = 10**(_DW*k)  ("base" of digits xi, yi)
 	//
-	k := karatsubaLen(n, karatsubaThreshold)
+	k := karatsubaLen(n, decKaratsubaThreshold)
 	// k <= n
 
 	// // multiply x0 and y0 via Karatsuba
@@ -959,7 +973,7 @@ func decKaratsuba(z, x, y dec) {
 	// Switch to basic multiplication if numbers are odd or small.
 	// (n is always even if karatsubaThreshold is even, but be
 	// conservative)
-	if n&1 != 0 || n < karatsubaThreshold || n < 2 {
+	if n&1 != 0 || n < decKaratsubaThreshold || n < 2 {
 		decBasicMul(z, x, y)
 		return
 	}

diff --git a/dec_calibrate_test.go b/dec_calibrate_test.go
@@ -0,0 +1,175 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Calibration used to determine thresholds for using
+// different algorithms.  Ideally, this would be converted
+// to go generate to create thresholds.go
+
+// This file prints execution times for the Mul benchmark
+// given different Karatsuba thresholds. The result may be
+// used to manually fine-tune the threshold constant. The
+// results are somewhat fragile; use repeated runs to get
+// a clear picture.
+
+// Calculates lower and upper thresholds for when basicSqr
+// is faster than standard multiplication.
+
+// Usage: go test -run=TestDecCalibrate -v -calibrate -cpu 1
+// Forcing a single logical CPU seems to yield more stable
+// benchmarks.
+
+package decimal
+
+import (
+	"flag"
+	"fmt"
+	"testing"
+	"time"
+)
+
+var calibrate = flag.Bool("calibrate", false, "run calibration test")
+
+const (
+	sqrModeMul       = "mul(x, x)"
+	sqrModeBasic     = "basicSqr(x)"
+	sqrModeKaratsuba = "karatsubaSqr(x)"
+)
+
+func TestDecCalibrate(t *testing.T) {
+	if !*calibrate {
+		return
+	}
+
+	computeKaratsubaThresholds()
+
+	// compute decBasicSqrThreshold where overhead becomes negligible
+	minSqr := computeSqrThreshold(5, 20, 1, 3, sqrModeMul, sqrModeBasic)
+	// compute decKaratsubaSqrThreshold where Karatsuba is faster
+	maxSqr := computeSqrThreshold(30, 300, 10, 3, sqrModeBasic, sqrModeKaratsuba)
+	if minSqr != 0 {
+		fmt.Printf("found basicSqrThreshold = %d\n", minSqr)
+	} else {
+		fmt.Println("no basicSqrThreshold found")
+	}
+	if maxSqr != 0 {
+		fmt.Printf("found karatsubaSqrThreshold = %d\n", maxSqr)
+	} else {
+		fmt.Println("no karatsubaSqrThreshold found")
+	}
+}
+
+func karatsubaLoad(b *testing.B) {
+	BenchmarkDecMul1e4(b)
+}
+
+// measureKaratsuba returns the time to run a Karatsuba-relevant benchmark
+// given Karatsuba threshold th.
+func measureKaratsuba(th int) time.Duration {
+	th, decKaratsubaThreshold = decKaratsubaThreshold, th
+	res := testing.Benchmark(karatsubaLoad)
+	decKaratsubaThreshold = th
+	return time.Duration(res.NsPerOp())
+}
+
+func computeKaratsubaThresholds() {
+	fmt.Printf("Multiplication times for varying Karatsuba thresholds\n")
+	fmt.Printf("(run repeatedly for good results)\n")
+
+	// determine Tb, the work load execution time using basic multiplication
+	Tb := measureKaratsuba(1e9) // th == 1e9 => Karatsuba multiplication disabled
+	fmt.Printf("Tb = %10s\n", Tb)
+
+	// thresholds
+	th := 4
+	th1 := -1
+	th2 := -1
+
+	var deltaOld time.Duration
+	for count := -1; count != 0 && th < 128; count-- {
+		// determine Tk, the work load execution time using Karatsuba multiplication
+		Tk := measureKaratsuba(th)
+
+		// improvement over Tb
+		delta := (Tb - Tk) * 100 / Tb
+
+		fmt.Printf("th = %3d  Tk = %10s  %4d%%", th, Tk, delta)
+
+		// determine break-even point
+		if Tk < Tb && th1 < 0 {
+			th1 = th
+			fmt.Print("  break-even point")
+		}
+
+		// determine diminishing return
+		if 0 < delta && delta < deltaOld && th2 < 0 {
+			th2 = th
+			fmt.Print("  diminishing return")
+		}
+		deltaOld = delta
+
+		fmt.Println()
+
+		// trigger counter
+		if th1 >= 0 && th2 >= 0 && count < 0 {
+			count = 10 // this many extra measurements after we got both thresholds
+		}
+
+		th++
+	}
+}
+
+func measureSqr(words, nruns int, mode string) time.Duration {
+	// save the thresholds for restoring below (nruns > 1: more runs for better statistics)
+	initBasicSqr, initKaratsubaSqr := decBasicSqrThreshold, decKaratsubaSqrThreshold
+
+	switch mode {
+	case sqrModeMul:
+		decBasicSqrThreshold = words + 1
+	case sqrModeBasic:
+		decBasicSqrThreshold, decKaratsubaSqrThreshold = words-1, words+1
+	case sqrModeKaratsuba:
+		decKaratsubaSqrThreshold = words - 1
+	}
+
+	var testval int64
+	for i := 0; i < nruns; i++ {
+		res := testing.Benchmark(func(b *testing.B) { benchmarkDecSqr(b, words) })
+		testval += res.NsPerOp()
+	}
+	testval /= int64(nruns)
+
+	decBasicSqrThreshold, decKaratsubaSqrThreshold = initBasicSqr, initKaratsubaSqr
+
+	return time.Duration(testval)
+}
+
+func computeSqrThreshold(from, to, step, nruns int, lower, upper string) int {
+	fmt.Printf("Calibrating threshold between %s and %s\n", lower, upper)
+	fmt.Printf("Looking for a timing difference for x between %d - %d words by %d step\n", from, to, step)
+	var initPos bool
+	var threshold int
+	for i := from; i <= to; i += step {
+		baseline := measureSqr(i, nruns, lower)
+		testval := measureSqr(i, nruns, upper)
+		pos := baseline > testval
+		delta := baseline - testval
+		percent := delta * 100 / baseline
+		fmt.Printf("words = %3d deltaT = %10s (%4d%%) is %s better: %v", i, delta, percent, upper, pos)
+		if i == from {
+			initPos = pos
+		}
+		if threshold == 0 && pos != initPos {
+			threshold = i
+			fmt.Printf("  threshold  found")
+		}
+		fmt.Println()
+
+	}
+	if threshold != 0 {
+		fmt.Printf("Found threshold = %d between %d - %d\n", threshold, from, to)
+	} else {
+		fmt.Printf("Found NO threshold between %d - %d\n", from, to)
+	}
+	return threshold
+}
diff --git a/stdlib.go b/stdlib.go
@@ -249,17 +249,6 @@ type nat []Word
 
 const divRecursiveThreshold = 100
 
-// Operands that are shorter than karatsubaThreshold are multiplied using
-// "grade school" multiplication; for longer operands the Karatsuba algorithm
-// is used.
-const karatsubaThreshold = 40 // computed by calibrate_test.go
-
-// Operands that are shorter than basicSqrThreshold are squared using
-// "grade school" multiplication; for operands longer than karatsubaSqrThreshold
-// we use the Karatsuba algorithm optimized for x == y.
-var basicSqrThreshold = 20      // computed by calibrate_test.go
-var karatsubaSqrThreshold = 260 // computed by calibrate_test.go
-
 // karatsubaLen computes an approximation to the maximum k <= n such that
 // k = p/10**i for a number p <= threshold and an i >= 0. Thus, the
 // result is the largest number that can be divided repeatedly by 10 before