From 5821f1083ea8a07f9b0c13a29afbeccd89466914 Mon Sep 17 00:00:00 2001
From: Denis Bernard <db047h@gmail.com>
Date: Thu, 14 May 2020 02:21:15 +0200
Subject: [PATCH] dec/amd64: implement add10VW/sub10VW
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

    name             go time/op     asm time/op     delta
    Sub10VW/1         10.1ns ± 2%      5.6ns ± 1%   -45.03%
    Sub10VW/2         11.7ns ± 1%      6.1ns ± 1%   -48.01%
    Sub10VW/3         13.2ns ± 2%      8.1ns ± 0%   -39.03%
    Sub10VW/4         14.6ns ± 0%      8.4ns ± 0%   -42.77%
    Sub10VW/5         14.9ns ± 1%      8.8ns ± 0%   -40.66%
    Sub10VW/10        15.1ns ± 0%     10.6ns ± 3%   -30.12%
    Sub10VW/100        116ns ± 1%       45ns ± 6%   -61.62%
    Sub10VW/1000      1.22µs ± 1%     0.54µs ±14%   -55.85%
    Sub10VW/10000     11.9µs ± 0%      5.4µs ± 1%   -54.85%
    Sub10VW/100000     122µs ± 0%       62µs ± 0%   -49.45%

The Go implementation can check whether the carry is zero and switch to
copy() for free (no need for separate add10VW and add10VWlarge variants).
In the assembly version I chose to keep a single implementation of the
function and switch to a memcpy loop whenever the carry is 0 (checked every
4 Words). Since the carry is almost always 0, this logic is the likely
cause of the smaller speedup between 5 and 15 Words.

Also, past 1000 Words the performance gains slowly decrease. The most
likely cause is the simplistic memcpy loop, which is slower than
runtime·memmove.
---
 arith_test.go     |   2 +-
 dec.go            |   4 +-
 dec_arith.go      |  26 ++--
 dec_arith_amd64.s | 367 +++++++++++++++++++++++++++++----------------
 dec_arith_decl.go |  10 +-
 dec_arith_test.go | 373 +++++++++++++++++++++++++---------------------
 decimal_test.go   |   4 +-
 go.mod            |   2 -
 testdata/bench    |  12 --
 testdata/benchasm |  14 ++
 testdata/benchgit |  23 +--
 11 files changed, 494 insertions(+), 343 deletions(-)
 delete mode 100755 testdata/bench
 create mode 100755 testdata/benchasm

diff --git a/arith_test.go b/arith_test.go
index 4b5a1f0..00f39c9 100644
--- a/arith_test.go
+++ b/arith_test.go
@@ -14,7 +14,7 @@ import (
 var isRaceBuilder bool
 
 func init() {
-	flag.BoolVar(&isRaceBuilder, "rb", true, "race builder")
+	flag.BoolVar(&isRaceBuilder, "rb", false, "race builder")
 }
 
 type funVV func(z, x, y []Word) (c Word)
diff --git a/dec.go b/dec.go
index 5962c58..6c4a05d 100644
--- a/dec.go
+++ b/dec.go
@@ -64,7 +64,7 @@ func (x dec) digit(i uint) uint {
 		return 0
 	}
 	// 0 <= j < len(x)
-	return (uint(x[j]) / pow10(i)) % 10
+	return uint(x[j]/pow10(i)) % 10
 }
 
 func (z dec) make(n int) dec {
@@ -166,7 +166,7 @@ func (x dec) sticky(i uint) uint {
 			return 1
 		}
 	}
-	if uint(x[j])%pow10(i) != 0 {
+	if x[j]%pow10(i) != 0 {
 		return 1
 	}
 	return 0
diff --git a/dec_arith.go b/dec_arith.go
index c1d6d29..75a3338 100644
--- a/dec_arith.go
+++ b/dec_arith.go
@@ -25,7 +25,7 @@ var pow10s = [...]uint64{
 	10000000000000000, 100000000000000000, 1000000000000000000, 10000000000000000000,
 }
 
-func pow10(n uint) uint { return uint(pow10s[n]) }
+func pow10(n uint) Word { return Word(pow10s[n]) }
 
 var maxDigits = [...]uint{
 	1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5,
@@ -54,7 +54,7 @@ func decDigits64(x uint64) (n uint) {
 
 func decDigits32(x uint) (n uint) {
 	n = maxDigits[bits.Len(x)]
-	if x < pow10(n-1) {
+	if x < uint(pow10(n-1)) {
 		n--
 	}
 	return n
@@ -148,12 +148,9 @@ func div10WW_g(u1, u0, v Word) (q, r Word) {
 
 func add10WWW_g(x, y, cIn Word) (s, c Word) {
 	r, cc := bits.Add(uint(x), uint(y), uint(cIn))
-	// if cc != 0 || r > _DB-1 {
-	// 	cc = 1
-	// 	r -= _DB
-	// }
-	// c1 := uint(int(r-_DB) >> 63)
 	var c1 uint
+	// this simple if statement is compiled without jumps
+	// at least on amd64.
 	if r >= _DB {
 		c1 = 1
 	}
@@ -187,7 +184,10 @@ func sub10VV_g(z, x, y []Word) (c Word) {
 }
 
 // add10VW adds y to x. The resulting carry c is either 0 or 1.
-func add10VW_g(z, x dec, y Word) (c Word) {
+func add10VW_g(z, x []Word, y Word) (c Word) {
+	if len(z) == 0 {
+		return y
+	}
 	z[0], c = add10WWW_g(x[0], y, 0)
 	// propagate carry
 	for i := 1; i < len(z) && i < len(x); i++ {
@@ -219,7 +219,7 @@ func sub10VW_g(z, x []Word, y Word) (c Word) {
 }
 
 // shl10VU sets z to x*(10**s), s < _WD
-func shl10VU_g(z, x dec, s uint) (r Word) {
+func shl10VU_g(z, x []Word, s uint) (r Word) {
 	if s == 0 {
 		copy(z, x)
 		return
@@ -227,7 +227,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) {
 	if len(z) == 0 || len(x) == 0 {
 		return
 	}
-	d, m := Word(pow10(_DW-s)), Word(pow10(s))
+	d, m := pow10(_DW-s), pow10(s)
 	var h, l Word
 	r, l = divWW(0, x[len(x)-1], d)
 	for i := len(z) - 1; i > 0; i-- {
@@ -241,7 +241,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) {
 }
 
 // shr10VU sets z to x/(10**s)
-func shr10VU_g(z, x dec, s uint) (r Word) {
+func shr10VU_g(z, x []Word, s uint) (r Word) {
 	if s == 0 {
 		copy(z, x)
 		return
@@ -251,7 +251,7 @@ func shr10VU_g(z, x dec, s uint) (r Word) {
 	}
 
 	var h, l Word
-	d, m := Word(pow10(s)), Word(pow10(_DW-s))
+	d, m := pow10(s), pow10(_DW-s)
 	h, r = divWW(0, x[0], Word(d))
 	for i := 1; i < len(z) && i < len(x); i++ {
 		t := h
@@ -259,7 +259,7 @@ func shr10VU_g(z, x dec, s uint) (r Word) {
 		z[i-1] = t + l*m
 	}
 	z[len(z)-1] = h
-	return r
+	return r * m
 }
 
 func mulAdd10VWW_g(z, x []Word, y, r Word) (c Word) {
diff --git a/dec_arith_amd64.s b/dec_arith_amd64.s
index c095405..2d3d685 100644
--- a/dec_arith_amd64.s
+++ b/dec_arith_amd64.s
@@ -127,8 +127,7 @@ U1:	// n >= 0
 	SBBQ BX, BX
 	ORQ BX, CX
 	LEAQ 1(DX), AX
-	MOVQ CX, BX
-	ANDQ BX, AX
+	ANDQ CX, AX
 	SUBQ AX, R11
 	ADDQ CX, CX			// restore CF
 	ADCQ 8(R9)(SI*8), R12
@@ -137,8 +136,7 @@ U1:	// n >= 0
 	SBBQ BX, BX
 	ORQ BX, CX
 	LEAQ 1(DX), AX
-	MOVQ CX, BX
-	ANDQ BX, AX
+	ANDQ CX, AX
 	SUBQ AX, R12
 	ADDQ CX, CX			// restore CF
 	ADCQ 16(R9)(SI*8), R13
@@ -147,8 +145,7 @@ U1:	// n >= 0
 	SBBQ BX, BX
 	ORQ BX, CX
 	LEAQ 1(DX), AX
-	MOVQ CX, BX
-	ANDQ BX, AX
+	ANDQ CX, AX
 	SUBQ AX, R13
 	ADDQ CX, CX			// restore CF
 	ADCQ 24(R9)(SI*8), R14
@@ -157,8 +154,7 @@ U1:	// n >= 0
 	SBBQ BX, BX
 	ORQ BX, CX
 	LEAQ 1(DX), AX
-	MOVQ CX, BX
-	ANDQ BX, AX
+	ANDQ CX, AX
 	SUBQ AX, R14
 	MOVQ R11, 0(R10)(SI*8)
 	MOVQ R12, 8(R10)(SI*8)
@@ -181,20 +177,19 @@ L1:	// n > 0
 	SBBQ BX, BX
 	ORQ BX, CX
 	LEAQ 1(DX), AX
-	MOVQ CX, BX
-	ANDQ BX, AX
+	ANDQ CX, AX
 	SUBQ AX, R11
 	MOVQ R11, 0(R10)(SI*8)
 
-	ADDQ $1, SI		// i++
-	SUBQ $1, DI		// n--
+	INCQ SI			// i++
+	DECQ DI			// n--
 	JG L1			// if n > 0 goto L1
 
 E1: NEGQ CX
 	MOVQ CX, c+72(FP)	// return c
 	RET
 
-// func add10VV(z, x, y []Word) (c Word)
+// func sub10VV(z, x, y []Word) (c Word)
 TEXT ·sub10VV(SB),NOSPLIT,$0
 	MOVQ z_len+8(FP), DI
 	MOVQ x+24(FP), R8
@@ -261,126 +256,244 @@ L2:	// n > 0
 	ADDQ AX, R11
 	MOVQ R11, 0(R10)(SI*8)
 
-	ADDQ $1, SI		// i++
-	SUBQ $1, DI		// n--
+	INCQ SI			// i++
+	DECQ DI			// n--
 	JG L2			// if n > 0 goto L2
 
 E2:	NEGQ CX
 	MOVQ CX, c+72(FP)	// return c
 	RET
 
-// // func addVW(z, x []Word, y Word) (c Word)
-// TEXT ·addVW(SB),NOSPLIT,$0
-// 	MOVQ z_len+8(FP), DI
-// 	CMPQ DI, $32
-// 	JG large
-// 	MOVQ x+24(FP), R8
-// 	MOVQ y+48(FP), CX	// c = y
-// 	MOVQ z+0(FP), R10
-// 
-// 	MOVQ $0, SI		// i = 0
-// 
-// 	// s/JL/JMP/ below to disable the unrolled loop
-// 	SUBQ $4, DI		// n -= 4
-// 	JL V3			// if n < 4 goto V3
-// 
-// U3:	// n >= 0
-// 	// regular loop body unrolled 4x
-// 	MOVQ 0(R8)(SI*8), R11
-// 	MOVQ 8(R8)(SI*8), R12
-// 	MOVQ 16(R8)(SI*8), R13
-// 	MOVQ 24(R8)(SI*8), R14
-// 	ADDQ CX, R11
-// 	ADCQ $0, R12
-// 	ADCQ $0, R13
-// 	ADCQ $0, R14
-// 	SBBQ CX, CX		// save CF
-// 	NEGQ CX
-// 	MOVQ R11, 0(R10)(SI*8)
-// 	MOVQ R12, 8(R10)(SI*8)
-// 	MOVQ R13, 16(R10)(SI*8)
-// 	MOVQ R14, 24(R10)(SI*8)
-// 
-// 	ADDQ $4, SI		// i += 4
-// 	SUBQ $4, DI		// n -= 4
-// 	JGE U3			// if n >= 0 goto U3
-// 
-// V3:	ADDQ $4, DI		// n += 4
-// 	JLE E3			// if n <= 0 goto E3
-// 
-// L3:	// n > 0
-// 	ADDQ 0(R8)(SI*8), CX
-// 	MOVQ CX, 0(R10)(SI*8)
-// 	SBBQ CX, CX		// save CF
-// 	NEGQ CX
-// 
-// 	ADDQ $1, SI		// i++
-// 	SUBQ $1, DI		// n--
-// 	JG L3			// if n > 0 goto L3
-// 
-// E3:	MOVQ CX, c+56(FP)	// return c
-// 	RET
-// large:
-// 	JMP ·addVWlarge(SB)
-// 
-// 
-// // func subVW(z, x []Word, y Word) (c Word)
-// // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
-// TEXT ·subVW(SB),NOSPLIT,$0
-// 	MOVQ z_len+8(FP), DI
-// 	CMPQ DI, $32
-// 	JG large
-// 	MOVQ x+24(FP), R8
-// 	MOVQ y+48(FP), CX	// c = y
-// 	MOVQ z+0(FP), R10
-// 
-// 	MOVQ $0, SI		// i = 0
-// 
-// 	// s/JL/JMP/ below to disable the unrolled loop
-// 	SUBQ $4, DI		// n -= 4
-// 	JL V4			// if n < 4 goto V4
-// 
-// U4:	// n >= 0
-// 	// regular loop body unrolled 4x
-// 	MOVQ 0(R8)(SI*8), R11
-// 	MOVQ 8(R8)(SI*8), R12
-// 	MOVQ 16(R8)(SI*8), R13
-// 	MOVQ 24(R8)(SI*8), R14
-// 	SUBQ CX, R11
-// 	SBBQ $0, R12
-// 	SBBQ $0, R13
-// 	SBBQ $0, R14
-// 	SBBQ CX, CX		// save CF
-// 	NEGQ CX
-// 	MOVQ R11, 0(R10)(SI*8)
-// 	MOVQ R12, 8(R10)(SI*8)
-// 	MOVQ R13, 16(R10)(SI*8)
-// 	MOVQ R14, 24(R10)(SI*8)
-// 
-// 	ADDQ $4, SI		// i += 4
-// 	SUBQ $4, DI		// n -= 4
-// 	JGE U4			// if n >= 0 goto U4
-// 
-// V4:	ADDQ $4, DI		// n += 4
-// 	JLE E4			// if n <= 0 goto E4
-// 
-// L4:	// n > 0
-// 	MOVQ 0(R8)(SI*8), R11
-// 	SUBQ CX, R11
-// 	MOVQ R11, 0(R10)(SI*8)
-// 	SBBQ CX, CX		// save CF
-// 	NEGQ CX
-// 
-// 	ADDQ $1, SI		// i++
-// 	SUBQ $1, DI		// n--
-// 	JG L4			// if n > 0 goto L4
-// 
-// E4:	MOVQ CX, c+56(FP)	// return c
-// 	RET
-// large:
-// 	JMP ·subVWlarge(SB)
-// 
-// 
+// func add10VW(z, x []Word, y Word) (c Word)
+TEXT ·add10VW(SB),NOSPLIT,$0
+	MOVQ z_len+8(FP), DI	// n = len(z)
+	MOVQ x+24(FP), R8	// R8 = &x[0]
+	MOVQ y+48(FP), CX	// c = y
+	MOVQ z+0(FP), R10	// R10 = &z[0]
+
+	MOVQ $0, SI			// i = 0
+	MOVQ $_DB, DX		// DX = _DB
+
+	// Once we start looping, the hardware carry (CF) can be ignored: c <= 1
+	// and x[i] < _DB, so x[i] + c always fits in 64 bits.
+	// Only the first element, where c = y, needs the CF check.
+
+	DECQ DI			// n--
+	JL E3			// if n < 0 (len(z) == 0) return c = y
+	ADDQ 0(R8)(SI*8), CX	// CX = x[0] + y, CF = 64-bit overflow
+	LEAQ -1(DX), AX	// AX = _DB - 1 (LEAQ leaves flags untouched)
+	SBBQ BX, BX		// BX = CF ? -1 : 0
+	CMPQ AX, CX		// CF = (_DB - 1 < CX)
+	SBBQ AX, AX		// AX = CX >= _DB ? -1 : 0
+	ORQ AX, BX		// BX = -1 if either test signalled a carry
+	MOVQ DX, AX
+	ANDQ BX, AX		// AX = carry ? _DB : 0
+	SUBQ AX, CX		// subtract _DB from the sum on carry
+	NEGQ BX			// convert to C = 0/1; its flags feed the JG below
+	MOVQ CX, 0(R10)(SI*8)	// z[0] = CX
+	MOVQ BX, CX		// save c
+	LEAQ 1(SI), SI	// i++ (LEAQ keeps the NEGQ flags)
+	JG T3			// if c != 0 propagate
+	SUBQ $4, DI		// c == 0: n -= 4, only a copy of the remaining Words is left
+	JL CV3			// if n < 4 goto CV3
+	CMPQ R8, R10
+	JEQ E3			// don't copy if &x[0] == &z[0] (returns c = 0)
+	JMP CU3
+
+T3:	// c != 0: propagate the carry
+	// s/JL/JMP/ below to disable the unrolled loop
+	SUBQ $4, DI		// n -= 4
+	JL V3			// if n < 4 goto V3
+
+U3:	// n >= 0
+	// regular loop body unrolled 4x
+	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
+	CMPQ CX, DX		// CF = (CX < _DB)
+	SBBQ BX, BX		// BX = CX < _DB ? -1 : 0
+	ANDQ BX, CX		// CX = 0 if CX >= _DB
+	MOVQ CX, 0(R10)(SI*8)	// z[i] = CX
+	LEAQ 1(BX), CX	// c = BX + 1 (0 or 1)
+	ADDQ 8(R8)(SI*8), CX	// same steps for x[i+1]
+	CMPQ CX, DX
+	SBBQ BX, BX
+	ANDQ BX, CX
+	MOVQ CX, 8(R10)(SI*8)
+	LEAQ 1(BX), CX
+	ADDQ 16(R8)(SI*8), CX	// same steps for x[i+2]
+	CMPQ CX, DX
+	SBBQ BX, BX
+	ANDQ BX, CX
+	MOVQ CX, 16(R10)(SI*8)
+	LEAQ 1(BX), CX
+	ADDQ 24(R8)(SI*8), CX	// same steps for x[i+3]
+	CMPQ CX, DX
+	SBBQ BX, BX
+	ANDQ BX, CX
+	MOVQ CX, 24(R10)(SI*8)
+	LEAQ 1(BX), CX
+	TESTQ BX, BX
+	JL C3			// BX == -1 means c == 0: switch to memcpy
+
+	ADDQ $4, SI		// i += 4
+	SUBQ $4, DI		// n -= 4
+	JGE U3			// if n >= 0 goto U3
+
+V3:	ADDQ $4, DI		// n += 4
+	JLE E3			// if n <= 0 goto E3
+
+L3:	// n > 0
+	ADDQ 0(R8)(SI*8), CX
+	CMPQ CX, DX
+	SBBQ BX, BX		// BX = _DB > CX ? -1 : 0
+	ANDQ BX, CX		// sets CX to 0 if CX >= _DB
+	MOVQ CX, 0(R10)(SI*8)
+	LEAQ 1(BX), CX	// eqv to NOTQ BX, NEGQ BX, MOVQ BX CX
+
+	INCQ SI			// i++
+	DECQ DI			// n--
+	JG L3			// if n > 0 goto L3
+
+E3:	MOVQ CX, c+56(FP)	// return c
+	RET
+
+C3: // memcpy
+	CMPQ R8, R10	// don't copy if &x[0] == &z[0]
+	JEQ CE3
+	ADDQ $4, SI		// i += 4 (skip the 4 Words just stored)
+	SUBQ $4, DI		// n -= 4
+	JL CV3			// if n < 4 goto CV3
+
+CU3: // n >= 4
+	MOVQ 0(R8)(SI*8), AX
+	MOVQ 8(R8)(SI*8), BX
+	MOVQ 16(R8)(SI*8), CX	// CX and DX are reused as scratch: c == 0 and _DB is not read again
+	MOVQ 24(R8)(SI*8), DX
+	MOVQ AX, 0(R10)(SI*8)
+	MOVQ BX, 8(R10)(SI*8)
+	MOVQ CX, 16(R10)(SI*8)
+	MOVQ DX, 24(R10)(SI*8)
+	ADDQ $4, SI		// i += 4
+	SUBQ $4, DI		// n -= 4
+	JGE CU3			// if n >= 0 goto CU3
+CV3:
+	ADDQ $4, DI		// n += 4
+	JLE CE3			// if n <= 0 goto CE3
+CL3:
+	MOVQ 0(R8)(SI*8), AX
+	MOVQ AX, 0(R10)(SI*8)	// z[i] = x[i]
+	INCQ SI			// i++
+	DECQ DI			// n--
+	JG CL3			// if n > 0 goto CL3
+CE3:
+	MOVQ $0, c+56(FP)	// return c = 0
+	RET
+
+// func sub10VW(z, x []Word, y Word) (c Word)
+// (like add10VW, switches to a plain copy once the borrow is 0; checked every 4 Words)
+TEXT ·sub10VW(SB),NOSPLIT,$0
+	MOVQ z_len+8(FP), DI	// n = len(z)
+	MOVQ x+24(FP), R8	// R8 = &x[0]
+	MOVQ y+48(FP), CX	// c = y
+	MOVQ z+0(FP), R10	// R10 = &z[0]
+
+	XORQ SI, SI			// i = 0
+	MOVQ $_DB, DX		// DX = _DB
+
+	// s/JL/JMP/ below to disable the unrolled loop
+	SUBQ $4, DI		// n -= 4
+	JL V4			// if n < 4 goto V4
+
+U4:	// n >= 0
+	// regular loop body unrolled 4x
+	MOVQ 0(R8)(SI*8), BX	// BX = x[i]
+	SUBQ CX, BX		// BX -= c, CF = borrow
+	SBBQ CX, CX		// CX = borrow ? -1 : 0
+	MOVQ DX, AX
+	ANDQ CX, AX		// AX = borrow ? _DB : 0
+	ADDQ AX, BX		// add _DB back on borrow
+	MOVQ BX, 0(R10)(SI*8)	// z[i] = BX
+	NEGQ CX			// c = 0/1
+	MOVQ 8(R8)(SI*8), BX	// same steps for x[i+1]
+	SUBQ CX, BX
+	SBBQ CX, CX
+	MOVQ DX, AX
+	ANDQ CX, AX
+	ADDQ AX, BX
+	MOVQ BX, 8(R10)(SI*8)
+	NEGQ CX
+	MOVQ 16(R8)(SI*8), BX	// same steps for x[i+2]
+	SUBQ CX, BX
+	SBBQ CX, CX
+	MOVQ DX, AX
+	ANDQ CX, AX
+	ADDQ AX, BX
+	MOVQ BX, 16(R10)(SI*8)
+	NEGQ CX
+	MOVQ 24(R8)(SI*8), BX	// same steps for x[i+3]
+	SUBQ CX, BX
+	SBBQ CX, CX
+	MOVQ DX, AX
+	ANDQ CX, AX
+	ADDQ AX, BX
+	MOVQ BX, 24(R10)(SI*8)
+	NEGQ CX			// c = 0/1; NEGQ leaves CF = (c != 0)
+	JCC C4			// CF clear means c == 0: switch to memcpy
+
+	ADDQ $4, SI		// i += 4
+	SUBQ $4, DI		// n -= 4
+	JGE U4			// if n >= 0 goto U4
+
+V4:	ADDQ $4, DI		// n += 4
+	JLE E4			// if n <= 0 goto E4
+
+L4:	// n > 0
+	MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
+	SUBQ CX, R11		// R11 -= c, CF = borrow
+	SBBQ CX, CX		// CX = borrow ? -1 : 0
+	MOVQ DX, AX
+	ANDQ CX, AX		// AX = borrow ? _DB : 0
+	ADDQ AX, R11		// add _DB back on borrow
+	NEGQ CX			// c = 0/1
+	MOVQ R11, 0(R10)(SI*8)	// z[i] = R11
+
+	INCQ SI			// i++
+	DECQ DI			// n--
+	JG L4			// if n > 0 goto L4
+
+E4:	MOVQ CX, c+56(FP)	// return c
+	RET
+
+C4: // memcpy
+	CMPQ R8, R10	// don't copy if &x[0] == &z[0]
+	JEQ CE4
+	ADDQ $4, SI		// i += 4 (skip the 4 Words just stored)
+	SUBQ $4, DI		// n -= 4
+	JL CV4			// if n < 4 goto CV4
+CU4: // n >= 4
+	MOVQ 0(R8)(SI*8), AX
+	MOVQ 8(R8)(SI*8), BX
+	MOVQ 16(R8)(SI*8), CX	// CX and DX are reused as scratch: c == 0 and _DB is not read again
+	MOVQ 24(R8)(SI*8), DX
+	MOVQ AX, 0(R10)(SI*8)
+	MOVQ BX, 8(R10)(SI*8)
+	MOVQ CX, 16(R10)(SI*8)
+	MOVQ DX, 24(R10)(SI*8)
+	ADDQ $4, SI		// i += 4
+	SUBQ $4, DI		// n -= 4
+	JGE CU4			// if n >= 0 goto CU4
+CV4:
+	ADDQ $4, DI		// n += 4
+	JLE CE4			// if n <= 0 goto CE4
+CL4:
+	MOVQ 0(R8)(SI*8), AX
+	MOVQ AX, 0(R10)(SI*8)	// z[i] = x[i]
+	INCQ SI			// i++
+	DECQ DI			// n--
+	JG CL4			// if n > 0 goto CL4
+CE4:
+	MOVQ $0, c+56(FP)	// return c = 0
+	RET
+
 // // func shlVU(z, x []Word, s uint) (c Word)
 // TEXT ·shlVU(SB),NOSPLIT,$0
 // 	MOVQ z_len+8(FP), BX	// i = z
diff --git a/dec_arith_decl.go b/dec_arith_decl.go
index 017ff7e..d7bc30b 100644
--- a/dec_arith_decl.go
+++ b/dec_arith_decl.go
@@ -13,15 +13,11 @@ func div10W(n1, n0 Word) (q, r Word)
 
 func sub10VV(z, x, y []Word) (c Word)
 
-// Not implemented yet
+func add10VW(z, x []Word, y Word) (c Word)
 
-func add10VW(z, x []Word, y Word) (c Word) {
-	return add10VW_g(z, x, y)
-}
+func sub10VW(z, x []Word, y Word) (c Word)
 
-func sub10VW(z, x []Word, y Word) (c Word) {
-	return sub10VW_g(z, x, y)
-}
+// Not implemented yet
 
 func shl10VU(z, x []Word, s uint) (c Word) {
 	return shl10VU_g(z, x, s)
diff --git a/dec_arith_test.go b/dec_arith_test.go
index aee6a74..5ee69c6 100644
--- a/dec_arith_test.go
+++ b/dec_arith_test.go
@@ -153,194 +153,235 @@ func BenchmarkDecSub10VV(b *testing.B) {
 	}
 }
 
-// TODO(db47h): complete port of the tests
+type fun10VW func(z, x []Word, y Word) (c Word)
+type arg10VW struct {
+	z, x dec
+	y    Word
+	c    Word
+}
 
-// type funVW func(z, x []Word, y Word) (c Word)
-// type argVW struct {
-// 	z, x nat
-// 	y    Word
-// 	c    Word
-// }
+var sum10VW = []arg10VW{
+	{},
+	{nil, nil, 2, 2},
+	{dec{0}, dec{0}, 0, 0},
+	{dec{1}, dec{0}, 1, 0},
+	{dec{1}, dec{1}, 0, 0},
+	{dec{0}, dec{_DMax}, 1, 1},
+	{dec{0, 0, 0, 0, 0}, dec{_DMax, _DMax, _DMax, _DMax, _DMax}, 1, 1},
+	{dec{585}, dec{314}, 271, 0},
+}
 
-// var sumVW = []argVW{
-// 	{},
-// 	{nil, nil, 2, 2},
-// 	{nat{0}, nat{0}, 0, 0},
-// 	{nat{1}, nat{0}, 1, 0},
-// 	{nat{1}, nat{1}, 0, 0},
-// 	{nat{0}, nat{_M}, 1, 1},
-// 	{nat{0, 0, 0, 0}, nat{_M, _M, _M, _M}, 1, 1},
-// 	{nat{585}, nat{314}, 271, 0},
-// }
+var lsh10VW = []arg10VW{
+	{},
+	{dec{0}, dec{0}, 0, 0},
+	{dec{0}, dec{0}, 1, 0},
+	{dec{0}, dec{0}, 7, 0},
 
-// var lshVW = []argVW{
-// 	{},
-// 	{nat{0}, nat{0}, 0, 0},
-// 	{nat{0}, nat{0}, 1, 0},
-// 	{nat{0}, nat{0}, 20, 0},
+	{dec{_DMax}, dec{_DMax}, 0, 0},
+	{dec{_DMax - _DMax%pow10(1)}, dec{_DMax}, 1, _DMax / pow10(_DW-1)},
+	{dec{_DMax - _DMax%pow10(7)}, dec{_DMax}, 7, _DMax / pow10(_DW-7)},
 
-// 	{nat{_M}, nat{_M}, 0, 0},
-// 	{nat{_M << 1 & _M}, nat{_M}, 1, 1},
-// 	{nat{_M << 20 & _M}, nat{_M}, 20, _M >> (_W - 20)},
+	{dec{_DMax, _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 0, 0},
+	{dec{_DMax - _DMax%pow10(1), _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 1, _DMax / pow10(_DW-1)},
+	{dec{_DMax - _DMax%pow10(7), _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 7, _DMax / pow10(_DW-7)},
+}
 
-// 	{nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0},
-// 	{nat{_M << 1 & _M, _M, _M}, nat{_M, _M, _M}, 1, 1},
-// 	{nat{_M << 20 & _M, _M, _M}, nat{_M, _M, _M}, 20, _M >> (_W - 20)},
-// }
+var rsh10VW = []arg10VW{
+	{},
+	{dec{0}, dec{0}, 0, 0},
+	{dec{0}, dec{0}, 1, 0},
+	{dec{0}, dec{0}, 7, 0},
 
-// var rshVW = []argVW{
-// 	{},
-// 	{nat{0}, nat{0}, 0, 0},
-// 	{nat{0}, nat{0}, 1, 0},
-// 	{nat{0}, nat{0}, 20, 0},
+	{dec{_DMax}, dec{_DMax}, 0, 0},
+	{dec{_DMax / pow10(1)}, dec{_DMax}, 1, _DMax - _DMax%pow10(_DW-1)},
+	{dec{_DMax / pow10(7)}, dec{_DMax}, 7, _DMax - _DMax%pow10(_DW-7)},
 
-// 	{nat{_M}, nat{_M}, 0, 0},
-// 	{nat{_M >> 1}, nat{_M}, 1, _M << (_W - 1) & _M},
-// 	{nat{_M >> 20}, nat{_M}, 20, _M << (_W - 20) & _M},
+	{dec{_DMax, _DMax, _DMax}, dec{_DMax, _DMax, _DMax}, 0, 0},
+	{dec{_DMax, _DMax, _DMax / pow10(1)}, dec{_DMax, _DMax, _DMax}, 1, _DMax - _DMax%pow10(_DW-1)},
+	{dec{_DMax, _DMax, _DMax / pow10(7)}, dec{_DMax, _DMax, _DMax}, 7, _DMax - _DMax%pow10(_DW-7)},
+}
 
-// 	{nat{_M, _M, _M}, nat{_M, _M, _M}, 0, 0},
-// 	{nat{_M, _M, _M >> 1}, nat{_M, _M, _M}, 1, _M << (_W - 1) & _M},
-// 	{nat{_M, _M, _M >> 20}, nat{_M, _M, _M}, 20, _M << (_W - 20) & _M},
-// }
+func testFun10VW(t *testing.T, msg string, f fun10VW, a arg10VW) {
+	n := len(a.z)
+	z := make(nat, n+1) // one extra Word past z[:n], checked at the end
+	c := f(z[:n], a.x, a.y)
+	for i, zi := range z[:n] {
+		if zi != a.z[i] {
+			t.Errorf("%s%+v\n\tgot z[%d] = %d; want %d", msg, a, i, zi, a.z[i])
+			break // report only the first mismatching Word
+		}
+	}
+	if c != a.c {
+		t.Errorf("%s%+v\n\tgot c = %d; want %d", msg, a, c, a.c)
+	}
+	// f only received z[:n], so z[n] must still be 0. TestDecAddSub10VW puts a
+	// non-zero Word just past the end of x to catch a copy that runs too far.
+	if z[n] != 0 {
+		panic("memcpy overflow")
+	}
+}
 
-// func testFunVW(t *testing.T, msg string, f funVW, a argVW) {
-// 	z := make(nat, len(a.z))
-// 	c := f(z, a.x, a.y)
-// 	for i, zi := range z {
-// 		if zi != a.z[i] {
-// 			t.Errorf("%s%+v\n\tgot z[%d] = %#x; want %#x", msg, a, i, zi, a.z[i])
-// 			break
-// 		}
-// 	}
-// 	if c != a.c {
-// 		t.Errorf("%s%+v\n\tgot c = %#x; want %#x", msg, a, c, a.c)
-// 	}
-// }
+func makeFun10VW(f func(z, x []Word, s uint) (c Word)) fun10VW {
+	return func(z, x []Word, s Word) (c Word) {
+		return f(z, x, uint(s))
+	}
+}
 
-// func makeFunVW(f func(z, x []Word, s uint) (c Word)) funVW {
-// 	return func(z, x []Word, s Word) (c Word) {
-// 		return f(z, x, uint(s))
-// 	}
-// }
+func TestDecFun10VW(t *testing.T) {
+	for _, a := range sum10VW {
+		arg := a
+		testFun10VW(t, "add10VW_g", add10VW_g, arg)
+		testFun10VW(t, "add10VW", add10VW, arg)
 
-// func TestFunVW(t *testing.T) {
-// 	for _, a := range sumVW {
-// 		arg := a
-// 		testFunVW(t, "addVW_g", addVW_g, arg)
-// 		testFunVW(t, "addVW", addVW, arg)
+		arg = arg10VW{a.x, a.z, a.y, a.c}
+		testFun10VW(t, "sub10VW_g", sub10VW_g, arg)
+		testFun10VW(t, "sub10VW", sub10VW, arg)
+	}
 
-// 		arg = argVW{a.x, a.z, a.y, a.c}
-// 		testFunVW(t, "subVW_g", subVW_g, arg)
-// 		testFunVW(t, "subVW", subVW, arg)
-// 	}
+	shl10VW_g := makeFun10VW(shl10VU_g)
+	shl10VW := makeFun10VW(shl10VU)
+	for _, a := range lsh10VW {
+		arg := a
+		testFun10VW(t, "shl10VU_g", shl10VW_g, arg)
+		testFun10VW(t, "shl10VU", shl10VW, arg)
+	}
 
-// 	shlVW_g := makeFunVW(shlVU_g)
-// 	shlVW := makeFunVW(shlVU)
-// 	for _, a := range lshVW {
-// 		arg := a
-// 		testFunVW(t, "shlVU_g", shlVW_g, arg)
-// 		testFunVW(t, "shlVU", shlVW, arg)
-// 	}
+	shr10VW_g := makeFun10VW(shr10VU_g)
+	shr10VW := makeFun10VW(shr10VU)
+	for _, a := range rsh10VW {
+		arg := a
+		testFun10VW(t, "shr10VU_g", shr10VW_g, arg)
+		testFun10VW(t, "shr10VU", shr10VW, arg)
+	}
+}
 
-// 	shrVW_g := makeFunVW(shrVU_g)
-// 	shrVW := makeFunVW(shrVU)
-// 	for _, a := range rshVW {
-// 		arg := a
-// 		testFunVW(t, "shrVU_g", shrVW_g, arg)
-// 		testFunVW(t, "shrVU", shrVW, arg)
-// 	}
-// }
+// TestDecAddSub10VW tests proper behavior of assembly versions of add10VW and
+// sub10VW on edge cases.
+func TestDecAddSub10VW(t *testing.T) {
+	for n := 0; n < 10; n++ {
+		for i := 0; i <= n; i++ {
+			z := dec(nil).make(n)
+			x := dec(nil).make(n + 1)
+			// Bounds check. testFun10VW will allocate a larger result slice and
+			// check that the extra Word past the result is not overwritten.
+			x[n] = 42
+			x = x[:n]
+			// test _DMax + 1 = 0
+			// fill x[:i] with _DMax; z[:i] is not written here (expected 0)
+			for j := 0; j < i; j++ {
+				x[j] = _DMax
+			}
+			// fill x[i:] and z[i:] with the same non-zero data
+			for j := i; j < n; j++ {
+				x[j] = Word(j + 1)
+				z[j] = Word(j + 1)
+				// the +1 (y itself, or the carry out of x[:i]) lands on z[i]
+				if j == i {
+					z[j]++
+				}
+			}
+			c := Word(0)
+			if i == n { // no Word below _DMax is left to absorb the +1
+				c = 1
+			}
+			testFun10VW(t, "add10VW_asm", add10VW, arg10VW{z, x, 1, c})
+			testFun10VW(t, "sub10VW_asm", sub10VW, arg10VW{x, z, 1, c})
+		}
+	}
+}
 
-// type argVU struct {
-// 	d  []Word // d is a Word slice, the input parameters x and z come from this array.
-// 	l  uint   // l is the length of the input parameters x and z.
-// 	xp uint   // xp is the starting position of the input parameter x, x := d[xp:xp+l].
-// 	zp uint   // zp is the starting position of the input parameter z, z := d[zp:zp+l].
-// 	s  uint   // s is the shift number.
-// 	r  []Word // r is the expected output result z.
-// 	c  Word   // c is the expected return value.
-// 	m  string // message.
-// }
+type arg10VU struct {
+	d  []Word // d is a Word slice, the input parameters x and z come from this array.
+	l  uint   // l is the length of the input parameters x and z.
+	xp uint   // xp is the starting position of the input parameter x, x := d[xp:xp+l].
+	zp uint   // zp is the starting position of the input parameter z, z := d[zp:zp+l].
+	s  uint   // s is the shift number.
+	r  []Word // r is the expected output result z.
+	c  Word   // c is the expected return value.
+	m  string // message.
+}
 
-// var argshlVU = []argVU{
-// 	// test cases for shlVU
-// 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0}, 7, 0, 0, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "complete overlap of shlVU"},
-// 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by half of shlVU"},
-// 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "partial overlap by 1 Word of shlVU"},
-// 	{[]Word{1, _M, _M, _M, _M, _M, 3 << (_W - 2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{2, _M - 1, _M, _M, _M, _M, 1<<(_W-1) + 1}, 1, "no overlap of shlVU"},
-// }
+var argshl10VU = []arg10VU{
+	// test cases for shl10VU
+	{[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0}, 7, 0, 0, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "complete overlap of shlVU"},
+	{[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 0, 0, 0}, 7, 0, 3, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "partial overlap by half of shlVU"},
+	{[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 0, 0, 0, 0, 0, 0}, 7, 0, 6, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "partial overlap by 1 Word of shlVU"},
+	{[]Word{1, _DMax, _DMax, _DMax, _DMax, _DMax, 99 * pow10(_DW-2), 0, 0, 0, 0, 0, 0, 0, 0}, 7, 0, 7, 1, []Word{10, _DMax - _DMax%pow10(1), _DMax, _DMax, _DMax, _DMax, 9*pow10(_DW-1) + 9}, 9, "no overlap of shlVU"},
+}
 
-// var argshrVU = []argVU{
-// 	// test cases for shrVU
-// 	{[]Word{0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 1, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "complete overlap of shrVU"},
-// 	{[]Word{0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 4, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by half of shrVU"},
-// 	{[]Word{0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 7, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "partial overlap by 1 Word of shrVU"},
-// 	{[]Word{0, 0, 0, 0, 0, 0, 0, 0, 3, _M, _M, _M, _M, _M, 1 << (_W - 1)}, 7, 8, 1, 1, []Word{1<<(_W-1) + 1, _M, _M, _M, _M, _M >> 1, 1 << (_W - 2)}, 1 << (_W - 1), "no overlap of shrVU"},
-// }
+var argshr10VU = []arg10VU{
+	// test cases for shr10VU
+	{[]Word{0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 1, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "complete overlap of shrVU"},
+	{[]Word{0, 0, 0, 0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 4, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "partial overlap by half of shrVU"},
+	{[]Word{0, 0, 0, 0, 0, 0, 0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 7, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "partial overlap by 1 Word of shrVU"},
+	{[]Word{0, 0, 0, 0, 0, 0, 0, 0, 99, _DMax, _DMax, _DMax, _DMax, _DMax, 9 * pow10(_DW-1)}, 7, 8, 1, 1, []Word{9*pow10(_DW-1) + 9, _DMax, _DMax, _DMax, _DMax, _DMax / pow10(1), 9 * pow10(_DW-2)}, 9 * pow10(_DW-1), "no overlap of shrVU"},
+}
 
-// func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) {
-// 	// save a.d for error message, or it will be overwritten.
-// 	b := make([]Word, len(a.d))
-// 	copy(b, a.d)
-// 	z := a.d[a.zp : a.zp+a.l]
-// 	x := a.d[a.xp : a.xp+a.l]
-// 	c := f(z, x, a.s)
-// 	for i, zi := range z {
-// 		if zi != a.r[i] {
-// 			t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i])
-// 			break
-// 		}
-// 	}
-// 	if c != a.c {
-// 		t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c)
-// 	}
-// }
+func testShift10Func(t *testing.T, f func(z, x []Word, s uint) Word, a arg10VU) {
+	// save a.d for error message, or it will be overwritten.
+	b := make([]Word, len(a.d))
+	copy(b, a.d)
+	z := a.d[a.zp : a.zp+a.l]
+	x := a.d[a.xp : a.xp+a.l]
+	c := f(z, x, a.s)
+	for i, zi := range z {
+		if zi != a.r[i] {
+			t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %d; want %d", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i])
+			break
+		}
+	}
+	if c != a.c {
+		t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %d; want %d", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c)
+	}
+}
 
-// func TestShiftOverlap(t *testing.T) {
-// 	for _, a := range argshlVU {
-// 		arg := a
-// 		testShiftFunc(t, shlVU, arg)
-// 	}
+func TestShift10Overlap(t *testing.T) {
+	for _, a := range argshl10VU {
+		arg := a
+		testShift10Func(t, shl10VU, arg)
+	}
 
-// 	for _, a := range argshrVU {
-// 		arg := a
-// 		testShiftFunc(t, shrVU, arg)
-// 	}
-// }
+	for _, a := range argshr10VU {
+		arg := a
+		testShift10Func(t, shr10VU, arg)
+	}
+}
 
-// func BenchmarkAddVW(b *testing.B) {
-// 	for _, n := range benchSizes {
-// 		if isRaceBuilder && n > 1e3 {
-// 			continue
-// 		}
-// 		x := rndV(n)
-// 		y := rndW()
-// 		z := make([]Word, n)
-// 		b.Run(fmt.Sprint(n), func(b *testing.B) {
-// 			b.SetBytes(int64(n * _S))
-// 			for i := 0; i < b.N; i++ {
-// 				addVW(z, x, y)
-// 			}
-// 		})
-// 	}
-// }
+func BenchmarkAdd10VW(b *testing.B) {
+	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
+		x := rnd10V(n)
+		y := rnd10W()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _S))
+			for i := 0; i < b.N; i++ {
+				add10VW(z, x, y)
+			}
+		})
+	}
+}
 
-// func BenchmarkSubVW(b *testing.B) {
-// 	for _, n := range benchSizes {
-// 		if isRaceBuilder && n > 1e3 {
-// 			continue
-// 		}
-// 		x := rndV(n)
-// 		y := rndW()
-// 		z := make([]Word, n)
-// 		b.Run(fmt.Sprint(n), func(b *testing.B) {
-// 			b.SetBytes(int64(n * _S))
-// 			for i := 0; i < b.N; i++ {
-// 				subVW(z, x, y)
-// 			}
-// 		})
-// 	}
-// }
+func BenchmarkSub10VW(b *testing.B) {
+	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
+		x := rnd10V(n)
+		y := rnd10W()
+		z := make([]Word, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n * _S))
+			for i := 0; i < b.N; i++ {
+				sub10VW(z, x, y)
+			}
+		})
+	}
+}
+
+// TODO(db47h): complete port of the tests
 
 // type funVWW func(z, x []Word, y, r Word) (c Word)
 // type argVWW struct {
diff --git a/decimal_test.go b/decimal_test.go
index 2d6b570..e112b7d 100644
--- a/decimal_test.go
+++ b/decimal_test.go
@@ -34,7 +34,7 @@ func TestDecimal_dnorm(t *testing.T) {
 	again:
 		w := uint(rand.Uint64()) % _DB
 		e := uint(rand.Intn(_DW + 1))
-		h, l := mulWW(Word(w), Word(pow10(e)))
+		h, l := mulWW(Word(w), pow10(e))
 		// convert h, l from base _B (2**64) to base _BD (10**19) or 2**32 -> 10**9
 		h, l = div10W(h, l)
 		d := dec{Word(l), Word(h)}.norm()
@@ -47,7 +47,7 @@ func TestDecimal_dnorm(t *testing.T) {
 		dd := dec(nil).set(d)
 		s := dnorm(dd)
 		// d should now have a single element with e shifted left
-		ew := w * pow10(_DW-decDigits(w))
+		ew := w * uint(pow10(_DW-decDigits(w)))
 		es := int64(uint(len(d)*_DW) - (decDigits(w) + e))
 		if dd[len(dd)-1] != Word(ew) || s != es {
 			t.Fatalf("%ve%v => dnorm(%v) = %v, %v --- Expected %d, %d",
diff --git a/go.mod b/go.mod
index 9755fbf..8e60a82 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,3 @@
 module github.com/db47h/decimal
 
 go 1.14
-
-require golang.org/x/tools v0.0.0-20200513122804-866d71a3170a // indirect
diff --git a/testdata/bench b/testdata/bench
deleted file mode 100755
index c234221..0000000
--- a/testdata/bench
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-TEST="$1"
-shift
-
-go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 -tags decimal_pure_go "$@" | tee bench-go
-go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-asm
-
-benchstat bench-go bench-asm
-
-rm bench-go
-rm bench-asm
diff --git a/testdata/benchasm b/testdata/benchasm
new file mode 100755
index 0000000..0bcfa9a
--- /dev/null
+++ b/testdata/benchasm
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+TEST="$1"
+shift
+
+if [ ! -e "bench-go" ]; then
+    go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 -tags decimal_pure_go "$@" | tee bench-go
+fi
+go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-asm
+
+benchstat bench-go bench-asm
+
+# rm bench-go
+# rm bench-asm
diff --git a/testdata/benchgit b/testdata/benchgit
index 5767a31..cd767dc 100755
--- a/testdata/benchgit
+++ b/testdata/benchgit
@@ -20,23 +20,24 @@ function doBench {
     shift
     local REV="$1"
     shift
-    echo $TEST - $REV
     go test -v -run ^$ -bench "$TEST" -cpu 1 -count 5 "$@" | tee bench-"$REV"
 }
 
-if [ "$REV" == "master" ]; then
-    # compare current against master
-    git stash
-    doBench "$TEST" "$REV" "$@"
-    git stash pop
-else
-    git checkout "$REV" 
-    doBench "$TEST" "$REV" "$@"
-    git checkout master
+if [ ! -e "bench-$REV" ]; then
+    if [ "$REV" == "master" ]; then
+        # compare current against master
+        git stash
+        doBench "$TEST" "$REV" "$@"
+        git stash pop
+    else
+        git checkout "$REV" 
+        doBench "$TEST" "$REV" "$@"
+        git checkout master
+    fi
 fi
 
 doBench "$TEST" "current" "$@"
 
 benchstat bench-"$REV" bench-current
 
-rm bench-"$REV" bench-current
+# rm bench-"$REV" bench-current