Skip to content

Commit

Permalink
dec/amd64: implement add10VW/sub10VW
Browse files Browse the repository at this point in the history
    name             go time/op     asm time/op     delta
    Sub10VW/1         10.1ns ± 2%      5.6ns ± 1%   -45.03%
    Sub10VW/2         11.7ns ± 1%      6.1ns ± 1%   -48.01%
    Sub10VW/3         13.2ns ± 2%      8.1ns ± 0%   -39.03%
    Sub10VW/4         14.6ns ± 0%      8.4ns ± 0%   -42.77%
    Sub10VW/5         14.9ns ± 1%      8.8ns ± 0%   -40.66%
    Sub10VW/10        15.1ns ± 0%     10.6ns ± 3%   -30.12%
    Sub10VW/100        116ns ± 1%       45ns ± 6%   -61.62%
    Sub10VW/1000      1.22µs ± 1%     0.54µs ±14%   -55.85%
    Sub10VW/10000     11.9µs ± 0%      5.4µs ± 1%   -54.85%
    Sub10VW/100000     122µs ± 0%       62µs ± 0%   -49.45%

The Go implementation can check whether the carry is zero and switch to
copy() for free (no need for separate "standard" and "large" variants of
add10VW). In the assembler version I chose to keep a single implementation
of the function and switch to a memcpy whenever the carry is 0 (checked
every 4 Words). Considering that the carry is almost always 0, this logic
is the likely cause of the smaller speedup between 5 and 15 Words.

Also, past 1000 Words, the performance gains seem to slowly drop off. The
most likely cause is the simplistic memcpy implementation compared to
runtime·memmove.
  • Loading branch information
db47h committed May 14, 2020
1 parent edbdd7c commit 5821f10
Show file tree
Hide file tree
Showing 11 changed files with 494 additions and 343 deletions.
2 changes: 1 addition & 1 deletion arith_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
var isRaceBuilder bool

func init() {
flag.BoolVar(&isRaceBuilder, "rb", true, "race builder")
flag.BoolVar(&isRaceBuilder, "rb", false, "race builder")
}

type funVV func(z, x, y []Word) (c Word)
Expand Down
4 changes: 2 additions & 2 deletions dec.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func (x dec) digit(i uint) uint {
return 0
}
// 0 <= j < len(x)
return (uint(x[j]) / pow10(i)) % 10
return uint(x[j]/pow10(i)) % 10
}

func (z dec) make(n int) dec {
Expand Down Expand Up @@ -166,7 +166,7 @@ func (x dec) sticky(i uint) uint {
return 1
}
}
if uint(x[j])%pow10(i) != 0 {
if x[j]%pow10(i) != 0 {
return 1
}
return 0
Expand Down
26 changes: 13 additions & 13 deletions dec_arith.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ var pow10s = [...]uint64{
10000000000000000, 100000000000000000, 1000000000000000000, 10000000000000000000,
}

func pow10(n uint) uint { return uint(pow10s[n]) }
func pow10(n uint) Word { return Word(pow10s[n]) }

var maxDigits = [...]uint{
1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5,
Expand Down Expand Up @@ -54,7 +54,7 @@ func decDigits64(x uint64) (n uint) {

func decDigits32(x uint) (n uint) {
n = maxDigits[bits.Len(x)]
if x < pow10(n-1) {
if x < uint(pow10(n-1)) {
n--
}
return n
Expand Down Expand Up @@ -148,12 +148,9 @@ func div10WW_g(u1, u0, v Word) (q, r Word) {

func add10WWW_g(x, y, cIn Word) (s, c Word) {
r, cc := bits.Add(uint(x), uint(y), uint(cIn))
// if cc != 0 || r > _DB-1 {
// cc = 1
// r -= _DB
// }
// c1 := uint(int(r-_DB) >> 63)
var c1 uint
// this simple if statement is compiled without jumps
// at least on amd64.
if r >= _DB {
c1 = 1
}
Expand Down Expand Up @@ -187,7 +184,10 @@ func sub10VV_g(z, x, y []Word) (c Word) {
}

// add10VW adds y to x. The resulting carry c is either 0 or 1, except that y itself is returned when z is empty.
func add10VW_g(z, x dec, y Word) (c Word) {
func add10VW_g(z, x []Word, y Word) (c Word) {
if len(z) == 0 {
return y
}
z[0], c = add10WWW_g(x[0], y, 0)
// propagate carry
for i := 1; i < len(z) && i < len(x); i++ {
Expand Down Expand Up @@ -219,15 +219,15 @@ func sub10VW_g(z, x []Word, y Word) (c Word) {
}

// shl10VU sets z to x*(10**s), s < _DW
func shl10VU_g(z, x dec, s uint) (r Word) {
func shl10VU_g(z, x []Word, s uint) (r Word) {
if s == 0 {
copy(z, x)
return
}
if len(z) == 0 || len(x) == 0 {
return
}
d, m := Word(pow10(_DW-s)), Word(pow10(s))
d, m := pow10(_DW-s), pow10(s)
var h, l Word
r, l = divWW(0, x[len(x)-1], d)
for i := len(z) - 1; i > 0; i-- {
Expand All @@ -241,7 +241,7 @@ func shl10VU_g(z, x dec, s uint) (r Word) {
}

// shr10VU sets z to x/(10**s)
func shr10VU_g(z, x dec, s uint) (r Word) {
func shr10VU_g(z, x []Word, s uint) (r Word) {
if s == 0 {
copy(z, x)
return
Expand All @@ -251,15 +251,15 @@ func shr10VU_g(z, x dec, s uint) (r Word) {
}

var h, l Word
d, m := Word(pow10(s)), Word(pow10(_DW-s))
d, m := pow10(s), pow10(_DW-s)
h, r = divWW(0, x[0], Word(d))
for i := 1; i < len(z) && i < len(x); i++ {
t := h
h, l = divWW(0, x[i], d)
z[i-1] = t + l*m
}
z[len(z)-1] = h
return r
return r * m
}

func mulAdd10VWW_g(z, x []Word, y, r Word) (c Word) {
Expand Down
Loading

0 comments on commit 5821f10

Please sign in to comment.