Skip to content

Commit

Permalink
Add missing amd64 asm implementation for shr10VU (follow-up on de46678)
Browse files Browse the repository at this point in the history
The asm implementation of shr10VU takes about 25% less time per op than
the pure Go version.

    name     old time/op  new time/op  delta
    Shr10VU   444µs ± 2%   331µs ± 3%  -25.50%  (p=0.008 n=5+5)

Loop unrolling has not been tried yet.
  • Loading branch information
db47h committed May 26, 2020
1 parent 12d0933 commit 9379467
Showing 1 changed file with 51 additions and 38 deletions.
89 changes: 51 additions & 38 deletions dec_arith_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -528,53 +528,51 @@ TEXT ·shl10VU(SB),NOSPLIT,$0
TESTQ BX, BX
JEQ X8c // copy if s = 0

MOVQ $_DW, CX
MOVQ $_DW, AX
LEAQ ·pow10DivTab64(SB), DI
SUBQ BX, CX
SUBQ BX, AX
LEAQ -3(AX)(AX*2), AX
LEAQ -3(BX)(BX*2), BX
LEAQ -3(CX)(CX*2), CX

MOVQ 0(DI)(BX*8), R12 // m = pow10DivTab64(s-1).d
LEAQ 0(DI)(CX*8), R11 // d = &pow10DivTab64(_DW-s-1)
MOVQ 0(DI)(BX*8), R11 // m = pow10DivTab64(s-1).d
MOVQ 0(DI)(AX*8), R12 // d
MOVQ 8(DI)(AX*8), R13 // m'
MOVWLZX 16(DI)(AX*8), CX // post|pre

// r, l = x[len(x)-1] / d
MOVQ 0(R11), R13 // d
MOVQ 8(R11), R14 // m
MOVQ 0(R8)(SI*8), AX
MOVQ AX, BX // x[i]
MOVWLZX 16(R11), CX // post|pre
SHRQ CX, AX // x[i] >> pre
MULQ R14 // *m
MULQ R13 // *m'
RORW $8, CX // post
MOVQ DX, AX
SHRQ CX, AX // r
MOVQ AX, c+56(FP) // save r
MULQ R13 // DX:AX = r*d
MULQ R12 // DX:AX = r*d
SUBQ AX, BX // l = x[i]-r*d
MOVQ BX, AX // AX = l

TESTQ SI, SI
JEQ X8a
L8:
MULQ R12
MULQ R11
MOVQ AX, R9 // z[i] = l*m
MOVQ -8(R8)(SI*8), AX
MOVQ AX, BX // x[i-1]
RORW $8, CX // pre
SHRQ CX, AX // x[i-1] >> pre
MULQ R14 // *m
MULQ R13 // *m'
RORW $8, CX // post
MOVQ DX, AX
SHRQ CX, AX // h
ADDQ AX, R9 // z[i] += h
MOVQ R9, 0(R10)(SI*8)
MULQ R13 // DX:AX = d*h
MULQ R12 // DX:AX = d*h
SUBQ AX, BX // l = x[i-1]-d*h
MOVQ BX, AX
SUBQ $1, SI
JG L8
X8a:
MULQ R12
MULQ R11
MOVQ AX, 0(R10)(SI*8)
RET
X8b:
Expand Down Expand Up @@ -602,41 +600,56 @@ TEXT ·shr10VU(SB),NOSPLIT,$0
TESTQ BX, BX
JEQ X9c // copy if s = 0

MOVQ $_DW, CX
LEAQ ·pow10tab(SB), SI
SUBQ BX, CX
MOVQ 0(SI)(BX*8), R11 // d = pow10(s)
MOVQ 0(SI)(CX*8), R12 // m = pow10(_DW-s)

XORQ DX, DX
MOVQ 0(R8), AX
DIVQ R11 // AX:DX = x[0] / d
MOVQ DX, R13 // r
MOVQ AX, BX // h
MOVQ $_DW, AX
LEAQ ·pow10DivTab64(SB), SI
SUBQ BX, AX
LEAQ -3(AX)(AX*2), AX
LEAQ -3(BX)(BX*2), BX
MOVQ 0(SI)(AX*8), R11 // m = pow10DivTab64(_DW-s-1).d
MOVQ 0(SI)(BX*8), R12 // d
MOVQ 8(SI)(BX*8), R13 // m'
MOVWLZX 16(SI)(BX*8), CX // post|pre

MOVQ 0(R8), AX // x[0]
MOVQ AX, R9
SHRQ CX, AX // x[0] >> pre
MULQ R13 // *m'
RORW $8, CX // post
SHRQ CX, DX // h
MOVQ DX, BX
MOVQ R12, AX
MULQ DX // DX:AX = h*d
SUBQ AX, R9 // r = x[0]-h*d
MOVQ R11, AX
MULQ R9
MOVQ AX, c+56(FP) // save r*m

MOVQ $0, SI
CMPQ SI, DI
JGE X9a // if i >= len(x)-1 goto X9a

L9:
MOVQ BX, CX // z[i] = h
XORQ DX, DX
MOVQ BX, R9 // z[i] = h
MOVQ 8(R8)(SI*8), AX
DIVQ R11 // h, l = AX:DX = divWW(x[i], d)
MOVQ AX, BX // save h
MOVQ DX, AX
XORQ DX, DX
MULQ R12 // AX = l*m
ADDQ AX, CX // zi += l*m
MOVQ CX, 0(R10)(SI*8)
MOVQ AX, R14 // x[i+1]
RORW $8, CX
SHRQ CX, AX
MULQ R13
RORW $8, CX
SHRQ CX, DX
MOVQ DX, BX // BX = h
MOVQ R12, AX
MULQ DX // d*h
SUBQ AX, R14 // l = x[i+1]-d*h
MOVQ R11, AX
MULQ R14 // l*m
ADDQ AX, R9
MOVQ R9, 0(R10)(SI*8)
ADDQ $1, SI
CMPQ SI, DI
JL L9
X9a:
MOVQ BX, 0(R10)(SI*8)
MOVQ R13, AX
MULQ R12
MOVQ AX, c+56(FP)
RET
X9b:
MOVQ $0, c+56(FP)
Expand Down

0 comments on commit 9379467

Please sign in to comment.