diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S index 07982429..2ebdf5c8 100644 --- a/src/stub/amd_d_nrv2b.S +++ b/src/stub/amd_d_nrv2b.S @@ -35,11 +35,11 @@ lit_n2b: incq %rdi top_n2b: movb (%rsi),%dl # speculate: literal, or bottom 8 bits of offset - jnextb1y lit_n2b + jnextb1yp lit_n2b lea 1(lenq),off # [len= 0] off= 1 offmore_n2b: - getnextb(off) - jnextb0n offmore_n2b + getnextbp(off) + jnextb0np offmore_n2b subl $ 3,off; jc len_n2b # use previous offset shll $ 8,off; movzbl %dl,%edx diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S index 3cfcd6e5..8bcc29f4 100644 --- a/src/stub/amd_d_nrv2e.S +++ b/src/stub/amd_d_nrv2e.S @@ -35,16 +35,16 @@ lit_n2e: incq %rdi top_n2e: movb (%rsi),%dl # speculate: literal, or bottom 8 bits of offset - jnextb1y lit_n2e + jnextb1yp lit_n2e lea 1(lenq),off # [len= 0] off= 1 jmp getoff_n2e off_n2e: dec off - getnextb(off) + getnextbp(off) getoff_n2e: - getnextb(off) - jnextb0n off_n2e + getnextbp(off) + jnextb0np off_n2e subl $ 3,off; jc offprev_n2e shll $ 8,off; movzbl %dl,%edx diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S index e57adb68..dd418fcb 100644 --- a/src/stub/l_lx_elf64amd.S +++ b/src/stub/l_lx_elf64amd.S @@ -102,6 +102,17 @@ ra_setup: */ /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */ /* Prediction omitted for now. */ +/* The 'p' variants: on refill of the bit buffer, also prefetch the next byte into %dl, to reduce latency on literals and offsets. */ +#define jnextb0np jnextb0yp +#define jnextb0yp GETBITp; jnc +#define jnextb1np jnextb1yp +#define jnextb1yp GETBITp; jc +#define GETBITp \ + addl bits,bits; jnz 0f; \ + movl (%rsi),bits; subq $-4,%rsi; \ + adcl bits,bits; movb (%rsi),%dl; \ +0: +/* Same as the 'p' variants, but without the prefetch (which is not useful when reading the length of a match).
*/ #define jnextb0n jnextb0y #define jnextb0y GETBIT; jnc #define jnextb1n jnextb1y @@ -109,11 +120,12 @@ ra_setup: #define GETBIT \ addl bits,bits; jnz 0f; \ movl (%rsi),bits; subq $-4,%rsi; \ - adcl bits,bits; movb (%rsi),%dl; \ + adcl bits,bits; \ 0: /* rotate next bit into bottom bit of reg */ -#define getnextb(reg) call *%r11; adcl reg,reg +#define getnextbp(reg) call *%r11; adcl reg,reg +#define getnextb(reg) getnextbp(reg) ALIGN(1<<3) getbit: