i386 stub: prefer movzbl over movb fetch
In 32-bit mode on x86_64 hardware, a movb fetch into a register can fail by randomly writing 0x00 instead of the fetched byte. Note that most CPUs can write only 32 or 64 bits to the register file, so writing just 8 or 16 bits incurs a 1-cycle penalty to form the full 32 bits by a read-modify-write of the destination register. Files modified: stub/src/arch/i386/nrv2b_d32-easy.S, stub/src/i386-expand.S
This commit is contained in:
@@ -38,7 +38,7 @@ lit_n2b:
|
|||||||
incl %esi; movb %dl,(%edi)
|
incl %esi; movb %dl,(%edi)
|
||||||
incl %edi
|
incl %edi
|
||||||
top_n2b:
|
top_n2b:
|
||||||
movb (%esi),%dl # prefetch: literal, or bottom 8 bits of offset
|
movzbl (%esi),%edx # prefetch: literal, or bottom 8 bits of offset
|
||||||
jnextb1yp lit_n2b
|
jnextb1yp lit_n2b
|
||||||
push $1; pop off
|
push $1; pop off
|
||||||
offmore_n2b:
|
offmore_n2b:
|
||||||
@@ -46,7 +46,7 @@ offmore_n2b:
|
|||||||
jnextb0np offmore_n2b
|
jnextb0np offmore_n2b
|
||||||
|
|
||||||
subl $ 3,off; jc len_n2b # use previous offset
|
subl $ 3,off; jc len_n2b # use previous offset
|
||||||
shll $ 8,off; movzbl %dl,%edx
|
shll $ 8,off
|
||||||
orl %edx,off; incl %esi
|
orl %edx,off; incl %esi
|
||||||
xorl $~0,off; jz eof
|
xorl $~0,off; jz eof
|
||||||
movl off,disp
|
movl off,disp
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ NBPW= 4
|
|||||||
#define GETBITp \
|
#define GETBITp \
|
||||||
addl bits,bits; jnz 0f; \
|
addl bits,bits; jnz 0f; \
|
||||||
movl (%esi),bits; sub $-4,%esi; \
|
movl (%esi),bits; sub $-4,%esi; \
|
||||||
adcl bits,bits; movb (%esi),%dl; \
|
adcl bits,bits; movzbl (%esi),%edx; \
|
||||||
0:
|
0:
|
||||||
/* Same, but without prefetch (not useful for length of match.) */
|
/* Same, but without prefetch (not useful for length of match.) */
|
||||||
#define jnextb0n jnextb0y
|
#define jnextb0n jnextb0y
|
||||||
@@ -125,7 +125,7 @@ __clear_cache: .globl __clear_cache
|
|||||||
refill:
|
refill:
|
||||||
movl (%esi),bits; sub $-4,%esi // next 32 bits; set Carry
|
movl (%esi),bits; sub $-4,%esi // next 32 bits; set Carry
|
||||||
adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit
|
adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit
|
||||||
movb (%esi),%dl // pre-fetch: literal, or bottom 8 bits of offset
|
movzbl (%esi),%edx // pre-fetch: literal, or bottom 8 bits of offset
|
||||||
rep; ret
|
rep; ret
|
||||||
getbit:
|
getbit:
|
||||||
addl bits,bits; jz refill // Carry= next bit
|
addl bits,bits; jz refill // Carry= next bit
|
||||||
@@ -133,16 +133,16 @@ getbit:
|
|||||||
|
|
||||||
copy: // In: len, %edi, dispq; Out: 0==len, %edi, dispq; trashes %eax, %edx
|
copy: // In: len, %edi, dispq; Out: 0==len, %edi, dispq; trashes %eax, %edx
|
||||||
lea (%edi,dispq),%eax; cmpl $5,len // <=3 is forced
|
lea (%edi,dispq),%eax; cmpl $5,len // <=3 is forced
|
||||||
movb (%eax),%dl; jbe copy1 // <=5 for better branch predict
|
movzbl (%eax),%edx; jbe copy1 // <=5 for better branch predict
|
||||||
cmpl $-4,displ; ja copy1 // 4-byte chunks would overlap
|
cmpl $-4,displ; ja copy1 // 4-byte chunks would overlap
|
||||||
subl $4,len // adjust for termination cases
|
subl $4,len // adjust for termination cases
|
||||||
copy4:
|
copy4:
|
||||||
movl (%eax),%edx; add $4, %eax; subl $4,len
|
movl (%eax),%edx; add $4, %eax; subl $4,len
|
||||||
movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
|
movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
|
||||||
addl $4,len; movb (%eax),%dl; jz copy0
|
addl $4,len; movzbl (%eax),%edx; jz copy0
|
||||||
copy1:
|
copy1:
|
||||||
inc %eax; movb %dl,(%edi); dec len
|
inc %eax; movb %dl,(%edi); dec len
|
||||||
movb (%eax),%dl
|
movzbl (%eax),%edx
|
||||||
lea 1(%edi),%edi; jnz copy1
|
lea 1(%edi),%edi; jnz copy1
|
||||||
copy0:
|
copy0:
|
||||||
rep; ret
|
rep; ret
|
||||||
|
|||||||
Reference in New Issue
Block a user