i386 stub: prefer movzbl over movb fetch

In 32-bit mode on x86_64 hardware, a movb fetch into a register can fail
by randomly writing 0x00 instead of the fetched byte.
Note that most CPUs can write only 32 or 64 bits to the register file,
so writing just 8 or 16 bits incurs a 1-cycle penalty in order to
form 32 bits by a Read-Modify-Write of the destination register.
	modified:   stub/src/arch/i386/nrv2b_d32-easy.S
	modified:   stub/src/i386-expand.S
This commit is contained in:
John Reiser
2024-07-28 15:08:45 -07:00
parent 3d58035b41
commit 25e6a31004
2 changed files with 7 additions and 7 deletions
+2 -2
View File
@@ -38,7 +38,7 @@ lit_n2b:
incl %esi; movb %dl,(%edi) incl %esi; movb %dl,(%edi)
incl %edi incl %edi
top_n2b: top_n2b:
movb (%esi),%dl # prefetch: literal, or bottom 8 bits of offset movzbl (%esi),%edx # prefetch: literal, or bottom 8 bits of offset
jnextb1yp lit_n2b jnextb1yp lit_n2b
push $1; pop off push $1; pop off
offmore_n2b: offmore_n2b:
@@ -46,7 +46,7 @@ offmore_n2b:
jnextb0np offmore_n2b jnextb0np offmore_n2b
subl $ 3,off; jc len_n2b # use previous offset subl $ 3,off; jc len_n2b # use previous offset
shll $ 8,off; movzbl %dl,%edx shll $ 8,off
orl %edx,off; incl %esi orl %edx,off; incl %esi
xorl $~0,off; jz eof xorl $~0,off; jz eof
movl off,disp movl off,disp
+5 -5
View File
@@ -51,7 +51,7 @@ NBPW= 4
#define GETBITp \ #define GETBITp \
addl bits,bits; jnz 0f; \ addl bits,bits; jnz 0f; \
movl (%esi),bits; sub $-4,%esi; \ movl (%esi),bits; sub $-4,%esi; \
adcl bits,bits; movb (%esi),%dl; \ adcl bits,bits; movzbl (%esi),%edx; \
0: 0:
/* Same, but without prefetch (not useful for length of match.) */ /* Same, but without prefetch (not useful for length of match.) */
#define jnextb0n jnextb0y #define jnextb0n jnextb0y
@@ -125,7 +125,7 @@ __clear_cache: .globl __clear_cache
refill: refill:
movl (%esi),bits; sub $-4,%esi // next 32 bits; set Carry movl (%esi),bits; sub $-4,%esi // next 32 bits; set Carry
adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit
movb (%esi),%dl // pre-fetch: literal, or bottom 8 bits of offset movzbl (%esi),%edx // pre-fetch: literal, or bottom 8 bits of offset
rep; ret rep; ret
getbit: getbit:
addl bits,bits; jz refill // Carry= next bit addl bits,bits; jz refill // Carry= next bit
@@ -133,16 +133,16 @@ getbit:
copy: // In: len, %edi, dispq; Out: 0==len, %edi, dispq; trashes %eax, %edx copy: // In: len, %edi, dispq; Out: 0==len, %edi, dispq; trashes %eax, %edx
lea (%edi,dispq),%eax; cmpl $5,len // <=3 is forced lea (%edi,dispq),%eax; cmpl $5,len // <=3 is forced
movb (%eax),%dl; jbe copy1 // <=5 for better branch predict movzbl (%eax),%edx; jbe copy1 // <=5 for better branch predict
cmpl $-4,displ; ja copy1 // 4-byte chunks would overlap cmpl $-4,displ; ja copy1 // 4-byte chunks would overlap
subl $4,len // adjust for termination cases subl $4,len // adjust for termination cases
copy4: copy4:
movl (%eax),%edx; add $4, %eax; subl $4,len movl (%eax),%edx; add $4, %eax; subl $4,len
movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4 movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
addl $4,len; movb (%eax),%dl; jz copy0 addl $4,len; movzbl (%eax),%edx; jz copy0
copy1: copy1:
inc %eax; movb %dl,(%edi); dec len inc %eax; movb %dl,(%edi); dec len
movb (%eax),%dl movzbl (%eax),%edx
lea 1(%edi),%edi; jnz copy1 lea 1(%edi),%edi; jnz copy1
copy0: copy0:
rep; ret rep; ret