diff --git a/src/stub/l_lx_elfppc32.S b/src/stub/l_lx_elfppc32.S index 1fdecf7e..979c5dc1 100644 --- a/src/stub/l_lx_elfppc32.S +++ b/src/stub/l_lx_elfppc32.S @@ -39,6 +39,7 @@ _start: .globl _start sz_b_info= 12 sz_unc= 0 sz_cpr= 4 + b_method= 8 PROT_READ= 1 PROT_WRITE= 2 @@ -57,36 +58,34 @@ PAGE_SIZE = -(~0< + + John F. Reiser + +*/ + +#include "ppc_regs.h" + +SZ_DLINE=128 # size of data cache line in Apple G5 + +/* Returns 0 on success; non-zero on failure. */ +decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method) + +/* PowerPC has no 'cmplis': compare logical [unsigned] immediate shifted [by 16] */ +#define hibit r0 /* holds 0x80000000 during decompress */ + +#define src a0 +#define lsrc a1 +#define dst a2 +#define ldst a3 /* Out: actually a reference: &len_dst */ +#define meth a4 + +#define off a4 +#define len a5 +#define bits a6 +#define disp a7 + + dcbtst 0,dst # prime dcache for store + + stw dst,0(ldst) # original dst + add lsrc,lsrc,src # input eof + + lis hibit,0x8000 # 0x80000000 for detecting next bit + lis bits,0x8000 # prepare for first load + addi src,src,-1 # prepare for 'lbzu' + addi dst,dst,-1 # prepare for 'stbu' + li disp,-1 # initial displacement + + mflr t3 # return address + b bot_n2b + +/* jump on next bit, with branch prediction: y==>likely; n==>unlikely + cr0 is set by the cmpl ["compare logical"==>unsigned]: + lt next bit is 0 + gt next bit is 1 + eq must load next 32 bits from memory +*/ +#define jnextb0y call get1; blt+ cr0, +#define jnextb0n call get1; blt- cr0, +#define jnextb1y call get1; bgt+ cr0, +#define jnextb1n call get1; bgt- cr0, + +/* rotate next bit into bottom bit of reg; set cr0 based on entire result reg */ +#define getnextb(reg) call get1; adde. reg,reg,reg + +get1: + cmpl cr0,bits,hibit # cr0 for jnextb + addc bits,bits,bits # CArry for getnextb + bnelr+ cr0 # return if reload not needed; likely 31/32 + +/* CArry has been set from adding 0x80000000 to itself; preserve for 'adde' */ + # fetch 4 bytes unaligned and LITTLE ENDIAN +#if 0 /*{ clean; but 4 instr larger, and 3 cycles longer */ + lbz bits,1(src) # lo8 + lbz t0,2(src); rlwimi bits,t0, 8,16,23 + lbz t0,3(src); rlwimi bits,t0,16, 8,15 + lbzu t0,4(src); rlwimi bits,t0,24, 0, 7 +#else /*}{ pray for no unalignment trap or slowdown */ + li bits,1 # compensate for 'lbzu' + lwbrx bits,bits,src # bits= fetch_le32(bits+src) + addi src,src,4 +#endif /*}*/ + + cmpl cr0,bits,hibit # cr0 for jnextb + adde bits,bits,bits # CArry for getnextb; set lo bit from CarryIn + ret + +lit_n2b: +#define tmp len + lbzu tmp,1(src) # tmp= *++src; + stbu tmp,1(dst) # *++dst= tmp; +#undef tmp +top_n2b: + jnextb1y lit_n2b + li off,1 # "the msb" +offmore_n2b: + getnextb(off) + jnextb0n offmore_n2b + + addic. off,off,-3 # CArry set [and ignored], but no 'addi.' + li len,0 + blt- offprev_n2b + lbzu t0,1(src) + rlwinm off,off,8,0,31-8 # off<<=8; + nor. disp,off,t0 # disp = -(1+ (off|t0)); + beq- eof_n2b + +offprev_n2b: # In: 0==len + getnextb(len); getnextb(len) # two bits; cr0 set on result + li off,1; bne- gotlen_n2b # raw 1,2,3 ==> 2,3,4 + li off,3 # raw 2.. ==> 5.. + li len,1 # "the msb" +lenmore_n2b: + getnextb(len) + jnextb0n lenmore_n2b +gotlen_n2b: + subfic t0,disp,(~0)+(-0xd00) # want CArry only + adde len,len,off # len += off + (disp < -0xd00); + +copy: +#define back off + add back,disp,dst # point back to match in dst + mtctr len +short_n2b: +#define tmp len + lbzu tmp,1(back) + stbu tmp,1(dst) +#undef tmp + bdnz+ short_n2b + +/* This "prefetch for store" is simple, small, and effective. Matches + usually occur more frequently than once per 128 bytes, but G4 line size + is only 32 bytes anyway. Assume that an 'unnecessary' dcbtst costs only + about as much as a hit. The counter register is free at top_n2b, so we could + pace the dcbtst optimally; but that takes 7 or 8 instructions of space. +*/ +bot_n2b: + li back,2*SZ_DLINE + dcbtst back,dst # 2 lines ahead [-1 for stbu] + dcbt back,src # jump start auto prefetch at page boundary +/* Auto prefetch for Read quits at page boundary; needs 2 misses to restart. */ +#undef back + b top_n2b + +eof_n2b: +#define tmp r0 /* hibit is dead */ + lwz tmp,0(ldst) # original dst + mtlr t3 # return address + addi dst,dst,1 # uncorrect for 'stbu' + addi src,src,1 # uncorrect for 'lbzu' + subf dst,tmp,dst # dst -= tmp; // dst length +#undef tmp + subf a0,lsrc,src # src -= eof; // return 0: good; else: bad + stw dst,0(ldst) + ret + diff --git a/src/stub/ppc_d_nrv2e.S b/src/stub/ppc_d_nrv2e.S index 237580a9..06cc101f 100644 --- a/src/stub/ppc_d_nrv2e.S +++ b/src/stub/ppc_d_nrv2e.S @@ -34,7 +34,7 @@ SZ_DLINE=128 # size of data cache line in Apple G5 /* Returns 0 on success; non-zero on failure. */ -decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst) +decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method) /* PowerPC has no 'cmplis': compare logical [unsigned] immediate shifted [by 16] */ #define hibit r0 /* holds 0x80000000 during decompress */ @@ -43,6 +43,7 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst) #define lsrc a1 #define dst a2 #define ldst a3 /* Out: actually a reference: &len_dst */ +#define meth a4 #define off a4 #define len a5