diff --git a/src/stub/l_lx_elfppc32.h b/src/stub/l_lx_elfppc32.h index cdba6f5b..5de11761 100644 --- a/src/stub/l_lx_elfppc32.h +++ b/src/stub/l_lx_elfppc32.h @@ -1,4 +1,4 @@ -/* l_lx_elfppc32.h -- created from l_lx_elfppc32.bin, 432 (0x1b0) bytes +/* l_lx_elfppc32.h -- created from l_lx_elfppc32.bin, 448 (0x1c0) bytes This file is part of the UPX executable compressor. @@ -26,35 +26,36 @@ */ -#define LINUX_ELFPPC32_LOADER_ADLER32 0xf2b08e0d -#define LINUX_ELFPPC32_LOADER_CRC32 0x2364b765 +#define LINUX_ELFPPC32_LOADER_ADLER32 0x337c930d +#define LINUX_ELFPPC32_LOADER_CRC32 0xac495fea -unsigned char linux_elfppc32_loader[432] = { - 72, 0, 1,161,144,166, 0, 0,124,132, 26, 20, 60, 0,128, 0, /* 0x 0 */ - 61, 32,128, 0, 56, 99,255,255, 56,165,255,255, 57, 64,255,255, /* 0x 10 */ -125,168, 2,166, 72, 0, 0, 40, 57, 32, 0, 1,125, 41, 28, 44, /* 0x 20 */ - 56, 99, 0, 4,124, 9, 0, 64,125, 41, 72, 20, 97, 41, 0, 1, /* 0x 30 */ - 78,128, 0, 32,141, 3, 0, 1,157, 5, 0, 1,124, 9, 0, 64, /* 0x 40 */ -125, 41, 74, 20, 65,162,255,213, 65,129,255,236, 56,224, 0, 1, /* 0x 50 */ - 72, 0, 0, 20, 56,231,255,255,125, 41, 72, 21, 65,162,255,189, /* 0x 60 */ -124,231, 57, 20,125, 41, 72, 21, 65,162,255,177,124,231, 57, 20, /* 0x 70 */ -124, 9, 0, 64,125, 41, 74, 20, 65,162,255,161, 65,160,255,216, /* 0x 80 */ - 57, 0, 0, 0, 52,231,255,253, 84,231, 64, 46, 65,128, 0, 32, /* 0x 90 */ -140, 67, 0, 1,124,234, 16,249,125, 74, 14,112, 65,130, 0,136, /* 0x a0 */ -112, 66, 0, 1, 65,162, 0, 80, 72, 0, 0, 20,124, 9, 0, 64, /* 0x b0 */ -125, 41, 74, 20, 65,162,255,101, 65,161, 0, 60, 57, 0, 0, 1, /* 0x c0 */ -124, 9, 0, 64,125, 41, 74, 20, 65,162,255, 81, 65,161, 0, 40, /* 0x d0 */ -125, 41, 72, 21, 65,162,255, 69,125, 8, 65, 20,124, 9, 0, 64, /* 0x e0 */ -125, 41, 74, 20, 65,162,255, 53, 65,160,255,232, 57, 8, 0, 2, /* 0x f0 */ - 72, 0, 0, 16,125, 41, 72, 21, 65,162,255, 33,125, 8, 65, 20, /* 0x 100 */ - 32,234,250,255, 57, 8, 0, 2,125, 8, 1,148,124,234, 42, 20, /* 0x 110 */ -125, 9, 3,166,141, 7, 0, 1,157, 5, 0, 1, 66, 0,255,248, /* 0x 120 */ - 75,255,255, 28,128, 6, 0, 0,125,168, 3,166, 56,165, 0, 1, /* 0x 130 */ - 56, 99, 0, 1,124,160, 40, 80,124,100, 24, 80,144,166, 0, 0, /* 0x 140 */ - 78,128, 0, 32,127,200, 2,166, 56,192, 0, 50,128,126, 0, 4, /* 0x 150 */ - 56,160, 0, 7,124, 99,242, 20, 56,128, 16, 0, 56, 99, 16, 11, /* 0x 160 */ - 56, 0, 0, 90, 84, 99, 0, 38, 68, 0, 0, 2, 65,131, 0, 32, /* 0x 170 */ -124,104, 3,166, 56,193, 0,124,124,101, 27,120,127,233, 3,166, /* 0x 180 */ -128,158, 0, 4, 56,126, 0, 12, 78,128, 4, 32,127,224, 0, 8, /* 0x 190 */ -148, 33,255,128,188, 65, 0, 4,127,232, 2,166, 75,255,255,169 /* 0x 1a0 */ +unsigned char linux_elfppc32_loader[448] = { + 72, 0, 1,177,124, 0, 41,236,144,166, 0, 0,124,132, 26, 20, /* 0x 0 */ + 60, 0,128, 0, 61, 32,128, 0, 56, 99,255,255, 56,165,255,255, /* 0x 10 */ + 57, 64,255,255,125,168, 2,166, 72, 0, 1, 12, 57, 32, 0, 1, /* 0x 20 */ +125, 41, 28, 44, 56, 99, 0, 4,124, 9, 0, 64,125, 41, 72, 20, /* 0x 30 */ + 97, 41, 0, 1, 78,128, 0, 32,141, 3, 0, 1,157, 5, 0, 1, /* 0x 40 */ +124, 9, 0, 64,125, 41, 74, 20, 65,162,255,213, 65,129,255,236, /* 0x 50 */ + 56,224, 0, 1, 72, 0, 0, 20, 56,231,255,255,125, 41, 72, 21, /* 0x 60 */ + 65,162,255,189,124,231, 57, 20,125, 41, 72, 21, 65,162,255,177, /* 0x 70 */ +124,231, 57, 20,124, 9, 0, 64,125, 41, 74, 20, 65,162,255,161, /* 0x 80 */ + 65,160,255,216, 57, 0, 0, 0, 52,231,255,253, 84,231, 64, 46, /* 0x 90 */ + 65,128, 0, 32,140, 67, 0, 1,124,234, 16,249,125, 74, 14,112, /* 0x a0 */ + 65,130, 0,148,112, 66, 0, 1, 65,162, 0, 80, 72, 0, 0, 20, /* 0x b0 */ +124, 9, 0, 64,125, 41, 74, 20, 65,162,255,101, 65,161, 0, 60, /* 0x c0 */ + 57, 0, 0, 1,124, 9, 0, 64,125, 41, 74, 20, 65,162,255, 81, /* 0x d0 */ + 65,161, 0, 40,125, 41, 72, 21, 65,162,255, 69,125, 8, 65, 20, /* 0x e0 */ +124, 9, 0, 64,125, 41, 74, 20, 65,162,255, 53, 65,160,255,232, /* 0x f0 */ + 57, 8, 0, 2, 72, 0, 0, 16,125, 41, 72, 21, 65,162,255, 33, /* 0x 100 */ +125, 8, 65, 20, 32,234,250,255, 57, 8, 0, 2,125, 8, 1,148, /* 0x 110 */ +124,234, 42, 20,125, 9, 3,166,141, 7, 0, 1,157, 5, 0, 1, /* 0x 120 */ + 66, 0,255,248, 56,224, 1, 0,124, 7, 41,236,124, 7, 26, 44, /* 0x 130 */ + 75,255,255, 16,128, 6, 0, 0,125,168, 3,166, 56,165, 0, 1, /* 0x 140 */ + 56, 99, 0, 1,124,160, 40, 80,124,100, 24, 80,144,166, 0, 0, /* 0x 150 */ + 78,128, 0, 32,127,200, 2,166, 56,192, 0, 50,128,126, 0, 4, /* 0x 160 */ + 56,160, 0, 7,124, 99,242, 20, 56,128, 16, 0, 56, 99, 16, 11, /* 0x 170 */ + 56, 0, 0, 90, 84, 99, 0, 38, 68, 0, 0, 2, 65,131, 0, 32, /* 0x 180 */ +124,104, 3,166, 56,193, 0,124,124,101, 27,120,127,233, 3,166, /* 0x 190 */ +128,158, 0, 4, 56,126, 0, 12, 78,128, 4, 32,127,224, 0, 8, /* 0x 1a0 */ +148, 33,255,128,188, 65, 0, 4,127,232, 2,166, 75,255,255,169 /* 0x 1b0 */ }; diff --git a/src/stub/ppc_d_nrv2e.S b/src/stub/ppc_d_nrv2e.S index 5c8b4f8b..237580a9 100644 --- a/src/stub/ppc_d_nrv2e.S +++ b/src/stub/ppc_d_nrv2e.S @@ -31,6 +31,8 @@ #include "ppc_regs.h" +SZ_DLINE=128 # size of data cache line in Apple G5 + /* Returns 0 on success; non-zero on failure. */ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst) @@ -47,6 +49,8 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst) #define bits a6 #define disp a7 + dcbtst 0,dst # prime dcache for store + stw dst,0(ldst) # original dst add lsrc,lsrc,src # input eof @@ -57,7 +61,7 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst) li disp,-1 # initial displacement mflr t3 # return address - b top_n2e + b bot_n2e /* jump on next bit, with branch prediction: y==>likely; n==>unlikely cr0 is set by the cmpl ["compare logical"==>unsigned]: @@ -95,8 +99,8 @@ get32: lit_n2e: #define tmp len - lbzu tmp,1(src) - stbu tmp,1(dst) + lbzu tmp,1(src) # tmp= *++src; + stbu tmp,1(dst) # *++dst= tmp; #undef tmp top_n2e: jnextb1y lit_n2e @@ -151,6 +155,17 @@ short_n2e: stbu tmp,1(dst) #undef tmp bdnz+ short_n2e +bot_n2e: +/* This "prefetch for store" is simple, small, and effective. Matches + usually occur more frequently than once per 128 bytes, but G4 line size + is only 32 bytes anyway. Assume that an 'unnecessary' dcbtst costs only + about as much as a hit. The counter register is free at top_n2e, so we could + pace the dcbtst optimally; but that takes 7 or 8 instructions of space. +*/ + li back,2*SZ_DLINE + dcbtst back,dst # 2 lines ahead [-1 for stbu] + dcbt back,src # jump start auto prefetch at page boundary +/* Auto prefetch for Read quits at page boundary; needs 2 misses to restart. */ b top_n2e #undef back