Replace the experimental ARM64 filter with a new experimental version.
This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too.
This commit is contained in:
parent
f644473a21
commit
8370ec8edf
|
@ -49,13 +49,14 @@
|
|||
* Filter for SPARC binaries.
|
||||
*/
|
||||
|
||||
#define LZMA_FILTER_ARM64 LZMA_VLI_C(0x3FDB87B33B27000B)
|
||||
/**<
|
||||
* Filter for ARM64 binaries.
|
||||
*
|
||||
* \note In contrast to the other BCJ filters, this uses
|
||||
* its own options structure, lzma_options_arm64.
|
||||
*/
|
||||
#define LZMA_FILTER_ARM64 LZMA_VLI_C(0x3FDB87B33B27010B)
|
||||
/**<
|
||||
* Filter for ARM64 binaries.
|
||||
*
|
||||
* \note THIS IS AN EXPERIMENTAL VERSION WHICH WILL
|
||||
* STILL CHANGE! FILES CREATED WITH THIS
|
||||
* WILL NOT BE SUPPORTED IN THE FUTURE!
|
||||
*/
|
||||
|
||||
/**
|
||||
* \brief Options for BCJ filters (except ARM64)
|
||||
|
@ -95,29 +96,3 @@ typedef struct {
|
|||
uint32_t start_offset;
|
||||
|
||||
} lzma_options_bcj;
|
||||
|
||||
/**
|
||||
* \brief Options for the ARM64 filter
|
||||
*
|
||||
* This filter never changes the size of the data.
|
||||
* Specifying options is mandatory.
|
||||
*/
|
||||
typedef struct {
|
||||
/**
|
||||
* \brief How wide range of relative addresses are converted
|
||||
*
|
||||
* The ARM64 BL instruction has 26-bit immediate field that encodes
|
||||
* a relative address as a multiple of four bytes, so the effective
|
||||
* range is 2^28 bytes (+/-128 MiB).
|
||||
*
|
||||
* If width is 28 bits (LZMA_ARM64_WIDTH_MAX), then all BL
|
||||
* instructions will be converted. This has a downside of some
|
||||
* false matches that make compression worse. The best value
|
||||
* depends on the input file and the differences can be significant;
|
||||
* with large executables the maximum value is sometimes the best.
|
||||
*/
|
||||
uint32_t width;
|
||||
# define LZMA_ARM64_WIDTH_MIN 18
|
||||
# define LZMA_ARM64_WIDTH_MAX 28
|
||||
# define LZMA_ARM64_WIDTH_DEFAULT 26
|
||||
} lzma_options_arm64;
|
||||
|
|
|
@ -98,13 +98,13 @@ static const struct {
|
|||
},
|
||||
#endif
|
||||
#if defined(HAVE_ENCODER_ARM64) || defined(HAVE_DECODER_ARM64)
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.options_size = sizeof(lzma_options_arm64),
|
||||
.non_last_ok = true,
|
||||
.last_ok = false,
|
||||
.changes_size = false,
|
||||
},
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.options_size = sizeof(lzma_options_bcj),
|
||||
.non_last_ok = true,
|
||||
.last_ok = false,
|
||||
.changes_size = false,
|
||||
},
|
||||
#endif
|
||||
#if defined(HAVE_ENCODER_SPARC) || defined(HAVE_DECODER_SPARC)
|
||||
{
|
||||
|
|
|
@ -100,12 +100,12 @@ static const lzma_filter_decoder decoders[] = {
|
|||
},
|
||||
#endif
|
||||
#ifdef HAVE_DECODER_ARM64
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.init = &lzma_simple_arm64_decoder_init,
|
||||
.memusage = NULL,
|
||||
.props_decode = &lzma_arm64_props_decode,
|
||||
},
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.init = &lzma_simple_arm64_decoder_init,
|
||||
.memusage = NULL,
|
||||
.props_decode = &lzma_simple_props_decode,
|
||||
},
|
||||
#endif
|
||||
#ifdef HAVE_DECODER_SPARC
|
||||
{
|
||||
|
|
|
@ -127,15 +127,14 @@ static const lzma_filter_encoder encoders[] = {
|
|||
},
|
||||
#endif
|
||||
#ifdef HAVE_ENCODER_ARM64
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.init = &lzma_simple_arm64_encoder_init,
|
||||
.memusage = NULL,
|
||||
.block_size = NULL,
|
||||
.props_size_get = NULL,
|
||||
.props_size_fixed = 1,
|
||||
.props_encode = &lzma_arm64_props_encode,
|
||||
},
|
||||
{
|
||||
.id = LZMA_FILTER_ARM64,
|
||||
.init = &lzma_simple_arm64_encoder_init,
|
||||
.memusage = NULL,
|
||||
.block_size = NULL,
|
||||
.props_size_get = &lzma_simple_props_size,
|
||||
.props_encode = &lzma_simple_props_encode,
|
||||
},
|
||||
#endif
|
||||
#ifdef HAVE_ENCODER_SPARC
|
||||
{
|
||||
|
|
|
@ -3,6 +3,22 @@
|
|||
/// \file arm64.c
|
||||
/// \brief Filter for ARM64 binaries
|
||||
///
|
||||
/// This converts ARM64 relative addresses in the BL and ADRP immediates
|
||||
/// to absolute values to increase redundancy of ARM64 code.
|
||||
///
|
||||
/// Unlike the older BCJ filters, this handles zeros specially. This way
|
||||
/// the filter won't be counterproductive on Linux kernel modules, object
|
||||
/// files, and static libraries where the immediates are all zeros (to be
|
||||
/// filled later by a linker). Usually this has no downsides but with bad
|
||||
/// luck it can reduce the effectiveness of the filter and trying a different
|
||||
/// start offset can mitigate the problem.
|
||||
///
|
||||
/// Converting B or ADR instructions was also tested but it's not useful.
|
||||
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
|
||||
/// These are typical for loops and if-statements. Encoding them to their
|
||||
/// absolute address reduces redundancy since many of the small relative
|
||||
/// jump values are repeated, but very few of the absolute addresses are.
|
||||
//
|
||||
// Authors: Lasse Collin
|
||||
// Jia Tan
|
||||
//
|
||||
|
@ -13,126 +29,110 @@
|
|||
|
||||
#include "simple_private.h"
|
||||
|
||||
#ifdef HAVE_ENCODER_ARM64
|
||||
# include "simple_encoder.h"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_DECODER_ARM64
|
||||
# include "simple_decoder.h"
|
||||
#endif
|
||||
static uint32_t
|
||||
arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
|
||||
{
|
||||
if (!is_encoder)
|
||||
pc = 0U - pc;
|
||||
|
||||
uint32_t dest = src + pc;
|
||||
if ((dest & mask) == 0)
|
||||
dest = pc;
|
||||
|
||||
// In ARM64, there are two main branch instructions.
|
||||
// bl - branch and link: Calls a function and stores the return address.
|
||||
// b - branch: Jumps to a location, but does not store a return address.
|
||||
//
|
||||
// After some benchmarking, it was determined that only the bl instruction
|
||||
// is beneficial for compression. A majority of the jumps for the b
|
||||
// instruction are very small (+/- 0xFF). These are typical for loops
|
||||
// and if-statements. Encoding them to their absolute address reduces
|
||||
// redundancy since many of the small relative jump values are repeated,
|
||||
// but very few of the absolute addresses are.
|
||||
//
|
||||
// Thus, only the bl instruction will be encoded and decoded.
|
||||
// The bl instruction is 32 bits in size. The highest 6 bits contain
|
||||
// the opcode (10 0101 == 0x25) and the remaining 26 bits are
|
||||
// the immediate value. The immediate is a signed integer that
|
||||
// encodes the target address as a multiple of four bytes so
|
||||
// the range is +/-128 MiB.
|
||||
|
||||
// The 6-bit op code for the bl instruction in ARM64
|
||||
#define ARM64_BL_OPCODE 0x25
|
||||
|
||||
// Once the 26-bit immediate is multiple by four, the address is 28 bits
|
||||
// with the two lowest bits being zero. This mask is used to clear the
|
||||
// unwanted bits.
|
||||
#define ADDR28_MASK 0x0FFFFFFCU
|
||||
|
||||
|
||||
typedef struct {
|
||||
uint32_t sign_bit;
|
||||
uint32_t sign_mask;
|
||||
} lzma_simple_arm64;
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
static size_t
|
||||
arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder,
|
||||
arm64_code(void *simple lzma_attribute((__unused__)),
|
||||
uint32_t now_pos, bool is_encoder,
|
||||
uint8_t *buffer, size_t size)
|
||||
{
|
||||
const lzma_simple_arm64 *simple = simple_ptr;
|
||||
const uint32_t sign_bit = simple->sign_bit;
|
||||
const uint32_t sign_mask = simple->sign_mask;
|
||||
|
||||
size_t i;
|
||||
|
||||
// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
|
||||
// with auto-vectorization that is enabled by default with -O2.
|
||||
// Even -Os, which doesn't use vectorization, produces faster code.
|
||||
// Disabling vectorization with -O2 gives good speed (faster than -Os)
|
||||
// and reasonable code size.
|
||||
//
|
||||
// Such vectorization bloat happens with -O2 when targeting ARM64 too
|
||||
// but performance hasn't been tested.
|
||||
//
|
||||
// Clang 14 and 15 won't auto-vectorize this loop if the condition
|
||||
// for ADRP is replaced with the commented-out version. However,
|
||||
// at least Clang 14.0.6 doesn't generate as fast code with that
|
||||
// condition. The commented-out code is also bigger.
|
||||
//
|
||||
// GCC 12.2 on x86-64 with -O2 produces good code with both versions
|
||||
// of the ADRP if-statement although the single-branch version is
|
||||
// slightly faster and smaller than the commented-out version.
|
||||
// Speed is similar to non-vectorized clang -O2.
|
||||
#ifdef __clang__
|
||||
# pragma clang loop vectorize(disable)
|
||||
#endif
|
||||
for (i = 0; i + 4 <= size; i += 4) {
|
||||
if ((buffer[i + 3] >> 2) == ARM64_BL_OPCODE) {
|
||||
// Get the relative 28-bit address from
|
||||
// the 26-bit immediate.
|
||||
uint32_t src = read32le(buffer + i);
|
||||
src <<= 2;
|
||||
src &= ADDR28_MASK;
|
||||
const uint32_t pc = (uint32_t)(now_pos + i);
|
||||
uint32_t instr = read32le(buffer + i);
|
||||
|
||||
// When the conversion width isn't the maximum,
|
||||
// check that the highest bits are either all zero
|
||||
// or all one.
|
||||
if ((src & sign_mask) != 0
|
||||
&& (src & sign_mask) != sign_mask)
|
||||
continue;
|
||||
if ((instr >> 26) == 0x25) {
|
||||
// BL instruction:
|
||||
// The full 26-bit immediate is converted.
|
||||
// The range is +/-128 MiB.
|
||||
//
|
||||
// Using the full range is helps quite a lot with
|
||||
// big executables. Smaller range would reduce false
|
||||
// positives in non-code sections of the input though
|
||||
// so this is a compromise that slightly favors big
|
||||
// files. With the full range only six bits of the 32
|
||||
// need to match to trigger a conversion.
|
||||
const uint32_t mask26 = 0x03FFFFFF;
|
||||
const uint32_t src = instr & mask26;
|
||||
instr = 0x94000000;
|
||||
|
||||
// Some files like static libraries or Linux kernel
|
||||
// modules have the immediate value filled with
|
||||
// zeros. Converting these placeholder values would
|
||||
// make compression worse so don't touch them.
|
||||
if (src == 0)
|
||||
continue;
|
||||
|
||||
const uint32_t pc = now_pos + (uint32_t)(i);
|
||||
instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
|
||||
& mask26;
|
||||
write32le(buffer + i, instr);
|
||||
|
||||
uint32_t dest;
|
||||
if (is_encoder)
|
||||
dest = pc + src;
|
||||
else
|
||||
dest = src - pc;
|
||||
/*
|
||||
// This is a more readable version of the one below but this
|
||||
// has two branches. It results in bigger and slower code.
|
||||
} else if ((instr & 0x9FF00000) == 0x90000000
|
||||
|| (instr & 0x9FF00000) == 0x90F00000) {
|
||||
*/
|
||||
// This is only a rotation, addition, and testing that
|
||||
// none of the bits covered by the bitmask are set.
|
||||
} else if (((((instr << 8) | (instr >> 24))
|
||||
+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
|
||||
// ADRP instruction:
|
||||
// Only values in the range +/-512 MiB are converted.
|
||||
//
|
||||
// Using less than the full +/-4 GiB range reduces
|
||||
// false positives on non-code sections of the input
|
||||
// while being excellent for executables up to 512 MiB.
|
||||
// The positive effect of ADRP conversion is smaller
|
||||
// than that of BL but it also doesn't hurt so much in
|
||||
// non-code sections of input because, with +/-512 MiB
|
||||
// range, nine bits of 32 need to match to trigger a
|
||||
// conversion (two 10-bit match choices = 9 bits).
|
||||
const uint32_t src = ((instr >> 29) & 3)
|
||||
| ((instr >> 3) & 0x0003FFFC);
|
||||
instr &= 0x9000001F;
|
||||
|
||||
dest &= ADDR28_MASK;
|
||||
if (src == 0)
|
||||
continue;
|
||||
|
||||
// Sign-extend negative values or unset sign bits
|
||||
// from positive values.
|
||||
if (dest & sign_bit)
|
||||
dest |= sign_mask;
|
||||
else
|
||||
dest &= ~sign_mask;
|
||||
const uint32_t dest = arm64_conv(
|
||||
src, pc >> 12, 0x3FFFF, is_encoder);
|
||||
|
||||
assert((dest & sign_mask) == 0
|
||||
|| (dest & sign_mask) == sign_mask);
|
||||
|
||||
// Since also the decoder will ignore src values
|
||||
// of 0, we must ensure that nothing is ever encoded
|
||||
// to 0. This is achieved by encoding such values
|
||||
// as pc instead. When decoding, pc will be first
|
||||
// converted to 0 which we will catch here and fix.
|
||||
if (dest == 0) {
|
||||
// We cannot get here if pc is zero because
|
||||
// then src would need to be zero too but we
|
||||
// already ensured that src != 0.
|
||||
assert((pc & ADDR28_MASK) != 0);
|
||||
dest = is_encoder ? pc : 0U - pc;
|
||||
dest &= ADDR28_MASK;
|
||||
|
||||
if (dest & sign_bit)
|
||||
dest |= sign_mask;
|
||||
else
|
||||
dest &= ~sign_mask;
|
||||
}
|
||||
|
||||
assert((dest & sign_mask) == 0
|
||||
|| (dest & sign_mask) == sign_mask);
|
||||
assert((dest & ~ADDR28_MASK) == 0);
|
||||
|
||||
// Construct and store the modified 32-bit instruction.
|
||||
dest >>= 2;
|
||||
dest |= (uint32_t)ARM64_BL_OPCODE << 26;
|
||||
write32le(buffer + i, dest);
|
||||
instr |= (dest & 3) << 29;
|
||||
instr |= (dest & 0x0003FFFC) << 3;
|
||||
instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
|
||||
write32le(buffer + i, instr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -140,81 +140,12 @@ arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder,
|
|||
}
|
||||
|
||||
|
||||
#ifdef HAVE_ENCODER_ARM64
|
||||
extern lzma_ret
|
||||
lzma_arm64_props_encode(const void *options, uint8_t *out)
|
||||
{
|
||||
const lzma_options_arm64 *const opt = options;
|
||||
|
||||
if (opt->width < LZMA_ARM64_WIDTH_MIN
|
||||
|| opt->width > LZMA_ARM64_WIDTH_MAX)
|
||||
return LZMA_OPTIONS_ERROR;
|
||||
|
||||
out[0] = (uint8_t)(opt->width - LZMA_ARM64_WIDTH_MIN);
|
||||
return LZMA_OK;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef HAVE_DECODER_ARM64
|
||||
extern lzma_ret
|
||||
lzma_arm64_props_decode(void **options, const lzma_allocator *allocator,
|
||||
const uint8_t *props, size_t props_size)
|
||||
{
|
||||
if (props_size != 1)
|
||||
return LZMA_OPTIONS_ERROR;
|
||||
|
||||
if (props[0] > LZMA_ARM64_WIDTH_MAX - LZMA_ARM64_WIDTH_MIN)
|
||||
return LZMA_OPTIONS_ERROR;
|
||||
|
||||
lzma_options_arm64 *opt = lzma_alloc(sizeof(lzma_options_arm64),
|
||||
allocator);
|
||||
if (opt == NULL)
|
||||
return LZMA_MEM_ERROR;
|
||||
|
||||
opt->width = props[0] + LZMA_ARM64_WIDTH_MIN;
|
||||
*options = opt;
|
||||
return LZMA_OK;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static lzma_ret
|
||||
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
|
||||
const lzma_filter_info *filters, bool is_encoder)
|
||||
{
|
||||
if (filters[0].options == NULL)
|
||||
return LZMA_PROG_ERROR;
|
||||
|
||||
const lzma_options_arm64 *opt = filters[0].options;
|
||||
if (opt->width < LZMA_ARM64_WIDTH_MIN
|
||||
|| opt->width > LZMA_ARM64_WIDTH_MAX)
|
||||
return LZMA_OPTIONS_ERROR;
|
||||
|
||||
const lzma_ret ret = lzma_simple_coder_init(next, allocator, filters,
|
||||
&arm64_code, sizeof(lzma_simple_arm64), 4, 4,
|
||||
is_encoder, false);
|
||||
|
||||
if (ret == LZMA_OK) {
|
||||
lzma_simple_coder *coder = next->coder;
|
||||
lzma_simple_arm64 *simple = coder->simple;
|
||||
|
||||
// This will be used to detect if the value, after
|
||||
// conversion has been done, is negative. The location
|
||||
// of the sign bit depends on the conversion width.
|
||||
simple->sign_bit = UINT32_C(1) << (opt->width - 1);
|
||||
|
||||
// When conversion width isn't the maximum, the highest
|
||||
// bits must all be either zero or one, that is, they
|
||||
// all are copies of the sign bit. This mask is used to
|
||||
// (1) detect if input value is in the range specified
|
||||
// by the conversion width and (2) clearing or setting
|
||||
// the high bits after conversion (integers can wrap around).
|
||||
simple->sign_mask = (UINT32_C(1) << 28) - simple->sign_bit;
|
||||
}
|
||||
|
||||
return ret;
|
||||
return lzma_simple_coder_init(next, allocator, filters,
|
||||
&arm64_code, 0, 4, 4, is_encoder, true);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -19,8 +19,4 @@ extern lzma_ret lzma_simple_props_decode(
|
|||
void **options, const lzma_allocator *allocator,
|
||||
const uint8_t *props, size_t props_size);
|
||||
|
||||
extern lzma_ret lzma_arm64_props_decode(
|
||||
void **options, const lzma_allocator *allocator,
|
||||
const uint8_t *props, size_t props_size);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -20,6 +20,4 @@ extern lzma_ret lzma_simple_props_size(uint32_t *size, const void *options);
|
|||
|
||||
extern lzma_ret lzma_simple_props_encode(const void *options, uint8_t *out);
|
||||
|
||||
extern lzma_ret lzma_arm64_props_encode(const void *options, uint8_t *out);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -374,7 +374,7 @@ parse_real(args_info *args, int argc, char **argv)
|
|||
|
||||
case OPT_ARM64:
|
||||
coder_add_filter(LZMA_FILTER_ARM64,
|
||||
options_arm64(optarg));
|
||||
options_bcj(optarg));
|
||||
break;
|
||||
|
||||
case OPT_SPARC:
|
||||
|
|
|
@ -1034,9 +1034,16 @@ message_filters_to_str(char buf[FILTERS_STR_SIZE],
|
|||
}
|
||||
|
||||
case LZMA_FILTER_ARM64: {
|
||||
const lzma_options_arm64 *opt = filters[i].options;
|
||||
my_snprintf(&pos, &left, "arm64=width=%" PRIu32,
|
||||
opt->width);
|
||||
// FIXME TODO: Merge with the above generic BCJ list
|
||||
// once the Filter ID is changed to the final value.
|
||||
const lzma_options_bcj *opt = filters[i].options;
|
||||
my_snprintf(&pos, &left, "arm64");
|
||||
|
||||
// Show the start offset only when really needed.
|
||||
if (opt != NULL && opt->start_offset != 0)
|
||||
my_snprintf(&pos, &left, "=start=%" PRIu32,
|
||||
opt->start_offset);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -224,45 +224,6 @@ options_bcj(const char *str)
|
|||
}
|
||||
|
||||
|
||||
///////////
|
||||
// ARM64 //
|
||||
///////////
|
||||
|
||||
enum {
|
||||
OPT_WIDTH,
|
||||
};
|
||||
|
||||
|
||||
static void
|
||||
set_arm64(void *options, unsigned key, uint64_t value,
|
||||
const char *valuestr lzma_attribute((__unused__)))
|
||||
{
|
||||
lzma_options_arm64 *opt = options;
|
||||
switch (key) {
|
||||
case OPT_WIDTH:
|
||||
opt->width = value;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern lzma_options_arm64 *
|
||||
options_arm64(const char *str)
|
||||
{
|
||||
static const option_map opts[] = {
|
||||
{ "width", NULL, LZMA_ARM64_WIDTH_MIN, LZMA_ARM64_WIDTH_MAX },
|
||||
{ NULL, NULL, 0, 0 }
|
||||
};
|
||||
|
||||
lzma_options_arm64 *options = xmalloc(sizeof(lzma_options_arm64));
|
||||
options->width = LZMA_ARM64_WIDTH_DEFAULT;
|
||||
|
||||
parse_options(str, opts, &set_arm64, options);
|
||||
|
||||
return options;
|
||||
}
|
||||
|
||||
|
||||
//////////
|
||||
// LZMA //
|
||||
//////////
|
||||
|
|
|
@ -24,13 +24,6 @@ extern lzma_options_delta *options_delta(const char *str);
|
|||
extern lzma_options_bcj *options_bcj(const char *str);
|
||||
|
||||
|
||||
/// \brief Parser for ARM64 options
|
||||
///
|
||||
/// \return Pointer to allocated options structure.
|
||||
/// Doesn't return on error.
|
||||
extern lzma_options_arm64 *options_arm64(const char *str);
|
||||
|
||||
|
||||
/// \brief Parser for LZMA options
|
||||
///
|
||||
/// \return Pointer to allocated options structure.
|
||||
|
|
Loading…
Reference in New Issue