liblzma: LZMA decoder improvements.
This adds macros for bittree decoding which prepares the code for alternative C versions and inline assembly.
This commit is contained in:
parent
de5c5e4176
commit
e0c0ee475c
|
@ -24,9 +24,8 @@
|
||||||
|
|
||||||
// Minimum number of input bytes to safely decode one LZMA symbol.
|
// Minimum number of input bytes to safely decode one LZMA symbol.
|
||||||
// The worst case is that we decode 22 bits using probabilities and 26
|
// The worst case is that we decode 22 bits using probabilities and 26
|
||||||
// direct bits. This may decode at maximum 20 bytes of input plus one
|
// direct bits. This may decode at maximum 20 bytes of input.
|
||||||
// extra byte after the final EOPM normalization.
|
#define LZMA_IN_REQUIRED 20
|
||||||
#define LZMA_IN_REQUIRED 21
|
|
||||||
|
|
||||||
|
|
||||||
// Macros for (somewhat) size-optimized code.
|
// Macros for (somewhat) size-optimized code.
|
||||||
|
@ -73,32 +72,22 @@ do { \
|
||||||
symbol = 1; \
|
symbol = 1; \
|
||||||
rc_if_0(ld.choice) { \
|
rc_if_0(ld.choice) { \
|
||||||
rc_update_0(ld.choice); \
|
rc_update_0(ld.choice); \
|
||||||
rc_bit(ld.low[pos_state][symbol], , ); \
|
rc_bittree3(ld.low[pos_state], \
|
||||||
rc_bit(ld.low[pos_state][symbol], , ); \
|
-LEN_LOW_SYMBOLS + MATCH_LEN_MIN); \
|
||||||
rc_bit(ld.low[pos_state][symbol], , ); \
|
target = symbol; \
|
||||||
target = symbol - LEN_LOW_SYMBOLS + MATCH_LEN_MIN; \
|
|
||||||
} else { \
|
} else { \
|
||||||
rc_update_1(ld.choice); \
|
rc_update_1(ld.choice); \
|
||||||
rc_if_0(ld.choice2) { \
|
rc_if_0(ld.choice2) { \
|
||||||
rc_update_0(ld.choice2); \
|
rc_update_0(ld.choice2); \
|
||||||
rc_bit(ld.mid[pos_state][symbol], , ); \
|
rc_bittree3(ld.mid[pos_state], -LEN_MID_SYMBOLS \
|
||||||
rc_bit(ld.mid[pos_state][symbol], , ); \
|
+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS); \
|
||||||
rc_bit(ld.mid[pos_state][symbol], , ); \
|
target = symbol; \
|
||||||
target = symbol - LEN_MID_SYMBOLS \
|
|
||||||
+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \
|
|
||||||
} else { \
|
} else { \
|
||||||
rc_update_1(ld.choice2); \
|
rc_update_1(ld.choice2); \
|
||||||
rc_bit(ld.high[symbol], , ); \
|
rc_bittree8(ld.high, -LEN_HIGH_SYMBOLS \
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
rc_bit(ld.high[symbol], , ); \
|
|
||||||
target = symbol - LEN_HIGH_SYMBOLS \
|
|
||||||
+ MATCH_LEN_MIN \
|
+ MATCH_LEN_MIN \
|
||||||
+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \
|
+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS); \
|
||||||
|
target = symbol; \
|
||||||
} \
|
} \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
@ -369,8 +358,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
|
|
||||||
// If there is not enough room for another LZMA symbol
|
// If there is not enough room for another LZMA symbol
|
||||||
// go to Resumable mode.
|
// go to Resumable mode.
|
||||||
if (rc_in_pos + LZMA_IN_REQUIRED > in_size
|
if (unlikely(rc_in_end - rc_in_ptr < LZMA_IN_REQUIRED
|
||||||
|| dict.pos == dict.limit)
|
|| dict.pos == dict.limit))
|
||||||
goto slow;
|
goto slow;
|
||||||
|
|
||||||
// Decode the first bit from the next LZMA symbol.
|
// Decode the first bit from the next LZMA symbol.
|
||||||
|
@ -390,64 +379,14 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
probs = literal_subcoder(coder->literal,
|
probs = literal_subcoder(coder->literal,
|
||||||
literal_context_bits, literal_pos_mask,
|
literal_context_bits, literal_pos_mask,
|
||||||
dict.pos, dict_get(&dict, 0));
|
dict.pos, dict_get(&dict, 0));
|
||||||
symbol = 1;
|
|
||||||
|
|
||||||
if (is_literal_state(state)) {
|
if (is_literal_state(state)) {
|
||||||
// Decode literal without match byte.
|
// Decode literal without match byte.
|
||||||
// We need to decode 8 bits, so instead
|
rc_bittree8(probs, 0);
|
||||||
// of looping from 1 - 8, we unroll the
|
|
||||||
// loop for a speed optimization.
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
} else {
|
} else {
|
||||||
// Decode literal with match byte.
|
// Decode literal with match byte.
|
||||||
//
|
rc_matched_literal(probs,
|
||||||
// We store the byte we compare against
|
dict_get(&dict, rep0));
|
||||||
// ("match byte") to "len" to minimize the
|
|
||||||
// number of variables we need to store
|
|
||||||
// between decoder calls.
|
|
||||||
|
|
||||||
len = (uint32_t)(dict_get(&dict, rep0)) << 1;
|
|
||||||
|
|
||||||
// The usage of "offset" allows omitting some
|
|
||||||
// branches, which should give tiny speed
|
|
||||||
// improvement on some CPUs. "offset" gets
|
|
||||||
// set to zero if match_bit didn't match.
|
|
||||||
offset = 0x100;
|
|
||||||
|
|
||||||
// Unroll the loop.
|
|
||||||
uint32_t match_bit;
|
|
||||||
uint32_t subcoder_index;
|
|
||||||
|
|
||||||
# define decode_with_match_bit \
|
|
||||||
match_bit = len & offset; \
|
|
||||||
subcoder_index = offset + match_bit + symbol; \
|
|
||||||
rc_bit(probs[subcoder_index], \
|
|
||||||
offset &= ~match_bit, \
|
|
||||||
offset &= match_bit)
|
|
||||||
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
len <<= 1;
|
|
||||||
decode_with_match_bit;
|
|
||||||
# undef decode_match_bit
|
|
||||||
}
|
}
|
||||||
|
|
||||||
state = next_state[state];
|
state = next_state[state];
|
||||||
|
@ -501,18 +440,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
// The next 6 bits determine how to decode the
|
// The next 6 bits determine how to decode the
|
||||||
// rest of the distance.
|
// rest of the distance.
|
||||||
probs = coder->dist_slot[get_dist_state(len)];
|
probs = coder->dist_slot[get_dist_state(len)];
|
||||||
symbol = 1;
|
|
||||||
|
|
||||||
rc_bit(probs[symbol], , );
|
rc_bittree6(probs, -DIST_SLOTS);
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
rc_bit(probs[symbol], , );
|
|
||||||
|
|
||||||
// Get rid of the highest bit that was needed for
|
|
||||||
// indexing of the probability array.
|
|
||||||
symbol -= DIST_SLOTS;
|
|
||||||
assert(symbol <= 63);
|
assert(symbol <= 63);
|
||||||
|
|
||||||
if (symbol < DIST_MODEL_START) {
|
if (symbol < DIST_MODEL_START) {
|
||||||
|
@ -540,6 +469,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
assert(limit <= 5);
|
assert(limit <= 5);
|
||||||
rep0 <<= limit;
|
rep0 <<= limit;
|
||||||
assert(rep0 <= 96);
|
assert(rep0 <= 96);
|
||||||
|
|
||||||
// -1 is fine, because we start
|
// -1 is fine, because we start
|
||||||
// decoding at probs[1], not probs[0].
|
// decoding at probs[1], not probs[0].
|
||||||
// NOTE: This violates the C standard,
|
// NOTE: This violates the C standard,
|
||||||
|
@ -553,106 +483,51 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
|
||||||
probs = coder->pos_special + rep0
|
probs = coder->pos_special + rep0
|
||||||
- symbol - 1;
|
- symbol - 1;
|
||||||
symbol = 1;
|
symbol = 1;
|
||||||
offset = 0;
|
offset = 1;
|
||||||
|
|
||||||
switch (limit) {
|
// Variable number (1-5) of bits
|
||||||
case 5:
|
// from a reverse bittree. This
|
||||||
assert(offset == 0);
|
// isn't worth manual unrolling.
|
||||||
rc_bit(probs[symbol], ,
|
do {
|
||||||
rep0 += 1U);
|
rc_bit_add_if_1(probs,
|
||||||
++offset;
|
rep0, offset);
|
||||||
--limit;
|
offset <<= 1;
|
||||||
case 4:
|
} while (--limit > 0);
|
||||||
rc_bit(probs[symbol], ,
|
|
||||||
rep0 += 1U << offset);
|
|
||||||
++offset;
|
|
||||||
--limit;
|
|
||||||
case 3:
|
|
||||||
rc_bit(probs[symbol], ,
|
|
||||||
rep0 += 1U << offset);
|
|
||||||
++offset;
|
|
||||||
--limit;
|
|
||||||
case 2:
|
|
||||||
rc_bit(probs[symbol], ,
|
|
||||||
rep0 += 1U << offset);
|
|
||||||
++offset;
|
|
||||||
--limit;
|
|
||||||
case 1:
|
|
||||||
// We need "symbol" only for
|
|
||||||
// indexing the probability
|
|
||||||
// array, thus we can use
|
|
||||||
// rc_bit_last() here to
|
|
||||||
// omit the unneeded updating
|
|
||||||
// of "symbol".
|
|
||||||
rc_bit_last(probs[symbol], ,
|
|
||||||
rep0 += 1U << offset);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// The distance is >= 128. Decode the
|
// The distance is >= 128. Decode the
|
||||||
// lower bits without probabilities
|
// lower bits without probabilities
|
||||||
// except the lowest four bits.
|
// except the lowest four bits.
|
||||||
assert(symbol >= 14);
|
assert(symbol >= 14);
|
||||||
assert(limit >= 6);
|
assert(limit >= 6);
|
||||||
|
|
||||||
limit -= ALIGN_BITS;
|
limit -= ALIGN_BITS;
|
||||||
assert(limit >= 2);
|
assert(limit >= 2);
|
||||||
|
|
||||||
// Not worth manual unrolling
|
rc_direct(rep0, limit);
|
||||||
do {
|
|
||||||
rc_direct(rep0);
|
|
||||||
} while (--limit > 0);
|
|
||||||
|
|
||||||
// Decode the lowest four bits using
|
// Decode the lowest four bits using
|
||||||
// probabilities.
|
// probabilities.
|
||||||
rep0 <<= ALIGN_BITS;
|
rep0 <<= ALIGN_BITS;
|
||||||
symbol = 1;
|
rc_bittree_rev4(coder->pos_align);
|
||||||
|
rep0 += symbol;
|
||||||
|
|
||||||
rc_bit(coder->pos_align[symbol], ,
|
// If the end of payload marker (EOPM)
|
||||||
rep0 += 1);
|
// is detected, jump to the safe code.
|
||||||
|
// The EOPM handling isn't speed
|
||||||
rc_bit(coder->pos_align[symbol], ,
|
// critical at all.
|
||||||
rep0 += 2);
|
//
|
||||||
|
// A final normalization is needed
|
||||||
rc_bit(coder->pos_align[symbol], ,
|
// after the EOPM (there can be a
|
||||||
rep0 += 4);
|
// dummy byte to read in some cases).
|
||||||
|
// If the normalization was done here
|
||||||
// Like when distance [4, 127], we
|
// in the fast code, it would need to
|
||||||
// don't need "symbol" for anything
|
// be taken into account in the value
|
||||||
// other than indexing the probability
|
// of LZMA_IN_REQUIRED. Using the
|
||||||
// array.
|
// safe code allows keeping
|
||||||
rc_bit_last(
|
// LZMA_IN_REQUIRED as 20 instead of
|
||||||
coder->pos_align[symbol], ,
|
// 21.
|
||||||
rep0 += 8);
|
if (rep0 == UINT32_MAX)
|
||||||
|
goto eopm;
|
||||||
if (rep0 == UINT32_MAX) {
|
|
||||||
///////////////////////////
|
|
||||||
// End of payload marker //
|
|
||||||
///////////////////////////
|
|
||||||
|
|
||||||
// End of payload marker was
|
|
||||||
// found. It may only be
|
|
||||||
// present if
|
|
||||||
// - uncompressed size is
|
|
||||||
// unknown or
|
|
||||||
// - after known uncompressed
|
|
||||||
// size amount of bytes has
|
|
||||||
// been decompressed and
|
|
||||||
// caller has indicated
|
|
||||||
// that EOPM might be used
|
|
||||||
// (it's not allowed in
|
|
||||||
// LZMA2).
|
|
||||||
if (!eopm_is_valid) {
|
|
||||||
ret = LZMA_DATA_ERROR;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// LZMA1 stream with
|
|
||||||
// end-of-payload marker.
|
|
||||||
rc_normalize();
|
|
||||||
ret = rc_is_finished(rc)
|
|
||||||
? LZMA_STREAM_END
|
|
||||||
: LZMA_DATA_ERROR;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -948,31 +823,48 @@ slow:
|
||||||
limit -= ALIGN_BITS;
|
limit -= ALIGN_BITS;
|
||||||
assert(limit >= 2);
|
assert(limit >= 2);
|
||||||
case SEQ_DIRECT:
|
case SEQ_DIRECT:
|
||||||
do {
|
rc_direct_safe(rep0, limit,
|
||||||
rc_direct_safe(rep0,
|
|
||||||
SEQ_DIRECT);
|
SEQ_DIRECT);
|
||||||
} while (--limit > 0);
|
|
||||||
|
|
||||||
rep0 <<= ALIGN_BITS;
|
rep0 <<= ALIGN_BITS;
|
||||||
symbol = 1;
|
symbol = 0;
|
||||||
|
offset = 1;
|
||||||
offset = 0;
|
|
||||||
case SEQ_ALIGN:
|
case SEQ_ALIGN:
|
||||||
do {
|
do {
|
||||||
rc_bit_safe(coder->pos_align[
|
rc_bit_last_safe(
|
||||||
symbol], ,
|
coder->pos_align[
|
||||||
rep0 += 1U << offset,
|
offset
|
||||||
|
+ symbol],
|
||||||
|
,
|
||||||
|
symbol += offset,
|
||||||
SEQ_ALIGN);
|
SEQ_ALIGN);
|
||||||
} while (++offset < ALIGN_BITS);
|
offset <<= 1;
|
||||||
|
} while (offset < ALIGN_SIZE);
|
||||||
|
|
||||||
|
rep0 += symbol;
|
||||||
|
|
||||||
// End of payload marker
|
|
||||||
if (rep0 == UINT32_MAX) {
|
if (rep0 == UINT32_MAX) {
|
||||||
|
// End of payload marker was
|
||||||
|
// found. It may only be
|
||||||
|
// present if
|
||||||
|
// - uncompressed size is
|
||||||
|
// unknown or
|
||||||
|
// - after known uncompressed
|
||||||
|
// size amount of bytes has
|
||||||
|
// been decompressed and
|
||||||
|
// caller has indicated
|
||||||
|
// that EOPM might be used
|
||||||
|
// (it's not allowed in
|
||||||
|
// LZMA2).
|
||||||
|
eopm:
|
||||||
if (!eopm_is_valid) {
|
if (!eopm_is_valid) {
|
||||||
ret = LZMA_DATA_ERROR;
|
ret = LZMA_DATA_ERROR;
|
||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
case SEQ_EOPM:
|
case SEQ_EOPM:
|
||||||
|
// LZMA1 stream with
|
||||||
|
// end-of-payload marker.
|
||||||
rc_normalize_safe(SEQ_EOPM);
|
rc_normalize_safe(SEQ_EOPM);
|
||||||
ret = rc_is_finished(rc)
|
ret = rc_is_finished(rc)
|
||||||
? LZMA_STREAM_END
|
? LZMA_STREAM_END
|
||||||
|
|
|
@ -68,6 +68,10 @@
|
||||||
///
|
///
|
||||||
/// I will be sticking to uint16_t unless some specific architectures
|
/// I will be sticking to uint16_t unless some specific architectures
|
||||||
/// are *much* faster (20-50 %) with uint32_t.
|
/// are *much* faster (20-50 %) with uint32_t.
|
||||||
|
///
|
||||||
|
/// Update in 2024: The branchless C and x86-64 assembly was written so that
|
||||||
|
/// probability is assumed to be uint16_t. (In contrast, LZMA SDK 23.01
|
||||||
|
/// assembly supports both types.)
|
||||||
typedef uint16_t probability;
|
typedef uint16_t probability;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -16,6 +16,17 @@
|
||||||
#include "range_common.h"
|
#include "range_common.h"
|
||||||
|
|
||||||
|
|
||||||
|
// Negative RC_BIT_MODEL_TOTAL but the lowest RC_MOVE_BITS are flipped.
|
||||||
|
// This is useful for updating probability variables in branchless decoding:
|
||||||
|
//
|
||||||
|
// uint32_t decoded_bit = ...;
|
||||||
|
// probability tmp = RC_BIT_MODEL_OFFSET;
|
||||||
|
// tmp &= decoded_bit - 1;
|
||||||
|
// prob -= (prob + tmp) >> RC_MOVE_BITS;
|
||||||
|
#define RC_BIT_MODEL_OFFSET \
|
||||||
|
((UINT32_C(1) << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL)
|
||||||
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t range;
|
uint32_t range;
|
||||||
uint32_t code;
|
uint32_t code;
|
||||||
|
@ -52,7 +63,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
|
||||||
/// variables 'in' and 'in_size' to be defined.
|
/// variables 'in' and 'in_size' to be defined.
|
||||||
#define rc_to_local(range_decoder, in_pos) \
|
#define rc_to_local(range_decoder, in_pos) \
|
||||||
lzma_range_decoder rc = range_decoder; \
|
lzma_range_decoder rc = range_decoder; \
|
||||||
size_t rc_in_pos = (in_pos); \
|
const uint8_t *rc_in_ptr = in + (in_pos); \
|
||||||
|
const uint8_t *rc_in_end = in + in_size; \
|
||||||
uint32_t rc_bound
|
uint32_t rc_bound
|
||||||
|
|
||||||
|
|
||||||
|
@ -60,7 +72,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
|
||||||
#define rc_from_local(range_decoder, in_pos) \
|
#define rc_from_local(range_decoder, in_pos) \
|
||||||
do { \
|
do { \
|
||||||
range_decoder = rc; \
|
range_decoder = rc; \
|
||||||
in_pos = rc_in_pos; \
|
in_pos = (size_t)(rc_in_ptr - in); \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,7 +97,7 @@ do { \
|
||||||
do { \
|
do { \
|
||||||
if (rc.range < RC_TOP_VALUE) { \
|
if (rc.range < RC_TOP_VALUE) { \
|
||||||
rc.range <<= RC_SHIFT_BITS; \
|
rc.range <<= RC_SHIFT_BITS; \
|
||||||
rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
|
rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
@ -98,12 +110,12 @@ do { \
|
||||||
#define rc_normalize_safe(seq) \
|
#define rc_normalize_safe(seq) \
|
||||||
do { \
|
do { \
|
||||||
if (rc.range < RC_TOP_VALUE) { \
|
if (rc.range < RC_TOP_VALUE) { \
|
||||||
if (unlikely(rc_in_pos == in_size)) { \
|
if (rc_in_ptr == rc_in_end) { \
|
||||||
coder->sequence = seq; \
|
coder->sequence = seq; \
|
||||||
goto out; \
|
goto out; \
|
||||||
} \
|
} \
|
||||||
rc.range <<= RC_SHIFT_BITS; \
|
rc.range <<= RC_SHIFT_BITS; \
|
||||||
rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
|
rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
@ -133,10 +145,14 @@ do { \
|
||||||
|
|
||||||
/// Update the range decoder state and the used probability variable to
|
/// Update the range decoder state and the used probability variable to
|
||||||
/// match a decoded bit of 0.
|
/// match a decoded bit of 0.
|
||||||
|
///
|
||||||
|
/// The x86-64 assemly uses the commented method but it seems that,
|
||||||
|
/// at least on x86-64, the first version is slightly faster as C code.
|
||||||
#define rc_update_0(prob) \
|
#define rc_update_0(prob) \
|
||||||
do { \
|
do { \
|
||||||
rc.range = rc_bound; \
|
rc.range = rc_bound; \
|
||||||
prob += (RC_BIT_MODEL_TOTAL - (prob)) >> RC_MOVE_BITS; \
|
prob += (RC_BIT_MODEL_TOTAL - (prob)) >> RC_MOVE_BITS; \
|
||||||
|
/* prob -= ((prob) + RC_BIT_MODEL_OFFSET) >> RC_MOVE_BITS; */ \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
@ -192,19 +208,121 @@ do { \
|
||||||
symbol = (symbol << 1) + 1; action1, \
|
symbol = (symbol << 1) + 1; action1, \
|
||||||
seq);
|
seq);
|
||||||
|
|
||||||
/// Decode a bit without using a probability.
|
// Unroll fixed-sized bittree decoding.
|
||||||
#define rc_direct(dest) \
|
//
|
||||||
|
// A compile-time constant in final_add can be used to get rid of the high bit
|
||||||
|
// from symbol that is used for the array indexing (1U << bittree_bits).
|
||||||
|
// final_add may also be used to add offset to the result (LZMA length
|
||||||
|
// decoder does that).
|
||||||
|
//
|
||||||
|
// The reason to have final_add here is that in the asm code the addition
|
||||||
|
// can be done for free: in x86-64 there is SBB instruction with -1 as
|
||||||
|
// the immediate value, and final_add is combined with that value.
|
||||||
|
#define rc_bittree_bit(prob) \
|
||||||
|
rc_bit(prob, , )
|
||||||
|
|
||||||
|
#define rc_bittree3(probs, final_add) \
|
||||||
do { \
|
do { \
|
||||||
|
symbol = 1; \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
symbol += (uint32_t)(final_add); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define rc_bittree6(probs, final_add) \
|
||||||
|
do { \
|
||||||
|
symbol = 1; \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
symbol += (uint32_t)(final_add); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
#define rc_bittree8(probs, final_add) \
|
||||||
|
do { \
|
||||||
|
symbol = 1; \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
rc_bittree_bit(probs[symbol]); \
|
||||||
|
symbol += (uint32_t)(final_add); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
// Fixed-sized reverse bittree
|
||||||
|
#define rc_bittree_rev4(probs) \
|
||||||
|
do { \
|
||||||
|
symbol = 0; \
|
||||||
|
rc_bit_last(probs[symbol + 1], , symbol += 1); \
|
||||||
|
rc_bit_last(probs[symbol + 2], , symbol += 2); \
|
||||||
|
rc_bit_last(probs[symbol + 4], , symbol += 4); \
|
||||||
|
rc_bit_last(probs[symbol + 8], , symbol += 8); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
// Decode one bit from variable-sized reverse bittree.
|
||||||
|
// The loop is done in the code that uses this macro.
|
||||||
|
#define rc_bit_add_if_1(probs, dest, value_to_add_if_1) \
|
||||||
|
rc_bit(probs[symbol], \
|
||||||
|
, \
|
||||||
|
dest += value_to_add_if_1);
|
||||||
|
|
||||||
|
|
||||||
|
// Matched literal
|
||||||
|
#define decode_with_match_bit \
|
||||||
|
t_match_byte <<= 1; \
|
||||||
|
t_match_bit = t_match_byte & t_offset; \
|
||||||
|
t_subcoder_index = t_offset + t_match_bit + symbol; \
|
||||||
|
rc_bit(probs[t_subcoder_index], \
|
||||||
|
t_offset &= ~t_match_bit, \
|
||||||
|
t_offset &= t_match_bit)
|
||||||
|
|
||||||
|
#define rc_matched_literal(probs_base_var, match_byte) \
|
||||||
|
do { \
|
||||||
|
uint32_t t_match_byte = (match_byte); \
|
||||||
|
uint32_t t_match_bit; \
|
||||||
|
uint32_t t_subcoder_index; \
|
||||||
|
uint32_t t_offset = 0x100; \
|
||||||
|
symbol = 1; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
decode_with_match_bit; \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
|
||||||
|
/// Decode a bit without using a probability.
|
||||||
|
//
|
||||||
|
// NOTE: GCC 13 and Clang/LLVM 16 can, at least on x86-64, optimize the bound
|
||||||
|
// calculation to use an arithmetic right shift so there's no need to provide
|
||||||
|
// the alternative code which, according to C99/C11/C23 6.3.1.3-p3 isn't
|
||||||
|
// perfectly portable: rc_bound = (uint32_t)((int32_t)rc.code >> 31);
|
||||||
|
#define rc_direct(dest, count_var) \
|
||||||
|
do { \
|
||||||
|
dest = (dest << 1) + 1; \
|
||||||
rc_normalize(); \
|
rc_normalize(); \
|
||||||
rc.range >>= 1; \
|
rc.range >>= 1; \
|
||||||
rc.code -= rc.range; \
|
rc.code -= rc.range; \
|
||||||
rc_bound = UINT32_C(0) - (rc.code >> 31); \
|
rc_bound = UINT32_C(0) - (rc.code >> 31); \
|
||||||
|
dest += rc_bound; \
|
||||||
rc.code += rc.range & rc_bound; \
|
rc.code += rc.range & rc_bound; \
|
||||||
dest = (dest << 1) + (rc_bound + 1); \
|
} while (--count_var > 0)
|
||||||
} while (0)
|
|
||||||
|
|
||||||
|
|
||||||
#define rc_direct_safe(dest, seq) \
|
|
||||||
|
#define rc_direct_safe(dest, count_var, seq) \
|
||||||
do { \
|
do { \
|
||||||
rc_normalize_safe(seq); \
|
rc_normalize_safe(seq); \
|
||||||
rc.range >>= 1; \
|
rc.range >>= 1; \
|
||||||
|
@ -212,10 +330,6 @@ do { \
|
||||||
rc_bound = UINT32_C(0) - (rc.code >> 31); \
|
rc_bound = UINT32_C(0) - (rc.code >> 31); \
|
||||||
rc.code += rc.range & rc_bound; \
|
rc.code += rc.range & rc_bound; \
|
||||||
dest = (dest << 1) + (rc_bound + 1); \
|
dest = (dest << 1) + (rc_bound + 1); \
|
||||||
} while (0)
|
} while (--count_var > 0)
|
||||||
|
|
||||||
|
|
||||||
// NOTE: No macros are provided for bittree decoding. It seems to be simpler
|
|
||||||
// to just write them open in the code.
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue