liblzma: Disable branchless C version in range decoder.

Thanks to Sebastian Andrzej Siewior and Sam James for
benchmarking on various systems.
This commit is contained in:
Lasse Collin 2024-02-22 14:41:29 +02:00
parent 00440f52be
commit 120da10ae1
1 changed file with 10 additions and 3 deletions

View File

@@ -24,8 +24,8 @@
// Bitwise-or of the following enable branchless C versions: // Bitwise-or of the following enable branchless C versions:
// 0x01 normal bittrees // 0x01 normal bittrees
// 0x02 fixed-sized reverse bittrees // 0x02 fixed-sized reverse bittrees
// 0x04 variable-sized reverse bittrees (disabled by default, not faster?) // 0x04 variable-sized reverse bittrees (not faster)
// 0x08 matched literal (disabled by default, not faster?) // 0x08 matched literal (not faster)
// //
// GCC & Clang compatible x86-64 inline assembly: // GCC & Clang compatible x86-64 inline assembly:
// 0x010 normal bittrees // 0x010 normal bittrees
@@ -36,12 +36,19 @@
// //
// The default can be overridden at build time by defining // The default can be overridden at build time by defining
// LZMA_RANGE_DECODER_CONFIG to the desired mask. // LZMA_RANGE_DECODER_CONFIG to the desired mask.
//
// 2024-02-22: Feedback from benchmarks:
// - Branchless C (0x003) can be better than basic on x86-64 but often it's
// slightly worse on other archs. Since asm is much better on x86-64,
// branchless C is not used at all.
// - With x86-64 asm, there are slight differences between GCC and Clang
// and different processors. Overall 0x1F0 seems to be the best choice.
#ifndef LZMA_RANGE_DECODER_CONFIG #ifndef LZMA_RANGE_DECODER_CONFIG
# if defined(__x86_64__) && !defined(__ILP32__) \ # if defined(__x86_64__) && !defined(__ILP32__) \
&& (defined(__GNUC__) || defined(__clang__)) && (defined(__GNUC__) || defined(__clang__))
# define LZMA_RANGE_DECODER_CONFIG 0x1F0 # define LZMA_RANGE_DECODER_CONFIG 0x1F0
# else # else
# define LZMA_RANGE_DECODER_CONFIG 0x03 # define LZMA_RANGE_DECODER_CONFIG 0
# endif # endif
#endif #endif