liblzma: Refactor crc_common.h.
CRC_GENERIC is now split into CRC32_GENERIC and CRC64_GENERIC, since the ARM64 optimizations will differ between CRC32 and CRC64. For the same reason, CRC_ARCH_OPTIMIZED is split into CRC32_ARCH_OPTIMIZED and CRC64_ARCH_OPTIMIZED. ifunc will only be used with x86-64 CLMUL because the runtime detection methods needed with ARM64 are not compatible with ifunc.
This commit is contained in:
parent
61908e8160
commit
455a08609c
|
@ -24,7 +24,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef CRC_GENERIC
|
#ifdef CRC32_GENERIC
|
||||||
|
|
||||||
///////////////////
|
///////////////////
|
||||||
// Generic CRC32 //
|
// Generic CRC32 //
|
||||||
|
@ -90,7 +90,7 @@ crc32_generic(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_ARCH_OPTIMIZED)
|
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
// Function dispatching //
|
// Function dispatching //
|
||||||
|
@ -197,7 +197,7 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
extern LZMA_API(uint32_t)
|
extern LZMA_API(uint32_t)
|
||||||
lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
{
|
{
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_ARCH_OPTIMIZED)
|
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
||||||
// On x86-64, if CLMUL is available, it is the best for non-tiny
|
// On x86-64, if CLMUL is available, it is the best for non-tiny
|
||||||
// inputs, being over twice as fast as the generic slice-by-four
|
// inputs, being over twice as fast as the generic slice-by-four
|
||||||
// version. However, for size <= 16 it's different. In the extreme
|
// version. However, for size <= 16 it's different. In the extreme
|
||||||
|
@ -229,7 +229,7 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
*/
|
*/
|
||||||
return crc32_func(buf, size, crc);
|
return crc32_func(buf, size, crc);
|
||||||
|
|
||||||
#elif defined(CRC_ARCH_OPTIMIZED)
|
#elif defined(CRC32_ARCH_OPTIMIZED)
|
||||||
return crc32_arch_optimized(buf, size, crc);
|
return crc32_arch_optimized(buf, size, crc);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -23,7 +23,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef CRC_GENERIC
|
#ifdef CRC64_GENERIC
|
||||||
|
|
||||||
/////////////////////////////////
|
/////////////////////////////////
|
||||||
// Generic slice-by-four CRC64 //
|
// Generic slice-by-four CRC64 //
|
||||||
|
@ -85,7 +85,7 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_ARCH_OPTIMIZED)
|
#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
// Function dispatching //
|
// Function dispatching //
|
||||||
|
@ -154,7 +154,7 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
extern LZMA_API(uint64_t)
|
extern LZMA_API(uint64_t)
|
||||||
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
{
|
{
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_ARCH_OPTIMIZED)
|
#if defined(CRC64_GENERIC) && defined(CRC64_ARCH_OPTIMIZED)
|
||||||
|
|
||||||
#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
if (size <= 16)
|
if (size <= 16)
|
||||||
|
@ -162,7 +162,7 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
#endif
|
#endif
|
||||||
return crc64_func(buf, size, crc);
|
return crc64_func(buf, size, crc);
|
||||||
|
|
||||||
#elif defined(CRC_ARCH_OPTIMIZED)
|
#elif defined(CRC64_ARCH_OPTIMIZED)
|
||||||
// If arch-optimized version is used unconditionally without runtime
|
// If arch-optimized version is used unconditionally without runtime
|
||||||
// CPU detection then omitting the generic version and its 8 KiB
|
// CPU detection then omitting the generic version and its 8 KiB
|
||||||
// lookup table makes the library smaller.
|
// lookup table makes the library smaller.
|
||||||
|
|
|
@ -48,54 +48,94 @@
|
||||||
# define crc_attr_no_sanitize_address
|
# define crc_attr_no_sanitize_address
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Keep this in sync with changes to crc32_arm64.h
|
||||||
|
#if defined(_WIN32) || defined(HAVE_GETAUXVAL) \
|
||||||
|
|| defined(HAVE_ELF_AUX_INFO) \
|
||||||
|
|| (defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME))
|
||||||
|
# define ARM64_RUNTIME_DETECTION 1
|
||||||
|
#endif
|
||||||
|
|
||||||
#undef CRC_GENERIC
|
|
||||||
#undef CRC_ARCH_OPTIMIZED
|
#undef CRC32_GENERIC
|
||||||
|
#undef CRC64_GENERIC
|
||||||
|
|
||||||
|
#undef CRC32_ARCH_OPTIMIZED
|
||||||
|
#undef CRC64_ARCH_OPTIMIZED
|
||||||
|
|
||||||
|
// The x86 CLMUL is used for both CRC32 and CRC64.
|
||||||
#undef CRC_X86_CLMUL
|
#undef CRC_X86_CLMUL
|
||||||
|
|
||||||
#undef CRC32_ARM64
|
#undef CRC32_ARM64
|
||||||
|
#undef CRC64_ARM64_CLMUL
|
||||||
|
|
||||||
#undef CRC_USE_IFUNC
|
#undef CRC_USE_IFUNC
|
||||||
|
|
||||||
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
|
|
||||||
|
// ARM64 CRC32 instruction is only useful for CRC32. Currently, only
|
||||||
|
// little endian is supported since we were unable to test on a big
|
||||||
|
// endian machine.
|
||||||
|
#if defined(HAVE_ARM64_CRC32) && !defined(WORDS_BIGENDIAN)
|
||||||
|
// Allow ARM64 CRC32 instruction without a runtime check if
|
||||||
|
// __ARM_FEATURE_CRC32 is defined. GCC and Clang only define this if the
|
||||||
|
// proper compiler options are used.
|
||||||
|
# if defined(__ARM_FEATURE_CRC32)
|
||||||
|
# define CRC32_ARCH_OPTIMIZED 1
|
||||||
|
# define CRC32_ARM64 1
|
||||||
|
# elif defined(ARM64_RUNTIME_DETECTION)
|
||||||
|
# define CRC32_ARCH_OPTIMIZED 1
|
||||||
|
# define CRC32_ARM64 1
|
||||||
|
# define CRC32_GENERIC 1
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(HAVE_USABLE_CLMUL)
|
||||||
// If CLMUL is allowed unconditionally in the compiler options then the
|
// If CLMUL is allowed unconditionally in the compiler options then the
|
||||||
// generic version can be omitted. Note that this doesn't work with MSVC
|
// generic version can be omitted. Note that this doesn't work with MSVC
|
||||||
// as I don't know how to detect the features here.
|
// as I don't know how to detect the features here.
|
||||||
//
|
//
|
||||||
// NOTE: Keep this in sync with crc32_table.c.
|
// NOTE: Keep this in sync with crc32_table.c.
|
||||||
#if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|
# if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|
||||||
|| (defined(__e2k__) && __iset__ >= 6)
|
|| (defined(__e2k__) && __iset__ >= 6)
|
||||||
# define CRC_ARCH_OPTIMIZED 1
|
# define CRC32_ARCH_OPTIMIZED 1
|
||||||
# define CRC_X86_CLMUL 1
|
# define CRC64_ARCH_OPTIMIZED 1
|
||||||
|
# define CRC_X86_CLMUL 1
|
||||||
#elif (defined(__aarch64__))
|
# else
|
||||||
# define CRC_ARCH_OPTIMIZED 1
|
# define CRC32_GENERIC 1
|
||||||
# define CRC32_ARM64 1
|
# define CRC64_GENERIC 1
|
||||||
// If CLMUL cannot be used then only the generic slice-by-eight (CRC32)
|
# define CRC32_ARCH_OPTIMIZED 1
|
||||||
// or slice-by-four (CRC64) is built.
|
# define CRC64_ARCH_OPTIMIZED 1
|
||||||
#elif !defined(HAVE_USABLE_CLMUL)
|
# define CRC_X86_CLMUL 1
|
||||||
# define CRC_GENERIC 1
|
|
||||||
// Otherwise build both and detect at runtime which version to use.
|
|
||||||
#else
|
|
||||||
# define CRC_GENERIC 1
|
|
||||||
# define CRC_ARCH_OPTIMIZED 1
|
|
||||||
# define CRC_X86_CLMUL 1
|
|
||||||
# define CRC32_ARM64 1
|
|
||||||
|
|
||||||
# ifdef HAVE_FUNC_ATTRIBUTE_IFUNC
|
|
||||||
# define CRC_USE_IFUNC 1
|
|
||||||
# endif
|
|
||||||
|
|
||||||
|
# ifdef HAVE_FUNC_ATTRIBUTE_IFUNC
|
||||||
|
# define CRC_USE_IFUNC 1
|
||||||
|
# endif
|
||||||
/*
|
/*
|
||||||
// The generic code is much faster with 1-8-byte inputs and has
|
// The generic code is much faster with 1-8-byte inputs and
|
||||||
// similar performance up to 16 bytes at least in microbenchmarks
|
// has similar performance up to 16 bytes at least in
|
||||||
// (it depends on input buffer alignment too). If both versions are
|
// microbenchmarks (it depends on input buffer alignment
|
||||||
// built, this #define will use the generic version for inputs up to
|
// too). If both versions are built, this #define will use
|
||||||
// 16 bytes and CLMUL for bigger inputs. It saves a little in code
|
// the generic version for inputs up to 16 bytes and CLMUL
|
||||||
// size since the special cases for 0-16-byte inputs will be omitted
|
// for bigger inputs. It saves a little in code size since
|
||||||
// from the CLMUL code.
|
// the special cases for 0-16-byte inputs will be omitted
|
||||||
# ifndef CRC_USE_IFUNC
|
// from the CLMUL code.
|
||||||
# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
|
# ifndef CRC_USE_IFUNC
|
||||||
# endif
|
# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
|
||||||
|
# endif
|
||||||
*/
|
*/
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For CRC32 use the generic slice-by-eight implementation if no optimized
|
||||||
|
// version is available.
|
||||||
|
#if !defined(CRC32_ARCH_OPTIMIZED) && !defined(CRC32_GENERIC)
|
||||||
|
# define CRC32_GENERIC 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// For CRC64 use the generic slice-by-four implementation if no optimized
|
||||||
|
// version is available.
|
||||||
|
#if !defined(CRC64_ARCH_OPTIMIZED) && !defined(CRC64_GENERIC)
|
||||||
|
# define CRC64_GENERIC 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue