liblzma: Rename crc32_aarch64.h to crc32_arm64.h.
Even though the proper name for the architecture is aarch64, this project uses ARM64 throughout. So the rename is for consistency. Additionally, crc32_arm64.h was slightly refactored for the following changes: * Added MSVC, FreeBSD, and macOS support in is_arch_extension_supported(). * crc32_arch_optimized() now checks the size when aligning the buffer. * crc32_arch_optimized() loop conditions were slightly modified to avoid both decrementing the size and incrementing the buffer pointer. * Use the intrinsic wrappers defined in <arm_acle.h> because GCC and Clang name them differently. * Minor spacing and comment changes.
This commit is contained in:
parent
455a08609c
commit
761f5b69a4
|
@ -230,7 +230,7 @@ add_library(liblzma
|
||||||
src/liblzma/check/check.h
|
src/liblzma/check/check.h
|
||||||
src/liblzma/check/crc_common.h
|
src/liblzma/check/crc_common.h
|
||||||
src/liblzma/check/crc_x86_clmul.h
|
src/liblzma/check/crc_x86_clmul.h
|
||||||
src/liblzma/check/crc32_aarch64.h
|
src/liblzma/check/crc32_arm64.h
|
||||||
src/liblzma/common/block_util.c
|
src/liblzma/common/block_util.c
|
||||||
src/liblzma/common/common.c
|
src/liblzma/common/common.c
|
||||||
src/liblzma/common/common.h
|
src/liblzma/common/common.h
|
||||||
|
|
|
@ -16,7 +16,7 @@ liblzma_la_SOURCES += \
|
||||||
check/check.h \
|
check/check.h \
|
||||||
check/crc_common.h \
|
check/crc_common.h \
|
||||||
check/crc_x86_clmul.h \
|
check/crc_x86_clmul.h \
|
||||||
check/crc32_aarch64.h
|
check/crc32_arm64.h
|
||||||
|
|
||||||
if COND_SMALL
|
if COND_SMALL
|
||||||
liblzma_la_SOURCES += check/crc32_small.c
|
liblzma_la_SOURCES += check/crc32_small.c
|
||||||
|
|
|
@ -1,109 +0,0 @@
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
//
|
|
||||||
/// \file crc32_aarch64.c
|
|
||||||
/// \brief CRC32 calculation with aarch64 optimization
|
|
||||||
//
|
|
||||||
// Authors: Chenxi Mao
|
|
||||||
//
|
|
||||||
// This file has been put into the public domain.
|
|
||||||
// You can do whatever you want with this file.
|
|
||||||
//
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
#ifdef LZMA_CRC_CRC32_AARCH64_H
|
|
||||||
# error crc_arm64_clmul.h was included twice.
|
|
||||||
#endif
|
|
||||||
#define LZMA_CRC_CRC32_AARCH64_H
|
|
||||||
#include <sys/auxv.h>
|
|
||||||
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
|
|
||||||
// define __GNUC__ but the attribute must not be used with them.
|
|
||||||
// The new Clang-based ICX needs the attribute.
|
|
||||||
//
|
|
||||||
// NOTE: Build systems check for this too, keep them in sync with this.
|
|
||||||
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
|
|
||||||
# define crc_attr_target \
|
|
||||||
__attribute__((__target__("+crc")))
|
|
||||||
#else
|
|
||||||
# define crc_attr_target
|
|
||||||
#endif
|
|
||||||
#ifdef BUILDING_CRC32_AARCH64
|
|
||||||
crc_attr_target
|
|
||||||
crc_attr_no_sanitize_address
|
|
||||||
static uint32_t
|
|
||||||
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
|
|
||||||
{
|
|
||||||
crc = ~crc;
|
|
||||||
while ((uintptr_t)(buf) & 7) {
|
|
||||||
crc = __builtin_aarch64_crc32b(crc, *buf);
|
|
||||||
buf++;
|
|
||||||
size--;
|
|
||||||
}
|
|
||||||
for (;size>=8;size-=8,buf+=8) {
|
|
||||||
crc = __builtin_aarch64_crc32x(crc, aligned_read64le(buf));
|
|
||||||
}
|
|
||||||
for (;size>0;size--,buf++)
|
|
||||||
crc = __builtin_aarch64_crc32b(crc, *buf);
|
|
||||||
return ~crc;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#ifdef BUILDING_CRC64_AARCH64
|
|
||||||
//FIXME: there is no crc64_arch_optimized implementation,
|
|
||||||
// to make compiler happy, add crc64_generic here.
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
# define A1(x) ((x) >> 56)
|
|
||||||
#else
|
|
||||||
# define A1 A
|
|
||||||
#endif
|
|
||||||
crc_attr_target
|
|
||||||
crc_attr_no_sanitize_address
|
|
||||||
static uint64_t
|
|
||||||
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
|
|
||||||
{
|
|
||||||
crc = ~crc;
|
|
||||||
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
crc = bswap64(crc);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (size > 4) {
|
|
||||||
while ((uintptr_t)(buf) & 3) {
|
|
||||||
crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
|
||||||
--size;
|
|
||||||
}
|
|
||||||
|
|
||||||
const uint8_t *const limit = buf + (size & ~(size_t)(3));
|
|
||||||
size &= (size_t)(3);
|
|
||||||
|
|
||||||
while (buf < limit) {
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
const uint32_t tmp = (uint32_t)(crc >> 32)
|
|
||||||
^ aligned_read32ne(buf);
|
|
||||||
#else
|
|
||||||
const uint32_t tmp = (uint32_t)crc
|
|
||||||
^ aligned_read32ne(buf);
|
|
||||||
#endif
|
|
||||||
buf += 4;
|
|
||||||
|
|
||||||
crc = lzma_crc64_table[3][A(tmp)]
|
|
||||||
^ lzma_crc64_table[2][B(tmp)]
|
|
||||||
^ S32(crc)
|
|
||||||
^ lzma_crc64_table[1][C(tmp)]
|
|
||||||
^ lzma_crc64_table[0][D(tmp)];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
while (size-- != 0)
|
|
||||||
crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
|
|
||||||
|
|
||||||
#ifdef WORDS_BIGENDIAN
|
|
||||||
crc = bswap64(crc);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return ~crc;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
static inline bool
|
|
||||||
is_arch_extension_supported(void)
|
|
||||||
{
|
|
||||||
return (getauxval(AT_HWCAP) & HWCAP_CRC32)!=0;
|
|
||||||
}
|
|
||||||
|
|
|
@ -0,0 +1,119 @@
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
//
|
||||||
|
/// \file crc32_arm64.h
|
||||||
|
/// \brief CRC32 calculation with ARM64 optimization
|
||||||
|
//
|
||||||
|
// Authors: Chenxi Mao
|
||||||
|
// Jia Tan
|
||||||
|
//
|
||||||
|
// This file has been put into the public domain.
|
||||||
|
// You can do whatever you want with this file.
|
||||||
|
//
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef LZMA_CRC32_ARM64_H
|
||||||
|
#define LZMA_CRC32_ARM64_H
|
||||||
|
|
||||||
|
// MSVC always has the CRC intrinsics available when building for ARM64
|
||||||
|
// there is no need to include any header files.
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
# include <arm_acle.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
||||||
|
# if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
|
||||||
|
# include <sys/auxv.h>
|
||||||
|
# elif defined(_WIN32)
|
||||||
|
# include <processthreadsapi.h>
|
||||||
|
# elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
|
||||||
|
# include <sys/sysctl.h>
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Some EDG-based compilers support ARM64 and define __GNUC__
|
||||||
|
// (such as Nvidia's nvcc), but do not support function attributes.
|
||||||
|
//
|
||||||
|
// NOTE: Build systems check for this too, keep them in sync with this.
|
||||||
|
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
|
||||||
|
# define crc_attr_target \
|
||||||
|
__attribute__((__target__("+crc")))
|
||||||
|
#else
|
||||||
|
# define crc_attr_target
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
crc_attr_target
|
||||||
|
static uint32_t
|
||||||
|
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
|
{
|
||||||
|
crc = ~crc;
|
||||||
|
|
||||||
|
// Align the input buffer because this was shown to be
|
||||||
|
// significantly faster than unaligned accesses.
|
||||||
|
const size_t align_amount = my_min(size, (8 - (uintptr_t)buf) & 7);
|
||||||
|
|
||||||
|
for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf)
|
||||||
|
crc = __crc32b(crc, *buf);
|
||||||
|
|
||||||
|
size -= align_amount;
|
||||||
|
|
||||||
|
// Process 8 bytes at a time. The end point is determined by
|
||||||
|
// ignoring the least significant three bits of size to ensure
|
||||||
|
// we do not process past the bounds of the buffer. This guarentees
|
||||||
|
// that limit is a multiple of 8 and is strictly less than size.
|
||||||
|
for (const uint8_t *limit = buf + (size & ~((size_t)7));
|
||||||
|
buf < limit; buf += 8)
|
||||||
|
crc = __crc32d(crc, aligned_read64le(buf));
|
||||||
|
|
||||||
|
// Process the remaining bytes that are not 8 byte aligned.
|
||||||
|
for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf)
|
||||||
|
crc = __crc32b(crc, *buf);
|
||||||
|
|
||||||
|
return ~crc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED)
|
||||||
|
static inline bool
|
||||||
|
is_arch_extension_supported(void)
|
||||||
|
{
|
||||||
|
#if defined(HAVE_GETAUXVAL)
|
||||||
|
return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
|
||||||
|
|
||||||
|
#elif defined(HAVE_ELF_AUX_INFO)
|
||||||
|
unsigned long feature_flags;
|
||||||
|
|
||||||
|
elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags));
|
||||||
|
return feature_flags & HWCAP_CRC32 != 0;
|
||||||
|
|
||||||
|
#elif defined(_WIN32)
|
||||||
|
return IsProcessorFeaturePresent(
|
||||||
|
PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
|
||||||
|
|
||||||
|
#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME)
|
||||||
|
int has_crc32 = 0;
|
||||||
|
size_t size = sizeof(has_crc32);
|
||||||
|
|
||||||
|
// The sysctlbyname() function requires a string identifier for the
|
||||||
|
// CPU feature it tests. The Apple documentation lists the string
|
||||||
|
// "hw.optional.armv8_crc32", which can be found here:
|
||||||
|
// (https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619)
|
||||||
|
int err = sysctlbyname("hw.optional.armv8_crc32", &has_crc32,
|
||||||
|
&size, NULL, 0);
|
||||||
|
|
||||||
|
return !err && has_crc32;
|
||||||
|
|
||||||
|
#else
|
||||||
|
// If a runtime detection method cannot be found, then this must
|
||||||
|
// be a compile time error. The checks in crc_common.h should ensure
|
||||||
|
// a runtime detection method is always found if this function is
|
||||||
|
// built. It would be possible to just return false here, but this
|
||||||
|
// is inefficient for binary size and runtime since only the generic
|
||||||
|
// method could ever be used.
|
||||||
|
# error Runtime detection method unavailable.
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // LZMA_CRC32_ARM64_H
|
|
@ -19,8 +19,7 @@
|
||||||
# define BUILDING_CRC32_CLMUL
|
# define BUILDING_CRC32_CLMUL
|
||||||
# include "crc_x86_clmul.h"
|
# include "crc_x86_clmul.h"
|
||||||
#elif defined(CRC32_ARM64)
|
#elif defined(CRC32_ARM64)
|
||||||
# define BUILDING_CRC32_AARCH64
|
# include "crc32_arm64.h"
|
||||||
# include "crc32_aarch64.h"
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -17,9 +17,6 @@
|
||||||
#if defined(CRC_X86_CLMUL)
|
#if defined(CRC_X86_CLMUL)
|
||||||
# define BUILDING_CRC64_CLMUL
|
# define BUILDING_CRC64_CLMUL
|
||||||
# include "crc_x86_clmul.h"
|
# include "crc_x86_clmul.h"
|
||||||
#elif defined(CRC32_ARM64)
|
|
||||||
# define BUILDING_CRC64_AARCH64
|
|
||||||
# include "crc32_aarch64.h"
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue