liblzma: Refactor CRC comments.
A detailed description of the three dispatch methods was added. Also, duplicated comments now only appear in crc32_fast.c or were removed from both crc32_fast.c and crc64_fast.c if they appeared in crc_clmul.c.
This commit is contained in:
parent
8c0f9376f5
commit
96b663f67c
|
@ -2,25 +2,6 @@
|
||||||
//
|
//
|
||||||
/// \file crc32.c
|
/// \file crc32.c
|
||||||
/// \brief CRC32 calculation
|
/// \brief CRC32 calculation
|
||||||
///
|
|
||||||
/// There are two methods in this file.
|
|
||||||
/// crc32_generic uses the slice-by-eight algorithm.
|
|
||||||
/// It is explained in this document:
|
|
||||||
/// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
|
|
||||||
/// The code in this file is not the same as in Intel's paper, but
|
|
||||||
/// the basic principle is identical.
|
|
||||||
///
|
|
||||||
/// crc32_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions.
|
|
||||||
/// It was derived from
|
|
||||||
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
|
|
||||||
/// and the public domain code from https://github.com/rawrunprotected/crc
|
|
||||||
/// (URLs were checked on 2023-09-29).
|
|
||||||
///
|
|
||||||
/// FIXME: Builds for 32-bit x86 use crc32_x86.S by default instead
|
|
||||||
/// of this file and thus CLMUL version isn't available on 32-bit x86
|
|
||||||
/// unless configured with --disable-assembler. Even then the lookup table
|
|
||||||
/// isn't omitted in crc32_table.c since it doesn't know that assembly
|
|
||||||
/// code has been disabled.
|
|
||||||
//
|
//
|
||||||
// Authors: Lasse Collin
|
// Authors: Lasse Collin
|
||||||
// Ilya Kurdyukov
|
// Ilya Kurdyukov
|
||||||
|
@ -100,6 +81,38 @@ crc32_generic(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
// Function dispatching //
|
||||||
|
//////////////////////////
|
||||||
|
|
||||||
|
// If both the generic and CLMUL implementations are built, then the
|
||||||
|
// function to use is selected at runtime since system running the
|
||||||
|
// binary may not have the CLMUL instructions.
|
||||||
|
// The three dispatch methods in order of priority:
|
||||||
|
//
|
||||||
|
// 1. Indirect function (ifunc). This method is slightly more efficient
|
||||||
|
// than the constructor method because it will change the entry in the
|
||||||
|
// Procedure Linkage Table (PLT) for the function either at load time or
|
||||||
|
// at the first call. This avoids having to call the function through a
|
||||||
|
// function pointer and will treat the function call like a regular call
|
||||||
|
// through the PLT. ifuncs are created by using
|
||||||
|
// __attribute__((__ifunc__("resolver"))) on a function which has no
|
||||||
|
// body. The "resolver" is the name of the function that chooses at
|
||||||
|
// runtime which implementation to use.
|
||||||
|
//
|
||||||
|
// 2. Constructor. This method uses __attribute__((__constructor__)) to
|
||||||
|
// set crc32_func at load time. This avoids extra computation (and any
|
||||||
|
// unlikely threading bugs) on the first call to lzma_crc32() to decide
|
||||||
|
// which implementation should be used.
|
||||||
|
//
|
||||||
|
// 3. First Call Resolution. On the very first call to lzma_crc32(), the
|
||||||
|
// call will be directed to crc32_dispatch() instead. This will set the
|
||||||
|
// appropriate implementation function and will not be called again.
|
||||||
|
// This method does not use any kind of locking but is safe because if
|
||||||
|
// multiple threads run the dispatcher simultaneously then they will all
|
||||||
|
// set crc32_func to the same value.
|
||||||
|
|
||||||
typedef uint32_t (*crc32_func_type)(
|
typedef uint32_t (*crc32_func_type)(
|
||||||
const uint8_t *buf, size_t size, uint32_t crc);
|
const uint8_t *buf, size_t size, uint32_t crc);
|
||||||
|
|
||||||
|
@ -111,6 +124,9 @@ typedef uint32_t (*crc32_func_type)(
|
||||||
# pragma GCC diagnostic ignored "-Wunused-function"
|
# pragma GCC diagnostic ignored "-Wunused-function"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// This resolver is shared between all three dispatch methods. It serves as
|
||||||
|
// the ifunc resolver if ifunc is supported, otherwise it is called as a
|
||||||
|
// regular function by the constructor or first call resolution methods.
|
||||||
static crc32_func_type
|
static crc32_func_type
|
||||||
crc32_resolve(void)
|
crc32_resolve(void)
|
||||||
{
|
{
|
||||||
|
@ -124,9 +140,11 @@ crc32_resolve(void)
|
||||||
#ifndef HAVE_FUNC_ATTRIBUTE_IFUNC
|
#ifndef HAVE_FUNC_ATTRIBUTE_IFUNC
|
||||||
|
|
||||||
#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
||||||
|
// Constructor method.
|
||||||
# define CRC32_SET_FUNC_ATTR __attribute__((__constructor__))
|
# define CRC32_SET_FUNC_ATTR __attribute__((__constructor__))
|
||||||
static crc32_func_type crc32_func;
|
static crc32_func_type crc32_func;
|
||||||
#else
|
#else
|
||||||
|
// First Call Resolution method.
|
||||||
# define CRC32_SET_FUNC_ATTR
|
# define CRC32_SET_FUNC_ATTR
|
||||||
static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc);
|
static uint32_t crc32_dispatch(const uint8_t *buf, size_t size, uint32_t crc);
|
||||||
static crc32_func_type crc32_func = &crc32_dispatch;
|
static crc32_func_type crc32_func = &crc32_dispatch;
|
||||||
|
@ -190,6 +208,14 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
|
||||||
return crc32_generic(buf, size, crc);
|
return crc32_generic(buf, size, crc);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
||||||
|
// See crc32_dispatch(). This would be the alternative which uses
|
||||||
|
// locking and doesn't use crc32_dispatch(). Note that on Windows
|
||||||
|
// this method needs Vista threads.
|
||||||
|
mythread_once(crc64_set_func);
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
return crc32_func(buf, size, crc);
|
return crc32_func(buf, size, crc);
|
||||||
|
|
||||||
#elif defined(CRC_CLMUL)
|
#elif defined(CRC_CLMUL)
|
||||||
|
|
|
@ -2,23 +2,6 @@
|
||||||
//
|
//
|
||||||
/// \file crc64.c
|
/// \file crc64.c
|
||||||
/// \brief CRC64 calculation
|
/// \brief CRC64 calculation
|
||||||
///
|
|
||||||
/// There are two methods in this file. crc64_generic uses the
|
|
||||||
/// the slice-by-four algorithm. This is the same idea that is
|
|
||||||
/// used in crc32_fast.c, but for CRC64 we use only four tables
|
|
||||||
/// instead of eight to avoid increasing CPU cache usage.
|
|
||||||
///
|
|
||||||
/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions.
|
|
||||||
/// It was derived from
|
|
||||||
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
|
|
||||||
/// and the public domain code from https://github.com/rawrunprotected/crc
|
|
||||||
/// (URLs were checked on 2023-09-29).
|
|
||||||
///
|
|
||||||
/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead
|
|
||||||
/// of this file and thus CLMUL version isn't available on 32-bit x86
|
|
||||||
/// unless configured with --disable-assembler. Even then the lookup table
|
|
||||||
/// isn't omitted in crc64_table.c since it doesn't know that assembly
|
|
||||||
/// code has been disabled.
|
|
||||||
//
|
//
|
||||||
// Authors: Lasse Collin
|
// Authors: Lasse Collin
|
||||||
// Ilya Kurdyukov
|
// Ilya Kurdyukov
|
||||||
|
@ -93,12 +76,16 @@ crc64_generic(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
||||||
|
//////////////////////////
|
||||||
|
// Function dispatching //
|
||||||
|
//////////////////////////
|
||||||
|
|
||||||
|
// If both the generic and CLMUL implementations are usable, then the
|
||||||
|
// function that is used is selected at runtime. See crc32_fast.c.
|
||||||
|
|
||||||
typedef uint64_t (*crc64_func_type)(
|
typedef uint64_t (*crc64_func_type)(
|
||||||
const uint8_t *buf, size_t size, uint64_t crc);
|
const uint8_t *buf, size_t size, uint64_t crc);
|
||||||
|
|
||||||
// Clang 16.0.0 and older has a bug where it marks the ifunc resolver
|
|
||||||
// function as unused since it is static and never used outside of
|
|
||||||
// __attribute__((__ifunc__())).
|
|
||||||
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
#if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__)
|
||||||
# pragma GCC diagnostic push
|
# pragma GCC diagnostic push
|
||||||
# pragma GCC diagnostic ignored "-Wunused-function"
|
# pragma GCC diagnostic ignored "-Wunused-function"
|
||||||
|
@ -139,13 +126,6 @@ crc64_set_func(void)
|
||||||
static uint64_t
|
static uint64_t
|
||||||
crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
|
crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
{
|
{
|
||||||
// When __attribute__((__ifunc__(...))) and
|
|
||||||
// __attribute__((__constructor__)) isn't supported, set the
|
|
||||||
// function pointer without any locking. If multiple threads run
|
|
||||||
// the detection code in parallel, they will all end up setting
|
|
||||||
// the pointer to the same value. This avoids the use of
|
|
||||||
// mythread_once() on every call to lzma_crc64() but this likely
|
|
||||||
// isn't strictly standards compliant. Let's change it if it breaks.
|
|
||||||
crc64_set_func();
|
crc64_set_func();
|
||||||
return crc64_func(buf, size, crc);
|
return crc64_func(buf, size, crc);
|
||||||
}
|
}
|
||||||
|
@ -163,36 +143,11 @@ extern LZMA_API(uint64_t)
|
||||||
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
|
||||||
{
|
{
|
||||||
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
|
||||||
// If CLMUL is available, it is the best for non-tiny inputs,
|
|
||||||
// being over twice as fast as the generic slice-by-four version.
|
|
||||||
// However, for size <= 16 it's different. In the extreme case
|
|
||||||
// of size == 1 the generic version can be five times faster.
|
|
||||||
// At size >= 8 the CLMUL starts to become reasonable. It
|
|
||||||
// varies depending on the alignment of buf too.
|
|
||||||
//
|
|
||||||
// The above doesn't include the overhead of mythread_once().
|
|
||||||
// At least on x86-64 GNU/Linux, pthread_once() is very fast but
|
|
||||||
// it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When
|
|
||||||
// size reaches 12-16 bytes the overhead becomes negligible.
|
|
||||||
//
|
|
||||||
// So using the generic version for size <= 16 may give better
|
|
||||||
// performance with tiny inputs but if such inputs happen rarely
|
|
||||||
// it's not so obvious because then the lookup table of the
|
|
||||||
// generic version may not be in the processor cache.
|
|
||||||
#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
|
||||||
if (size <= 16)
|
if (size <= 16)
|
||||||
return crc64_generic(buf, size, crc);
|
return crc64_generic(buf, size, crc);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
|
|
||||||
// See crc64_dispatch(). This would be the alternative which uses
|
|
||||||
// locking and doesn't use crc64_dispatch(). Note that on Windows
|
|
||||||
// this method needs Vista threads.
|
|
||||||
mythread_once(crc64_set_func);
|
|
||||||
#endif
|
|
||||||
*/
|
|
||||||
|
|
||||||
return crc64_func(buf, size, crc);
|
return crc64_func(buf, size, crc);
|
||||||
|
|
||||||
#elif defined(CRC_CLMUL)
|
#elif defined(CRC_CLMUL)
|
||||||
|
|
Loading…
Reference in New Issue