liblzma: SHA-256: Unroll a little more.
This way, no branch is needed in each operation to choose between blk0 and blk2, yet the code still doesn't grow as much as it would with full unrolling.
This commit is contained in:
parent
bc7650d87b
commit
9a096f8e57
|
@ -47,11 +47,12 @@
|
||||||
#define g(i) T[(6 - i) & 7]
|
#define g(i) T[(6 - i) & 7]
|
||||||
#define h(i) T[(7 - i) & 7]
|
#define h(i) T[(7 - i) & 7]
|
||||||
|
|
||||||
#define R(i) \
|
#define R(i, j, blk) \
|
||||||
h(i) += S1(e(i)) + Ch(e(i), f(i), g(i)) + SHA256_K[i + j] \
|
h(i) += S1(e(i)) + Ch(e(i), f(i), g(i)) + SHA256_K[i + j] + blk; \
|
||||||
+ (j ? blk2(i) : blk0(i)); \
|
|
||||||
d(i) += h(i); \
|
d(i) += h(i); \
|
||||||
h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
|
h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
|
||||||
|
#define R0(i) R(i, 0, blk0(i))
|
||||||
|
#define R2(i) R(i, j, blk2(i))
|
||||||
|
|
||||||
#define S0(x) (rotr_32(x, 2) ^ rotr_32(x, 13) ^ rotr_32(x, 22))
|
#define S0(x) (rotr_32(x, 2) ^ rotr_32(x, 13) ^ rotr_32(x, 22))
|
||||||
#define S1(x) (rotr_32(x, 6) ^ rotr_32(x, 11) ^ rotr_32(x, 25))
|
#define S1(x) (rotr_32(x, 6) ^ rotr_32(x, 11) ^ rotr_32(x, 25))
|
||||||
|
@ -88,12 +89,18 @@ transform(uint32_t state[8], const uint32_t data[16])
|
||||||
// Copy state[] to working vars.
|
// Copy state[] to working vars.
|
||||||
memcpy(T, state, sizeof(T));
|
memcpy(T, state, sizeof(T));
|
||||||
|
|
||||||
// 64 operations, partially loop unrolled
|
// The first 16 operations unrolled
|
||||||
for (unsigned int j = 0; j < 64; j += 16) {
|
R0( 0); R0( 1); R0( 2); R0( 3);
|
||||||
R( 0); R( 1); R( 2); R( 3);
|
R0( 4); R0( 5); R0( 6); R0( 7);
|
||||||
R( 4); R( 5); R( 6); R( 7);
|
R0( 8); R0( 9); R0(10); R0(11);
|
||||||
R( 8); R( 9); R(10); R(11);
|
R0(12); R0(13); R0(14); R0(15);
|
||||||
R(12); R(13); R(14); R(15);
|
|
||||||
|
// The remaining 48 operations partially unrolled
|
||||||
|
for (unsigned int j = 16; j < 64; j += 16) {
|
||||||
|
R2( 0); R2( 1); R2( 2); R2( 3);
|
||||||
|
R2( 4); R2( 5); R2( 6); R2( 7);
|
||||||
|
R2( 8); R2( 9); R2(10); R2(11);
|
||||||
|
R2(12); R2(13); R2(14); R2(15);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the working vars back into state[].
|
// Add the working vars back into state[].
|
||||||
|
|
Loading…
Reference in New Issue