mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 00:58:05 +02:00
util: Update BLAKE3 from 1.5.1 to 1.8.2
Steps for uprev:
- copy files from BLAKE3/c src/util/blake3/
- edit README
- `for file in *.asm; do mv "$file" "${file%.asm}.masm"; done`
- keep
- blake3.h (no relevant changes), only change BLAKE3_VERSION_STRING
- blake3_sse2_x86-64_unix.S (no changes)
- blake3_avx512_x86-64_unix.S (no changes)
- blake3_sse41_x86-64_unix.S (no changes)
Signed-off-by: Valentine Burley <valentine.burley@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35853>
This commit is contained in:
parent
92623d2447
commit
8d2bb19c63
8 changed files with 2563 additions and 47 deletions
|
|
@ -1,7 +1,7 @@
|
|||
This folder contains a local copy of BLAKE3 cryptographic hash library, version 1.5.1.
|
||||
This folder contains a local copy of BLAKE3 cryptographic hash library, version 1.8.2.
|
||||
|
||||
Except for changes listed in the "Changes" section, this is a verbatim copy from
|
||||
https://github.com/BLAKE3-team/BLAKE3, tag 1.5.1.
|
||||
https://github.com/BLAKE3-team/BLAKE3, tag 1.8.2.
|
||||
|
||||
Files will be periodically synchronized with the upstream, and any local changes should
|
||||
be clearly documented below.
|
||||
|
|
@ -14,3 +14,5 @@ Changes:
|
|||
- Add "static" to blake3_hash4_neon, to comply with -Werror=missing-prototypes.
|
||||
|
||||
- Add mesa_blake3_visibility.h and set symbol visibility to hidden for assembly sources.
|
||||
|
||||
- Drop BLAKE3_PRIVATE from blake3_compress_subtree_wide and blake3_compress_subtree_wide_join_tbb
|
||||
|
|
|
|||
|
|
@ -88,24 +88,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
|
|||
|
||||
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
|
||||
size_t out_len) {
|
||||
if (out_len == 0) {
|
||||
return;
|
||||
}
|
||||
uint64_t output_block_counter = seek / 64;
|
||||
size_t offset_within_block = seek % 64;
|
||||
uint8_t wide_buf[64];
|
||||
while (out_len > 0) {
|
||||
blake3_compress_xof(self->input_cv, self->block, self->block_len,
|
||||
output_block_counter, self->flags | ROOT, wide_buf);
|
||||
size_t available_bytes = 64 - offset_within_block;
|
||||
size_t memcpy_len;
|
||||
if (out_len > available_bytes) {
|
||||
memcpy_len = available_bytes;
|
||||
} else {
|
||||
memcpy_len = out_len;
|
||||
}
|
||||
memcpy(out, wide_buf + offset_within_block, memcpy_len);
|
||||
out += memcpy_len;
|
||||
out_len -= memcpy_len;
|
||||
if(offset_within_block) {
|
||||
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
|
||||
const size_t available_bytes = 64 - offset_within_block;
|
||||
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
|
||||
memcpy(out, wide_buf + offset_within_block, bytes);
|
||||
out += bytes;
|
||||
out_len -= bytes;
|
||||
output_block_counter += 1;
|
||||
offset_within_block = 0;
|
||||
}
|
||||
if(out_len / 64) {
|
||||
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
|
||||
}
|
||||
output_block_counter += out_len / 64;
|
||||
out += out_len & -64;
|
||||
out_len -= out_len & -64;
|
||||
if(out_len) {
|
||||
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
|
||||
memcpy(out, wide_buf, out_len);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -134,9 +140,7 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
|
|||
input_len -= BLAKE3_BLOCK_LEN;
|
||||
}
|
||||
|
||||
size_t take = chunk_state_fill_buf(self, input, input_len);
|
||||
input += take;
|
||||
input_len -= take;
|
||||
chunk_state_fill_buf(self, input, input_len);
|
||||
}
|
||||
|
||||
INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
|
||||
|
|
@ -154,10 +158,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
|
|||
// Given some input larger than one chunk, return the number of bytes that
|
||||
// should go in the left subtree. This is the largest power-of-2 number of
|
||||
// chunks that leaves at least 1 byte for the right subtree.
|
||||
INLINE size_t left_len(size_t content_len) {
|
||||
// Subtract 1 to reserve at least one byte for the right side. content_len
|
||||
INLINE size_t left_subtree_len(size_t input_len) {
|
||||
// Subtract 1 to reserve at least one byte for the right side. input_len
|
||||
// should always be greater than BLAKE3_CHUNK_LEN.
|
||||
size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
|
||||
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
|
||||
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
|
||||
}
|
||||
|
||||
|
|
@ -265,7 +269,8 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
|||
size_t input_len,
|
||||
const uint32_t key[8],
|
||||
uint64_t chunk_counter,
|
||||
uint8_t flags, uint8_t *out) {
|
||||
uint8_t flags, uint8_t *out,
|
||||
bool use_tbb) {
|
||||
// Note that the single chunk case does *not* bump the SIMD degree up to 2
|
||||
// when it is 1. If this implementation adds multi-threading in the future,
|
||||
// this gives us the option of multi-threading even the 2-chunk case, which
|
||||
|
|
@ -279,7 +284,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
|||
// the input into left and right subtrees. (Note that this is only optimal
|
||||
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
|
||||
// of 3 or something, we'll need a more complicated strategy.)
|
||||
size_t left_input_len = left_len(input_len);
|
||||
size_t left_input_len = left_subtree_len(input_len);
|
||||
size_t right_input_len = input_len - left_input_len;
|
||||
const uint8_t *right_input = &input[left_input_len];
|
||||
uint64_t right_chunk_counter =
|
||||
|
|
@ -299,12 +304,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
|||
}
|
||||
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
|
||||
|
||||
// Recurse! If this implementation adds multi-threading support in the
|
||||
// future, this is where it will go.
|
||||
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
|
||||
chunk_counter, flags, cv_array);
|
||||
size_t right_n = blake3_compress_subtree_wide(
|
||||
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
|
||||
// Recurse!
|
||||
size_t left_n = -1;
|
||||
size_t right_n = -1;
|
||||
|
||||
#if defined(BLAKE3_USE_TBB)
|
||||
blake3_compress_subtree_wide_join_tbb(
|
||||
key, flags, use_tbb,
|
||||
// left-hand side
|
||||
input, left_input_len, chunk_counter, cv_array, &left_n,
|
||||
// right-hand side
|
||||
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
|
||||
#else
|
||||
left_n = blake3_compress_subtree_wide(
|
||||
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
|
||||
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
|
||||
right_chunk_counter, flags, right_cvs,
|
||||
use_tbb);
|
||||
#endif // BLAKE3_USE_TBB
|
||||
|
||||
// The special case again. If simd_degree=1, then we'll have left_n=1 and
|
||||
// right_n=1. Rather than compressing them into a single output, return
|
||||
|
|
@ -330,16 +347,18 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
|||
//
|
||||
// As with compress_subtree_wide(), this function is not used on inputs of 1
|
||||
// chunk or less. That's a different codepath.
|
||||
INLINE void compress_subtree_to_parent_node(
|
||||
const uint8_t *input, size_t input_len, const uint32_t key[8],
|
||||
uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
|
||||
INLINE void
|
||||
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
|
||||
const uint32_t key[8], uint64_t chunk_counter,
|
||||
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
|
||||
bool use_tbb) {
|
||||
#if defined(BLAKE3_TESTING)
|
||||
assert(input_len > BLAKE3_CHUNK_LEN);
|
||||
#endif
|
||||
|
||||
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
||||
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
||||
chunk_counter, flags, cv_array);
|
||||
chunk_counter, flags, cv_array, use_tbb);
|
||||
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
|
||||
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
|
||||
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
|
||||
|
|
@ -430,7 +449,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
|
|||
// of the whole tree, and it would need to be ROOT finalized. We can't
|
||||
// compress it until we know.
|
||||
// 2) This 64 KiB input might complete a larger tree, whose root node is
|
||||
// similarly going to be the the root of the whole tree. For example, maybe
|
||||
// similarly going to be the root of the whole tree. For example, maybe
|
||||
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
|
||||
// node at the root of the 256 KiB subtree until we know how to finalize it.
|
||||
//
|
||||
|
|
@ -455,8 +474,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
|
|||
self->cv_stack_len += 1;
|
||||
}
|
||||
|
||||
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
||||
size_t input_len) {
|
||||
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
|
||||
size_t input_len, bool use_tbb) {
|
||||
// Explicitly checking for zero avoids causing UB by passing a null pointer
|
||||
// to memcpy. This comes up in practice with things like:
|
||||
// std::vector<uint8_t> v;
|
||||
|
|
@ -542,7 +561,7 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|||
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
|
||||
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
|
||||
self->chunk.chunk_counter,
|
||||
self->chunk.flags, cv_pair);
|
||||
self->chunk.flags, cv_pair, use_tbb);
|
||||
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
|
||||
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
|
||||
self->chunk.chunk_counter + (subtree_chunks / 2));
|
||||
|
|
@ -564,6 +583,20 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|||
}
|
||||
}
|
||||
|
||||
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
||||
size_t input_len) {
|
||||
bool use_tbb = false;
|
||||
blake3_hasher_update_base(self, input, input_len, use_tbb);
|
||||
}
|
||||
|
||||
#if defined(BLAKE3_USE_TBB)
|
||||
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
|
||||
size_t input_len) {
|
||||
bool use_tbb = true;
|
||||
blake3_hasher_update_base(self, input, input_len, use_tbb);
|
||||
}
|
||||
#endif // BLAKE3_USE_TBB
|
||||
|
||||
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
||||
size_t out_len) {
|
||||
blake3_hasher_finalize_seek(self, 0, out, out_len);
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define BLAKE3_VERSION_STRING "1.5.1"
|
||||
#define BLAKE3_VERSION_STRING "1.8.2"
|
||||
#define BLAKE3_KEY_LEN 32
|
||||
#define BLAKE3_OUT_LEN 32
|
||||
#define BLAKE3_BLOCK_LEN 64
|
||||
|
|
|
|||
|
|
@ -7,23 +7,27 @@
|
|||
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
|
||||
|
||||
INLINE __m128i loadu_128(const uint8_t src[16]) {
|
||||
return _mm_loadu_si128((const __m128i *)src);
|
||||
return _mm_loadu_si128((void*)src);
|
||||
}
|
||||
|
||||
INLINE __m256i loadu_256(const uint8_t src[32]) {
|
||||
return _mm256_loadu_si256((const __m256i *)src);
|
||||
return _mm256_loadu_si256((void*)src);
|
||||
}
|
||||
|
||||
INLINE __m512i loadu_512(const uint8_t src[64]) {
|
||||
return _mm512_loadu_si512((const __m512i *)src);
|
||||
return _mm512_loadu_si512((void*)src);
|
||||
}
|
||||
|
||||
INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
|
||||
_mm_storeu_si128((__m128i *)dest, src);
|
||||
_mm_storeu_si128((void*)dest, src);
|
||||
}
|
||||
|
||||
INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
|
||||
_mm256_storeu_si256((__m256i *)dest, src);
|
||||
_mm256_storeu_si256((void*)dest, src);
|
||||
}
|
||||
|
||||
INLINE void storeu_512(__m512i src, uint8_t dest[16]) {
|
||||
_mm512_storeu_si512((void*)dest, src);
|
||||
}
|
||||
|
||||
INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
|
||||
|
|
@ -550,6 +554,54 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
|
|||
storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
|
||||
}
|
||||
|
||||
static
|
||||
void blake3_xof4_avx512(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[4 * 64]) {
|
||||
__m128i h_vecs[8] = {
|
||||
set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
|
||||
set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
|
||||
};
|
||||
uint32_t block_words[16];
|
||||
load_block_words(block, block_words);
|
||||
__m128i msg_vecs[16];
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
msg_vecs[i] = set1_128(block_words[i]);
|
||||
}
|
||||
__m128i counter_low_vec, counter_high_vec;
|
||||
load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
|
||||
__m128i block_len_vec = set1_128(block_len);
|
||||
__m128i block_flags_vec = set1_128(flags);
|
||||
__m128i v[16] = {
|
||||
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
|
||||
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
|
||||
set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
|
||||
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
|
||||
};
|
||||
round_fn4(v, msg_vecs, 0);
|
||||
round_fn4(v, msg_vecs, 1);
|
||||
round_fn4(v, msg_vecs, 2);
|
||||
round_fn4(v, msg_vecs, 3);
|
||||
round_fn4(v, msg_vecs, 4);
|
||||
round_fn4(v, msg_vecs, 5);
|
||||
round_fn4(v, msg_vecs, 6);
|
||||
for (size_t i = 0; i < 8; i++) {
|
||||
v[i] = xor_128(v[i], v[i+8]);
|
||||
v[i+8] = xor_128(v[i+8], h_vecs[i]);
|
||||
}
|
||||
transpose_vecs_128(&v[0]);
|
||||
transpose_vecs_128(&v[4]);
|
||||
transpose_vecs_128(&v[8]);
|
||||
transpose_vecs_128(&v[12]);
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
|
||||
storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
|
||||
storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
|
||||
storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash8_avx512
|
||||
|
|
@ -802,6 +854,50 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
|
|||
storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
|
||||
}
|
||||
|
||||
static
|
||||
void blake3_xof8_avx512(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[8 * 64]) {
|
||||
__m256i h_vecs[8] = {
|
||||
set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
|
||||
set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
|
||||
};
|
||||
uint32_t block_words[16];
|
||||
load_block_words(block, block_words);
|
||||
__m256i msg_vecs[16];
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
msg_vecs[i] = set1_256(block_words[i]);
|
||||
}
|
||||
__m256i counter_low_vec, counter_high_vec;
|
||||
load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
|
||||
__m256i block_len_vec = set1_256(block_len);
|
||||
__m256i block_flags_vec = set1_256(flags);
|
||||
__m256i v[16] = {
|
||||
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
|
||||
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
|
||||
set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
|
||||
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
|
||||
};
|
||||
round_fn8(v, msg_vecs, 0);
|
||||
round_fn8(v, msg_vecs, 1);
|
||||
round_fn8(v, msg_vecs, 2);
|
||||
round_fn8(v, msg_vecs, 3);
|
||||
round_fn8(v, msg_vecs, 4);
|
||||
round_fn8(v, msg_vecs, 5);
|
||||
round_fn8(v, msg_vecs, 6);
|
||||
for (size_t i = 0; i < 8; i++) {
|
||||
v[i] = xor_256(v[i], v[i+8]);
|
||||
v[i+8] = xor_256(v[i+8], h_vecs[i]);
|
||||
}
|
||||
transpose_vecs_256(&v[0]);
|
||||
transpose_vecs_256(&v[8]);
|
||||
for (size_t i = 0; i < 8; i++) {
|
||||
storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
|
||||
storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash16_avx512
|
||||
|
|
@ -1146,6 +1242,48 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
|
|||
_mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
|
||||
}
|
||||
|
||||
static
|
||||
void blake3_xof16_avx512(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[16 * 64]) {
|
||||
__m512i h_vecs[8] = {
|
||||
set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
|
||||
set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
|
||||
};
|
||||
uint32_t block_words[16];
|
||||
load_block_words(block, block_words);
|
||||
__m512i msg_vecs[16];
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
msg_vecs[i] = set1_512(block_words[i]);
|
||||
}
|
||||
__m512i counter_low_vec, counter_high_vec;
|
||||
load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
|
||||
__m512i block_len_vec = set1_512(block_len);
|
||||
__m512i block_flags_vec = set1_512(flags);
|
||||
__m512i v[16] = {
|
||||
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
|
||||
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
|
||||
set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
|
||||
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
|
||||
};
|
||||
round_fn16(v, msg_vecs, 0);
|
||||
round_fn16(v, msg_vecs, 1);
|
||||
round_fn16(v, msg_vecs, 2);
|
||||
round_fn16(v, msg_vecs, 3);
|
||||
round_fn16(v, msg_vecs, 4);
|
||||
round_fn16(v, msg_vecs, 5);
|
||||
round_fn16(v, msg_vecs, 6);
|
||||
for (size_t i = 0; i < 8; i++) {
|
||||
v[i] = xor_512(v[i], v[i+8]);
|
||||
v[i+8] = xor_512(v[i+8], h_vecs[i]);
|
||||
}
|
||||
transpose_vecs_512(&v[0]);
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
storeu_512(v[i], &out[i * sizeof(__m512i)]);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* ----------------------------------------------------------------------------
|
||||
* hash_many_avx512
|
||||
|
|
@ -1218,3 +1356,33 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
|||
out = &out[BLAKE3_OUT_LEN];
|
||||
}
|
||||
}
|
||||
|
||||
void blake3_xof_many_avx512(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t* out, size_t outblocks) {
|
||||
while (outblocks >= 16) {
|
||||
blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
|
||||
counter += 16;
|
||||
outblocks -= 16;
|
||||
out += 16 * BLAKE3_BLOCK_LEN;
|
||||
}
|
||||
while (outblocks >= 8) {
|
||||
blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
|
||||
counter += 8;
|
||||
outblocks -= 8;
|
||||
out += 8 * BLAKE3_BLOCK_LEN;
|
||||
}
|
||||
while (outblocks >= 4) {
|
||||
blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
|
||||
counter += 4;
|
||||
outblocks -= 4;
|
||||
out += 4 * BLAKE3_BLOCK_LEN;
|
||||
}
|
||||
while (outblocks > 0) {
|
||||
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
||||
counter += 1;
|
||||
outblocks -= 1;
|
||||
out += BLAKE3_BLOCK_LEN;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -223,6 +223,30 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|||
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
|
||||
}
|
||||
|
||||
|
||||
void blake3_xof_many(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[64], size_t outblocks) {
|
||||
if (outblocks == 0) {
|
||||
// The current assembly implementation always outputs at least 1 block.
|
||||
return;
|
||||
}
|
||||
#if defined(IS_X86)
|
||||
const enum cpu_feature features = get_cpu_features();
|
||||
MAYBE_UNUSED(features);
|
||||
#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
|
||||
if (features & AVX512VL) {
|
||||
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
for(size_t i = 0; i < outblocks; ++i) {
|
||||
blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
|
||||
}
|
||||
}
|
||||
|
||||
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8], uint64_t counter,
|
||||
bool increment_counter, uint8_t flags,
|
||||
|
|
|
|||
|
|
@ -9,6 +9,10 @@
|
|||
|
||||
#include "blake3.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// internal flags
|
||||
enum blake3_flags {
|
||||
CHUNK_START = 1 << 0,
|
||||
|
|
@ -28,7 +32,13 @@ enum blake3_flags {
|
|||
#define INLINE static inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) && !defined(_M_ARM64EC)
|
||||
#ifdef __cplusplus
|
||||
#define NOEXCEPT noexcept
|
||||
#else
|
||||
#define NOEXCEPT
|
||||
#endif
|
||||
|
||||
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
|
||||
#define IS_X86
|
||||
#define IS_X86_64
|
||||
#endif
|
||||
|
|
@ -38,7 +48,7 @@ enum blake3_flags {
|
|||
#define IS_X86_32
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__) || defined(_M_ARM64)|| defined(_M_ARM64EC)
|
||||
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
|
||||
#define IS_AARCH64
|
||||
#endif
|
||||
|
||||
|
|
@ -162,6 +172,13 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
|
|||
key_words[7] = load32(&key[7 * 4]);
|
||||
}
|
||||
|
||||
INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint32_t block_words[16]) {
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
block_words[i] = load32(&block[i * 4]);
|
||||
}
|
||||
}
|
||||
|
||||
INLINE void store32(void *dst, uint32_t w) {
|
||||
uint8_t *p = (uint8_t *)dst;
|
||||
p[0] = (uint8_t)(w >> 0);
|
||||
|
|
@ -191,6 +208,11 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[64]);
|
||||
|
||||
void blake3_xof_many(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t out[64], size_t outblocks);
|
||||
|
||||
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
||||
size_t blocks, const uint32_t key[8], uint64_t counter,
|
||||
bool increment_counter, uint8_t flags,
|
||||
|
|
@ -198,6 +220,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|||
|
||||
size_t blake3_simd_degree(void);
|
||||
|
||||
static size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
|
||||
const uint32_t key[8],
|
||||
uint64_t chunk_counter, uint8_t flags,
|
||||
uint8_t *out, bool use_tbb);
|
||||
|
||||
#if defined(BLAKE3_USE_TBB)
|
||||
void blake3_compress_subtree_wide_join_tbb(
|
||||
// shared params
|
||||
const uint32_t key[8], uint8_t flags, bool use_tbb,
|
||||
// left-hand side params
|
||||
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
|
||||
uint8_t *l_cvs, size_t *l_n,
|
||||
// right-hand side params
|
||||
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
|
||||
uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
|
||||
#endif
|
||||
|
||||
// Declarations for implementation-specific functions.
|
||||
void blake3_compress_in_place_portable(uint32_t cv[8],
|
||||
|
|
@ -270,6 +308,13 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
|||
uint64_t counter, bool increment_counter,
|
||||
uint8_t flags, uint8_t flags_start,
|
||||
uint8_t flags_end, uint8_t *out);
|
||||
|
||||
#if !defined(_WIN32)
|
||||
void blake3_xof_many_avx512(const uint32_t cv[8],
|
||||
const uint8_t block[BLAKE3_BLOCK_LEN],
|
||||
uint8_t block_len, uint64_t counter, uint8_t flags,
|
||||
uint8_t* out, size_t outblocks);
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
@ -281,5 +326,8 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
|||
uint8_t flags_end, uint8_t *out);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* BLAKE3_IMPL_H */
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
|
|||
}
|
||||
|
||||
INLINE uint32x4_t rot16_128(uint32x4_t x) {
|
||||
// The straightfoward implementation would be two shifts and an or, but that's
|
||||
// The straightforward implementation would be two shifts and an or, but that's
|
||||
// slower on microarchitectures we've tested. See
|
||||
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
|
||||
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue