From e124aa3611f38573898aa79c6eabe77bc874e58f Mon Sep 17 00:00:00 2001 From: aunsane Date: Fri, 15 Dec 2017 01:05:56 +0300 Subject: preparing to build tox from sources --- .../blake2b/generichash_blake2.c | 55 +++ .../src/crypto_generichash/blake2b/ref/blake2.h | 109 ++++++ .../blake2b/ref/blake2b-compress-avx2.c | 49 +++ .../blake2b/ref/blake2b-compress-avx2.h | 140 +++++++ .../blake2b/ref/blake2b-compress-ref.c | 93 +++++ .../blake2b/ref/blake2b-compress-sse41.c | 87 ++++ .../blake2b/ref/blake2b-compress-sse41.h | 103 +++++ .../blake2b/ref/blake2b-compress-ssse3.c | 90 +++++ .../blake2b/ref/blake2b-compress-ssse3.h | 103 +++++ .../blake2b/ref/blake2b-load-avx2.h | 340 ++++++++++++++++ .../blake2b/ref/blake2b-load-sse2.h | 164 ++++++++ .../blake2b/ref/blake2b-load-sse41.h | 307 +++++++++++++++ .../crypto_generichash/blake2b/ref/blake2b-ref.c | 436 +++++++++++++++++++++ .../blake2b/ref/generichash_blake2b.c | 111 ++++++ 14 files changed, 2187 insertions(+) create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/generichash_blake2.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ref.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-avx2.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse2.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse41.h create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-ref.c create mode 100644 libs/libsodium/src/crypto_generichash/blake2b/ref/generichash_blake2b.c (limited to 'libs/libsodium/src/crypto_generichash/blake2b') diff --git a/libs/libsodium/src/crypto_generichash/blake2b/generichash_blake2.c b/libs/libsodium/src/crypto_generichash/blake2b/generichash_blake2.c new file mode 100644 index 0000000000..781d4c584e --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/generichash_blake2.c @@ -0,0 +1,55 @@ +#include "crypto_generichash_blake2b.h" +#include "randombytes.h" + +size_t +crypto_generichash_blake2b_bytes_min(void) { + return crypto_generichash_blake2b_BYTES_MIN; +} + +size_t +crypto_generichash_blake2b_bytes_max(void) { + return crypto_generichash_blake2b_BYTES_MAX; +} + +size_t +crypto_generichash_blake2b_bytes(void) { + return crypto_generichash_blake2b_BYTES; +} + +size_t +crypto_generichash_blake2b_keybytes_min(void) { + return crypto_generichash_blake2b_KEYBYTES_MIN; +} + +size_t +crypto_generichash_blake2b_keybytes_max(void) { + return crypto_generichash_blake2b_KEYBYTES_MAX; +} + +size_t +crypto_generichash_blake2b_keybytes(void) { + return crypto_generichash_blake2b_KEYBYTES; +} + +size_t +crypto_generichash_blake2b_saltbytes(void) { + return crypto_generichash_blake2b_SALTBYTES; +} + +size_t +crypto_generichash_blake2b_personalbytes(void) { + return crypto_generichash_blake2b_PERSONALBYTES; +} + +size_t +crypto_generichash_blake2b_statebytes(void) +{ + return (sizeof(crypto_generichash_blake2b_state) + (size_t) 63U) + & ~(size_t) 63U; +} + +void +crypto_generichash_blake2b_keygen(unsigned char k[crypto_generichash_blake2b_KEYBYTES]) +{ + randombytes_buf(k, crypto_generichash_blake2b_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2.h new file mode 100644 index 0000000000..c6c4fccbb7 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2.h @@ -0,0 +1,109 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + All code is triple-licensed under the + [CC0](http://creativecommons.org/publicdomain/zero/1.0), the + [OpenSSL Licence](https://www.openssl.org/source/license.html), or + the [Apache Public License 2.0](http://www.apache.org/licenses/LICENSE-2.0), + at your choosing. + */ + +#ifndef blake2_H +#define blake2_H + +#include +#include + +#include "crypto_generichash_blake2b.h" +#include "export.h" + +#define blake2b_init_param crypto_generichash_blake2b__init_param +#define blake2b_init crypto_generichash_blake2b__init +#define blake2b_init_salt_personal \ + crypto_generichash_blake2b__init_salt_personal +#define blake2b_init_key crypto_generichash_blake2b__init_key +#define blake2b_init_key_salt_personal \ + crypto_generichash_blake2b__init_key_salt_personal +#define blake2b_update crypto_generichash_blake2b__update +#define blake2b_final crypto_generichash_blake2b__final +#define blake2b crypto_generichash_blake2b__blake2b +#define blake2b_salt_personal crypto_generichash_blake2b__blake2b_salt_personal +#define blake2b_pick_best_implementation \ + crypto_generichash_blake2b__pick_best_implementation + +enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 +}; + +#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#pragma pack(1) +#else +#pragma pack(push, 1) +#endif + +typedef struct blake2b_param_ { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint8_t leaf_length[4]; /* 8 */ + uint8_t node_offset[8]; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ +} blake2b_param; + +typedef crypto_generichash_blake2b_state blake2b_state; + +#if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#pragma pack() +#else +#pragma pack(pop) +#endif + +/* Streaming API */ +int blake2b_init(blake2b_state *S, const uint8_t outlen); +int blake2b_init_salt_personal(blake2b_state *S, const uint8_t outlen, + const void *salt, const void *personal); +int blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, + const uint8_t keylen); +int blake2b_init_key_salt_personal(blake2b_state *S, const uint8_t outlen, + const void *key, const uint8_t keylen, + const void *salt, const void *personal); +int blake2b_init_param(blake2b_state *S, const blake2b_param *P); +int blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen); +int blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen); + +/* Simple API */ +int blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, + const uint64_t inlen, uint8_t keylen); +int blake2b_salt_personal(uint8_t *out, const void *in, const void *key, + const uint8_t outlen, const uint64_t inlen, + uint8_t keylen, const void *salt, + const void *personal); + +typedef int (*blake2b_compress_fn)(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); +int blake2b_pick_best_implementation(void); +int blake2b_compress_ref(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); +int blake2b_compress_ssse3(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); +int blake2b_compress_sse41(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); +int blake2b_compress_avx2(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.c new file mode 100644 index 0000000000..7cb41fb6e7 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.c @@ -0,0 +1,49 @@ + +#define BLAKE2_USE_SSSE3 +#define BLAKE2_USE_SSE41 +#define BLAKE2_USE_AVX2 + +#include +#include + +#include "blake2.h" +#include "private/common.h" +#include "private/sse2_64_32.h" + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# pragma GCC target("avx2") +# endif + +# include +# include +# include +# include + +# include "blake2b-compress-avx2.h" + +CRYPTO_ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +int +blake2b_compress_avx2(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m256i a = LOADU(&S->h[0]); + __m256i b = LOADU(&S->h[4]); + BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]); + STOREU(&S->h[0], a); + STOREU(&S->h[4], b); + + return 0; +} + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.h new file mode 100644 index 0000000000..21acb2fa0c --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-avx2.h @@ -0,0 +1,140 @@ + +#ifndef blake2b_compress_avx2_H +#define blake2b_compress_avx2_H + +#define LOAD128(p) _mm_load_si128((__m128i *) (p)) +#define STORE128(p, r) _mm_store_si128((__m128i *) (p), r) + +#define LOADU128(p) _mm_loadu_si128((__m128i *) (p)) +#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r) + +#define LOAD(p) _mm256_load_si256((__m256i *) (p)) +#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r) + +#define LOADU(p) _mm256_loadu_si256((__m256i *) (p)) +#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r) + +static inline uint64_t +LOADU64(const void *p) +{ + uint64_t v; + memcpy(&v, p, sizeof v); + return v; +} + +#define ROTATE16 \ + _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \ + 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9) + +#define ROTATE24 \ + _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \ + 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10) + +#define ADD(a, b) _mm256_add_epi64(a, b) +#define SUB(a, b) _mm256_sub_epi64(a, b) + +#define XOR(a, b) _mm256_xor_si256(a, b) +#define AND(a, b) _mm256_and_si256(a, b) +#define OR(a, b) _mm256_or_si256(a, b) + +#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) +#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24) +#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16) +#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x))) + +#define BLAKE2B_G1_V1(a, b, c, d, m) \ + do { \ + a = ADD(a, m); \ + a = ADD(a, b); \ + d = XOR(d, a); \ + d = ROT32(d); \ + c = ADD(c, d); \ + b = XOR(b, c); \ + b = ROT24(b); \ + } while (0) + +#define BLAKE2B_G2_V1(a, b, c, d, m) \ + do { \ + a = ADD(a, m); \ + a = ADD(a, b); \ + d = XOR(d, a); \ + d = ROT16(d); \ + c = ADD(c, d); \ + b = XOR(b, c); \ + b = ROT63(b); \ + } while (0) + +#define BLAKE2B_DIAG_V1(a, b, c, d) \ + do { \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \ + b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \ + } while (0) + +#define BLAKE2B_UNDIAG_V1(a, b, c, d) \ + do { \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \ + b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \ + } while (0) + +#include "blake2b-load-avx2.h" + +#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \ + do { \ + __m256i b0; \ + BLAKE2B_LOAD_MSG_##r##_1(b0); \ + BLAKE2B_G1_V1(a, b, c, d, b0); \ + BLAKE2B_LOAD_MSG_##r##_2(b0); \ + BLAKE2B_G2_V1(a, b, c, d, b0); \ + BLAKE2B_DIAG_V1(a, b, c, d); \ + BLAKE2B_LOAD_MSG_##r##_3(b0); \ + BLAKE2B_G1_V1(a, b, c, d, b0); \ + BLAKE2B_LOAD_MSG_##r##_4(b0); \ + BLAKE2B_G2_V1(a, b, c, d, b0); \ + BLAKE2B_UNDIAG_V1(a, b, c, d); \ + } while (0) + +#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \ + do { \ + BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \ + } while (0) + +#define DECLARE_MESSAGE_WORDS(m) \ + const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \ + const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \ + const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \ + const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \ + const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \ + const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \ + const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \ + const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \ + __m256i t0, t1; + +#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \ + do { \ + DECLARE_MESSAGE_WORDS(m) \ + const __m256i iv0 = a; \ + const __m256i iv1 = b; \ + __m256i c = LOAD(&blake2b_IV[0]); \ + __m256i d = \ + XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \ + BLAKE2B_ROUNDS_V1(a, b, c, d, m); \ + a = XOR(a, c); \ + b = XOR(b, d); \ + a = XOR(a, iv0); \ + b = XOR(b, iv1); \ + } while (0) + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ref.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ref.c new file mode 100644 index 0000000000..614fa34af7 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ref.c @@ -0,0 +1,93 @@ + +#include +#include + +#include "blake2.h" +#include "private/common.h" + +CRYPTO_ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +static const uint8_t blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +int +blake2b_compress_ref(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + uint64_t m[16]; + uint64_t v[16]; + int i; + + for (i = 0; i < 16; ++i) + m[i] = LOAD64_LE(block + i * sizeof(m[i])); + + for (i = 0; i < 8; ++i) + v[i] = S->h[i]; + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = S->t[0] ^ blake2b_IV[4]; + v[13] = S->t[1] ^ blake2b_IV[5]; + v[14] = S->f[0] ^ blake2b_IV[6]; + v[15] = S->f[1] ^ blake2b_IV[7]; +#define G(r, i, a, b, c, d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = ROTR64(d ^ a, 32); \ + c = c + d; \ + b = ROTR64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = ROTR64(d ^ a, 16); \ + c = c + d; \ + b = ROTR64(b ^ c, 63); \ + } while (0) +#define ROUND(r) \ + do { \ + G(r, 0, v[0], v[4], v[8], v[12]); \ + G(r, 1, v[1], v[5], v[9], v[13]); \ + G(r, 2, v[2], v[6], v[10], v[14]); \ + G(r, 3, v[3], v[7], v[11], v[15]); \ + G(r, 4, v[0], v[5], v[10], v[15]); \ + G(r, 5, v[1], v[6], v[11], v[12]); \ + G(r, 6, v[2], v[7], v[8], v[13]); \ + G(r, 7, v[3], v[4], v[9], v[14]); \ + } while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + + for (i = 0; i < 8; ++i) { + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + } + +#undef G +#undef ROUND + return 0; +} diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.c new file mode 100644 index 0000000000..9e5c0c5081 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.c @@ -0,0 +1,87 @@ + +#define BLAKE2_USE_SSSE3 +#define BLAKE2_USE_SSE41 + +#include +#include + +#include "blake2.h" +#include "private/common.h" +#include "private/sse2_64_32.h" + +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ + defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# endif + +# include +# include +# include + +# include "blake2b-compress-sse41.h" + +CRYPTO_ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +int +blake2b_compress_sse41(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; + const __m128i r16 = + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = + _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.h new file mode 100644 index 0000000000..ac78e5bb1e --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-sse41.h @@ -0,0 +1,103 @@ + +#ifndef blake2b_compress_sse41_H +#define blake2b_compress_sse41_H + +#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p)) +#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r) + +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) \ + ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ + : (-(c) == 24) \ + ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) \ + ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) \ + ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_slli_epi64((x), 64 - (-(c)))) + +#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); + +#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); + +#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#include "blake2b-load-sse41.h" + +#define ROUND(r) \ + LOAD_MSG_##r##_1(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_##r##_2(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + LOAD_MSG_##r##_3(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_##r##_4(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.c new file mode 100644 index 0000000000..a207a64d40 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.c @@ -0,0 +1,90 @@ + +#include +#include + +#include "blake2.h" +#include "private/common.h" +#include "private/sse2_64_32.h" + +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# endif + +# include +# include + +# include "blake2b-compress-ssse3.h" + +CRYPTO_ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +int +blake2b_compress_ssse3(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; + const __m128i r16 = + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = + _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + const uint64_t m0 = ((uint64_t *) block)[0]; + const uint64_t m1 = ((uint64_t *) block)[1]; + const uint64_t m2 = ((uint64_t *) block)[2]; + const uint64_t m3 = ((uint64_t *) block)[3]; + const uint64_t m4 = ((uint64_t *) block)[4]; + const uint64_t m5 = ((uint64_t *) block)[5]; + const uint64_t m6 = ((uint64_t *) block)[6]; + const uint64_t m7 = ((uint64_t *) block)[7]; + const uint64_t m8 = ((uint64_t *) block)[8]; + const uint64_t m9 = ((uint64_t *) block)[9]; + const uint64_t m10 = ((uint64_t *) block)[10]; + const uint64_t m11 = ((uint64_t *) block)[11]; + const uint64_t m12 = ((uint64_t *) block)[12]; + const uint64_t m13 = ((uint64_t *) block)[13]; + const uint64_t m14 = ((uint64_t *) block)[14]; + const uint64_t m15 = ((uint64_t *) block)[15]; + + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.h new file mode 100644 index 0000000000..9a7164fe25 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-compress-ssse3.h @@ -0,0 +1,103 @@ + +#ifndef blake2b_compress_ssse3_H +#define blake2b_compress_ssse3_H + +#define LOADU(p) _mm_loadu_si128((const __m128i *) (const void *) (p)) +#define STOREU(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r) + +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) \ + ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \ + : (-(c) == 24) \ + ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) \ + ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) \ + ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \ + _mm_slli_epi64((x), 64 - (-(c)))) + +#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); + +#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); + +#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#include "blake2b-load-sse2.h" + +#define ROUND(r) \ + LOAD_MSG_##r##_1(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_##r##_2(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + LOAD_MSG_##r##_3(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_##r##_4(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-avx2.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-avx2.h new file mode 100644 index 0000000000..8c15f177c7 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-avx2.h @@ -0,0 +1,340 @@ +#ifndef blake2b_load_avx2_H +#define blake2b_load_avx2_H + +#define BLAKE2B_LOAD_MSG_0_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m0, m1); \ + t1 = _mm256_unpacklo_epi64(m2, m3); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_0_2(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m0, m1); \ + t1 = _mm256_unpackhi_epi64(m2, m3); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_0_3(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m4, m5); \ + t1 = _mm256_unpacklo_epi64(m6, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_0_4(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m4, m5); \ + t1 = _mm256_unpackhi_epi64(m6, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_1_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m7, m2); \ + t1 = _mm256_unpackhi_epi64(m4, m6); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_1_2(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m5, m4); \ + t1 = _mm256_alignr_epi8(m3, m7, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_1_3(b0) \ + do { \ + t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \ + t1 = _mm256_unpackhi_epi64(m5, m2); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_1_4(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m6, m1); \ + t1 = _mm256_unpackhi_epi64(m3, m1); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_2_1(b0) \ + do { \ + t0 = _mm256_alignr_epi8(m6, m5, 8); \ + t1 = _mm256_unpackhi_epi64(m2, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_2_2(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m4, m0); \ + t1 = _mm256_blend_epi32(m6, m1, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_2_3(b0) \ + do { \ + t0 = _mm256_blend_epi32(m1, m5, 0x33); \ + t1 = _mm256_unpackhi_epi64(m3, m4); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_2_4(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m7, m3); \ + t1 = _mm256_alignr_epi8(m2, m0, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_3_1(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m3, m1); \ + t1 = _mm256_unpackhi_epi64(m6, m5); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_3_2(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m4, m0); \ + t1 = _mm256_unpacklo_epi64(m6, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_3_3(b0) \ + do { \ + t0 = _mm256_blend_epi32(m2, m1, 0x33); \ + t1 = _mm256_blend_epi32(m7, m2, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_3_4(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m3, m5); \ + t1 = _mm256_unpacklo_epi64(m0, m4); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_4_1(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m4, m2); \ + t1 = _mm256_unpacklo_epi64(m1, m5); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_4_2(b0) \ + do { \ + t0 = _mm256_blend_epi32(m3, m0, 0x33); \ + t1 = _mm256_blend_epi32(m7, m2, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_4_3(b0) \ + do { \ + t0 = _mm256_blend_epi32(m5, m7, 0x33); \ + t1 = _mm256_blend_epi32(m1, m3, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_4_4(b0) \ + do { \ + t0 = _mm256_alignr_epi8(m6, m0, 8); \ + t1 = _mm256_blend_epi32(m6, m4, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_5_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m1, m3); \ + t1 = _mm256_unpacklo_epi64(m0, m4); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_5_2(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m6, m5); \ + t1 = _mm256_unpackhi_epi64(m5, m1); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_5_3(b0) \ + do { \ + t0 = _mm256_blend_epi32(m3, m2, 0x33); \ + t1 = _mm256_unpackhi_epi64(m7, m0); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_5_4(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m6, m2); \ + t1 = _mm256_blend_epi32(m4, m7, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_6_1(b0) \ + do { \ + t0 = _mm256_blend_epi32(m0, m6, 0x33); \ + t1 = _mm256_unpacklo_epi64(m7, m2); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_6_2(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m2, m7); \ + t1 = _mm256_alignr_epi8(m5, m6, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_6_3(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m0, m3); \ + t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_6_4(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m3, m1); \ + t1 = _mm256_blend_epi32(m5, m1, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_7_1(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m6, m3); \ + t1 = _mm256_blend_epi32(m1, m6, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_7_2(b0) \ + do { \ + t0 = _mm256_alignr_epi8(m7, m5, 8); \ + t1 = _mm256_unpackhi_epi64(m0, m4); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_7_3(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m2, m7); \ + t1 = _mm256_unpacklo_epi64(m4, m1); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_7_4(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m0, m2); \ + t1 = _mm256_unpacklo_epi64(m3, m5); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_8_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m3, m7); \ + t1 = _mm256_alignr_epi8(m0, m5, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_8_2(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m7, m4); \ + t1 = _mm256_alignr_epi8(m4, m1, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_8_3(b0) \ + do { \ + t0 = m6; \ + t1 = _mm256_alignr_epi8(m5, m0, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_8_4(b0) \ + do { \ + t0 = _mm256_blend_epi32(m3, m1, 0x33); \ + t1 = m2; \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_9_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m5, m4); \ + t1 = _mm256_unpackhi_epi64(m3, m0); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_9_2(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m1, m2); \ + t1 = _mm256_blend_epi32(m2, m3, 0x33); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_9_3(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m7, m4); \ + t1 = _mm256_unpackhi_epi64(m1, m6); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_9_4(b0) \ + do { \ + t0 = _mm256_alignr_epi8(m7, m5, 8); \ + t1 = _mm256_unpacklo_epi64(m6, m0); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_10_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m0, m1); \ + t1 = _mm256_unpacklo_epi64(m2, m3); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_10_2(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m0, m1); \ + t1 = _mm256_unpackhi_epi64(m2, m3); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_10_3(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m4, m5); \ + t1 = _mm256_unpacklo_epi64(m6, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_10_4(b0) \ + do { \ + t0 = _mm256_unpackhi_epi64(m4, m5); \ + t1 = _mm256_unpackhi_epi64(m6, m7); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_11_1(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m7, m2); \ + t1 = _mm256_unpackhi_epi64(m4, m6); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_11_2(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m5, m4); \ + t1 = _mm256_alignr_epi8(m3, m7, 8); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_11_3(b0) \ + do { \ + t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \ + t1 = _mm256_unpackhi_epi64(m5, m2); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#define BLAKE2B_LOAD_MSG_11_4(b0) \ + do { \ + t0 = _mm256_unpacklo_epi64(m6, m1); \ + t1 = _mm256_unpackhi_epi64(m3, m1); \ + b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ + } while (0) + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse2.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse2.h new file mode 100644 index 0000000000..8e67421aca --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse2.h @@ -0,0 +1,164 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along + with + this software. If not, see + . +*/ + +#ifndef blake2b_load_sse2_H +#define blake2b_load_sse2_H + +#define LOAD_MSG_0_1(b0, b1) \ + b0 = _mm_set_epi64x(m2, m0); \ + b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) \ + b0 = _mm_set_epi64x(m3, m1); \ + b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) \ + b0 = _mm_set_epi64x(m10, m8); \ + b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) \ + b0 = _mm_set_epi64x(m11, m9); \ + b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) \ + b0 = _mm_set_epi64x(m4, m14); \ + b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) \ + b0 = _mm_set_epi64x(m8, m10); \ + b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) \ + b0 = _mm_set_epi64x(m0, m1); \ + b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) \ + b0 = _mm_set_epi64x(m2, m12); \ + b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) \ + b0 = _mm_set_epi64x(m12, m11); \ + b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) \ + b0 = _mm_set_epi64x(m0, m8); \ + b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) \ + b0 = _mm_set_epi64x(m3, m10); \ + b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) \ + b0 = _mm_set_epi64x(m6, m14); \ + b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) \ + b0 = _mm_set_epi64x(m3, m7); \ + b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) \ + b0 = _mm_set_epi64x(m1, m9); \ + b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) \ + b0 = _mm_set_epi64x(m5, m2); \ + b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) \ + b0 = _mm_set_epi64x(m10, m6); \ + b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) \ + b0 = _mm_set_epi64x(m5, m9); \ + b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) \ + b0 = _mm_set_epi64x(m7, m0); \ + b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) \ + b0 = _mm_set_epi64x(m11, m14); \ + b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) \ + b0 = _mm_set_epi64x(m12, m1); \ + b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) \ + b0 = _mm_set_epi64x(m6, m2); \ + b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) \ + b0 = _mm_set_epi64x(m10, m12); \ + b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) \ + b0 = _mm_set_epi64x(m7, m4); \ + b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) \ + b0 = _mm_set_epi64x(m5, m13); \ + b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) \ + b0 = _mm_set_epi64x(m1, m12); \ + b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) \ + b0 = _mm_set_epi64x(m15, m5); \ + b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) \ + b0 = _mm_set_epi64x(m6, m0); \ + b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) \ + b0 = _mm_set_epi64x(m3, m7); \ + b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) \ + b0 = _mm_set_epi64x(m7, m13); \ + b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) \ + b0 = _mm_set_epi64x(m14, m11); \ + b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) \ + b0 = _mm_set_epi64x(m15, m5); \ + b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) \ + b0 = _mm_set_epi64x(m4, m0); \ + b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) \ + b0 = _mm_set_epi64x(m14, m6); \ + b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) \ + b0 = _mm_set_epi64x(m9, m15); \ + b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) \ + b0 = _mm_set_epi64x(m13, m12); \ + b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) \ + b0 = _mm_set_epi64x(m7, m2); \ + b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) \ + b0 = _mm_set_epi64x(m8, m10); \ + b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) \ + b0 = _mm_set_epi64x(m4, m2); \ + b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) \ + b0 = _mm_set_epi64x(m9, m15); \ + b1 = _mm_set_epi64x(m13, m3) +#define LOAD_MSG_9_4(b0, b1) \ + b0 = _mm_set_epi64x(m14, m11); \ + b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) \ + b0 = _mm_set_epi64x(m2, m0); \ + b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) \ + b0 = _mm_set_epi64x(m3, m1); \ + b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) \ + b0 = _mm_set_epi64x(m10, m8); \ + b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) \ + b0 = _mm_set_epi64x(m11, m9); \ + b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) \ + b0 = _mm_set_epi64x(m4, m14); \ + b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) \ + b0 = _mm_set_epi64x(m8, m10); \ + b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) \ + b0 = _mm_set_epi64x(m0, m1); \ + b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) \ + b0 = _mm_set_epi64x(m2, m12); \ + b1 = _mm_set_epi64x(m3, m7) + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse41.h b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse41.h new file mode 100644 index 0000000000..31745fc139 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-load-sse41.h @@ -0,0 +1,307 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along + with + this software. If not, see + . +*/ + +#ifndef blake2b_load_sse41_H +#define blake2b_load_sse41_H + +#define LOAD_MSG_0_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m0, m1); \ + b1 = _mm_unpacklo_epi64(m2, m3); \ + } while (0) + +#define LOAD_MSG_0_2(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m0, m1); \ + b1 = _mm_unpackhi_epi64(m2, m3); \ + } while (0) + +#define LOAD_MSG_0_3(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m4, m5); \ + b1 = _mm_unpacklo_epi64(m6, m7); \ + } while (0) + +#define LOAD_MSG_0_4(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m4, m5); \ + b1 = _mm_unpackhi_epi64(m6, m7); \ + } while (0) + +#define LOAD_MSG_1_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m7, m2); \ + b1 = _mm_unpackhi_epi64(m4, m6); \ + } while (0) + +#define LOAD_MSG_1_2(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m5, m4); \ + b1 = _mm_alignr_epi8(m3, m7, 8); \ + } while (0) + +#define LOAD_MSG_1_3(b0, b1) \ + do { \ + b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \ + b1 = _mm_unpackhi_epi64(m5, m2); \ + } while (0) + +#define LOAD_MSG_1_4(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m6, m1); \ + b1 = _mm_unpackhi_epi64(m3, m1); \ + } while (0) + +#define LOAD_MSG_2_1(b0, b1) \ + do { \ + b0 = _mm_alignr_epi8(m6, m5, 8); \ + b1 = _mm_unpackhi_epi64(m2, m7); \ + } while (0) + +#define LOAD_MSG_2_2(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m4, m0); \ + b1 = _mm_blend_epi16(m1, m6, 0xF0); \ + } while (0) + +#define LOAD_MSG_2_3(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m5, m1, 0xF0); \ + b1 = _mm_unpackhi_epi64(m3, m4); \ + } while (0) + +#define LOAD_MSG_2_4(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m7, m3); \ + b1 = _mm_alignr_epi8(m2, m0, 8); \ + } while (0) + +#define LOAD_MSG_3_1(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m3, m1); \ + b1 = _mm_unpackhi_epi64(m6, m5); \ + } while (0) + +#define LOAD_MSG_3_2(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m4, m0); \ + b1 = _mm_unpacklo_epi64(m6, m7); \ + } while (0) + +#define LOAD_MSG_3_3(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m1, m2, 0xF0); \ + b1 = _mm_blend_epi16(m2, m7, 0xF0); \ + } while (0) + +#define LOAD_MSG_3_4(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m3, m5); \ + b1 = _mm_unpacklo_epi64(m0, m4); \ + } while (0) + +#define LOAD_MSG_4_1(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m4, m2); \ + b1 = _mm_unpacklo_epi64(m1, m5); \ + } while (0) + +#define LOAD_MSG_4_2(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m0, m3, 0xF0); \ + b1 = _mm_blend_epi16(m2, m7, 0xF0); \ + } while (0) + +#define LOAD_MSG_4_3(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m7, m5, 0xF0); \ + b1 = _mm_blend_epi16(m3, m1, 0xF0); \ + } while (0) + +#define LOAD_MSG_4_4(b0, b1) \ + do { \ + b0 = _mm_alignr_epi8(m6, m0, 8); \ + b1 = _mm_blend_epi16(m4, m6, 0xF0); \ + } while (0) + +#define LOAD_MSG_5_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m1, m3); \ + b1 = _mm_unpacklo_epi64(m0, m4); \ + } while (0) + +#define LOAD_MSG_5_2(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m6, m5); \ + b1 = _mm_unpackhi_epi64(m5, m1); \ + } while (0) + +#define LOAD_MSG_5_3(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m2, m3, 0xF0); \ + b1 = _mm_unpackhi_epi64(m7, m0); \ + } while (0) + +#define LOAD_MSG_5_4(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m6, m2); \ + b1 = _mm_blend_epi16(m7, m4, 0xF0); \ + } while (0) + +#define LOAD_MSG_6_1(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m6, m0, 0xF0); \ + b1 = _mm_unpacklo_epi64(m7, m2); \ + } while (0) + +#define LOAD_MSG_6_2(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m2, m7); \ + b1 = _mm_alignr_epi8(m5, m6, 8); \ + } while (0) + +#define LOAD_MSG_6_3(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m0, m3); \ + b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \ + } while (0) + +#define LOAD_MSG_6_4(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m3, m1); \ + b1 = _mm_blend_epi16(m1, m5, 0xF0); \ + } while (0) + +#define LOAD_MSG_7_1(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m6, m3); \ + b1 = _mm_blend_epi16(m6, m1, 0xF0); \ + } while (0) + +#define LOAD_MSG_7_2(b0, b1) \ + do { \ + b0 = _mm_alignr_epi8(m7, m5, 8); \ + b1 = _mm_unpackhi_epi64(m0, m4); \ + } while (0) + +#define LOAD_MSG_7_3(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m2, m7); \ + b1 = _mm_unpacklo_epi64(m4, m1); \ + } while (0) + +#define LOAD_MSG_7_4(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m0, m2); \ + b1 = _mm_unpacklo_epi64(m3, m5); \ + } while (0) + +#define LOAD_MSG_8_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m3, m7); \ + b1 = _mm_alignr_epi8(m0, m5, 8); \ + } while (0) + +#define LOAD_MSG_8_2(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m7, m4); \ + b1 = _mm_alignr_epi8(m4, m1, 8); \ + } while (0) + +#define LOAD_MSG_8_3(b0, b1) \ + do { \ + b0 = m6; \ + b1 = _mm_alignr_epi8(m5, m0, 8); \ + } while (0) + +#define LOAD_MSG_8_4(b0, b1) \ + do { \ + b0 = _mm_blend_epi16(m1, m3, 0xF0); \ + b1 = m2; \ + } while (0) + +#define LOAD_MSG_9_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m5, m4); \ + b1 = _mm_unpackhi_epi64(m3, m0); \ + } while (0) + +#define LOAD_MSG_9_2(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m1, m2); \ + b1 = _mm_blend_epi16(m3, m2, 0xF0); \ + } while (0) + +#define LOAD_MSG_9_3(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m7, m4); \ + b1 = _mm_unpackhi_epi64(m1, m6); \ + } while (0) + +#define LOAD_MSG_9_4(b0, b1) \ + do { \ + b0 = _mm_alignr_epi8(m7, m5, 8); \ + b1 = _mm_unpacklo_epi64(m6, m0); \ + } while (0) + +#define LOAD_MSG_10_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m0, m1); \ + b1 = _mm_unpacklo_epi64(m2, m3); \ + } while (0) + +#define LOAD_MSG_10_2(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m0, m1); \ + b1 = _mm_unpackhi_epi64(m2, m3); \ + } while (0) + +#define LOAD_MSG_10_3(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m4, m5); \ + b1 = _mm_unpacklo_epi64(m6, m7); \ + } while (0) + +#define LOAD_MSG_10_4(b0, b1) \ + do { \ + b0 = _mm_unpackhi_epi64(m4, m5); \ + b1 = _mm_unpackhi_epi64(m6, m7); \ + } while (0) + +#define LOAD_MSG_11_1(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m7, m2); \ + b1 = _mm_unpackhi_epi64(m4, m6); \ + } while (0) + +#define LOAD_MSG_11_2(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m5, m4); \ + b1 = _mm_alignr_epi8(m3, m7, 8); \ + } while (0) + +#define LOAD_MSG_11_3(b0, b1) \ + do { \ + b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \ + b1 = _mm_unpackhi_epi64(m5, m2); \ + } while (0) + +#define LOAD_MSG_11_4(b0, b1) \ + do { \ + b0 = _mm_unpacklo_epi64(m6, m1); \ + b1 = _mm_unpackhi_epi64(m3, m1); \ + } while (0) + +#endif diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-ref.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-ref.c new file mode 100644 index 0000000000..91435a1b16 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/blake2b-ref.c @@ -0,0 +1,436 @@ +/* + BLAKE2 reference source code package - C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along + with + this software. If not, see + . +*/ + +#include +#include +#include +#include +#include + +#include "blake2.h" +#include "core.h" +#include "private/common.h" +#include "runtime.h" +#include "utils.h" + +static blake2b_compress_fn blake2b_compress = blake2b_compress_ref; + +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* LCOV_EXCL_START */ +static inline int +blake2b_set_lastnode(blake2b_state *S) +{ + S->f[1] = -1; + return 0; +} +/* LCOV_EXCL_STOP */ + +static inline int +blake2b_is_lastblock(const blake2b_state *S) +{ + return S->f[0] != 0; +} + +static inline int +blake2b_set_lastblock(blake2b_state *S) +{ + if (S->last_node) + blake2b_set_lastnode(S); + + S->f[0] = -1; + return 0; +} + +static inline int +blake2b_increment_counter(blake2b_state *S, const uint64_t inc) +{ +#ifdef HAVE_TI_MODE + uint128_t t = ((uint128_t) S->t[1] << 64) | S->t[0]; + t += inc; + S->t[0] = (uint64_t)(t >> 0); + S->t[1] = (uint64_t)(t >> 64); +#else + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +#endif + return 0; +} + +/* Parameter-related functions */ +static inline int +blake2b_param_set_salt(blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES]) +{ + memcpy(P->salt, salt, BLAKE2B_SALTBYTES); + return 0; +} + +static inline int +blake2b_param_set_personal(blake2b_param *P, + const uint8_t personal[BLAKE2B_PERSONALBYTES]) +{ + memcpy(P->personal, personal, BLAKE2B_PERSONALBYTES); + return 0; +} + +static inline int +blake2b_init0(blake2b_state *S) +{ + int i; + + for (i = 0; i < 8; i++) { + S->h[i] = blake2b_IV[i]; + } + memset(S->t, 0, offsetof(blake2b_state, last_node) + sizeof(S->last_node) + - offsetof(blake2b_state, t)); + return 0; +} + +/* init xors IV with input parameter block */ +int +blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + size_t i; + const uint8_t *p; + + COMPILER_ASSERT(sizeof *P == 64); + blake2b_init0(S); + p = (const uint8_t *) (P); + + /* IV XOR ParamBlock */ + for (i = 0; i < 8; i++) { + S->h[i] ^= LOAD64_LE(p + sizeof(S->h[i]) * i); + } + return 0; +} + +int +blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + blake2b_param P[1]; + + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) { + sodium_misuse(); + } + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + STORE32_LE(P->leaf_length, 0); + STORE64_LE(P->node_offset, 0); + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memset(P->personal, 0, sizeof(P->personal)); + return blake2b_init_param(S, P); +} + +int +blake2b_init_salt_personal(blake2b_state *S, const uint8_t outlen, + const void *salt, const void *personal) +{ + blake2b_param P[1]; + + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) { + sodium_misuse(); + } + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + STORE32_LE(P->leaf_length, 0); + STORE64_LE(P->node_offset, 0); + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + if (salt != NULL) { + blake2b_param_set_salt(P, (const uint8_t *) salt); + } else { + memset(P->salt, 0, sizeof(P->salt)); + } + if (personal != NULL) { + blake2b_param_set_personal(P, (const uint8_t *) personal); + } else { + memset(P->personal, 0, sizeof(P->personal)); + } + return blake2b_init_param(S, P); +} + +int +blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, + const uint8_t keylen) +{ + blake2b_param P[1]; + + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) { + sodium_misuse(); + } + if (!key || !keylen || keylen > BLAKE2B_KEYBYTES) { + sodium_misuse(); + } + P->digest_length = outlen; + P->key_length = keylen; + P->fanout = 1; + P->depth = 1; + STORE32_LE(P->leaf_length, 0); + STORE64_LE(P->node_offset, 0); + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memset(P->personal, 0, sizeof(P->personal)); + + if (blake2b_init_param(S, P) < 0) { + sodium_misuse(); + } + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); /* keylen cannot be 0 */ + blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + sodium_memzero(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +int +blake2b_init_key_salt_personal(blake2b_state *S, const uint8_t outlen, + const void *key, const uint8_t keylen, + const void *salt, const void *personal) +{ + blake2b_param P[1]; + + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) { + sodium_misuse(); + } + if (!key || !keylen || keylen > BLAKE2B_KEYBYTES) { + sodium_misuse(); + } + P->digest_length = outlen; + P->key_length = keylen; + P->fanout = 1; + P->depth = 1; + STORE32_LE(P->leaf_length, 0); + STORE64_LE(P->node_offset, 0); + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + if (salt != NULL) { + blake2b_param_set_salt(P, (const uint8_t *) salt); + } else { + memset(P->salt, 0, sizeof(P->salt)); + } + if (personal != NULL) { + blake2b_param_set_personal(P, (const uint8_t *) personal); + } else { + memset(P->personal, 0, sizeof(P->personal)); + } + + if (blake2b_init_param(S, P) < 0) { + sodium_misuse(); + } + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); /* keylen cannot be 0 */ + blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + sodium_memzero(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +/* inlen now in bytes */ +int +blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) { + size_t left = S->buflen; + size_t fill = 2 * BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) { + memcpy(S->buf + left, in, fill); /* Fill buffer */ + S->buflen += fill; + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); /* Compress */ + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, + BLAKE2B_BLOCKBYTES); /* Shift buffer left */ + S->buflen -= BLAKE2B_BLOCKBYTES; + in += fill; + inlen -= fill; + } else /* inlen <= fill */ + { + memcpy(S->buf + left, in, inlen); + S->buflen += inlen; /* Be lazy, do not compress */ + in += inlen; + inlen -= inlen; + } + } + + return 0; +} + +int +blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + unsigned char buffer[BLAKE2B_OUTBYTES]; + + if (!outlen || outlen > BLAKE2B_OUTBYTES) { + sodium_misuse(); + } + if (blake2b_is_lastblock(S)) { + return -1; + } + if (S->buflen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + assert(S->buflen <= BLAKE2B_BLOCKBYTES); + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + memset(S->buf + S->buflen, 0, + 2 * BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + COMPILER_ASSERT(sizeof buffer == 64U); + STORE64_LE(buffer + 8 * 0, S->h[0]); + STORE64_LE(buffer + 8 * 1, S->h[1]); + STORE64_LE(buffer + 8 * 2, S->h[2]); + STORE64_LE(buffer + 8 * 3, S->h[3]); + STORE64_LE(buffer + 8 * 4, S->h[4]); + STORE64_LE(buffer + 8 * 5, S->h[5]); + STORE64_LE(buffer + 8 * 6, S->h[6]); + STORE64_LE(buffer + 8 * 7, S->h[7]); + memcpy(out, buffer, outlen); /* outlen <= BLAKE2B_OUTBYTES (64) */ + + sodium_memzero(S->h, sizeof S->h); + sodium_memzero(S->buf, sizeof S->buf); + + return 0; +} + +/* inlen, at least, should be uint64_t. Others can be size_t. */ +int +blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, + const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (NULL == in && inlen > 0) { + sodium_misuse(); + } + if (NULL == out) { + sodium_misuse(); + } + if (!outlen || outlen > BLAKE2B_OUTBYTES) { + sodium_misuse(); + } + if (NULL == key && keylen > 0) { + sodium_misuse(); + } + if (keylen > BLAKE2B_KEYBYTES) { + sodium_misuse(); + } + if (keylen > 0) { + if (blake2b_init_key(S, outlen, key, keylen) < 0) { + sodium_misuse(); + } + } else { + if (blake2b_init(S, outlen) < 0) { + sodium_misuse(); + } + } + + blake2b_update(S, (const uint8_t *) in, inlen); + blake2b_final(S, out, outlen); + return 0; +} + +int +blake2b_salt_personal(uint8_t *out, const void *in, const void *key, + const uint8_t outlen, const uint64_t inlen, + uint8_t keylen, const void *salt, const void *personal) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (NULL == in && inlen > 0) { + sodium_misuse(); + } + if (NULL == out) { + sodium_misuse(); + } + if (!outlen || outlen > BLAKE2B_OUTBYTES) { + sodium_misuse(); + } + if (NULL == key && keylen > 0) { + sodium_misuse(); + } + if (keylen > BLAKE2B_KEYBYTES) { + sodium_misuse(); + } + if (keylen > 0) { + if (blake2b_init_key_salt_personal(S, outlen, key, keylen, salt, + personal) < 0) { + sodium_misuse(); + } + } else { + if (blake2b_init_salt_personal(S, outlen, salt, personal) < 0) { + sodium_misuse(); + } + } + + blake2b_update(S, (const uint8_t *) in, inlen); + blake2b_final(S, out, outlen); + return 0; +} + +int +blake2b_pick_best_implementation(void) +{ +/* LCOV_EXCL_START */ +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ + defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_avx2()) { + blake2b_compress = blake2b_compress_avx2; + return 0; + } +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ + defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_sse41()) { + blake2b_compress = blake2b_compress_sse41; + return 0; + } +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + if (sodium_runtime_has_ssse3()) { + blake2b_compress = blake2b_compress_ssse3; + return 0; + } +#endif + blake2b_compress = blake2b_compress_ref; + + return 0; + /* LCOV_EXCL_STOP */ +} diff --git a/libs/libsodium/src/crypto_generichash/blake2b/ref/generichash_blake2b.c b/libs/libsodium/src/crypto_generichash/blake2b/ref/generichash_blake2b.c new file mode 100644 index 0000000000..4bd0855006 --- /dev/null +++ b/libs/libsodium/src/crypto_generichash/blake2b/ref/generichash_blake2b.c @@ -0,0 +1,111 @@ + +#include +#include +#include + +#include "blake2.h" +#include "crypto_generichash_blake2b.h" +#include "private/implementations.h" + +int +crypto_generichash_blake2b(unsigned char *out, size_t outlen, + const unsigned char *in, unsigned long long inlen, + const unsigned char *key, size_t keylen) +{ + if (outlen <= 0U || outlen > BLAKE2B_OUTBYTES || + keylen > BLAKE2B_KEYBYTES || inlen > UINT64_MAX) { + return -1; + } + assert(outlen <= UINT8_MAX); + assert(keylen <= UINT8_MAX); + + return blake2b((uint8_t *) out, in, key, (uint8_t) outlen, (uint64_t) inlen, + (uint8_t) keylen); +} + +int +crypto_generichash_blake2b_salt_personal( + unsigned char *out, size_t outlen, const unsigned char *in, + unsigned long long inlen, const unsigned char *key, size_t keylen, + const unsigned char *salt, const unsigned char *personal) +{ + if (outlen <= 0U || outlen > BLAKE2B_OUTBYTES || + keylen > BLAKE2B_KEYBYTES || inlen > UINT64_MAX) { + return -1; + } + assert(outlen <= UINT8_MAX); + assert(keylen <= UINT8_MAX); + + return blake2b_salt_personal((uint8_t *) out, in, key, (uint8_t) outlen, + (uint64_t) inlen, (uint8_t) keylen, salt, + personal); +} + +int +crypto_generichash_blake2b_init(crypto_generichash_blake2b_state *state, + const unsigned char *key, const size_t keylen, + const size_t outlen) +{ + if (outlen <= 0U || outlen > BLAKE2B_OUTBYTES || + keylen > BLAKE2B_KEYBYTES) { + return -1; + } + assert(outlen <= UINT8_MAX); + assert(keylen <= UINT8_MAX); + if (key == NULL || keylen <= 0U) { + if (blake2b_init(state, (uint8_t) outlen) != 0) { + return -1; /* LCOV_EXCL_LINE */ + } + } else if (blake2b_init_key(state, (uint8_t) outlen, key, + (uint8_t) keylen) != 0) { + return -1; /* LCOV_EXCL_LINE */ + } + return 0; +} + +int +crypto_generichash_blake2b_init_salt_personal( + crypto_generichash_blake2b_state *state, const unsigned char *key, + const size_t keylen, const size_t outlen, const unsigned char *salt, + const unsigned char *personal) +{ + if (outlen <= 0U || outlen > BLAKE2B_OUTBYTES || + keylen > BLAKE2B_KEYBYTES) { + return -1; + } + assert(outlen <= UINT8_MAX); + assert(keylen <= UINT8_MAX); + if (key == NULL || keylen <= 0U) { + if (blake2b_init_salt_personal(state, (uint8_t) outlen, salt, + personal) != 0) { + return -1; /* LCOV_EXCL_LINE */ + } + } else if (blake2b_init_key_salt_personal(state, (uint8_t) outlen, key, + (uint8_t) keylen, salt, + personal) != 0) { + return -1; /* LCOV_EXCL_LINE */ + } + return 0; +} + +int +crypto_generichash_blake2b_update(crypto_generichash_blake2b_state *state, + const unsigned char *in, + unsigned long long inlen) +{ + return blake2b_update(state, (const uint8_t *) in, (uint64_t) inlen); +} + +int +crypto_generichash_blake2b_final(crypto_generichash_blake2b_state *state, + unsigned char *out, const size_t outlen) +{ + assert(outlen <= UINT8_MAX); + return blake2b_final(state, (uint8_t *) out, (uint8_t) outlen); +} + +int +_crypto_generichash_blake2b_pick_best_implementation(void) +{ + return blake2b_pick_best_implementation(); +} -- cgit v1.2.3