From c400f5c17af4996eb2ecf0597e17eb25c17857d8 Mon Sep 17 00:00:00 2001 From: dartraiden Date: Thu, 14 Sep 2023 22:53:45 +0300 Subject: libsodium: update to 1.0.19 --- .../chacha20/dolbeau/chacha20_dolbeau-avx2.c | 354 ++-- .../chacha20/dolbeau/chacha20_dolbeau-avx2.h | 16 +- .../chacha20/dolbeau/chacha20_dolbeau-ssse3.c | 342 ++-- .../chacha20/dolbeau/chacha20_dolbeau-ssse3.h | 16 +- .../src/crypto_stream/chacha20/dolbeau/u0.h | 172 +- .../src/crypto_stream/chacha20/dolbeau/u1.h | 196 +- .../src/crypto_stream/chacha20/dolbeau/u4.h | 352 ++-- .../src/crypto_stream/chacha20/dolbeau/u8.h | 714 ++++---- .../src/crypto_stream/chacha20/ref/chacha20_ref.c | 624 +++---- .../src/crypto_stream/chacha20/ref/chacha20_ref.h | 16 +- .../src/crypto_stream/chacha20/stream_chacha20.c | 367 ++-- .../src/crypto_stream/chacha20/stream_chacha20.h | 44 +- libs/libsodium/src/crypto_stream/crypto_stream.c | 98 +- .../src/crypto_stream/salsa20/ref/salsa20_ref.c | 240 +-- .../src/crypto_stream/salsa20/ref/salsa20_ref.h | 16 +- .../src/crypto_stream/salsa20/stream_salsa20.c | 200 +- .../src/crypto_stream/salsa20/stream_salsa20.h | 32 +- .../crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S | 1920 ++++++++++---------- .../src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c | 62 +- .../src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h | 16 +- .../salsa20/xmm6int/salsa20_xmm6int-avx2.c | 262 +-- .../salsa20/xmm6int/salsa20_xmm6int-avx2.h | 16 +- .../salsa20/xmm6int/salsa20_xmm6int-sse2.c | 244 +-- .../salsa20/xmm6int/salsa20_xmm6int-sse2.h | 16 +- .../src/crypto_stream/salsa20/xmm6int/u0.h | 390 ++-- .../src/crypto_stream/salsa20/xmm6int/u1.h | 414 ++--- .../src/crypto_stream/salsa20/xmm6int/u4.h | 1094 +++++------ .../src/crypto_stream/salsa20/xmm6int/u8.h | 953 +++++----- .../salsa2012/ref/stream_salsa2012_ref.c | 212 +-- .../src/crypto_stream/salsa2012/stream_salsa2012.c | 52 +- .../salsa208/ref/stream_salsa208_ref.c | 212 +-- .../src/crypto_stream/salsa208/stream_salsa208.c | 52 +- .../src/crypto_stream/xchacha20/stream_xchacha20.c | 138 +- .../src/crypto_stream/xsalsa20/stream_xsalsa20.c | 132 +- 34 files changed, 4994 insertions(+), 4990 deletions(-) (limited to 'libs/libsodium/src/crypto_stream') diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c index f63e055265..1e2cdf266c 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.c @@ -1,177 +1,177 @@ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# pragma GCC target("sse4.1") -# pragma GCC target("avx2") -# endif - -# include -# include -# include -# include - -# include "../stream_chacha20.h" -# include "chacha20_dolbeau-avx2.h" - -# define ROUNDS 20 - -typedef struct chacha_ctx { - uint32_t input[16]; -} chacha_ctx; - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = 0x61707865; - ctx->input[1] = 0x3320646e; - ctx->input[2] = 0x79622d32; - ctx->input[3] = 0x6b206574; - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = 
LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } -# include "u8.h" -# include "u4.h" -# include "u1.h" -# include "u0.h" -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_avx2_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; - -#endif + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && 
defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# pragma GCC target("avx2") +# endif + +# include +# include +# include +# include + +# include "../stream_chacha20.h" +# include "chacha20_dolbeau-avx2.h" + +# define ROUNDS 20 + +typedef struct chacha_ctx { + uint32_t input[16]; +} chacha_ctx; + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = 0x61707865; + ctx->input[1] = 0x3320646e; + ctx->input[2] = 0x79622d32; + ctx->input[3] = 0x6b206574; + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } +# include "u8.h" +# include "u4.h" +# include "u1.h" +# include "u0.h" +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + 
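/* The two ivsetup variants above differ only in how state words 12..15 are
 * split between block counter and nonce. A minimal reference sketch of the
 * layouts, with demo-only helper names (not taken from the patch): */
#include <stdint.h>

static uint32_t
demo_load32_le(const uint8_t *p)
{
    return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
           ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

/* original variant: 64-bit counter in words 12..13, 64-bit nonce in 14..15 */
static void
demo_ivsetup_original(uint32_t st[16], const uint8_t n[8], uint64_t ic)
{
    st[12] = (uint32_t) ic;         /* low counter word, little-endian */
    st[13] = (uint32_t) (ic >> 32); /* high counter word */
    st[14] = demo_load32_le(n + 0);
    st[15] = demo_load32_le(n + 4);
}

/* IETF variant: 32-bit counter in word 12, 96-bit nonce in words 13..15 */
static void
demo_ivsetup_ietf(uint32_t st[16], const uint8_t n[12], uint32_t ic)
{
    st[12] = ic;
    st[13] = demo_load32_le(n + 0);
    st[14] = demo_load32_le(n + 4);
    st[15] = demo_load32_le(n + 8);
}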
chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_avx2_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h index 45eb98d797..a29fef9136 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-avx2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_avx2_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_avx2_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c index 6f5d3851c3..ae5df1cc28 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.c @@ -1,171 +1,171 @@ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# endif - -# include -# include - -# include "../stream_chacha20.h" -# include "chacha20_dolbeau-ssse3.h" - -# define ROUNDS 20 - -typedef struct chacha_ctx { - uint32_t input[16]; -} chacha_ctx; - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = 0x61707865; - ctx->input[1] = 0x3320646e; - ctx->input[2] = 0x79622d32; - ctx->input[3] = 0x6b206574; - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 
0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } -# include "u4.h" -# include "u1.h" -# include "u0.h" -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_ssse3_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; - -#endif + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# endif + +# include +# include + +# include "../stream_chacha20.h" +# include "chacha20_dolbeau-ssse3.h" + +# define ROUNDS 20 + +typedef struct chacha_ctx { + uint32_t input[16]; +} chacha_ctx; + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = 0x61707865; + ctx->input[1] = 0x3320646e; + ctx->input[2] = 0x79622d32; + ctx->input[3] = 0x6b206574; + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + 
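/* The SIMD rounds in u0.h/u1.h/u4.h vectorize the standard ChaCha20 quarter
 * round; the byte-shuffle constants (rot16/rot8) implement the 16- and 8-bit
 * rotations, while 12 and 7 keep the shift/shift/or form. For reference, a
 * plain-C double round -- a sketch with demo-only names: */
#include <stdint.h>

#define DEMO_ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

#define DEMO_QUARTERROUND(a, b, c, d)          \
    a += b; d ^= a; d = DEMO_ROTL32(d, 16);    \
    c += d; b ^= c; b = DEMO_ROTL32(b, 12);    \
    a += b; d ^= a; d = DEMO_ROTL32(d, 8);     \
    c += d; b ^= c; b = DEMO_ROTL32(b, 7)

/* one double round: four column rounds then four diagonal rounds, the same
 * ordering as the VEC4_QUARTERROUND calls in u4.h */
static void
demo_doubleround(uint32_t x[16])
{
    DEMO_QUARTERROUND(x[0], x[4], x[8],  x[12]);
    DEMO_QUARTERROUND(x[1], x[5], x[9],  x[13]);
    DEMO_QUARTERROUND(x[2], x[6], x[10], x[14]);
    DEMO_QUARTERROUND(x[3], x[7], x[11], x[15]);
    DEMO_QUARTERROUND(x[0], x[5], x[10], x[15]);
    DEMO_QUARTERROUND(x[1], x[6], x[11], x[12]);
    DEMO_QUARTERROUND(x[2], x[7], x[8],  x[13]);
    DEMO_QUARTERROUND(x[3], x[4], x[9],  x[14]);
}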
ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } +# include "u4.h" +# include "u1.h" +# include "u0.h" +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_ssse3_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h index d67630f6a9..520761ab5e 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/chacha20_dolbeau-ssse3.h @@ -1,8 +1,8 @@ - -#include - -#include 
"../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_dolbeau_ssse3_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_dolbeau_ssse3_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h index 17c3ff8e08..f790a8625f 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u0.h @@ -1,86 +1,86 @@ -if (bytes > 0) { - __m128i x_0, x_1, x_2, x_3; - __m128i t_1; - const __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - const __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - uint8_t partialblock[64]; - - unsigned int i; - - x_0 = _mm_loadu_si128((__m128i*) (x + 0)); - x_1 = _mm_loadu_si128((__m128i*) (x + 4)); - x_2 = _mm_loadu_si128((__m128i*) (x + 8)); - x_3 = _mm_loadu_si128((__m128i*) (x + 12)); - - for (i = 0; i < ROUNDS; i += 2) { - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x93); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x39); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x39); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x93); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - } - x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0))); - x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4))); - x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8))); - x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12))); - _mm_storeu_si128((__m128i*) (partialblock + 0), x_0); - _mm_storeu_si128((__m128i*) (partialblock + 16), x_1); - _mm_storeu_si128((__m128i*) (partialblock + 32), x_2); - _mm_storeu_si128((__m128i*) (partialblock + 48), x_3); - - for (i = 0; i < bytes; i++) { - c[i] = m[i] ^ partialblock[i]; - } - - sodium_memzero(partialblock, sizeof partialblock); -} +if (bytes > 0) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint8_t partialblock[64]; + + unsigned int i; + + x_0 = _mm_loadu_si128((const __m128i*) (x + 0)); + x_1 = 
_mm_loadu_si128((const __m128i*) (x + 4)); + x_2 = _mm_loadu_si128((const __m128i*) (x + 8)); + x_3 = _mm_loadu_si128((const __m128i*) (x + 12)); + + for (i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12))); + _mm_storeu_si128((__m128i*) (partialblock + 0), x_0); + _mm_storeu_si128((__m128i*) (partialblock + 16), x_1); + _mm_storeu_si128((__m128i*) (partialblock + 32), x_2); + _mm_storeu_si128((__m128i*) (partialblock + 48), x_3); + + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + sodium_memzero(partialblock, sizeof partialblock); +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h index 867b44bcf2..893ec67371 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u1.h @@ -1,98 +1,98 @@ -while (bytes >= 64) { - __m128i x_0, x_1, x_2, x_3; - __m128i t_1; - const __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - const __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - - uint32_t in12; - uint32_t in13; - int i; - - x_0 = _mm_loadu_si128((__m128i*) (x + 0)); - x_1 = _mm_loadu_si128((__m128i*) (x + 4)); - x_2 = _mm_loadu_si128((__m128i*) (x + 8)); - x_3 = _mm_loadu_si128((__m128i*) (x + 12)); - - for (i = 0; i < ROUNDS; i += 2) { - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x93); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, 
x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x39); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_3 = _mm_shuffle_epi8(x_3, rot16); - - x_2 = _mm_add_epi32(x_2, x_3); - x_1 = _mm_xor_si128(x_1, x_2); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 12); - t_1 = _mm_srli_epi32(t_1, 20); - x_1 = _mm_xor_si128(x_1, t_1); - - x_0 = _mm_add_epi32(x_0, x_1); - x_3 = _mm_xor_si128(x_3, x_0); - x_0 = _mm_shuffle_epi32(x_0, 0x39); - x_3 = _mm_shuffle_epi8(x_3, rot8); - - x_2 = _mm_add_epi32(x_2, x_3); - x_3 = _mm_shuffle_epi32(x_3, 0x4e); - x_1 = _mm_xor_si128(x_1, x_2); - x_2 = _mm_shuffle_epi32(x_2, 0x93); - - t_1 = x_1; - x_1 = _mm_slli_epi32(x_1, 7); - t_1 = _mm_srli_epi32(t_1, 25); - x_1 = _mm_xor_si128(x_1, t_1); - } - x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*) (x + 0))); - x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*) (x + 4))); - x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*) (x + 8))); - x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*) (x + 12))); - x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*) (m + 0))); - x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*) (m + 16))); - x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*) (m + 32))); - x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*) (m + 48))); - _mm_storeu_si128((__m128i*) (c + 0), x_0); - _mm_storeu_si128((__m128i*) (c + 16), x_1); - _mm_storeu_si128((__m128i*) (c + 32), x_2); - _mm_storeu_si128((__m128i*) (c + 48), x_3); - - in12 = x[12]; - in13 = x[13]; - in12++; - if (in12 == 0) { - in13++; - } - x[12] = in12; - x[13] = in13; - - bytes -= 64; - c += 64; - m += 64; -} +while (bytes >= 64) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + uint32_t in12; + uint32_t in13; + int i; + + x_0 = _mm_loadu_si128((const __m128i*) (x + 0)); + x_1 = _mm_loadu_si128((const __m128i*) (x + 4)); + x_2 = _mm_loadu_si128((const __m128i*) (x + 8)); + x_3 = _mm_loadu_si128((const __m128i*) (x + 12)); + + for (i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = 
_mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12))); + x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i*) (m + 0))); + x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i*) (m + 16))); + x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i*) (m + 32))); + x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i*) (m + 48))); + _mm_storeu_si128((__m128i*) (c + 0), x_0); + _mm_storeu_si128((__m128i*) (c + 16), x_1); + _mm_storeu_si128((__m128i*) (c + 32), x_2); + _mm_storeu_si128((__m128i*) (c + 48), x_3); + + in12 = x[12]; + in13 = x[13]; + in12++; + if (in12 == 0) { + in13++; + } + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h index 3ff8342609..b88a5fc960 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u4.h @@ -1,175 +1,177 @@ - -#define VEC4_ROT(A, IMM) \ - _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM))) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & - * 16) (better) */ -#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \ - x_##A = _mm_add_epi32(x_##A, x_##B); \ - t_##A = _mm_xor_si128(x_##D, x_##A); \ - x_##D = _mm_shuffle_epi8(t_##A, rot16); \ - x_##C = _mm_add_epi32(x_##C, x_##D); \ - t_##C = _mm_xor_si128(x_##B, x_##C); \ - x_##B = VEC4_ROT(t_##C, 12); \ - x_##A = _mm_add_epi32(x_##A, x_##B); \ - t_##A = _mm_xor_si128(x_##D, x_##A); \ - x_##D = _mm_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm_add_epi32(x_##C, x_##D); \ - t_##C = _mm_xor_si128(x_##B, x_##C); \ - x_##B = VEC4_ROT(t_##C, 7) - -#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) - -if (bytes >= 256) { - /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ - __m128i rot16 = - _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m128i rot8 = - _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - - __m128i x_0 = _mm_set1_epi32(x[0]); - __m128i x_1 = _mm_set1_epi32(x[1]); - __m128i x_2 = _mm_set1_epi32(x[2]); - __m128i x_3 = _mm_set1_epi32(x[3]); - __m128i x_4 = _mm_set1_epi32(x[4]); - __m128i x_5 = _mm_set1_epi32(x[5]); - __m128i x_6 = _mm_set1_epi32(x[6]); - __m128i x_7 = _mm_set1_epi32(x[7]); - __m128i x_8 = _mm_set1_epi32(x[8]); - __m128i x_9 = _mm_set1_epi32(x[9]); - __m128i x_10 = _mm_set1_epi32(x[10]); - __m128i x_11 = _mm_set1_epi32(x[11]); - __m128i x_12; - __m128i x_13; - __m128i x_14 = _mm_set1_epi32(x[14]); - __m128i x_15 = _mm_set1_epi32(x[15]); - __m128i orig0 = x_0; - __m128i orig1 = x_1; - __m128i orig2 = x_2; - __m128i orig3 = x_3; - __m128i orig4 = x_4; - __m128i orig5 = x_5; - __m128i orig6 = x_6; - __m128i orig7 = x_7; - __m128i orig8 = x_8; - __m128i orig9 = x_9; - __m128i orig10 = x_10; - __m128i orig11 = x_11; - __m128i orig12; - __m128i orig13; - __m128i orig14 = x_14; - __m128i orig15 = x_15; - __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, - t_13, t_14, t_15; - - uint32_t in12, in13; - int i; - - while (bytes >= 256) { - const __m128i addv12 = _mm_set_epi64x(1, 0); - const __m128i addv13 = 
_mm_set_epi64x(3, 2); - __m128i t12, t13; - uint64_t in1213; - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - in12 = x[12]; - in13 = x[13]; - in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); - t12 = _mm_set1_epi64x(in1213); - t13 = _mm_set1_epi64x(in1213); - - x_12 = _mm_add_epi64(addv12, t12); - x_13 = _mm_add_epi64(addv13, t13); - - t12 = _mm_unpacklo_epi32(x_12, x_13); - t13 = _mm_unpackhi_epi32(x_12, x_13); - - x_12 = _mm_unpacklo_epi32(t12, t13); - x_13 = _mm_unpackhi_epi32(t12, t13); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 4; - - x[12] = in1213 & 0xFFFFFFFF; - x[13] = (in1213 >> 32) & 0xFFFFFFFF; - - for (i = 0; i < ROUNDS; i += 2) { - VEC4_QUARTERROUND(0, 4, 8, 12); - VEC4_QUARTERROUND(1, 5, 9, 13); - VEC4_QUARTERROUND(2, 6, 10, 14); - VEC4_QUARTERROUND(3, 7, 11, 15); - VEC4_QUARTERROUND(0, 5, 10, 15); - VEC4_QUARTERROUND(1, 6, 11, 12); - VEC4_QUARTERROUND(2, 7, 8, 13); - VEC4_QUARTERROUND(3, 4, 9, 14); - } - -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - \ - x_##A = _mm_add_epi32(x_##A, orig##A); \ - x_##B = _mm_add_epi32(x_##B, orig##B); \ - x_##C = _mm_add_epi32(x_##C, orig##C); \ - x_##D = _mm_add_epi32(x_##D, orig##D); \ - t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ - t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ - \ - t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(x_##C, _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(x_##D, _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - - ONEQUAD(0, 1, 2, 3); - m += 16; - c += 16; - ONEQUAD(4, 5, 6, 7); - m += 16; - c += 16; - ONEQUAD(8, 9, 10, 11); - m += 16; - c += 16; - ONEQUAD(12, 13, 14, 15); - m -= 48; - c -= 48; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE - - bytes -= 256; - c += 256; - m += 256; - } -} -#undef VEC4_ROT -#undef VEC4_QUARTERROUND -#undef VEC4_QUARTERROUND_SHUFFLE + +#define VEC4_ROT(A, IMM) \ + _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM))) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm_add_epi32(x_##A, x_##B); \ + t_##A = _mm_xor_si128(x_##D, x_##A); \ + x_##D = _mm_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm_add_epi32(x_##C, x_##D); \ + t_##C = _mm_xor_si128(x_##B, x_##C); \ + x_##B = VEC4_ROT(t_##C, 12); \ + x_##A = _mm_add_epi32(x_##A, x_##B); \ + t_##A = _mm_xor_si128(x_##D, x_##A); \ + x_##D = _mm_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm_add_epi32(x_##C, x_##D); \ + t_##C = _mm_xor_si128(x_##B, x_##C); \ + x_##B = VEC4_ROT(t_##C, 7) + +#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) + +if (bytes >= 256) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m128i rot16 = + 
_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + __m128i x_0 = _mm_set1_epi32(x[0]); + __m128i x_1 = _mm_set1_epi32(x[1]); + __m128i x_2 = _mm_set1_epi32(x[2]); + __m128i x_3 = _mm_set1_epi32(x[3]); + __m128i x_4 = _mm_set1_epi32(x[4]); + __m128i x_5 = _mm_set1_epi32(x[5]); + __m128i x_6 = _mm_set1_epi32(x[6]); + __m128i x_7 = _mm_set1_epi32(x[7]); + __m128i x_8 = _mm_set1_epi32(x[8]); + __m128i x_9 = _mm_set1_epi32(x[9]); + __m128i x_10 = _mm_set1_epi32(x[10]); + __m128i x_11 = _mm_set1_epi32(x[11]); + __m128i x_12; + __m128i x_13; + __m128i x_14 = _mm_set1_epi32(x[14]); + __m128i x_15 = _mm_set1_epi32(x[15]); + __m128i orig0 = x_0; + __m128i orig1 = x_1; + __m128i orig2 = x_2; + __m128i orig3 = x_3; + __m128i orig4 = x_4; + __m128i orig5 = x_5; + __m128i orig6 = x_6; + __m128i orig7 = x_7; + __m128i orig8 = x_8; + __m128i orig9 = x_9; + __m128i orig10 = x_10; + __m128i orig11 = x_11; + __m128i orig12; + __m128i orig13; + __m128i orig14 = x_14; + __m128i orig15 = x_15; + __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + uint32_t in12, in13; + int i; + + while (bytes >= 256) { + const __m128i addv12 = _mm_set_epi64x(1, 0); + const __m128i addv13 = _mm_set_epi64x(3, 2); + __m128i t12, t13; + uint64_t in1213; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + t12 = _mm_set1_epi64x(in1213); + t13 = _mm_set1_epi64x(in1213); + + x_12 = _mm_add_epi64(addv12, t12); + x_13 = _mm_add_epi64(addv13, t13); + + t12 = _mm_unpacklo_epi32(x_12, x_13); + t13 = _mm_unpackhi_epi32(x_12, x_13); + + x_12 = _mm_unpacklo_epi32(t12, t13); + x_13 = _mm_unpackhi_epi32(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for (i = 0; i < ROUNDS; i += 2) { + VEC4_QUARTERROUND(0, 4, 8, 12); + VEC4_QUARTERROUND(1, 5, 9, 13); + VEC4_QUARTERROUND(2, 6, 10, 14); + VEC4_QUARTERROUND(3, 7, 11, 15); + VEC4_QUARTERROUND(0, 5, 10, 15); + VEC4_QUARTERROUND(1, 6, 11, 12); + VEC4_QUARTERROUND(2, 7, 8, 13); + VEC4_QUARTERROUND(3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + \ + x_##A = _mm_add_epi32(x_##A, orig##A); \ + x_##B = _mm_add_epi32(x_##B, orig##B); \ + x_##C = _mm_add_epi32(x_##C, orig##C); \ + x_##D = _mm_add_epi32(x_##D, orig##D); \ + t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ + \ + t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = \ + _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = \ + _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c 
+ 192), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE + + bytes -= 256; + c += 256; + m += 256; + } +} +#undef VEC4_ROT +#undef VEC4_QUARTERROUND +#undef VEC4_QUARTERROUND_SHUFFLE diff --git a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h index 22bf9fcfa1..c92fbd3514 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h +++ b/libs/libsodium/src/crypto_stream/chacha20/dolbeau/u8.h @@ -1,357 +1,357 @@ - -#define VEC8_ROT(A, IMM) \ - _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) - -/* implements a vector quarter round by-the-book (naive!) */ -#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = VEC8_ROT(t_##A, 16); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = VEC8_ROT(t_##A, 8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & - * 16) (better) */ -#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles - * (8 & 16) (not as good as previous) */ -#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 12); \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - t_##A = _mm256_xor_si256(x_##D, x_##A); \ - x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - t_##C = _mm256_xor_si256(x_##B, x_##C); \ - x_##B = VEC8_ROT(t_##C, 7) - -#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) - -#define VEC8_LINE1(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) -#define VEC8_LINE2(A, B, C, D) \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) -#define VEC8_LINE3(A, B, C, D) \ - x_##A = _mm256_add_epi32(x_##A, x_##B); \ - x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) -#define VEC8_LINE4(A, B, C, D) \ - x_##C = _mm256_add_epi32(x_##C, x_##D); \ - x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) - -#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ - C4, D4) \ - 
VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ - B4, C4, D4) \ - VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ - A4, B4, C4, D4) \ - VEC8_LINE1(A1, B1, C1, D1); \ - VEC8_LINE1(A2, B2, C2, D2); \ - VEC8_LINE2(A1, B1, C1, D1); \ - VEC8_LINE2(A2, B2, C2, D2); \ - VEC8_LINE1(A3, B3, C3, D3); \ - VEC8_LINE1(A4, B4, C4, D4); \ - VEC8_LINE2(A3, B3, C3, D3); \ - VEC8_LINE2(A4, B4, C4, D4); \ - VEC8_LINE3(A1, B1, C1, D1); \ - VEC8_LINE3(A2, B2, C2, D2); \ - VEC8_LINE4(A1, B1, C1, D1); \ - VEC8_LINE4(A2, B2, C2, D2); \ - VEC8_LINE3(A3, B3, C3, D3); \ - VEC8_LINE3(A4, B4, C4, D4); \ - VEC8_LINE4(A3, B3, C3, D3); \ - VEC8_LINE4(A4, B4, C4, D4) - -#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ - D4) \ - VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ - D4) - -if (bytes >= 512) { - /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ - __m256i rot16 = - _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, - 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); - __m256i rot8 = - _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, - 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); - uint32_t in12, in13; - - /* the naive way seems as fast (if not a bit faster) than the vector way */ - __m256i x_0 = _mm256_set1_epi32(x[0]); - __m256i x_1 = _mm256_set1_epi32(x[1]); - __m256i x_2 = _mm256_set1_epi32(x[2]); - __m256i x_3 = _mm256_set1_epi32(x[3]); - __m256i x_4 = _mm256_set1_epi32(x[4]); - __m256i x_5 = _mm256_set1_epi32(x[5]); - __m256i x_6 = _mm256_set1_epi32(x[6]); - __m256i x_7 = _mm256_set1_epi32(x[7]); - __m256i x_8 = _mm256_set1_epi32(x[8]); - __m256i x_9 = _mm256_set1_epi32(x[9]); - __m256i x_10 = _mm256_set1_epi32(x[10]); - __m256i x_11 = _mm256_set1_epi32(x[11]); - __m256i x_12; - __m256i x_13; - __m256i x_14 = _mm256_set1_epi32(x[14]); - __m256i x_15 = _mm256_set1_epi32(x[15]); - - __m256i orig0 = x_0; - __m256i orig1 = x_1; - __m256i orig2 = x_2; - __m256i orig3 = x_3; - __m256i orig4 = x_4; - __m256i orig5 = x_5; - __m256i orig6 = x_6; - __m256i orig7 = x_7; - __m256i orig8 = x_8; - __m256i orig9 = x_9; - __m256i orig10 = x_10; - __m256i orig11 = x_11; - __m256i orig12; - __m256i orig13; - __m256i orig14 = x_14; - __m256i orig15 = x_15; - __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, - t_13, t_14, t_15; - - while (bytes >= 512) { - const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 
0); - const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); - const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - __m256i t12, t13; - - uint64_t in1213; - int i; - - x_0 = orig0; - x_1 = orig1; - x_2 = orig2; - x_3 = orig3; - x_4 = orig4; - x_5 = orig5; - x_6 = orig6; - x_7 = orig7; - x_8 = orig8; - x_9 = orig9; - x_10 = orig10; - x_11 = orig11; - x_14 = orig14; - x_15 = orig15; - - in12 = x[12]; - in13 = x[13]; - in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); - x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); - - t12 = _mm256_add_epi64(addv12, x_12); - t13 = _mm256_add_epi64(addv13, x_13); - - x_12 = _mm256_unpacklo_epi32(t12, t13); - x_13 = _mm256_unpackhi_epi32(t12, t13); - - t12 = _mm256_unpacklo_epi32(x_12, x_13); - t13 = _mm256_unpackhi_epi32(x_12, x_13); - - /* required because unpack* are intra-lane */ - x_12 = _mm256_permutevar8x32_epi32(t12, permute); - x_13 = _mm256_permutevar8x32_epi32(t13, permute); - - orig12 = x_12; - orig13 = x_13; - - in1213 += 8; - - x[12] = in1213 & 0xFFFFFFFF; - x[13] = (in1213 >> 32) & 0xFFFFFFFF; - - for (i = 0; i < ROUNDS; i += 2) { - VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); - } - -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - x_##A = _mm256_add_epi32(x_##A, orig##A); \ - x_##B = _mm256_add_epi32(x_##B, orig##B); \ - x_##C = _mm256_add_epi32(x_##C, orig##C); \ - x_##D = _mm256_add_epi32(x_##D, orig##D); \ - t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ - t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ - _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ - _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ - _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ - _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ - _mm_loadu_si128((__m128i*) (m + 256))); \ - _mm_storeu_si128((__m128i*) (c + 256), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ - _mm_loadu_si128((__m128i*) (m + 320))); \ - _mm_storeu_si128((__m128i*) (c + 320), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ - _mm_loadu_si128((__m128i*) (m + 384))); \ - _mm_storeu_si128((__m128i*) (c + 384), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ - _mm_loadu_si128((__m128i*) (m + 448))); \ - _mm_storeu_si128((__m128i*) (c + 448), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - -#define ONEQUAD_UNPCK(A, B, C, D) \ - { \ - x_##A = _mm256_add_epi32(x_##A, orig##A); \ - x_##B = _mm256_add_epi32(x_##B, orig##B); \ - x_##C = _mm256_add_epi32(x_##C, orig##C); \ - x_##D = _mm256_add_epi32(x_##D, orig##D); \ - t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ - t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ - t_##C = 
_mm256_unpackhi_epi32(x_##A, x_##B); \ - t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ - x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ - x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ - x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ - x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ - } - -#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ - { \ - ONEQUAD_UNPCK(A, B, C, D); \ - ONEQUAD_UNPCK(A2, B2, C2, D2); \ - t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ - t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ - t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ - t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ - t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ - t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ - t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ - t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ - t_##A = \ - _mm256_xor_si256(t_##A, _mm256_loadu_si256((__m256i*) (m + 0))); \ - t_##B = \ - _mm256_xor_si256(t_##B, _mm256_loadu_si256((__m256i*) (m + 64))); \ - t_##C = \ - _mm256_xor_si256(t_##C, _mm256_loadu_si256((__m256i*) (m + 128))); \ - t_##D = \ - _mm256_xor_si256(t_##D, _mm256_loadu_si256((__m256i*) (m + 192))); \ - t_##A2 = _mm256_xor_si256(t_##A2, \ - _mm256_loadu_si256((__m256i*) (m + 256))); \ - t_##B2 = _mm256_xor_si256(t_##B2, \ - _mm256_loadu_si256((__m256i*) (m + 320))); \ - t_##C2 = _mm256_xor_si256(t_##C2, \ - _mm256_loadu_si256((__m256i*) (m + 384))); \ - t_##D2 = _mm256_xor_si256(t_##D2, \ - _mm256_loadu_si256((__m256i*) (m + 448))); \ - _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ - _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ - _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ - _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ - _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ - _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ - _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ - _mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \ - } - - ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); - m += 32; - c += 32; - ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); - m -= 32; - c -= 32; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_UNPCK -#undef ONEOCTO - - bytes -= 512; - c += 512; - m += 512; - } -} -#undef VEC8_ROT -#undef VEC8_QUARTERROUND -#undef VEC8_QUARTERROUND_NAIVE -#undef VEC8_QUARTERROUND_SHUFFLE -#undef VEC8_QUARTERROUND_SHUFFLE2 -#undef VEC8_LINE1 -#undef VEC8_LINE2 -#undef VEC8_LINE3 -#undef VEC8_LINE4 -#undef VEC8_ROUND -#undef VEC8_ROUND_SEQ -#undef VEC8_ROUND_HALF -#undef VEC8_ROUND_HALFANDHALF + +#define VEC8_ROT(A, IMM) \ + _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) + +/* implements a vector quarter round by-the-book (naive!) 
*/ +#define VEC8_QUARTERROUND_NAIVE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = VEC8_ROT(t_##A, 16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = VEC8_ROT(t_##A, 8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte & word shuffles + * (8 & 16) (not as good as previous) */ +#define VEC8_QUARTERROUND_SHUFFLE2(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(t_##A, 0xb1), 0xb1); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) + +#define VEC8_LINE1(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) +#define VEC8_LINE2(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) +#define VEC8_LINE3(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) +#define VEC8_LINE4(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) + +#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ + C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, D4); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ + B4, C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, 
D4); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ + A4, B4, C4, D4) \ + VEC8_LINE1(A1, B1, C1, D1); \ + VEC8_LINE1(A2, B2, C2, D2); \ + VEC8_LINE2(A1, B1, C1, D1); \ + VEC8_LINE2(A2, B2, C2, D2); \ + VEC8_LINE1(A3, B3, C3, D3); \ + VEC8_LINE1(A4, B4, C4, D4); \ + VEC8_LINE2(A3, B3, C3, D3); \ + VEC8_LINE2(A4, B4, C4, D4); \ + VEC8_LINE3(A1, B1, C1, D1); \ + VEC8_LINE3(A2, B2, C2, D2); \ + VEC8_LINE4(A1, B1, C1, D1); \ + VEC8_LINE4(A2, B2, C2, D2); \ + VEC8_LINE3(A3, B3, C3, D3); \ + VEC8_LINE3(A4, B4, C4, D4); \ + VEC8_LINE4(A3, B3, C3, D3); \ + VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) \ + VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) + +if (bytes >= 512) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m256i rot16 = + _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m256i rot8 = + _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, + 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint32_t in12, in13; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i x_0 = _mm256_set1_epi32(x[0]); + __m256i x_1 = _mm256_set1_epi32(x[1]); + __m256i x_2 = _mm256_set1_epi32(x[2]); + __m256i x_3 = _mm256_set1_epi32(x[3]); + __m256i x_4 = _mm256_set1_epi32(x[4]); + __m256i x_5 = _mm256_set1_epi32(x[5]); + __m256i x_6 = _mm256_set1_epi32(x[6]); + __m256i x_7 = _mm256_set1_epi32(x[7]); + __m256i x_8 = _mm256_set1_epi32(x[8]); + __m256i x_9 = _mm256_set1_epi32(x[9]); + __m256i x_10 = _mm256_set1_epi32(x[10]); + __m256i x_11 = _mm256_set1_epi32(x[11]); + __m256i x_12; + __m256i x_13; + __m256i x_14 = _mm256_set1_epi32(x[14]); + __m256i x_15 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = x_0; + __m256i orig1 = x_1; + __m256i orig2 = x_2; + __m256i orig3 = x_3; + __m256i orig4 = x_4; + __m256i orig5 = x_5; + __m256i orig6 = x_6; + __m256i orig7 = x_7; + __m256i orig8 = x_8; + __m256i orig9 = x_9; + __m256i orig10 = x_10; + __m256i orig11 = x_11; + __m256i orig12; + __m256i orig13; + __m256i orig14 = x_14; + __m256i orig15 = x_15; + __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + while (bytes >= 512) { + const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + __m256i t12, t13; + + uint64_t in1213; + int i; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); + + t12 = _mm256_add_epi64(addv12, x_12); + t13 = _mm256_add_epi64(addv13, x_13); + + x_12 = _mm256_unpacklo_epi32(t12, t13); + x_13 = _mm256_unpackhi_epi32(t12, t13); + + t12 = _mm256_unpacklo_epi32(x_12, x_13); + t13 = _mm256_unpackhi_epi32(x_12, x_13); + + /* required because unpack* are intra-lane */ + x_12 = _mm256_permutevar8x32_epi32(t12, permute); + 
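/* counter setup, completed just below: the 64-bit block counter was
   broadcast to every lane, addv12/addv13 added the offsets 0..3 and 4..7,
   and the unpacklo/unpackhi pairs plus the two cross-lane permutes
   transpose the results so that x_12 holds the low 32-bit halves and
   x_13 the high 32-bit halves of eight consecutive block counters. */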
x_13 = _mm256_permutevar8x32_epi32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for (i = 0; i < ROUNDS; i += 2) { + VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ + t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ + t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ + t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ + t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ + t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ + t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ + t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ + 
t_##A = _mm256_xor_si256( \ + t_##A, _mm256_loadu_si256((const __m256i*) (m + 0))); \ + t_##B = _mm256_xor_si256( \ + t_##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + t_##C = _mm256_xor_si256( \ + t_##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + t_##D = _mm256_xor_si256( \ + t_##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + t_##A2 = _mm256_xor_si256( \ + t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + t_##B2 = _mm256_xor_si256( \ + t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + t_##C2 = _mm256_xor_si256( \ + t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + t_##D2 = _mm256_xor_si256( \ + t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ + _mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \ + } + + ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); + m += 32; + c += 32; + ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); + m -= 32; + c -= 32; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_UNPCK +#undef ONEOCTO + + bytes -= 512; + c += 512; + m += 512; + } +} +#undef VEC8_ROT +#undef VEC8_QUARTERROUND +#undef VEC8_QUARTERROUND_NAIVE +#undef VEC8_QUARTERROUND_SHUFFLE +#undef VEC8_QUARTERROUND_SHUFFLE2 +#undef VEC8_LINE1 +#undef VEC8_LINE2 +#undef VEC8_LINE3 +#undef VEC8_LINE4 +#undef VEC8_ROUND +#undef VEC8_ROUND_SEQ +#undef VEC8_ROUND_HALF +#undef VEC8_ROUND_HALFANDHALF diff --git a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c index 40cccbf8f8..fb1e3a4b5c 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c +++ b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.c @@ -1,312 +1,312 @@ - -/* - chacha-merged.c version 20080118 - D. J. Bernstein - Public domain. - */ - -#include -#include -#include - -#include "core.h" -#include "crypto_stream_chacha20.h" -#include "private/common.h" -#include "utils.h" - -#include "../stream_chacha20.h" -#include "chacha20_ref.h" - -struct chacha_ctx { - uint32_t input[16]; -}; - -typedef struct chacha_ctx chacha_ctx; - -#define U32C(v) (v##U) - -#define U32V(v) ((uint32_t)(v) &U32C(0xFFFFFFFF)) - -#define ROTATE(v, c) (ROTL32(v, c)) -#define XOR(v, w) ((v) ^ (w)) -#define PLUS(v, w) (U32V((v) + (w))) -#define PLUSONE(v) (PLUS((v), 1)) - -#define QUARTERROUND(a, b, c, d) \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 16); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 12); \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 8); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 7); - -static void -chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) -{ - ctx->input[0] = U32C(0x61707865); - ctx->input[1] = U32C(0x3320646e); - ctx->input[2] = U32C(0x79622d32); - ctx->input[3] = U32C(0x6b206574); - ctx->input[4] = LOAD32_LE(k + 0); - ctx->input[5] = LOAD32_LE(k + 4); - ctx->input[6] = LOAD32_LE(k + 8); - ctx->input[7] = LOAD32_LE(k + 12); - ctx->input[8] = LOAD32_LE(k + 16); - ctx->input[9] = LOAD32_LE(k + 20); - ctx->input[10] = LOAD32_LE(k + 24); - ctx->input[11] = LOAD32_LE(k + 28); -} - -static void -chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 
0 : LOAD32_LE(counter + 0); - ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); - ctx->input[14] = LOAD32_LE(iv + 0); - ctx->input[15] = LOAD32_LE(iv + 4); -} - -static void -chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); - ctx->input[13] = LOAD32_LE(iv + 0); - ctx->input[14] = LOAD32_LE(iv + 4); - ctx->input[15] = LOAD32_LE(iv + 8); -} - -static void -chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, - x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, - j15; - uint8_t *ctarget = NULL; - uint8_t tmp[64]; - unsigned int i; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - j0 = ctx->input[0]; - j1 = ctx->input[1]; - j2 = ctx->input[2]; - j3 = ctx->input[3]; - j4 = ctx->input[4]; - j5 = ctx->input[5]; - j6 = ctx->input[6]; - j7 = ctx->input[7]; - j8 = ctx->input[8]; - j9 = ctx->input[9]; - j10 = ctx->input[10]; - j11 = ctx->input[11]; - j12 = ctx->input[12]; - j13 = ctx->input[13]; - j14 = ctx->input[14]; - j15 = ctx->input[15]; - - for (;;) { - if (bytes < 64) { - memset(tmp, 0, 64); - for (i = 0; i < bytes; ++i) { - tmp[i] = m[i]; - } - m = tmp; - ctarget = c; - c = tmp; - } - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 20; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12) - QUARTERROUND(x1, x5, x9, x13) - QUARTERROUND(x2, x6, x10, x14) - QUARTERROUND(x3, x7, x11, x15) - QUARTERROUND(x0, x5, x10, x15) - QUARTERROUND(x1, x6, x11, x12) - QUARTERROUND(x2, x7, x8, x13) - QUARTERROUND(x3, x4, x9, x14) - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - x0 = XOR(x0, LOAD32_LE(m + 0)); - x1 = XOR(x1, LOAD32_LE(m + 4)); - x2 = XOR(x2, LOAD32_LE(m + 8)); - x3 = XOR(x3, LOAD32_LE(m + 12)); - x4 = XOR(x4, LOAD32_LE(m + 16)); - x5 = XOR(x5, LOAD32_LE(m + 20)); - x6 = XOR(x6, LOAD32_LE(m + 24)); - x7 = XOR(x7, LOAD32_LE(m + 28)); - x8 = XOR(x8, LOAD32_LE(m + 32)); - x9 = XOR(x9, LOAD32_LE(m + 36)); - x10 = XOR(x10, LOAD32_LE(m + 40)); - x11 = XOR(x11, LOAD32_LE(m + 44)); - x12 = XOR(x12, LOAD32_LE(m + 48)); - x13 = XOR(x13, LOAD32_LE(m + 52)); - x14 = XOR(x14, LOAD32_LE(m + 56)); - x15 = XOR(x15, LOAD32_LE(m + 60)); - - j12 = PLUSONE(j12); - /* LCOV_EXCL_START */ - if (!j12) { - j13 = PLUSONE(j13); - } - /* LCOV_EXCL_STOP */ - - STORE32_LE(c + 0, x0); - STORE32_LE(c + 4, x1); - STORE32_LE(c + 8, x2); - STORE32_LE(c + 12, x3); - STORE32_LE(c + 16, x4); - STORE32_LE(c + 20, x5); - STORE32_LE(c + 24, x6); - STORE32_LE(c + 28, x7); - STORE32_LE(c + 32, x8); - STORE32_LE(c + 36, x9); - STORE32_LE(c + 40, x10); - STORE32_LE(c + 44, x11); - STORE32_LE(c + 48, x12); - STORE32_LE(c + 52, x13); - STORE32_LE(c + 56, x14); - STORE32_LE(c + 60, x15); - - if (bytes <= 64) { - if (bytes < 64) { - for (i = 0; i < (unsigned int) bytes; ++i) { - ctarget[i] = c[i]; /* ctarget cannot be NULL */ - } - } - ctx->input[12] = j12; - ctx->input[13] = j13; - - return; - } - bytes -= 64; - c += 64; - m += 64; 
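/* note on the tail handling above: a final block shorter than 64 bytes is
   first copied into the zero-padded tmp buffer, the keystream is applied
   there, and the result is copied back out through ctarget. */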
- } -} - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - struct chacha_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - chacha20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = U32V(ic >> 32); - ic_low = U32V(ic); - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - chacha_keysetup(&ctx, k); - chacha_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint32_t ic, const unsigned char *k) -{ - struct chacha_ctx ctx; - uint8_t ic_bytes[4]; - - if (!mlen) { - return 0; - } - STORE32_LE(ic_bytes, ic); - chacha_keysetup(&ctx, k); - chacha_ietf_ivsetup(&ctx, n, ic_bytes); - chacha20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_ref_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic - }; + +/* + chacha-merged.c version 20080118 + D. J. Bernstein + Public domain. 
+ */ + +#include +#include +#include + +#include "core.h" +#include "crypto_stream_chacha20.h" +#include "private/common.h" +#include "utils.h" + +#include "../stream_chacha20.h" +#include "chacha20_ref.h" + +struct chacha_ctx { + uint32_t input[16]; +}; + +typedef struct chacha_ctx chacha_ctx; + +#define U32C(v) (v##U) + +#define U32V(v) ((uint32_t)(v) &U32C(0xFFFFFFFF)) + +#define ROTATE(v, c) (ROTL32(v, c)) +#define XOR(v, w) ((v) ^ (w)) +#define PLUS(v, w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v), 1)) + +#define QUARTERROUND(a, b, c, d) \ + a = PLUS(a, b); \ + d = ROTATE(XOR(d, a), 16); \ + c = PLUS(c, d); \ + b = ROTATE(XOR(b, c), 12); \ + a = PLUS(a, b); \ + d = ROTATE(XOR(d, a), 8); \ + c = PLUS(c, d); \ + b = ROTATE(XOR(b, c), 7); + +static void +chacha_keysetup(chacha_ctx *ctx, const uint8_t *k) +{ + ctx->input[0] = U32C(0x61707865); + ctx->input[1] = U32C(0x3320646e); + ctx->input[2] = U32C(0x79622d32); + ctx->input[3] = U32C(0x6b206574); + ctx->input[4] = LOAD32_LE(k + 0); + ctx->input[5] = LOAD32_LE(k + 4); + ctx->input[6] = LOAD32_LE(k + 8); + ctx->input[7] = LOAD32_LE(k + 12); + ctx->input[8] = LOAD32_LE(k + 16); + ctx->input[9] = LOAD32_LE(k + 20); + ctx->input[10] = LOAD32_LE(k + 24); + ctx->input[11] = LOAD32_LE(k + 28); +} + +static void +chacha_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[13] = counter == NULL ? 0 : LOAD32_LE(counter + 4); + ctx->input[14] = LOAD32_LE(iv + 0); + ctx->input[15] = LOAD32_LE(iv + 4); +} + +static void +chacha_ietf_ivsetup(chacha_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[12] = counter == NULL ? 0 : LOAD32_LE(counter); + ctx->input[13] = LOAD32_LE(iv + 0); + ctx->input[14] = LOAD32_LE(iv + 4); + ctx->input[15] = LOAD32_LE(iv + 8); +} + +static void +chacha20_encrypt_bytes(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, + x15; + uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, + j15; + uint8_t *ctarget = NULL; + uint8_t tmp[64]; + unsigned int i; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + j0 = ctx->input[0]; + j1 = ctx->input[1]; + j2 = ctx->input[2]; + j3 = ctx->input[3]; + j4 = ctx->input[4]; + j5 = ctx->input[5]; + j6 = ctx->input[6]; + j7 = ctx->input[7]; + j8 = ctx->input[8]; + j9 = ctx->input[9]; + j10 = ctx->input[10]; + j11 = ctx->input[11]; + j12 = ctx->input[12]; + j13 = ctx->input[13]; + j14 = ctx->input[14]; + j15 = ctx->input[15]; + + for (;;) { + if (bytes < 64) { + memset(tmp, 0, 64); + for (i = 0; i < bytes; ++i) { + tmp[i] = m[i]; + } + m = tmp; + ctarget = c; + c = tmp; + } + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(x0, x4, x8, x12) + QUARTERROUND(x1, x5, x9, x13) + QUARTERROUND(x2, x6, x10, x14) + QUARTERROUND(x3, x7, x11, x15) + QUARTERROUND(x0, x5, x10, x15) + QUARTERROUND(x1, x6, x11, x12) + QUARTERROUND(x2, x7, x8, x13) + QUARTERROUND(x3, x4, x9, x14) + } + x0 = PLUS(x0, j0); + x1 = PLUS(x1, j1); + x2 = PLUS(x2, j2); + x3 = PLUS(x3, j3); + x4 = PLUS(x4, j4); + x5 = PLUS(x5, j5); + x6 = PLUS(x6, j6); + x7 = PLUS(x7, j7); + x8 = PLUS(x8, j8); + x9 = PLUS(x9, j9); + x10 = PLUS(x10, j10); + x11 = PLUS(x11, j11); + x12 = PLUS(x12, j12); + x13 = PLUS(x13, j13); + 
x14 = PLUS(x14, j14); + x15 = PLUS(x15, j15); + + x0 = XOR(x0, LOAD32_LE(m + 0)); + x1 = XOR(x1, LOAD32_LE(m + 4)); + x2 = XOR(x2, LOAD32_LE(m + 8)); + x3 = XOR(x3, LOAD32_LE(m + 12)); + x4 = XOR(x4, LOAD32_LE(m + 16)); + x5 = XOR(x5, LOAD32_LE(m + 20)); + x6 = XOR(x6, LOAD32_LE(m + 24)); + x7 = XOR(x7, LOAD32_LE(m + 28)); + x8 = XOR(x8, LOAD32_LE(m + 32)); + x9 = XOR(x9, LOAD32_LE(m + 36)); + x10 = XOR(x10, LOAD32_LE(m + 40)); + x11 = XOR(x11, LOAD32_LE(m + 44)); + x12 = XOR(x12, LOAD32_LE(m + 48)); + x13 = XOR(x13, LOAD32_LE(m + 52)); + x14 = XOR(x14, LOAD32_LE(m + 56)); + x15 = XOR(x15, LOAD32_LE(m + 60)); + + j12 = PLUSONE(j12); + /* LCOV_EXCL_START */ + if (!j12) { + j13 = PLUSONE(j13); + } + /* LCOV_EXCL_STOP */ + + STORE32_LE(c + 0, x0); + STORE32_LE(c + 4, x1); + STORE32_LE(c + 8, x2); + STORE32_LE(c + 12, x3); + STORE32_LE(c + 16, x4); + STORE32_LE(c + 20, x5); + STORE32_LE(c + 24, x6); + STORE32_LE(c + 28, x7); + STORE32_LE(c + 32, x8); + STORE32_LE(c + 36, x9); + STORE32_LE(c + 40, x10); + STORE32_LE(c + 44, x11); + STORE32_LE(c + 48, x12); + STORE32_LE(c + 52, x13); + STORE32_LE(c + 56, x14); + STORE32_LE(c + 60, x15); + + if (bytes <= 64) { + if (bytes < 64) { + for (i = 0; i < (unsigned int) bytes; ++i) { + ctarget[i] = c[i]; /* ctarget cannot be NULL */ + } + } + ctx->input[12] = j12; + ctx->input[13] = j13; + + return; + } + bytes -= 64; + c += 64; + m += 64; + } +} + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + struct chacha_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES == 256 / 8); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + chacha20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = U32V(ic >> 32); + ic_low = U32V(ic); + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + chacha_keysetup(&ctx, k); + chacha_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_ietf_ext_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint32_t ic, const unsigned char *k) +{ + struct chacha_ctx ctx; + uint8_t ic_bytes[4]; + + if (!mlen) { + return 0; + } + STORE32_LE(ic_bytes, ic); + chacha_keysetup(&ctx, k); + chacha_ietf_ivsetup(&ctx, n, ic_bytes); + chacha20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_ref_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_ietf_ext =) stream_ietf_ext_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + 
SODIUM_C99(.stream_ietf_ext_xor_ic =) stream_ietf_ext_ref_xor_ic + }; diff --git a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h index 6ac4807554..66c2e830e7 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h +++ b/libs/libsodium/src/crypto_stream/chacha20/ref/chacha20_ref.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_chacha20.h" -#include "crypto_stream_chacha20.h" - -extern struct crypto_stream_chacha20_implementation - crypto_stream_chacha20_ref_implementation; + +#include + +#include "../stream_chacha20.h" +#include "crypto_stream_chacha20.h" + +extern struct crypto_stream_chacha20_implementation + crypto_stream_chacha20_ref_implementation; diff --git a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c index c98d60907f..b88f9a50e9 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c +++ b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.c @@ -1,183 +1,184 @@ -#include "crypto_stream_chacha20.h" -#include "core.h" -#include "private/common.h" -#include "private/implementations.h" -#include "randombytes.h" -#include "runtime.h" -#include "stream_chacha20.h" - -#include "ref/chacha20_ref.h" -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) -# include "dolbeau/chacha20_dolbeau-avx2.h" -#endif -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) -# include "dolbeau/chacha20_dolbeau-ssse3.h" -#endif - -static const crypto_stream_chacha20_implementation *implementation = - &crypto_stream_chacha20_ref_implementation; - -size_t -crypto_stream_chacha20_keybytes(void) { - return crypto_stream_chacha20_KEYBYTES; -} - -size_t -crypto_stream_chacha20_noncebytes(void) { - return crypto_stream_chacha20_NONCEBYTES; -} - -size_t -crypto_stream_chacha20_messagebytes_max(void) -{ - return crypto_stream_chacha20_MESSAGEBYTES_MAX; -} - -size_t -crypto_stream_chacha20_ietf_keybytes(void) { - return crypto_stream_chacha20_ietf_KEYBYTES; -} - -size_t -crypto_stream_chacha20_ietf_noncebytes(void) { - return crypto_stream_chacha20_ietf_NONCEBYTES; -} - -size_t -crypto_stream_chacha20_ietf_messagebytes_max(void) -{ - return crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX; -} - -int -crypto_stream_chacha20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream(c, clen, n, k); -} - -int -crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); -} - -int -crypto_stream_chacha20_ietf_ext(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext(c, clen, n, k); -} - -int 
-crypto_stream_chacha20_ietf_ext_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, ic, k); -} - -static int -crypto_stream_chacha20_ietf_ext_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, 0U, k); -} - -int -crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - if (clen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext(c, clen, n, k); -} - -int -crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k) -{ - if ((unsigned long long) ic > - (64ULL * (1ULL << 32)) / 64ULL - (mlen + 63ULL) / 64ULL) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - if (mlen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { - sodium_misuse(); - } - return crypto_stream_chacha20_ietf_ext_xor(c, m, mlen, n, k); -} - -void -crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_chacha20_ietf_KEYBYTES); -} - -void -crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_chacha20_KEYBYTES); -} - -int -_crypto_stream_chacha20_pick_best_implementation(void) -{ - implementation = &crypto_stream_chacha20_ref_implementation; -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - if (sodium_runtime_has_avx2()) { - implementation = &crypto_stream_chacha20_dolbeau_avx2_implementation; - return 0; - } -#endif -#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) - if (sodium_runtime_has_ssse3()) { - implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation; - return 0; - } -#endif - return 0; -} +#include "crypto_stream_chacha20.h" +#include "core.h" +#include "private/chacha20_ietf_ext.h" +#include "private/common.h" +#include "private/implementations.h" +#include "randombytes.h" +#include "runtime.h" +#include "stream_chacha20.h" + +#include "ref/chacha20_ref.h" +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) +# include "dolbeau/chacha20_dolbeau-avx2.h" +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) +# include "dolbeau/chacha20_dolbeau-ssse3.h" +#endif + +static const crypto_stream_chacha20_implementation *implementation = + &crypto_stream_chacha20_ref_implementation; + +size_t +crypto_stream_chacha20_keybytes(void) { + return crypto_stream_chacha20_KEYBYTES; +} + +size_t +crypto_stream_chacha20_noncebytes(void) { + return crypto_stream_chacha20_NONCEBYTES; +} + +size_t +crypto_stream_chacha20_messagebytes_max(void) +{ + return crypto_stream_chacha20_MESSAGEBYTES_MAX; +} + +size_t 
+crypto_stream_chacha20_ietf_keybytes(void) { + return crypto_stream_chacha20_ietf_KEYBYTES; +} + +size_t +crypto_stream_chacha20_ietf_noncebytes(void) { + return crypto_stream_chacha20_ietf_NONCEBYTES; +} + +size_t +crypto_stream_chacha20_ietf_messagebytes_max(void) +{ + return crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX; +} + +int +crypto_stream_chacha20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream(c, clen, n, k); +} + +int +crypto_stream_chacha20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_chacha20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); +} + +int +crypto_stream_chacha20_ietf_ext(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext(c, clen, n, k); +} + +int +crypto_stream_chacha20_ietf_ext_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, ic, k); +} + +static int +crypto_stream_chacha20_ietf_ext_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return implementation->stream_ietf_ext_xor_ic(c, m, mlen, n, 0U, k); +} + +int +crypto_stream_chacha20_ietf(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + if (clen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext(c, clen, n, k); +} + +int +crypto_stream_chacha20_ietf_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k) +{ + if ((unsigned long long) ic > + (64ULL * (1ULL << 32)) / 64ULL - (mlen + 63ULL) / 64ULL) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_chacha20_ietf_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + if (mlen > crypto_stream_chacha20_ietf_MESSAGEBYTES_MAX) { + sodium_misuse(); + } + return crypto_stream_chacha20_ietf_ext_xor(c, m, mlen, n, k); +} + +void +crypto_stream_chacha20_ietf_keygen(unsigned char k[crypto_stream_chacha20_ietf_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_chacha20_ietf_KEYBYTES); +} + +void +crypto_stream_chacha20_keygen(unsigned char k[crypto_stream_chacha20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_chacha20_KEYBYTES); +} + +int +_crypto_stream_chacha20_pick_best_implementation(void) +{ + implementation = &crypto_stream_chacha20_ref_implementation; +#if 
defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_avx2()) { + implementation = &crypto_stream_chacha20_dolbeau_avx2_implementation; + return 0; + } +#endif +#if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) + if (sodium_runtime_has_ssse3()) { + implementation = &crypto_stream_chacha20_dolbeau_ssse3_implementation; + return 0; + } +#endif + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h index 40f782f418..0233a4dbef 100644 --- a/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h +++ b/libs/libsodium/src/crypto_stream/chacha20/stream_chacha20.h @@ -1,22 +1,22 @@ - -#ifndef stream_chacha20_H -#define stream_chacha20_H - -#include - -typedef struct crypto_stream_chacha20_implementation { - int (*stream)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_ietf_ext)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k); - int (*stream_ietf_ext_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint32_t ic, - const unsigned char *k); -} crypto_stream_chacha20_implementation; - -#endif + +#ifndef stream_chacha20_H +#define stream_chacha20_H + +#include + +typedef struct crypto_stream_chacha20_implementation { + int (*stream)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_ietf_ext)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); + int (*stream_ietf_ext_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint32_t ic, + const unsigned char *k); +} crypto_stream_chacha20_implementation; + +#endif diff --git a/libs/libsodium/src/crypto_stream/crypto_stream.c b/libs/libsodium/src/crypto_stream/crypto_stream.c index 58d25381ab..6eaac0f747 100644 --- a/libs/libsodium/src/crypto_stream/crypto_stream.c +++ b/libs/libsodium/src/crypto_stream/crypto_stream.c @@ -1,49 +1,49 @@ - -#include "crypto_stream.h" -#include "randombytes.h" - -size_t -crypto_stream_keybytes(void) -{ - return crypto_stream_KEYBYTES; -} - -size_t -crypto_stream_noncebytes(void) -{ - return crypto_stream_NONCEBYTES; -} - -size_t -crypto_stream_messagebytes_max(void) -{ - return crypto_stream_MESSAGEBYTES_MAX; -} - -const char * -crypto_stream_primitive(void) -{ - return crypto_stream_PRIMITIVE; -} - -int -crypto_stream(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - return crypto_stream_xsalsa20(c, clen, n, k); -} - - -int -crypto_stream_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xsalsa20_xor(c, m, mlen, n, k); -} - -void -crypto_stream_keygen(unsigned char k[crypto_stream_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_KEYBYTES); -} + +#include "crypto_stream.h" +#include "randombytes.h" + +size_t +crypto_stream_keybytes(void) +{ + return crypto_stream_KEYBYTES; +} 
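/* A minimal usage sketch (not part of the upstream file): the generic
   crypto_stream API is a thin wrapper over xsalsa20, so encrypting a
   buffer that already contains the keystream zeroes it out:

       unsigned char k[crypto_stream_KEYBYTES];
       unsigned char n[crypto_stream_NONCEBYTES] = { 0 };
       unsigned char buf[64];

       crypto_stream_keygen(k);
       crypto_stream(buf, sizeof buf, n, k);           // raw keystream
       crypto_stream_xor(buf, buf, sizeof buf, n, k);  // buf -> all zeros
*/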
+ +size_t +crypto_stream_noncebytes(void) +{ + return crypto_stream_NONCEBYTES; +} + +size_t +crypto_stream_messagebytes_max(void) +{ + return crypto_stream_MESSAGEBYTES_MAX; +} + +const char * +crypto_stream_primitive(void) +{ + return crypto_stream_PRIMITIVE; +} + +int +crypto_stream(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + return crypto_stream_xsalsa20(c, clen, n, k); +} + + +int +crypto_stream_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xsalsa20_xor(c, m, mlen, n, k); +} + +void +crypto_stream_keygen(unsigned char k[crypto_stream_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c index f0854ebf7e..81522f0065 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.c @@ -1,120 +1,120 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa20.h" -#include "crypto_stream_salsa20.h" -#include "utils.h" - -#include "../stream_salsa20.h" -#include "salsa20_ref.h" - -#ifndef HAVE_AMD64_ASM - -static int -stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; i++) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; i++) { - in[i] = n[i]; - } - for (i = 8; i < 16; i++) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa20(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; i++) { - u += (unsigned int) in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int) clen; i++) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -static int -stream_ref_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; i++) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; i++) { - in[i] = n[i]; - } - for (i = 8; i < 16; i++) { - in[i] = (unsigned char) (ic & 0xff); - ic >>= 8; - } - while (mlen >= 64) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < 64; i++) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; i++) { - u += (unsigned int) in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa20(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int) mlen; i++) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_ref_implementation = { - SODIUM_C99(.stream =) stream_ref, - SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, - }; - -#endif +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa20.h" +#include "crypto_stream_salsa20.h" +#include "utils.h" + +#include "../stream_salsa20.h" +#include "salsa20_ref.h" + +#ifndef HAVE_AMD64_ASM + +static int +stream_ref(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; i++) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; i++) { + in[i] = n[i]; + } + for (i = 8; i < 16; i++) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa20(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; i++) { + u += (unsigned int) in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int) clen; i++) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +static int +stream_ref_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; i++) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; i++) { + in[i] = n[i]; + } + for (i = 8; i < 16; i++) { + in[i] = (unsigned char) (ic & 0xff); + ic >>= 8; + } + while (mlen >= 64) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < 64; i++) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; i++) { + u += (unsigned int) in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa20(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int) mlen; i++) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation = { + SODIUM_C99(.stream =) stream_ref, + SODIUM_C99(.stream_xor_ic =) stream_ref_xor_ic, + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h index 8716cb4048..9976cc7f3a 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h +++ b/libs/libsodium/src/crypto_stream/salsa20/ref/salsa20_ref.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_ref_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_ref_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c index 4529850136..cf06e6460d 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c +++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.c @@ -1,100 +1,100 @@ -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/implementations.h" -#include "randombytes.h" -#include "runtime.h" -#include "stream_salsa20.h" - -#ifdef HAVE_AMD64_ASM -# include "xmm6/salsa20_xmm6.h" -#else -# include "ref/salsa20_ref.h" -#endif -#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) 
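/* i.e. the intrinsics-based SSE2 path is only built when the amd64
   assembly implementation is not available */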
-# include "xmm6int/salsa20_xmm6int-sse2.h" -#endif -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) -# include "xmm6int/salsa20_xmm6int-avx2.h" -#endif - -#if HAVE_AMD64_ASM -static const crypto_stream_salsa20_implementation *implementation = - &crypto_stream_salsa20_xmm6_implementation; -#else -static const crypto_stream_salsa20_implementation *implementation = - &crypto_stream_salsa20_ref_implementation; -#endif - -size_t -crypto_stream_salsa20_keybytes(void) -{ - return crypto_stream_salsa20_KEYBYTES; -} - -size_t -crypto_stream_salsa20_noncebytes(void) -{ - return crypto_stream_salsa20_NONCEBYTES; -} - -size_t -crypto_stream_salsa20_messagebytes_max(void) -{ - return crypto_stream_salsa20_MESSAGEBYTES_MAX; -} - -int -crypto_stream_salsa20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - return implementation->stream(c, clen, n, k); -} - -int -crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - return implementation->stream_xor_ic(c, m, mlen, n, ic, k); -} - -int -crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); -} - -void -crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); -} - -int -_crypto_stream_salsa20_pick_best_implementation(void) -{ -#ifdef HAVE_AMD64_ASM - implementation = &crypto_stream_salsa20_xmm6_implementation; -#else - implementation = &crypto_stream_salsa20_ref_implementation; -#endif - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - if (sodium_runtime_has_avx2()) { - implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation; - return 0; - } -#endif -#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) - if (sodium_runtime_has_sse2()) { - implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation; - return 0; - } -#endif - return 0; /* LCOV_EXCL_LINE */ -} +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/implementations.h" +#include "randombytes.h" +#include "runtime.h" +#include "stream_salsa20.h" + +#ifdef HAVE_AMD64_ASM +# include "xmm6/salsa20_xmm6.h" +#else +# include "ref/salsa20_ref.h" +#endif +#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) +# include "xmm6int/salsa20_xmm6int-sse2.h" +#endif +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) +# include "xmm6int/salsa20_xmm6int-avx2.h" +#endif + +#if HAVE_AMD64_ASM +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_xmm6_implementation; +#else +static const crypto_stream_salsa20_implementation *implementation = + &crypto_stream_salsa20_ref_implementation; +#endif + +size_t +crypto_stream_salsa20_keybytes(void) +{ + return crypto_stream_salsa20_KEYBYTES; +} + +size_t +crypto_stream_salsa20_noncebytes(void) +{ + return crypto_stream_salsa20_NONCEBYTES; +} + +size_t +crypto_stream_salsa20_messagebytes_max(void) +{ + return crypto_stream_salsa20_MESSAGEBYTES_MAX; +} + +int +crypto_stream_salsa20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) 
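/* forwards to the implementation picked at runtime: the amd64 assembly
   or the portable reference code by default, SSE2/AVX2 when the CPU and
   build support them */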
+{ + return implementation->stream(c, clen, n, k); +} + +int +crypto_stream_salsa20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + return implementation->stream_xor_ic(c, m, mlen, n, ic, k); +} + +int +crypto_stream_salsa20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return implementation->stream_xor_ic(c, m, mlen, n, 0U, k); +} + +void +crypto_stream_salsa20_keygen(unsigned char k[crypto_stream_salsa20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa20_KEYBYTES); +} + +int +_crypto_stream_salsa20_pick_best_implementation(void) +{ +#ifdef HAVE_AMD64_ASM + implementation = &crypto_stream_salsa20_xmm6_implementation; +#else + implementation = &crypto_stream_salsa20_ref_implementation; +#endif + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + if (sodium_runtime_has_avx2()) { + implementation = &crypto_stream_salsa20_xmm6int_avx2_implementation; + return 0; + } +#endif +#if !defined(HAVE_AMD64_ASM) && defined(HAVE_EMMINTRIN_H) + if (sodium_runtime_has_sse2()) { + implementation = &crypto_stream_salsa20_xmm6int_sse2_implementation; + return 0; + } +#endif + return 0; /* LCOV_EXCL_LINE */ +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h index 1949d38113..0b5971ca48 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h +++ b/libs/libsodium/src/crypto_stream/salsa20/stream_salsa20.h @@ -1,16 +1,16 @@ - -#ifndef stream_salsa20_H -#define stream_salsa20_H - -#include - -typedef struct crypto_stream_salsa20_implementation { - int (*stream)(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, uint64_t ic, - const unsigned char *k); -} crypto_stream_salsa20_implementation; - -#endif + +#ifndef stream_salsa20_H +#define stream_salsa20_H + +#include + +typedef struct crypto_stream_salsa20_implementation { + int (*stream)(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + int (*stream_xor_ic)(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, uint64_t ic, + const unsigned char *k); +} crypto_stream_salsa20_implementation; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S index 6d9f354e10..9ecea1b088 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6-asm.S @@ -1,960 +1,960 @@ -#ifdef HAVE_AMD64_ASM - -.text -.p2align 5 - -#ifdef ASM_HIDE_SYMBOL -ASM_HIDE_SYMBOL stream_salsa20_xmm6 -ASM_HIDE_SYMBOL _stream_salsa20_xmm6 -#endif -.globl stream_salsa20_xmm6 -.globl _stream_salsa20_xmm6 -#ifdef __ELF__ -.type stream_salsa20_xmm6, @function -.type _stream_salsa20_xmm6, @function -#endif -stream_salsa20_xmm6: -_stream_salsa20_xmm6: -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rsi,%r9 -mov %rdi,%rdi -mov %rdi,%rsi -mov %rdx,%rdx -mov %rcx,%r10 -cmp 
$0,%r9 -jbe ._done -mov $0,%rax -mov %r9,%rcx -rep stosb -sub %r9,%rdi -movq $0,472(%rsp) -jmp ._start - -.text -.p2align 5 - -#ifdef ASM_HIDE_SYMBOL -ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic -ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic -#endif -.globl stream_salsa20_xmm6_xor_ic -.globl _stream_salsa20_xmm6_xor_ic -#ifdef __ELF__ -.type stream_salsa20_xmm6_xor_ic, @function -.type _stream_salsa20_xmm6_xor_ic, @function -#endif -stream_salsa20_xmm6_xor_ic: -_stream_salsa20_xmm6_xor_ic: - -mov %rsp,%r11 -and $31,%r11 -add $512,%r11 -sub %r11,%rsp -movq %r11,416(%rsp) -movq %r12,424(%rsp) -movq %r13,432(%rsp) -movq %r14,440(%rsp) -movq %r15,448(%rsp) -movq %rbx,456(%rsp) -movq %rbp,464(%rsp) -mov %rdi,%rdi -mov %rsi,%rsi -mov %r9,%r10 -movq %r8,472(%rsp) -mov %rdx,%r9 -mov %rcx,%rdx -cmp $0,%r9 -jbe ._done - -._start: -movl 20(%r10),%ecx -movl 0(%r10),%r8d -movl 0(%rdx),%eax -movl 16(%r10),%r11d -movl %ecx,64(%rsp) -movl %r8d,4+64(%rsp) -movl %eax,8+64(%rsp) -movl %r11d,12+64(%rsp) -movl 24(%r10),%r8d -movl 4(%r10),%eax -movl 4(%rdx),%edx -movq 472(%rsp),%rcx -movl %ecx,80(%rsp) -movl %r8d,4+80(%rsp) -movl %eax,8+80(%rsp) -movl %edx,12+80(%rsp) -movl 12(%r10),%edx -shr $32,%rcx -movl 28(%r10),%r8d -movl 8(%r10),%eax -movl %edx,96(%rsp) -movl %ecx,4+96(%rsp) -movl %r8d,8+96(%rsp) -movl %eax,12+96(%rsp) -mov $1634760805,%rdx -mov $857760878,%rcx -mov $2036477234,%r8 -mov $1797285236,%rax -movl %edx,112(%rsp) -movl %ecx,4+112(%rsp) -movl %r8d,8+112(%rsp) -movl %eax,12+112(%rsp) -cmp $256,%r9 -jb ._bytesbetween1and255 -movdqa 112(%rsp),%xmm0 -pshufd $0x55,%xmm0,%xmm1 -pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm3 -pshufd $0x00,%xmm0,%xmm0 -movdqa %xmm1,128(%rsp) -movdqa %xmm2,144(%rsp) -movdqa %xmm3,160(%rsp) -movdqa %xmm0,176(%rsp) -movdqa 64(%rsp),%xmm0 -pshufd $0xaa,%xmm0,%xmm1 -pshufd $0xff,%xmm0,%xmm2 -pshufd $0x00,%xmm0,%xmm3 -pshufd $0x55,%xmm0,%xmm0 -movdqa %xmm1,192(%rsp) -movdqa %xmm2,208(%rsp) -movdqa %xmm3,224(%rsp) -movdqa %xmm0,240(%rsp) -movdqa 80(%rsp),%xmm0 -pshufd $0xff,%xmm0,%xmm1 -pshufd $0x55,%xmm0,%xmm2 -pshufd $0xaa,%xmm0,%xmm0 -movdqa %xmm1,256(%rsp) -movdqa %xmm2,272(%rsp) -movdqa %xmm0,288(%rsp) -movdqa 96(%rsp),%xmm0 -pshufd $0x00,%xmm0,%xmm1 -pshufd $0xaa,%xmm0,%xmm2 -pshufd $0xff,%xmm0,%xmm0 -movdqa %xmm1,304(%rsp) -movdqa %xmm2,320(%rsp) -movdqa %xmm0,336(%rsp) - -.p2align 4 -._bytesatleast256: -movq 472(%rsp),%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,352(%rsp) -movl %ecx,368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,4+352(%rsp) -movl %ecx,4+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,8+352(%rsp) -movl %ecx,8+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,12+352(%rsp) -movl %ecx,12+368(%rsp) -add $1,%rdx -mov %rdx,%rcx -shr $32,%rcx -movl %edx,80(%rsp) -movl %ecx,4+96(%rsp) -movq %rdx,472(%rsp) -movq %r9,480(%rsp) -mov $20,%rdx -movdqa 128(%rsp),%xmm0 -movdqa 144(%rsp),%xmm1 -movdqa 160(%rsp),%xmm2 -movdqa 320(%rsp),%xmm3 -movdqa 336(%rsp),%xmm4 -movdqa 192(%rsp),%xmm5 -movdqa 208(%rsp),%xmm6 -movdqa 240(%rsp),%xmm7 -movdqa 256(%rsp),%xmm8 -movdqa 272(%rsp),%xmm9 -movdqa 288(%rsp),%xmm10 -movdqa 368(%rsp),%xmm11 -movdqa 176(%rsp),%xmm12 -movdqa 224(%rsp),%xmm13 -movdqa 304(%rsp),%xmm14 -movdqa 352(%rsp),%xmm15 - -.p2align 4 -._mainloop1: -movdqa %xmm1,384(%rsp) -movdqa %xmm2,400(%rsp) -movdqa %xmm13,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm14 -psrld $25,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm7,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm11 
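/* rotation pattern used throughout this loop: ROTL32(x, 7) is emulated
   as (x << 7) ^ (x >> 25) -- pslld and psrld compute the two halves on
   separate copies, and the pair of pxor instructions folds both into the
   destination register (the halves have disjoint bits, so XOR acts as OR) */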
-psrld $25,%xmm2 -pxor %xmm2,%xmm11 -movdqa %xmm12,%xmm1 -paddd %xmm14,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm15 -psrld $23,%xmm2 -pxor %xmm2,%xmm15 -movdqa %xmm0,%xmm1 -paddd %xmm11,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm9 -psrld $23,%xmm2 -pxor %xmm2,%xmm9 -movdqa %xmm14,%xmm1 -paddd %xmm15,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm13 -psrld $19,%xmm2 -pxor %xmm2,%xmm13 -movdqa %xmm11,%xmm1 -paddd %xmm9,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm7 -psrld $19,%xmm2 -pxor %xmm2,%xmm7 -movdqa %xmm15,%xmm1 -paddd %xmm13,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor %xmm1,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa 384(%rsp),%xmm1 -movdqa %xmm12,384(%rsp) -movdqa %xmm9,%xmm2 -paddd %xmm7,%xmm2 -movdqa %xmm2,%xmm12 -pslld $18,%xmm2 -pxor %xmm2,%xmm0 -psrld $14,%xmm12 -pxor %xmm12,%xmm0 -movdqa %xmm5,%xmm2 -paddd %xmm1,%xmm2 -movdqa %xmm2,%xmm12 -pslld $7,%xmm2 -pxor %xmm2,%xmm3 -psrld $25,%xmm12 -pxor %xmm12,%xmm3 -movdqa 400(%rsp),%xmm2 -movdqa %xmm0,400(%rsp) -movdqa %xmm6,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm4 -psrld $25,%xmm12 -pxor %xmm12,%xmm4 -movdqa %xmm1,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm10 -psrld $23,%xmm12 -pxor %xmm12,%xmm10 -movdqa %xmm2,%xmm0 -paddd %xmm4,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm8 -psrld $23,%xmm12 -pxor %xmm12,%xmm8 -movdqa %xmm3,%xmm0 -paddd %xmm10,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm5 -psrld $19,%xmm12 -pxor %xmm12,%xmm5 -movdqa %xmm4,%xmm0 -paddd %xmm8,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm6 -psrld $19,%xmm12 -pxor %xmm12,%xmm6 -movdqa %xmm10,%xmm0 -paddd %xmm5,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa 384(%rsp),%xmm0 -movdqa %xmm1,384(%rsp) -movdqa %xmm4,%xmm1 -paddd %xmm0,%xmm1 -movdqa %xmm1,%xmm12 -pslld $7,%xmm1 -pxor %xmm1,%xmm7 -psrld $25,%xmm12 -pxor %xmm12,%xmm7 -movdqa %xmm8,%xmm1 -paddd %xmm6,%xmm1 -movdqa %xmm1,%xmm12 -pslld $18,%xmm1 -pxor %xmm1,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 400(%rsp),%xmm12 -movdqa %xmm2,400(%rsp) -movdqa %xmm14,%xmm1 -paddd %xmm12,%xmm1 -movdqa %xmm1,%xmm2 -pslld $7,%xmm1 -pxor %xmm1,%xmm5 -psrld $25,%xmm2 -pxor %xmm2,%xmm5 -movdqa %xmm0,%xmm1 -paddd %xmm7,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm10 -psrld $23,%xmm2 -pxor %xmm2,%xmm10 -movdqa %xmm12,%xmm1 -paddd %xmm5,%xmm1 -movdqa %xmm1,%xmm2 -pslld $9,%xmm1 -pxor %xmm1,%xmm8 -psrld $23,%xmm2 -pxor %xmm2,%xmm8 -movdqa %xmm7,%xmm1 -paddd %xmm10,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm4 -psrld $19,%xmm2 -pxor %xmm2,%xmm4 -movdqa %xmm5,%xmm1 -paddd %xmm8,%xmm1 -movdqa %xmm1,%xmm2 -pslld $13,%xmm1 -pxor %xmm1,%xmm14 -psrld $19,%xmm2 -pxor %xmm2,%xmm14 -movdqa %xmm10,%xmm1 -paddd %xmm4,%xmm1 -movdqa %xmm1,%xmm2 -pslld $18,%xmm1 -pxor %xmm1,%xmm0 -psrld $14,%xmm2 -pxor %xmm2,%xmm0 -movdqa 384(%rsp),%xmm1 -movdqa %xmm0,384(%rsp) -movdqa %xmm8,%xmm0 -paddd %xmm14,%xmm0 -movdqa %xmm0,%xmm2 -pslld $18,%xmm0 -pxor %xmm0,%xmm12 -psrld $14,%xmm2 -pxor %xmm2,%xmm12 -movdqa %xmm11,%xmm0 -paddd %xmm1,%xmm0 -movdqa %xmm0,%xmm2 -pslld $7,%xmm0 -pxor %xmm0,%xmm6 -psrld $25,%xmm2 -pxor %xmm2,%xmm6 -movdqa 400(%rsp),%xmm2 -movdqa %xmm12,400(%rsp) -movdqa %xmm3,%xmm0 -paddd %xmm2,%xmm0 -movdqa %xmm0,%xmm12 -pslld $7,%xmm0 -pxor %xmm0,%xmm13 -psrld $25,%xmm12 -pxor %xmm12,%xmm13 -movdqa %xmm1,%xmm0 -paddd %xmm6,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 
-pxor %xmm0,%xmm15 -psrld $23,%xmm12 -pxor %xmm12,%xmm15 -movdqa %xmm2,%xmm0 -paddd %xmm13,%xmm0 -movdqa %xmm0,%xmm12 -pslld $9,%xmm0 -pxor %xmm0,%xmm9 -psrld $23,%xmm12 -pxor %xmm12,%xmm9 -movdqa %xmm6,%xmm0 -paddd %xmm15,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm11 -psrld $19,%xmm12 -pxor %xmm12,%xmm11 -movdqa %xmm13,%xmm0 -paddd %xmm9,%xmm0 -movdqa %xmm0,%xmm12 -pslld $13,%xmm0 -pxor %xmm0,%xmm3 -psrld $19,%xmm12 -pxor %xmm12,%xmm3 -movdqa %xmm15,%xmm0 -paddd %xmm11,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm1 -psrld $14,%xmm12 -pxor %xmm12,%xmm1 -movdqa %xmm9,%xmm0 -paddd %xmm3,%xmm0 -movdqa %xmm0,%xmm12 -pslld $18,%xmm0 -pxor %xmm0,%xmm2 -psrld $14,%xmm12 -pxor %xmm12,%xmm2 -movdqa 384(%rsp),%xmm12 -movdqa 400(%rsp),%xmm0 -sub $2,%rdx -ja ._mainloop1 - -paddd 176(%rsp),%xmm12 -paddd 240(%rsp),%xmm7 -paddd 288(%rsp),%xmm10 -paddd 336(%rsp),%xmm4 -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 0(%rsi),%edx -xorl 4(%rsi),%ecx -xorl 8(%rsi),%r8d -xorl 12(%rsi),%r9d -movl %edx,0(%rdi) -movl %ecx,4(%rdi) -movl %r8d,8(%rdi) -movl %r9d,12(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 64(%rsi),%edx -xorl 68(%rsi),%ecx -xorl 72(%rsi),%r8d -xorl 76(%rsi),%r9d -movl %edx,64(%rdi) -movl %ecx,68(%rdi) -movl %r8d,72(%rdi) -movl %r9d,76(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -pshufd $0x39,%xmm12,%xmm12 -pshufd $0x39,%xmm7,%xmm7 -pshufd $0x39,%xmm10,%xmm10 -pshufd $0x39,%xmm4,%xmm4 -xorl 128(%rsi),%edx -xorl 132(%rsi),%ecx -xorl 136(%rsi),%r8d -xorl 140(%rsi),%r9d -movl %edx,128(%rdi) -movl %ecx,132(%rdi) -movl %r8d,136(%rdi) -movl %r9d,140(%rdi) -movd %xmm12,%rdx -movd %xmm7,%rcx -movd %xmm10,%r8 -movd %xmm4,%r9 -xorl 192(%rsi),%edx -xorl 196(%rsi),%ecx -xorl 200(%rsi),%r8d -xorl 204(%rsi),%r9d -movl %edx,192(%rdi) -movl %ecx,196(%rdi) -movl %r8d,200(%rdi) -movl %r9d,204(%rdi) -paddd 304(%rsp),%xmm14 -paddd 128(%rsp),%xmm0 -paddd 192(%rsp),%xmm5 -paddd 256(%rsp),%xmm8 -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 16(%rsi),%edx -xorl 20(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 28(%rsi),%r9d -movl %edx,16(%rdi) -movl %ecx,20(%rdi) -movl %r8d,24(%rdi) -movl %r9d,28(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 80(%rsi),%edx -xorl 84(%rsi),%ecx -xorl 88(%rsi),%r8d -xorl 92(%rsi),%r9d -movl %edx,80(%rdi) -movl %ecx,84(%rdi) -movl %r8d,88(%rdi) -movl %r9d,92(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -pshufd $0x39,%xmm14,%xmm14 -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm5,%xmm5 -pshufd $0x39,%xmm8,%xmm8 -xorl 144(%rsi),%edx -xorl 148(%rsi),%ecx -xorl 152(%rsi),%r8d -xorl 156(%rsi),%r9d -movl %edx,144(%rdi) -movl %ecx,148(%rdi) -movl %r8d,152(%rdi) -movl %r9d,156(%rdi) -movd %xmm14,%rdx -movd %xmm0,%rcx -movd %xmm5,%r8 -movd %xmm8,%r9 -xorl 208(%rsi),%edx -xorl 212(%rsi),%ecx -xorl 216(%rsi),%r8d -xorl 220(%rsi),%r9d -movl %edx,208(%rdi) -movl %ecx,212(%rdi) -movl %r8d,216(%rdi) -movl %r9d,220(%rdi) -paddd 352(%rsp),%xmm15 -paddd 368(%rsp),%xmm11 -paddd 144(%rsp),%xmm1 
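
After the rounds, the code above adds the saved input words (kept at 128(%rsp) through 368(%rsp)) back into the state - the usual Salsa20 feedforward - and then peels each vector apart with movd plus pshufd $0x39, XORing word by word against the message at a 64-byte stride, because lane j of every state vector belongs to block j of the batch. A hedged intrinsics sketch of one such word store (xor_word_4blocks is a hypothetical helper, not in this source):

#include <emmintrin.h>
#include <stdint.h>
#include <string.h>

static void
xor_word_4blocks(__m128i v, __m128i initial, size_t w,
                 const unsigned char *m, unsigned char *c)
{
    uint32_t lane[4];
    size_t   j;

    v = _mm_add_epi32(v, initial);         /* feedforward */
    _mm_storeu_si128((__m128i *) (void *) lane, v);
    for (j = 0; j < 4; j++) {              /* lane j -> block j */
        uint32_t t;

        memcpy(&t, m + j * 64 + w * 4, 4); /* unaligned-safe load */
        t ^= lane[j];
        memcpy(c + j * 64 + w * 4, &t, 4);
    }
}
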
-paddd 208(%rsp),%xmm6 -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 32(%rsi),%edx -xorl 36(%rsi),%ecx -xorl 40(%rsi),%r8d -xorl 44(%rsi),%r9d -movl %edx,32(%rdi) -movl %ecx,36(%rdi) -movl %r8d,40(%rdi) -movl %r9d,44(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 96(%rsi),%edx -xorl 100(%rsi),%ecx -xorl 104(%rsi),%r8d -xorl 108(%rsi),%r9d -movl %edx,96(%rdi) -movl %ecx,100(%rdi) -movl %r8d,104(%rdi) -movl %r9d,108(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -pshufd $0x39,%xmm15,%xmm15 -pshufd $0x39,%xmm11,%xmm11 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm6,%xmm6 -xorl 160(%rsi),%edx -xorl 164(%rsi),%ecx -xorl 168(%rsi),%r8d -xorl 172(%rsi),%r9d -movl %edx,160(%rdi) -movl %ecx,164(%rdi) -movl %r8d,168(%rdi) -movl %r9d,172(%rdi) -movd %xmm15,%rdx -movd %xmm11,%rcx -movd %xmm1,%r8 -movd %xmm6,%r9 -xorl 224(%rsi),%edx -xorl 228(%rsi),%ecx -xorl 232(%rsi),%r8d -xorl 236(%rsi),%r9d -movl %edx,224(%rdi) -movl %ecx,228(%rdi) -movl %r8d,232(%rdi) -movl %r9d,236(%rdi) -paddd 224(%rsp),%xmm13 -paddd 272(%rsp),%xmm9 -paddd 320(%rsp),%xmm3 -paddd 160(%rsp),%xmm2 -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 48(%rsi),%edx -xorl 52(%rsi),%ecx -xorl 56(%rsi),%r8d -xorl 60(%rsi),%r9d -movl %edx,48(%rdi) -movl %ecx,52(%rdi) -movl %r8d,56(%rdi) -movl %r9d,60(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 112(%rsi),%edx -xorl 116(%rsi),%ecx -xorl 120(%rsi),%r8d -xorl 124(%rsi),%r9d -movl %edx,112(%rdi) -movl %ecx,116(%rdi) -movl %r8d,120(%rdi) -movl %r9d,124(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -pshufd $0x39,%xmm13,%xmm13 -pshufd $0x39,%xmm9,%xmm9 -pshufd $0x39,%xmm3,%xmm3 -pshufd $0x39,%xmm2,%xmm2 -xorl 176(%rsi),%edx -xorl 180(%rsi),%ecx -xorl 184(%rsi),%r8d -xorl 188(%rsi),%r9d -movl %edx,176(%rdi) -movl %ecx,180(%rdi) -movl %r8d,184(%rdi) -movl %r9d,188(%rdi) -movd %xmm13,%rdx -movd %xmm9,%rcx -movd %xmm3,%r8 -movd %xmm2,%r9 -xorl 240(%rsi),%edx -xorl 244(%rsi),%ecx -xorl 248(%rsi),%r8d -xorl 252(%rsi),%r9d -movl %edx,240(%rdi) -movl %ecx,244(%rdi) -movl %r8d,248(%rdi) -movl %r9d,252(%rdi) -movq 480(%rsp),%r9 -sub $256,%r9 -add $256,%rsi -add $256,%rdi -cmp $256,%r9 -jae ._bytesatleast256 - -cmp $0,%r9 -jbe ._done - -._bytesbetween1and255: -cmp $64,%r9 -jae ._nocopy - -mov %rdi,%rdx -leaq 0(%rsp),%rdi -mov %r9,%rcx -rep movsb -leaq 0(%rsp),%rdi -leaq 0(%rsp),%rsi - -._nocopy: -movq %r9,480(%rsp) -movdqa 112(%rsp),%xmm0 -movdqa 64(%rsp),%xmm1 -movdqa 80(%rsp),%xmm2 -movdqa 96(%rsp),%xmm3 -movdqa %xmm1,%xmm4 -mov $20,%rcx - -.p2align 4 -._mainloop2: -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa 
%xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm3 -pxor %xmm6,%xmm3 -paddd %xmm3,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm3,%xmm3 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm1 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm3,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pxor %xmm6,%xmm0 -paddd %xmm0,%xmm4 -movdqa %xmm0,%xmm5 -movdqa %xmm4,%xmm6 -pslld $7,%xmm4 -psrld $25,%xmm6 -pxor %xmm4,%xmm1 -pxor %xmm6,%xmm1 -paddd %xmm1,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $9,%xmm5 -psrld $23,%xmm6 -pxor %xmm5,%xmm2 -pshufd $0x93,%xmm1,%xmm1 -pxor %xmm6,%xmm2 -paddd %xmm2,%xmm4 -movdqa %xmm2,%xmm5 -movdqa %xmm4,%xmm6 -pslld $13,%xmm4 -psrld $19,%xmm6 -pxor %xmm4,%xmm3 -pshufd $0x4e,%xmm2,%xmm2 -pxor %xmm6,%xmm3 -sub $4,%rcx -paddd %xmm3,%xmm5 -movdqa %xmm1,%xmm4 -movdqa %xmm5,%xmm6 -pslld $18,%xmm5 -pxor %xmm7,%xmm7 -psrld $14,%xmm6 -pxor %xmm5,%xmm0 -pshufd $0x39,%xmm3,%xmm3 -pxor %xmm6,%xmm0 -ja ._mainloop2 - -paddd 112(%rsp),%xmm0 -paddd 64(%rsp),%xmm1 -paddd 80(%rsp),%xmm2 -paddd 96(%rsp),%xmm3 -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 0(%rsi),%ecx -xorl 48(%rsi),%r8d -xorl 32(%rsi),%r9d -xorl 16(%rsi),%eax -movl %ecx,0(%rdi) -movl %r8d,48(%rdi) -movl %r9d,32(%rdi) -movl %eax,16(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 20(%rsi),%ecx -xorl 4(%rsi),%r8d -xorl 52(%rsi),%r9d -xorl 36(%rsi),%eax -movl %ecx,20(%rdi) -movl %r8d,4(%rdi) -movl %r9d,52(%rdi) -movl %eax,36(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -pshufd $0x39,%xmm0,%xmm0 -pshufd $0x39,%xmm1,%xmm1 -pshufd $0x39,%xmm2,%xmm2 -pshufd $0x39,%xmm3,%xmm3 -xorl 40(%rsi),%ecx -xorl 24(%rsi),%r8d -xorl 8(%rsi),%r9d -xorl 56(%rsi),%eax -movl %ecx,40(%rdi) -movl %r8d,24(%rdi) -movl %r9d,8(%rdi) -movl %eax,56(%rdi) -movd %xmm0,%rcx -movd %xmm1,%r8 -movd %xmm2,%r9 -movd %xmm3,%rax -xorl 60(%rsi),%ecx -xorl 44(%rsi),%r8d -xorl 28(%rsi),%r9d -xorl 12(%rsi),%eax -movl %ecx,60(%rdi) -movl %r8d,44(%rdi) -movl %r9d,28(%rdi) -movl %eax,12(%rdi) -movq 480(%rsp),%r9 -movq 472(%rsp),%rcx -add $1,%rcx -mov %rcx,%r8 -shr $32,%r8 -movl %ecx,80(%rsp) -movl %r8d,4+96(%rsp) -movq %rcx,472(%rsp) -cmp $64,%r9 -ja ._bytesatleast65 -jae ._bytesatleast64 - -mov %rdi,%rsi -mov %rdx,%rdi -mov %r9,%rcx -rep movsb - -._bytesatleast64: -._done: -movq 
416(%rsp),%r11 -movq 424(%rsp),%r12 -movq 432(%rsp),%r13 -movq 440(%rsp),%r14 -movq 448(%rsp),%r15 -movq 456(%rsp),%rbx -movq 464(%rsp),%rbp -add %r11,%rsp -xor %rax,%rax -mov %rsi,%rdx -ret - -._bytesatleast65: -sub $64,%r9 -add $64,%rdi -add $64,%rsi -jmp ._bytesbetween1and255 - -#endif - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif +#ifdef HAVE_AMD64_ASM + +.text +.p2align 5 + +#ifdef ASM_HIDE_SYMBOL +ASM_HIDE_SYMBOL stream_salsa20_xmm6 +ASM_HIDE_SYMBOL _stream_salsa20_xmm6 +#endif +.globl stream_salsa20_xmm6 +.globl _stream_salsa20_xmm6 +#ifdef __ELF__ +.type stream_salsa20_xmm6, @function +.type _stream_salsa20_xmm6, @function +#endif +stream_salsa20_xmm6: +_stream_salsa20_xmm6: +mov %rsp,%r11 +and $31,%r11 +add $512,%r11 +sub %r11,%rsp +movq %r11,416(%rsp) +movq %r12,424(%rsp) +movq %r13,432(%rsp) +movq %r14,440(%rsp) +movq %r15,448(%rsp) +movq %rbx,456(%rsp) +movq %rbp,464(%rsp) +mov %rsi,%r9 +mov %rdi,%rdi +mov %rdi,%rsi +mov %rdx,%rdx +mov %rcx,%r10 +cmp $0,%r9 +jbe ._done +mov $0,%rax +mov %r9,%rcx +rep stosb +sub %r9,%rdi +movq $0,472(%rsp) +jmp ._start + +.text +.p2align 5 + +#ifdef ASM_HIDE_SYMBOL +ASM_HIDE_SYMBOL stream_salsa20_xmm6_xor_ic +ASM_HIDE_SYMBOL _stream_salsa20_xmm6_xor_ic +#endif +.globl stream_salsa20_xmm6_xor_ic +.globl _stream_salsa20_xmm6_xor_ic +#ifdef __ELF__ +.type stream_salsa20_xmm6_xor_ic, @function +.type _stream_salsa20_xmm6_xor_ic, @function +#endif +stream_salsa20_xmm6_xor_ic: +_stream_salsa20_xmm6_xor_ic: + +mov %rsp,%r11 +and $31,%r11 +add $512,%r11 +sub %r11,%rsp +movq %r11,416(%rsp) +movq %r12,424(%rsp) +movq %r13,432(%rsp) +movq %r14,440(%rsp) +movq %r15,448(%rsp) +movq %rbx,456(%rsp) +movq %rbp,464(%rsp) +mov %rdi,%rdi +mov %rsi,%rsi +mov %r9,%r10 +movq %r8,472(%rsp) +mov %rdx,%r9 +mov %rcx,%rdx +cmp $0,%r9 +jbe ._done + +._start: +movl 20(%r10),%ecx +movl 0(%r10),%r8d +movl 0(%rdx),%eax +movl 16(%r10),%r11d +movl %ecx,64(%rsp) +movl %r8d,4+64(%rsp) +movl %eax,8+64(%rsp) +movl %r11d,12+64(%rsp) +movl 24(%r10),%r8d +movl 4(%r10),%eax +movl 4(%rdx),%edx +movq 472(%rsp),%rcx +movl %ecx,80(%rsp) +movl %r8d,4+80(%rsp) +movl %eax,8+80(%rsp) +movl %edx,12+80(%rsp) +movl 12(%r10),%edx +shr $32,%rcx +movl 28(%r10),%r8d +movl 8(%r10),%eax +movl %edx,96(%rsp) +movl %ecx,4+96(%rsp) +movl %r8d,8+96(%rsp) +movl %eax,12+96(%rsp) +mov $1634760805,%rdx +mov $857760878,%rcx +mov $2036477234,%r8 +mov $1797285236,%rax +movl %edx,112(%rsp) +movl %ecx,4+112(%rsp) +movl %r8d,8+112(%rsp) +movl %eax,12+112(%rsp) +cmp $256,%r9 +jb ._bytesbetween1and255 +movdqa 112(%rsp),%xmm0 +pshufd $0x55,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm3 +pshufd $0x00,%xmm0,%xmm0 +movdqa %xmm1,128(%rsp) +movdqa %xmm2,144(%rsp) +movdqa %xmm3,160(%rsp) +movdqa %xmm0,176(%rsp) +movdqa 64(%rsp),%xmm0 +pshufd $0xaa,%xmm0,%xmm1 +pshufd $0xff,%xmm0,%xmm2 +pshufd $0x00,%xmm0,%xmm3 +pshufd $0x55,%xmm0,%xmm0 +movdqa %xmm1,192(%rsp) +movdqa %xmm2,208(%rsp) +movdqa %xmm3,224(%rsp) +movdqa %xmm0,240(%rsp) +movdqa 80(%rsp),%xmm0 +pshufd $0xff,%xmm0,%xmm1 +pshufd $0x55,%xmm0,%xmm2 +pshufd $0xaa,%xmm0,%xmm0 +movdqa %xmm1,256(%rsp) +movdqa %xmm2,272(%rsp) +movdqa %xmm0,288(%rsp) +movdqa 96(%rsp),%xmm0 +pshufd $0x00,%xmm0,%xmm1 +pshufd $0xaa,%xmm0,%xmm2 +pshufd $0xff,%xmm0,%xmm0 +movdqa %xmm1,304(%rsp) +movdqa %xmm2,320(%rsp) +movdqa %xmm0,336(%rsp) + +.p2align 4 +._bytesatleast256: +movq 472(%rsp),%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,352(%rsp) +movl %ecx,368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,4+352(%rsp) +movl 
%ecx,4+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,8+352(%rsp) +movl %ecx,8+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,12+352(%rsp) +movl %ecx,12+368(%rsp) +add $1,%rdx +mov %rdx,%rcx +shr $32,%rcx +movl %edx,80(%rsp) +movl %ecx,4+96(%rsp) +movq %rdx,472(%rsp) +movq %r9,480(%rsp) +mov $20,%rdx +movdqa 128(%rsp),%xmm0 +movdqa 144(%rsp),%xmm1 +movdqa 160(%rsp),%xmm2 +movdqa 320(%rsp),%xmm3 +movdqa 336(%rsp),%xmm4 +movdqa 192(%rsp),%xmm5 +movdqa 208(%rsp),%xmm6 +movdqa 240(%rsp),%xmm7 +movdqa 256(%rsp),%xmm8 +movdqa 272(%rsp),%xmm9 +movdqa 288(%rsp),%xmm10 +movdqa 368(%rsp),%xmm11 +movdqa 176(%rsp),%xmm12 +movdqa 224(%rsp),%xmm13 +movdqa 304(%rsp),%xmm14 +movdqa 352(%rsp),%xmm15 + +.p2align 4 +._mainloop1: +movdqa %xmm1,384(%rsp) +movdqa %xmm2,400(%rsp) +movdqa %xmm13,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm14 +psrld $25,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm7,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm11 +psrld $25,%xmm2 +pxor %xmm2,%xmm11 +movdqa %xmm12,%xmm1 +paddd %xmm14,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm15 +psrld $23,%xmm2 +pxor %xmm2,%xmm15 +movdqa %xmm0,%xmm1 +paddd %xmm11,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm9 +psrld $23,%xmm2 +pxor %xmm2,%xmm9 +movdqa %xmm14,%xmm1 +paddd %xmm15,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm13 +psrld $19,%xmm2 +pxor %xmm2,%xmm13 +movdqa %xmm11,%xmm1 +paddd %xmm9,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm7 +psrld $19,%xmm2 +pxor %xmm2,%xmm7 +movdqa %xmm15,%xmm1 +paddd %xmm13,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa 384(%rsp),%xmm1 +movdqa %xmm12,384(%rsp) +movdqa %xmm9,%xmm2 +paddd %xmm7,%xmm2 +movdqa %xmm2,%xmm12 +pslld $18,%xmm2 +pxor %xmm2,%xmm0 +psrld $14,%xmm12 +pxor %xmm12,%xmm0 +movdqa %xmm5,%xmm2 +paddd %xmm1,%xmm2 +movdqa %xmm2,%xmm12 +pslld $7,%xmm2 +pxor %xmm2,%xmm3 +psrld $25,%xmm12 +pxor %xmm12,%xmm3 +movdqa 400(%rsp),%xmm2 +movdqa %xmm0,400(%rsp) +movdqa %xmm6,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm4 +psrld $25,%xmm12 +pxor %xmm12,%xmm4 +movdqa %xmm1,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm10 +psrld $23,%xmm12 +pxor %xmm12,%xmm10 +movdqa %xmm2,%xmm0 +paddd %xmm4,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm8 +psrld $23,%xmm12 +pxor %xmm12,%xmm8 +movdqa %xmm3,%xmm0 +paddd %xmm10,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm5 +psrld $19,%xmm12 +pxor %xmm12,%xmm5 +movdqa %xmm4,%xmm0 +paddd %xmm8,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm6 +psrld $19,%xmm12 +pxor %xmm12,%xmm6 +movdqa %xmm10,%xmm0 +paddd %xmm5,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa 384(%rsp),%xmm0 +movdqa %xmm1,384(%rsp) +movdqa %xmm4,%xmm1 +paddd %xmm0,%xmm1 +movdqa %xmm1,%xmm12 +pslld $7,%xmm1 +pxor %xmm1,%xmm7 +psrld $25,%xmm12 +pxor %xmm12,%xmm7 +movdqa %xmm8,%xmm1 +paddd %xmm6,%xmm1 +movdqa %xmm1,%xmm12 +pslld $18,%xmm1 +pxor %xmm1,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 400(%rsp),%xmm12 +movdqa %xmm2,400(%rsp) +movdqa %xmm14,%xmm1 +paddd %xmm12,%xmm1 +movdqa %xmm1,%xmm2 +pslld $7,%xmm1 +pxor %xmm1,%xmm5 +psrld $25,%xmm2 +pxor %xmm2,%xmm5 +movdqa %xmm0,%xmm1 +paddd %xmm7,%xmm1 +movdqa %xmm1,%xmm2 +pslld $9,%xmm1 +pxor %xmm1,%xmm10 +psrld $23,%xmm2 +pxor %xmm2,%xmm10 +movdqa %xmm12,%xmm1 +paddd %xmm5,%xmm1 +movdqa %xmm1,%xmm2 
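
The counter setup at ._bytesatleast256 above expands the 64-bit block counter saved at 472(%rsp) into four per-lane counters n through n+3, storing the low halves at 352(%rsp) and the high halves at 368(%rsp), before writing the counter advanced by four back. A scalar sketch of that expansion (expand_block_counters is an illustrative name, not from this source):

#include <stdint.h>

static void
expand_block_counters(uint64_t n, uint32_t lo[4], uint32_t hi[4])
{
    int j;

    for (j = 0; j < 4; j++) {
        lo[j] = (uint32_t) (n + (uint64_t) j);
        hi[j] = (uint32_t) ((n + (uint64_t) j) >> 32);
    }
}
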
+pslld $9,%xmm1 +pxor %xmm1,%xmm8 +psrld $23,%xmm2 +pxor %xmm2,%xmm8 +movdqa %xmm7,%xmm1 +paddd %xmm10,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm4 +psrld $19,%xmm2 +pxor %xmm2,%xmm4 +movdqa %xmm5,%xmm1 +paddd %xmm8,%xmm1 +movdqa %xmm1,%xmm2 +pslld $13,%xmm1 +pxor %xmm1,%xmm14 +psrld $19,%xmm2 +pxor %xmm2,%xmm14 +movdqa %xmm10,%xmm1 +paddd %xmm4,%xmm1 +movdqa %xmm1,%xmm2 +pslld $18,%xmm1 +pxor %xmm1,%xmm0 +psrld $14,%xmm2 +pxor %xmm2,%xmm0 +movdqa 384(%rsp),%xmm1 +movdqa %xmm0,384(%rsp) +movdqa %xmm8,%xmm0 +paddd %xmm14,%xmm0 +movdqa %xmm0,%xmm2 +pslld $18,%xmm0 +pxor %xmm0,%xmm12 +psrld $14,%xmm2 +pxor %xmm2,%xmm12 +movdqa %xmm11,%xmm0 +paddd %xmm1,%xmm0 +movdqa %xmm0,%xmm2 +pslld $7,%xmm0 +pxor %xmm0,%xmm6 +psrld $25,%xmm2 +pxor %xmm2,%xmm6 +movdqa 400(%rsp),%xmm2 +movdqa %xmm12,400(%rsp) +movdqa %xmm3,%xmm0 +paddd %xmm2,%xmm0 +movdqa %xmm0,%xmm12 +pslld $7,%xmm0 +pxor %xmm0,%xmm13 +psrld $25,%xmm12 +pxor %xmm12,%xmm13 +movdqa %xmm1,%xmm0 +paddd %xmm6,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm15 +psrld $23,%xmm12 +pxor %xmm12,%xmm15 +movdqa %xmm2,%xmm0 +paddd %xmm13,%xmm0 +movdqa %xmm0,%xmm12 +pslld $9,%xmm0 +pxor %xmm0,%xmm9 +psrld $23,%xmm12 +pxor %xmm12,%xmm9 +movdqa %xmm6,%xmm0 +paddd %xmm15,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm11 +psrld $19,%xmm12 +pxor %xmm12,%xmm11 +movdqa %xmm13,%xmm0 +paddd %xmm9,%xmm0 +movdqa %xmm0,%xmm12 +pslld $13,%xmm0 +pxor %xmm0,%xmm3 +psrld $19,%xmm12 +pxor %xmm12,%xmm3 +movdqa %xmm15,%xmm0 +paddd %xmm11,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm1 +psrld $14,%xmm12 +pxor %xmm12,%xmm1 +movdqa %xmm9,%xmm0 +paddd %xmm3,%xmm0 +movdqa %xmm0,%xmm12 +pslld $18,%xmm0 +pxor %xmm0,%xmm2 +psrld $14,%xmm12 +pxor %xmm12,%xmm2 +movdqa 384(%rsp),%xmm12 +movdqa 400(%rsp),%xmm0 +sub $2,%rdx +ja ._mainloop1 + +paddd 176(%rsp),%xmm12 +paddd 240(%rsp),%xmm7 +paddd 288(%rsp),%xmm10 +paddd 336(%rsp),%xmm4 +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 0(%rsi),%edx +xorl 4(%rsi),%ecx +xorl 8(%rsi),%r8d +xorl 12(%rsi),%r9d +movl %edx,0(%rdi) +movl %ecx,4(%rdi) +movl %r8d,8(%rdi) +movl %r9d,12(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 64(%rsi),%edx +xorl 68(%rsi),%ecx +xorl 72(%rsi),%r8d +xorl 76(%rsi),%r9d +movl %edx,64(%rdi) +movl %ecx,68(%rdi) +movl %r8d,72(%rdi) +movl %r9d,76(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +pshufd $0x39,%xmm12,%xmm12 +pshufd $0x39,%xmm7,%xmm7 +pshufd $0x39,%xmm10,%xmm10 +pshufd $0x39,%xmm4,%xmm4 +xorl 128(%rsi),%edx +xorl 132(%rsi),%ecx +xorl 136(%rsi),%r8d +xorl 140(%rsi),%r9d +movl %edx,128(%rdi) +movl %ecx,132(%rdi) +movl %r8d,136(%rdi) +movl %r9d,140(%rdi) +movd %xmm12,%rdx +movd %xmm7,%rcx +movd %xmm10,%r8 +movd %xmm4,%r9 +xorl 192(%rsi),%edx +xorl 196(%rsi),%ecx +xorl 200(%rsi),%r8d +xorl 204(%rsi),%r9d +movl %edx,192(%rdi) +movl %ecx,196(%rdi) +movl %r8d,200(%rdi) +movl %r9d,204(%rdi) +paddd 304(%rsp),%xmm14 +paddd 128(%rsp),%xmm0 +paddd 192(%rsp),%xmm5 +paddd 256(%rsp),%xmm8 +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 16(%rsi),%edx +xorl 20(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 28(%rsi),%r9d +movl %edx,16(%rdi) +movl 
%ecx,20(%rdi) +movl %r8d,24(%rdi) +movl %r9d,28(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 80(%rsi),%edx +xorl 84(%rsi),%ecx +xorl 88(%rsi),%r8d +xorl 92(%rsi),%r9d +movl %edx,80(%rdi) +movl %ecx,84(%rdi) +movl %r8d,88(%rdi) +movl %r9d,92(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +pshufd $0x39,%xmm14,%xmm14 +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm5,%xmm5 +pshufd $0x39,%xmm8,%xmm8 +xorl 144(%rsi),%edx +xorl 148(%rsi),%ecx +xorl 152(%rsi),%r8d +xorl 156(%rsi),%r9d +movl %edx,144(%rdi) +movl %ecx,148(%rdi) +movl %r8d,152(%rdi) +movl %r9d,156(%rdi) +movd %xmm14,%rdx +movd %xmm0,%rcx +movd %xmm5,%r8 +movd %xmm8,%r9 +xorl 208(%rsi),%edx +xorl 212(%rsi),%ecx +xorl 216(%rsi),%r8d +xorl 220(%rsi),%r9d +movl %edx,208(%rdi) +movl %ecx,212(%rdi) +movl %r8d,216(%rdi) +movl %r9d,220(%rdi) +paddd 352(%rsp),%xmm15 +paddd 368(%rsp),%xmm11 +paddd 144(%rsp),%xmm1 +paddd 208(%rsp),%xmm6 +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 32(%rsi),%edx +xorl 36(%rsi),%ecx +xorl 40(%rsi),%r8d +xorl 44(%rsi),%r9d +movl %edx,32(%rdi) +movl %ecx,36(%rdi) +movl %r8d,40(%rdi) +movl %r9d,44(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 96(%rsi),%edx +xorl 100(%rsi),%ecx +xorl 104(%rsi),%r8d +xorl 108(%rsi),%r9d +movl %edx,96(%rdi) +movl %ecx,100(%rdi) +movl %r8d,104(%rdi) +movl %r9d,108(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +pshufd $0x39,%xmm15,%xmm15 +pshufd $0x39,%xmm11,%xmm11 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm6,%xmm6 +xorl 160(%rsi),%edx +xorl 164(%rsi),%ecx +xorl 168(%rsi),%r8d +xorl 172(%rsi),%r9d +movl %edx,160(%rdi) +movl %ecx,164(%rdi) +movl %r8d,168(%rdi) +movl %r9d,172(%rdi) +movd %xmm15,%rdx +movd %xmm11,%rcx +movd %xmm1,%r8 +movd %xmm6,%r9 +xorl 224(%rsi),%edx +xorl 228(%rsi),%ecx +xorl 232(%rsi),%r8d +xorl 236(%rsi),%r9d +movl %edx,224(%rdi) +movl %ecx,228(%rdi) +movl %r8d,232(%rdi) +movl %r9d,236(%rdi) +paddd 224(%rsp),%xmm13 +paddd 272(%rsp),%xmm9 +paddd 320(%rsp),%xmm3 +paddd 160(%rsp),%xmm2 +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 48(%rsi),%edx +xorl 52(%rsi),%ecx +xorl 56(%rsi),%r8d +xorl 60(%rsi),%r9d +movl %edx,48(%rdi) +movl %ecx,52(%rdi) +movl %r8d,56(%rdi) +movl %r9d,60(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 112(%rsi),%edx +xorl 116(%rsi),%ecx +xorl 120(%rsi),%r8d +xorl 124(%rsi),%r9d +movl %edx,112(%rdi) +movl %ecx,116(%rdi) +movl %r8d,120(%rdi) +movl %r9d,124(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +pshufd $0x39,%xmm13,%xmm13 +pshufd $0x39,%xmm9,%xmm9 +pshufd $0x39,%xmm3,%xmm3 +pshufd $0x39,%xmm2,%xmm2 +xorl 176(%rsi),%edx +xorl 180(%rsi),%ecx +xorl 184(%rsi),%r8d +xorl 188(%rsi),%r9d +movl %edx,176(%rdi) +movl %ecx,180(%rdi) +movl %r8d,184(%rdi) +movl %r9d,188(%rdi) +movd %xmm13,%rdx +movd %xmm9,%rcx +movd %xmm3,%r8 +movd %xmm2,%r9 +xorl 240(%rsi),%edx +xorl 244(%rsi),%ecx +xorl 248(%rsi),%r8d 
+xorl 252(%rsi),%r9d +movl %edx,240(%rdi) +movl %ecx,244(%rdi) +movl %r8d,248(%rdi) +movl %r9d,252(%rdi) +movq 480(%rsp),%r9 +sub $256,%r9 +add $256,%rsi +add $256,%rdi +cmp $256,%r9 +jae ._bytesatleast256 + +cmp $0,%r9 +jbe ._done + +._bytesbetween1and255: +cmp $64,%r9 +jae ._nocopy + +mov %rdi,%rdx +leaq 0(%rsp),%rdi +mov %r9,%rcx +rep movsb +leaq 0(%rsp),%rdi +leaq 0(%rsp),%rsi + +._nocopy: +movq %r9,480(%rsp) +movdqa 112(%rsp),%xmm0 +movdqa 64(%rsp),%xmm1 +movdqa 80(%rsp),%xmm2 +movdqa 96(%rsp),%xmm3 +movdqa %xmm1,%xmm4 +mov $20,%rcx + +.p2align 4 +._mainloop2: +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm3 +pxor %xmm6,%xmm3 +paddd %xmm3,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm3,%xmm3 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm1 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm3,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pxor %xmm6,%xmm0 +paddd %xmm0,%xmm4 +movdqa %xmm0,%xmm5 +movdqa %xmm4,%xmm6 +pslld $7,%xmm4 +psrld $25,%xmm6 +pxor %xmm4,%xmm1 +pxor %xmm6,%xmm1 +paddd %xmm1,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $9,%xmm5 +psrld $23,%xmm6 +pxor %xmm5,%xmm2 +pshufd $0x93,%xmm1,%xmm1 +pxor %xmm6,%xmm2 +paddd %xmm2,%xmm4 +movdqa %xmm2,%xmm5 +movdqa %xmm4,%xmm6 +pslld $13,%xmm4 +psrld $19,%xmm6 +pxor %xmm4,%xmm3 +pshufd $0x4e,%xmm2,%xmm2 +pxor %xmm6,%xmm3 +sub $4,%rcx +paddd %xmm3,%xmm5 +movdqa %xmm1,%xmm4 +movdqa %xmm5,%xmm6 +pslld $18,%xmm5 +pxor %xmm7,%xmm7 +psrld $14,%xmm6 +pxor %xmm5,%xmm0 +pshufd $0x39,%xmm3,%xmm3 +pxor %xmm6,%xmm0 +ja ._mainloop2 + +paddd 112(%rsp),%xmm0 +paddd 64(%rsp),%xmm1 +paddd 80(%rsp),%xmm2 +paddd 96(%rsp),%xmm3 +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 0(%rsi),%ecx +xorl 48(%rsi),%r8d +xorl 32(%rsi),%r9d +xorl 16(%rsi),%eax +movl %ecx,0(%rdi) +movl %r8d,48(%rdi) +movl %r9d,32(%rdi) +movl %eax,16(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 
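
For a final fragment shorter than 64 bytes, ._bytesbetween1and255 above stages the input in the 64-byte scratch area at 0(%rsp) with rep movsb so the single-block code can read and write whole blocks, and the valid prefix is copied back out after ._bytesatleast64. A sketch of that staging under the stated assumption (block_xor stands in for the single-block keystream XOR and is not a function in this source):

#include <string.h>

static void
xor_final_partial_block(unsigned char *c, const unsigned char *m,
                        size_t left, void (*block_xor)(unsigned char buf[64]))
{
    unsigned char buf[64] = { 0 };

    memcpy(buf, m, left); /* stage the short input in a full block */
    block_xor(buf);       /* transform one whole 64-byte block in place */
    memcpy(c, buf, left); /* emit only the bytes that really exist */
}
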
+pshufd $0x39,%xmm3,%xmm3 +xorl 20(%rsi),%ecx +xorl 4(%rsi),%r8d +xorl 52(%rsi),%r9d +xorl 36(%rsi),%eax +movl %ecx,20(%rdi) +movl %r8d,4(%rdi) +movl %r9d,52(%rdi) +movl %eax,36(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +pshufd $0x39,%xmm0,%xmm0 +pshufd $0x39,%xmm1,%xmm1 +pshufd $0x39,%xmm2,%xmm2 +pshufd $0x39,%xmm3,%xmm3 +xorl 40(%rsi),%ecx +xorl 24(%rsi),%r8d +xorl 8(%rsi),%r9d +xorl 56(%rsi),%eax +movl %ecx,40(%rdi) +movl %r8d,24(%rdi) +movl %r9d,8(%rdi) +movl %eax,56(%rdi) +movd %xmm0,%rcx +movd %xmm1,%r8 +movd %xmm2,%r9 +movd %xmm3,%rax +xorl 60(%rsi),%ecx +xorl 44(%rsi),%r8d +xorl 28(%rsi),%r9d +xorl 12(%rsi),%eax +movl %ecx,60(%rdi) +movl %r8d,44(%rdi) +movl %r9d,28(%rdi) +movl %eax,12(%rdi) +movq 480(%rsp),%r9 +movq 472(%rsp),%rcx +add $1,%rcx +mov %rcx,%r8 +shr $32,%r8 +movl %ecx,80(%rsp) +movl %r8d,4+96(%rsp) +movq %rcx,472(%rsp) +cmp $64,%r9 +ja ._bytesatleast65 +jae ._bytesatleast64 + +mov %rdi,%rsi +mov %rdx,%rdi +mov %r9,%rcx +rep movsb + +._bytesatleast64: +._done: +movq 416(%rsp),%r11 +movq 424(%rsp),%r12 +movq 432(%rsp),%r13 +movq 440(%rsp),%r14 +movq 448(%rsp),%r15 +movq 456(%rsp),%rbx +movq 464(%rsp),%rbp +add %r11,%rsp +xor %rax,%rax +mov %rsi,%rdx +ret + +._bytesatleast65: +sub $64,%r9 +add $64,%rdi +add $64,%rsi +jmp ._bytesbetween1and255 + +#endif + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c index 0a6fee0f3e..504727038f 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.c @@ -1,31 +1,31 @@ - -#include - -#include "utils.h" - -#include "../stream_salsa20.h" -#include "salsa20_xmm6.h" - -#ifdef HAVE_AMD64_ASM - -#ifdef __cplusplus -extern "C" { -#endif -extern int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k); - -extern int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, - const unsigned char *n, - uint64_t ic, const unsigned char *k); -#ifdef __cplusplus -} -#endif - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6_implementation = { - SODIUM_C99(.stream =) stream_salsa20_xmm6, - SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic, - }; - -#endif + +#include + +#include "utils.h" + +#include "../stream_salsa20.h" +#include "salsa20_xmm6.h" + +#ifdef HAVE_AMD64_ASM + +#ifdef __cplusplus +extern "C" { +#endif +extern int stream_salsa20_xmm6(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k); + +extern int stream_salsa20_xmm6_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, + const unsigned char *n, + uint64_t ic, const unsigned char *k); +#ifdef __cplusplus +} +#endif + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6_implementation = { + SODIUM_C99(.stream =) stream_salsa20_xmm6, + SODIUM_C99(.stream_xor_ic =) stream_salsa20_xmm6_xor_ic, + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h index d38473a9ff..3ccbb5e8e6 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6/salsa20_xmm6.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include 
"crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c index 18d4773ec9..95bb63fd13 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.c @@ -1,131 +1,131 @@ - -#include -#include -#include - -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ - defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# pragma GCC target("ssse3") -# pragma GCC target("sse4.1") -# pragma GCC target("avx2") -# endif - -#include -#include -#include -#include - -# include "../stream_salsa20.h" -# include "salsa20_xmm6int-avx2.h" - -# define ROUNDS 20 - -typedef struct salsa_ctx { - uint32_t input[16]; -} salsa_ctx; - -static const int TR[16] = { - 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 -}; - -static void -salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) -{ - ctx->input[TR[1]] = LOAD32_LE(k + 0); - ctx->input[TR[2]] = LOAD32_LE(k + 4); - ctx->input[TR[3]] = LOAD32_LE(k + 8); - ctx->input[TR[4]] = LOAD32_LE(k + 12); - ctx->input[TR[11]] = LOAD32_LE(k + 16); - ctx->input[TR[12]] = LOAD32_LE(k + 20); - ctx->input[TR[13]] = LOAD32_LE(k + 24); - ctx->input[TR[14]] = LOAD32_LE(k + 28); - ctx->input[TR[0]] = 0x61707865; - ctx->input[TR[5]] = 0x3320646e; - ctx->input[TR[10]] = 0x79622d32; - ctx->input[TR[15]] = 0x6b206574; -} - -static void -salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[TR[6]] = LOAD32_LE(iv + 0); - ctx->input[TR[7]] = LOAD32_LE(iv + 4); - ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); -} - -static void -salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - -#include "u8.h" -#include "u4.h" -#include "u1.h" -#include "u0.h" -} - -static int -stream_avx2(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct salsa_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - salsa20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_avx2_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct salsa_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) ic; - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, ic_bytes); - salsa20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_avx2_implementation = { - SODIUM_C99(.stream =) stream_avx2, - SODIUM_C99(.stream_xor_ic =) stream_avx2_xor_ic - }; - -#endif + +#include +#include +#include + +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \ + defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# pragma GCC target("ssse3") +# pragma GCC target("sse4.1") +# pragma GCC target("avx2") +# endif + +#include +#include +#include +#include + +# include "../stream_salsa20.h" +# include "salsa20_xmm6int-avx2.h" + +# define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + +#include "u8.h" +#include "u4.h" +#include "u1.h" +#include "u0.h" +} + +static int +stream_avx2(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_avx2_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct salsa_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) ic; + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, ic_bytes); + salsa20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_avx2_implementation = { + SODIUM_C99(.stream =) stream_avx2, + SODIUM_C99(.stream_xor_ic =) stream_avx2_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h index 0924e9baff..a84ea0d2d0 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-avx2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_avx2_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_avx2_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c index d8e53a6554..41dc8193fc 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.c @@ -1,122 +1,122 @@ - -#include -#include -#include - -#include "crypto_stream_salsa20.h" -#include "private/common.h" -#include "private/sse2_64_32.h" -#include "utils.h" - -#ifdef HAVE_EMMINTRIN_H - -# ifdef __GNUC__ -# pragma GCC target("sse2") -# endif -# include - -# include "../stream_salsa20.h" -# include "salsa20_xmm6int-sse2.h" - -# define ROUNDS 20 - -typedef struct salsa_ctx { - uint32_t input[16]; -} salsa_ctx; - -static const int TR[16] = { - 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 -}; - -static void -salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) -{ - ctx->input[TR[1]] = LOAD32_LE(k + 0); - ctx->input[TR[2]] = LOAD32_LE(k + 4); - ctx->input[TR[3]] = LOAD32_LE(k + 8); - ctx->input[TR[4]] = LOAD32_LE(k + 12); - ctx->input[TR[11]] = LOAD32_LE(k + 16); - ctx->input[TR[12]] = LOAD32_LE(k + 20); - ctx->input[TR[13]] = LOAD32_LE(k + 24); - ctx->input[TR[14]] = LOAD32_LE(k + 28); - ctx->input[TR[0]] = 
0x61707865; - ctx->input[TR[5]] = 0x3320646e; - ctx->input[TR[10]] = 0x79622d32; - ctx->input[TR[15]] = 0x6b206574; -} - -static void -salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) -{ - ctx->input[TR[6]] = LOAD32_LE(iv + 0); - ctx->input[TR[7]] = LOAD32_LE(iv + 4); - ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); - ctx->input[TR[9]] = counter == NULL ? 0 : LOAD32_LE(counter + 4); -} - -static void -salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, - unsigned long long bytes) -{ - uint32_t * const x = &ctx->input[0]; - - if (!bytes) { - return; /* LCOV_EXCL_LINE */ - } - -#include "u4.h" -#include "u1.h" -#include "u0.h" -} - -static int -stream_sse2(unsigned char *c, unsigned long long clen, const unsigned char *n, - const unsigned char *k) -{ - struct salsa_ctx ctx; - - if (!clen) { - return 0; - } - COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, NULL); - memset(c, 0, clen); - salsa20_encrypt_bytes(&ctx, c, c, clen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -static int -stream_sse2_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, uint64_t ic, - const unsigned char *k) -{ - struct salsa_ctx ctx; - uint8_t ic_bytes[8]; - uint32_t ic_high; - uint32_t ic_low; - - if (!mlen) { - return 0; - } - ic_high = (uint32_t) (ic >> 32); - ic_low = (uint32_t) (ic); - STORE32_LE(&ic_bytes[0], ic_low); - STORE32_LE(&ic_bytes[4], ic_high); - salsa_keysetup(&ctx, k); - salsa_ivsetup(&ctx, n, ic_bytes); - salsa20_encrypt_bytes(&ctx, m, c, mlen); - sodium_memzero(&ctx, sizeof ctx); - - return 0; -} - -struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_sse2_implementation = { - SODIUM_C99(.stream =) stream_sse2, - SODIUM_C99(.stream_xor_ic =) stream_sse2_xor_ic - }; - -#endif + +#include +#include +#include + +#include "crypto_stream_salsa20.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#ifdef HAVE_EMMINTRIN_H + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# endif +# include + +# include "../stream_salsa20.h" +# include "salsa20_xmm6int-sse2.h" + +# define ROUNDS 20 + +typedef struct salsa_ctx { + uint32_t input[16]; +} salsa_ctx; + +static const int TR[16] = { + 0, 5, 10, 15, 12, 1, 6, 11, 8, 13, 2, 7, 4, 9, 14, 3 +}; + +static void +salsa_keysetup(salsa_ctx *ctx, const uint8_t *k) +{ + ctx->input[TR[1]] = LOAD32_LE(k + 0); + ctx->input[TR[2]] = LOAD32_LE(k + 4); + ctx->input[TR[3]] = LOAD32_LE(k + 8); + ctx->input[TR[4]] = LOAD32_LE(k + 12); + ctx->input[TR[11]] = LOAD32_LE(k + 16); + ctx->input[TR[12]] = LOAD32_LE(k + 20); + ctx->input[TR[13]] = LOAD32_LE(k + 24); + ctx->input[TR[14]] = LOAD32_LE(k + 28); + ctx->input[TR[0]] = 0x61707865; + ctx->input[TR[5]] = 0x3320646e; + ctx->input[TR[10]] = 0x79622d32; + ctx->input[TR[15]] = 0x6b206574; +} + +static void +salsa_ivsetup(salsa_ctx *ctx, const uint8_t *iv, const uint8_t *counter) +{ + ctx->input[TR[6]] = LOAD32_LE(iv + 0); + ctx->input[TR[7]] = LOAD32_LE(iv + 4); + ctx->input[TR[8]] = counter == NULL ? 0 : LOAD32_LE(counter + 0); + ctx->input[TR[9]] = counter == NULL ? 
0 : LOAD32_LE(counter + 4); +} + +static void +salsa20_encrypt_bytes(salsa_ctx *ctx, const uint8_t *m, uint8_t *c, + unsigned long long bytes) +{ + uint32_t * const x = &ctx->input[0]; + + if (!bytes) { + return; /* LCOV_EXCL_LINE */ + } + +#include "u4.h" +#include "u1.h" +#include "u0.h" +} + +static int +stream_sse2(unsigned char *c, unsigned long long clen, const unsigned char *n, + const unsigned char *k) +{ + struct salsa_ctx ctx; + + if (!clen) { + return 0; + } + COMPILER_ASSERT(crypto_stream_salsa20_KEYBYTES == 256 / 8); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, NULL); + memset(c, 0, clen); + salsa20_encrypt_bytes(&ctx, c, c, clen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +static int +stream_sse2_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, uint64_t ic, + const unsigned char *k) +{ + struct salsa_ctx ctx; + uint8_t ic_bytes[8]; + uint32_t ic_high; + uint32_t ic_low; + + if (!mlen) { + return 0; + } + ic_high = (uint32_t) (ic >> 32); + ic_low = (uint32_t) (ic); + STORE32_LE(&ic_bytes[0], ic_low); + STORE32_LE(&ic_bytes[4], ic_high); + salsa_keysetup(&ctx, k); + salsa_ivsetup(&ctx, n, ic_bytes); + salsa20_encrypt_bytes(&ctx, m, c, mlen); + sodium_memzero(&ctx, sizeof ctx); + + return 0; +} + +struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_sse2_implementation = { + SODIUM_C99(.stream =) stream_sse2, + SODIUM_C99(.stream_xor_ic =) stream_sse2_xor_ic + }; + +#endif diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h index ed52a8bcbe..627f3f80fd 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/salsa20_xmm6int-sse2.h @@ -1,8 +1,8 @@ - -#include - -#include "../stream_salsa20.h" -#include "crypto_stream_salsa20.h" - -extern struct crypto_stream_salsa20_implementation - crypto_stream_salsa20_xmm6int_sse2_implementation; + +#include + +#include "../stream_salsa20.h" +#include "crypto_stream_salsa20.h" + +extern struct crypto_stream_salsa20_implementation + crypto_stream_salsa20_xmm6int_sse2_implementation; diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h index b2d4168058..e2634b4a3e 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u0.h @@ -1,195 +1,195 @@ -if (bytes > 0) { - __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); - __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); - __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); - __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i b0, b1, b2, b3, b4, b5, b6, b7; - uint8_t partialblock[64]; - - unsigned int i; - - a0 = diag1; - for (i = 0; i < ROUNDS; i += 4) { - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, 
a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - } - - diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); - diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); - diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); - diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((__m128i *) (x + 12))); - -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - do { \ - uint32_t in##A = _mm_cvtsi128_si32(diag0); \ - uint32_t in##B = _mm_cvtsi128_si32(diag1); \ - uint32_t in##C = _mm_cvtsi128_si32(diag2); \ - uint32_t in##D = _mm_cvtsi128_si32(diag3); \ - 
diag0 = _mm_shuffle_epi32(diag0, 0x39); \ - diag1 = _mm_shuffle_epi32(diag1, 0x39); \ - diag2 = _mm_shuffle_epi32(diag2, 0x39); \ - diag3 = _mm_shuffle_epi32(diag3, 0x39); \ - *(uint32_t *) (partialblock + (A * 4)) = in##A; \ - *(uint32_t *) (partialblock + (B * 4)) = in##B; \ - *(uint32_t *) (partialblock + (C * 4)) = in##C; \ - *(uint32_t *) (partialblock + (D * 4)) = in##D; \ - } while (0) - -#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) - - ONEQUAD(0, 12, 8, 4); - ONEQUAD(5, 1, 13, 9); - ONEQUAD(10, 6, 2, 14); - ONEQUAD(15, 11, 7, 3); - -#undef ONEQUAD -#undef ONEQUAD_SHUFFLE - - for (i = 0; i < bytes; i++) { - c[i] = m[i] ^ partialblock[i]; - } - - sodium_memzero(partialblock, sizeof partialblock); -} +if (bytes > 0) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + uint8_t partialblock[64]; + + unsigned int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = 
_mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + *(uint32_t *) (partialblock + (A * 4)) = in##A; \ + *(uint32_t *) (partialblock + (B * 4)) = in##B; \ + *(uint32_t *) (partialblock + (C * 4)) = in##C; \ + *(uint32_t *) (partialblock + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + for (i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + sodium_memzero(partialblock, sizeof partialblock); +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h index c245d9565f..e246027e5c 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u1.h @@ -1,207 +1,207 @@ -while (bytes >= 64) { - __m128i diag0 = _mm_loadu_si128((__m128i *) (x + 0)); - __m128i diag1 = _mm_loadu_si128((__m128i *) (x + 4)); - __m128i diag2 = _mm_loadu_si128((__m128i *) (x + 8)); - __m128i diag3 = _mm_loadu_si128((__m128i *) (x + 12)); - __m128i a0, a1, a2, a3, a4, a5, a6, a7; - __m128i b0, b1, b2, b3, b4, b5, b6, b7; - - uint32_t in8; - uint32_t in9; - int i; - - a0 = diag1; - for (i = 0; i < ROUNDS; i += 4) { - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 
23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - - a0 = _mm_add_epi32(a0, diag0); - a1 = diag0; - b0 = a0; - a0 = _mm_slli_epi32(a0, 7); - b0 = _mm_srli_epi32(b0, 25); - diag3 = _mm_xor_si128(diag3, a0); - - diag3 = _mm_xor_si128(diag3, b0); - - a1 = _mm_add_epi32(a1, diag3); - a2 = diag3; - b1 = a1; - a1 = _mm_slli_epi32(a1, 9); - b1 = _mm_srli_epi32(b1, 23); - diag2 = _mm_xor_si128(diag2, a1); - diag3 = _mm_shuffle_epi32(diag3, 0x93); - diag2 = _mm_xor_si128(diag2, b1); - - a2 = _mm_add_epi32(a2, diag2); - a3 = diag2; - b2 = a2; - a2 = _mm_slli_epi32(a2, 13); - b2 = _mm_srli_epi32(b2, 19); - diag1 = _mm_xor_si128(diag1, a2); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag1 = _mm_xor_si128(diag1, b2); - - a3 = _mm_add_epi32(a3, diag1); - a4 = diag3; - b3 = a3; - a3 = _mm_slli_epi32(a3, 18); - b3 = _mm_srli_epi32(b3, 14); - diag0 = _mm_xor_si128(diag0, a3); - diag1 = _mm_shuffle_epi32(diag1, 0x39); - diag0 = _mm_xor_si128(diag0, b3); - - a4 = _mm_add_epi32(a4, diag0); - a5 = diag0; - b4 = a4; - a4 = _mm_slli_epi32(a4, 7); - b4 = _mm_srli_epi32(b4, 25); - diag1 = _mm_xor_si128(diag1, a4); - - diag1 = _mm_xor_si128(diag1, b4); - - a5 = _mm_add_epi32(a5, diag1); - a6 = diag1; - b5 = a5; - a5 = _mm_slli_epi32(a5, 9); - b5 = _mm_srli_epi32(b5, 23); - diag2 = _mm_xor_si128(diag2, a5); - diag1 = _mm_shuffle_epi32(diag1, 0x93); - diag2 = _mm_xor_si128(diag2, b5); - - a6 = _mm_add_epi32(a6, diag2); - a7 = diag2; - b6 = a6; - a6 = _mm_slli_epi32(a6, 13); - b6 = _mm_srli_epi32(b6, 19); - diag3 = _mm_xor_si128(diag3, a6); - diag2 = _mm_shuffle_epi32(diag2, 0x4e); - diag3 = _mm_xor_si128(diag3, b6); - - a7 = _mm_add_epi32(a7, diag3); - a0 = diag1; - b7 = a7; - a7 = _mm_slli_epi32(a7, 18); - b7 = _mm_srli_epi32(b7, 14); - diag0 = _mm_xor_si128(diag0, a7); - diag3 = _mm_shuffle_epi32(diag3, 0x39); - diag0 = _mm_xor_si128(diag0, b7); - } - - diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((__m128i *) (x + 0))); - diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((__m128i *) (x + 4))); - diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((__m128i *) (x + 8))); - diag3 = _mm_add_epi32(diag3, 
_mm_loadu_si128((__m128i *) (x + 12))); - -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - do { \ - uint32_t in##A = _mm_cvtsi128_si32(diag0); \ - uint32_t in##B = _mm_cvtsi128_si32(diag1); \ - uint32_t in##C = _mm_cvtsi128_si32(diag2); \ - uint32_t in##D = _mm_cvtsi128_si32(diag3); \ - diag0 = _mm_shuffle_epi32(diag0, 0x39); \ - diag1 = _mm_shuffle_epi32(diag1, 0x39); \ - diag2 = _mm_shuffle_epi32(diag2, 0x39); \ - diag3 = _mm_shuffle_epi32(diag3, 0x39); \ - in##A ^= *(uint32_t *) (m + (A * 4)); \ - in##B ^= *(uint32_t *) (m + (B * 4)); \ - in##C ^= *(uint32_t *) (m + (C * 4)); \ - in##D ^= *(uint32_t *) (m + (D * 4)); \ - *(uint32_t *) (c + (A * 4)) = in##A; \ - *(uint32_t *) (c + (B * 4)) = in##B; \ - *(uint32_t *) (c + (C * 4)) = in##C; \ - *(uint32_t *) (c + (D * 4)) = in##D; \ - } while (0) - -#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) - - ONEQUAD(0, 12, 8, 4); - ONEQUAD(5, 1, 13, 9); - ONEQUAD(10, 6, 2, 14); - ONEQUAD(15, 11, 7, 3); - -#undef ONEQUAD -#undef ONEQUAD_SHUFFLE - - in8 = x[8]; - in9 = x[13]; - in8++; - if (in8 == 0) { - in9++; - } - x[8] = in8; - x[13] = in9; - - c += 64; - m += 64; - bytes -= 64; -} +while (bytes >= 64) { + __m128i diag0 = _mm_loadu_si128((const __m128i *) (x + 0)); + __m128i diag1 = _mm_loadu_si128((const __m128i *) (x + 4)); + __m128i diag2 = _mm_loadu_si128((const __m128i *) (x + 8)); + __m128i diag3 = _mm_loadu_si128((const __m128i *) (x + 12)); + __m128i a0, a1, a2, a3, a4, a5, a6, a7; + __m128i b0, b1, b2, b3, b4, b5, b6, b7; + + uint32_t in8; + uint32_t in9; + int i; + + a0 = diag1; + for (i = 0; i < ROUNDS; i += 4) { + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + + diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + + a0 = _mm_add_epi32(a0, diag0); + a1 = diag0; + b0 = a0; + a0 = _mm_slli_epi32(a0, 7); + b0 = _mm_srli_epi32(b0, 25); + diag3 = _mm_xor_si128(diag3, a0); + 
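+        /* [editor's note, not part of the upstream change] The add / slli /
+         * srli / xor / xor pattern repeated throughout this loop is the SSE2
+         * idiom for a 32-bit vector rotate, which the ISA lacks as a single
+         * instruction: the XOR above folds in the left-shifted half and the
+         * XOR below folds in the right-shifted half, and since the two
+         * halves share no bits, the pair of XORs equals one XOR with the
+         * rotated sum. A scalar sketch of one such step (helper name
+         * hypothetical; diag names refer to values before this round's
+         * shuffles):
+         *
+         *     static uint32_t rotl32(uint32_t v, int k)
+         *     {
+         *         return (v << k) | (v >> (32 - k));
+         *     }
+         *     // one quarter-round operation per 32-bit lane, e.g. here:
+         *     // diag3 ^= rotl32(diag1 + diag0, 7);
+         */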
+ diag3 = _mm_xor_si128(diag3, b0); + + a1 = _mm_add_epi32(a1, diag3); + a2 = diag3; + b1 = a1; + a1 = _mm_slli_epi32(a1, 9); + b1 = _mm_srli_epi32(b1, 23); + diag2 = _mm_xor_si128(diag2, a1); + diag3 = _mm_shuffle_epi32(diag3, 0x93); + diag2 = _mm_xor_si128(diag2, b1); + + a2 = _mm_add_epi32(a2, diag2); + a3 = diag2; + b2 = a2; + a2 = _mm_slli_epi32(a2, 13); + b2 = _mm_srli_epi32(b2, 19); + diag1 = _mm_xor_si128(diag1, a2); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag1 = _mm_xor_si128(diag1, b2); + + a3 = _mm_add_epi32(a3, diag1); + a4 = diag3; + b3 = a3; + a3 = _mm_slli_epi32(a3, 18); + b3 = _mm_srli_epi32(b3, 14); + diag0 = _mm_xor_si128(diag0, a3); + diag1 = _mm_shuffle_epi32(diag1, 0x39); + diag0 = _mm_xor_si128(diag0, b3); + + a4 = _mm_add_epi32(a4, diag0); + a5 = diag0; + b4 = a4; + a4 = _mm_slli_epi32(a4, 7); + b4 = _mm_srli_epi32(b4, 25); + diag1 = _mm_xor_si128(diag1, a4); + + diag1 = _mm_xor_si128(diag1, b4); + + a5 = _mm_add_epi32(a5, diag1); + a6 = diag1; + b5 = a5; + a5 = _mm_slli_epi32(a5, 9); + b5 = _mm_srli_epi32(b5, 23); + diag2 = _mm_xor_si128(diag2, a5); + diag1 = _mm_shuffle_epi32(diag1, 0x93); + diag2 = _mm_xor_si128(diag2, b5); + + a6 = _mm_add_epi32(a6, diag2); + a7 = diag2; + b6 = a6; + a6 = _mm_slli_epi32(a6, 13); + b6 = _mm_srli_epi32(b6, 19); + diag3 = _mm_xor_si128(diag3, a6); + diag2 = _mm_shuffle_epi32(diag2, 0x4e); + diag3 = _mm_xor_si128(diag3, b6); + + a7 = _mm_add_epi32(a7, diag3); + a0 = diag1; + b7 = a7; + a7 = _mm_slli_epi32(a7, 18); + b7 = _mm_srli_epi32(b7, 14); + diag0 = _mm_xor_si128(diag0, a7); + diag3 = _mm_shuffle_epi32(diag3, 0x39); + diag0 = _mm_xor_si128(diag0, b7); + } + + diag0 = _mm_add_epi32(diag0, _mm_loadu_si128((const __m128i *) (x + 0))); + diag1 = _mm_add_epi32(diag1, _mm_loadu_si128((const __m128i *) (x + 4))); + diag2 = _mm_add_epi32(diag2, _mm_loadu_si128((const __m128i *) (x + 8))); + diag3 = _mm_add_epi32(diag3, _mm_loadu_si128((const __m128i *) (x + 12))); + +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + do { \ + uint32_t in##A = _mm_cvtsi128_si32(diag0); \ + uint32_t in##B = _mm_cvtsi128_si32(diag1); \ + uint32_t in##C = _mm_cvtsi128_si32(diag2); \ + uint32_t in##D = _mm_cvtsi128_si32(diag3); \ + diag0 = _mm_shuffle_epi32(diag0, 0x39); \ + diag1 = _mm_shuffle_epi32(diag1, 0x39); \ + diag2 = _mm_shuffle_epi32(diag2, 0x39); \ + diag3 = _mm_shuffle_epi32(diag3, 0x39); \ + in##A ^= *(const uint32_t *) (m + (A * 4)); \ + in##B ^= *(const uint32_t *) (m + (B * 4)); \ + in##C ^= *(const uint32_t *) (m + (C * 4)); \ + in##D ^= *(const uint32_t *) (m + (D * 4)); \ + *(uint32_t *) (c + (A * 4)) = in##A; \ + *(uint32_t *) (c + (B * 4)) = in##B; \ + *(uint32_t *) (c + (C * 4)) = in##C; \ + *(uint32_t *) (c + (D * 4)) = in##D; \ + } while (0) + +#define ONEQUAD(A, B, C, D) ONEQUAD_SHUFFLE(A, B, C, D) + + ONEQUAD(0, 12, 8, 4); + ONEQUAD(5, 1, 13, 9); + ONEQUAD(10, 6, 2, 14); + ONEQUAD(15, 11, 7, 3); + +#undef ONEQUAD +#undef ONEQUAD_SHUFFLE + + in8 = x[8]; + in9 = x[13]; + in8++; + if (in8 == 0) { + in9++; + } + x[8] = in8; + x[13] = in9; + + c += 64; + m += 64; + bytes -= 64; +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h index 61d935fc90..50a59e8c25 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u4.h @@ -1,547 +1,547 @@ -if (bytes >= 256) { - __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, - y15; - __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, 
z14, - z15; - __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, - orig9, orig10, orig11, orig12, orig13, orig14, orig15; - - uint32_t in8; - uint32_t in9; - int i; - - /* element broadcast immediate for _mm_shuffle_epi32 are in order: - 0x00, 0x55, 0xaa, 0xff */ - z0 = _mm_loadu_si128((__m128i *) (x + 0)); - z5 = _mm_shuffle_epi32(z0, 0x55); - z10 = _mm_shuffle_epi32(z0, 0xaa); - z15 = _mm_shuffle_epi32(z0, 0xff); - z0 = _mm_shuffle_epi32(z0, 0x00); - z1 = _mm_loadu_si128((__m128i *) (x + 4)); - z6 = _mm_shuffle_epi32(z1, 0xaa); - z11 = _mm_shuffle_epi32(z1, 0xff); - z12 = _mm_shuffle_epi32(z1, 0x00); - z1 = _mm_shuffle_epi32(z1, 0x55); - z2 = _mm_loadu_si128((__m128i *) (x + 8)); - z7 = _mm_shuffle_epi32(z2, 0xff); - z13 = _mm_shuffle_epi32(z2, 0x55); - z2 = _mm_shuffle_epi32(z2, 0xaa); - /* no z8 -> first half of the nonce, will fill later */ - z3 = _mm_loadu_si128((__m128i *) (x + 12)); - z4 = _mm_shuffle_epi32(z3, 0x00); - z14 = _mm_shuffle_epi32(z3, 0xaa); - z3 = _mm_shuffle_epi32(z3, 0xff); - /* no z9 -> second half of the nonce, will fill later */ - orig0 = z0; - orig1 = z1; - orig2 = z2; - orig3 = z3; - orig4 = z4; - orig5 = z5; - orig6 = z6; - orig7 = z7; - orig10 = z10; - orig11 = z11; - orig12 = z12; - orig13 = z13; - orig14 = z14; - orig15 = z15; - - while (bytes >= 256) { - /* vector implementation for z8 and z9 */ - /* not sure if it helps for only 4 blocks */ - const __m128i addv8 = _mm_set_epi64x(1, 0); - const __m128i addv9 = _mm_set_epi64x(3, 2); - __m128i t8, t9; - uint64_t in89; - - in8 = x[8]; - in9 = x[13]; - in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); - t8 = _mm_set1_epi64x(in89); - t9 = _mm_set1_epi64x(in89); - - z8 = _mm_add_epi64(addv8, t8); - z9 = _mm_add_epi64(addv9, t9); - - t8 = _mm_unpacklo_epi32(z8, z9); - t9 = _mm_unpackhi_epi32(z8, z9); - - z8 = _mm_unpacklo_epi32(t8, t9); - z9 = _mm_unpackhi_epi32(t8, t9); - - orig8 = z8; - orig9 = z9; - - in89 += 4; - - x[8] = in89 & 0xFFFFFFFF; - x[13] = (in89 >> 32) & 0xFFFFFFFF; - - z5 = orig5; - z10 = orig10; - z15 = orig15; - z14 = orig14; - z3 = orig3; - z6 = orig6; - z11 = orig11; - z1 = orig1; - - z7 = orig7; - z13 = orig13; - z2 = orig2; - z9 = orig9; - z0 = orig0; - z12 = orig12; - z4 = orig4; - z8 = orig8; - - for (i = 0; i < ROUNDS; i += 2) { - /* the inner loop is a direct translation (regexp search/replace) - * from the amd64-xmm6 ASM */ - __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, - r14, r15; - - y4 = z12; - y4 = _mm_add_epi32(y4, z0); - r4 = y4; - y4 = _mm_slli_epi32(y4, 7); - z4 = _mm_xor_si128(z4, y4); - r4 = _mm_srli_epi32(r4, 25); - z4 = _mm_xor_si128(z4, r4); - - y9 = z1; - y9 = _mm_add_epi32(y9, z5); - r9 = y9; - y9 = _mm_slli_epi32(y9, 7); - z9 = _mm_xor_si128(z9, y9); - r9 = _mm_srli_epi32(r9, 25); - z9 = _mm_xor_si128(z9, r9); - - y8 = z0; - y8 = _mm_add_epi32(y8, z4); - r8 = y8; - y8 = _mm_slli_epi32(y8, 9); - z8 = _mm_xor_si128(z8, y8); - r8 = _mm_srli_epi32(r8, 23); - z8 = _mm_xor_si128(z8, r8); - - y13 = z5; - y13 = _mm_add_epi32(y13, z9); - r13 = y13; - y13 = _mm_slli_epi32(y13, 9); - z13 = _mm_xor_si128(z13, y13); - r13 = _mm_srli_epi32(r13, 23); - z13 = _mm_xor_si128(z13, r13); - - y12 = z4; - y12 = _mm_add_epi32(y12, z8); - r12 = y12; - y12 = _mm_slli_epi32(y12, 13); - z12 = _mm_xor_si128(z12, y12); - r12 = _mm_srli_epi32(r12, 19); - z12 = _mm_xor_si128(z12, r12); - - y1 = z9; - y1 = _mm_add_epi32(y1, z13); - r1 = y1; - y1 = _mm_slli_epi32(y1, 13); - z1 = _mm_xor_si128(z1, y1); - r1 = _mm_srli_epi32(r1, 19); - z1 = _mm_xor_si128(z1, 
r1); - - y0 = z8; - y0 = _mm_add_epi32(y0, z12); - r0 = y0; - y0 = _mm_slli_epi32(y0, 18); - z0 = _mm_xor_si128(z0, y0); - r0 = _mm_srli_epi32(r0, 14); - z0 = _mm_xor_si128(z0, r0); - - y5 = z13; - y5 = _mm_add_epi32(y5, z1); - r5 = y5; - y5 = _mm_slli_epi32(y5, 18); - z5 = _mm_xor_si128(z5, y5); - r5 = _mm_srli_epi32(r5, 14); - z5 = _mm_xor_si128(z5, r5); - - y14 = z6; - y14 = _mm_add_epi32(y14, z10); - r14 = y14; - y14 = _mm_slli_epi32(y14, 7); - z14 = _mm_xor_si128(z14, y14); - r14 = _mm_srli_epi32(r14, 25); - z14 = _mm_xor_si128(z14, r14); - - y3 = z11; - y3 = _mm_add_epi32(y3, z15); - r3 = y3; - y3 = _mm_slli_epi32(y3, 7); - z3 = _mm_xor_si128(z3, y3); - r3 = _mm_srli_epi32(r3, 25); - z3 = _mm_xor_si128(z3, r3); - - y2 = z10; - y2 = _mm_add_epi32(y2, z14); - r2 = y2; - y2 = _mm_slli_epi32(y2, 9); - z2 = _mm_xor_si128(z2, y2); - r2 = _mm_srli_epi32(r2, 23); - z2 = _mm_xor_si128(z2, r2); - - y7 = z15; - y7 = _mm_add_epi32(y7, z3); - r7 = y7; - y7 = _mm_slli_epi32(y7, 9); - z7 = _mm_xor_si128(z7, y7); - r7 = _mm_srli_epi32(r7, 23); - z7 = _mm_xor_si128(z7, r7); - - y6 = z14; - y6 = _mm_add_epi32(y6, z2); - r6 = y6; - y6 = _mm_slli_epi32(y6, 13); - z6 = _mm_xor_si128(z6, y6); - r6 = _mm_srli_epi32(r6, 19); - z6 = _mm_xor_si128(z6, r6); - - y11 = z3; - y11 = _mm_add_epi32(y11, z7); - r11 = y11; - y11 = _mm_slli_epi32(y11, 13); - z11 = _mm_xor_si128(z11, y11); - r11 = _mm_srli_epi32(r11, 19); - z11 = _mm_xor_si128(z11, r11); - - y10 = z2; - y10 = _mm_add_epi32(y10, z6); - r10 = y10; - y10 = _mm_slli_epi32(y10, 18); - z10 = _mm_xor_si128(z10, y10); - r10 = _mm_srli_epi32(r10, 14); - z10 = _mm_xor_si128(z10, r10); - - y1 = z3; - y1 = _mm_add_epi32(y1, z0); - r1 = y1; - y1 = _mm_slli_epi32(y1, 7); - z1 = _mm_xor_si128(z1, y1); - r1 = _mm_srli_epi32(r1, 25); - z1 = _mm_xor_si128(z1, r1); - - y15 = z7; - y15 = _mm_add_epi32(y15, z11); - r15 = y15; - y15 = _mm_slli_epi32(y15, 18); - z15 = _mm_xor_si128(z15, y15); - r15 = _mm_srli_epi32(r15, 14); - z15 = _mm_xor_si128(z15, r15); - - y6 = z4; - y6 = _mm_add_epi32(y6, z5); - r6 = y6; - y6 = _mm_slli_epi32(y6, 7); - z6 = _mm_xor_si128(z6, y6); - r6 = _mm_srli_epi32(r6, 25); - z6 = _mm_xor_si128(z6, r6); - - y2 = z0; - y2 = _mm_add_epi32(y2, z1); - r2 = y2; - y2 = _mm_slli_epi32(y2, 9); - z2 = _mm_xor_si128(z2, y2); - r2 = _mm_srli_epi32(r2, 23); - z2 = _mm_xor_si128(z2, r2); - - y7 = z5; - y7 = _mm_add_epi32(y7, z6); - r7 = y7; - y7 = _mm_slli_epi32(y7, 9); - z7 = _mm_xor_si128(z7, y7); - r7 = _mm_srli_epi32(r7, 23); - z7 = _mm_xor_si128(z7, r7); - - y3 = z1; - y3 = _mm_add_epi32(y3, z2); - r3 = y3; - y3 = _mm_slli_epi32(y3, 13); - z3 = _mm_xor_si128(z3, y3); - r3 = _mm_srli_epi32(r3, 19); - z3 = _mm_xor_si128(z3, r3); - - y4 = z6; - y4 = _mm_add_epi32(y4, z7); - r4 = y4; - y4 = _mm_slli_epi32(y4, 13); - z4 = _mm_xor_si128(z4, y4); - r4 = _mm_srli_epi32(r4, 19); - z4 = _mm_xor_si128(z4, r4); - - y0 = z2; - y0 = _mm_add_epi32(y0, z3); - r0 = y0; - y0 = _mm_slli_epi32(y0, 18); - z0 = _mm_xor_si128(z0, y0); - r0 = _mm_srli_epi32(r0, 14); - z0 = _mm_xor_si128(z0, r0); - - y5 = z7; - y5 = _mm_add_epi32(y5, z4); - r5 = y5; - y5 = _mm_slli_epi32(y5, 18); - z5 = _mm_xor_si128(z5, y5); - r5 = _mm_srli_epi32(r5, 14); - z5 = _mm_xor_si128(z5, r5); - - y11 = z9; - y11 = _mm_add_epi32(y11, z10); - r11 = y11; - y11 = _mm_slli_epi32(y11, 7); - z11 = _mm_xor_si128(z11, y11); - r11 = _mm_srli_epi32(r11, 25); - z11 = _mm_xor_si128(z11, r11); - - y12 = z14; - y12 = _mm_add_epi32(y12, z15); - r12 = y12; - y12 = _mm_slli_epi32(y12, 7); - z12 = _mm_xor_si128(z12, y12); - 
r12 = _mm_srli_epi32(r12, 25); - z12 = _mm_xor_si128(z12, r12); - - y8 = z10; - y8 = _mm_add_epi32(y8, z11); - r8 = y8; - y8 = _mm_slli_epi32(y8, 9); - z8 = _mm_xor_si128(z8, y8); - r8 = _mm_srli_epi32(r8, 23); - z8 = _mm_xor_si128(z8, r8); - - y13 = z15; - y13 = _mm_add_epi32(y13, z12); - r13 = y13; - y13 = _mm_slli_epi32(y13, 9); - z13 = _mm_xor_si128(z13, y13); - r13 = _mm_srli_epi32(r13, 23); - z13 = _mm_xor_si128(z13, r13); - - y9 = z11; - y9 = _mm_add_epi32(y9, z8); - r9 = y9; - y9 = _mm_slli_epi32(y9, 13); - z9 = _mm_xor_si128(z9, y9); - r9 = _mm_srli_epi32(r9, 19); - z9 = _mm_xor_si128(z9, r9); - - y14 = z12; - y14 = _mm_add_epi32(y14, z13); - r14 = y14; - y14 = _mm_slli_epi32(y14, 13); - z14 = _mm_xor_si128(z14, y14); - r14 = _mm_srli_epi32(r14, 19); - z14 = _mm_xor_si128(z14, r14); - - y10 = z8; - y10 = _mm_add_epi32(y10, z9); - r10 = y10; - y10 = _mm_slli_epi32(y10, 18); - z10 = _mm_xor_si128(z10, y10); - r10 = _mm_srli_epi32(r10, 14); - z10 = _mm_xor_si128(z10, r10); - - y15 = z13; - y15 = _mm_add_epi32(y15, z14); - r15 = y15; - y15 = _mm_slli_epi32(y15, 18); - z15 = _mm_xor_si128(z15, y15); - r15 = _mm_srli_epi32(r15, 14); - z15 = _mm_xor_si128(z15, r15); - } - -/* store data ; this macro replicates the original amd64-xmm6 code */ -#define ONEQUAD_SHUFFLE(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 0); \ - in##B ^= *(uint32_t *) (m + 4); \ - in##C ^= *(uint32_t *) (m + 8); \ - in##D ^= *(uint32_t *) (m + 12); \ - \ - *(uint32_t *) (c + 0) = in##A; \ - *(uint32_t *) (c + 4) = in##B; \ - *(uint32_t *) (c + 8) = in##C; \ - *(uint32_t *) (c + 12) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 64); \ - in##B ^= *(uint32_t *) (m + 68); \ - in##C ^= *(uint32_t *) (m + 72); \ - in##D ^= *(uint32_t *) (m + 76); \ - *(uint32_t *) (c + 64) = in##A; \ - *(uint32_t *) (c + 68) = in##B; \ - *(uint32_t *) (c + 72) = in##C; \ - *(uint32_t *) (c + 76) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - z##A = _mm_shuffle_epi32(z##A, 0x39); \ - z##B = _mm_shuffle_epi32(z##B, 0x39); \ - z##C = _mm_shuffle_epi32(z##C, 0x39); \ - z##D = _mm_shuffle_epi32(z##D, 0x39); \ - \ - in##A ^= *(uint32_t *) (m + 128); \ - in##B ^= *(uint32_t *) (m + 132); \ - in##C ^= *(uint32_t *) (m + 136); \ - in##D ^= *(uint32_t *) (m + 140); \ - *(uint32_t *) (c + 128) = in##A; \ - *(uint32_t *) (c + 132) = in##B; \ - *(uint32_t *) (c + 136) = in##C; \ - *(uint32_t *) (c + 140) = in##D; \ - \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - \ - in##A ^= *(uint32_t *) (m + 192); \ - in##B ^= *(uint32_t *) (m + 196); \ - 
in##C ^= *(uint32_t *) (m + 200); \ - in##D ^= *(uint32_t *) (m + 204); \ - *(uint32_t *) (c + 192) = in##A; \ - *(uint32_t *) (c + 196) = in##B; \ - *(uint32_t *) (c + 200) = in##C; \ - *(uint32_t *) (c + 204) = in##D - -/* store data ; this macro replaces shuffle+mov by a direct extract; not much - * difference */ -#define ONEQUAD_EXTRACT(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - in##A = _mm_cvtsi128_si32(z##A); \ - in##B = _mm_cvtsi128_si32(z##B); \ - in##C = _mm_cvtsi128_si32(z##C); \ - in##D = _mm_cvtsi128_si32(z##D); \ - in##A ^= *(uint32_t *) (m + 0); \ - in##B ^= *(uint32_t *) (m + 4); \ - in##C ^= *(uint32_t *) (m + 8); \ - in##D ^= *(uint32_t *) (m + 12); \ - *(uint32_t *) (c + 0) = in##A; \ - *(uint32_t *) (c + 4) = in##B; \ - *(uint32_t *) (c + 8) = in##C; \ - *(uint32_t *) (c + 12) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 1); \ - in##B = _mm_extract_epi32(z##B, 1); \ - in##C = _mm_extract_epi32(z##C, 1); \ - in##D = _mm_extract_epi32(z##D, 1); \ - \ - in##A ^= *(uint32_t *) (m + 64); \ - in##B ^= *(uint32_t *) (m + 68); \ - in##C ^= *(uint32_t *) (m + 72); \ - in##D ^= *(uint32_t *) (m + 76); \ - *(uint32_t *) (c + 64) = in##A; \ - *(uint32_t *) (c + 68) = in##B; \ - *(uint32_t *) (c + 72) = in##C; \ - *(uint32_t *) (c + 76) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 2); \ - in##B = _mm_extract_epi32(z##B, 2); \ - in##C = _mm_extract_epi32(z##C, 2); \ - in##D = _mm_extract_epi32(z##D, 2); \ - \ - in##A ^= *(uint32_t *) (m + 128); \ - in##B ^= *(uint32_t *) (m + 132); \ - in##C ^= *(uint32_t *) (m + 136); \ - in##D ^= *(uint32_t *) (m + 140); \ - *(uint32_t *) (c + 128) = in##A; \ - *(uint32_t *) (c + 132) = in##B; \ - *(uint32_t *) (c + 136) = in##C; \ - *(uint32_t *) (c + 140) = in##D; \ - \ - in##A = _mm_extract_epi32(z##A, 3); \ - in##B = _mm_extract_epi32(z##B, 3); \ - in##C = _mm_extract_epi32(z##C, 3); \ - in##D = _mm_extract_epi32(z##D, 3); \ - \ - in##A ^= *(uint32_t *) (m + 192); \ - in##B ^= *(uint32_t *) (m + 196); \ - in##C ^= *(uint32_t *) (m + 200); \ - in##D ^= *(uint32_t *) (m + 204); \ - *(uint32_t *) (c + 192) = in##A; \ - *(uint32_t *) (c + 196) = in##B; \ - *(uint32_t *) (c + 200) = in##C; \ - *(uint32_t *) (c + 204) = in##D - -/* store data ; this macro first transpose data in-registers, and then store - * them in memory. much faster with icc. 
*/ -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - z##A = _mm_add_epi32(z##A, orig##A); \ - z##B = _mm_add_epi32(z##B, orig##B); \ - z##C = _mm_add_epi32(z##C, orig##C); \ - z##D = _mm_add_epi32(z##D, orig##D); \ - y##A = _mm_unpacklo_epi32(z##A, z##B); \ - y##B = _mm_unpacklo_epi32(z##C, z##D); \ - y##C = _mm_unpackhi_epi32(z##A, z##B); \ - y##D = _mm_unpackhi_epi32(z##C, z##D); \ - z##A = _mm_unpacklo_epi64(y##A, y##B); \ - z##B = _mm_unpackhi_epi64(y##A, y##B); \ - z##C = _mm_unpacklo_epi64(y##C, y##D); \ - z##D = _mm_unpackhi_epi64(y##C, y##D); \ - y##A = _mm_xor_si128(z##A, _mm_loadu_si128((__m128i *) (m + 0))); \ - _mm_storeu_si128((__m128i *) (c + 0), y##A); \ - y##B = _mm_xor_si128(z##B, _mm_loadu_si128((__m128i *) (m + 64))); \ - _mm_storeu_si128((__m128i *) (c + 64), y##B); \ - y##C = _mm_xor_si128(z##C, _mm_loadu_si128((__m128i *) (m + 128))); \ - _mm_storeu_si128((__m128i *) (c + 128), y##C); \ - y##D = _mm_xor_si128(z##D, _mm_loadu_si128((__m128i *) (m + 192))); \ - _mm_storeu_si128((__m128i *) (c + 192), y##D) - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - - ONEQUAD(0, 1, 2, 3); - m += 16; - c += 16; - ONEQUAD(4, 5, 6, 7); - m += 16; - c += 16; - ONEQUAD(8, 9, 10, 11); - m += 16; - c += 16; - ONEQUAD(12, 13, 14, 15); - m -= 48; - c -= 48; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_EXTRACT -#undef ONEQUAD_SHUFFLE - - bytes -= 256; - c += 256; - m += 256; - } -} +if (bytes >= 256) { + __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14, + z15; + __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8, + orig9, orig10, orig11, orig12, orig13, orig14, orig15; + + uint32_t in8; + uint32_t in9; + int i; + + /* element broadcast immediate for _mm_shuffle_epi32 are in order: + 0x00, 0x55, 0xaa, 0xff */ + z0 = _mm_loadu_si128((const __m128i *) (x + 0)); + z5 = _mm_shuffle_epi32(z0, 0x55); + z10 = _mm_shuffle_epi32(z0, 0xaa); + z15 = _mm_shuffle_epi32(z0, 0xff); + z0 = _mm_shuffle_epi32(z0, 0x00); + z1 = _mm_loadu_si128((const __m128i *) (x + 4)); + z6 = _mm_shuffle_epi32(z1, 0xaa); + z11 = _mm_shuffle_epi32(z1, 0xff); + z12 = _mm_shuffle_epi32(z1, 0x00); + z1 = _mm_shuffle_epi32(z1, 0x55); + z2 = _mm_loadu_si128((const __m128i *) (x + 8)); + z7 = _mm_shuffle_epi32(z2, 0xff); + z13 = _mm_shuffle_epi32(z2, 0x55); + z2 = _mm_shuffle_epi32(z2, 0xaa); + /* no z8 -> first half of the nonce, will fill later */ + z3 = _mm_loadu_si128((const __m128i *) (x + 12)); + z4 = _mm_shuffle_epi32(z3, 0x00); + z14 = _mm_shuffle_epi32(z3, 0xaa); + z3 = _mm_shuffle_epi32(z3, 0xff); + /* no z9 -> second half of the nonce, will fill later */ + orig0 = z0; + orig1 = z1; + orig2 = z2; + orig3 = z3; + orig4 = z4; + orig5 = z5; + orig6 = z6; + orig7 = z7; + orig10 = z10; + orig11 = z11; + orig12 = z12; + orig13 = z13; + orig14 = z14; + orig15 = z15; + + while (bytes >= 256) { + /* vector implementation for z8 and z9 */ + /* not sure if it helps for only 4 blocks */ + const __m128i addv8 = _mm_set_epi64x(1, 0); + const __m128i addv9 = _mm_set_epi64x(3, 2); + __m128i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + t8 = _mm_set1_epi64x(in89); + t9 = _mm_set1_epi64x(in89); + + z8 = _mm_add_epi64(addv8, t8); + z9 = _mm_add_epi64(addv9, t9); + + t8 = _mm_unpacklo_epi32(z8, z9); + t9 = _mm_unpackhi_epi32(z8, z9); + + z8 = _mm_unpacklo_epi32(t8, t9); + z9 = _mm_unpackhi_epi32(t8, t9); + + orig8 = z8; + orig9 = z9; 
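+        /* [editor's note, not part of the upstream change] The unpack dance
+         * above distributes four consecutive 64-bit block counters across
+         * the SIMD lanes: z8 ends up holding the four low 32-bit halves
+         * (state word x[8]) and z9 the four high halves (x[13]). An
+         * equivalent scalar sketch, with hypothetical names:
+         *
+         *     uint32_t lo[4], hi[4];
+         *     int b;
+         *     for (b = 0; b < 4; b++) {
+         *         uint64_t ctr = in89 + (uint64_t) b;
+         *         lo[b] = (uint32_t) ctr;         // lane b of z8
+         *         hi[b] = (uint32_t) (ctr >> 32); // lane b of z9
+         *     }
+         */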
+ + in89 += 4; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm_add_epi32(y4, z0); + r4 = y4; + y4 = _mm_slli_epi32(y4, 7); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 25); + z4 = _mm_xor_si128(z4, r4); + + y9 = z1; + y9 = _mm_add_epi32(y9, z5); + r9 = y9; + y9 = _mm_slli_epi32(y9, 7); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 25); + z9 = _mm_xor_si128(z9, r9); + + y8 = z0; + y8 = _mm_add_epi32(y8, z4); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z5; + y13 = _mm_add_epi32(y13, z9); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y12 = z4; + y12 = _mm_add_epi32(y12, z8); + r12 = y12; + y12 = _mm_slli_epi32(y12, 13); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 19); + z12 = _mm_xor_si128(z12, r12); + + y1 = z9; + y1 = _mm_add_epi32(y1, z13); + r1 = y1; + y1 = _mm_slli_epi32(y1, 13); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 19); + z1 = _mm_xor_si128(z1, r1); + + y0 = z8; + y0 = _mm_add_epi32(y0, z12); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z13; + y5 = _mm_add_epi32(y5, z1); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y14 = z6; + y14 = _mm_add_epi32(y14, z10); + r14 = y14; + y14 = _mm_slli_epi32(y14, 7); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 25); + z14 = _mm_xor_si128(z14, r14); + + y3 = z11; + y3 = _mm_add_epi32(y3, z15); + r3 = y3; + y3 = _mm_slli_epi32(y3, 7); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 25); + z3 = _mm_xor_si128(z3, r3); + + y2 = z10; + y2 = _mm_add_epi32(y2, z14); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z15; + y7 = _mm_add_epi32(y7, z3); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y6 = z14; + y6 = _mm_add_epi32(y6, z2); + r6 = y6; + y6 = _mm_slli_epi32(y6, 13); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 19); + z6 = _mm_xor_si128(z6, r6); + + y11 = z3; + y11 = _mm_add_epi32(y11, z7); + r11 = y11; + y11 = _mm_slli_epi32(y11, 13); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 19); + z11 = _mm_xor_si128(z11, r11); + + y10 = z2; + y10 = _mm_add_epi32(y10, z6); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y1 = z3; + y1 = _mm_add_epi32(y1, z0); + r1 = y1; + y1 = _mm_slli_epi32(y1, 7); + z1 = _mm_xor_si128(z1, y1); + r1 = _mm_srli_epi32(r1, 25); + z1 = _mm_xor_si128(z1, r1); + + y15 = z7; + y15 = _mm_add_epi32(y15, z11); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, 
y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + + y6 = z4; + y6 = _mm_add_epi32(y6, z5); + r6 = y6; + y6 = _mm_slli_epi32(y6, 7); + z6 = _mm_xor_si128(z6, y6); + r6 = _mm_srli_epi32(r6, 25); + z6 = _mm_xor_si128(z6, r6); + + y2 = z0; + y2 = _mm_add_epi32(y2, z1); + r2 = y2; + y2 = _mm_slli_epi32(y2, 9); + z2 = _mm_xor_si128(z2, y2); + r2 = _mm_srli_epi32(r2, 23); + z2 = _mm_xor_si128(z2, r2); + + y7 = z5; + y7 = _mm_add_epi32(y7, z6); + r7 = y7; + y7 = _mm_slli_epi32(y7, 9); + z7 = _mm_xor_si128(z7, y7); + r7 = _mm_srli_epi32(r7, 23); + z7 = _mm_xor_si128(z7, r7); + + y3 = z1; + y3 = _mm_add_epi32(y3, z2); + r3 = y3; + y3 = _mm_slli_epi32(y3, 13); + z3 = _mm_xor_si128(z3, y3); + r3 = _mm_srli_epi32(r3, 19); + z3 = _mm_xor_si128(z3, r3); + + y4 = z6; + y4 = _mm_add_epi32(y4, z7); + r4 = y4; + y4 = _mm_slli_epi32(y4, 13); + z4 = _mm_xor_si128(z4, y4); + r4 = _mm_srli_epi32(r4, 19); + z4 = _mm_xor_si128(z4, r4); + + y0 = z2; + y0 = _mm_add_epi32(y0, z3); + r0 = y0; + y0 = _mm_slli_epi32(y0, 18); + z0 = _mm_xor_si128(z0, y0); + r0 = _mm_srli_epi32(r0, 14); + z0 = _mm_xor_si128(z0, r0); + + y5 = z7; + y5 = _mm_add_epi32(y5, z4); + r5 = y5; + y5 = _mm_slli_epi32(y5, 18); + z5 = _mm_xor_si128(z5, y5); + r5 = _mm_srli_epi32(r5, 14); + z5 = _mm_xor_si128(z5, r5); + + y11 = z9; + y11 = _mm_add_epi32(y11, z10); + r11 = y11; + y11 = _mm_slli_epi32(y11, 7); + z11 = _mm_xor_si128(z11, y11); + r11 = _mm_srli_epi32(r11, 25); + z11 = _mm_xor_si128(z11, r11); + + y12 = z14; + y12 = _mm_add_epi32(y12, z15); + r12 = y12; + y12 = _mm_slli_epi32(y12, 7); + z12 = _mm_xor_si128(z12, y12); + r12 = _mm_srli_epi32(r12, 25); + z12 = _mm_xor_si128(z12, r12); + + y8 = z10; + y8 = _mm_add_epi32(y8, z11); + r8 = y8; + y8 = _mm_slli_epi32(y8, 9); + z8 = _mm_xor_si128(z8, y8); + r8 = _mm_srli_epi32(r8, 23); + z8 = _mm_xor_si128(z8, r8); + + y13 = z15; + y13 = _mm_add_epi32(y13, z12); + r13 = y13; + y13 = _mm_slli_epi32(y13, 9); + z13 = _mm_xor_si128(z13, y13); + r13 = _mm_srli_epi32(r13, 23); + z13 = _mm_xor_si128(z13, r13); + + y9 = z11; + y9 = _mm_add_epi32(y9, z8); + r9 = y9; + y9 = _mm_slli_epi32(y9, 13); + z9 = _mm_xor_si128(z9, y9); + r9 = _mm_srli_epi32(r9, 19); + z9 = _mm_xor_si128(z9, r9); + + y14 = z12; + y14 = _mm_add_epi32(y14, z13); + r14 = y14; + y14 = _mm_slli_epi32(y14, 13); + z14 = _mm_xor_si128(z14, y14); + r14 = _mm_srli_epi32(r14, 19); + z14 = _mm_xor_si128(z14, r14); + + y10 = z8; + y10 = _mm_add_epi32(y10, z9); + r10 = y10; + y10 = _mm_slli_epi32(y10, 18); + z10 = _mm_xor_si128(z10, y10); + r10 = _mm_srli_epi32(r10, 14); + z10 = _mm_xor_si128(z10, r10); + + y15 = z13; + y15 = _mm_add_epi32(y15, z14); + r15 = y15; + y15 = _mm_slli_epi32(y15, 18); + z15 = _mm_xor_si128(z15, y15); + r15 = _mm_srli_epi32(r15, 14); + z15 = _mm_xor_si128(z15, r15); + } + +/* store data ; this macro replicates the original amd64-xmm6 code */ +#define ONEQUAD_SHUFFLE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + 
\ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + z##A = _mm_shuffle_epi32(z##A, 0x39); \ + z##B = _mm_shuffle_epi32(z##B, 0x39); \ + z##C = _mm_shuffle_epi32(z##C, 0x39); \ + z##D = _mm_shuffle_epi32(z##D, 0x39); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro replaces shuffle+mov by a direct extract; not much + * difference */ +#define ONEQUAD_EXTRACT(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + in##A = _mm_cvtsi128_si32(z##A); \ + in##B = _mm_cvtsi128_si32(z##B); \ + in##C = _mm_cvtsi128_si32(z##C); \ + in##D = _mm_cvtsi128_si32(z##D); \ + in##A ^= *(uint32_t *) (m + 0); \ + in##B ^= *(uint32_t *) (m + 4); \ + in##C ^= *(uint32_t *) (m + 8); \ + in##D ^= *(uint32_t *) (m + 12); \ + *(uint32_t *) (c + 0) = in##A; \ + *(uint32_t *) (c + 4) = in##B; \ + *(uint32_t *) (c + 8) = in##C; \ + *(uint32_t *) (c + 12) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 1); \ + in##B = _mm_extract_epi32(z##B, 1); \ + in##C = _mm_extract_epi32(z##C, 1); \ + in##D = _mm_extract_epi32(z##D, 1); \ + \ + in##A ^= *(uint32_t *) (m + 64); \ + in##B ^= *(uint32_t *) (m + 68); \ + in##C ^= *(uint32_t *) (m + 72); \ + in##D ^= *(uint32_t *) (m + 76); \ + *(uint32_t *) (c + 64) = in##A; \ + *(uint32_t *) (c + 68) = in##B; \ + *(uint32_t *) (c + 72) = in##C; \ + *(uint32_t *) (c + 76) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 2); \ + in##B = _mm_extract_epi32(z##B, 2); \ + in##C = _mm_extract_epi32(z##C, 2); \ + in##D = _mm_extract_epi32(z##D, 2); \ + \ + in##A ^= *(uint32_t *) (m + 128); \ + in##B ^= *(uint32_t *) (m + 132); \ + in##C ^= *(uint32_t *) (m + 136); \ + in##D ^= *(uint32_t *) (m + 140); \ + *(uint32_t *) (c + 128) = in##A; \ + *(uint32_t *) (c + 132) = in##B; \ + *(uint32_t *) (c + 136) = in##C; \ + *(uint32_t *) (c + 140) = in##D; \ + \ + in##A = _mm_extract_epi32(z##A, 3); \ + in##B = 
_mm_extract_epi32(z##B, 3); \ + in##C = _mm_extract_epi32(z##C, 3); \ + in##D = _mm_extract_epi32(z##D, 3); \ + \ + in##A ^= *(uint32_t *) (m + 192); \ + in##B ^= *(uint32_t *) (m + 196); \ + in##C ^= *(uint32_t *) (m + 200); \ + in##D ^= *(uint32_t *) (m + 204); \ + *(uint32_t *) (c + 192) = in##A; \ + *(uint32_t *) (c + 196) = in##B; \ + *(uint32_t *) (c + 200) = in##C; \ + *(uint32_t *) (c + 204) = in##D + +/* store data ; this macro first transpose data in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + z##A = _mm_add_epi32(z##A, orig##A); \ + z##B = _mm_add_epi32(z##B, orig##B); \ + z##C = _mm_add_epi32(z##C, orig##C); \ + z##D = _mm_add_epi32(z##D, orig##D); \ + y##A = _mm_unpacklo_epi32(z##A, z##B); \ + y##B = _mm_unpacklo_epi32(z##C, z##D); \ + y##C = _mm_unpackhi_epi32(z##A, z##B); \ + y##D = _mm_unpackhi_epi32(z##C, z##D); \ + z##A = _mm_unpacklo_epi64(y##A, y##B); \ + z##B = _mm_unpackhi_epi64(y##A, y##B); \ + z##C = _mm_unpacklo_epi64(y##C, y##D); \ + z##D = _mm_unpackhi_epi64(y##C, y##D); \ + y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \ + _mm_storeu_si128((__m128i *) (c + 0), y##A); \ + y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \ + _mm_storeu_si128((__m128i *) (c + 64), y##B); \ + y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \ + _mm_storeu_si128((__m128i *) (c + 128), y##C); \ + y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \ + _mm_storeu_si128((__m128i *) (c + 192), y##D) + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_EXTRACT +#undef ONEQUAD_SHUFFLE + + bytes -= 256; + c += 256; + m += 256; + } +} diff --git a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h index 467a961299..ce5fb2664e 100644 --- a/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h +++ b/libs/libsodium/src/crypto_stream/salsa20/xmm6int/u8.h @@ -1,476 +1,477 @@ -if (bytes >= 512) { - __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, - y15; - - /* the naive way seems as fast (if not a bit faster) than the vector way */ - __m256i z0 = _mm256_set1_epi32(x[0]); - __m256i z5 = _mm256_set1_epi32(x[1]); - __m256i z10 = _mm256_set1_epi32(x[2]); - __m256i z15 = _mm256_set1_epi32(x[3]); - __m256i z12 = _mm256_set1_epi32(x[4]); - __m256i z1 = _mm256_set1_epi32(x[5]); - __m256i z6 = _mm256_set1_epi32(x[6]); - __m256i z11 = _mm256_set1_epi32(x[7]); - __m256i z8; /* useless */ - __m256i z13 = _mm256_set1_epi32(x[9]); - __m256i z2 = _mm256_set1_epi32(x[10]); - __m256i z7 = _mm256_set1_epi32(x[11]); - __m256i z4 = _mm256_set1_epi32(x[12]); - __m256i z9; /* useless */ - __m256i z14 = _mm256_set1_epi32(x[14]); - __m256i z3 = _mm256_set1_epi32(x[15]); - - __m256i orig0 = z0; - __m256i orig1 = z1; - __m256i orig2 = z2; - __m256i orig3 = z3; - __m256i orig4 = z4; - __m256i orig5 = z5; - __m256i orig6 = z6; - __m256i orig7 = z7; - __m256i orig8; - __m256i orig9; - __m256i orig10 = z10; - __m256i orig11 = z11; - __m256i orig12 = z12; - __m256i orig13 = z13; - __m256i orig14 = z14; - __m256i orig15 = z15; - - uint32_t in8; - uint32_t in9; - int i; - - while (bytes >= 512) { - /* vector implementation for z8 and 
z9 */ - /* faster than the naive version for 8 blocks */ - const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); - const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); - const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - - __m256i t8, t9; - uint64_t in89; - - in8 = x[8]; - in9 = x[13]; /* see arrays above for the address translation */ - in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); - - z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); - - t8 = _mm256_add_epi64(addv8, z8); - t9 = _mm256_add_epi64(addv9, z9); - - z8 = _mm256_unpacklo_epi32(t8, t9); - z9 = _mm256_unpackhi_epi32(t8, t9); - - t8 = _mm256_unpacklo_epi32(z8, z9); - t9 = _mm256_unpackhi_epi32(z8, z9); - - /* required because unpack* are intra-lane */ - z8 = _mm256_permutevar8x32_epi32(t8, permute); - z9 = _mm256_permutevar8x32_epi32(t9, permute); - - orig8 = z8; - orig9 = z9; - - in89 += 8; - - x[8] = in89 & 0xFFFFFFFF; - x[13] = (in89 >> 32) & 0xFFFFFFFF; - - z5 = orig5; - z10 = orig10; - z15 = orig15; - z14 = orig14; - z3 = orig3; - z6 = orig6; - z11 = orig11; - z1 = orig1; - - z7 = orig7; - z13 = orig13; - z2 = orig2; - z9 = orig9; - z0 = orig0; - z12 = orig12; - z4 = orig4; - z8 = orig8; - - for (i = 0; i < ROUNDS; i += 2) { - /* the inner loop is a direct translation (regexp search/replace) - * from the amd64-xmm6 ASM */ - __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, - r14, r15; - - y4 = z12; - y4 = _mm256_add_epi32(y4, z0); - r4 = y4; - y4 = _mm256_slli_epi32(y4, 7); - z4 = _mm256_xor_si256(z4, y4); - r4 = _mm256_srli_epi32(r4, 25); - z4 = _mm256_xor_si256(z4, r4); - - y9 = z1; - y9 = _mm256_add_epi32(y9, z5); - r9 = y9; - y9 = _mm256_slli_epi32(y9, 7); - z9 = _mm256_xor_si256(z9, y9); - r9 = _mm256_srli_epi32(r9, 25); - z9 = _mm256_xor_si256(z9, r9); - - y8 = z0; - y8 = _mm256_add_epi32(y8, z4); - r8 = y8; - y8 = _mm256_slli_epi32(y8, 9); - z8 = _mm256_xor_si256(z8, y8); - r8 = _mm256_srli_epi32(r8, 23); - z8 = _mm256_xor_si256(z8, r8); - - y13 = z5; - y13 = _mm256_add_epi32(y13, z9); - r13 = y13; - y13 = _mm256_slli_epi32(y13, 9); - z13 = _mm256_xor_si256(z13, y13); - r13 = _mm256_srli_epi32(r13, 23); - z13 = _mm256_xor_si256(z13, r13); - - y12 = z4; - y12 = _mm256_add_epi32(y12, z8); - r12 = y12; - y12 = _mm256_slli_epi32(y12, 13); - z12 = _mm256_xor_si256(z12, y12); - r12 = _mm256_srli_epi32(r12, 19); - z12 = _mm256_xor_si256(z12, r12); - - y1 = z9; - y1 = _mm256_add_epi32(y1, z13); - r1 = y1; - y1 = _mm256_slli_epi32(y1, 13); - z1 = _mm256_xor_si256(z1, y1); - r1 = _mm256_srli_epi32(r1, 19); - z1 = _mm256_xor_si256(z1, r1); - - y0 = z8; - y0 = _mm256_add_epi32(y0, z12); - r0 = y0; - y0 = _mm256_slli_epi32(y0, 18); - z0 = _mm256_xor_si256(z0, y0); - r0 = _mm256_srli_epi32(r0, 14); - z0 = _mm256_xor_si256(z0, r0); - - y5 = z13; - y5 = _mm256_add_epi32(y5, z1); - r5 = y5; - y5 = _mm256_slli_epi32(y5, 18); - z5 = _mm256_xor_si256(z5, y5); - r5 = _mm256_srli_epi32(r5, 14); - z5 = _mm256_xor_si256(z5, r5); - - y14 = z6; - y14 = _mm256_add_epi32(y14, z10); - r14 = y14; - y14 = _mm256_slli_epi32(y14, 7); - z14 = _mm256_xor_si256(z14, y14); - r14 = _mm256_srli_epi32(r14, 25); - z14 = _mm256_xor_si256(z14, r14); - - y3 = z11; - y3 = _mm256_add_epi32(y3, z15); - r3 = y3; - y3 = _mm256_slli_epi32(y3, 7); - z3 = _mm256_xor_si256(z3, y3); - r3 = _mm256_srli_epi32(r3, 25); - z3 = _mm256_xor_si256(z3, r3); - - y2 = z10; - y2 = _mm256_add_epi32(y2, z14); - r2 = y2; - y2 = _mm256_slli_epi32(y2, 9); - z2 = _mm256_xor_si256(z2, y2); - r2 = _mm256_srli_epi32(r2, 23); - z2 = 
_mm256_xor_si256(z2, r2); - - y7 = z15; - y7 = _mm256_add_epi32(y7, z3); - r7 = y7; - y7 = _mm256_slli_epi32(y7, 9); - z7 = _mm256_xor_si256(z7, y7); - r7 = _mm256_srli_epi32(r7, 23); - z7 = _mm256_xor_si256(z7, r7); - - y6 = z14; - y6 = _mm256_add_epi32(y6, z2); - r6 = y6; - y6 = _mm256_slli_epi32(y6, 13); - z6 = _mm256_xor_si256(z6, y6); - r6 = _mm256_srli_epi32(r6, 19); - z6 = _mm256_xor_si256(z6, r6); - - y11 = z3; - y11 = _mm256_add_epi32(y11, z7); - r11 = y11; - y11 = _mm256_slli_epi32(y11, 13); - z11 = _mm256_xor_si256(z11, y11); - r11 = _mm256_srli_epi32(r11, 19); - z11 = _mm256_xor_si256(z11, r11); - - y10 = z2; - y10 = _mm256_add_epi32(y10, z6); - r10 = y10; - y10 = _mm256_slli_epi32(y10, 18); - z10 = _mm256_xor_si256(z10, y10); - r10 = _mm256_srli_epi32(r10, 14); - z10 = _mm256_xor_si256(z10, r10); - - y1 = z3; - y1 = _mm256_add_epi32(y1, z0); - r1 = y1; - y1 = _mm256_slli_epi32(y1, 7); - z1 = _mm256_xor_si256(z1, y1); - r1 = _mm256_srli_epi32(r1, 25); - z1 = _mm256_xor_si256(z1, r1); - - y15 = z7; - y15 = _mm256_add_epi32(y15, z11); - r15 = y15; - y15 = _mm256_slli_epi32(y15, 18); - z15 = _mm256_xor_si256(z15, y15); - r15 = _mm256_srli_epi32(r15, 14); - z15 = _mm256_xor_si256(z15, r15); - - y6 = z4; - y6 = _mm256_add_epi32(y6, z5); - r6 = y6; - y6 = _mm256_slli_epi32(y6, 7); - z6 = _mm256_xor_si256(z6, y6); - r6 = _mm256_srli_epi32(r6, 25); - z6 = _mm256_xor_si256(z6, r6); - - y2 = z0; - y2 = _mm256_add_epi32(y2, z1); - r2 = y2; - y2 = _mm256_slli_epi32(y2, 9); - z2 = _mm256_xor_si256(z2, y2); - r2 = _mm256_srli_epi32(r2, 23); - z2 = _mm256_xor_si256(z2, r2); - - y7 = z5; - y7 = _mm256_add_epi32(y7, z6); - r7 = y7; - y7 = _mm256_slli_epi32(y7, 9); - z7 = _mm256_xor_si256(z7, y7); - r7 = _mm256_srli_epi32(r7, 23); - z7 = _mm256_xor_si256(z7, r7); - - y3 = z1; - y3 = _mm256_add_epi32(y3, z2); - r3 = y3; - y3 = _mm256_slli_epi32(y3, 13); - z3 = _mm256_xor_si256(z3, y3); - r3 = _mm256_srli_epi32(r3, 19); - z3 = _mm256_xor_si256(z3, r3); - - y4 = z6; - y4 = _mm256_add_epi32(y4, z7); - r4 = y4; - y4 = _mm256_slli_epi32(y4, 13); - z4 = _mm256_xor_si256(z4, y4); - r4 = _mm256_srli_epi32(r4, 19); - z4 = _mm256_xor_si256(z4, r4); - - y0 = z2; - y0 = _mm256_add_epi32(y0, z3); - r0 = y0; - y0 = _mm256_slli_epi32(y0, 18); - z0 = _mm256_xor_si256(z0, y0); - r0 = _mm256_srli_epi32(r0, 14); - z0 = _mm256_xor_si256(z0, r0); - - y5 = z7; - y5 = _mm256_add_epi32(y5, z4); - r5 = y5; - y5 = _mm256_slli_epi32(y5, 18); - z5 = _mm256_xor_si256(z5, y5); - r5 = _mm256_srli_epi32(r5, 14); - z5 = _mm256_xor_si256(z5, r5); - - y11 = z9; - y11 = _mm256_add_epi32(y11, z10); - r11 = y11; - y11 = _mm256_slli_epi32(y11, 7); - z11 = _mm256_xor_si256(z11, y11); - r11 = _mm256_srli_epi32(r11, 25); - z11 = _mm256_xor_si256(z11, r11); - - y12 = z14; - y12 = _mm256_add_epi32(y12, z15); - r12 = y12; - y12 = _mm256_slli_epi32(y12, 7); - z12 = _mm256_xor_si256(z12, y12); - r12 = _mm256_srli_epi32(r12, 25); - z12 = _mm256_xor_si256(z12, r12); - - y8 = z10; - y8 = _mm256_add_epi32(y8, z11); - r8 = y8; - y8 = _mm256_slli_epi32(y8, 9); - z8 = _mm256_xor_si256(z8, y8); - r8 = _mm256_srli_epi32(r8, 23); - z8 = _mm256_xor_si256(z8, r8); - - y13 = z15; - y13 = _mm256_add_epi32(y13, z12); - r13 = y13; - y13 = _mm256_slli_epi32(y13, 9); - z13 = _mm256_xor_si256(z13, y13); - r13 = _mm256_srli_epi32(r13, 23); - z13 = _mm256_xor_si256(z13, r13); - - y9 = z11; - y9 = _mm256_add_epi32(y9, z8); - r9 = y9; - y9 = _mm256_slli_epi32(y9, 13); - z9 = _mm256_xor_si256(z9, y9); - r9 = _mm256_srli_epi32(r9, 19); - z9 = _mm256_xor_si256(z9, 
r9); - - y14 = z12; - y14 = _mm256_add_epi32(y14, z13); - r14 = y14; - y14 = _mm256_slli_epi32(y14, 13); - z14 = _mm256_xor_si256(z14, y14); - r14 = _mm256_srli_epi32(r14, 19); - z14 = _mm256_xor_si256(z14, r14); - - y10 = z8; - y10 = _mm256_add_epi32(y10, z9); - r10 = y10; - y10 = _mm256_slli_epi32(y10, 18); - z10 = _mm256_xor_si256(z10, y10); - r10 = _mm256_srli_epi32(r10, 14); - z10 = _mm256_xor_si256(z10, r10); - - y15 = z13; - y15 = _mm256_add_epi32(y15, z14); - r15 = y15; - y15 = _mm256_slli_epi32(y15, 18); - z15 = _mm256_xor_si256(z15, y15); - r15 = _mm256_srli_epi32(r15, 14); - z15 = _mm256_xor_si256(z15, r15); - } - -/* store data ; this macro first transpose data in-registers, and then store - * them in memory. much faster with icc. */ -#define ONEQUAD_TRANSPOSE(A, B, C, D) \ - { \ - __m128i t0, t1, t2, t3; \ - z##A = _mm256_add_epi32(z##A, orig##A); \ - z##B = _mm256_add_epi32(z##B, orig##B); \ - z##C = _mm256_add_epi32(z##C, orig##C); \ - z##D = _mm256_add_epi32(z##D, orig##D); \ - y##A = _mm256_unpacklo_epi32(z##A, z##B); \ - y##B = _mm256_unpacklo_epi32(z##C, z##D); \ - y##C = _mm256_unpackhi_epi32(z##A, z##B); \ - y##D = _mm256_unpackhi_epi32(z##C, z##D); \ - z##A = _mm256_unpacklo_epi64(y##A, y##B); \ - z##B = _mm256_unpackhi_epi64(y##A, y##B); \ - z##C = _mm256_unpacklo_epi64(y##C, y##D); \ - z##D = _mm256_unpackhi_epi64(y##C, y##D); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \ - _mm_loadu_si128((__m128i*) (m + 0))); \ - _mm_storeu_si128((__m128i*) (c + 0), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \ - _mm_loadu_si128((__m128i*) (m + 64))); \ - _mm_storeu_si128((__m128i*) (c + 64), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \ - _mm_loadu_si128((__m128i*) (m + 128))); \ - _mm_storeu_si128((__m128i*) (c + 128), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \ - _mm_loadu_si128((__m128i*) (m + 192))); \ - _mm_storeu_si128((__m128i*) (c + 192), t3); \ - t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \ - _mm_loadu_si128((__m128i*) (m + 256))); \ - _mm_storeu_si128((__m128i*) (c + 256), t0); \ - t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \ - _mm_loadu_si128((__m128i*) (m + 320))); \ - _mm_storeu_si128((__m128i*) (c + 320), t1); \ - t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \ - _mm_loadu_si128((__m128i*) (m + 384))); \ - _mm_storeu_si128((__m128i*) (c + 384), t2); \ - t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \ - _mm_loadu_si128((__m128i*) (m + 448))); \ - _mm_storeu_si128((__m128i*) (c + 448), t3); \ - } - -#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) - -#define ONEQUAD_UNPCK(A, B, C, D) \ - { \ - z##A = _mm256_add_epi32(z##A, orig##A); \ - z##B = _mm256_add_epi32(z##B, orig##B); \ - z##C = _mm256_add_epi32(z##C, orig##C); \ - z##D = _mm256_add_epi32(z##D, orig##D); \ - y##A = _mm256_unpacklo_epi32(z##A, z##B); \ - y##B = _mm256_unpacklo_epi32(z##C, z##D); \ - y##C = _mm256_unpackhi_epi32(z##A, z##B); \ - y##D = _mm256_unpackhi_epi32(z##C, z##D); \ - z##A = _mm256_unpacklo_epi64(y##A, y##B); \ - z##B = _mm256_unpackhi_epi64(y##A, y##B); \ - z##C = _mm256_unpacklo_epi64(y##C, y##D); \ - z##D = _mm256_unpackhi_epi64(y##C, y##D); \ - } - -#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ - { \ - ONEQUAD_UNPCK(A, B, C, D); \ - ONEQUAD_UNPCK(A2, B2, C2, D2); \ - y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \ - y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \ - y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \ - y##B2 = 
_mm256_permute2x128_si256(z##B, z##B2, 0x31); \ - y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \ - y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \ - y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \ - y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \ - y##A = _mm256_xor_si256(y##A, _mm256_loadu_si256((__m256i*) (m + 0))); \ - y##B = \ - _mm256_xor_si256(y##B, _mm256_loadu_si256((__m256i*) (m + 64))); \ - y##C = \ - _mm256_xor_si256(y##C, _mm256_loadu_si256((__m256i*) (m + 128))); \ - y##D = \ - _mm256_xor_si256(y##D, _mm256_loadu_si256((__m256i*) (m + 192))); \ - y##A2 = \ - _mm256_xor_si256(y##A2, _mm256_loadu_si256((__m256i*) (m + 256))); \ - y##B2 = \ - _mm256_xor_si256(y##B2, _mm256_loadu_si256((__m256i*) (m + 320))); \ - y##C2 = \ - _mm256_xor_si256(y##C2, _mm256_loadu_si256((__m256i*) (m + 384))); \ - y##D2 = \ - _mm256_xor_si256(y##D2, _mm256_loadu_si256((__m256i*) (m + 448))); \ - _mm256_storeu_si256((__m256i*) (c + 0), y##A); \ - _mm256_storeu_si256((__m256i*) (c + 64), y##B); \ - _mm256_storeu_si256((__m256i*) (c + 128), y##C); \ - _mm256_storeu_si256((__m256i*) (c + 192), y##D); \ - _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \ - _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \ - _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \ - _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \ - } - - ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); - m += 32; - c += 32; - ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); - m -= 32; - c -= 32; - -#undef ONEQUAD -#undef ONEQUAD_TRANSPOSE -#undef ONEQUAD_UNPCK -#undef ONEOCTO - - bytes -= 512; - c += 512; - m += 512; - } -} +if (bytes >= 512) { + __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, + y15; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i z0 = _mm256_set1_epi32(x[0]); + __m256i z5 = _mm256_set1_epi32(x[1]); + __m256i z10 = _mm256_set1_epi32(x[2]); + __m256i z15 = _mm256_set1_epi32(x[3]); + __m256i z12 = _mm256_set1_epi32(x[4]); + __m256i z1 = _mm256_set1_epi32(x[5]); + __m256i z6 = _mm256_set1_epi32(x[6]); + __m256i z11 = _mm256_set1_epi32(x[7]); + __m256i z8; /* useless */ + __m256i z13 = _mm256_set1_epi32(x[9]); + __m256i z2 = _mm256_set1_epi32(x[10]); + __m256i z7 = _mm256_set1_epi32(x[11]); + __m256i z4 = _mm256_set1_epi32(x[12]); + __m256i z9; /* useless */ + __m256i z14 = _mm256_set1_epi32(x[14]); + __m256i z3 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = z0; + __m256i orig1 = z1; + __m256i orig2 = z2; + __m256i orig3 = z3; + __m256i orig4 = z4; + __m256i orig5 = z5; + __m256i orig6 = z6; + __m256i orig7 = z7; + __m256i orig8; + __m256i orig9; + __m256i orig10 = z10; + __m256i orig11 = z11; + __m256i orig12 = z12; + __m256i orig13 = z13; + __m256i orig14 = z14; + __m256i orig15 = z15; + + uint32_t in8; + uint32_t in9; + int i; + + while (bytes >= 512) { + /* vector implementation for z8 and z9 */ + /* faster than the naive version for 8 blocks */ + const __m256i addv8 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv9 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + + __m256i t8, t9; + uint64_t in89; + + in8 = x[8]; + in9 = x[13]; /* see arrays above for the address translation */ + in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32); + + z8 = z9 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in89)); + + t8 = _mm256_add_epi64(addv8, z8); + t9 = _mm256_add_epi64(addv9, z9); + + z8 = _mm256_unpacklo_epi32(t8, t9); + z9 = _mm256_unpackhi_epi32(t8, t9); + + t8 = _mm256_unpacklo_epi32(z8, z9); + t9 = 
_mm256_unpackhi_epi32(z8, z9); + + /* required because unpack* are intra-lane */ + z8 = _mm256_permutevar8x32_epi32(t8, permute); + z9 = _mm256_permutevar8x32_epi32(t9, permute); + + orig8 = z8; + orig9 = z9; + + in89 += 8; + + x[8] = in89 & 0xFFFFFFFF; + x[13] = (in89 >> 32) & 0xFFFFFFFF; + + z5 = orig5; + z10 = orig10; + z15 = orig15; + z14 = orig14; + z3 = orig3; + z6 = orig6; + z11 = orig11; + z1 = orig1; + + z7 = orig7; + z13 = orig13; + z2 = orig2; + z9 = orig9; + z0 = orig0; + z12 = orig12; + z4 = orig4; + z8 = orig8; + + for (i = 0; i < ROUNDS; i += 2) { + /* the inner loop is a direct translation (regexp search/replace) + * from the amd64-xmm6 ASM */ + __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, + r14, r15; + + y4 = z12; + y4 = _mm256_add_epi32(y4, z0); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 7); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 25); + z4 = _mm256_xor_si256(z4, r4); + + y9 = z1; + y9 = _mm256_add_epi32(y9, z5); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 7); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 25); + z9 = _mm256_xor_si256(z9, r9); + + y8 = z0; + y8 = _mm256_add_epi32(y8, z4); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z5; + y13 = _mm256_add_epi32(y13, z9); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y12 = z4; + y12 = _mm256_add_epi32(y12, z8); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 13); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 19); + z12 = _mm256_xor_si256(z12, r12); + + y1 = z9; + y1 = _mm256_add_epi32(y1, z13); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 13); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 19); + z1 = _mm256_xor_si256(z1, r1); + + y0 = z8; + y0 = _mm256_add_epi32(y0, z12); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z13; + y5 = _mm256_add_epi32(y5, z1); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y14 = z6; + y14 = _mm256_add_epi32(y14, z10); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 7); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 25); + z14 = _mm256_xor_si256(z14, r14); + + y3 = z11; + y3 = _mm256_add_epi32(y3, z15); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 7); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 25); + z3 = _mm256_xor_si256(z3, r3); + + y2 = z10; + y2 = _mm256_add_epi32(y2, z14); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z15; + y7 = _mm256_add_epi32(y7, z3); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y6 = z14; + y6 = _mm256_add_epi32(y6, z2); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 13); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 19); + z6 = _mm256_xor_si256(z6, r6); + + y11 = z3; + y11 = _mm256_add_epi32(y11, z7); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 13); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 19); + z11 = _mm256_xor_si256(z11, r11); + + y10 = z2; + y10 = _mm256_add_epi32(y10, z6); + r10 = y10; + y10 
= _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y1 = z3; + y1 = _mm256_add_epi32(y1, z0); + r1 = y1; + y1 = _mm256_slli_epi32(y1, 7); + z1 = _mm256_xor_si256(z1, y1); + r1 = _mm256_srli_epi32(r1, 25); + z1 = _mm256_xor_si256(z1, r1); + + y15 = z7; + y15 = _mm256_add_epi32(y15, z11); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + + y6 = z4; + y6 = _mm256_add_epi32(y6, z5); + r6 = y6; + y6 = _mm256_slli_epi32(y6, 7); + z6 = _mm256_xor_si256(z6, y6); + r6 = _mm256_srli_epi32(r6, 25); + z6 = _mm256_xor_si256(z6, r6); + + y2 = z0; + y2 = _mm256_add_epi32(y2, z1); + r2 = y2; + y2 = _mm256_slli_epi32(y2, 9); + z2 = _mm256_xor_si256(z2, y2); + r2 = _mm256_srli_epi32(r2, 23); + z2 = _mm256_xor_si256(z2, r2); + + y7 = z5; + y7 = _mm256_add_epi32(y7, z6); + r7 = y7; + y7 = _mm256_slli_epi32(y7, 9); + z7 = _mm256_xor_si256(z7, y7); + r7 = _mm256_srli_epi32(r7, 23); + z7 = _mm256_xor_si256(z7, r7); + + y3 = z1; + y3 = _mm256_add_epi32(y3, z2); + r3 = y3; + y3 = _mm256_slli_epi32(y3, 13); + z3 = _mm256_xor_si256(z3, y3); + r3 = _mm256_srli_epi32(r3, 19); + z3 = _mm256_xor_si256(z3, r3); + + y4 = z6; + y4 = _mm256_add_epi32(y4, z7); + r4 = y4; + y4 = _mm256_slli_epi32(y4, 13); + z4 = _mm256_xor_si256(z4, y4); + r4 = _mm256_srli_epi32(r4, 19); + z4 = _mm256_xor_si256(z4, r4); + + y0 = z2; + y0 = _mm256_add_epi32(y0, z3); + r0 = y0; + y0 = _mm256_slli_epi32(y0, 18); + z0 = _mm256_xor_si256(z0, y0); + r0 = _mm256_srli_epi32(r0, 14); + z0 = _mm256_xor_si256(z0, r0); + + y5 = z7; + y5 = _mm256_add_epi32(y5, z4); + r5 = y5; + y5 = _mm256_slli_epi32(y5, 18); + z5 = _mm256_xor_si256(z5, y5); + r5 = _mm256_srli_epi32(r5, 14); + z5 = _mm256_xor_si256(z5, r5); + + y11 = z9; + y11 = _mm256_add_epi32(y11, z10); + r11 = y11; + y11 = _mm256_slli_epi32(y11, 7); + z11 = _mm256_xor_si256(z11, y11); + r11 = _mm256_srli_epi32(r11, 25); + z11 = _mm256_xor_si256(z11, r11); + + y12 = z14; + y12 = _mm256_add_epi32(y12, z15); + r12 = y12; + y12 = _mm256_slli_epi32(y12, 7); + z12 = _mm256_xor_si256(z12, y12); + r12 = _mm256_srli_epi32(r12, 25); + z12 = _mm256_xor_si256(z12, r12); + + y8 = z10; + y8 = _mm256_add_epi32(y8, z11); + r8 = y8; + y8 = _mm256_slli_epi32(y8, 9); + z8 = _mm256_xor_si256(z8, y8); + r8 = _mm256_srli_epi32(r8, 23); + z8 = _mm256_xor_si256(z8, r8); + + y13 = z15; + y13 = _mm256_add_epi32(y13, z12); + r13 = y13; + y13 = _mm256_slli_epi32(y13, 9); + z13 = _mm256_xor_si256(z13, y13); + r13 = _mm256_srli_epi32(r13, 23); + z13 = _mm256_xor_si256(z13, r13); + + y9 = z11; + y9 = _mm256_add_epi32(y9, z8); + r9 = y9; + y9 = _mm256_slli_epi32(y9, 13); + z9 = _mm256_xor_si256(z9, y9); + r9 = _mm256_srli_epi32(r9, 19); + z9 = _mm256_xor_si256(z9, r9); + + y14 = z12; + y14 = _mm256_add_epi32(y14, z13); + r14 = y14; + y14 = _mm256_slli_epi32(y14, 13); + z14 = _mm256_xor_si256(z14, y14); + r14 = _mm256_srli_epi32(r14, 19); + z14 = _mm256_xor_si256(z14, r14); + + y10 = z8; + y10 = _mm256_add_epi32(y10, z9); + r10 = y10; + y10 = _mm256_slli_epi32(y10, 18); + z10 = _mm256_xor_si256(z10, y10); + r10 = _mm256_srli_epi32(r10, 14); + z10 = _mm256_xor_si256(z10, r10); + + y15 = z13; + y15 = _mm256_add_epi32(y15, z14); + r15 = y15; + y15 = _mm256_slli_epi32(y15, 18); + z15 = _mm256_xor_si256(z15, y15); + r15 = _mm256_srli_epi32(r15, 14); + z15 = _mm256_xor_si256(z15, r15); + } + +/* store data ; this macro first transpose data 
in-registers, and then store + * them in memory. much faster with icc. */ +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(z##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(z##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(z##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(z##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + z##A = _mm256_add_epi32(z##A, orig##A); \ + z##B = _mm256_add_epi32(z##B, orig##B); \ + z##C = _mm256_add_epi32(z##C, orig##C); \ + z##D = _mm256_add_epi32(z##D, orig##D); \ + y##A = _mm256_unpacklo_epi32(z##A, z##B); \ + y##B = _mm256_unpacklo_epi32(z##C, z##D); \ + y##C = _mm256_unpackhi_epi32(z##A, z##B); \ + y##D = _mm256_unpackhi_epi32(z##C, z##D); \ + z##A = _mm256_unpacklo_epi64(y##A, y##B); \ + z##B = _mm256_unpackhi_epi64(y##A, y##B); \ + z##C = _mm256_unpacklo_epi64(y##C, y##D); \ + z##D = _mm256_unpackhi_epi64(y##C, y##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + y##A = _mm256_permute2x128_si256(z##A, z##A2, 0x20); \ + y##A2 = _mm256_permute2x128_si256(z##A, z##A2, 0x31); \ + y##B = _mm256_permute2x128_si256(z##B, z##B2, 0x20); \ + y##B2 = _mm256_permute2x128_si256(z##B, z##B2, 0x31); \ + y##C = _mm256_permute2x128_si256(z##C, z##C2, 0x20); \ + y##C2 = _mm256_permute2x128_si256(z##C, z##C2, 0x31); \ + y##D = _mm256_permute2x128_si256(z##D, z##D2, 0x20); \ + y##D2 = _mm256_permute2x128_si256(z##D, z##D2, 0x31); \ + y##A = _mm256_xor_si256(y##A, \ + _mm256_loadu_si256((const __m256i*) (m + 0))); \ + y##B = _mm256_xor_si256( \ + y##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + y##C = _mm256_xor_si256( \ + y##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + y##D = _mm256_xor_si256( \ + y##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + y##A2 = _mm256_xor_si256( \ + 
y##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + y##B2 = _mm256_xor_si256( \ + y##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + y##C2 = _mm256_xor_si256( \ + y##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + y##D2 = _mm256_xor_si256( \ + y##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), y##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), y##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), y##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), y##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), y##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), y##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), y##C2); \ + _mm256_storeu_si256((__m256i*) (c + 448), y##D2); \ + } + + ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7); + m += 32; + c += 32; + ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15); + m -= 32; + c -= 32; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE +#undef ONEQUAD_UNPCK +#undef ONEOCTO + + bytes -= 512; + c += 512; + m += 512; + } +} diff --git a/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c b/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c index bfdfeedba3..239becfc9e 100644 --- a/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa2012/ref/stream_salsa2012_ref.c @@ -1,106 +1,106 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa2012.h" -#include "crypto_stream_salsa2012.h" -#include "utils.h" - -int -crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa2012(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)clen; ++i) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -int -crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (mlen >= 64) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa2012(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa2012.h" +#include "crypto_stream_salsa2012.h" +#include "utils.h" + +int +crypto_stream_salsa2012(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa2012(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)clen; ++i) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +int +crypto_stream_salsa2012_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa2012(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c b/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c index d0cc0f68ee..506ec57624 100644 --- a/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c +++ b/libs/libsodium/src/crypto_stream/salsa2012/stream_salsa2012.c @@ -1,26 +1,26 @@ -#include "crypto_stream_salsa2012.h" -#include "randombytes.h" - -size_t -crypto_stream_salsa2012_keybytes(void) -{ - return crypto_stream_salsa2012_KEYBYTES; -} - -size_t -crypto_stream_salsa2012_noncebytes(void) -{ - return crypto_stream_salsa2012_NONCEBYTES; -} - -size_t -crypto_stream_salsa2012_messagebytes_max(void) -{ - return crypto_stream_salsa2012_MESSAGEBYTES_MAX; -} - -void -crypto_stream_salsa2012_keygen(unsigned char k[crypto_stream_salsa2012_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa2012_KEYBYTES); -} +#include "crypto_stream_salsa2012.h" +#include "randombytes.h" + +size_t +crypto_stream_salsa2012_keybytes(void) +{ + return crypto_stream_salsa2012_KEYBYTES; +} + +size_t +crypto_stream_salsa2012_noncebytes(void) +{ + return crypto_stream_salsa2012_NONCEBYTES; +} + +size_t +crypto_stream_salsa2012_messagebytes_max(void) +{ + return crypto_stream_salsa2012_MESSAGEBYTES_MAX; +} + +void +crypto_stream_salsa2012_keygen(unsigned char k[crypto_stream_salsa2012_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa2012_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c b/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c index 7ec0c4e78e..e52a573d40 100644 --- 
a/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c +++ b/libs/libsodium/src/crypto_stream/salsa208/ref/stream_salsa208_ref.c @@ -1,106 +1,106 @@ -/* -version 20140420 -D. J. Bernstein -Public domain. -*/ - -#include - -#include "crypto_core_salsa208.h" -#include "crypto_stream_salsa208.h" -#include "utils.h" - -int -crypto_stream_salsa208(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!clen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (clen >= 64) { - crypto_core_salsa208(c, in, kcopy, NULL); - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - clen -= 64; - c += 64; - } - if (clen) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)clen; ++i) { - c[i] = block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} - -int -crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - unsigned char in[16]; - unsigned char block[64]; - unsigned char kcopy[32]; - unsigned int i; - unsigned int u; - - if (!mlen) { - return 0; - } - for (i = 0; i < 32; ++i) { - kcopy[i] = k[i]; - } - for (i = 0; i < 8; ++i) { - in[i] = n[i]; - } - for (i = 8; i < 16; ++i) { - in[i] = 0; - } - while (mlen >= 64) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < 64; ++i) { - c[i] = m[i] ^ block[i]; - } - u = 1; - for (i = 8; i < 16; ++i) { - u += (unsigned int)in[i]; - in[i] = u; - u >>= 8; - } - mlen -= 64; - c += 64; - m += 64; - } - if (mlen) { - crypto_core_salsa208(block, in, kcopy, NULL); - for (i = 0; i < (unsigned int)mlen; ++i) { - c[i] = m[i] ^ block[i]; - } - } - sodium_memzero(block, sizeof block); - sodium_memzero(kcopy, sizeof kcopy); - - return 0; -} +/* +version 20140420 +D. J. Bernstein +Public domain. 
+*/ + +#include + +#include "crypto_core_salsa208.h" +#include "crypto_stream_salsa208.h" +#include "utils.h" + +int +crypto_stream_salsa208(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!clen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (clen >= 64) { + crypto_core_salsa208(c, in, kcopy, NULL); + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + clen -= 64; + c += 64; + } + if (clen) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)clen; ++i) { + c[i] = block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} + +int +crypto_stream_salsa208_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + unsigned char in[16]; + unsigned char block[64]; + unsigned char kcopy[32]; + unsigned int i; + unsigned int u; + + if (!mlen) { + return 0; + } + for (i = 0; i < 32; ++i) { + kcopy[i] = k[i]; + } + for (i = 0; i < 8; ++i) { + in[i] = n[i]; + } + for (i = 8; i < 16; ++i) { + in[i] = 0; + } + while (mlen >= 64) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < 64; ++i) { + c[i] = m[i] ^ block[i]; + } + u = 1; + for (i = 8; i < 16; ++i) { + u += (unsigned int)in[i]; + in[i] = u; + u >>= 8; + } + mlen -= 64; + c += 64; + m += 64; + } + if (mlen) { + crypto_core_salsa208(block, in, kcopy, NULL); + for (i = 0; i < (unsigned int)mlen; ++i) { + c[i] = m[i] ^ block[i]; + } + } + sodium_memzero(block, sizeof block); + sodium_memzero(kcopy, sizeof kcopy); + + return 0; +} diff --git a/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c b/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c index b79bda5ec2..fb7111b407 100644 --- a/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c +++ b/libs/libsodium/src/crypto_stream/salsa208/stream_salsa208.c @@ -1,26 +1,26 @@ -#include "crypto_stream_salsa208.h" -#include "randombytes.h" - -size_t -crypto_stream_salsa208_keybytes(void) -{ - return crypto_stream_salsa208_KEYBYTES; -} - -size_t -crypto_stream_salsa208_noncebytes(void) -{ - return crypto_stream_salsa208_NONCEBYTES; -} - -size_t -crypto_stream_salsa208_messagebytes_max(void) -{ - return crypto_stream_salsa208_MESSAGEBYTES_MAX; -} - -void -crypto_stream_salsa208_keygen(unsigned char k[crypto_stream_salsa208_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_salsa208_KEYBYTES); -} +#include "crypto_stream_salsa208.h" +#include "randombytes.h" + +size_t +crypto_stream_salsa208_keybytes(void) +{ + return crypto_stream_salsa208_KEYBYTES; +} + +size_t +crypto_stream_salsa208_noncebytes(void) +{ + return crypto_stream_salsa208_NONCEBYTES; +} + +size_t +crypto_stream_salsa208_messagebytes_max(void) +{ + return crypto_stream_salsa208_MESSAGEBYTES_MAX; +} + +void +crypto_stream_salsa208_keygen(unsigned char k[crypto_stream_salsa208_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_salsa208_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c b/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c index 8b1bc09abd..47807e0a44 100644 --- a/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c +++ 
b/libs/libsodium/src/crypto_stream/xchacha20/stream_xchacha20.c @@ -1,69 +1,69 @@ - -#include - -#include "crypto_core_hchacha20.h" -#include "crypto_stream_chacha20.h" -#include "crypto_stream_xchacha20.h" -#include "private/common.h" -#include "randombytes.h" - -size_t -crypto_stream_xchacha20_keybytes(void) -{ - return crypto_stream_xchacha20_KEYBYTES; -} - -size_t -crypto_stream_xchacha20_noncebytes(void) -{ - return crypto_stream_xchacha20_NONCEBYTES; -} - -size_t -crypto_stream_xchacha20_messagebytes_max(void) -{ - return crypto_stream_xchacha20_MESSAGEBYTES_MAX; -} - -int -crypto_stream_xchacha20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; - - crypto_core_hchacha20(k2, n, k, NULL); - COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES <= sizeof k2); - COMPILER_ASSERT(crypto_stream_chacha20_NONCEBYTES == - crypto_stream_xchacha20_NONCEBYTES - - crypto_core_hchacha20_INPUTBYTES); - - return crypto_stream_chacha20(c, clen, n + crypto_core_hchacha20_INPUTBYTES, - k2); -} - -int -crypto_stream_xchacha20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint64_t ic, const unsigned char *k) -{ - unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; - - crypto_core_hchacha20(k2, n, k, NULL); - return crypto_stream_chacha20_xor_ic( - c, m, mlen, n + crypto_core_hchacha20_INPUTBYTES, ic, k2); -} - -int -crypto_stream_xchacha20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xchacha20_xor_ic(c, m, mlen, n, 0U, k); -} - -void -crypto_stream_xchacha20_keygen( - unsigned char k[crypto_stream_xchacha20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_xchacha20_KEYBYTES); -} + +#include + +#include "crypto_core_hchacha20.h" +#include "crypto_stream_chacha20.h" +#include "crypto_stream_xchacha20.h" +#include "private/common.h" +#include "randombytes.h" + +size_t +crypto_stream_xchacha20_keybytes(void) +{ + return crypto_stream_xchacha20_KEYBYTES; +} + +size_t +crypto_stream_xchacha20_noncebytes(void) +{ + return crypto_stream_xchacha20_NONCEBYTES; +} + +size_t +crypto_stream_xchacha20_messagebytes_max(void) +{ + return crypto_stream_xchacha20_MESSAGEBYTES_MAX; +} + +int +crypto_stream_xchacha20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; + + crypto_core_hchacha20(k2, n, k, NULL); + COMPILER_ASSERT(crypto_stream_chacha20_KEYBYTES <= sizeof k2); + COMPILER_ASSERT(crypto_stream_chacha20_NONCEBYTES == + crypto_stream_xchacha20_NONCEBYTES - + crypto_core_hchacha20_INPUTBYTES); + + return crypto_stream_chacha20(c, clen, n + crypto_core_hchacha20_INPUTBYTES, + k2); +} + +int +crypto_stream_xchacha20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint64_t ic, const unsigned char *k) +{ + unsigned char k2[crypto_core_hchacha20_OUTPUTBYTES]; + + crypto_core_hchacha20(k2, n, k, NULL); + return crypto_stream_chacha20_xor_ic( + c, m, mlen, n + crypto_core_hchacha20_INPUTBYTES, ic, k2); +} + +int +crypto_stream_xchacha20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xchacha20_xor_ic(c, m, mlen, n, 0U, k); +} + +void +crypto_stream_xchacha20_keygen( + unsigned char k[crypto_stream_xchacha20_KEYBYTES]) +{ 
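+    /* fill k with crypto_stream_xchacha20_KEYBYTES bytes from the CSPRNG */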
+ randombytes_buf(k, crypto_stream_xchacha20_KEYBYTES); +} diff --git a/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c b/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c index dc831a94d8..30b2929794 100644 --- a/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c +++ b/libs/libsodium/src/crypto_stream/xsalsa20/stream_xsalsa20.c @@ -1,66 +1,66 @@ -#include "crypto_core_hsalsa20.h" -#include "crypto_stream_salsa20.h" -#include "crypto_stream_xsalsa20.h" -#include "randombytes.h" -#include "utils.h" - -int -crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen, - const unsigned char *n, const unsigned char *k) -{ - unsigned char subkey[32]; - int ret; - - crypto_core_hsalsa20(subkey, n, k, NULL); - ret = crypto_stream_salsa20(c, clen, n + 16, subkey); - sodium_memzero(subkey, sizeof subkey); - - return ret; -} - -int -crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - uint64_t ic, const unsigned char *k) -{ - unsigned char subkey[32]; - int ret; - - crypto_core_hsalsa20(subkey, n, k, NULL); - ret = crypto_stream_salsa20_xor_ic(c, m, mlen, n + 16, ic, subkey); - sodium_memzero(subkey, sizeof subkey); - - return ret; -} - -int -crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m, - unsigned long long mlen, const unsigned char *n, - const unsigned char *k) -{ - return crypto_stream_xsalsa20_xor_ic(c, m, mlen, n, 0ULL, k); -} - -size_t -crypto_stream_xsalsa20_keybytes(void) -{ - return crypto_stream_xsalsa20_KEYBYTES; -} - -size_t -crypto_stream_xsalsa20_noncebytes(void) -{ - return crypto_stream_xsalsa20_NONCEBYTES; -} - -size_t -crypto_stream_xsalsa20_messagebytes_max(void) -{ - return crypto_stream_xsalsa20_MESSAGEBYTES_MAX; -} - -void -crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]) -{ - randombytes_buf(k, crypto_stream_xsalsa20_KEYBYTES); -} +#include "crypto_core_hsalsa20.h" +#include "crypto_stream_salsa20.h" +#include "crypto_stream_xsalsa20.h" +#include "randombytes.h" +#include "utils.h" + +int +crypto_stream_xsalsa20(unsigned char *c, unsigned long long clen, + const unsigned char *n, const unsigned char *k) +{ + unsigned char subkey[32]; + int ret; + + crypto_core_hsalsa20(subkey, n, k, NULL); + ret = crypto_stream_salsa20(c, clen, n + 16, subkey); + sodium_memzero(subkey, sizeof subkey); + + return ret; +} + +int +crypto_stream_xsalsa20_xor_ic(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + uint64_t ic, const unsigned char *k) +{ + unsigned char subkey[32]; + int ret; + + crypto_core_hsalsa20(subkey, n, k, NULL); + ret = crypto_stream_salsa20_xor_ic(c, m, mlen, n + 16, ic, subkey); + sodium_memzero(subkey, sizeof subkey); + + return ret; +} + +int +crypto_stream_xsalsa20_xor(unsigned char *c, const unsigned char *m, + unsigned long long mlen, const unsigned char *n, + const unsigned char *k) +{ + return crypto_stream_xsalsa20_xor_ic(c, m, mlen, n, 0ULL, k); +} + +size_t +crypto_stream_xsalsa20_keybytes(void) +{ + return crypto_stream_xsalsa20_KEYBYTES; +} + +size_t +crypto_stream_xsalsa20_noncebytes(void) +{ + return crypto_stream_xsalsa20_NONCEBYTES; +} + +size_t +crypto_stream_xsalsa20_messagebytes_max(void) +{ + return crypto_stream_xsalsa20_MESSAGEBYTES_MAX; +} + +void +crypto_stream_xsalsa20_keygen(unsigned char k[crypto_stream_xsalsa20_KEYBYTES]) +{ + randombytes_buf(k, crypto_stream_xsalsa20_KEYBYTES); +} -- cgit v1.2.3
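
Notes on the code touched above (the C sketches below are illustrative
editor sketches, not part of the upstream patch):

1. The AVX2 Salsa20 rounds implement every 32-bit rotation as a
   shift-left/shift-right pair folded in with two XORs: the two shifted
   halves occupy disjoint bits, so z ^= rotl32(y, n) needs no OR. A
   scalar model of one such step:

    #include <stdint.h>

    /* z ^= rotl32(y, n) for 0 < n < 32, decomposed the way the
     * _mm256_slli_epi32/_mm256_srli_epi32 pairs above do it: the two
     * shifted halves have disjoint bits, so XOR-ing them in one at a
     * time equals XOR-ing in the full rotation. */
    static uint32_t
    quarter_step(uint32_t z, uint32_t y, int n)
    {
        z ^= y << n;
        z ^= y >> (32 - n);
        return z;
    }

2. The in89 bookkeeping splits the 64-bit Salsa20 block counter across
   state words 8 (low) and 13 (high) and advances it by 8 blocks, i.e.
   512 bytes, per pass. The same logic in scalar form:

    #include <stdint.h>

    /* Advance the 64-bit block counter held in x[8] (low) and x[13]
     * (high) by the 8 blocks processed per AVX2 pass. */
    static void
    bump_counter_by_8(uint32_t x[16])
    {
        uint64_t in89 = ((uint64_t) x[13] << 32) | x[8];

        in89 += 8;
        x[8]  = (uint32_t) (in89 & 0xFFFFFFFF);
        x[13] = (uint32_t) ((in89 >> 32) & 0xFFFFFFFF);
    }

3. The xsalsa20/xchacha20 wrappers above follow one pattern: derive a
   subkey with the H-function (crypto_core_hsalsa20/hchacha20), then run
   the underlying stream over the remaining nonce bytes. A minimal
   round-trip caller, assuming <sodium.h> is available (the *_xor call
   is unauthenticated keystream XOR, so applying it twice with the same
   key and nonce decrypts):

    #include <string.h>
    #include <sodium.h>

    int
    main(void)
    {
        unsigned char key[crypto_stream_xsalsa20_KEYBYTES];
        unsigned char nonce[crypto_stream_xsalsa20_NONCEBYTES];
        unsigned char msg[] = "example plaintext";
        unsigned char ct[sizeof msg];

        if (sodium_init() < 0) {
            return 1;
        }
        crypto_stream_xsalsa20_keygen(key);
        randombytes_buf(nonce, sizeof nonce);

        crypto_stream_xsalsa20_xor(ct, msg, sizeof msg, nonce, key); /* encrypt */
        crypto_stream_xsalsa20_xor(msg, ct, sizeof ct, nonce, key);  /* decrypt */

        return memcmp(msg, "example plaintext", sizeof msg) != 0;
    }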