author | aunsane <aunsane@gmail.com> | 2017-12-15 01:05:56 +0300
---|---|---
committer | aunsane <aunsane@gmail.com> | 2017-12-15 01:05:56 +0300
commit | e124aa3611f38573898aa79c6eabe77bc874e58f (patch) |
tree | 819464260f758bbc002b23c0c8a77f93751dcb42 /libs/libsodium/src/crypto_onetimeauth/poly1305 |
parent | bbd9647d47f20d10b39570def918a0ac68c305c9 (diff) |
preparing to build tox from sources
Diffstat (limited to 'libs/libsodium/src/crypto_onetimeauth/poly1305')
8 files changed, 1663 insertions, 0 deletions
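
For orientation, the files below add libsodium's Poly1305 one-time authenticator: a portable "donna" implementation (32-bit and 64-bit variants), an SSE2 implementation, and the dispatch layer that exposes the public entry points. The following is a minimal usage sketch of the one-shot API these files implement, assuming the library's public `<sodium.h>` header and `sodium_init()` from the rest of upstream libsodium (there, `crypto_onetimeauth_poly1305_BYTES` is 16 and `crypto_onetimeauth_poly1305_KEYBYTES` is 32):

```c
#include <stdio.h>
#include <sodium.h>

int main(void)
{
    unsigned char key[crypto_onetimeauth_poly1305_KEYBYTES]; /* 32-byte one-time key */
    unsigned char mac[crypto_onetimeauth_poly1305_BYTES];    /* 16-byte tag */
    const unsigned char msg[] = "preparing to build tox from sources";

    if (sodium_init() < 0) {
        return 1; /* library could not be initialised */
    }
    crypto_onetimeauth_poly1305_keygen(key); /* a Poly1305 key must never be reused */
    crypto_onetimeauth_poly1305(mac, msg, sizeof msg - 1, key);

    /* verify recomputes the tag and compares it with crypto_verify_16();
       it returns 0 on success and -1 on mismatch */
    if (crypto_onetimeauth_poly1305_verify(mac, msg, sizeof msg - 1, key) != 0) {
        puts("tag rejected");
        return 1;
    }
    puts("tag verified");
    return 0;
}
```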
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c new file mode 100644 index 0000000000..e798072f84 --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c @@ -0,0 +1,124 @@ + +#include "poly1305_donna.h" +#include "crypto_verify_16.h" +#include "private/common.h" +#include "utils.h" + +#ifdef HAVE_TI_MODE +#include "poly1305_donna64.h" +#else +#include "poly1305_donna32.h" +#endif +#include "../onetimeauth_poly1305.h" + +static void +poly1305_update(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long bytes) +{ + unsigned long long i; + + /* handle leftover */ + if (st->leftover) { + unsigned long long want = (poly1305_block_size - st->leftover); + + if (want > bytes) { + want = bytes; + } + for (i = 0; i < want; i++) { + st->buffer[st->leftover + i] = m[i]; + } + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < poly1305_block_size) { + return; + } + poly1305_blocks(st, st->buffer, poly1305_block_size); + st->leftover = 0; + } + + /* process full blocks */ + if (bytes >= poly1305_block_size) { + unsigned long long want = (bytes & ~(poly1305_block_size - 1)); + + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + /* store leftover */ + if (bytes) { + for (i = 0; i < bytes; i++) { + st->buffer[st->leftover + i] = m[i]; + } + st->leftover += bytes; + } +} + +static int +crypto_onetimeauth_poly1305_donna(unsigned char *out, const unsigned char *m, + unsigned long long inlen, + const unsigned char *key) +{ + CRYPTO_ALIGN(64) poly1305_state_internal_t state; + + poly1305_init(&state, key); + poly1305_update(&state, m, inlen); + poly1305_finish(&state, out); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_donna_init(crypto_onetimeauth_poly1305_state *state, + const unsigned char *key) +{ + COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >= + sizeof(poly1305_state_internal_t)); + poly1305_init((poly1305_state_internal_t *) (void *) state, key); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_donna_update( + crypto_onetimeauth_poly1305_state *state, const unsigned char *in, + unsigned long long inlen) +{ + poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_donna_final( + crypto_onetimeauth_poly1305_state *state, unsigned char *out) +{ + poly1305_finish((poly1305_state_internal_t *) (void *) state, out); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_donna_verify(const unsigned char *h, + const unsigned char *in, + unsigned long long inlen, + const unsigned char *k) +{ + unsigned char correct[16]; + + crypto_onetimeauth_poly1305_donna(correct, in, inlen, k); + + return crypto_verify_16(h, correct); +} + +struct crypto_onetimeauth_poly1305_implementation + crypto_onetimeauth_poly1305_donna_implementation = { + SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_donna, + SODIUM_C99(.onetimeauth_verify =) + crypto_onetimeauth_poly1305_donna_verify, + SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_donna_init, + SODIUM_C99(.onetimeauth_update =) + crypto_onetimeauth_poly1305_donna_update, + SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_donna_final + }; diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h new file mode 100644 index 
0000000000..d6474b3af4 --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h @@ -0,0 +1,12 @@ +#ifndef poly1305_donna_H +#define poly1305_donna_H + +#include <stddef.h> + +#include "../onetimeauth_poly1305.h" +#include "crypto_onetimeauth_poly1305.h" + +extern struct crypto_onetimeauth_poly1305_implementation + crypto_onetimeauth_poly1305_donna_implementation; + +#endif /* poly1305_donna_H */ diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h new file mode 100644 index 0000000000..bcf447cd7d --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h @@ -0,0 +1,235 @@ +/* + poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication + and 64 bit addition +*/ + +#if defined(_MSC_VER) +# define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) +# define POLY1305_NOINLINE __attribute__((noinline)) +#else +# define POLY1305_NOINLINE +#endif + +#include "private/common.h" + +#define poly1305_block_size 16 + +/* 17 + sizeof(unsigned long long) + 14*sizeof(unsigned long) */ +typedef struct poly1305_state_internal_t { + unsigned long r[5]; + unsigned long h[5]; + unsigned long pad[4]; + unsigned long long leftover; + unsigned char buffer[poly1305_block_size]; + unsigned char final; +} poly1305_state_internal_t; + +static void +poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32]) +{ + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff - wiped after finalization */ + st->r[0] = (LOAD32_LE(&key[0])) & 0x3ffffff; + st->r[1] = (LOAD32_LE(&key[3]) >> 2) & 0x3ffff03; + st->r[2] = (LOAD32_LE(&key[6]) >> 4) & 0x3ffc0ff; + st->r[3] = (LOAD32_LE(&key[9]) >> 6) & 0x3f03fff; + st->r[4] = (LOAD32_LE(&key[12]) >> 8) & 0x00fffff; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + st->h[3] = 0; + st->h[4] = 0; + + /* save pad for later */ + st->pad[0] = LOAD32_LE(&key[16]); + st->pad[1] = LOAD32_LE(&key[20]); + st->pad[2] = LOAD32_LE(&key[24]); + st->pad[3] = LOAD32_LE(&key[28]); + + st->leftover = 0; + st->final = 0; +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long bytes) +{ + const unsigned long hibit = (st->final) ? 
0UL : (1UL << 24); /* 1 << 128 */ + unsigned long r0, r1, r2, r3, r4; + unsigned long s1, s2, s3, s4; + unsigned long h0, h1, h2, h3, h4; + unsigned long long d0, d1, d2, d3, d4; + unsigned long c; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + r3 = st->r[3]; + r4 = st->r[4]; + + s1 = r1 * 5; + s2 = r2 * 5; + s3 = r3 * 5; + s4 = r4 * 5; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + while (bytes >= poly1305_block_size) { + /* h += m[i] */ + h0 += (LOAD32_LE(m + 0)) & 0x3ffffff; + h1 += (LOAD32_LE(m + 3) >> 2) & 0x3ffffff; + h2 += (LOAD32_LE(m + 6) >> 4) & 0x3ffffff; + h3 += (LOAD32_LE(m + 9) >> 6) & 0x3ffffff; + h4 += (LOAD32_LE(m + 12) >> 8) | hibit; + + /* h *= r */ + d0 = ((unsigned long long) h0 * r0) + ((unsigned long long) h1 * s4) + + ((unsigned long long) h2 * s3) + ((unsigned long long) h3 * s2) + + ((unsigned long long) h4 * s1); + d1 = ((unsigned long long) h0 * r1) + ((unsigned long long) h1 * r0) + + ((unsigned long long) h2 * s4) + ((unsigned long long) h3 * s3) + + ((unsigned long long) h4 * s2); + d2 = ((unsigned long long) h0 * r2) + ((unsigned long long) h1 * r1) + + ((unsigned long long) h2 * r0) + ((unsigned long long) h3 * s4) + + ((unsigned long long) h4 * s3); + d3 = ((unsigned long long) h0 * r3) + ((unsigned long long) h1 * r2) + + ((unsigned long long) h2 * r1) + ((unsigned long long) h3 * r0) + + ((unsigned long long) h4 * s4); + d4 = ((unsigned long long) h0 * r4) + ((unsigned long long) h1 * r3) + + ((unsigned long long) h2 * r2) + ((unsigned long long) h3 * r1) + + ((unsigned long long) h4 * r0); + + /* (partial) h %= p */ + c = (unsigned long) (d0 >> 26); + h0 = (unsigned long) d0 & 0x3ffffff; + d1 += c; + c = (unsigned long) (d1 >> 26); + h1 = (unsigned long) d1 & 0x3ffffff; + d2 += c; + c = (unsigned long) (d2 >> 26); + h2 = (unsigned long) d2 & 0x3ffffff; + d3 += c; + c = (unsigned long) (d3 >> 26); + h3 = (unsigned long) d3 & 0x3ffffff; + d4 += c; + c = (unsigned long) (d4 >> 26); + h4 = (unsigned long) d4 & 0x3ffffff; + h0 += c * 5; + c = (h0 >> 26); + h0 = h0 & 0x3ffffff; + h1 += c; + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; + st->h[3] = h3; + st->h[4] = h4; +} + +static POLY1305_NOINLINE void +poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16]) +{ + unsigned long h0, h1, h2, h3, h4, c; + unsigned long g0, g1, g2, g3, g4; + unsigned long long f; + unsigned long mask; + + /* process the remaining block */ + if (st->leftover) { + unsigned long long i = st->leftover; + + st->buffer[i++] = 1; + for (; i < poly1305_block_size; i++) { + st->buffer[i] = 0; + } + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + h3 = st->h[3]; + h4 = st->h[4]; + + c = h1 >> 26; + h1 = h1 & 0x3ffffff; + h2 += c; + c = h2 >> 26; + h2 = h2 & 0x3ffffff; + h3 += c; + c = h3 >> 26; + h3 = h3 & 0x3ffffff; + h4 += c; + c = h4 >> 26; + h4 = h4 & 0x3ffffff; + h0 += c * 5; + c = h0 >> 26; + h0 = h0 & 0x3ffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = g0 >> 26; + g0 &= 0x3ffffff; + g1 = h1 + c; + c = g1 >> 26; + g1 &= 0x3ffffff; + g2 = h2 + c; + c = g2 >> 26; + g2 &= 0x3ffffff; + g3 = h3 + c; + c = g3 >> 26; + g3 &= 0x3ffffff; + g4 = h4 + c - (1UL << 26); + + /* select h if h < p, or h + -p if h >= p */ + mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1; + g0 &= mask; + g1 &= mask; + g2 &= mask; + g3 &= mask; + g4 &= mask; + 
mask = ~mask; + + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + h4 = (h4 & mask) | g4; + + /* h = h % (2^128) */ + h0 = ((h0) | (h1 << 26)) & 0xffffffff; + h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; + h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; + h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; + + /* mac = (h + pad) % (2^128) */ + f = (unsigned long long) h0 + st->pad[0]; + h0 = (unsigned long) f; + f = (unsigned long long) h1 + st->pad[1] + (f >> 32); + h1 = (unsigned long) f; + f = (unsigned long long) h2 + st->pad[2] + (f >> 32); + h2 = (unsigned long) f; + f = (unsigned long long) h3 + st->pad[3] + (f >> 32); + h3 = (unsigned long) f; + + STORE32_LE(mac + 0, (uint32_t) h0); + STORE32_LE(mac + 4, (uint32_t) h1); + STORE32_LE(mac + 8, (uint32_t) h2); + STORE32_LE(mac + 12, (uint32_t) h3); + + /* zero out the state */ + sodium_memzero((void *) st, sizeof *st); +} diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h new file mode 100644 index 0000000000..e0ed754779 --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h @@ -0,0 +1,220 @@ +/* + poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication + and 128 bit addition +*/ + +#include "private/common.h" + +#define MUL(out, x, y) out = ((uint128_t) x * y) +#define ADD(out, in) out += in +#define ADDLO(out, in) out += in +#define SHR(in, shift) (unsigned long long) (in >> (shift)) +#define LO(in) (unsigned long long) (in) + +#if defined(_MSC_VER) +# define POLY1305_NOINLINE __declspec(noinline) +#elif defined(__GNUC__) +# define POLY1305_NOINLINE __attribute__((noinline)) +#else +# define POLY1305_NOINLINE +#endif + +#define poly1305_block_size 16 + +/* 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long) */ +typedef struct poly1305_state_internal_t { + unsigned long long r[3]; + unsigned long long h[3]; + unsigned long long pad[2]; + unsigned long long leftover; + unsigned char buffer[poly1305_block_size]; + unsigned char final; +} poly1305_state_internal_t; + +static void +poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32]) +{ + unsigned long long t0, t1; + + /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ + t0 = LOAD64_LE(&key[0]); + t1 = LOAD64_LE(&key[8]); + + /* wiped after finalization */ + st->r[0] = (t0) &0xffc0fffffff; + st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; + st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f; + + /* h = 0 */ + st->h[0] = 0; + st->h[1] = 0; + st->h[2] = 0; + + /* save pad for later */ + st->pad[0] = LOAD64_LE(&key[16]); + st->pad[1] = LOAD64_LE(&key[24]); + + st->leftover = 0; + st->final = 0; +} + +static void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long bytes) +{ + const unsigned long long hibit = + (st->final) ? 
0ULL : (1ULL << 40); /* 1 << 128 */ + unsigned long long r0, r1, r2; + unsigned long long s1, s2; + unsigned long long h0, h1, h2; + unsigned long long c; + uint128_t d0, d1, d2, d; + + r0 = st->r[0]; + r1 = st->r[1]; + r2 = st->r[2]; + + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + while (bytes >= poly1305_block_size) { + unsigned long long t0, t1; + + /* h += m[i] */ + t0 = LOAD64_LE(&m[0]); + t1 = LOAD64_LE(&m[8]); + + h0 += ((t0) &0xfffffffffff); + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); + h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit; + + /* h *= r */ + MUL(d0, h0, r0); + MUL(d, h1, s2); + ADD(d0, d); + MUL(d, h2, s1); + ADD(d0, d); + MUL(d1, h0, r1); + MUL(d, h1, r0); + ADD(d1, d); + MUL(d, h2, s2); + ADD(d1, d); + MUL(d2, h0, r2); + MUL(d, h1, r1); + ADD(d2, d); + MUL(d, h2, r0); + ADD(d2, d); + + /* (partial) h %= p */ + c = SHR(d0, 44); + h0 = LO(d0) & 0xfffffffffff; + ADDLO(d1, c); + c = SHR(d1, 44); + h1 = LO(d1) & 0xfffffffffff; + ADDLO(d2, c); + c = SHR(d2, 42); + h2 = LO(d2) & 0x3ffffffffff; + h0 += c * 5; + c = (h0 >> 44); + h0 = h0 & 0xfffffffffff; + h1 += c; + + m += poly1305_block_size; + bytes -= poly1305_block_size; + } + + st->h[0] = h0; + st->h[1] = h1; + st->h[2] = h2; +} + +static POLY1305_NOINLINE void +poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16]) +{ + unsigned long long h0, h1, h2, c; + unsigned long long g0, g1, g2; + unsigned long long t0, t1; + + /* process the remaining block */ + if (st->leftover) { + unsigned long long i = st->leftover; + + st->buffer[i] = 1; + + for (i = i + 1; i < poly1305_block_size; i++) { + st->buffer[i] = 0; + } + st->final = 1; + poly1305_blocks(st, st->buffer, poly1305_block_size); + } + + /* fully carry h */ + h0 = st->h[0]; + h1 = st->h[1]; + h2 = st->h[2]; + + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + + /* compute h + -p */ + g0 = h0 + 5; + c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - (1ULL << 42); + + /* select h if h < p, or h + -p if h >= p */ + c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1; + g0 &= c; + g1 &= c; + g2 &= c; + c = ~c; + h0 = (h0 & c) | g0; + h1 = (h1 & c) | g1; + h2 = (h2 & c) | g2; + + /* h = (h + pad) */ + t0 = st->pad[0]; + t1 = st->pad[1]; + + h0 += ((t0) &0xfffffffffff); + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += (((t1 >> 24)) & 0x3ffffffffff) + c; + h2 &= 0x3ffffffffff; + + /* mac = h % (2^128) */ + h0 = ((h0) | (h1 << 44)); + h1 = ((h1 >> 20) | (h2 << 24)); + + STORE64_LE(&mac[0], h0); + STORE64_LE(&mac[8], h1); + + /* zero out the state */ + sodium_memzero((void *) st, sizeof *st); +} diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c new file mode 100644 index 0000000000..d5e2efa297 --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c @@ -0,0 +1,90 @@ + +#include "onetimeauth_poly1305.h" +#include "crypto_onetimeauth_poly1305.h" +#include "private/common.h" +#include "private/implementations.h" +#include "randombytes.h" +#include 
"runtime.h" + +#include "donna/poly1305_donna.h" +#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H) +# include "sse2/poly1305_sse2.h" +#endif + +static const crypto_onetimeauth_poly1305_implementation *implementation = + &crypto_onetimeauth_poly1305_donna_implementation; + +int +crypto_onetimeauth_poly1305(unsigned char *out, const unsigned char *in, + unsigned long long inlen, const unsigned char *k) +{ + return implementation->onetimeauth(out, in, inlen, k); +} + +int +crypto_onetimeauth_poly1305_verify(const unsigned char *h, + const unsigned char *in, + unsigned long long inlen, + const unsigned char *k) +{ + return implementation->onetimeauth_verify(h, in, inlen, k); +} + +int +crypto_onetimeauth_poly1305_init(crypto_onetimeauth_poly1305_state *state, + const unsigned char *key) +{ + return implementation->onetimeauth_init(state, key); +} + +int +crypto_onetimeauth_poly1305_update(crypto_onetimeauth_poly1305_state *state, + const unsigned char *in, + unsigned long long inlen) +{ + return implementation->onetimeauth_update(state, in, inlen); +} + +int +crypto_onetimeauth_poly1305_final(crypto_onetimeauth_poly1305_state *state, + unsigned char *out) +{ + return implementation->onetimeauth_final(state, out); +} + +size_t +crypto_onetimeauth_poly1305_bytes(void) +{ + return crypto_onetimeauth_poly1305_BYTES; +} + +size_t +crypto_onetimeauth_poly1305_keybytes(void) +{ + return crypto_onetimeauth_poly1305_KEYBYTES; +} + +size_t +crypto_onetimeauth_poly1305_statebytes(void) +{ + return sizeof(crypto_onetimeauth_poly1305_state); +} + +void +crypto_onetimeauth_poly1305_keygen( + unsigned char k[crypto_onetimeauth_poly1305_KEYBYTES]) +{ + randombytes_buf(k, crypto_onetimeauth_poly1305_KEYBYTES); +} + +int +_crypto_onetimeauth_poly1305_pick_best_implementation(void) +{ + implementation = &crypto_onetimeauth_poly1305_donna_implementation; +#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H) + if (sodium_runtime_has_sse2()) { + implementation = &crypto_onetimeauth_poly1305_sse2_implementation; + } +#endif + return 0; +} diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h new file mode 100644 index 0000000000..243eadd50b --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h @@ -0,0 +1,21 @@ + +#ifndef onetimeauth_poly1305_H +#define onetimeauth_poly1305_H + +#include "crypto_onetimeauth_poly1305.h" + +typedef struct crypto_onetimeauth_poly1305_implementation { + int (*onetimeauth)(unsigned char *out, const unsigned char *in, + unsigned long long inlen, const unsigned char *k); + int (*onetimeauth_verify)(const unsigned char *h, const unsigned char *in, + unsigned long long inlen, const unsigned char *k); + int (*onetimeauth_init)(crypto_onetimeauth_poly1305_state *state, + const unsigned char * key); + int (*onetimeauth_update)(crypto_onetimeauth_poly1305_state *state, + const unsigned char * in, + unsigned long long inlen); + int (*onetimeauth_final)(crypto_onetimeauth_poly1305_state *state, + unsigned char * out); +} crypto_onetimeauth_poly1305_implementation; + +#endif diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c new file mode 100644 index 0000000000..022f15249b --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c @@ -0,0 +1,949 @@ + +#include <stdint.h> +#include <string.h> + +#include 
"../onetimeauth_poly1305.h" +#include "crypto_verify_16.h" +#include "poly1305_sse2.h" +#include "private/common.h" +#include "private/sse2_64_32.h" +#include "utils.h" + +#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H) + +# ifdef __GNUC__ +# pragma GCC target("sse2") +# endif + +# include <emmintrin.h> + +typedef __m128i xmmi; + +# if defined(_MSC_VER) +# define POLY1305_NOINLINE __declspec(noinline) +# elif defined(__GNUC__) +# define POLY1305_NOINLINE __attribute__((noinline)) +# else +# define POLY1305_NOINLINE +# endif + +# define poly1305_block_size 32 + +enum poly1305_state_flags_t { + poly1305_started = 1, + poly1305_final_shift8 = 4, + poly1305_final_shift16 = 8, + poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */ + poly1305_final_r_1 = 32 /* use [r,1] for the final block */ +}; + +typedef struct poly1305_state_internal_t { + union { + uint64_t h[3]; + uint32_t hh[10]; + } H; /* 40 bytes */ + uint32_t R[5]; /* 20 bytes */ + uint32_t R2[5]; /* 20 bytes */ + uint32_t R4[5]; /* 20 bytes */ + uint64_t pad[2]; /* 16 bytes */ + uint64_t flags; /* 8 bytes */ + unsigned long long leftover; /* 8 bytes */ + unsigned char buffer[poly1305_block_size]; /* 32 bytes */ +} poly1305_state_internal_t; /* 164 bytes total */ + +/* + * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are + * totally fine, even though this intrinsic requires a __m128i* input. + * This confuses dynamic analysis, so force alignment, only in debug mode. + */ +# ifdef DEBUG +static xmmi +_fakealign_mm_loadl_epi64(const void *m) +{ + xmmi tmp; + memcpy(&tmp, m, 8); + + return _mm_loadl_epi64(&tmp); +} +# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X) +#endif + +/* copy 0-31 bytes */ +static inline void +poly1305_block_copy31(unsigned char *dst, const unsigned char *src, + unsigned long long bytes) +{ + if (bytes & 16) { + _mm_store_si128((xmmi *) (void *) dst, + _mm_loadu_si128((const xmmi *) (const void *) src)); + src += 16; + dst += 16; + } + if (bytes & 8) { + memcpy(dst, src, 8); + src += 8; + dst += 8; + } + if (bytes & 4) { + memcpy(dst, src, 4); + src += 4; + dst += 4; + } + if (bytes & 2) { + memcpy(dst, src, 2); + src += 2; + dst += 2; + } + if (bytes & 1) { + *dst = *src; + } +} + +static POLY1305_NOINLINE void +poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32], + unsigned long long bytes) +{ + uint32_t *R; + uint128_t d[3]; + uint64_t r0, r1, r2; + uint64_t rt0, rt1, rt2, st2, c; + uint64_t t0, t1; + unsigned long long i; + + if (!bytes) { + bytes = ~(unsigned long long) 0; + } + /* H = 0 */ + _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128()); + + /* clamp key */ + memcpy(&t0, key, 8); + memcpy(&t1, key + 8, 8); + r0 = t0 & 0xffc0fffffff; + t0 >>= 44; + t0 |= t1 << 20; + r1 = t0 & 0xfffffc0ffff; + t1 >>= 24; + r2 = t1 & 0x00ffffffc0f; + + /* r^1 */ + R = st->R; + R[0] = (uint32_t)(r0) &0x3ffffff; + R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; + R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; + R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; + R[4] = (uint32_t)((r2 >> 16)); + + /* save pad */ + memcpy(&st->pad[0], key + 16, 8); + memcpy(&st->pad[1], key + 24, 8); + + rt0 = r0; + rt1 = r1; + rt2 = r2; + + /* r^2, r^4 */ + for (i = 0; i < 2; i++) { + if (i == 0) { + R = st->R2; + if (bytes <= 16) { + break; + } + } else if (i == 1) { + R = st->R4; + if (bytes < 
96) { + break; + } + } + st2 = rt2 * (5 << 2); + + d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2); + d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1); + d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0); + + rt0 = (uint64_t) d[0] & 0xfffffffffff; + c = (uint64_t)(d[0] >> 44); + d[1] += c; + + rt1 = (uint64_t) d[1] & 0xfffffffffff; + c = (uint64_t)(d[1] >> 44); + d[2] += c; + + rt2 = (uint64_t) d[2] & 0x3ffffffffff; + c = (uint64_t)(d[2] >> 42); + rt0 += c * 5; + c = (rt0 >> 44); + rt0 = rt0 & 0xfffffffffff; + rt1 += c; + c = (rt1 >> 44); + rt1 = rt1 & 0xfffffffffff; + rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and + is safe to multiply with */ + + R[0] = (uint32_t)(rt0) &0x3ffffff; + R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff; + R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff; + R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff; + R[4] = (uint32_t)((rt2 >> 16)); + } + st->flags = 0; + st->leftover = 0U; +} + +static POLY1305_NOINLINE void +poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long bytes) +{ + CRYPTO_ALIGN(64) + xmmi HIBIT = + _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0)); + const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1), + _MM_SHUFFLE(1, 0, 1, 0)); + const xmmi FIVE = + _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0)); + xmmi H0, H1, H2, H3, H4; + xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8; + xmmi M0, M1, M2, M3, M4; + xmmi M5, M6, M7, M8; + xmmi C1, C2; + xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24; + xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44; + + if (st->flags & poly1305_final_shift8) { + HIBIT = _mm_srli_si128(HIBIT, 8); + } + if (st->flags & poly1305_final_shift16) { + HIBIT = _mm_setzero_si128(); + } + if (!(st->flags & poly1305_started)) { + /* H = [Mx,My] */ + T5 = _mm_unpacklo_epi64( + _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)), + _mm_loadl_epi64((const xmmi *) (const void *) (m + 16))); + T6 = _mm_unpacklo_epi64( + _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)), + _mm_loadl_epi64((const xmmi *) (const void *) (m + 24))); + H0 = _mm_and_si128(MMASK, T5); + H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + H2 = _mm_and_si128(MMASK, T5); + H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + H4 = _mm_srli_epi64(T6, 40); + H4 = _mm_or_si128(H4, HIBIT); + m += 32; + bytes -= 32; + st->flags |= poly1305_started; + } else { + T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]); + T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]); + T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]); + H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0)); + H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2)); + H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0)); + H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2)); + H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0)); + } + if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) { + if (st->flags & poly1305_final_r2_r) { + /* use [r^2, r] */ + T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]); + T3 = _mm_cvtsi32_si128(st->R[4]); + T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]); + T1 = _mm_cvtsi32_si128(st->R2[4]); + T4 = _mm_unpacklo_epi32(T0, T2); + T5 = _mm_unpackhi_epi32(T0, T2); + R24 = _mm_unpacklo_epi64(T1, T3); + } else { + /* use [r^1, 1] */ + T0 = _mm_loadu_si128((const xmmi *) 
(const void *) &st->R[0]); + T1 = _mm_cvtsi32_si128(st->R[4]); + T2 = _mm_cvtsi32_si128(1); + T4 = _mm_unpacklo_epi32(T0, T2); + T5 = _mm_unpackhi_epi32(T0, T2); + R24 = T1; + } + R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0)); + R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2)); + R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0)); + R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2)); + } else { + /* use [r^2, r^2] */ + T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]); + T1 = _mm_cvtsi32_si128(st->R2[4]); + R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0)); + R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1)); + R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2)); + R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3)); + R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0)); + } + S21 = _mm_mul_epu32(R21, FIVE); + S22 = _mm_mul_epu32(R22, FIVE); + S23 = _mm_mul_epu32(R23, FIVE); + S24 = _mm_mul_epu32(R24, FIVE); + + if (bytes >= 64) { + T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]); + T1 = _mm_cvtsi32_si128(st->R4[4]); + R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0)); + R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1)); + R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2)); + R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3)); + R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0)); + S41 = _mm_mul_epu32(R41, FIVE); + S42 = _mm_mul_epu32(R42, FIVE); + S43 = _mm_mul_epu32(R43, FIVE); + S44 = _mm_mul_epu32(R44, FIVE); + + while (bytes >= 64) { + xmmi v00, v01, v02, v03, v04; + xmmi v10, v11, v12, v13, v14; + xmmi v20, v21, v22, v23, v24; + xmmi v30, v31, v32, v33, v34; + xmmi v40, v41, v42, v43, v44; + xmmi T14, T15; + + /* H *= [r^4,r^4], preload [Mx,My] */ + T15 = S42; + T0 = H4; + T0 = _mm_mul_epu32(T0, S41); + v01 = H3; + v01 = _mm_mul_epu32(v01, T15); + T14 = S43; + T1 = H4; + T1 = _mm_mul_epu32(T1, T15); + v11 = H3; + v11 = _mm_mul_epu32(v11, T14); + T2 = H4; + T2 = _mm_mul_epu32(T2, T14); + T0 = _mm_add_epi64(T0, v01); + T15 = S44; + v02 = H2; + v02 = _mm_mul_epu32(v02, T14); + T3 = H4; + T3 = _mm_mul_epu32(T3, T15); + T1 = _mm_add_epi64(T1, v11); + v03 = H1; + v03 = _mm_mul_epu32(v03, T15); + v12 = H2; + v12 = _mm_mul_epu32(v12, T15); + T0 = _mm_add_epi64(T0, v02); + T14 = R40; + v21 = H3; + v21 = _mm_mul_epu32(v21, T15); + v31 = H3; + v31 = _mm_mul_epu32(v31, T14); + T0 = _mm_add_epi64(T0, v03); + T4 = H4; + T4 = _mm_mul_epu32(T4, T14); + T1 = _mm_add_epi64(T1, v12); + v04 = H0; + v04 = _mm_mul_epu32(v04, T14); + T2 = _mm_add_epi64(T2, v21); + v13 = H1; + v13 = _mm_mul_epu32(v13, T14); + T3 = _mm_add_epi64(T3, v31); + T15 = R41; + v22 = H2; + v22 = _mm_mul_epu32(v22, T14); + v32 = H2; + v32 = _mm_mul_epu32(v32, T15); + T0 = _mm_add_epi64(T0, v04); + v41 = H3; + v41 = _mm_mul_epu32(v41, T15); + T1 = _mm_add_epi64(T1, v13); + v14 = H0; + v14 = _mm_mul_epu32(v14, T15); + T2 = _mm_add_epi64(T2, v22); + T14 = R42; + T5 = _mm_unpacklo_epi64( + _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)), + _mm_loadl_epi64((const xmmi *) (const void *) (m + 16))); + v23 = H1; + v23 = _mm_mul_epu32(v23, T15); + T3 = _mm_add_epi64(T3, v32); + v33 = H1; + v33 = _mm_mul_epu32(v33, T14); + T4 = _mm_add_epi64(T4, v41); + v42 = H2; + v42 = _mm_mul_epu32(v42, T14); + T1 = _mm_add_epi64(T1, v14); + T15 = R43; + T6 = _mm_unpacklo_epi64( + _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)), + _mm_loadl_epi64((const xmmi *) (const void *) (m + 24))); + v24 = H0; + v24 = _mm_mul_epu32(v24, T14); + T2 = _mm_add_epi64(T2, v23); + v34 = H0; + v34 
= _mm_mul_epu32(v34, T15); + T3 = _mm_add_epi64(T3, v33); + M0 = _mm_and_si128(MMASK, T5); + v43 = H1; + v43 = _mm_mul_epu32(v43, T15); + T4 = _mm_add_epi64(T4, v42); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + v44 = H0; + v44 = _mm_mul_epu32(v44, R44); + T2 = _mm_add_epi64(T2, v24); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + T3 = _mm_add_epi64(T3, v34); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14)); + T4 = _mm_add_epi64(T4, v43); + M2 = _mm_and_si128(MMASK, T5); + T4 = _mm_add_epi64(T4, v44); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + /* H += [Mx',My'] */ + T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32)); + T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48)); + T7 = _mm_unpacklo_epi32(T5, T6); + T8 = _mm_unpackhi_epi32(T5, T6); + M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128()); + M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128()); + M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128()); + M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128()); + M6 = _mm_slli_epi64(M6, 6); + M7 = _mm_slli_epi64(M7, 12); + M8 = _mm_slli_epi64(M8, 18); + T0 = _mm_add_epi64(T0, M5); + T1 = _mm_add_epi64(T1, M6); + T2 = _mm_add_epi64(T2, M7); + T3 = _mm_add_epi64(T3, M8); + T4 = _mm_add_epi64(T4, HIBIT); + + /* H += [Mx,My]*[r^2,r^2] */ + T15 = S22; + v00 = M4; + v00 = _mm_mul_epu32(v00, S21); + v01 = M3; + v01 = _mm_mul_epu32(v01, T15); + T14 = S23; + v10 = M4; + v10 = _mm_mul_epu32(v10, T15); + v11 = M3; + v11 = _mm_mul_epu32(v11, T14); + T0 = _mm_add_epi64(T0, v00); + v20 = M4; + v20 = _mm_mul_epu32(v20, T14); + T0 = _mm_add_epi64(T0, v01); + T15 = S24; + v02 = M2; + v02 = _mm_mul_epu32(v02, T14); + T1 = _mm_add_epi64(T1, v10); + v30 = M4; + v30 = _mm_mul_epu32(v30, T15); + T1 = _mm_add_epi64(T1, v11); + v03 = M1; + v03 = _mm_mul_epu32(v03, T15); + T2 = _mm_add_epi64(T2, v20); + v12 = M2; + v12 = _mm_mul_epu32(v12, T15); + T0 = _mm_add_epi64(T0, v02); + T14 = R20; + v21 = M3; + v21 = _mm_mul_epu32(v21, T15); + T3 = _mm_add_epi64(T3, v30); + v31 = M3; + v31 = _mm_mul_epu32(v31, T14); + T0 = _mm_add_epi64(T0, v03); + v40 = M4; + v40 = _mm_mul_epu32(v40, T14); + T1 = _mm_add_epi64(T1, v12); + v04 = M0; + v04 = _mm_mul_epu32(v04, T14); + T2 = _mm_add_epi64(T2, v21); + v13 = M1; + v13 = _mm_mul_epu32(v13, T14); + T3 = _mm_add_epi64(T3, v31); + T15 = R21; + v22 = M2; + v22 = _mm_mul_epu32(v22, T14); + T4 = _mm_add_epi64(T4, v40); + v32 = M2; + v32 = _mm_mul_epu32(v32, T15); + T0 = _mm_add_epi64(T0, v04); + v41 = M3; + v41 = _mm_mul_epu32(v41, T15); + T1 = _mm_add_epi64(T1, v13); + v14 = M0; + v14 = _mm_mul_epu32(v14, T15); + T2 = _mm_add_epi64(T2, v22); + T14 = R22; + v23 = M1; + v23 = _mm_mul_epu32(v23, T15); + T3 = _mm_add_epi64(T3, v32); + v33 = M1; + v33 = _mm_mul_epu32(v33, T14); + T4 = _mm_add_epi64(T4, v41); + v42 = M2; + v42 = _mm_mul_epu32(v42, T14); + T1 = _mm_add_epi64(T1, v14); + T15 = R23; + v24 = M0; + v24 = _mm_mul_epu32(v24, T14); + T2 = _mm_add_epi64(T2, v23); + v34 = M0; + v34 = _mm_mul_epu32(v34, T15); + T3 = _mm_add_epi64(T3, v33); + v43 = M1; + v43 = _mm_mul_epu32(v43, T15); + T4 = _mm_add_epi64(T4, v42); + v44 = M0; + v44 = _mm_mul_epu32(v44, R24); + T2 = _mm_add_epi64(T2, v24); + T3 = _mm_add_epi64(T3, v34); + T4 = _mm_add_epi64(T4, v43); + T4 = _mm_add_epi64(T4, v44); + + /* reduce */ + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = 
_mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */ + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + m += 64; + bytes -= 64; + } + } + + if (bytes >= 32) { + xmmi v01, v02, v03, v04; + xmmi v11, v12, v13, v14; + xmmi v21, v22, v23, v24; + xmmi v31, v32, v33, v34; + xmmi v41, v42, v43, v44; + xmmi T14, T15; + + /* H *= [r^2,r^2] */ + T15 = S22; + T0 = H4; + T0 = _mm_mul_epu32(T0, S21); + v01 = H3; + v01 = _mm_mul_epu32(v01, T15); + T14 = S23; + T1 = H4; + T1 = _mm_mul_epu32(T1, T15); + v11 = H3; + v11 = _mm_mul_epu32(v11, T14); + T2 = H4; + T2 = _mm_mul_epu32(T2, T14); + T0 = _mm_add_epi64(T0, v01); + T15 = S24; + v02 = H2; + v02 = _mm_mul_epu32(v02, T14); + T3 = H4; + T3 = _mm_mul_epu32(T3, T15); + T1 = _mm_add_epi64(T1, v11); + v03 = H1; + v03 = _mm_mul_epu32(v03, T15); + v12 = H2; + v12 = _mm_mul_epu32(v12, T15); + T0 = _mm_add_epi64(T0, v02); + T14 = R20; + v21 = H3; + v21 = _mm_mul_epu32(v21, T15); + v31 = H3; + v31 = _mm_mul_epu32(v31, T14); + T0 = _mm_add_epi64(T0, v03); + T4 = H4; + T4 = _mm_mul_epu32(T4, T14); + T1 = _mm_add_epi64(T1, v12); + v04 = H0; + v04 = _mm_mul_epu32(v04, T14); + T2 = _mm_add_epi64(T2, v21); + v13 = H1; + v13 = _mm_mul_epu32(v13, T14); + T3 = _mm_add_epi64(T3, v31); + T15 = R21; + v22 = H2; + v22 = _mm_mul_epu32(v22, T14); + v32 = H2; + v32 = _mm_mul_epu32(v32, T15); + T0 = _mm_add_epi64(T0, v04); + v41 = H3; + v41 = _mm_mul_epu32(v41, T15); + T1 = _mm_add_epi64(T1, v13); + v14 = H0; + v14 = _mm_mul_epu32(v14, T15); + T2 = _mm_add_epi64(T2, v22); + T14 = R22; + v23 = H1; + v23 = _mm_mul_epu32(v23, T15); + T3 = _mm_add_epi64(T3, v32); + v33 = H1; + v33 = _mm_mul_epu32(v33, T14); + T4 = _mm_add_epi64(T4, v41); + v42 = H2; + v42 = _mm_mul_epu32(v42, T14); + T1 = _mm_add_epi64(T1, v14); + T15 = R23; + v24 = H0; + v24 = _mm_mul_epu32(v24, T14); + T2 = _mm_add_epi64(T2, v23); + v34 = H0; + v34 = _mm_mul_epu32(v34, T15); + T3 = _mm_add_epi64(T3, v33); + v43 = H1; + v43 = _mm_mul_epu32(v43, T15); + T4 = _mm_add_epi64(T4, v42); + v44 = H0; + v44 = _mm_mul_epu32(v44, R24); + T2 = _mm_add_epi64(T2, v24); + T3 = _mm_add_epi64(T3, v34); + T4 = _mm_add_epi64(T4, v43); + T4 = _mm_add_epi64(T4, v44); + + /* H += [Mx,My] */ + if (m) { + T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0)); + T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16)); + T7 = _mm_unpacklo_epi32(T5, T6); + T8 = _mm_unpackhi_epi32(T5, T6); + M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128()); + M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128()); + M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128()); + M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128()); + M1 = _mm_slli_epi64(M1, 6); + M2 = _mm_slli_epi64(M2, 12); + M3 = _mm_slli_epi64(M3, 18); + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, HIBIT); + } + + /* reduce */ + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 
26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + /* H = (H*[r^2,r^2] + [Mx,My]) */ + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + } + + if (m) { + T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0)); + T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0)); + T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0)); + T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0)); + T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0)); + T0 = _mm_unpacklo_epi64(T0, T1); + T1 = _mm_unpacklo_epi64(T2, T3); + _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0); + _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1); + _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4); + } else { + uint32_t t0, t1, t2, t3, t4, b; + uint64_t h0, h1, h2, g0, g1, g2, c, nc; + + /* H = H[0]+H[1] */ + T0 = H0; + T1 = H1; + T2 = H2; + T3 = H3; + T4 = H4; + + T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); + T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); + T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); + T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); + T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); + + t0 = _mm_cvtsi128_si32(T0); + b = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = _mm_cvtsi128_si32(T1) + b; + b = (t1 >> 26); + t1 &= 0x3ffffff; + t2 = _mm_cvtsi128_si32(T2) + b; + b = (t2 >> 26); + t2 &= 0x3ffffff; + t3 = _mm_cvtsi128_si32(T3) + b; + b = (t3 >> 26); + t3 &= 0x3ffffff; + t4 = _mm_cvtsi128_si32(T4) + b; + + /* everything except t4 is in range, so this is all safe */ + h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull; + h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) | + ((uint64_t) t3 << 34)) & + 0xfffffffffffull; + h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16)); + + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + + g0 = h0 + 5; + c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t) 1 << 42); + + c = (g2 >> 63) - 1; + nc = ~c; + h0 = (h0 & nc) | (g0 & c); + h1 = (h1 & nc) | (g1 & c); + h2 = (h2 & nc) | (g2 & c); + + st->H.h[0] = h0; + st->H.h[1] = h1; + st->H.h[2] = h2; + } +} + +static void +poly1305_update(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long bytes) +{ + unsigned long long i; + + /* handle leftover */ + if (st->leftover) { + unsigned long long want = (poly1305_block_size - st->leftover); + + if (want > bytes) { + want = bytes; + } + for (i = 0; i < want; i++) { + st->buffer[st->leftover + i] = m[i]; + } + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < poly1305_block_size) { + return; + } + poly1305_blocks(st, st->buffer, poly1305_block_size); + st->leftover = 0; + } + + /* process full blocks */ + if (bytes >= poly1305_block_size) { + unsigned long long want = (bytes & ~(poly1305_block_size - 1)); + + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + /* store leftover */ + if (bytes) { + for (i = 0; i < bytes; 
i++) { + st->buffer[st->leftover + i] = m[i]; + } + st->leftover += bytes; + } +} + +static POLY1305_NOINLINE void +poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m, + unsigned long long leftover, unsigned char mac[16]) +{ + uint64_t h0, h1, h2; + + if (leftover) { + CRYPTO_ALIGN(16) unsigned char final[32] = { 0 }; + + poly1305_block_copy31(final, m, leftover); + if (leftover != 16) { + final[leftover] = 1; + } + st->flags |= + (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16; + poly1305_blocks(st, final, 32); + } + + if (st->flags & poly1305_started) { + /* finalize, H *= [r^2,r], or H *= [r,1] */ + if (!leftover || (leftover > 16)) { + st->flags |= poly1305_final_r2_r; + } else { + st->flags |= poly1305_final_r_1; + } + poly1305_blocks(st, NULL, 32); + } + + h0 = st->H.h[0]; + h1 = st->H.h[1]; + h2 = st->H.h[2]; + + /* pad */ + h0 = ((h0) | (h1 << 44)); + h1 = ((h1 >> 20) | (h2 << 24)); +#ifdef HAVE_AMD64_ASM + __asm__ __volatile__( + "addq %2, %0 ;\n" + "adcq %3, %1 ;\n" + : "+r"(h0), "+r"(h1) + : "r"(st->pad[0]), "r"(st->pad[1]) + : "flags", "cc"); +#else + { + uint128_t h; + + memcpy(&h, &st->pad[0], 16); + h += ((uint128_t) h1 << 64) | h0; + h0 = (uint64_t) h; + h1 = (uint64_t)(h >> 64); + } +#endif + _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128()); + _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128()); + + memcpy(&mac[0], &h0, 8); + memcpy(&mac[8], &h1, 8); + + sodium_memzero((void *) st, sizeof *st); +} + +static void +poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16]) +{ + poly1305_finish_ext(st, st->buffer, st->leftover, mac); +} + +static int +crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state, + const unsigned char *key) +{ + COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >= + sizeof(poly1305_state_internal_t)); + poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_sse2_update( + crypto_onetimeauth_poly1305_state *state, const unsigned char *in, + unsigned long long inlen) +{ + poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state, + unsigned char *out) +{ + poly1305_finish((poly1305_state_internal_t *) (void *) state, out); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m, + unsigned long long inlen, + const unsigned char *key) +{ + CRYPTO_ALIGN(64) poly1305_state_internal_t st; + unsigned long long blocks; + + poly1305_init_ext(&st, key, inlen); + blocks = inlen & ~31; + if (blocks > 0) { + poly1305_blocks(&st, m, blocks); + m += blocks; + inlen -= blocks; + } + poly1305_finish_ext(&st, m, inlen, out); + + return 0; +} + +static int +crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h, + const unsigned char *in, + unsigned long long inlen, + const unsigned char *k) +{ + unsigned char correct[16]; + + crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k); + + return 
crypto_verify_16(h, correct); +} + +struct crypto_onetimeauth_poly1305_implementation + crypto_onetimeauth_poly1305_sse2_implementation = { + SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2, + SODIUM_C99(.onetimeauth_verify =) + crypto_onetimeauth_poly1305_sse2_verify, + SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init, + SODIUM_C99(.onetimeauth_update =) + crypto_onetimeauth_poly1305_sse2_update, + SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final + }; + +#endif diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h new file mode 100644 index 0000000000..9177cad487 --- /dev/null +++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h @@ -0,0 +1,12 @@ +#ifndef poly1305_sse2_H +#define poly1305_sse2_H + +#include <stddef.h> + +#include "../onetimeauth_poly1305.h" +#include "crypto_onetimeauth_poly1305.h" + +extern struct crypto_onetimeauth_poly1305_implementation + crypto_onetimeauth_poly1305_sse2_implementation; + +#endif /* poly1305_sse2_H */ |
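
The streaming interface (init/update/final) added in this commit buffers partial blocks in the internal state and produces the same tag as the one-shot call over the concatenated input; at startup, `_crypto_onetimeauth_poly1305_pick_best_implementation()` switches the function table to the SSE2 backend when the CPU supports it, so callers never choose an implementation themselves. A short sketch of incremental authentication, under the same assumptions as the example above:

```c
#include <sodium.h>

/* Authenticate a message delivered in two chunks; the tag matches what
   crypto_onetimeauth_poly1305() would compute over the concatenation. */
static int
tag_in_two_parts(unsigned char mac[crypto_onetimeauth_poly1305_BYTES],
                 const unsigned char *part1, unsigned long long len1,
                 const unsigned char *part2, unsigned long long len2,
                 const unsigned char key[crypto_onetimeauth_poly1305_KEYBYTES])
{
    crypto_onetimeauth_poly1305_state st;

    if (crypto_onetimeauth_poly1305_init(&st, key) != 0) {
        return -1;
    }
    crypto_onetimeauth_poly1305_update(&st, part1, len1);
    crypto_onetimeauth_poly1305_update(&st, part2, len2);
    return crypto_onetimeauth_poly1305_final(&st, mac);
}
```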