author     aunsane <aunsane@gmail.com>    2017-12-15 01:05:56 +0300
committer  aunsane <aunsane@gmail.com>    2017-12-15 01:05:56 +0300
commit     e124aa3611f38573898aa79c6eabe77bc874e58f
tree       819464260f758bbc002b23c0c8a77f93751dcb42 /libs/libsodium/src/crypto_onetimeauth/poly1305
parent     bbd9647d47f20d10b39570def918a0ac68c305c9
preparing to build tox from sources
Diffstat (limited to 'libs/libsodium/src/crypto_onetimeauth/poly1305')
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c     124
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h      12
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h   235
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h   220
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c      90
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h      21
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c       949
-rw-r--r--  libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h        12
8 files changed, 1663 insertions(+), 0 deletions(-)
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c
new file mode 100644
index 0000000000..e798072f84
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.c
@@ -0,0 +1,124 @@
+
+#include "poly1305_donna.h"
+#include "crypto_verify_16.h"
+#include "private/common.h"
+#include "utils.h"
+
+#ifdef HAVE_TI_MODE
+#include "poly1305_donna64.h"
+#else
+#include "poly1305_donna32.h"
+#endif
+#include "../onetimeauth_poly1305.h"
+
+static void
+poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long bytes)
+{
+ unsigned long long i;
+
+ /* handle leftover */
+ if (st->leftover) {
+ unsigned long long want = (poly1305_block_size - st->leftover);
+
+ if (want > bytes) {
+ want = bytes;
+ }
+ for (i = 0; i < want; i++) {
+ st->buffer[st->leftover + i] = m[i];
+ }
+ bytes -= want;
+ m += want;
+ st->leftover += want;
+ if (st->leftover < poly1305_block_size) {
+ return;
+ }
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ st->leftover = 0;
+ }
+
+ /* process full blocks */
+ if (bytes >= poly1305_block_size) {
+ unsigned long long want = (bytes & ~(poly1305_block_size - 1));
+
+ poly1305_blocks(st, m, want);
+ m += want;
+ bytes -= want;
+ }
+
+ /* store leftover */
+ if (bytes) {
+ for (i = 0; i < bytes; i++) {
+ st->buffer[st->leftover + i] = m[i];
+ }
+ st->leftover += bytes;
+ }
+}
+
+static int
+crypto_onetimeauth_poly1305_donna(unsigned char *out, const unsigned char *m,
+ unsigned long long inlen,
+ const unsigned char *key)
+{
+ CRYPTO_ALIGN(64) poly1305_state_internal_t state;
+
+ poly1305_init(&state, key);
+ poly1305_update(&state, m, inlen);
+ poly1305_finish(&state, out);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_donna_init(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char *key)
+{
+ COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
+ sizeof(poly1305_state_internal_t));
+ poly1305_init((poly1305_state_internal_t *) (void *) state, key);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_donna_update(
+ crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
+ unsigned long long inlen)
+{
+ poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_donna_final(
+ crypto_onetimeauth_poly1305_state *state, unsigned char *out)
+{
+ poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_donna_verify(const unsigned char *h,
+ const unsigned char *in,
+ unsigned long long inlen,
+ const unsigned char *k)
+{
+ unsigned char correct[16];
+
+ crypto_onetimeauth_poly1305_donna(correct, in, inlen, k);
+
+ return crypto_verify_16(h, correct);
+}
+
+struct crypto_onetimeauth_poly1305_implementation
+ crypto_onetimeauth_poly1305_donna_implementation = {
+ SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_donna,
+ SODIUM_C99(.onetimeauth_verify =)
+ crypto_onetimeauth_poly1305_donna_verify,
+ SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_donna_init,
+ SODIUM_C99(.onetimeauth_update =)
+ crypto_onetimeauth_poly1305_donna_update,
+ SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_donna_final
+ };
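For reference only, not part of the patch: the structure above is the function-pointer table that the generic dispatcher in onetimeauth_poly1305.c selects at runtime. A minimal sketch of calling through such a table, assuming the declarations from onetimeauth_poly1305.h and donna/poly1305_donna.h are in scope; the helper name tag_with_impl is purely illustrative.

static int
tag_with_impl(const crypto_onetimeauth_poly1305_implementation *impl,
              unsigned char tag[16], const unsigned char *msg,
              unsigned long long len, const unsigned char key[32])
{
    /* one-shot authentication through the vtable */
    return impl->onetimeauth(tag, msg, len, key);
}

/* e.g. tag_with_impl(&crypto_onetimeauth_poly1305_donna_implementation,
 *                    tag, msg, len, key); */

The SSE2 table added later in this commit can be passed to the same helper unchanged.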
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h
new file mode 100644
index 0000000000..d6474b3af4
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna.h
@@ -0,0 +1,12 @@
+#ifndef poly1305_donna_H
+#define poly1305_donna_H
+
+#include <stddef.h>
+
+#include "../onetimeauth_poly1305.h"
+#include "crypto_onetimeauth_poly1305.h"
+
+extern struct crypto_onetimeauth_poly1305_implementation
+ crypto_onetimeauth_poly1305_donna_implementation;
+
+#endif /* poly1305_donna_H */
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h
new file mode 100644
index 0000000000..bcf447cd7d
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna32.h
@@ -0,0 +1,235 @@
+/*
+ poly1305 implementation using 32 bit * 32 bit = 64 bit multiplication
+ and 64 bit addition
+*/
+
+#if defined(_MSC_VER)
+# define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+# define POLY1305_NOINLINE __attribute__((noinline))
+#else
+# define POLY1305_NOINLINE
+#endif
+
+#include "private/common.h"
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(unsigned long long) + 14*sizeof(unsigned long) */
+typedef struct poly1305_state_internal_t {
+ unsigned long r[5];
+ unsigned long h[5];
+ unsigned long pad[4];
+ unsigned long long leftover;
+ unsigned char buffer[poly1305_block_size];
+ unsigned char final;
+} poly1305_state_internal_t;
+
+static void
+poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
+{
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff - wiped after finalization */
+ st->r[0] = (LOAD32_LE(&key[0])) & 0x3ffffff;
+ st->r[1] = (LOAD32_LE(&key[3]) >> 2) & 0x3ffff03;
+ st->r[2] = (LOAD32_LE(&key[6]) >> 4) & 0x3ffc0ff;
+ st->r[3] = (LOAD32_LE(&key[9]) >> 6) & 0x3f03fff;
+ st->r[4] = (LOAD32_LE(&key[12]) >> 8) & 0x00fffff;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+ st->h[3] = 0;
+ st->h[4] = 0;
+
+ /* save pad for later */
+ st->pad[0] = LOAD32_LE(&key[16]);
+ st->pad[1] = LOAD32_LE(&key[20]);
+ st->pad[2] = LOAD32_LE(&key[24]);
+ st->pad[3] = LOAD32_LE(&key[28]);
+
+ st->leftover = 0;
+ st->final = 0;
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long bytes)
+{
+ const unsigned long hibit = (st->final) ? 0UL : (1UL << 24); /* 1 << 128 */
+ unsigned long r0, r1, r2, r3, r4;
+ unsigned long s1, s2, s3, s4;
+ unsigned long h0, h1, h2, h3, h4;
+ unsigned long long d0, d1, d2, d3, d4;
+ unsigned long c;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+ r4 = st->r[4];
+
+ s1 = r1 * 5;
+ s2 = r2 * 5;
+ s3 = r3 * 5;
+ s4 = r4 * 5;
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (bytes >= poly1305_block_size) {
+ /* h += m[i] */
+ h0 += (LOAD32_LE(m + 0)) & 0x3ffffff;
+ h1 += (LOAD32_LE(m + 3) >> 2) & 0x3ffffff;
+ h2 += (LOAD32_LE(m + 6) >> 4) & 0x3ffffff;
+ h3 += (LOAD32_LE(m + 9) >> 6) & 0x3ffffff;
+ h4 += (LOAD32_LE(m + 12) >> 8) | hibit;
+
+ /* h *= r */
+ d0 = ((unsigned long long) h0 * r0) + ((unsigned long long) h1 * s4) +
+ ((unsigned long long) h2 * s3) + ((unsigned long long) h3 * s2) +
+ ((unsigned long long) h4 * s1);
+ d1 = ((unsigned long long) h0 * r1) + ((unsigned long long) h1 * r0) +
+ ((unsigned long long) h2 * s4) + ((unsigned long long) h3 * s3) +
+ ((unsigned long long) h4 * s2);
+ d2 = ((unsigned long long) h0 * r2) + ((unsigned long long) h1 * r1) +
+ ((unsigned long long) h2 * r0) + ((unsigned long long) h3 * s4) +
+ ((unsigned long long) h4 * s3);
+ d3 = ((unsigned long long) h0 * r3) + ((unsigned long long) h1 * r2) +
+ ((unsigned long long) h2 * r1) + ((unsigned long long) h3 * r0) +
+ ((unsigned long long) h4 * s4);
+ d4 = ((unsigned long long) h0 * r4) + ((unsigned long long) h1 * r3) +
+ ((unsigned long long) h2 * r2) + ((unsigned long long) h3 * r1) +
+ ((unsigned long long) h4 * r0);
+
+ /* (partial) h %= p */
+ c = (unsigned long) (d0 >> 26);
+ h0 = (unsigned long) d0 & 0x3ffffff;
+ d1 += c;
+ c = (unsigned long) (d1 >> 26);
+ h1 = (unsigned long) d1 & 0x3ffffff;
+ d2 += c;
+ c = (unsigned long) (d2 >> 26);
+ h2 = (unsigned long) d2 & 0x3ffffff;
+ d3 += c;
+ c = (unsigned long) (d3 >> 26);
+ h3 = (unsigned long) d3 & 0x3ffffff;
+ d4 += c;
+ c = (unsigned long) (d4 >> 26);
+ h4 = (unsigned long) d4 & 0x3ffffff;
+ h0 += c * 5;
+ c = (h0 >> 26);
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+static POLY1305_NOINLINE void
+poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
+{
+ unsigned long h0, h1, h2, h3, h4, c;
+ unsigned long g0, g1, g2, g3, g4;
+ unsigned long long f;
+ unsigned long mask;
+
+ /* process the remaining block */
+ if (st->leftover) {
+ unsigned long long i = st->leftover;
+
+ st->buffer[i++] = 1;
+ for (; i < poly1305_block_size; i++) {
+ st->buffer[i] = 0;
+ }
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ c = h1 >> 26;
+ h1 = h1 & 0x3ffffff;
+ h2 += c;
+ c = h2 >> 26;
+ h2 = h2 & 0x3ffffff;
+ h3 += c;
+ c = h3 >> 26;
+ h3 = h3 & 0x3ffffff;
+ h4 += c;
+ c = h4 >> 26;
+ h4 = h4 & 0x3ffffff;
+ h0 += c * 5;
+ c = h0 >> 26;
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ c = g0 >> 26;
+ g0 &= 0x3ffffff;
+ g1 = h1 + c;
+ c = g1 >> 26;
+ g1 &= 0x3ffffff;
+ g2 = h2 + c;
+ c = g2 >> 26;
+ g2 &= 0x3ffffff;
+ g3 = h3 + c;
+ c = g3 >> 26;
+ g3 &= 0x3ffffff;
+ g4 = h4 + c - (1UL << 26);
+
+ /* select h if h < p, or h + -p if h >= p */
+ mask = (g4 >> ((sizeof(unsigned long) * 8) - 1)) - 1;
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ g4 &= mask;
+ mask = ~mask;
+
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+ h4 = (h4 & mask) | g4;
+
+ /* h = h % (2^128) */
+ h0 = ((h0) | (h1 << 26)) & 0xffffffff;
+ h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+ h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+ h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+ /* mac = (h + pad) % (2^128) */
+ f = (unsigned long long) h0 + st->pad[0];
+ h0 = (unsigned long) f;
+ f = (unsigned long long) h1 + st->pad[1] + (f >> 32);
+ h1 = (unsigned long) f;
+ f = (unsigned long long) h2 + st->pad[2] + (f >> 32);
+ h2 = (unsigned long) f;
+ f = (unsigned long long) h3 + st->pad[3] + (f >> 32);
+ h3 = (unsigned long) f;
+
+ STORE32_LE(mac + 0, (uint32_t) h0);
+ STORE32_LE(mac + 4, (uint32_t) h1);
+ STORE32_LE(mac + 8, (uint32_t) h2);
+ STORE32_LE(mac + 12, (uint32_t) h3);
+
+ /* zero out the state */
+ sodium_memzero((void *) st, sizeof *st);
+}
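For context, not part of the patch: poly1305_init() above folds the standard Poly1305 key clamp (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) into the per-limb masks while splitting r into five 26-bit limbs, so that every limb-by-limb product fits comfortably in a 64-bit accumulator. A minimal equivalent sketch with the clamp applied first and plain 26-bit masks; load32_le and split_r_radix26 are illustrative names, not libsodium API.

#include <stdint.h>
#include <string.h>

static uint32_t
load32_le(const unsigned char *p)            /* stand-in for LOAD32_LE */
{
    return (uint32_t) p[0] | ((uint32_t) p[1] << 8) |
           ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

static void
split_r_radix26(const unsigned char key[16], uint32_t r[5])
{
    unsigned char c[16];

    memcpy(c, key, 16);
    /* standard Poly1305 clamp: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
    c[3] &= 15;  c[7] &= 15;  c[11] &= 15;  c[15] &= 15;
    c[4] &= 252; c[8] &= 252; c[12] &= 252;

    /* five 26-bit limbs; equivalent to the fused masks in poly1305_init() */
    r[0] = (load32_le(c + 0))      & 0x3ffffff;
    r[1] = (load32_le(c + 3) >> 2) & 0x3ffffff;
    r[2] = (load32_le(c + 6) >> 4) & 0x3ffffff;
    r[3] = (load32_le(c + 9) >> 6) & 0x3ffffff;
    r[4] = (load32_le(c + 12) >> 8);
}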
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h
new file mode 100644
index 0000000000..e0ed754779
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/donna/poly1305_donna64.h
@@ -0,0 +1,220 @@
+/*
+ poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication
+ and 128 bit addition
+*/
+
+#include "private/common.h"
+
+#define MUL(out, x, y) out = ((uint128_t) x * y)
+#define ADD(out, in) out += in
+#define ADDLO(out, in) out += in
+#define SHR(in, shift) (unsigned long long) (in >> (shift))
+#define LO(in) (unsigned long long) (in)
+
+#if defined(_MSC_VER)
+# define POLY1305_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__)
+# define POLY1305_NOINLINE __attribute__((noinline))
+#else
+# define POLY1305_NOINLINE
+#endif
+
+#define poly1305_block_size 16
+
+/* 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long) */
+typedef struct poly1305_state_internal_t {
+ unsigned long long r[3];
+ unsigned long long h[3];
+ unsigned long long pad[2];
+ unsigned long long leftover;
+ unsigned char buffer[poly1305_block_size];
+ unsigned char final;
+} poly1305_state_internal_t;
+
+static void
+poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
+{
+ unsigned long long t0, t1;
+
+ /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
+ t0 = LOAD64_LE(&key[0]);
+ t1 = LOAD64_LE(&key[8]);
+
+ /* wiped after finalization */
+    st->r[0] = (t0) & 0xffc0fffffff;
+ st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
+ st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
+
+ /* h = 0 */
+ st->h[0] = 0;
+ st->h[1] = 0;
+ st->h[2] = 0;
+
+ /* save pad for later */
+ st->pad[0] = LOAD64_LE(&key[16]);
+ st->pad[1] = LOAD64_LE(&key[24]);
+
+ st->leftover = 0;
+ st->final = 0;
+}
+
+static void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long bytes)
+{
+ const unsigned long long hibit =
+ (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */
+ unsigned long long r0, r1, r2;
+ unsigned long long s1, s2;
+ unsigned long long h0, h1, h2;
+ unsigned long long c;
+ uint128_t d0, d1, d2, d;
+
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ s1 = r1 * (5 << 2);
+ s2 = r2 * (5 << 2);
+
+ while (bytes >= poly1305_block_size) {
+ unsigned long long t0, t1;
+
+ /* h += m[i] */
+ t0 = LOAD64_LE(&m[0]);
+ t1 = LOAD64_LE(&m[8]);
+
+        h0 += ((t0) & 0xfffffffffff);
+ h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
+ h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
+
+ /* h *= r */
+ MUL(d0, h0, r0);
+ MUL(d, h1, s2);
+ ADD(d0, d);
+ MUL(d, h2, s1);
+ ADD(d0, d);
+ MUL(d1, h0, r1);
+ MUL(d, h1, r0);
+ ADD(d1, d);
+ MUL(d, h2, s2);
+ ADD(d1, d);
+ MUL(d2, h0, r2);
+ MUL(d, h1, r1);
+ ADD(d2, d);
+ MUL(d, h2, r0);
+ ADD(d2, d);
+
+ /* (partial) h %= p */
+ c = SHR(d0, 44);
+ h0 = LO(d0) & 0xfffffffffff;
+ ADDLO(d1, c);
+ c = SHR(d1, 44);
+ h1 = LO(d1) & 0xfffffffffff;
+ ADDLO(d2, c);
+ c = SHR(d2, 42);
+ h2 = LO(d2) & 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 = h0 & 0xfffffffffff;
+ h1 += c;
+
+ m += poly1305_block_size;
+ bytes -= poly1305_block_size;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+}
+
+static POLY1305_NOINLINE void
+poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
+{
+ unsigned long long h0, h1, h2, c;
+ unsigned long long g0, g1, g2;
+ unsigned long long t0, t1;
+
+ /* process the remaining block */
+ if (st->leftover) {
+ unsigned long long i = st->leftover;
+
+ st->buffer[i] = 1;
+
+ for (i = i + 1; i < poly1305_block_size; i++) {
+ st->buffer[i] = 0;
+ }
+ st->final = 1;
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ }
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+
+ c = (h1 >> 44);
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += c;
+ c = (h1 >> 44);
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ c = (g0 >> 44);
+ g0 &= 0xfffffffffff;
+ g1 = h1 + c;
+ c = (g1 >> 44);
+ g1 &= 0xfffffffffff;
+ g2 = h2 + c - (1ULL << 42);
+
+ /* select h if h < p, or h + -p if h >= p */
+ c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
+ g0 &= c;
+ g1 &= c;
+ g2 &= c;
+ c = ~c;
+ h0 = (h0 & c) | g0;
+ h1 = (h1 & c) | g1;
+ h2 = (h2 & c) | g2;
+
+ /* h = (h + pad) */
+ t0 = st->pad[0];
+ t1 = st->pad[1];
+
+    h0 += ((t0) & 0xfffffffffff);
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
+ c = (h1 >> 44);
+ h1 &= 0xfffffffffff;
+ h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
+ h2 &= 0x3ffffffffff;
+
+ /* mac = h % (2^128) */
+ h0 = ((h0) | (h1 << 44));
+ h1 = ((h1 >> 20) | (h2 << 24));
+
+ STORE64_LE(&mac[0], h0);
+ STORE64_LE(&mac[8], h1);
+
+ /* zero out the state */
+ sodium_memzero((void *) st, sizeof *st);
+}
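For context, not part of the patch: this path stores h as three limbs of 44, 44 and 42 bits (and r in the same radix), h = h0 + h1*2^44 + h2*2^88, reducing modulo p = 2^130 - 5 as it goes. Cross terms whose weight reaches 2^132 are folded back using

    2^130 ≡ 5 (mod p),  so  2^132 = 4*2^130 ≡ 20 (mod p).

For example, the contributions h1*r2*2^132 and h2*r1*2^132 to the 2^0 position become h1*(20*r2) and h2*(20*r1), which is exactly why poly1305_blocks() precomputes s1 = r1 * (5 << 2) and s2 = r2 * (5 << 2).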
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c
new file mode 100644
index 0000000000..d5e2efa297
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.c
@@ -0,0 +1,90 @@
+
+#include "onetimeauth_poly1305.h"
+#include "crypto_onetimeauth_poly1305.h"
+#include "private/common.h"
+#include "private/implementations.h"
+#include "randombytes.h"
+#include "runtime.h"
+
+#include "donna/poly1305_donna.h"
+#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
+# include "sse2/poly1305_sse2.h"
+#endif
+
+static const crypto_onetimeauth_poly1305_implementation *implementation =
+ &crypto_onetimeauth_poly1305_donna_implementation;
+
+int
+crypto_onetimeauth_poly1305(unsigned char *out, const unsigned char *in,
+ unsigned long long inlen, const unsigned char *k)
+{
+ return implementation->onetimeauth(out, in, inlen, k);
+}
+
+int
+crypto_onetimeauth_poly1305_verify(const unsigned char *h,
+ const unsigned char *in,
+ unsigned long long inlen,
+ const unsigned char *k)
+{
+ return implementation->onetimeauth_verify(h, in, inlen, k);
+}
+
+int
+crypto_onetimeauth_poly1305_init(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char *key)
+{
+ return implementation->onetimeauth_init(state, key);
+}
+
+int
+crypto_onetimeauth_poly1305_update(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char *in,
+ unsigned long long inlen)
+{
+ return implementation->onetimeauth_update(state, in, inlen);
+}
+
+int
+crypto_onetimeauth_poly1305_final(crypto_onetimeauth_poly1305_state *state,
+ unsigned char *out)
+{
+ return implementation->onetimeauth_final(state, out);
+}
+
+size_t
+crypto_onetimeauth_poly1305_bytes(void)
+{
+ return crypto_onetimeauth_poly1305_BYTES;
+}
+
+size_t
+crypto_onetimeauth_poly1305_keybytes(void)
+{
+ return crypto_onetimeauth_poly1305_KEYBYTES;
+}
+
+size_t
+crypto_onetimeauth_poly1305_statebytes(void)
+{
+ return sizeof(crypto_onetimeauth_poly1305_state);
+}
+
+void
+crypto_onetimeauth_poly1305_keygen(
+ unsigned char k[crypto_onetimeauth_poly1305_KEYBYTES])
+{
+ randombytes_buf(k, crypto_onetimeauth_poly1305_KEYBYTES);
+}
+
+int
+_crypto_onetimeauth_poly1305_pick_best_implementation(void)
+{
+ implementation = &crypto_onetimeauth_poly1305_donna_implementation;
+#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
+ if (sodium_runtime_has_sse2()) {
+ implementation = &crypto_onetimeauth_poly1305_sse2_implementation;
+ }
+#endif
+ return 0;
+}
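For reference, not part of the patch: a minimal usage sketch of the public wrappers defined above, assuming the usual libsodium umbrella header <sodium.h> and a successfully initialized library. The one-shot and streaming paths produce the same 16-byte tag, and a key must never authenticate more than one message.

#include <sodium.h>

int
main(void)
{
    unsigned char       key[crypto_onetimeauth_poly1305_KEYBYTES];
    unsigned char       tag[crypto_onetimeauth_poly1305_BYTES];
    unsigned char       tag2[crypto_onetimeauth_poly1305_BYTES];
    const unsigned char msg[] = "one key, one message";
    crypto_onetimeauth_poly1305_state st;

    if (sodium_init() < 0) {
        return 1;
    }
    crypto_onetimeauth_poly1305_keygen(key);

    /* one-shot */
    crypto_onetimeauth_poly1305(tag, msg, sizeof msg - 1, key);

    /* streaming; arbitrary split points give the same tag */
    crypto_onetimeauth_poly1305_init(&st, key);
    crypto_onetimeauth_poly1305_update(&st, msg, 3);
    crypto_onetimeauth_poly1305_update(&st, msg + 3, sizeof msg - 1 - 3);
    crypto_onetimeauth_poly1305_final(&st, tag2);

    /* constant-time comparison against the received tag */
    return crypto_onetimeauth_poly1305_verify(tag, msg, sizeof msg - 1, key);
}

crypto_onetimeauth_poly1305_verify() compares through crypto_verify_16(), which runs in constant time, so it should be preferred over memcmp() on tags.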
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h
new file mode 100644
index 0000000000..243eadd50b
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/onetimeauth_poly1305.h
@@ -0,0 +1,21 @@
+
+#ifndef onetimeauth_poly1305_H
+#define onetimeauth_poly1305_H
+
+#include "crypto_onetimeauth_poly1305.h"
+
+typedef struct crypto_onetimeauth_poly1305_implementation {
+ int (*onetimeauth)(unsigned char *out, const unsigned char *in,
+ unsigned long long inlen, const unsigned char *k);
+ int (*onetimeauth_verify)(const unsigned char *h, const unsigned char *in,
+ unsigned long long inlen, const unsigned char *k);
+ int (*onetimeauth_init)(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char * key);
+ int (*onetimeauth_update)(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char * in,
+ unsigned long long inlen);
+ int (*onetimeauth_final)(crypto_onetimeauth_poly1305_state *state,
+ unsigned char * out);
+} crypto_onetimeauth_poly1305_implementation;
+
+#endif
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c
new file mode 100644
index 0000000000..022f15249b
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.c
@@ -0,0 +1,949 @@
+
+#include <stdint.h>
+#include <string.h>
+
+#include "../onetimeauth_poly1305.h"
+#include "crypto_verify_16.h"
+#include "poly1305_sse2.h"
+#include "private/common.h"
+#include "private/sse2_64_32.h"
+#include "utils.h"
+
+#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
+
+# ifdef __GNUC__
+# pragma GCC target("sse2")
+# endif
+
+# include <emmintrin.h>
+
+typedef __m128i xmmi;
+
+# if defined(_MSC_VER)
+# define POLY1305_NOINLINE __declspec(noinline)
+# elif defined(__GNUC__)
+# define POLY1305_NOINLINE __attribute__((noinline))
+# else
+# define POLY1305_NOINLINE
+# endif
+
+# define poly1305_block_size 32
+
+enum poly1305_state_flags_t {
+ poly1305_started = 1,
+ poly1305_final_shift8 = 4,
+ poly1305_final_shift16 = 8,
+ poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
+ poly1305_final_r_1 = 32 /* use [r,1] for the final block */
+};
+
+typedef struct poly1305_state_internal_t {
+ union {
+ uint64_t h[3];
+ uint32_t hh[10];
+ } H; /* 40 bytes */
+ uint32_t R[5]; /* 20 bytes */
+ uint32_t R2[5]; /* 20 bytes */
+ uint32_t R4[5]; /* 20 bytes */
+ uint64_t pad[2]; /* 16 bytes */
+ uint64_t flags; /* 8 bytes */
+ unsigned long long leftover; /* 8 bytes */
+ unsigned char buffer[poly1305_block_size]; /* 32 bytes */
+} poly1305_state_internal_t; /* 164 bytes total */
+
+/*
+ * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
+ * totally fine, even though this intrinsic requires a __m128i* input.
+ * This confuses dynamic analysis, so force alignment, only in debug mode.
+ */
+# ifdef DEBUG
+static xmmi
+_fakealign_mm_loadl_epi64(const void *m)
+{
+ xmmi tmp;
+ memcpy(&tmp, m, 8);
+
+ return _mm_loadl_epi64(&tmp);
+}
+# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
+# endif
+
+/* copy 0-31 bytes */
+static inline void
+poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
+ unsigned long long bytes)
+{
+ if (bytes & 16) {
+ _mm_store_si128((xmmi *) (void *) dst,
+ _mm_loadu_si128((const xmmi *) (const void *) src));
+ src += 16;
+ dst += 16;
+ }
+ if (bytes & 8) {
+ memcpy(dst, src, 8);
+ src += 8;
+ dst += 8;
+ }
+ if (bytes & 4) {
+ memcpy(dst, src, 4);
+ src += 4;
+ dst += 4;
+ }
+ if (bytes & 2) {
+ memcpy(dst, src, 2);
+ src += 2;
+ dst += 2;
+ }
+ if (bytes & 1) {
+ *dst = *src;
+ }
+}
+
+static POLY1305_NOINLINE void
+poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
+ unsigned long long bytes)
+{
+ uint32_t *R;
+ uint128_t d[3];
+ uint64_t r0, r1, r2;
+ uint64_t rt0, rt1, rt2, st2, c;
+ uint64_t t0, t1;
+ unsigned long long i;
+
+ if (!bytes) {
+ bytes = ~(unsigned long long) 0;
+ }
+ /* H = 0 */
+ _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());
+
+ /* clamp key */
+ memcpy(&t0, key, 8);
+ memcpy(&t1, key + 8, 8);
+ r0 = t0 & 0xffc0fffffff;
+ t0 >>= 44;
+ t0 |= t1 << 20;
+ r1 = t0 & 0xfffffc0ffff;
+ t1 >>= 24;
+ r2 = t1 & 0x00ffffffc0f;
+
+ /* r^1 */
+ R = st->R;
+    R[0] = (uint32_t)(r0) & 0x3ffffff;
+ R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
+ R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((r2 >> 16));
+
+ /* save pad */
+ memcpy(&st->pad[0], key + 16, 8);
+ memcpy(&st->pad[1], key + 24, 8);
+
+ rt0 = r0;
+ rt1 = r1;
+ rt2 = r2;
+
+ /* r^2, r^4 */
+ for (i = 0; i < 2; i++) {
+ if (i == 0) {
+ R = st->R2;
+ if (bytes <= 16) {
+ break;
+ }
+ } else if (i == 1) {
+ R = st->R4;
+ if (bytes < 96) {
+ break;
+ }
+ }
+ st2 = rt2 * (5 << 2);
+
+ d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
+ d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
+ d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
+
+ rt0 = (uint64_t) d[0] & 0xfffffffffff;
+ c = (uint64_t)(d[0] >> 44);
+ d[1] += c;
+
+ rt1 = (uint64_t) d[1] & 0xfffffffffff;
+ c = (uint64_t)(d[1] >> 44);
+ d[2] += c;
+
+ rt2 = (uint64_t) d[2] & 0x3ffffffffff;
+ c = (uint64_t)(d[2] >> 42);
+ rt0 += c * 5;
+ c = (rt0 >> 44);
+ rt0 = rt0 & 0xfffffffffff;
+ rt1 += c;
+ c = (rt1 >> 44);
+ rt1 = rt1 & 0xfffffffffff;
+        rt2 += c; /* even if rt2 overflows, it will still fit in R4 safely, and
+                     is safe to multiply with */
+
+        R[0] = (uint32_t)(rt0) & 0x3ffffff;
+ R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
+ R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
+ R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
+ R[4] = (uint32_t)((rt2 >> 16));
+ }
+ st->flags = 0;
+ st->leftover = 0U;
+}
+
+static POLY1305_NOINLINE void
+poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long bytes)
+{
+ CRYPTO_ALIGN(64)
+ xmmi HIBIT =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
+ const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
+ _MM_SHUFFLE(1, 0, 1, 0));
+ const xmmi FIVE =
+ _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
+ xmmi H0, H1, H2, H3, H4;
+ xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
+ xmmi M0, M1, M2, M3, M4;
+ xmmi M5, M6, M7, M8;
+ xmmi C1, C2;
+ xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
+ xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
+
+ if (st->flags & poly1305_final_shift8) {
+ HIBIT = _mm_srli_si128(HIBIT, 8);
+ }
+ if (st->flags & poly1305_final_shift16) {
+ HIBIT = _mm_setzero_si128();
+ }
+ if (!(st->flags & poly1305_started)) {
+ /* H = [Mx,My] */
+ T5 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+ T6 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
+ H0 = _mm_and_si128(MMASK, T5);
+ H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
+ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
+ H2 = _mm_and_si128(MMASK, T5);
+ H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
+ H4 = _mm_srli_epi64(T6, 40);
+ H4 = _mm_or_si128(H4, HIBIT);
+ m += 32;
+ bytes -= 32;
+ st->flags |= poly1305_started;
+ } else {
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
+ T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
+ T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
+ H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
+ H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
+ H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
+ H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
+ H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
+ }
+ if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
+ if (st->flags & poly1305_final_r2_r) {
+ /* use [r^2, r] */
+ T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+ T3 = _mm_cvtsi32_si128(st->R[4]);
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+ T1 = _mm_cvtsi32_si128(st->R2[4]);
+ T4 = _mm_unpacklo_epi32(T0, T2);
+ T5 = _mm_unpackhi_epi32(T0, T2);
+ R24 = _mm_unpacklo_epi64(T1, T3);
+ } else {
+ /* use [r^1, 1] */
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
+ T1 = _mm_cvtsi32_si128(st->R[4]);
+ T2 = _mm_cvtsi32_si128(1);
+ T4 = _mm_unpacklo_epi32(T0, T2);
+ T5 = _mm_unpackhi_epi32(T0, T2);
+ R24 = T1;
+ }
+ R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
+ R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
+ R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
+ R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
+ } else {
+ /* use [r^2, r^2] */
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
+ T1 = _mm_cvtsi32_si128(st->R2[4]);
+ R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+ R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+ R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+ R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+ R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
+ }
+ S21 = _mm_mul_epu32(R21, FIVE);
+ S22 = _mm_mul_epu32(R22, FIVE);
+ S23 = _mm_mul_epu32(R23, FIVE);
+ S24 = _mm_mul_epu32(R24, FIVE);
+
+ if (bytes >= 64) {
+ T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
+ T1 = _mm_cvtsi32_si128(st->R4[4]);
+ R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
+ R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
+ R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
+ R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
+ R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
+ S41 = _mm_mul_epu32(R41, FIVE);
+ S42 = _mm_mul_epu32(R42, FIVE);
+ S43 = _mm_mul_epu32(R43, FIVE);
+ S44 = _mm_mul_epu32(R44, FIVE);
+
+ while (bytes >= 64) {
+ xmmi v00, v01, v02, v03, v04;
+ xmmi v10, v11, v12, v13, v14;
+ xmmi v20, v21, v22, v23, v24;
+ xmmi v30, v31, v32, v33, v34;
+ xmmi v40, v41, v42, v43, v44;
+ xmmi T14, T15;
+
+ /* H *= [r^4,r^4], preload [Mx,My] */
+ T15 = S42;
+ T0 = H4;
+ T0 = _mm_mul_epu32(T0, S41);
+ v01 = H3;
+ v01 = _mm_mul_epu32(v01, T15);
+ T14 = S43;
+ T1 = H4;
+ T1 = _mm_mul_epu32(T1, T15);
+ v11 = H3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T2 = H4;
+ T2 = _mm_mul_epu32(T2, T14);
+ T0 = _mm_add_epi64(T0, v01);
+ T15 = S44;
+ v02 = H2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T3 = H4;
+ T3 = _mm_mul_epu32(T3, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = H1;
+ v03 = _mm_mul_epu32(v03, T15);
+ v12 = H2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
+ T14 = R40;
+ v21 = H3;
+ v21 = _mm_mul_epu32(v21, T15);
+ v31 = H3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ T4 = H4;
+ T4 = _mm_mul_epu32(T4, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = H0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = H1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
+ T15 = R41;
+ v22 = H2;
+ v22 = _mm_mul_epu32(v22, T14);
+ v32 = H2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = H3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = H0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
+ T14 = R42;
+ T5 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
+ v23 = H1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = H1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = H2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
+ T15 = R43;
+ T6 = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
+ _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
+ v24 = H0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = H0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ M0 = _mm_and_si128(MMASK, T5);
+ v43 = H1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
+ v44 = H0;
+ v44 = _mm_mul_epu32(v44, R44);
+ T2 = _mm_add_epi64(T2, v24);
+ T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
+ T3 = _mm_add_epi64(T3, v34);
+ M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
+ T4 = _mm_add_epi64(T4, v43);
+ M2 = _mm_and_si128(MMASK, T5);
+ T4 = _mm_add_epi64(T4, v44);
+ M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
+
+ /* H += [Mx',My'] */
+ T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
+ T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
+ T7 = _mm_unpacklo_epi32(T5, T6);
+ T8 = _mm_unpackhi_epi32(T5, T6);
+ M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
+ M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
+ M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
+ M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
+ M6 = _mm_slli_epi64(M6, 6);
+ M7 = _mm_slli_epi64(M7, 12);
+ M8 = _mm_slli_epi64(M8, 18);
+ T0 = _mm_add_epi64(T0, M5);
+ T1 = _mm_add_epi64(T1, M6);
+ T2 = _mm_add_epi64(T2, M7);
+ T3 = _mm_add_epi64(T3, M8);
+ T4 = _mm_add_epi64(T4, HIBIT);
+
+ /* H += [Mx,My]*[r^2,r^2] */
+ T15 = S22;
+ v00 = M4;
+ v00 = _mm_mul_epu32(v00, S21);
+ v01 = M3;
+ v01 = _mm_mul_epu32(v01, T15);
+ T14 = S23;
+ v10 = M4;
+ v10 = _mm_mul_epu32(v10, T15);
+ v11 = M3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T0 = _mm_add_epi64(T0, v00);
+ v20 = M4;
+ v20 = _mm_mul_epu32(v20, T14);
+ T0 = _mm_add_epi64(T0, v01);
+ T15 = S24;
+ v02 = M2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T1 = _mm_add_epi64(T1, v10);
+ v30 = M4;
+ v30 = _mm_mul_epu32(v30, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = M1;
+ v03 = _mm_mul_epu32(v03, T15);
+ T2 = _mm_add_epi64(T2, v20);
+ v12 = M2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
+ T14 = R20;
+ v21 = M3;
+ v21 = _mm_mul_epu32(v21, T15);
+ T3 = _mm_add_epi64(T3, v30);
+ v31 = M3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ v40 = M4;
+ v40 = _mm_mul_epu32(v40, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = M0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = M1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
+ T15 = R21;
+ v22 = M2;
+ v22 = _mm_mul_epu32(v22, T14);
+ T4 = _mm_add_epi64(T4, v40);
+ v32 = M2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = M3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = M0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
+ T14 = R22;
+ v23 = M1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = M1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = M2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
+ T15 = R23;
+ v24 = M0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = M0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ v43 = M1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ v44 = M0;
+ v44 = _mm_mul_epu32(v44, R24);
+ T2 = _mm_add_epi64(T2, v24);
+ T3 = _mm_add_epi64(T3, v34);
+ T4 = _mm_add_epi64(T4, v43);
+ T4 = _mm_add_epi64(T4, v44);
+
+ /* reduce */
+ C1 = _mm_srli_epi64(T0, 26);
+ C2 = _mm_srli_epi64(T3, 26);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_and_si128(T3, MMASK);
+ T1 = _mm_add_epi64(T1, C1);
+ T4 = _mm_add_epi64(T4, C2);
+ C1 = _mm_srli_epi64(T1, 26);
+ C2 = _mm_srli_epi64(T4, 26);
+ T1 = _mm_and_si128(T1, MMASK);
+ T4 = _mm_and_si128(T4, MMASK);
+ T2 = _mm_add_epi64(T2, C1);
+ T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+ C1 = _mm_srli_epi64(T2, 26);
+ C2 = _mm_srli_epi64(T0, 26);
+ T2 = _mm_and_si128(T2, MMASK);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_add_epi64(T3, C1);
+ T1 = _mm_add_epi64(T1, C2);
+ C1 = _mm_srli_epi64(T3, 26);
+ T3 = _mm_and_si128(T3, MMASK);
+ T4 = _mm_add_epi64(T4, C1);
+
+ /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
+ H0 = T0;
+ H1 = T1;
+ H2 = T2;
+ H3 = T3;
+ H4 = T4;
+
+ m += 64;
+ bytes -= 64;
+ }
+ }
+
+ if (bytes >= 32) {
+ xmmi v01, v02, v03, v04;
+ xmmi v11, v12, v13, v14;
+ xmmi v21, v22, v23, v24;
+ xmmi v31, v32, v33, v34;
+ xmmi v41, v42, v43, v44;
+ xmmi T14, T15;
+
+ /* H *= [r^2,r^2] */
+ T15 = S22;
+ T0 = H4;
+ T0 = _mm_mul_epu32(T0, S21);
+ v01 = H3;
+ v01 = _mm_mul_epu32(v01, T15);
+ T14 = S23;
+ T1 = H4;
+ T1 = _mm_mul_epu32(T1, T15);
+ v11 = H3;
+ v11 = _mm_mul_epu32(v11, T14);
+ T2 = H4;
+ T2 = _mm_mul_epu32(T2, T14);
+ T0 = _mm_add_epi64(T0, v01);
+ T15 = S24;
+ v02 = H2;
+ v02 = _mm_mul_epu32(v02, T14);
+ T3 = H4;
+ T3 = _mm_mul_epu32(T3, T15);
+ T1 = _mm_add_epi64(T1, v11);
+ v03 = H1;
+ v03 = _mm_mul_epu32(v03, T15);
+ v12 = H2;
+ v12 = _mm_mul_epu32(v12, T15);
+ T0 = _mm_add_epi64(T0, v02);
+ T14 = R20;
+ v21 = H3;
+ v21 = _mm_mul_epu32(v21, T15);
+ v31 = H3;
+ v31 = _mm_mul_epu32(v31, T14);
+ T0 = _mm_add_epi64(T0, v03);
+ T4 = H4;
+ T4 = _mm_mul_epu32(T4, T14);
+ T1 = _mm_add_epi64(T1, v12);
+ v04 = H0;
+ v04 = _mm_mul_epu32(v04, T14);
+ T2 = _mm_add_epi64(T2, v21);
+ v13 = H1;
+ v13 = _mm_mul_epu32(v13, T14);
+ T3 = _mm_add_epi64(T3, v31);
+ T15 = R21;
+ v22 = H2;
+ v22 = _mm_mul_epu32(v22, T14);
+ v32 = H2;
+ v32 = _mm_mul_epu32(v32, T15);
+ T0 = _mm_add_epi64(T0, v04);
+ v41 = H3;
+ v41 = _mm_mul_epu32(v41, T15);
+ T1 = _mm_add_epi64(T1, v13);
+ v14 = H0;
+ v14 = _mm_mul_epu32(v14, T15);
+ T2 = _mm_add_epi64(T2, v22);
+ T14 = R22;
+ v23 = H1;
+ v23 = _mm_mul_epu32(v23, T15);
+ T3 = _mm_add_epi64(T3, v32);
+ v33 = H1;
+ v33 = _mm_mul_epu32(v33, T14);
+ T4 = _mm_add_epi64(T4, v41);
+ v42 = H2;
+ v42 = _mm_mul_epu32(v42, T14);
+ T1 = _mm_add_epi64(T1, v14);
+ T15 = R23;
+ v24 = H0;
+ v24 = _mm_mul_epu32(v24, T14);
+ T2 = _mm_add_epi64(T2, v23);
+ v34 = H0;
+ v34 = _mm_mul_epu32(v34, T15);
+ T3 = _mm_add_epi64(T3, v33);
+ v43 = H1;
+ v43 = _mm_mul_epu32(v43, T15);
+ T4 = _mm_add_epi64(T4, v42);
+ v44 = H0;
+ v44 = _mm_mul_epu32(v44, R24);
+ T2 = _mm_add_epi64(T2, v24);
+ T3 = _mm_add_epi64(T3, v34);
+ T4 = _mm_add_epi64(T4, v43);
+ T4 = _mm_add_epi64(T4, v44);
+
+ /* H += [Mx,My] */
+ if (m) {
+ T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
+ T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
+ T7 = _mm_unpacklo_epi32(T5, T6);
+ T8 = _mm_unpackhi_epi32(T5, T6);
+ M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
+ M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
+ M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
+ M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
+ M1 = _mm_slli_epi64(M1, 6);
+ M2 = _mm_slli_epi64(M2, 12);
+ M3 = _mm_slli_epi64(M3, 18);
+ T0 = _mm_add_epi64(T0, M0);
+ T1 = _mm_add_epi64(T1, M1);
+ T2 = _mm_add_epi64(T2, M2);
+ T3 = _mm_add_epi64(T3, M3);
+ T4 = _mm_add_epi64(T4, HIBIT);
+ }
+
+ /* reduce */
+ C1 = _mm_srli_epi64(T0, 26);
+ C2 = _mm_srli_epi64(T3, 26);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_and_si128(T3, MMASK);
+ T1 = _mm_add_epi64(T1, C1);
+ T4 = _mm_add_epi64(T4, C2);
+ C1 = _mm_srli_epi64(T1, 26);
+ C2 = _mm_srli_epi64(T4, 26);
+ T1 = _mm_and_si128(T1, MMASK);
+ T4 = _mm_and_si128(T4, MMASK);
+ T2 = _mm_add_epi64(T2, C1);
+ T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
+ C1 = _mm_srli_epi64(T2, 26);
+ C2 = _mm_srli_epi64(T0, 26);
+ T2 = _mm_and_si128(T2, MMASK);
+ T0 = _mm_and_si128(T0, MMASK);
+ T3 = _mm_add_epi64(T3, C1);
+ T1 = _mm_add_epi64(T1, C2);
+ C1 = _mm_srli_epi64(T3, 26);
+ T3 = _mm_and_si128(T3, MMASK);
+ T4 = _mm_add_epi64(T4, C1);
+
+ /* H = (H*[r^2,r^2] + [Mx,My]) */
+ H0 = T0;
+ H1 = T1;
+ H2 = T2;
+ H3 = T3;
+ H4 = T4;
+ }
+
+ if (m) {
+ T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
+ T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
+ T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
+ T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
+ T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
+ T0 = _mm_unpacklo_epi64(T0, T1);
+ T1 = _mm_unpacklo_epi64(T2, T3);
+ _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
+ _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
+ _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
+ } else {
+ uint32_t t0, t1, t2, t3, t4, b;
+ uint64_t h0, h1, h2, g0, g1, g2, c, nc;
+
+ /* H = H[0]+H[1] */
+ T0 = H0;
+ T1 = H1;
+ T2 = H2;
+ T3 = H3;
+ T4 = H4;
+
+ T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
+ T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
+ T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
+ T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
+ T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
+
+ t0 = _mm_cvtsi128_si32(T0);
+ b = (t0 >> 26);
+ t0 &= 0x3ffffff;
+ t1 = _mm_cvtsi128_si32(T1) + b;
+ b = (t1 >> 26);
+ t1 &= 0x3ffffff;
+ t2 = _mm_cvtsi128_si32(T2) + b;
+ b = (t2 >> 26);
+ t2 &= 0x3ffffff;
+ t3 = _mm_cvtsi128_si32(T3) + b;
+ b = (t3 >> 26);
+ t3 &= 0x3ffffff;
+ t4 = _mm_cvtsi128_si32(T4) + b;
+
+ /* everything except t4 is in range, so this is all safe */
+ h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
+ h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
+ ((uint64_t) t3 << 34)) &
+ 0xfffffffffffull;
+ h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
+
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += c;
+ c = (h1 >> 44);
+ h1 &= 0xfffffffffff;
+ h2 += c;
+ c = (h2 >> 42);
+ h2 &= 0x3ffffffffff;
+ h0 += c * 5;
+ c = (h0 >> 44);
+ h0 &= 0xfffffffffff;
+ h1 += c;
+
+ g0 = h0 + 5;
+ c = (g0 >> 44);
+ g0 &= 0xfffffffffff;
+ g1 = h1 + c;
+ c = (g1 >> 44);
+ g1 &= 0xfffffffffff;
+ g2 = h2 + c - ((uint64_t) 1 << 42);
+
+ c = (g2 >> 63) - 1;
+ nc = ~c;
+ h0 = (h0 & nc) | (g0 & c);
+ h1 = (h1 & nc) | (g1 & c);
+ h2 = (h2 & nc) | (g2 & c);
+
+ st->H.h[0] = h0;
+ st->H.h[1] = h1;
+ st->H.h[2] = h2;
+ }
+}
+
+static void
+poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long bytes)
+{
+ unsigned long long i;
+
+ /* handle leftover */
+ if (st->leftover) {
+ unsigned long long want = (poly1305_block_size - st->leftover);
+
+ if (want > bytes) {
+ want = bytes;
+ }
+ for (i = 0; i < want; i++) {
+ st->buffer[st->leftover + i] = m[i];
+ }
+ bytes -= want;
+ m += want;
+ st->leftover += want;
+ if (st->leftover < poly1305_block_size) {
+ return;
+ }
+ poly1305_blocks(st, st->buffer, poly1305_block_size);
+ st->leftover = 0;
+ }
+
+ /* process full blocks */
+ if (bytes >= poly1305_block_size) {
+ unsigned long long want = (bytes & ~(poly1305_block_size - 1));
+
+ poly1305_blocks(st, m, want);
+ m += want;
+ bytes -= want;
+ }
+
+ /* store leftover */
+ if (bytes) {
+ for (i = 0; i < bytes; i++) {
+ st->buffer[st->leftover + i] = m[i];
+ }
+ st->leftover += bytes;
+ }
+}
+
+static POLY1305_NOINLINE void
+poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
+ unsigned long long leftover, unsigned char mac[16])
+{
+ uint64_t h0, h1, h2;
+
+ if (leftover) {
+ CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
+
+ poly1305_block_copy31(final, m, leftover);
+ if (leftover != 16) {
+ final[leftover] = 1;
+ }
+ st->flags |=
+ (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
+ poly1305_blocks(st, final, 32);
+ }
+
+ if (st->flags & poly1305_started) {
+ /* finalize, H *= [r^2,r], or H *= [r,1] */
+ if (!leftover || (leftover > 16)) {
+ st->flags |= poly1305_final_r2_r;
+ } else {
+ st->flags |= poly1305_final_r_1;
+ }
+ poly1305_blocks(st, NULL, 32);
+ }
+
+ h0 = st->H.h[0];
+ h1 = st->H.h[1];
+ h2 = st->H.h[2];
+
+ /* pad */
+ h0 = ((h0) | (h1 << 44));
+ h1 = ((h1 >> 20) | (h2 << 24));
+# ifdef HAVE_AMD64_ASM
+ __asm__ __volatile__(
+ "addq %2, %0 ;\n"
+ "adcq %3, %1 ;\n"
+ : "+r"(h0), "+r"(h1)
+ : "r"(st->pad[0]), "r"(st->pad[1])
+ : "flags", "cc");
+# else
+ {
+ uint128_t h;
+
+ memcpy(&h, &st->pad[0], 16);
+ h += ((uint128_t) h1 << 64) | h0;
+ h0 = (uint64_t) h;
+ h1 = (uint64_t)(h >> 64);
+ }
+# endif
+ _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
+ _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
+
+ memcpy(&mac[0], &h0, 8);
+ memcpy(&mac[8], &h1, 8);
+
+ sodium_memzero((void *) st, sizeof *st);
+}
+
+static void
+poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
+{
+ poly1305_finish_ext(st, st->buffer, st->leftover, mac);
+}
+
+static int
+crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
+ const unsigned char *key)
+{
+ COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
+ sizeof(poly1305_state_internal_t));
+ poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_sse2_update(
+ crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
+ unsigned long long inlen)
+{
+ poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
+ unsigned char *out)
+{
+ poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
+ unsigned long long inlen,
+ const unsigned char *key)
+{
+ CRYPTO_ALIGN(64) poly1305_state_internal_t st;
+ unsigned long long blocks;
+
+ poly1305_init_ext(&st, key, inlen);
+ blocks = inlen & ~31;
+ if (blocks > 0) {
+ poly1305_blocks(&st, m, blocks);
+ m += blocks;
+ inlen -= blocks;
+ }
+ poly1305_finish_ext(&st, m, inlen, out);
+
+ return 0;
+}
+
+static int
+crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
+ const unsigned char *in,
+ unsigned long long inlen,
+ const unsigned char *k)
+{
+ unsigned char correct[16];
+
+ crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
+
+ return crypto_verify_16(h, correct);
+}
+
+struct crypto_onetimeauth_poly1305_implementation
+ crypto_onetimeauth_poly1305_sse2_implementation = {
+ SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
+ SODIUM_C99(.onetimeauth_verify =)
+ crypto_onetimeauth_poly1305_sse2_verify,
+ SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
+ SODIUM_C99(.onetimeauth_update =)
+ crypto_onetimeauth_poly1305_sse2_update,
+ SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
+ };
+
+#endif
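For context, not part of the patch: the SSE2 code evaluates the Poly1305 polynomial two blocks per vector, keeping the odd-numbered blocks in one 64-bit lane and the even-numbered blocks in the other. Splitting Horner's rule over the two lanes gives, for four blocks,

    m1*r^4 + m2*r^3 + m3*r^2 + m4*r
        = (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r,

so the bulk loop multiplies both lanes by [r^2, r^2] (or [r^4, r^4] when it consumes 64 bytes at a time), and only the last step multiplies by [r^2, r], or by [r, 1] when the final chunk is a single block, before the two lanes are summed. That is what the poly1305_final_r2_r and poly1305_final_r_1 flags select.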
diff --git a/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h
new file mode 100644
index 0000000000..9177cad487
--- /dev/null
+++ b/libs/libsodium/src/crypto_onetimeauth/poly1305/sse2/poly1305_sse2.h
@@ -0,0 +1,12 @@
+#ifndef poly1305_sse2_H
+#define poly1305_sse2_H
+
+#include <stddef.h>
+
+#include "../onetimeauth_poly1305.h"
+#include "crypto_onetimeauth_poly1305.h"
+
+extern struct crypto_onetimeauth_poly1305_implementation
+ crypto_onetimeauth_poly1305_sse2_implementation;
+
+#endif /* poly1305_sse2_H */